From bc8b9578a374edd92cae8dbb31d6e5dca7f45d04 Mon Sep 17 00:00:00 2001
From: Jonas Mueller <1390638+jwmueller@users.noreply.github.com>
Date: Wed, 28 Aug 2024 11:29:40 -0700
Subject: [PATCH] link to article

---
 llm_evals_w_crowdlab/llm_evals_w_crowdlab.ipynb | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llm_evals_w_crowdlab/llm_evals_w_crowdlab.ipynb b/llm_evals_w_crowdlab/llm_evals_w_crowdlab.ipynb
index 502f469..005ae62 100644
--- a/llm_evals_w_crowdlab/llm_evals_w_crowdlab.ipynb
+++ b/llm_evals_w_crowdlab/llm_evals_w_crowdlab.ipynb
@@ -11,7 +11,7 @@
     "\n",
     "Here we consider the MT-Bench dataset, which contains: many user requests, two possible responses for each request from different LLM models, and annotations regarding which of the two responses is considered better. Each example has a varying number of judge annotations provided by authors of the original paper and other \"experts\" (graduate students). We use CROWDLAB to: produce high-quality final consensus annotations (to enable accurate LLM Evals) as well as measure the quality of the annotators. CROWDLAB relies on probabilistic predictions from any ML model -- here we use logprobs from GPT-4 applied in the LLM-as-judge framework.\n",
     "\n",
-    "You can use the same technique for any LLM Evals involving multiple human/AI judges, to help your team better evaluate models.\n"
+    "You can use the same technique for any LLM Evals involving multiple human/AI judges, to help your team better evaluate models. Read more in our [blog](https://cleanlab.ai/blog/team-llm-evals/).\n"
    ]
   },
   {
@@ -4520,7 +4520,9 @@
     "id": "87d37120-cd8c-4ce7-ac4e-a1e4c3ec19a3"
    },
    "source": [
-    "Experts and authors seem to have roughly similar annotator quality! That's a neat observation, especially since we don't have ground truth labels"
+    "Experts and authors seem to have roughly similar annotator quality! That's a neat observation, especially since we don't have ground truth labels.\n",
+    "\n",
+    "Learn more about proper Evals that combine human and LLM judges in our [blog](https://cleanlab.ai/blog/team-llm-evals/)."
    ]
   }
 ],
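
For context, the CROWDLAB step described in the patched markdown cell is exposed through cleanlab's multiannotator API. Below is a minimal sketch of that call, not part of the patch itself: the judge names, labels, and predicted probabilities are illustrative stand-ins (in the notebook, pred_probs would come from GPT-4 logprobs in the LLM-as-judge setup), and it assumes cleanlab is installed.

# Minimal CROWDLAB sketch: consensus annotations + annotator quality from
# multi-judge labels plus model predicted probabilities.
import numpy as np
import pandas as pd
from cleanlab.multiannotator import get_label_quality_multiannotator

# One row per example, one column per judge; NaN means that judge did not
# annotate the example. Classes: 0 = "Response A better", 1 = "Response B better".
labels_multiannotator = pd.DataFrame(
    {
        "judge_1": [0, 1, np.nan],
        "judge_2": [0, np.nan, 1],
        "judge_3": [np.nan, 1, 1],
    }
)

# Model predicted class probabilities for each example (illustrative values;
# in the notebook these are derived from GPT-4 logprobs).
pred_probs = np.array(
    [
        [0.9, 0.1],
        [0.2, 0.8],
        [0.3, 0.7],
    ]
)

results = get_label_quality_multiannotator(labels_multiannotator, pred_probs)
consensus_labels = results["label_quality"]["consensus_label"]  # final consensus annotations
annotator_stats = results["annotator_stats"]  # per-judge quality scores
print(consensus_labels)
print(annotator_stats)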