
Commit 89ba880

jnanliu and NathanHB authored
Add G-Pass@k Metric (#589)
* add gpassk metric
* fix pre-commit error
* fix return type check
* fix metrics
* support gpassk for aime24/25 and math_500
* fix List to list
* remove List

---------

Co-authored-by: Nathan Habib <[email protected]>
1 parent 88e3a3b commit 89ba880

File tree

3 files changed: +263 -0 lines changed

src/lighteval/metrics/metrics.py (+59 lines)

@@ -51,6 +51,7 @@
     Extractiveness,
     F1_score,
     Faithfulness,
+    GPassAtK,
     LoglikelihoodAcc,
     MajAtK,
     PassAtK,

@@ -578,6 +579,64 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    g_pass_at_16 = SampleLevelMetricGrouping(
+        metric_name="G-Pass@16:48_samples",
+        sample_level_fn=GPassAtK(k=16, n=48, strip_strings=True).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+    )
+    g_pass_at_8_16 = SampleLevelMetricGrouping(
+        metric_name="G-Pass@8-16:48_samples",
+        sample_level_fn=GPassAtK(k=[8, 16], n=48, strip_strings=True).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+    )
+    g_pass_at_16_expr_gold = SampleLevelMetricGrouping(
+        metric_name="G-Pass@16:48_samples",
+        sample_level_fn=GPassAtK(
+            k=16,
+            n=48,
+            strip_strings=True,
+            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                language=Language.ENGLISH,
+                fallback_mode="first_match",
+                precision=5,
+                gold_extraction_target=(ExprExtractionConfig(),),
+                # Match boxed first before trying other regexes
+                pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+                aggregation_function=max,
+            ).sample_level_fn([ref], [pred], doc),
+        ).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+    )
+    g_pass_at_16_latex_gold = SampleLevelMetricGrouping(
+        metric_name="G-Pass@16:48_samples",
+        sample_level_fn=GPassAtK(
+            k=16,
+            n=48,
+            strip_strings=True,
+            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                language=Language.ENGLISH,
+                fallback_mode="first_match",
+                precision=5,
+                gold_extraction_target=(LatexExtractionConfig(),),
+                # Match boxed first before trying other regexes
+                pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+                aggregation_function=max,
+            ).sample_level_fn([ref], [pred], doc),
+        ).compute,
+        category=MetricCategory.GENERATIVE_SAMPLING,
+        use_case=MetricUseCase.REASONING,
+        corpus_level_fn={metric: np.mean for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+        higher_is_better={metric: True for metric in GPassAtK(k=16, n=48, strip_strings=True).all_metrics},
+    )
     perfect_exact_match = SampleLevelMetric(
         metric_name="perfect_em",
         sample_level_fn=ExactMatches().compute,

src/lighteval/metrics/metrics_sample.py (+162 lines)

@@ -36,6 +36,7 @@
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.translate.bleu_score import sentence_bleu
 from pydantic import BaseModel
+from scipy.stats import hypergeom
 from transformers import AutoModelForSequenceClassification, AutoTokenizer

 from lighteval.metrics.imports.bert_scorer import BERTScorer

@@ -1189,3 +1190,164 @@ def pass_at_k(self, all_scores: list[int]) -> float:
             return 1.0

         return 1.0 - np.prod(1.0 - self.k / np.arange(self.n - c + 1, self.n + 1))
+
+
+class GPassAtK:
+    def __init__(
+        self,
+        k: Union[int, list[int]],
+        n: int = None,
+        thresholds: list[float] = [0.0, 0.25, 0.5, 0.75, 1.0],
+        normalize_gold: Callable = None,
+        normalize_pred: Callable = None,
+        strip_strings: bool = False,
+        sample_scoring_function: Union[Callable[[str, str], float], str] = None,
+    ):
+        """Computes G-Pass@k from http://arxiv.org/abs/2412.13147
+
+        Args:
+            k (int, list): The number of attempts to be considered (a single value or a list of values).
+            n (int): Number of samples to generate.
+            thresholds (list): Thresholds on the fraction of the k attempts that must be correct.
+            normalize_gold (callable, optional): Function to use to normalize the reference strings.
+                Defaults to None if no normalization is applied.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+                Defaults to None if no normalization is applied.
+            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
+            sample_scoring_function (callable or str, optional): Function to use to score each sample.
+                Either pass the full function (should take a string prediction and a string gold, and return a score between 0 and 1),
+                a string (any of `prefix`, `suffix` or `full`) to define the type of exact match wanted, or nothing to default to "full".
+                `prefix` checks if the prediction starts with the gold,
+                `suffix` if the prediction ends with the gold,
+                `full` if the prediction and gold are equal.
+        """
+        self.k = as_list(k)
+        self.n = n
+        self.thresholds = thresholds
+        self.normalize_gold = normalize_gold
+        self.normalize_pred = normalize_pred
+        self.strip_strings = strip_strings
+
+        # Manage the logic of per-prediction sample scoring
+        if callable(sample_scoring_function):
+            self.score_sample = sample_scoring_function
+            self.type_exact_match = None
+        else:
+            if isinstance(sample_scoring_function, str):
+                if sample_scoring_function not in ["prefix", "suffix", "full"]:
+                    raise ValueError(
+                        f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {sample_scoring_function} instead."
+                    )
+                self.type_exact_match = sample_scoring_function
+            else:
+                self.type_exact_match = "full"
+            self.score_sample = self.default_sample_scoring
+
+    def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
+        """Computes the metric over a list of golds and predictions for one single item with possibly many samples.
+        It applies normalisation (if needed) to model prediction and gold, computes their per-prediction score,
+        then aggregates the scores over the samples using G-Pass@k.
+
+        Args:
+            predictions (list[str]): The sampled predicted strings.
+            formatted_doc (Doc): The document holding the reference target.
+
+        Returns:
+            dict[str, float]: G-Pass@k and mG-Pass@k scores for the current sample.
+        """
+        golds = formatted_doc.get_golds()
+
+        if len(golds) > 1:
+            raise Exception("Cannot compute G-Pass@k with several golds")
+
+        if self.n is None:
+            self.n = len(predictions)
+            logger.warning(
+                "n undefined in the G-Pass@k. We assume it's the same as the sample's number of predictions."
+            )
+        elif len(predictions) < self.n:
+            logger.warning(f"Number of predictions is less than {self.n} for G-Pass@k.")
+
+        gold = self.get_processed_gold(golds[0])
+
+        all_scores = []
+        for pred in predictions[: self.n]:
+            cur_pred = self.get_processed_pred(pred=pred)
+            all_scores.append(self.score_sample(cur_pred, gold, formatted_doc))
+
+        return self.g_pass_at_k(all_scores)
+
+    def get_processed_gold(self, gold: str) -> str:
+        if self.strip_strings:
+            gold = gold.strip()
+
+        if self.normalize_gold:
+            gold = self.normalize_gold(gold)
+
+        return gold
+
+    def get_processed_pred(self, pred: str) -> str:
+        if not pred:
+            return ""
+
+        if self.strip_strings:
+            pred = pred.strip()
+
+        if self.normalize_pred:
+            pred = self.normalize_pred(pred)
+
+        return pred
+
+    def default_sample_scoring(self, pred: str, gold: str, formatted_doc: Doc = None) -> int:
+        if self.type_exact_match == "prefix":
+            return 1 if pred.startswith(gold) else 0
+        if self.type_exact_match == "suffix":
+            return 1 if pred.endswith(gold) else 0
+        return 1 if gold == pred else 0
+
+    def g_pass_at_k(self, all_scores: list[int]) -> dict[str, float]:
+        """Computes G-Pass@k as detailed in http://arxiv.org/abs/2412.13147"""
+        c: int = sum(all_scores)
+        n: int = self.n
+        ks: list[int] = self.k
+        thresholds: list[float] = self.thresholds
+
+        def _compute_g_pass_at_k(n, c, k, m):
+            if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0:
+                return 0.0
+            return hypergeom.sf(m - 1, n, c, k)
+
+        def compute_g_pass_at_k(n, c, k, t):
+            m = max(int(np.ceil(k * t)), 1)
+            return _compute_g_pass_at_k(n, c, k, m)
+
+        def compute_mg_pass_at_k(n, c, k):
+            low, high = int(np.ceil(k * 0.5)), k
+
+            mg_pass_at_k = 0.0
+            for i in range(low + 1, high + 1):
+                mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i)
+            mg_pass_at_k = 2 * mg_pass_at_k / k
+
+            return mg_pass_at_k
+
+        metrics = {}
+        for k in ks:
+            for t in thresholds:
+                metrics[f"G-Pass@{k}_{t}"] = compute_g_pass_at_k(n, c, k, t)
+            metrics[f"mG-Pass@{k}"] = compute_mg_pass_at_k(n, c, k)
+
+        return metrics
+
+    @property
+    def all_metrics(self):
+        ks: list[int] = self.k
+        thresholds: list[float] = self.thresholds
+
+        metrics = []
+        for k in ks:
+            for t in thresholds:
+                metrics.append(f"G-Pass@{k}_{t}")
+            metrics.append(f"mG-Pass@{k}")
+
+        return metrics
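For intuition on the arithmetic in g_pass_at_k: with n generations of which c are correct, hypergeom.sf(m - 1, n, c, k) is the probability that a uniform draw of k generations (without replacement) contains at least m = ceil(t * k) correct ones, i.e. G-Pass@k at threshold t; mG-Pass@k then averages the tails for thresholds above 0.5. A small standalone sketch with made-up counts (not from the commit):

# Standalone sketch of the G-Pass@k arithmetic with made-up numbers.
import numpy as np
from scipy.stats import hypergeom

n, c, k = 48, 30, 16  # 48 generations, 30 of them correct, judged on draws of 16
for t in (0.0, 0.25, 0.5, 0.75, 1.0):
    m = max(int(np.ceil(k * t)), 1)
    # P(at least m correct among k draws without replacement)
    print(f"G-Pass@{k}_{t} = {hypergeom.sf(m - 1, n, c, k):.4f}")

# mG-Pass@k: scaled sum of the tail probabilities for m in (ceil(k/2), k]
low = int(np.ceil(k * 0.5))
mg = 2 * sum(hypergeom.sf(i - 1, n, c, k) for i in range(low + 1, k + 1)) / k
print(f"mG-Pass@{k} = {mg:.4f}")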

src/lighteval/tasks/default_tasks.py (+42 lines)

@@ -329,6 +329,20 @@
     ],
     version=1,
 )
+aime24_gpassk = LightevalTaskConfig(
+    name="aime24_gpassk",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,
+    hf_repo="HuggingFaceH4/aime_2024",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=8192,
+    metric=[Metrics.g_pass_at_16_expr_gold],
+    version=1,
+)
 aime25 = LightevalTaskConfig(
     name="aime25",
     suite=["lighteval"],

@@ -346,6 +360,20 @@
     ],
     version=1,
 )
+aime25_gpassk = LightevalTaskConfig(
+    name="aime25_gpassk",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,
+    hf_repo="yentinglin/aime_2025",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=8192,
+    metric=[Metrics.g_pass_at_16_expr_gold],
+    version=1,
+)
 anachronisms_bigbench = LightevalTaskConfig(
     name="anachronisms",
     suite=["bigbench", "bigbench_json"],

@@ -9661,6 +9689,20 @@
     metric=[Metrics.latex_gold_metric],
     version=1,
 )
+math_500_gpassk = LightevalTaskConfig(
+    name="math_500_gpassk",
+    suite=["lighteval"],
+    prompt_function=prompt.math_500,
+    hf_repo="HuggingFaceH4/MATH-500",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=8192,
+    metric=[Metrics.g_pass_at_16_latex_gold],
+    version=1,
+)
 math_algebra_lighteval = LightevalTaskConfig(
     name="math:algebra",
     suite=["lighteval", "math"],
