BHASA syntax prototyping #2778

Draft
wants to merge 4 commits into base: main
Changes from all commits
1 change: 1 addition & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -10,6 +10,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING: str = "multiple_choice_language_modeling"
ADAPT_RANKING_BINARY: str = "ranking_binary"

ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
4 changes: 4 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -6,6 +6,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
AdapterSpec,
)
@@ -20,6 +21,7 @@
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_language_modeling_adapter import MultipleChoiceLanguageModelingAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService


@@ -42,6 +44,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
adapter = MultipleChoiceCalibratedAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING:
adapter = MultipleChoiceLanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_RANKING_BINARY:
adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_GENERATION_MULTIMODAL:
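
For reference, a minimal sketch of how the new method constant is intended to flow through this dispatcher. It assumes a helm install that includes this branch, a TokenizerService obtained from the usual benchmark setup, and AdapterFactory.get_adapter as the public entry point; treat it as an illustration rather than part of the diff.

from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
    AdapterSpec,
)
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory


def build_mc_lm_adapter(tokenizer_service):
    # With this change, the factory should dispatch to MultipleChoiceLanguageModelingAdapter.
    adapter_spec = AdapterSpec(
        method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
        max_tokens=0,  # echo-only requests, no generation
        temperature=0.0,
    )
    return AdapterFactory.get_adapter(adapter_spec, tokenizer_service)
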
94 changes: 35 additions & 59 deletions src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
@@ -1,4 +1,4 @@
from typing import List, Tuple, Optional
from typing import Iterator, List, Tuple, Optional

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.scenarios.scenario import Instance, EVAL_SPLITS
@@ -48,6 +48,35 @@ def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestStat
return all_request_states

def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
request_states: List[RequestState] = []
for prompt_text, num_conditioning_tokens in self.construct_language_modeling_prompts(eval_instance.input.text):
request = Request(
model=self.adapter_spec.model,
model_deployment=self.adapter_spec.model_deployment,
prompt=prompt_text,
num_completions=1,
temperature=0,
max_tokens=self.adapter_spec.max_tokens, # usually this is zero
stop_sequences=self.adapter_spec.stop_sequences,
echo_prompt=True,
random=self.adapter_spec.random,
)
request_state = RequestState(
instance=eval_instance,
reference_index=None,
request_mode=None,
train_trial_index=0,
output_mapping=None,
request=request,
result=None,
num_conditioning_tokens=num_conditioning_tokens,
num_train_instances=self.adapter_spec.max_train_instances,
prompt_truncated=False,
)
request_states.append(request_state)
return request_states

def construct_language_modeling_prompts(self, target_text: str, prefix: str = "") -> Iterator[Tuple[str, int]]:
"""
Adapted from https://github.com/EleutherAI/lm_perplexity/blob/main/lm_perplexity/utils.py.
"""
@@ -88,13 +117,12 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
max_request_length,
self.window_service.max_sequence_and_generated_tokens_length - self.adapter_spec.max_tokens,
)
prefix_token: str = self.window_service.prefix_token
prefix = prefix or self.window_service.prefix_token

encode_result: EncodeResult = self.window_service.encode(eval_instance.input.text)
encode_result: EncodeResult = self.window_service.encode(target_text)
tokens: List[TokenizationToken] = encode_result.tokens
text: str = encode_result.text

request_states: List[RequestState] = []
num_predicted_tokens: int = 0

# Special handling for first window: predict all tokens
@@ -114,33 +142,9 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:

# Handle max_sequence_and_generated_tokens_length
first_seq_len: int = min(max_sequence_length, len(tokens))
prompt_text, num_conditioning_tokens = self.construct_language_modeling_prompt(
self.window_service.encode(prefix_token).tokens, tokens[:first_seq_len], max_request_length, text
)
request = Request(
model=self.adapter_spec.model,
model_deployment=self.adapter_spec.model_deployment,
prompt=prompt_text,
num_completions=1,
temperature=0,
max_tokens=self.adapter_spec.max_tokens, # usually this is zero
stop_sequences=self.adapter_spec.stop_sequences,
echo_prompt=True,
random=self.adapter_spec.random,
yield self.construct_language_modeling_prompt(
self.window_service.encode(prefix).tokens, tokens[:first_seq_len], max_request_length, text
)
request_state = RequestState(
instance=eval_instance,
reference_index=None,
request_mode=None,
train_trial_index=0,
output_mapping=None,
request=request,
result=None,
num_conditioning_tokens=1 if len(prefix_token) > 0 else 0,
num_train_instances=self.adapter_spec.max_train_instances,
prompt_truncated=False,
)
request_states.append(request_state)
num_predicted_tokens += first_seq_len

while num_predicted_tokens < len(tokens):
@@ -162,37 +166,9 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
window_end - max_request_length : num_predicted_tokens
]
pred_tokens: List[TokenizationToken] = tokens[num_predicted_tokens:window_end]
prompt_text, num_conditioning_tokens = self.construct_language_modeling_prompt(
conditioning_tokens, pred_tokens, max_request_length, text
)

request = Request(
model=self.adapter_spec.model,
model_deployment=self.adapter_spec.model_deployment,
prompt=prompt_text,
num_completions=1,
temperature=0,
max_tokens=self.adapter_spec.max_tokens, # usually this is zero
stop_sequences=self.adapter_spec.stop_sequences,
echo_prompt=True,
)
request_state = RequestState(
instance=eval_instance,
reference_index=None,
request_mode=None,
train_trial_index=0,
output_mapping=None,
request=request,
result=None,
num_conditioning_tokens=num_conditioning_tokens,
num_train_instances=self.adapter_spec.max_train_instances,
prompt_truncated=False,
)
request_states.append(request_state)
yield self.construct_language_modeling_prompt(conditioning_tokens, pred_tokens, max_request_length, text)
num_predicted_tokens += window_pred_len

return request_states

def construct_language_modeling_prompt(
self,
conditioning_tokens: List[TokenizationToken],
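
The refactor above replaces two near-duplicate Request/RequestState blocks with a single construct_language_modeling_prompts generator that _generate_requests consumes. Below is a self-contained, simplified sketch of that sliding-window pattern: plain string tokens and a fixed request length stand in for the window service and TokenizationToken objects, so the names and numbers are illustrative assumptions rather than the adapter's actual API.

from typing import Iterator, List, Tuple


def sliding_window_prompts(
    tokens: List[str], max_request_length: int, prefix_token: str = "<s>"
) -> Iterator[Tuple[str, int]]:
    """Yield (prompt_text, num_conditioning_tokens) pairs that cover every token.

    First window: predict as many tokens as fit, conditioned only on the prefix token.
    Later windows: predict the next chunk, conditioned on the tokens just before it.
    """
    assert max_request_length >= 2
    max_sequence_length = max_request_length - 1  # leave room for conditioning
    num_predicted = 0

    first_len = min(max_sequence_length, len(tokens))
    yield prefix_token + "".join(tokens[:first_len]), 1 if prefix_token else 0
    num_predicted += first_len

    while num_predicted < len(tokens):
        window_end = min(num_predicted + max_sequence_length, len(tokens))
        conditioning = tokens[window_end - max_request_length : num_predicted]
        predicted = tokens[num_predicted:window_end]
        yield "".join(conditioning + predicted), len(conditioning)
        num_predicted += len(predicted)


# Ten single-character "tokens", at most four tokens per request:
for prompt, n_cond in sliding_window_prompts(list("abcdefghij"), max_request_length=4):
    print(repr(prompt), n_cond)

Each later prompt repeats a few already-scored tokens purely as conditioning; num_conditioning_tokens records how many so that downstream metrics can skip their logprobs.
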
23 changes: 23 additions & 0 deletions src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -6,6 +6,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
AdapterSpec,
)
@@ -220,6 +221,7 @@ def get_generation_adapter_spec(
stop_sequences: Optional[List] = None, # default value of `stop_sequences` is ["\n"]
temperature: float = 0.0,
multi_label: bool = False,
sample_train: bool = True,
) -> AdapterSpec:
"""
[instructions]
@@ -261,6 +263,7 @@ def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
temperature=temperature,
stop_sequences=stop_sequences,
multi_label=multi_label,
sample_train=sample_train,
)


@@ -329,6 +332,26 @@ def get_language_modeling_adapter_spec() -> AdapterSpec:
)


def get_multiple_choice_language_modeling_adapter_spec() -> AdapterSpec:
"""
Used for minimal pairs scenarios.
"""
return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
instructions="",
input_prefix="",
input_suffix="",
reference_prefix="",
reference_suffix="",
output_prefix="",
output_suffix="",
max_train_instances=0,
num_outputs=1,
max_tokens=0,
temperature=0.0,
)


def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
"""
Used for summarization.
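
A quick sanity check of the new helper, assuming a helm install that includes this branch; the assertions simply restate what the function above constructs.

from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING
from helm.benchmark.adaptation.common_adapter_specs import (
    get_multiple_choice_language_modeling_adapter_spec,
)

spec = get_multiple_choice_language_modeling_adapter_spec()
assert spec.method == ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING
assert spec.max_tokens == 0 and spec.temperature == 0.0  # echo-only, deterministic
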
3 changes: 2 additions & 1 deletion src/helm/benchmark/metrics/basic_metrics.py
@@ -17,6 +17,7 @@
from helm.benchmark.adaptation.adapters.adapter_factory import (
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
)
from helm.benchmark.adaptation.request_state import RequestState
@@ -253,7 +254,7 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind
reference_key = ReferenceKey(request_state.reference_index, request_state.request_mode)
reference_stats[reference_key] = compute_logprob_and_length(request_state, window_service)

if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY]:
if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY, ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING]:
reference_scores = [
reference_stats[ReferenceKey(i, "original")].logprob
/ reference_stats[ReferenceKey(i, "original")].num_tokens
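
With ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING added to this list, minimal-pair references are scored like the other logprob-based methods: the reference with the highest mean per-token logprob wins. A toy, HELM-free illustration of that selection rule, with invented numbers:

# Invented logprobs for a two-reference minimal pair (index 0 = the grammatical sentence).
reference_stats = [
    {"logprob": -12.4, "num_tokens": 8},
    {"logprob": -19.1, "num_tokens": 8},
]
reference_scores = [s["logprob"] / s["num_tokens"] for s in reference_stats]
predicted_index = max(range(len(reference_scores)), key=lambda i: reference_scores[i])
print(predicted_index)  # 0: the grammatical sentence has the higher normalized logprob
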
5 changes: 4 additions & 1 deletion src/helm/benchmark/presentation/run_entries_bhasa.conf
@@ -58,7 +58,10 @@ entries: [

### 1. Syntax: Minimal Pairs
### Use this to run the minimal pairs evaluation as an MCQ task
{description: "lindsea_syntax_minimal_pairs:model=text,method=mcq,language=id", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=npis_and_negation", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=argument_structure", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=filler_gap_dependencies", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=morphology", priority: 1},

### Use this instead of the above in order to run the minimal pairs evaluation using logprobs
# {description: "lindsea_syntax_minimal_pairs:model=text,method=probs,language=id" priority: 1},
33 changes: 28 additions & 5 deletions src/helm/benchmark/run_specs/bhasa_run_specs.py
@@ -1,9 +1,12 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
)
from helm.benchmark.adaptation.common_adapter_specs import (
get_generation_adapter_spec,
get_multiple_choice_separate_adapter_spec,
get_multiple_choice_joint_adapter_spec,
get_multiple_choice_language_modeling_adapter_spec,
)
from helm.benchmark.metrics.bhasa_metrics_specs import (
get_bhasa_machine_translation_metric_specs,
@@ -374,6 +377,7 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
output_noun=TRANSLATION_PROMPTS[pair]["output_noun"],
stop_sequences=["\n"],
max_tokens=256,
sample_train=False,
)

scenario_spec = ScenarioSpec(
@@ -554,21 +558,40 @@ def get_xcopa_spec(language="id") -> RunSpec:


@run_spec_function("lindsea_syntax_minimal_pairs")
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "mcq") -> RunSpec:
name = f"lindsea_syntax_minimal_pairs_{language}"
if method == "mcq":
adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
else:
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "multiple_choice_joint", subset: str = "all") -> RunSpec:
from helm.benchmark.scenarios.bhasa_scenario import LINDSEASyntaxMinimalPairsScenario
name = f"lindsea_syntax_minimal_pairs:language={language},method={method},subset={subset}"
if method == "multiple_choice_joint":
prompt_components = LINDSEASyntaxMinimalPairsScenario.LANGUAGE_TO_PROMPT_COMPONENTS[language]
instructions = prompt_components["instructions"]
output_prefix = prompt_components["output_prefix"]
adapter_spec = get_multiple_choice_joint_adapter_spec(
instructions=instructions,
input_noun=None,
output_noun=output_prefix
)
# adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
elif method == "multiple_choice_language_modeling":
adapter_spec = get_multiple_choice_language_modeling_adapter_spec()
elif method == "multiple_choice_separate_original":
adapter_spec = get_multiple_choice_separate_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
empty_input=True,
)
elif method == "multiple_choice_separate_calibrated":
adapter_spec = get_multiple_choice_separate_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
empty_input=True,
)
else:
raise ValueError(f"Unknown method {method}")

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
args={
"method": method,
"language": language,
"subset": subset,
},
)

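
A sketch of exercising the revised run-spec function directly, assuming a helm install that includes this branch; the argument values mirror the run entries in run_entries_bhasa.conf above.

from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_syntax_minimal_pairs_spec

# MCQ-style prompting over a single LINDSEA subset.
joint_spec = get_lindsea_syntax_minimal_pairs_spec(
    language="id", method="multiple_choice_joint", subset="npis_and_negation"
)
# Logprob-based scoring over all subsets via the new adapter.
lm_spec = get_lindsea_syntax_minimal_pairs_spec(
    language="id", method="multiple_choice_language_modeling", subset="all"
)
print(joint_spec.name)
print(lm_spec.name)
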
32 changes: 16 additions & 16 deletions src/helm/benchmark/scenarios/bhasa_scenario.py
@@ -1,7 +1,7 @@
import datasets
import os
import random
from typing import List, Dict
from typing import List, Dict, Optional

import pandas as pd

@@ -1525,16 +1525,18 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
description = "LINDSEA minimal pairs task"
tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]

def __init__(self, method: str, language: str):
LANGUAGE_TO_PROMPT_COMPONENTS: Dict[str, Dict[str, str]] = {
"id": {
"instructions": "Kalimat mana yang lebih mungkin?",
"output_prefix": "Jawablah dengan satu huruf saja, A atau B",
}
}

def __init__(self, method: str, language: str, subset: str = "all"):
super().__init__()
self.method = method
self.language = language
self.prompts = {
"id": {
"instructions": "Kalimat mana yang lebih mungkin?",
"output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
}
}
self.subset = subset

def download_dataset(self, output_path: str):
BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1545,8 +1547,10 @@ def download_dataset(self, output_path: str):
"morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
}

subsets = list(URLS.keys()) if self.subset == "all" else [self.subset]

data_files = {}
for file in list(URLS.keys()):
for file in subsets:
target_path_file = os.path.join(output_path, file)
ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
data_files[file] = pd.read_json(target_path_file, lines=True)
@@ -1569,15 +1573,11 @@ def get_instances(self, output_path: str) -> List[Instance]:
random.shuffle(options)
options_reversed = True if options[0][1] == 2 else False

prompt_components = self.prompts[self.language]
instructions = prompt_components["instructions"]
output_prefix = prompt_components["output_prefix"]
prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
input = Input(text=prompt)
input = Input(text="")
# Determine correct option based on whether shuffling reversed the options
references = [
Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
Reference(Output(text=options[0][0]), tags=[] if options_reversed else [CORRECT_TAG]),
Reference(Output(text=options[1][0]), tags=[CORRECT_TAG] if options_reversed else []),
]
instance = Instance(input=input, references=references, split=TEST_SPLIT)
outputs.append(instance)
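
The change above moves the pair of sentences out of the prompt and into the references themselves, so CORRECT_TAG has to follow the grammatical sentence to whichever slot it lands in after shuffling. A self-contained illustration of that bookkeeping, with placeholder strings standing in for a real LINDSEA pair:

import random

CORRECT_TAG = "correct"

# (sentence, label): label 1 marks the grammatical option, label 2 the ungrammatical one.
options = [("<grammatical sentence>", 1), ("<ungrammatical sentence>", 2)]
random.shuffle(options)
options_reversed = options[0][1] == 2

references = [
    {"output": options[0][0], "tags": [] if options_reversed else [CORRECT_TAG]},
    {"output": options[1][0], "tags": [CORRECT_TAG] if options_reversed else []},
]
print(references)
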