BHASA minimal pairs
yifanmai committed Jun 27, 2024
1 parent 8a80ac6 commit 939afd4
Showing 8 changed files with 117 additions and 71 deletions.
1 change: 1 addition & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -10,6 +10,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING: str = "multiple_choice_language_modeling"
ADAPT_RANKING_BINARY: str = "ranking_binary"

ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
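Editorial note: run entry conf files select this method by its string value, while Python code imports the constant by name. A minimal sketch of that correspondence (an illustrative snippet, not part of the commit):

from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING

# Run entries (see run_entries_bhasa.conf below) reference the string value;
# Python code, such as the adapter factory, imports the constant.
assert ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING == "multiple_choice_language_modeling"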
4 changes: 4 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -6,6 +6,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
AdapterSpec,
)
@@ -20,6 +21,7 @@
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_language_modeling_adapter import MultipleChoiceLanguageModelingAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService


@@ -42,6 +44,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
adapter = MultipleChoiceCalibratedAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING:
adapter = MultipleChoiceLanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_RANKING_BINARY:
adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_GENERATION_MULTIMODAL:
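As an illustrative sketch (not part of the commit), the new factory branch could be exercised as follows; the AdapterFactory class name and a ready-made TokenizerService are assumptions based on the surrounding HELM code:

from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
    AdapterSpec,
)
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory

def build_minimal_pairs_adapter(tokenizer_service):
    # The method string routes dispatch to MultipleChoiceLanguageModelingAdapter
    # through the elif branch added above.
    spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING, num_outputs=1, max_tokens=0)
    return AdapterFactory.get_adapter(spec, tokenizer_service)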
23 changes: 22 additions & 1 deletion src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -6,6 +6,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
AdapterSpec,
)
@@ -66,7 +67,7 @@ def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
or
[reference_i]
"""
assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING}

return AdapterSpec(
method=method,
@@ -329,6 +330,26 @@ def get_language_modeling_adapter_spec() -> AdapterSpec:
)


def get_multiple_choice_language_modeling_adapter_spec() -> AdapterSpec:
"""
Used for minimal pairs scenarios.
"""
return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
instructions="",
input_prefix="",
input_suffix="",
reference_prefix="",
reference_suffix="",
output_prefix="",
output_suffix="",
max_train_instances=0,
num_outputs=1,
max_tokens=0,
temperature=0.0,
)


def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
"""
Used for summarization.
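A quick illustrative check of what the new spec encodes (hypothetical standalone snippet, not part of the commit): every affix is empty, so each reference is scored as a bare language-modeling continuation, zero-shot and without generating any tokens.

from helm.benchmark.adaptation.common_adapter_specs import (
    get_multiple_choice_language_modeling_adapter_spec,
)

spec = get_multiple_choice_language_modeling_adapter_spec()
# No prompt scaffolding: references are scored as raw text.
assert spec.input_prefix == spec.input_suffix == spec.reference_prefix == ""
# Zero-shot, deterministic, and generation-free.
assert spec.max_train_instances == 0 and spec.max_tokens == 0 and spec.temperature == 0.0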
3 changes: 2 additions & 1 deletion src/helm/benchmark/metrics/basic_metrics.py
@@ -17,6 +17,7 @@
from helm.benchmark.adaptation.adapters.adapter_factory import (
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
)
from helm.benchmark.adaptation.request_state import RequestState
@@ -253,7 +254,7 @@ def compute_logprob_and_length(request_state: RequestState, window_service: WindowService) -> ReferenceStat:
reference_key = ReferenceKey(request_state.reference_index, request_state.request_mode)
reference_stats[reference_key] = compute_logprob_and_length(request_state, window_service)

if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY]:
if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY, ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING]:
reference_scores = [
reference_stats[ReferenceKey(i, "original")].logprob
/ reference_stats[ReferenceKey(i, "original")].num_tokens
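The branch above ranks references by mean logprob per token. A minimal standalone sketch of that scoring rule (hypothetical names, not HELM code):

from typing import List, Tuple

def pick_best_reference(stats: List[Tuple[float, int]]) -> int:
    """Return the index of the (logprob, num_tokens) pair with the highest logprob per token."""
    scores = [logprob / num_tokens for logprob, num_tokens in stats]
    return max(range(len(scores)), key=scores.__getitem__)

# For a minimal pair, the grammatical sentence should score higher, e.g.:
# pick_best_reference([(-12.3, 7), (-15.9, 7)]) == 0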
5 changes: 4 additions & 1 deletion src/helm/benchmark/presentation/run_entries_bhasa.conf
@@ -58,7 +58,10 @@ entries: [

### 1. Syntax: Minimal Pairs
### Use this to run the minimal pairs evaluation as an MCQ task
{description: "lindsea_syntax_minimal_pairs:model=text,method=mcq,language=id", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=npis_and_negation", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=argument_structure", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=filler_gap_dependencies", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=morphology", priority: 1},

### Use this instead of the above to run the minimal pairs evaluation using logprobs
# {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_language_modeling,language=id", priority: 1},
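Editorial illustration (hypothetical snippet; HELM's actual entry parser is more involved): how the fields of a run entry description decompose into the scenario name and the keyword arguments consumed by the run spec function.

description = "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=morphology"
scenario_name, _, arg_string = description.partition(":")
args = dict(kv.split("=", 1) for kv in arg_string.split(","))
# args == {"model": "openai/gpt-3.5-turbo-0125", "method": "multiple_choice_joint", "subset": "morphology"}
# "method" and "subset" reach get_lindsea_syntax_minimal_pairs_spec; "model" selects the deployment.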
26 changes: 21 additions & 5 deletions src/helm/benchmark/run_specs/bhasa_run_specs.py
@@ -4,6 +4,8 @@
from helm.benchmark.adaptation.common_adapter_specs import (
get_generation_adapter_spec,
get_multiple_choice_separate_adapter_spec,
get_multiple_choice_joint_adapter_spec,
get_multiple_choice_language_modeling_adapter_spec,
)
from helm.benchmark.metrics.bhasa_metrics_specs import (
get_bhasa_machine_translation_metric_specs,
@@ -554,21 +556,35 @@ def get_xcopa_spec(language="id") -> RunSpec:


@run_spec_function("lindsea_syntax_minimal_pairs")
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "mcq") -> RunSpec:
name = f"lindsea_syntax_minimal_pairs_{language}"
if method == "mcq":
adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
else:
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "multiple_choice_joint", subset: str = "all") -> RunSpec:
from helm.benchmark.scenarios.bhasa_scenario import LINDSEASyntaxMinimalPairsScenario
name = f"lindsea_syntax_minimal_pairs:language={language},method={method},subset={subset}"
if method == "multiple_choice_joint":
prompt_components = LINDSEASyntaxMinimalPairsScenario.LANGUAGE_TO_PROMPT_COMPONENTS[language]
instructions = prompt_components["instructions"]
output_prefix = prompt_components["output_prefix"]
adapter_spec = get_multiple_choice_joint_adapter_spec(
instructions=instructions,
input_noun=None,
output_noun=output_prefix
)
# adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
elif method == "multiple_choice_language_modeling":
adapter_spec = get_multiple_choice_language_modeling_adapter_spec()
elif method == "multiple_choice_separate_original":
adapter_spec = get_multiple_choice_separate_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
empty_input=True,
)
else:
raise ValueError(f"Unknown method {method}")

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
args={
"method": method,
"language": language,
"subset": subset,
},
)

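Illustrative direct call of the updated run spec function (an assumption for demonstration; in practice helm-run constructs it from a run entry):

from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_syntax_minimal_pairs_spec

run_spec = get_lindsea_syntax_minimal_pairs_spec(
    language="id",
    method="multiple_choice_language_modeling",
    subset="npis_and_negation",
)
print(run_spec.name)
# lindsea_syntax_minimal_pairs:language=id,method=multiple_choice_language_modeling,subset=npis_and_negation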
32 changes: 16 additions & 16 deletions src/helm/benchmark/scenarios/bhasa_scenario.py
@@ -1,7 +1,7 @@
import datasets
import os
import random
from typing import List, Dict
from typing import List, Dict, Optional

import pandas as pd

@@ -1525,16 +1525,18 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
description = "LINDSEA minimal pairs task"
tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]

def __init__(self, method: str, language: str):
LANGUAGE_TO_PROMPT_COMPONENTS: Dict[str, Dict[str, str]] = {
"id": {
"instructions": "Kalimat mana yang lebih mungkin?",
"output_prefix": "Jawablah dengan satu huruf saja, A atau B",
}
}

def __init__(self, method: str, language: str, subset: str = "all"):
super().__init__()
self.method = method
self.language = language
self.prompts = {
"id": {
"instructions": "Kalimat mana yang lebih mungkin?",
"output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
}
}
self.subset = subset

def download_dataset(self, output_path: str):
BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1545,8 +1547,10 @@ def download_dataset(self, output_path: str):
"morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
}

subsets = list(URLS.keys()) if self.subset == "all" else [self.subset]

data_files = {}
for file in list(URLS.keys()):
for file in subsets:
target_path_file = os.path.join(output_path, file)
ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
data_files[file] = pd.read_json(target_path_file, lines=True)
@@ -1569,15 +1573,11 @@ def get_instances(self, output_path: str) -> List[Instance]:
random.shuffle(options)
options_reversed = True if options[0][1] == 2 else False

prompt_components = self.prompts[self.language]
instructions = prompt_components["instructions"]
output_prefix = prompt_components["output_prefix"]
prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
input = Input(text=prompt)
input = Input(text="")
# Determine correct option based on whether shuffling reversed the options
references = [
Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
Reference(Output(text=options[0][0]), tags=[] if options_reversed else [CORRECT_TAG]),
Reference(Output(text=options[1][0]), tags=[CORRECT_TAG] if options_reversed else []),
]
instance = Instance(input=input, references=references, split=TEST_SPLIT)
outputs.append(instance)
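Standalone sketch of the pairing logic in get_instances (hypothetical helper, not part of the commit): shuffle the (correct, incorrect) pair so position does not leak the answer, then tag whichever option holds the grammatical sentence.

import random
from typing import List, Tuple

def shuffled_pair(correct: str, incorrect: str) -> List[Tuple[str, bool]]:
    """Return [(sentence, is_correct), ...] in random order."""
    options = [(correct, 1), (incorrect, 2)]
    random.shuffle(options)
    # Mirrors options_reversed above: label 1 marks the grammatical sentence.
    return [(text, label == 1) for text, label in options]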
94 changes: 47 additions & 47 deletions src/helm/benchmark/static/schema_bhasa.yaml
@@ -110,57 +110,57 @@ metric_groups:
############################################################

run_groups:
- name: bhasa
display_name: BHASA
description: BHASA scenarios
category: All scenarios
subgroups:
- bhasa_nlu
- bhasa_nlg
- bhasa_nlr
- bhasa_lindsea
# - name: bhasa
# display_name: BHASA
# description: BHASA scenarios
# category: All scenarios
# subgroups:
# - bhasa_nlu
# - bhasa_nlg
# - bhasa_nlr
# - bhasa_lindsea

- name: bhasa_nlu
display_name: BHASA natural language understanding (NLU)
description: BHASA natural language understanding (NLU) scenarios
category: BHASA scenarios
subgroups:
- tydiqa
- xquad
- indicqa
- nusax
- uitvsfc
- wisesight
- indicsentiment
- mlhsd
- vihsd
- thaitoxicitytweets
# - name: bhasa_nlu
# display_name: BHASA natural language understanding (NLU)
# description: BHASA natural language understanding (NLU) scenarios
# category: BHASA scenarios
# subgroups:
# - tydiqa
# - xquad
# - indicqa
# - nusax
# - uitvsfc
# - wisesight
# - indicsentiment
# - mlhsd
# - vihsd
# - thaitoxicitytweets

- name: bhasa_nlg
display_name: BHASA natural language generation (NLG)
description: BHASA natural language generation (NLG) scenarios
category: BHASA scenarios
subgroups:
- flores
# - name: bhasa_nlg
# display_name: BHASA natural language generation (NLG)
# description: BHASA natural language generation (NLG) scenarios
# category: BHASA scenarios
# subgroups:
# - flores

- name: bhasa_nlr
display_name: BHASA natural language reasoning (NLR)
description: BHASA natural language reasoning (NLR) scenarios
category: BHASA scenarios
subgroups:
- indonli
- xnli
- indicxnli
- xcopa
# - name: bhasa_nlr
# display_name: BHASA natural language reasoning (NLR)
# description: BHASA natural language reasoning (NLR) scenarios
# category: BHASA scenarios
# subgroups:
# - indonli
# - xnli
# - indicxnli
# - xcopa

- name: bhasa_lindsea
display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
category: BHASA scenarios
subgroups:
- lindsea_syntax_minimal_pairs
- lindsea_pragmatics_pragmatic_reasoning_single
- lindsea_pragmatics_pragmatic_reasoning_pair
display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
category: BHASA scenarios
subgroups:
- lindsea_syntax_minimal_pairs_id
- lindsea_pragmatics_pragmatic_reasoning_single
- lindsea_pragmatics_pragmatic_reasoning_pair

- name: tydiqa
display_name: TyDiQA
Expand Down Expand Up @@ -433,7 +433,7 @@ run_groups:
when: "?"
language: Indonesian, Tamil, Thai, Vietnamese

- name: lindsea_syntax_minimal_pairs
- name: lindsea_syntax_minimal_pairs_id
display_name: LINDSEA Syntax Minimal Pairs
description: >
LINDSEA minimal pairs is a linguistic diagnostic dataset for syntax from BHASA [(Leong et al., 2023)](https://arxiv.org/abs/2309.06085), consisting of pairs of sentences that differ minimally from each other and contrast in grammatical acceptability.
