BHASA minimal pairs
yifanmai committed Jun 27, 2024
1 parent 8a80ac6 commit 939afd4
Showing 8 changed files with 117 additions and 71 deletions.
1 change: 1 addition & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -10,6 +10,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING: str = "multiple_choice_language_modeling"
ADAPT_RANKING_BINARY: str = "ranking_binary"

ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
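Editorial note: run entry conf files select this method by its string value, while Python code imports the constant by name. A minimal sketch of that correspondence (an illustrative snippet, not part of the commit):

from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING

# Run entries (see run_entries_bhasa.conf below) reference the string value;
# Python code, such as the adapter factory, imports the constant.
assert ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING == "multiple_choice_language_modeling"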
4 changes: 4 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -6,6 +6,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
AdapterSpec,
)
@@ -20,6 +21,7 @@
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_language_modeling_adapter import MultipleChoiceLanguageModelingAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService


@@ -42,6 +44,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
adapter = MultipleChoiceCalibratedAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING:
adapter = MultipleChoiceLanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_RANKING_BINARY:
adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_GENERATION_MULTIMODAL:
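As an illustrative sketch (not part of the commit), the new factory branch could be exercised as follows; the AdapterFactory class name and a ready-made TokenizerService are assumptions based on the surrounding HELM code:

from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
    AdapterSpec,
)
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory

def build_minimal_pairs_adapter(tokenizer_service):
    # The method string routes dispatch to MultipleChoiceLanguageModelingAdapter
    # through the elif branch added above.
    spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING, num_outputs=1, max_tokens=0)
    return AdapterFactory.get_adapter(spec, tokenizer_service)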
23 changes: 22 additions & 1 deletion src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -6,6 +6,7 @@
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
AdapterSpec,
)
@@ -66,7 +67,7 @@ def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
or
[reference_i]
"""
assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING}

return AdapterSpec(
method=method,
@@ -329,6 +330,26 @@ def get_language_modeling_adapter_spec() -> AdapterSpec:
)


def get_multiple_choice_language_modeling_adapter_spec() -> AdapterSpec:
"""
Used for minimal pairs scenarios.
"""
return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
instructions="",
input_prefix="",
input_suffix="",
reference_prefix="",
reference_suffix="",
output_prefix="",
output_suffix="",
max_train_instances=0,
num_outputs=1,
max_tokens=0,
temperature=0.0,
)


def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
"""
Used for summarization.
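A quick illustrative check of what the new spec encodes (hypothetical standalone snippet, not part of the commit): every affix is empty, so each reference is scored as a bare language-modeling continuation, zero-shot and without generating any tokens.

from helm.benchmark.adaptation.common_adapter_specs import (
    get_multiple_choice_language_modeling_adapter_spec,
)

spec = get_multiple_choice_language_modeling_adapter_spec()
# No prompt scaffolding: references are scored as raw text.
assert spec.input_prefix == spec.input_suffix == spec.reference_prefix == ""
# Zero-shot, deterministic, and generation-free.
assert spec.max_train_instances == 0 and spec.max_tokens == 0 and spec.temperature == 0.0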
3 changes: 2 additions & 1 deletion src/helm/benchmark/metrics/basic_metrics.py
@@ -17,6 +17,7 @@
from helm.benchmark.adaptation.adapters.adapter_factory import (
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
ADAPT_RANKING_BINARY,
)
from helm.benchmark.adaptation.request_state import RequestState
@@ -253,7 +254,7 @@ def compute_logprob_and_length(request_state: RequestState, window_service: WindowService) -> ReferenceStat:
reference_key = ReferenceKey(request_state.reference_index, request_state.request_mode)
reference_stats[reference_key] = compute_logprob_and_length(request_state, window_service)

if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY]:
if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY, ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING]:
reference_scores = [
reference_stats[ReferenceKey(i, "original")].logprob
/ reference_stats[ReferenceKey(i, "original")].num_tokens
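The branch above ranks references by mean logprob per token. A minimal standalone sketch of that scoring rule (hypothetical names, not HELM code):

from typing import List, Tuple

def pick_best_reference(stats: List[Tuple[float, int]]) -> int:
    """Return the index of the (logprob, num_tokens) pair with the highest logprob per token."""
    scores = [logprob / num_tokens for logprob, num_tokens in stats]
    return max(range(len(scores)), key=scores.__getitem__)

# For a minimal pair, the grammatical sentence should score higher, e.g.:
# pick_best_reference([(-12.3, 7), (-15.9, 7)]) == 0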
5 changes: 4 additions & 1 deletion src/helm/benchmark/presentation/run_entries_bhasa.conf
@@ -58,7 +58,10 @@ entries: [

### 1. Syntax: Minimal Pairs
### Use this to run the minimal pairs evaluation as an MCQ task
{description: "lindsea_syntax_minimal_pairs:model=text,method=mcq,language=id", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=npis_and_negation", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=argument_structure", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=filler_gap_dependencies", priority: 1},
{description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=morphology", priority: 1},

### Use this instead of the above to run the minimal pairs evaluation using logprobs
# {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_language_modeling,language=id", priority: 1},
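Editorial illustration (hypothetical snippet; HELM's actual entry parser is more involved): how the fields of a run entry description decompose into the scenario name and the keyword arguments consumed by the run spec function.

description = "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=morphology"
scenario_name, _, arg_string = description.partition(":")
args = dict(kv.split("=", 1) for kv in arg_string.split(","))
# args == {"model": "openai/gpt-3.5-turbo-0125", "method": "multiple_choice_joint", "subset": "morphology"}
# "method" and "subset" reach get_lindsea_syntax_minimal_pairs_spec; "model" selects the deployment.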
26 changes: 21 additions & 5 deletions src/helm/benchmark/run_specs/bhasa_run_specs.py
@@ -4,6 +4,8 @@
from helm.benchmark.adaptation.common_adapter_specs import (
get_generation_adapter_spec,
get_multiple_choice_separate_adapter_spec,
get_multiple_choice_joint_adapter_spec,
get_multiple_choice_language_modeling_adapter_spec,
)
from helm.benchmark.metrics.bhasa_metrics_specs import (
get_bhasa_machine_translation_metric_specs,
@@ -554,21 +556,35 @@ def get_xcopa_spec(language="id") -> RunSpec:


@run_spec_function("lindsea_syntax_minimal_pairs")
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "mcq") -> RunSpec:
name = f"lindsea_syntax_minimal_pairs_{language}"
if method == "mcq":
adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
else:
def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "multiple_choice_joint", subset: str = "all") -> RunSpec:
from helm.benchmark.scenarios.bhasa_scenario import LINDSEASyntaxMinimalPairsScenario
name = f"lindsea_syntax_minimal_pairs:language={language},method={method},subset={subset}"
if method == "multiple_choice_joint":
prompt_components = LINDSEASyntaxMinimalPairsScenario.LANGUAGE_TO_PROMPT_COMPONENTS[language]
instructions = prompt_components["instructions"]
output_prefix = prompt_components["output_prefix"]
adapter_spec = get_multiple_choice_joint_adapter_spec(
instructions=instructions,
input_noun=None,
output_noun=output_prefix
)
# adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
elif method == "multiple_choice_language_modeling":
adapter_spec = get_multiple_choice_language_modeling_adapter_spec()
elif method == "multiple_choice_separate_original":
adapter_spec = get_multiple_choice_separate_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
empty_input=True,
)
else:
raise ValueError(f"Unknown method {method}")

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
args={
"method": method,
"language": language,
"subset": subset,
},
)

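Illustrative direct call of the updated run spec function (an assumption for demonstration; in practice helm-run constructs it from a run entry):

from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_syntax_minimal_pairs_spec

run_spec = get_lindsea_syntax_minimal_pairs_spec(
    language="id",
    method="multiple_choice_language_modeling",
    subset="npis_and_negation",
)
print(run_spec.name)
# lindsea_syntax_minimal_pairs:language=id,method=multiple_choice_language_modeling,subset=npis_and_negation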
32 changes: 16 additions & 16 deletions src/helm/benchmark/scenarios/bhasa_scenario.py
@@ -1,7 +1,7 @@
import datasets
import os
import random
from typing import List, Dict
from typing import List, Dict, Optional

import pandas as pd

@@ -1525,16 +1525,18 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
description = "LINDSEA minimal pairs task"
tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]

def __init__(self, method: str, language: str):
LANGUAGE_TO_PROMPT_COMPONENTS: Dict[str, Dict[str, str]] = {
"id": {
"instructions": "Kalimat mana yang lebih mungkin?",
"output_prefix": "Jawablah dengan satu huruf saja, A atau B",
}
}

def __init__(self, method: str, language: str, subset: str = "all"):
super().__init__()
self.method = method
self.language = language
self.prompts = {
"id": {
"instructions": "Kalimat mana yang lebih mungkin?",
"output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
}
}
self.subset = subset

def download_dataset(self, output_path: str):
BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1545,8 +1547,10 @@ def download_dataset(self, output_path: str):
"morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
}

subsets = list(URLS.keys()) if self.subset == "all" else [self.subset]

data_files = {}
for file in list(URLS.keys()):
for file in subsets:
target_path_file = os.path.join(output_path, file)
ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
data_files[file] = pd.read_json(target_path_file, lines=True)
@@ -1569,15 +1573,11 @@ def get_instances(self, output_path: str) -> List[Instance]:
random.shuffle(options)
options_reversed = True if options[0][1] == 2 else False

prompt_components = self.prompts[self.language]
instructions = prompt_components["instructions"]
output_prefix = prompt_components["output_prefix"]
prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
input = Input(text=prompt)
input = Input(text="")
# Determine correct option based on whether shuffling reversed the options
references = [
Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
Reference(Output(text=options[0][0]), tags=[] if options_reversed else [CORRECT_TAG]),
Reference(Output(text=options[1][0]), tags=[CORRECT_TAG] if options_reversed else []),
]
instance = Instance(input=input, references=references, split=TEST_SPLIT)
outputs.append(instance)
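Standalone sketch of the pairing logic in get_instances (hypothetical helper, not part of the commit): shuffle the (correct, incorrect) pair so position does not leak the answer, then tag whichever option holds the grammatical sentence.

import random
from typing import List, Tuple

def shuffled_pair(correct: str, incorrect: str) -> List[Tuple[str, bool]]:
    """Return [(sentence, is_correct), ...] in random order."""
    options = [(correct, 1), (incorrect, 2)]
    random.shuffle(options)
    # Mirrors options_reversed above: label 1 marks the grammatical sentence.
    return [(text, label == 1) for text, label in options]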
94 changes: 47 additions & 47 deletions src/helm/benchmark/static/schema_bhasa.yaml
@@ -110,57 +110,57 @@ metric_groups:
############################################################

run_groups:
- name: bhasa
display_name: BHASA
description: BHASA scenarios
category: All scenarios
subgroups:
- bhasa_nlu
- bhasa_nlg
- bhasa_nlr
- bhasa_lindsea
# - name: bhasa
# display_name: BHASA
# description: BHASA scenarios
# category: All scenarios
# subgroups:
# - bhasa_nlu
# - bhasa_nlg
# - bhasa_nlr
# - bhasa_lindsea

- name: bhasa_nlu
display_name: BHASA natural language understanding (NLU)
description: BHASA natural language understanding (NLU) scenarios
category: BHASA scenarios
subgroups:
- tydiqa
- xquad
- indicqa
- nusax
- uitvsfc
- wisesight
- indicsentiment
- mlhsd
- vihsd
- thaitoxicitytweets
# - name: bhasa_nlu
# display_name: BHASA natural language understanding (NLU)
# description: BHASA natural language understanding (NLU) scenarios
# category: BHASA scenarios
# subgroups:
# - tydiqa
# - xquad
# - indicqa
# - nusax
# - uitvsfc
# - wisesight
# - indicsentiment
# - mlhsd
# - vihsd
# - thaitoxicitytweets

- name: bhasa_nlg
display_name: BHASA natural language generation (NLG)
description: BHASA natural language generation (NLG) scenarios
category: BHASA scenarios
subgroups:
- flores
# - name: bhasa_nlg
# display_name: BHASA natural language generation (NLG)
# description: BHASA natural language generation (NLG) scenarios
# category: BHASA scenarios
# subgroups:
# - flores

- name: bhasa_nlr
display_name: BHASA natural language reasoning (NLR)
description: BHASA natural language reasoning (NLR) scenarios
category: BHASA scenarios
subgroups:
- indonli
- xnli
- indicxnli
- xcopa
# - name: bhasa_nlr
# display_name: BHASA natural language reasoning (NLR)
# description: BHASA natural language reasoning (NLR) scenarios
# category: BHASA scenarios
# subgroups:
# - indonli
# - xnli
# - indicxnli
# - xcopa

- name: bhasa_lindsea
display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
category: BHASA scenarios
subgroups:
- lindsea_syntax_minimal_pairs
- lindsea_pragmatics_pragmatic_reasoning_single
- lindsea_pragmatics_pragmatic_reasoning_pair
display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
category: BHASA scenarios
subgroups:
- lindsea_syntax_minimal_pairs_id
- lindsea_pragmatics_pragmatic_reasoning_single
- lindsea_pragmatics_pragmatic_reasoning_pair

- name: tydiqa
display_name: TyDiQA
Expand Down Expand Up @@ -433,7 +433,7 @@ run_groups:
when: "?"
language: Indonesian, Tamil, Thai, Vietnamese

- name: lindsea_syntax_minimal_pairs
- name: lindsea_syntax_minimal_pairs_id
display_name: LINDSEA Syntax Minimal Pairs
description: >
LINDSEA minimal pairs is a linguistic diagnostic dataset for syntax from BHASA [(Leong et al., 2023)](https://arxiv.org/abs/2309.06085), consisting of pairs of sentences that differ minimally from each other and contrast in grammatical acceptability.
