diff --git a/src/helm/benchmark/adaptation/common_adapter_specs.py b/src/helm/benchmark/adaptation/common_adapter_specs.py
index b38ae84692..f6e32526a9 100644
--- a/src/helm/benchmark/adaptation/common_adapter_specs.py
+++ b/src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -67,7 +67,7 @@ def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = F
     or
     [reference_i]
     """
-    assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, ADAPT_MULTIPLE_CHOICE_SEPARATE_LANGUAGE_MODELING}
+    assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}

     return AdapterSpec(
         method=method,
diff --git a/src/helm/benchmark/presentation/run_entries_bhasa.conf b/src/helm/benchmark/presentation/run_entries_bhasa.conf
index 2798c667ea..1e833e86c7 100644
--- a/src/helm/benchmark/presentation/run_entries_bhasa.conf
+++ b/src/helm/benchmark/presentation/run_entries_bhasa.conf
@@ -58,10 +58,10 @@ entries: [

     ### 1. Syntax: Minimal Pairs
     ### Use this to run the minimal pairs evaluation as a MCQ task
-    {description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=npis_and_negation", priority: 1},
-    {description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=argument_structure", priority: 1},
-    {description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=filler_gap_dependencies", priority: 1},
-    {description: "lindsea_syntax_minimal_pairs:model=openai/gpt-3.5-turbo-0125,method=multiple_choice_joint,subset=morphology", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=npis_and_negation", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=argument_structure", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=filler_gap_dependencies", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=morphology", priority: 1},

     ### Use this instead of the above in order to run the minimal pairs evaluation using logprobs
     # {description: "lindsea_syntax_minimal_pairs:model=text,method=probs,language=id" priority: 1},
diff --git a/src/helm/benchmark/run_specs/bhasa_run_specs.py b/src/helm/benchmark/run_specs/bhasa_run_specs.py
index 84f768eb2e..043d7803af 100644
--- a/src/helm/benchmark/run_specs/bhasa_run_specs.py
+++ b/src/helm/benchmark/run_specs/bhasa_run_specs.py
@@ -1,5 +1,6 @@
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
 )
 from helm.benchmark.adaptation.common_adapter_specs import (
     get_generation_adapter_spec,
@@ -576,6 +577,11 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
             method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
             empty_input=True,
         )
+    elif method == "multiple_choice_separate_calibrated":
+        adapter_spec = get_multiple_choice_separate_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+            empty_input=True,
+        )
     else:
         raise ValueError(f"Unknown method {method}")

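
Not part of the patch: a minimal usage sketch, assuming this change is applied and HELM is importable, showing that the new "multiple_choice_separate_calibrated" method string routes the LINDSEA run spec to the calibrated adapter. The direct call to get_lindsea_syntax_minimal_pairs_spec and the attribute check are illustrative only, not how HELM normally invokes run spec functions (that happens via run entries such as those in run_entries_bhasa.conf).

# Sketch (assumes HELM is installed with this patch applied); not part of the diff.
from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED
from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_syntax_minimal_pairs_spec

# Build the run spec with the newly supported method string and confirm
# that the dispatch in get_lindsea_syntax_minimal_pairs_spec picks the
# calibrated multiple-choice-separate adapter.
run_spec = get_lindsea_syntax_minimal_pairs_spec(
    language="id",
    method="multiple_choice_separate_calibrated",
)
assert run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED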