diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index 370b53ce88..8f3b2c99a2 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -10,6 +10,7 @@
 ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
 ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
 ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
+ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING: str = "multiple_choice_language_modeling"
 ADAPT_RANKING_BINARY: str = "ranking_binary"

 ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
diff --git a/src/helm/benchmark/adaptation/adapters/adapter_factory.py b/src/helm/benchmark/adaptation/adapters/adapter_factory.py
index 11f7925f1e..3faff8a5fb 100644
--- a/src/helm/benchmark/adaptation/adapters/adapter_factory.py
+++ b/src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -6,6 +6,7 @@
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+    ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_RANKING_BINARY,
     AdapterSpec,
 )
@@ -20,6 +21,9 @@
 from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
 from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
 from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_language_modeling_adapter import (
+    MultipleChoiceLanguageModelingAdapter,
+)
 from helm.benchmark.window_services.tokenizer_service import TokenizerService


@@ -42,6 +46,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
         adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
     elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
         adapter = MultipleChoiceCalibratedAdapter(adapter_spec, tokenizer_service)
+    elif method == ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING:
+        adapter = MultipleChoiceLanguageModelingAdapter(adapter_spec, tokenizer_service)
     elif method == ADAPT_RANKING_BINARY:
         adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
     elif method == ADAPT_GENERATION_MULTIMODAL:
diff --git a/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py b/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
index a7c9120998..d6ac145a2b 100644
--- a/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Optional
+from typing import Iterator, List, Tuple, Optional

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance, EVAL_SPLITS
@@ -48,6 +48,35 @@ def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestStat
         return all_request_states

     def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
+        request_states: List[RequestState] = []
+        for prompt_text, num_conditioning_tokens in self.construct_language_modeling_prompts(eval_instance.input.text):
+            request = Request(
+                model=self.adapter_spec.model,
+                model_deployment=self.adapter_spec.model_deployment,
+                prompt=prompt_text,
+                num_completions=1,
+                temperature=0,
+                max_tokens=self.adapter_spec.max_tokens,  # usually this is zero
+                stop_sequences=self.adapter_spec.stop_sequences,
+                echo_prompt=True,
+                random=self.adapter_spec.random,
+            )
+            request_state = RequestState(
+                instance=eval_instance,
+                reference_index=None,
+                request_mode=None,
+                train_trial_index=0,
+                output_mapping=None,
+                request=request,
+                result=None,
+                num_conditioning_tokens=num_conditioning_tokens,
+                num_train_instances=self.adapter_spec.max_train_instances,
+                prompt_truncated=False,
+            )
+            request_states.append(request_state)
+        return request_states
+
+    def construct_language_modeling_prompts(self, target_text: str, prefix: str = "") -> Iterator[Tuple[str, int]]:
         """
         Adapted from https://github.com/EleutherAI/lm_perplexity/blob/main/lm_perplexity/utils.py.
         """
@@ -88,13 +117,12 @@
             max_request_length,
             self.window_service.max_sequence_and_generated_tokens_length - self.adapter_spec.max_tokens,
         )
-        prefix_token: str = self.window_service.prefix_token
+        prefix = prefix or self.window_service.prefix_token

-        encode_result: EncodeResult = self.window_service.encode(eval_instance.input.text)
+        encode_result: EncodeResult = self.window_service.encode(target_text)
         tokens: List[TokenizationToken] = encode_result.tokens
         text: str = encode_result.text

-        request_states: List[RequestState] = []
         num_predicted_tokens: int = 0

         # Special handling for first window: predict all tokens
@@ -114,33 +142,9 @@

         # Handle max_sequence_and_generated_tokens_length
         first_seq_len: int = min(max_sequence_length, len(tokens))
-        prompt_text, num_conditioning_tokens = self.construct_language_modeling_prompt(
-            self.window_service.encode(prefix_token).tokens, tokens[:first_seq_len], max_request_length, text
-        )
-        request = Request(
-            model=self.adapter_spec.model,
-            model_deployment=self.adapter_spec.model_deployment,
-            prompt=prompt_text,
-            num_completions=1,
-            temperature=0,
-            max_tokens=self.adapter_spec.max_tokens,  # usually this is zero
-            stop_sequences=self.adapter_spec.stop_sequences,
-            echo_prompt=True,
-            random=self.adapter_spec.random,
+        yield self.construct_language_modeling_prompt(
+            self.window_service.encode(prefix).tokens, tokens[:first_seq_len], max_request_length, text
         )
-        request_state = RequestState(
-            instance=eval_instance,
-            reference_index=None,
-            request_mode=None,
-            train_trial_index=0,
-            output_mapping=None,
-            request=request,
-            result=None,
-            num_conditioning_tokens=1 if len(prefix_token) > 0 else 0,
-            num_train_instances=self.adapter_spec.max_train_instances,
-            prompt_truncated=False,
-        )
-        request_states.append(request_state)
         num_predicted_tokens += first_seq_len

         while num_predicted_tokens < len(tokens):
@@ -162,37 +166,9 @@
                 window_end - max_request_length : num_predicted_tokens
             ]
             pred_tokens: List[TokenizationToken] = tokens[num_predicted_tokens:window_end]
-            prompt_text, num_conditioning_tokens = self.construct_language_modeling_prompt(
-                conditioning_tokens, pred_tokens, max_request_length, text
-            )
-
-            request = Request(
-                model=self.adapter_spec.model,
-                model_deployment=self.adapter_spec.model_deployment,
-                prompt=prompt_text,
-                num_completions=1,
-                temperature=0,
-                max_tokens=self.adapter_spec.max_tokens,  # usually this is zero
-                stop_sequences=self.adapter_spec.stop_sequences,
-                echo_prompt=True,
-            )
-            request_state = RequestState(
-                instance=eval_instance,
-                reference_index=None,
-                request_mode=None,
-                train_trial_index=0,
-                output_mapping=None,
-                request=request,
-                result=None,
-                num_conditioning_tokens=num_conditioning_tokens,
-                num_train_instances=self.adapter_spec.max_train_instances,
-                prompt_truncated=False,
-            )
-            request_states.append(request_state)
+            yield self.construct_language_modeling_prompt(conditioning_tokens, pred_tokens, max_request_length, text)
             num_predicted_tokens += window_pred_len

-        return request_states
-
     def construct_language_modeling_prompt(
         self,
         conditioning_tokens: List[TokenizationToken],
diff --git a/src/helm/benchmark/adaptation/common_adapter_specs.py b/src/helm/benchmark/adaptation/common_adapter_specs.py
index 3b6a7659c5..1c79196729 100644
--- a/src/helm/benchmark/adaptation/common_adapter_specs.py
+++ b/src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -6,6 +6,7 @@
     ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_RANKING_BINARY,
     AdapterSpec,
 )
@@ -220,6 +221,7 @@ def get_generation_adapter_spec(
     stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
     temperature: float = 0.0,
     multi_label: bool = False,
+    sample_train: bool = True,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -261,6 +263,7 @@ def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
         temperature=temperature,
         stop_sequences=stop_sequences,
         multi_label=multi_label,
+        sample_train=sample_train,
     )


@@ -329,6 +332,26 @@ def get_language_modeling_adapter_spec() -> AdapterSpec:
     )


+def get_multiple_choice_language_modeling_adapter_spec() -> AdapterSpec:
+    """
+    Used for minimal pairs scenarios.
+    """
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        reference_prefix="",
+        reference_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=0,
+        temperature=0.0,
+    )
+
+
 def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
     """
     Used for summarization.
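
[Editor's note] The new adapter module wired into adapter_factory.py above, multiple_choice_language_modeling_adapter.py, is not included in this excerpt of the diff. For orientation, here is a minimal sketch of what such an adapter could plausibly contain, reusing the construct_language_modeling_prompts generator that this patch factors out of LanguageModelingAdapter. The class name, the _generate_requests override, and the ReferenceKey(i, "original") convention are taken from this diff; everything else is an assumption, not the actual implementation:

    from typing import List

    from helm.benchmark.adaptation.request_state import RequestState
    from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
    from helm.benchmark.scenarios.scenario import Instance
    from helm.common.request import Request


    class MultipleChoiceLanguageModelingAdapter(LanguageModelingAdapter):
        """Score each reference of a multiple-choice instance as a language
        modeling continuation; the metrics code then picks the reference with
        the highest per-token logprob."""

        def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
            request_states: List[RequestState] = []
            for reference_index, reference in enumerate(eval_instance.references):
                # One windowed LM request per reference, via the shared generator.
                for prompt_text, num_conditioning_tokens in self.construct_language_modeling_prompts(
                    reference.output.text
                ):
                    request = Request(
                        model=self.adapter_spec.model,
                        model_deployment=self.adapter_spec.model_deployment,
                        prompt=prompt_text,
                        num_completions=1,
                        temperature=0,
                        max_tokens=self.adapter_spec.max_tokens,
                        stop_sequences=self.adapter_spec.stop_sequences,
                        echo_prompt=True,
                        random=self.adapter_spec.random,
                    )
                    request_states.append(
                        RequestState(
                            instance=eval_instance,
                            # request_mode="original" matches the ReferenceKey(i, "original")
                            # lookup added to basic_metrics.py below.
                            reference_index=reference_index,
                            request_mode="original",
                            train_trial_index=0,
                            output_mapping=None,
                            request=request,
                            result=None,
                            num_conditioning_tokens=num_conditioning_tokens,
                            num_train_instances=self.adapter_spec.max_train_instances,
                            prompt_truncated=False,
                        )
                    )
            return request_states
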
diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py
index 03d6c113f4..fb2f82f870 100644
--- a/src/helm/benchmark/metrics/basic_metrics.py
+++ b/src/helm/benchmark/metrics/basic_metrics.py
@@ -17,6 +17,7 @@
 from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+    ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
     ADAPT_RANKING_BINARY,
 )
 from helm.benchmark.adaptation.request_state import RequestState
@@ -253,7 +254,11 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind
         reference_key = ReferenceKey(request_state.reference_index, request_state.request_mode)
         reference_stats[reference_key] = compute_logprob_and_length(request_state, window_service)

-    if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY]:
+    if adapter_spec.method in [
+        ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+        ADAPT_RANKING_BINARY,
+        ADAPT_MULTIPLE_CHOICE_LANGUAGE_MODELING,
+    ]:
         reference_scores = [
             reference_stats[ReferenceKey(i, "original")].logprob
             / reference_stats[ReferenceKey(i, "original")].num_tokens
diff --git a/src/helm/benchmark/presentation/run_entries_bhasa.conf b/src/helm/benchmark/presentation/run_entries_bhasa.conf
index a8b36ed773..1e833e86c7 100644
--- a/src/helm/benchmark/presentation/run_entries_bhasa.conf
+++ b/src/helm/benchmark/presentation/run_entries_bhasa.conf
@@ -58,7 +58,10 @@ entries: [

     ### 1. Syntax: Minimal Pairs
     ### Use this to run the minimal pairs evaluation as a MCQ task
-    {description: "lindsea_syntax_minimal_pairs:model=text,method=mcq,language=id", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=npis_and_negation", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=argument_structure", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=filler_gap_dependencies", priority: 1},
+    {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_joint,subset=morphology", priority: 1},

     ### Use this instead of the above in order to run the minimal pairs evaluation using logprobs
-    # {description: "lindsea_syntax_minimal_pairs:model=text,method=probs,language=id" priority: 1},
+    # {description: "lindsea_syntax_minimal_pairs:model=text,method=multiple_choice_language_modeling,subset=all", priority: 1},
diff --git a/src/helm/benchmark/run_specs/bhasa_run_specs.py b/src/helm/benchmark/run_specs/bhasa_run_specs.py
index 89fe9de114..cfc21e821e 100644
--- a/src/helm/benchmark/run_specs/bhasa_run_specs.py
+++ b/src/helm/benchmark/run_specs/bhasa_run_specs.py
@@ -1,9 +1,12 @@
 from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
 )
 from helm.benchmark.adaptation.common_adapter_specs import (
     get_generation_adapter_spec,
+    get_multiple_choice_joint_adapter_spec,
+    get_multiple_choice_language_modeling_adapter_spec,
     get_multiple_choice_separate_adapter_spec,
 )
 from helm.benchmark.metrics.bhasa_metrics_specs import (
     get_bhasa_machine_translation_metric_specs,
@@ -374,6 +377,7 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
         output_noun=TRANSLATION_PROMPTS[pair]["output_noun"],
         stop_sequences=["\n"],
         max_tokens=256,
+        sample_train=False,
     )

     scenario_spec = ScenarioSpec(
@@ -554,21 +558,42 @@ def get_xcopa_spec(language="id") -> RunSpec:


 @run_spec_function("lindsea_syntax_minimal_pairs")
-def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "mcq") -> RunSpec:
-    name = f"lindsea_syntax_minimal_pairs_{language}"
-    if method == "mcq":
-        adapter_spec = get_generation_adapter_spec(output_noun=LINDSEA_OUTPUT_NOUNS[language], max_tokens=2)
-    else:
+def get_lindsea_syntax_minimal_pairs_spec(
+    language: str = "id", method: str = "multiple_choice_joint", subset: str = "all"
+) -> RunSpec:
+    from helm.benchmark.scenarios.bhasa_scenario import LINDSEASyntaxMinimalPairsScenario
+
+    name = f"lindsea_syntax_minimal_pairs:language={language},method={method},subset={subset}"
+    if method == "multiple_choice_joint":
+        prompt_components = LINDSEASyntaxMinimalPairsScenario.LANGUAGE_TO_PROMPT_COMPONENTS[language]
+        instructions = prompt_components["instructions"]
+        output_prefix = prompt_components["output_prefix"]
+        adapter_spec = get_multiple_choice_joint_adapter_spec(
+            instructions=instructions,
+            input_noun=None,
+            output_noun=output_prefix,
+        )
+    elif method == "multiple_choice_language_modeling":
+        adapter_spec = get_multiple_choice_language_modeling_adapter_spec()
+    elif method == "multiple_choice_separate_original":
         adapter_spec = get_multiple_choice_separate_adapter_spec(
             method=ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
             empty_input=True,
         )
+    elif method == "multiple_choice_separate_calibrated":
+        adapter_spec = get_multiple_choice_separate_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+            empty_input=True,
+        )
+    else:
+        raise ValueError(f"Unknown method {method}")

     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
         args={
             "method": method,
             "language": language,
+            "subset": subset,
         },
     )

diff --git a/src/helm/benchmark/scenarios/bhasa_scenario.py b/src/helm/benchmark/scenarios/bhasa_scenario.py
index eac155c7c1..414b04c4c9 100644
--- a/src/helm/benchmark/scenarios/bhasa_scenario.py
+++ b/src/helm/benchmark/scenarios/bhasa_scenario.py
@@ -1,7 +1,7 @@
 import datasets
 import os
 import random
-from typing import List, Dict
+from typing import List, Dict, Optional

 import pandas as pd

@@ -1525,16 +1525,18 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
     description = "LINDSEA minimal pairs task"
     tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]

-    def __init__(self, method: str, language: str):
+    LANGUAGE_TO_PROMPT_COMPONENTS: Dict[str, Dict[str, str]] = {
+        "id": {
+            "instructions": "Kalimat mana yang lebih mungkin?",
+            "output_prefix": "Jawablah dengan satu huruf saja, A atau B",
+        }
+    }
+
+    def __init__(self, method: str, language: str, subset: str = "all"):
         super().__init__()
         self.method = method
         self.language = language
-        self.prompts = {
-            "id": {
-                "instructions": "Kalimat mana yang lebih mungkin?",
-                "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
-            }
-        }
+        self.subset = subset

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1545,8 +1547,10 @@ def download_dataset(self, output_path: str):
             "morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
         }

+        subsets = list(URLS.keys()) if self.subset == "all" else [self.subset]
+
         data_files = {}
-        for file in list(URLS.keys()):
+        for file in subsets:
             target_path_file = os.path.join(output_path, file)
             ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
             data_files[file] = pd.read_json(target_path_file, lines=True)
@@ -1569,15 +1573,11 @@ def get_instances(self, output_path: str) -> List[Instance]:
                 random.shuffle(options)
                 options_reversed = True if options[0][1] == 2 else False
-                prompt_components = self.prompts[self.language]
-                instructions = prompt_components["instructions"]
-                output_prefix = prompt_components["output_prefix"]
-                prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
-                input = Input(text=prompt)
+                input = Input(text="")

                 # Determine correct option based on whether shuffling reversed the options
                 references = [
-                    Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
-                    Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
+                    Reference(Output(text=options[0][0]), tags=[] if options_reversed else [CORRECT_TAG]),
+                    Reference(Output(text=options[1][0]), tags=[CORRECT_TAG] if options_reversed else []),
                 ]
                 instance = Instance(input=input, references=references, split=TEST_SPLIT)
                 outputs.append(instance)
diff --git a/src/helm/benchmark/static/schema_bhasa.yaml b/src/helm/benchmark/static/schema_bhasa.yaml
index b1a382cba5..9bfb699033 100644
--- a/src/helm/benchmark/static/schema_bhasa.yaml
+++ b/src/helm/benchmark/static/schema_bhasa.yaml
@@ -110,57 +110,57 @@ metric_groups:

 ############################################################
 run_groups:
-  - name: bhasa
-    display_name: BHASA
-    description: BHASA scenarios
-    category: All scenarios
-    subgroups:
-      - bhasa_nlu
-      - bhasa_nlg
-      - bhasa_nlr
-      - bhasa_lindsea
+#  - name: bhasa
+#    display_name: BHASA
+#    description: BHASA scenarios
+#    category: All scenarios
+#    subgroups:
+#      - bhasa_nlu
+#      - bhasa_nlg
+#      - bhasa_nlr
+#      - bhasa_lindsea

-  - name: bhasa_nlu
-    display_name: BHASA natural language understanding (NLU)
-    description: BHASA natural language understanding (NLU) scenarios
-    category: BHASA scenarios
-    subgroups:
-      - tydiqa
-      - xquad
-      - indicqa
-      - nusax
-      - uitvsfc
-      - wisesight
-      - indicsentiment
-      - mlhsd
-      - vihsd
-      - thaitoxicitytweets
+#  - name: bhasa_nlu
+#    display_name: BHASA natural language understanding (NLU)
+#    description: BHASA natural language understanding (NLU) scenarios
+#    category: BHASA scenarios
+#    subgroups:
+#      - tydiqa
+#      - xquad
+#      - indicqa
+#      - nusax
+#      - uitvsfc
+#      - wisesight
+#      - indicsentiment
+#      - mlhsd
+#      - vihsd
+#      - thaitoxicitytweets

-  - name: bhasa_nlg
-    display_name: BHASA natural language generation (NLG)
-    description: BHASA natural language generation (NLG) scenarios
-    category: BHASA scenarios
-    subgroups:
-      - flores
+#  - name: bhasa_nlg
+#    display_name: BHASA natural language generation (NLG)
+#    description: BHASA natural language generation (NLG) scenarios
+#    category: BHASA scenarios
+#    subgroups:
+#      - flores

-  - name: bhasa_nlr
-    display_name: BHASA natural language reasoning (NLR)
-    description: BHASA natural language reasoning (NLR) scenarios
-    category: BHASA scenarios
-    subgroups:
-      - indonli
-      - xnli
-      - indicxnli
-      - xcopa
+#  - name: bhasa_nlr
+#    display_name: BHASA natural language reasoning (NLR)
+#    description: BHASA natural language reasoning (NLR) scenarios
+#    category: BHASA scenarios
+#    subgroups:
+#      - indonli
+#      - xnli
+#      - indicxnli
+#      - xcopa

   - name: bhasa_lindsea
-    display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
-    description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
-    category: BHASA scenarios
-    subgroups:
-      - lindsea_syntax_minimal_pairs
-      - lindsea_pragmatics_pragmatic_reasoning_single
-      - lindsea_pragmatics_pragmatic_reasoning_pair
+    display_name: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
+    description: BHASA Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
+    category: BHASA scenarios
+    subgroups:
+      - lindsea_syntax_minimal_pairs_id
+      - lindsea_pragmatics_pragmatic_reasoning_single
+      - lindsea_pragmatics_pragmatic_reasoning_pair

   - name: tydiqa
     display_name: TyDiQA
@@ -433,7 +433,7 @@
     when: "?"
     language: Indonesian, Tamil, Thai, Vietnamese

-  - name: lindsea_syntax_minimal_pairs
+  - name: lindsea_syntax_minimal_pairs_id
     display_name: LINDSEA Syntax Minimal Pairs
     description: >
-      LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of sentences that differ minimally from each other and contrast in grammatical acceptability.
+      LINDSEA minimal pairs is a linguistic diagnostic dataset for syntax from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), consisting of pairs of sentences that differ minimally from each other and contrast in grammatical acceptability.
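
[Editor's note] For reference, the updated run spec function accepts four method values, each mapping onto one of the adapters wired up above. A quick usage sketch, assuming the function is called directly (the run entries above reach it through the run spec registry):

    from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_syntax_minimal_pairs_spec

    # Prompted MCQ: the two sentences are presented as options A and B and the
    # model answers with a letter.
    joint_spec = get_lindsea_syntax_minimal_pairs_spec(
        language="id", method="multiple_choice_joint", subset="npis_and_negation"
    )

    # Logprob-based variants: each sentence of the minimal pair is scored directly.
    lm_spec = get_lindsea_syntax_minimal_pairs_spec(
        language="id", method="multiple_choice_language_modeling", subset="all"
    )
    separate_spec = get_lindsea_syntax_minimal_pairs_spec(
        language="id", method="multiple_choice_separate_original", subset="all"
    )
    calibrated_spec = get_lindsea_syntax_minimal_pairs_spec(
        language="id", method="multiple_choice_separate_calibrated", subset="all"
    )

    print(joint_spec.name)
    # lindsea_syntax_minimal_pairs:language=id,method=multiple_choice_joint,subset=npis_and_negation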