From 942cb480019839d4c6100913084f3b260425bf49 Mon Sep 17 00:00:00 2001
From: KevinNuNu <34083603+KevinNuNu@users.noreply.github.com>
Date: Fri, 27 Oct 2023 15:47:52 +0800
Subject: [PATCH 1/2] [SIG] add HeadQA dataset

---
Review note: the hunks in this series were edited during review; the blob ids
on the "index" lines are unchanged from the original series and were not
regenerated.

 configs/datasets/HeadQA/HeadQA_ppl.py        |  4 ++
 configs/datasets/HeadQA/HeadQA_ppl_983537.py | 58 ++++++++++++++++++++
 opencompass/datasets/__init__.py             |  1 +
 opencompass/datasets/headqa.py               | 37 +++++++++++++
 4 files changed, 100 insertions(+)
 create mode 100644 configs/datasets/HeadQA/HeadQA_ppl.py
 create mode 100644 configs/datasets/HeadQA/HeadQA_ppl_983537.py
 create mode 100644 opencompass/datasets/headqa.py

diff --git a/configs/datasets/HeadQA/HeadQA_ppl.py b/configs/datasets/HeadQA/HeadQA_ppl.py
new file mode 100644
index 000000000..59fe5fe45
--- /dev/null
+++ b/configs/datasets/HeadQA/HeadQA_ppl.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .HeadQA_ppl_983537 import HeadQA_datasets  # noqa: F401, F403
diff --git a/configs/datasets/HeadQA/HeadQA_ppl_983537.py b/configs/datasets/HeadQA/HeadQA_ppl_983537.py
new file mode 100644
index 000000000..02c6ff254
--- /dev/null
+++ b/configs/datasets/HeadQA/HeadQA_ppl_983537.py
@@ -0,0 +1,58 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HeadQADataset
+
+
+_hint = "The following questions come from exams to access a specialized position in the Spanish healthcare system. \n" \
+    "Please choose the correct answer according to the question. \n"
+
+HeadQA_infer_cfg = dict(
+    # Template for each in-context example; ends with the reference answer.
+    ice_template=dict(
+        type=PromptTemplate,
+        template="This is a {category} question which was extracted from the {year} {name} exam.\n" \
+                 "{qtext}\n{choices}Answer: {ra}",
+    ),
+    # One candidate prompt per answer id (1-5), keyed by that id.
+    # '</E>' marks where the in-context examples are inserted.
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            answer:
+            f"{_hint}</E>This is a {{category}} question which was extracted from the {{year}} {{name}} exam.\n" \
+            f"{{qtext}}\n{{choices}}Answer: {answer}"
+            for answer in [1, 2, 3, 4, 5]
+        },
+        ice_token='</E>',
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[200, 400, 600, 800, 1000]),
+    inferencer=dict(type=PPLInferencer))
+
+HeadQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+langs = ['en', 'es']
+HeadQA_datasets = []
+for lang in langs:
+    for _split in ['validation', 'test']:
+
+        HeadQA_reader_cfg = dict(
+            input_columns=['name', 'year', 'category', 'qtext', 'choices'],
+            output_column='ra',
+            test_split=_split
+        )
+
+        HeadQA_datasets.append(
+            dict(
+                # Include the language so the 'en' and 'es' entries do not
+                # share the same abbr.
+                abbr=f'HeadQA-{lang}-{_split}',
+                type=HeadQADataset,
+                path='head_qa',
+                name=lang,
+                reader_cfg=HeadQA_reader_cfg,
+                infer_cfg=HeadQA_infer_cfg,
+                eval_cfg=HeadQA_eval_cfg
+            )
+        )
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 0c7583c4e..b9c678d05 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -32,6 +32,7 @@
 from .GaokaoBench import *  # noqa: F401, F403
 from .govrepcrs import *  # noqa: F401, F403
 from .gsm8k import *  # noqa: F401, F403
+from .headqa import *  # noqa: F401, F403
 from .hellaswag import *  # noqa: F401, F403
 from .huggingface import *  # noqa: F401, F403
 from .humaneval import *  # noqa: F401, F403
diff --git a/opencompass/datasets/headqa.py b/opencompass/datasets/headqa.py
new file mode 100644
index 000000000..c86f75bcb
--- /dev/null
+++ b/opencompass/datasets/headqa.py
@@ -0,0 +1,37 @@
+from datasets import load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class HeadQADataset(BaseDataset):
+    """HEAD-QA loader that renders the answer options as one string."""
+
+    @staticmethod
+    def load(path: str, name: str):
+        """Load the dataset and add a ``choices`` column to every example.
+
+        Args:
+            path (str): Forwarded to ``load_dataset`` as ``path``.
+            name (str): Forwarded to ``load_dataset`` as ``name``; the
+                HeadQA configs pass the language code here.
+
+        Returns:
+            The loaded dataset with a ``choices`` column added and the
+            ``image`` column removed.
+        """
+        dataset = load_dataset(path=path, name=name)
+
+        def preprocess(example):
+            # Render every option as "<aid>. <atext>" on its own line so
+            # the prompt templates can insert them through ``{choices}``.
+            answers = example.pop('answers')
+            example['choices'] = ''.join(
+                f"{ans['aid']}. {ans['atext']}\n" for ans in answers)
+            return example
+
+        dataset = dataset.map(preprocess).remove_columns(
+            ['image'])
+        return dataset
\ No newline at end of file

From 4d5fdaa6a6ab45cfbaf2898a6ef3ae69d052c8d9 Mon Sep 17 00:00:00 2001
From: KevinNuNu <34083603+KevinNuNu@users.noreply.github.com>
Date: Fri, 27 Oct 2023 15:57:09 +0800
Subject: [PATCH 2/2] fix hint

---
 opencompass/datasets/headqa.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/opencompass/datasets/headqa.py b/opencompass/datasets/headqa.py
index c86f75bcb..10828339c 100644
--- a/opencompass/datasets/headqa.py
+++ b/opencompass/datasets/headqa.py
@@ -32,6 +32,5 @@ def preprocess(example):
                 f"{ans['aid']}. {ans['atext']}\n" for ans in answers)
             return example
 
-        dataset = dataset.map(preprocess).remove_columns(
-            ['image'])
-        return dataset
\ No newline at end of file
+        dataset = dataset.map(preprocess).remove_columns(['image'])
+        return dataset