From 43d9bd886ef2b8d0518efb79a1f37333c42ab92a Mon Sep 17 00:00:00 2001 From: kervias Date: Wed, 6 Mar 2024 00:23:07 +0800 Subject: [PATCH] [add] add external datasets.yaml support --- edustudio/assets/datasets.yaml | 3 +++ edustudio/atom_op/raw2mid/nips12.py | 2 +- edustudio/datatpl/common/base_datatpl.py | 16 +++++++++++++--- edustudio/settings.py | 2 +- examples/single_model/run_ncdm_demo.py | 2 +- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/edustudio/assets/datasets.yaml b/edustudio/assets/datasets.yaml index 25e2d8e..3eb5d57 100644 --- a/edustudio/assets/datasets.yaml +++ b/edustudio/assets/datasets.yaml @@ -1,3 +1,6 @@ +# 1. all datasets are stored in https://huggingface.co/datasets/lmcRS/edustudio-datasets +# 2. some datasets may not be listed here, but can still be downloaded, as edustudio will look them up in the external yaml file: https://huggingface.co/datasets/lmcRS/edustudio-datasets/raw/main/datasets.yaml + ASSIST_0910: middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/ASSIST_0910/ASSIST_0910-middata.zip FrcSub: diff --git a/edustudio/atom_op/raw2mid/nips12.py b/edustudio/atom_op/raw2mid/nips12.py index 6651be2..05e89b4 100644 --- a/edustudio/atom_op/raw2mid/nips12.py +++ b/edustudio/atom_op/raw2mid/nips12.py @@ -8,7 +8,7 @@ class R2M_Eedi_20_T12(BaseRaw2Mid): - """R2M_NIPS12 is to preprocess NIPS 2020 challenge Task 1&2 dataset""" + """R2M_Eedi_20_T12 is to preprocess NIPS 2020 challenge Task 1&2 dataset""" def process(self): super().process() # 读入数据 查看 diff --git a/edustudio/datatpl/common/base_datatpl.py b/edustudio/datatpl/common/base_datatpl.py index 8824a3c..ebc08c8 100644 --- a/edustudio/datatpl/common/base_datatpl.py +++ b/edustudio/datatpl/common/base_datatpl.py @@ -6,6 +6,7 @@ import yaml import re import os +import requests class BaseDataTPL(Dataset): @@ -73,17 +74,26 @@ def download_dataset(cls, cfg): cfg (UnifyConfig):the global config object """ dt_name = cfg.dataset - cfg.logger.warning(f"Can't find 
dataset files of {dt_name} in local disk!") + cfg.logger.warning(f"Can't find dataset files of {dt_name} in local disk") fph = cfg.frame_cfg['DT_INFO_FILE_PATH'] dataset_info = cls.read_yml_file(fph) dataset_info_from_cfg: dict = cfg['frame_cfg']['DT_INFO_DICT'] dataset_info.update(dataset_info_from_cfg) + if dt_name not in dataset_info: + cfg.logger.info(f"Prepare download external datasets.yaml to find dataset:{dt_name}") + url = "https://huggingface.co/datasets/lmcRS/edustudio-datasets/raw/main/datasets.yaml" + cfg.logger.info(f"Eexternal datasets.yaml url: {url}") + resp = requests.get(url) + dataset_info_external = yaml.load(resp.text, Loader=cls._build_yaml_loader()) + if dt_name not in dataset_info_external: + raise Exception("Can't find dataset files from local disk and online") + else: + dataset_info.update(dataset_info_external) + cfg.logger.info(f"Prepare to download {dt_name} dataset from online") cfg.logger.info(f"Download_url: {dataset_info[dt_name]['middata_url']}") - if dt_name not in dataset_info: - raise Exception("Can't find dataset files from local disk and online") if not os.path.exists(cfg.frame_cfg.data_folder_path): os.makedirs(cfg.frame_cfg.data_folder_path) diff --git a/edustudio/settings.py b/edustudio/settings.py index ff50b5e..cf71e07 100644 --- a/edustudio/settings.py +++ b/edustudio/settings.py @@ -32,4 +32,4 @@ curr_file_folder = os.path.dirname(__file__) DT_INFO_FILE_PATH = os.path.realpath(curr_file_folder + "/assets/datasets.yaml") -DT_INFO_DICT = {} # additional dataset info entrypoint, example: {'ASSIST_0910': {middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/ASSIST_0910/ASSIST_0910-middata.zip} } +DT_INFO_DICT = {} # additional dataset info entrypoint, example: {'FrcSub': {middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/FrcSub/FrcSub-middata.zip} } diff --git a/examples/single_model/run_ncdm_demo.py b/examples/single_model/run_ncdm_demo.py index cb5f211..e6a4231 100644 --- 
a/examples/single_model/run_ncdm_demo.py +++ b/examples/single_model/run_ncdm_demo.py @@ -7,7 +7,7 @@ from edustudio.quickstart import run_edustudio run_edustudio( - dataset='FrcSub', + dataset='SLP-Math', cfg_file_name=None, traintpl_cfg_dict={ 'cls': 'GeneralTrainTPL',