From d4814e5c4c8509942e8ac009f6f6b151f83bbf1d Mon Sep 17 00:00:00 2001 From: kervias Date: Tue, 5 Mar 2024 23:37:13 +0800 Subject: [PATCH] [improve] Change the dataset download source from GitLab to HuggingFace. --- edustudio/assets/datasets.yaml | 12 ++++++------ edustudio/datatpl/common/base_datatpl.py | 8 +++++--- edustudio/datatpl/common/general_datatpl.py | 3 +-- edustudio/datatpl/utils/common.py | 2 +- examples/single_model/run_ncdm_demo.py | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/edustudio/assets/datasets.yaml b/edustudio/assets/datasets.yaml index 8fab0b5..25e2d8e 100644 --- a/edustudio/assets/datasets.yaml +++ b/edustudio/assets/datasets.yaml @@ -1,12 +1,12 @@ ASSIST_0910: - middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/ASSIST_0910/ASSIST_0910-middata.zip + middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/ASSIST_0910/ASSIST_0910-middata.zip FrcSub: - middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/FrcSub/FrcSub-middata.zip + middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/FrcSub/FrcSub-middata.zip Math1: - middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/Math1/Math1-middata.zip + middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/Math1/Math1-middata.zip Math2: - middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/Math2/Math2-middata.zip + middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/Math2/Math2-middata.zip AAAI_2023: - middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/AAAI_2023/AAAI_2023-middata.zip + middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/AAAI_2023/AAAI_2023-middata.zip PISA_2015_ECD: - middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/PISA_2015_ECD/PISA_2015_ECD-middata.zip + middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/PISA_2015_ECD/PISA_2015_ECD-middata.zip diff --git a/edustudio/datatpl/common/base_datatpl.py b/edustudio/datatpl/common/base_datatpl.py index 21e5afd..8824a3c 100644 --- a/edustudio/datatpl/common/base_datatpl.py +++ b/edustudio/datatpl/common/base_datatpl.py @@ -73,15 +73,17 @@ def download_dataset(cls, cfg): cfg (UnifyConfig):the global config object """ dt_name = cfg.dataset - cfg.logger.warning(f"Can't find dataset files of {dt_name} in local environment!") - cfg.logger.info(f"Prepare to download {dt_name} from Internet.") + cfg.logger.warning(f"Can't find dataset files of {dt_name} in local disk!") + fph = cfg.frame_cfg['DT_INFO_FILE_PATH'] dataset_info = cls.read_yml_file(fph) dataset_info_from_cfg: dict = cfg['frame_cfg']['DT_INFO_DICT'] dataset_info.update(dataset_info_from_cfg) + cfg.logger.info(f"Prepare to download {dt_name} dataset from online") + cfg.logger.info(f"Download_url: {dataset_info[dt_name]['middata_url']}") if dt_name not in dataset_info: - raise Exception("Can't find dataset files from Local and Internet!") + raise Exception("Can't find dataset files from local disk and online") if not os.path.exists(cfg.frame_cfg.data_folder_path): os.makedirs(cfg.frame_cfg.data_folder_path) diff --git a/edustudio/datatpl/common/general_datatpl.py b/edustudio/datatpl/common/general_datatpl.py index 1d4c6d8..d5cf217 100644 --- a/edustudio/datatpl/common/general_datatpl.py +++ b/edustudio/datatpl/common/general_datatpl.py @@ -91,8 +91,7 @@ def from_cfg(cls, cfg): Returns: BaseDataTPL """ - if not os.path.exists(f'{cfg.frame_cfg.data_folder_path}'): - print(cfg.frame_cfg.data_folder_path) + if not os.path.exists(cfg.frame_cfg.data_folder_path) or len(os.listdir(cfg.frame_cfg.data_folder_path)) == 0: cls.download_dataset(cfg) load_data_from = cfg.datatpl_cfg['load_data_from'] diff --git a/edustudio/datatpl/utils/common.py b/edustudio/datatpl/utils/common.py index 8202db9..cc892d6 100644 --- a/edustudio/datatpl/utils/common.py +++ b/edustudio/datatpl/utils/common.py @@ -7,7 +7,7 @@ class BigfileDownloader(object): @staticmethod def download(url, title, filepath, chunk_size=10240): - with closing(requests.get(url, stream=True)) as resp: + with closing(requests.get(url, stream=True, allow_redirects=True)) as resp: if resp.status_code != 200: raise Exception("[ERROR]: {} - {} -{}".format(str(resp.status_code), title, url)) chunk_size = chunk_size diff --git a/examples/single_model/run_ncdm_demo.py b/examples/single_model/run_ncdm_demo.py index 352511c..cb5f211 100644 --- a/examples/single_model/run_ncdm_demo.py +++ b/examples/single_model/run_ncdm_demo.py @@ -19,6 +19,6 @@ 'cls': 'NCDM', }, evaltpl_cfg_dict={ - 'clses': ['PredictionEvalTPL'], + 'clses': ['PredictionEvalTPL', 'InterpretabilityEvalTPL'], } )