Skip to content

Commit

Permalink
[add] add external datasets.yaml support
Browse files Browse the repository at this point in the history
  • Loading branch information
kervias committed Mar 5, 2024
1 parent d4814e5 commit 43d9bd8
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 6 deletions.
3 changes: 3 additions & 0 deletions edustudio/assets/datasets.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# 1. all datasets are stored in https://huggingface.co/datasets/lmcRS/edustudio-datasets
# 2. some datasets may not be listed here but can still be downloaded, as edustudio will look them up in the external yaml file: https://huggingface.co/datasets/lmcRS/edustudio-datasets/raw/main/datasets.yaml

ASSIST_0910:
middata_url: https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/ASSIST_0910/ASSIST_0910-middata.zip
FrcSub:
Expand Down
2 changes: 1 addition & 1 deletion edustudio/atom_op/raw2mid/nips12.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class R2M_Eedi_20_T12(BaseRaw2Mid):
"""R2M_NIPS12 is to preprocess NIPS 2020 challenge Task 1&2 dataset"""
"""R2M_Eedi_20_T12 is to preprocess NIPS 2020 challenge Task 1&2 dataset"""
def process(self):
super().process()
# 读入数据 查看
Expand Down
16 changes: 13 additions & 3 deletions edustudio/datatpl/common/base_datatpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import yaml
import re
import os
import requests


class BaseDataTPL(Dataset):
Expand Down Expand Up @@ -73,17 +74,26 @@ def download_dataset(cls, cfg):
cfg (UnifyConfig):the global config object
"""
dt_name = cfg.dataset
cfg.logger.warning(f"Can't find dataset files of {dt_name} in local disk!")
cfg.logger.warning(f"Can't find dataset files of {dt_name} in local disk")

fph = cfg.frame_cfg['DT_INFO_FILE_PATH']
dataset_info = cls.read_yml_file(fph)
dataset_info_from_cfg: dict = cfg['frame_cfg']['DT_INFO_DICT']
dataset_info.update(dataset_info_from_cfg)

if dt_name not in dataset_info:
cfg.logger.info(f"Prepare download external datasets.yaml to find dataset:{dt_name}")
url = "https://huggingface.co/datasets/lmcRS/edustudio-datasets/raw/main/datasets.yaml"
cfg.logger.info(f"Eexternal datasets.yaml url: {url}")
resp = requests.get(url)
dataset_info_external = yaml.load(resp.text, Loader=cls._build_yaml_loader())
if dt_name not in dataset_info_external:
raise Exception("Can't find dataset files from local disk and online")
else:
dataset_info.update(dataset_info_external)

cfg.logger.info(f"Prepare to download {dt_name} dataset from online")
cfg.logger.info(f"Download_url: {dataset_info[dt_name]['middata_url']}")
if dt_name not in dataset_info:
raise Exception("Can't find dataset files from local disk and online")

if not os.path.exists(cfg.frame_cfg.data_folder_path):
os.makedirs(cfg.frame_cfg.data_folder_path)
Expand Down
2 changes: 1 addition & 1 deletion edustudio/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@
curr_file_folder = os.path.dirname(__file__)
DT_INFO_FILE_PATH = os.path.realpath(curr_file_folder + "/assets/datasets.yaml")

DT_INFO_DICT = {} # additional dataset info entrypoint, example: {'ASSIST_0910': {middata_url: https://gitlab.com/hfut-lec/edudatafiles/-/raw/main/ASSIST_0910/ASSIST_0910-middata.zip} }
DT_INFO_DICT = {} # additional dataset info entrypoint, example: {'FrcSub': {'middata_url': 'https://huggingface.co/datasets/lmcRS/edustudio-datasets/resolve/main/FrcSub/FrcSub-middata.zip'}}
2 changes: 1 addition & 1 deletion examples/single_model/run_ncdm_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from edustudio.quickstart import run_edustudio

run_edustudio(
dataset='FrcSub',
dataset='SLP-Math',
cfg_file_name=None,
traintpl_cfg_dict={
'cls': 'GeneralTrainTPL',
Expand Down

0 comments on commit 43d9bd8

Please sign in to comment.