From 1d52272c9306d5a280793eeebbd61cd30a55b940 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 15:21:07 +0800 Subject: [PATCH 01/27] fix bug --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b7117cc..dc9033f 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ In order to better verify the effectiveness of model, the dataset is usually divided into `train/valid/test` or using `kfold` method. ```shell -edudata longling train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1 -longling kfold $filename1 $filename2 -- --n_splits 5 +edudata train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1 +edudata kfold $filename1 $filename2 -- --n_splits 5 ``` Refer to [longling](https://longling.readthedocs.io/zh/latest/#dataset) for more tools and detailed information. From 326d7444ee345441f2d99a9c7cecbab6c457e1d3 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 16:24:45 +0800 Subject: [PATCH 02/27] =?UTF-8?q?=E9=87=8D=E6=9E=84=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_download.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/tests/test_download.py b/tests/test_download.py index d02f6ce..e1a1b8c 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,26 +1,12 @@ # coding: utf-8 # create by tongshiwei on 2019/7/2 -import time - -import pytest - from EduData import get_data -from EduData.DataSet.download_data import url_dict def test_download(tmp_path): - for url in url_dict: - get_data(url, tmp_path, override=True) - time.sleep(1) - - -if __name__ == '__main__': - from EduData.DataSet.download_data.utils import reporthook4urlretrieve - from urllib.request import urlretrieve - - urlretrieve( - 
"http://base.ustc.edu.cn/data/ASSISTment/2015_100_skill_builders_main_problems.zip", - "../data/temp", - reporthook=reporthook4urlretrieve - ) + try: + get_data("toy", tmp_path, override=True) + assert True + except Exception as e: + raise e From 17c350df9675f7674c2044d90b296d3a66af8b31 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 16:25:02 +0800 Subject: [PATCH 03/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index dc9033f..b16311f 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,15 @@ Before downloading dataset, first check the available dataset: ```shell edudata ls ``` +and get: +```text +assistment-2009-2010-skill +assistment-2012-2013-non-skill +assistment-2015 +junyi +KDD-CUP-2010 +slepemapy.cz +``` Download the dataset by specifying the name of dataset: ```shell From 5032006326db5820e8a03cfb529f584e74ff3be8 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 16:25:56 +0800 Subject: [PATCH 04/27] =?UTF-8?q?=E9=87=8D=E6=9E=84=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E5=88=92=E5=88=86=E5=B7=A5=E5=85=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DataSet/download_data/download_data.py | 6 ++- EduData/Tools/__init__.py | 2 - EduData/Tools/train_valid_test.py | 39 ------------------- EduData/main.py | 18 +++++++-- 4 files changed, 19 insertions(+), 46 deletions(-) delete mode 100644 EduData/Tools/__init__.py delete mode 100644 EduData/Tools/train_valid_test.py diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 275cd12..62fd704 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -34,7 +34,11 @@ "KDD-CUP-2010": "http://base.ustc.edu.cn/data/KDD_Cup_2010/", "slepemapy.cz": 
- "http://base.ustc.edu.cn/data/slepemapy.cz/" + "http://base.ustc.edu.cn/data/slepemapy.cz/", + "synthetic": + "http://base.ustc.edu.cn/data/synthetic/", + "toy": + "http://base.ustc.edu.cn/data/toy.csv", } diff --git a/EduData/Tools/__init__.py b/EduData/Tools/__init__.py deleted file mode 100644 index 19781bf..0000000 --- a/EduData/Tools/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# coding: utf-8 -# 2019/8/23 @ tongshiwei \ No newline at end of file diff --git a/EduData/Tools/train_valid_test.py b/EduData/Tools/train_valid_test.py deleted file mode 100644 index 34c6386..0000000 --- a/EduData/Tools/train_valid_test.py +++ /dev/null @@ -1,39 +0,0 @@ -# coding: utf-8 -# create by tongshiwei on 2019-7-5 - -__all__ = ["train_valid_test", "KFold"] - -import io -from longling.ML.toolkit.dataset import train_valid_test -import random -import math -from tqdm import tqdm - - -def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False): - with open(filename) as f: - indices = [idx for idx, _ in enumerate(f)] - sample_num = indices[-1] - if shuffle is True: - random.shuffle(indices) - - step = math.ceil(sample_num / n_splits) - indices_buckets = [ - (i, i + step) for i in range(0, sample_num, step) - ] - train_wfs = [ - io.open(train_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits) - ] - valid_wfs = [ - io.open(valid_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits) - ] - with open(filename) as f: - for line_no, line in tqdm(enumerate(f), "splitting dataset"): - for idx, (start, end) in enumerate(indices_buckets): - if start <= line_no < end: - print(line, end="", file=valid_wfs[idx]) - else: - print(line, end="", file=train_wfs[idx]) - - for wf in train_wfs + valid_wfs: - wf.close() diff --git a/EduData/main.py b/EduData/main.py index 902925a..19d7d68 100644 --- a/EduData/main.py +++ b/EduData/main.py @@ -3,14 +3,24 @@ import fire -from EduData.DataSet.download_data.download_data import get_data as download, 
list_resources as ls +from EduData.DataSet.download_data.download_data import get_data, list_resources from EduData.Task.KnowledgeTracing.format import tl2json, json2tl -from EduData.Task.KnowledgeTracing.statistics import analysis_records as kt_stat -from EduData.Tools.train_valid_test import train_valid_test, KFold as kfold +from EduData.Task.KnowledgeTracing.statistics import analysis_records +from longling.ML.toolkit.dataset import train_valid_test, kfold def cli(): - fire.Fire() + fire.Fire( + { + "download": get_data, + "ls": list_resources, + "tl2json": tl2json, + "json2tl": json2tl, + "kt_stat": analysis_records, + "train_valid_test": train_valid_test, + "kfold": kfold, + } + ) if __name__ == '__main__': From d2eb3d29921fc4bfb7a7b5ac0639fc5d3937b299 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 21:26:30 +0800 Subject: [PATCH 05/27] =?UTF-8?q?=E5=AE=8C=E6=88=90=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E6=A6=82=E8=A7=88=E5=92=8C=E4=B8=8B=E8=BD=BD=E7=9A=84?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_download.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_download.py b/tests/test_download.py index e1a1b8c..d33f6cd 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,7 +1,7 @@ # coding: utf-8 # create by tongshiwei on 2019/7/2 -from EduData import get_data +from EduData import get_data, list_resources def test_download(tmp_path): @@ -10,3 +10,8 @@ def test_download(tmp_path): assert True except Exception as e: raise e + + +def test_list_resources(): + list_resources() + assert True From 793872bf84d75054862cb8ecbe3d5a9a0076623f Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 21:26:51 +0800 Subject: [PATCH 06/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Elist=5Fresources?= =?UTF-8?q?=E7=9A=84=E5=AF=BC=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- EduData/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduData/__init__.py b/EduData/__init__.py index 2f6f192..c1c8130 100644 --- a/EduData/__init__.py +++ b/EduData/__init__.py @@ -1,4 +1,4 @@ # coding: utf-8 # create by tongshiwei on 2019/7/2 -from .DataSet import get_data +from .DataSet import get_data, list_resources From ecef8d7de5c9e4ba4b04d0e302fa009521cf8351 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 21:27:18 +0800 Subject: [PATCH 07/27] =?UTF-8?q?=E5=8F=96=E6=B6=88=E9=83=A8=E5=88=86?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E7=9A=84=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/download_data/download_data.py | 11 +++++------ EduData/DataSet/download_data/utils.py | 10 +++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 62fd704..9b23f50 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -2,6 +2,7 @@ # create by tongshiwei on 2019/7/2 __all__ = ["url_dict", "get_data", "list_resources"] + import os from urllib.request import urlretrieve @@ -9,11 +10,9 @@ from bs4 import BeautifulSoup from longling import config_logging, LogLevel, path_append -# from longling.spider import download_data - try: from .utils import decompress, reporthook4urlretrieve -except (SystemError, ModuleNotFoundError): +except (SystemError, ModuleNotFoundError): # pragma: no cover from utils import decompress, reporthook4urlretrieve DEFAULT_DATADIR = path_append("./", "", to_str=True) @@ -42,7 +41,7 @@ } -def get_dataset_name(): +def get_dataset_name(): # pragma: no cover urls = [] for i in url_dict.values(): if i not in urls: @@ -70,7 +69,7 @@ def get_dataset_name(): def download_data(url, data_dir, override): urls = [] os.makedirs(data_dir, 
exist_ok=True) - if url.endswith('/'): + if url.endswith('/'): # pragma: no cover # 以/结尾是文件夹,其余是文件 file_path = path_append(data_dir, url.split('/')[-2], to_str=True) os.makedirs(file_path, exist_ok=True) @@ -124,7 +123,7 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False): """ try: return download_data(url_dict[dataset], data_dir, override) - except FileExistsError: + except FileExistsError: # pragma: no cover return path_append(data_dir, url_dict[dataset].split('/')[-1], to_str=True) diff --git a/EduData/DataSet/download_data/utils.py b/EduData/DataSet/download_data/utils.py index df75fe0..cbba83d 100644 --- a/EduData/DataSet/download_data/utils.py +++ b/EduData/DataSet/download_data/utils.py @@ -7,7 +7,7 @@ from longling import flush_print -def decompress(file): +def decompress(file): # pragma: no cover for z in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: if file.endswith(z): if z == ".zip": @@ -18,14 +18,14 @@ def decompress(file): un_tar(file) -def get_path(file): +def get_path(file): # pragma: no cover # 返回解压缩后的文件名 for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: file = file.replace(i, "") return file -def un_zip(file): +def un_zip(file): # pragma: no cover zip_file = zipfile.ZipFile(file) uz_path = get_path(file) print(file + " is unzip to " + uz_path) @@ -34,14 +34,14 @@ def un_zip(file): zip_file.close() -def un_rar(file): +def un_rar(file): # pragma: no cover rar_file = rarfile.RarFile(file) uz_path = get_path(file) print(file + " is unrar to " + uz_path) rar_file.extractall(uz_path) -def un_tar(file): +def un_tar(file): # pragma: no cover tar_file = tarfile.open(file) uz_path = get_path(file) print(file + " is untar to " + uz_path) From 9d9af0e8e0a847041721f06cf99c3fa52133d508 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Wed, 13 Nov 2019 21:42:28 +0800 Subject: [PATCH 08/27] =?UTF-8?q?=E8=A7=A3=E5=86=B3=20marker=20warning=20?= 
=?UTF-8?q?=E5=8F=8A=20=E6=B7=BB=E5=8A=A0=20doctest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pytest.ini | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 717e7dd..097a4e6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,16 @@ [pytest] +# For pytest usage, refer to https://hb4dsai.readthedocs.io/zh/latest/Architecture/Test.html norecursedirs = docs *build* trash dev +# Deal with marker warnings +markers = + pep8: pep8 + # Enable line length testing with maximum line length of 85 pep8maxlinelength = 120 -addopts = --cov --cov-report=term-missing --pep8 +# Ignore module level import not at top of file +pep8ignore = E402 + +# --doctest-modules is used for unitest +addopts = --doctest-modules --cov --cov-report=term-missing --pep8 From abda6fd8369e5c1c9e2f80ff4c017ac5823d2f05 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Thu, 14 Nov 2019 08:53:46 +0800 Subject: [PATCH 09/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Ekt=E4=BB=BB=E5=8A=A1ben?= =?UTF-8?q?chmark=E6=95=B0=E6=8D=AE=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/download_data/download_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 9b23f50..823c4a7 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -38,6 +38,8 @@ "http://base.ustc.edu.cn/data/synthetic/", "toy": "http://base.ustc.edu.cn/data/toy.csv", + "ktbs": + "http://base.ustc.edu.cn/data/ktbs", } From 6b5d252a051d554a1914dc2ee3a9f8fb60b97dc5 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Thu, 14 Nov 2019 08:54:01 +0800 Subject: [PATCH 10/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Ekt=E4=BB=BB=E5=8A=A1ben?= =?UTF-8?q?chmark=E6=95=B0=E6=8D=AE=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- EduData/DataSet/download_data/download_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 823c4a7..75473f6 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -39,7 +39,7 @@ "toy": "http://base.ustc.edu.cn/data/toy.csv", "ktbs": - "http://base.ustc.edu.cn/data/ktbs", + "http://base.ustc.edu.cn/data/ktbs/", } From bc6ec7f7b503cdd24c6157aec882d6e640fed208 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Thu, 14 Nov 2019 20:30:11 +0800 Subject: [PATCH 11/27] =?UTF-8?q?=E9=87=8D=E6=9E=84download=5Fdata?= =?UTF-8?q?=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DataSet/download_data/download_data.py | 86 +++++++++---------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 75473f6..11f1025 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -1,7 +1,7 @@ # coding: utf-8 # create by tongshiwei on 2019/7/2 -__all__ = ["url_dict", "get_data", "list_resources"] +__all__ = ["URL_DICT", "get_data", "list_resources"] import os from urllib.request import urlretrieve @@ -21,7 +21,7 @@ prefix = 'http://base.ustc.edu.cn/data/' -url_dict = { +URL_DICT = { "assistment-2009-2010-skill": "http://base.ustc.edu.cn/data/ASSISTment/2009_skill_builder_data_corrected.zip", "assistment-2012-2013-non-skill": @@ -36,16 +36,14 @@ "http://base.ustc.edu.cn/data/slepemapy.cz/", "synthetic": "http://base.ustc.edu.cn/data/synthetic/", - "toy": - "http://base.ustc.edu.cn/data/toy.csv", - "ktbs": - "http://base.ustc.edu.cn/data/ktbs/", + "ktbd": + "http://base.ustc.edu.cn/data/ktbd/", } def get_dataset_name(): # pragma: no cover urls = [] - 
for i in url_dict.values(): + for i in URL_DICT.values(): if i not in urls: urls.append(i) url = prefix @@ -65,51 +63,50 @@ def get_dataset_name(): # pragma: no cover # 避免ASSISTment和junyi的重复 if temp not in ['http://base.ustc.edu.cn/data/ASSISTment/', 'http://base.ustc.edu.cn/data/JunyiAcademy_Math_Practicing_Log/']: - url_dict[h[:-1]] = temp + URL_DICT[h[:-1]] = temp -def download_data(url, data_dir, override): - urls = [] - os.makedirs(data_dir, exist_ok=True) - if url.endswith('/'): # pragma: no cover - # 以/结尾是文件夹,其余是文件 - file_path = path_append(data_dir, url.split('/')[-2], to_str=True) - os.makedirs(file_path, exist_ok=True) +def download_file(url, save_path, override): + logger.info(url + ' is saved as ' + save_path) + urlretrieve(url, save_path, reporthook=reporthook4urlretrieve) + print() + decompress(save_path) + if override: + os.remove(save_path) + logger.info(save_path + ' is deleted.') + + +def download_data(url, data_dir, override, bloom_filter: set = None): + bloom_filter = set() if bloom_filter is None else bloom_filter + + if url in bloom_filter: + return + + if url.endswith("/"): # 以/结尾是文件夹,其余是文件 + _data_dir = path_append(data_dir, url.split('/')[-2], to_str=True) + r = requests.get(url, timeout=30) r.raise_for_status() r.encoding = r.apparent_encoding soup = BeautifulSoup(r.text, "lxml") al = soup.find_all('a') for a in al: - # 获得文件名 + # 获得链接名 h = a.get('href') if h[0] != '.': - temp = url + h - # 避免重复 - if temp not in urls: - urls.append(temp) - temp_path = path_append(file_path, h, to_str=True) - logger.info(temp + ' is saved as ' + temp_path) - # 下载 - urlretrieve(temp, temp_path, reporthook=reporthook4urlretrieve) - print() - # 解压 - decompress(temp_path) - if override: - os.remove(temp_path) - logger.info(temp_path + ' is deleted.') + url_h = url + h + if url_h not in bloom_filter: + download_data(url_h, _data_dir, override, bloom_filter) + bloom_filter.add(url) + else: - file_path = path_append(data_dir, url.split('/')[-1], to_str=True) - 
logger.info(url + ' is saved as ' + file_path) - urlretrieve(url, file_path, reporthook=reporthook4urlretrieve) - print() - decompress(file_path) - if override: - os.remove(file_path) - logger.info(file_path + ' is deleted.') + os.makedirs(data_dir, exist_ok=True) + save_path = path_append(data_dir, url.split('/')[-1], to_str=True) + download_file(url, save_path, override) + bloom_filter.add(url) -def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False): +def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None): """ Parameters ---------- @@ -119,10 +116,14 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False): 数据存储目录 override: bool 是否覆盖已存在的文件 + url_dict: + 链接名称与链接映射 + Returns ------- """ + url_dict = URL_DICT if not url_dict else url_dict try: return download_data(url_dict[dataset], data_dir, override) except FileExistsError: # pragma: no cover @@ -130,9 +131,4 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False): def list_resources(): - print("\n".join(url_dict)) - - -if __name__ == '__main__': - list_resources() - get_data("assistment-2009-2010-skill") + print("\n".join(URL_DICT)) From b333fef7cec092e2584ebfecc81c4e45885ff922 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 08:57:58 +0800 Subject: [PATCH 12/27] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=20url=5Fdict?= =?UTF-8?q?=20=E5=88=B0=20URL=5FDICT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/download_data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduData/DataSet/download_data/__init__.py b/EduData/DataSet/download_data/__init__.py index e5fb102..c8e15d2 100644 --- a/EduData/DataSet/download_data/__init__.py +++ b/EduData/DataSet/download_data/__init__.py @@ -2,4 +2,4 @@ # create by tongshiwei on 2019-8-16 -from .download_data import url_dict +from .download_data import URL_DICT From 37a81034e1c38c528959b51b4aa06447430e775b Mon Sep 17 
00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 08:58:12 +0800 Subject: [PATCH 13/27] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=AF=BC=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/junyi/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/EduData/DataSet/junyi/__init__.py b/EduData/DataSet/junyi/__init__.py index c33243e..5ec63ef 100644 --- a/EduData/DataSet/junyi/__init__.py +++ b/EduData/DataSet/junyi/__init__.py @@ -1,2 +1,5 @@ # coding: utf-8 # create by tongshiwei on 2019-7-5 + +from .junyi import build_knowledge_graph +from .KnowledgeTracing import select_n_most_frequent_students From 24d9e2b7f31cfdc1e5dbc42cb2c82ff05b130d46 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 08:58:36 +0800 Subject: [PATCH 14/27] =?UTF-8?q?=E9=A1=B6=E5=B1=82=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E5=B0=81=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/junyi/junyi.py | 53 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/EduData/DataSet/junyi/junyi.py b/EduData/DataSet/junyi/junyi.py index 58be4e4..3716d0e 100644 --- a/EduData/DataSet/junyi/junyi.py +++ b/EduData/DataSet/junyi/junyi.py @@ -1,13 +1,18 @@ # coding: utf-8 # create by tongshiwei on 2019/7/2 +""" +This script is used to build the map dict (ku_name -> idx) extract some relations from the original junyi dataset. 
+""" +__all__ = ["build_knowledge_graph"] + import codecs import csv import json import networkx as nx import pandas -from longling import wf_open, config_logging +from longling import wf_open, config_logging, path_append from tqdm import tqdm logger = config_logging(logger="junyi", console_log_level="info") @@ -66,6 +71,7 @@ def merge_relationship_annotation(sources, target): f.readline() for line in f: wf.write(line) + return target def extract_similarity(source, target, ku_dict): @@ -104,20 +110,31 @@ def extract_difficulty(source, target, ku_dict): json.dump(difficulty, wf, indent=2) -if __name__ == '__main__': - root = "../../../" - raw_file = root + "data/junyi/junyi_Exercise_table.csv" - ku_dict_file = root + "data/junyi/graph_vertex.json" - prerequisite_file = root + "data/junyi/prerequisite.json" - similarity_raw_files = [ - root + "data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"] - ] - similarity_raw_file = root + "raw_data/junyi/relationship_annotation.csv" - similarity_file = root + "data/junyi/similarity.json" - difficulty_file = root + "data/junyi/difficulty.json" - - # merge_relationship_annotation(similarity_raw_files, similarity_raw_file) - # build_ku_dict(raw_file, ku_dict_file) - # extract_prerequisite(raw_file, prerequisite_file, ku_dict_file) - extract_similarity(similarity_raw_file, similarity_file, ku_dict_file) - # extract_difficulty(similarity_raw_file, difficulty_file, ku_dict_file) +def build_knowledge_graph(src_root: str, tar_root: (str, None) = None, + ku_dict_path: str = None, + prerequisite_path: (str, None) = None, + similarity_path: (str, None) = None, + difficulty_path: (str, None) = None): + tar_root = tar_root if tar_root is not None else src_root + exercise_src = path_append(src_root, "junyi_Exercise_table.csv") + + assert ku_dict_path is not None + + relation_src = merge_relationship_annotation( + [path_append(src_root, "relationship_annotation_{}.csv".format(name)) for name in 
["testing", "training"]], + path_append(src_root, "relationship_annotation.csv") + ) + ku_dict_path = path_append(tar_root, ku_dict_path) + build_ku_dict(exercise_src, ku_dict_path) + + if prerequisite_path is not None: + prerequisite_path = path_append(tar_root, prerequisite_path) + extract_prerequisite(exercise_src, prerequisite_path, ku_dict_path) + + if similarity_path is not None: + similarity_path = path_append(tar_root, "similarity.json") + extract_similarity(relation_src, similarity_path, ku_dict_path) + + if difficulty_path is not None: + difficulty_path = path_append(tar_root, "difficulty.json") + extract_difficulty(relation_src, difficulty_path, ku_dict_path) From c2a09a64bfd3ce8ffdf6657d7144d922ec5c0fad Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 08:59:03 +0800 Subject: [PATCH 15/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/junyi/KnowledgeTracing.py | 26 +++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py index 65b5746..395af8f 100644 --- a/EduData/DataSet/junyi/KnowledgeTracing.py +++ b/EduData/DataSet/junyi/KnowledgeTracing.py @@ -1,6 +1,12 @@ # coding: utf-8 # create by tongshiwei on 2019-7-5 +""" +This script is used to convert the original junyi dataset into json sequence, which can be applied in kt task. +""" + +__all__ = ["select_n_most_frequent_students"] + import csv import json @@ -9,8 +15,15 @@ from tqdm import tqdm -def _read(source, ku_dict): - """require big memory to run this function""" +def _read(source: str, ku_dict: str) -> dict: + """ + Read the learners' interaction records and classify them by user id and session id. + In the same time, the exercise name will be converted to id. + + Notes + ----- + Require big memory to run this function. 
+ """ outcome = { "INCORRECT": 0, @@ -26,8 +39,8 @@ def _read(source, ku_dict): with open(source) as f: f.readline() for line in tqdm(csv.reader(f, delimiter='\t'), "reading data"): - student, session, exercise, correct, timestamp = line[0], line[1], ku_dict[line[-5]], \ - outcome[line[10]], line[8] + student, session, exercise = line[0], line[1], ku_dict[line[-5]], + correct, timestamp = outcome[line[10]], line[8] if student not in students: students[student] = {} if session not in students[student]: @@ -58,7 +71,7 @@ def _frequency(students): return sorted(frequency.items(), key=lambda x: x[1], reverse=True) -def get_n_most_frequent_students(students, n=None, frequency=None): +def get_n_most_frequent_students(students, n=None, frequency: list = None): frequency = _frequency(students) if frequency is None else frequency __frequency = frequency if n is None else frequency[:n] _students = {} @@ -67,7 +80,8 @@ def get_n_most_frequent_students(students, n=None, frequency=None): return _students -def select_n_most_frequent_students(source, target_prefix, ku_dict, n): +def select_n_most_frequent_students(source, target_prefix, ku_dict, n: (int, list)): + """None in n means select all students""" n_list = as_list(n) students = _read(source, ku_dict) frequency = _frequency(students) From 84d7faa9b22395d316f2ecc8b8517e55c68c015c Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 08:59:17 +0800 Subject: [PATCH 16/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/junyi/README.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 EduData/DataSet/junyi/README.md diff --git a/EduData/DataSet/junyi/README.md b/EduData/DataSet/junyi/README.md new file mode 100644 index 0000000..1b58f43 --- /dev/null +++ b/EduData/DataSet/junyi/README.md @@ -0,0 +1,4 @@ +# Junyi Dataset + +For detailed annotation for each file and field, you can download the dataset 
from our datashop +and see the `README.md` file. \ No newline at end of file From 97a9ab3c07d3ac6d05cae0da413a89af20091d88 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 08:59:42 +0800 Subject: [PATCH 17/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Edemo=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/junyi_kt.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 script/junyi_kt.py diff --git a/script/junyi_kt.py b/script/junyi_kt.py new file mode 100644 index 0000000..c4bbf7a --- /dev/null +++ b/script/junyi_kt.py @@ -0,0 +1,27 @@ +# coding: utf-8 +# 2019/11/14 @ tongshiwei + +__all__ = ["extract_relations", "build_json_sequence"] + +from longling import path_append +from EduData.DataSet.junyi import build_knowledge_graph, select_n_most_frequent_students + + +def extract_relations(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/"): + build_knowledge_graph( + src_root, tar_root, + ku_dict_path="graph_vertex.json", + prerequisite_path="prerequisite.json", + similarity_path="similarity.json", + difficulty_path="difficulty.json", + ) + + +def build_json_sequence(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/", + ku_dict_path="../data/junyi/data/graph_vertex.json", n=1000): + select_n_most_frequent_students( + path_append(src_root, "junyi_ProblemLog_for_PSLC.txt", to_str=True), + path_append(tar_root, "student_log_kt_", to_str=True), + ku_dict_path, + n, + ) From 8da274b00ec4153ffc57f0972e6046a8cf7e2051 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Fri, 15 Nov 2019 09:00:03 +0800 Subject: [PATCH 18/27] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/Task/KnowledgeTracing/format.py | 34 ++++++++++--------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/EduData/Task/KnowledgeTracing/format.py 
b/EduData/Task/KnowledgeTracing/format.py index aca69a6..861b6f8 100644 --- a/EduData/Task/KnowledgeTracing/format.py +++ b/EduData/Task/KnowledgeTracing/format.py @@ -9,38 +9,30 @@ __all__ = ["tl2json", "json2tl"] -def tl2json(src, tar): +def tl2json(src: str, tar: str): """ + convert the dataset in `tl` sequence into `json` sequence + .tl format - The first line is the number of exercises a student attempted. The second line is the exercise tag sequence. - The third line is the response sequence. + The first line is the number of exercises a student attempted. + The second line is the exercise tag sequence. + The third line is the response sequence. :: - Examples - -------- - 15 - 1,1,1,1,7,7,9,10,10,10,10,11,11,45,54 - 0,1,1,1,1,1,0,0,1,1,1,1,1,0,0 + 15 + 1,1,1,1,7,7,9,10,10,10,10,11,11,45,54 + 0,1,1,1,1,1,0,0,1,1,1,1,1,0,0 .json format Each sample contains several response elements, and each element is a two-element list. - The first is the exercise tag and the second is the response - Examples - -------- - [[1,0],[1,1],[1,1],[1,1],[7,1],[7,1],[9,0],[10,0],[10,1],[10,1],[10,1],[11,1],[11,1],[45,0],[54,0]] - - Parameters - ---------- - src - tar + The first is the exercise tag and the second is the response. 
:: - Examples - ------- + [[1,0],[1,1],[1,1],[1,1],[7,1],[7,1],[9,0],[10,0],[10,1],[10,1],[10,1],[11,1],[11,1],[45,0],[54,0]] """ with open(src) as f, io.open(tar, "w", encoding="utf-8") as wf: for _ in tqdm(f): - exercise_tags = f.readline().strip().split(",") - response_sequence = f.readline().strip().split(",") + exercise_tags = f.readline().strip().strip(",").split(",") + response_sequence = f.readline().strip().strip(",").split(",") responses = list(zip(exercise_tags, response_sequence)) print(json.dumps(responses), file=wf) From 2e834f1f98a039b9adf93093e865120bcb69cd90 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:12:59 +0800 Subject: [PATCH 19/27] =?UTF-8?q?=E5=AE=8C=E6=88=90=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E9=83=A8=E5=88=86=E7=BC=96=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/conftest.py | 25 +++++++++++++++++++++++++ tests/test_download.py | 7 ++++--- tests/test_format.py | 9 +-------- tests/test_junyi.py | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_junyi.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..3bd9242 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,25 @@ +# coding: utf-8 +# 2019/11/14 @ tongshiwei + +from EduData import get_data +from longling import path_append +import functools +import pytest + +test_url_dict = { + "tests": + "http://base.ustc.edu.cn/data/tests/", + "junyi": + "http://base.ustc.edu.cn/data/tests/junyi/", +} + +get_data = functools.partial(get_data, url_dict=test_url_dict) + + +@pytest.fixture(scope="session") +def shared_data_dir(tmp_path_factory): + tmpdir = tmp_path_factory.mktemp("data") + try: + return path_append(get_data("tests", tmpdir, override=True), "tests") + except Exception as e: + raise e diff --git a/tests/test_download.py b/tests/test_download.py index d33f6cd..f56911d 100644 
--- a/tests/test_download.py +++ b/tests/test_download.py @@ -1,12 +1,13 @@ # coding: utf-8 # create by tongshiwei on 2019/7/2 -from EduData import get_data, list_resources +from EduData import list_resources +from .conftest import get_data, test_url_dict -def test_download(tmp_path): +def test_download(shared_data_dir): try: - get_data("toy", tmp_path, override=True) + get_data("tests", shared_data_dir, url_dict=test_url_dict) assert True except Exception as e: raise e diff --git a/tests/test_format.py b/tests/test_format.py index 9f46865..5516f32 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -1,12 +1,5 @@ # coding: utf-8 # create by tongshiwei on 2019-8-14 -from EduData.Task.KnowledgeTracing.format import tl2json, json2tl - -def test_json2tl(tmp_path): - src = "../data/junyi/student_log_kt.json.small.test" - tl_tar = tmp_path / "student_log_kt.json.small.test.tl" - json_tar = tmp_path / "student_log_kt.json.small.test.json" - json2tl(src, tl_tar) - tl2json(tl_tar, json_tar) +# redirect to test_junyi.py diff --git a/tests/test_junyi.py b/tests/test_junyi.py new file mode 100644 index 0000000..bcfe32f --- /dev/null +++ b/tests/test_junyi.py @@ -0,0 +1,35 @@ +# coding: utf-8 +# 2019/11/14 @ tongshiwei + +from longling import path_append +from script.junyi_kt import extract_relations, build_json_sequence +from EduData.Task.KnowledgeTracing.format import tl2json, json2tl +from EduData.Task.KnowledgeTracing.statistics import analysis_records + + +def test_junyi(shared_data_dir): + src_root = path_append(shared_data_dir, "junyi", to_str=True) + extract_relations(src_root, path_append(src_root, "data")) + assert True + + +def test_junyi_kt(shared_data_dir): + src_root = path_append(shared_data_dir, "junyi", to_str=True) + ku_dict_path = path_append(shared_data_dir, "junyi", "data", "graph_vertex.json") + build_json_sequence(src_root, path_append(src_root, "data", to_str=True), ku_dict_path) + assert True + + +def test_json2tl(shared_data_dir): + src = 
path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True) + tl_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.tl", to_str=True) + json_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.json", to_str=True) + json2tl(src, tl_tar) + tl2json(tl_tar, json_tar) + assert True + + +def test_analysis(shared_data_dir): + src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True) + analysis_records(src) + assert True From e67be68050247fa4ec334ac97bcbf10b3ff99632 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:13:22 +0800 Subject: [PATCH 20/27] =?UTF-8?q?=E9=80=82=E9=85=8D=E5=8D=95=E5=85=83?= =?UTF-8?q?=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../DataSet/download_data/download_data.py | 11 +++-- EduData/DataSet/junyi/KnowledgeTracing.py | 45 ------------------- EduData/main.py | 6 +-- 3 files changed, 8 insertions(+), 54 deletions(-) diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 11f1025..8853ca3 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -67,19 +67,20 @@ def get_dataset_name(): # pragma: no cover def download_file(url, save_path, override): + if os.path.exists(save_path) and override: # pragma: no cover + os.remove(save_path) + logger.info(save_path + ' will be overridden.') + logger.info(url + ' is saved as ' + save_path) urlretrieve(url, save_path, reporthook=reporthook4urlretrieve) print() decompress(save_path) - if override: - os.remove(save_path) - logger.info(save_path + ' is deleted.') def download_data(url, data_dir, override, bloom_filter: set = None): bloom_filter = set() if bloom_filter is None else bloom_filter - if url in bloom_filter: + if url in bloom_filter: # pragma: no cover return if url.endswith("/"): # 以/结尾是文件夹,其余是文件 @@ -105,6 
+106,8 @@ def download_data(url, data_dir, override, bloom_filter: set = None): download_file(url, save_path, override) bloom_filter.add(url) + return data_dir + def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None): """ diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py index 395af8f..1d9445d 100644 --- a/EduData/DataSet/junyi/KnowledgeTracing.py +++ b/EduData/DataSet/junyi/KnowledgeTracing.py @@ -59,11 +59,6 @@ def _write(students, target): print(json.dumps(exercise_response), file=wf) -def extract_students_log(source, target, ku_dict): - students = _read(source, ku_dict) - _write(students, target) - - def _frequency(students): frequency = {} for student_id, sessions in tqdm(students.items(), "calculating frequency"): @@ -87,43 +82,3 @@ def select_n_most_frequent_students(source, target_prefix, ku_dict, n: (int, lis frequency = _frequency(students) for _n in n_list: _write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n) - - -if __name__ == '__main__': - root = "../../../" - student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt" - # student_log_file = root + "data/junyi/student_log_kt.json" - ku_dict_file = root + "data/junyi/graph_vertex.json" - - select_n_most_frequent_students( - student_log_raw_file, - root + "data/junyi/student_log_kt_", - ku_dict_file, - [None] - ) - - # select_n_most_frequent_students( - # student_log_raw_file, - # root + "data/junyi/student_log_kt_", - # ku_dict_file, - # [100, 200, 300] - # ) - # [500, 1000, 2000] - - # extract_students_log(student_log_raw_file, student_log_file, ku_dict_file) - - # student_log_file_small = student_log_file + ".small" - # - # with open(student_log_file) as f, wf_open(student_log_file_small) as wf: - # for i, line in tqdm(enumerate(f)): - # if i > 50000: - # break - # print(line, end="", file=wf) - # - # print(train_valid_test( - # student_log_file_small, - # valid_ratio=0., 
- # test_ratio=0.2, - # root_dir=root + "data/junyi/", - # silent=False, - # )) diff --git a/EduData/main.py b/EduData/main.py index 19d7d68..bbc461a 100644 --- a/EduData/main.py +++ b/EduData/main.py @@ -9,7 +9,7 @@ from longling.ML.toolkit.dataset import train_valid_test, kfold -def cli(): +def cli(): # pragma: no cover fire.Fire( { "download": get_data, @@ -21,7 +21,3 @@ def cli(): "kfold": kfold, } ) - - -if __name__ == '__main__': - cli() From 4a0c934d70fe5ae4921b94d8abcd1248bd12ffde Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:19:16 +0800 Subject: [PATCH 21/27] VNR: ku_dict -> ku_dict_path --- EduData/DataSet/junyi/KnowledgeTracing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py index 1d9445d..3267294 100644 --- a/EduData/DataSet/junyi/KnowledgeTracing.py +++ b/EduData/DataSet/junyi/KnowledgeTracing.py @@ -75,10 +75,10 @@ def get_n_most_frequent_students(students, n=None, frequency: list = None): return _students -def select_n_most_frequent_students(source, target_prefix, ku_dict, n: (int, list)): +def select_n_most_frequent_students(source, target_prefix, ku_dict_path, n: (int, list)): """None in n means select all students""" n_list = as_list(n) - students = _read(source, ku_dict) + students = _read(source, ku_dict_path) frequency = _frequency(students) for _n in n_list: _write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n) From 3a870b2e226c0c59691ca4ed9fc161604f005c15 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:38:16 +0800 Subject: [PATCH 22/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=96=B0=E7=9A=84?= =?UTF-8?q?=E5=91=BD=E4=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/main.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/EduData/main.py b/EduData/main.py index bbc461a..837ea55 100644 --- a/EduData/main.py 
+++ b/EduData/main.py @@ -7,6 +7,7 @@ from EduData.Task.KnowledgeTracing.format import tl2json, json2tl from EduData.Task.KnowledgeTracing.statistics import analysis_records from longling.ML.toolkit.dataset import train_valid_test, kfold +from script.junyi_kt import extract_relations, build_json_sequence def cli(): # pragma: no cover @@ -19,5 +20,13 @@ def cli(): # pragma: no cover "kt_stat": analysis_records, "train_valid_test": train_valid_test, "kfold": kfold, + "dataset": { + "junyi": { + "kt": { + "extract_relations": extract_relations, + "build_json_sequence": build_json_sequence, + } + } + } } ) From d7f2e74c763c240be1b4662e13c36817a30460d4 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:44:15 +0800 Subject: [PATCH 23/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=A4=BA=E4=BE=8B?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b16311f..5ae793f 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,8 @@ we offer another one format, named `json sequence` to represent the interaction ``` Each item in the sequence represent one interaction. The first element of the item is the exercise -(some works call it knowledge unit or knowledge item) id +id (in some works, the exercise id is not one-to-one mapped to one knowledge unit(ku)/concept, +but in junyi, one exercise contains one ku) and the second one indicates whether the learner correctly answer the exercise, 0 for wrongly while 1 for correctly One line, one `json` record, which is corresponded to a learner's interaction sequence. 
@@ -102,7 +103,18 @@ The cli tools to quickly convert the "raw" data of the dataset into "mature" dat The "mature" data is in `json sequence` format and can be modeled by [XKT](https://github.com/bigdata-ustc/XKT) and TKT(TBA) -TBA +###### junyi +``` +# download junyi dataset to junyi/ +>>> edudata download junyi +# build knowledge graph +>>> edudata dataset junyi kt extract_relations junyi/ junyi/data/ +# prepare dataset for knowledge tracing task, which is represented in json sequence +>>> edudata dataset junyi kt build_json_sequence junyi/ junyi/data/ junyi/data/graph_vertex.json 1000 +# after preprocessing, a json sequence file, named student_log_kt_1000, can be found in junyi/data/ +# further preprocessing like splitting dataset into train and test can be performed +>>> edudata train_valid_test junyi/data/student_log_kt_1000 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1 +``` ###### Analysis Dataset This tool only supports the `json sequence` format. To check the following statical indexes of the dataset: From 0e432134e7f7ab5f9c7e32c142c25af82cbdfed6 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:47:57 +0800 Subject: [PATCH 24/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- script/junyi_kt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/script/junyi_kt.py b/script/junyi_kt.py index c4bbf7a..af0792e 100644 --- a/script/junyi_kt.py +++ b/script/junyi_kt.py @@ -7,7 +7,7 @@ from EduData.DataSet.junyi import build_knowledge_graph, select_n_most_frequent_students -def extract_relations(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/"): +def extract_relations(src_root: str = "../raw_data/junyi/", tar_root: str = "../data/junyi/data/"): build_knowledge_graph( src_root, tar_root, ku_dict_path="graph_vertex.json", @@ -17,8 +17,8 @@ def
extract_relations(src_root="../raw_data/junyi/", tar_root="../data/junyi/dat ) -def build_json_sequence(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/", - ku_dict_path="../data/junyi/data/graph_vertex.json", n=1000): +def build_json_sequence(src_root: str = "../raw_data/junyi/", tar_root: str = "../data/junyi/data/", + ku_dict_path: str = "../data/junyi/data/graph_vertex.json", n: int = 1000): select_n_most_frequent_students( path_append(src_root, "junyi_ProblemLog_for_PSLC.txt", to_str=True), path_append(tar_root, "student_log_kt_", to_str=True), From 3920a7337b7090d5a91e0dec112b581c204fcf38 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:48:17 +0800 Subject: [PATCH 25/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=E5=92=8C=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ae793f..745ea18 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,11 @@ The dataset includes: * [synthetic](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/synthetic) -Your can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get those mentioned-above (most of them) dataset. +Your can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get those mentioned-above (most of them) dataset. 
+ +Besides the datasets mentioned above, we also provide some benchmark datasets for specific tasks, which are listed as follows: + +* [knowledge tracing benchmark dataset](http://base.ustc.edu.cn/data/ktbd/) ## Tutorial From d6cb940312a976e3b5576fc6bf998019728f95ae Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 09:48:33 +0800 Subject: [PATCH 26/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EduData/DataSet/junyi/KnowledgeTracing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py index 3267294..16ea159 100644 --- a/EduData/DataSet/junyi/KnowledgeTracing.py +++ b/EduData/DataSet/junyi/KnowledgeTracing.py @@ -75,7 +75,7 @@ def get_n_most_frequent_students(students, n=None, frequency: list = None): return _students -def select_n_most_frequent_students(source, target_prefix, ku_dict_path, n: (int, list)): +def select_n_most_frequent_students(source: str, target_prefix: str, ku_dict_path: str, n: (int, list)): """None in n means select all students""" n_list = as_list(n) students = _read(source, ku_dict_path) From a1ba92052bc77f3984b36ebb77e54c3a8679cde3 Mon Sep 17 00:00:00 2001 From: tswsxk Date: Tue, 19 Nov 2019 13:35:22 +0800 Subject: [PATCH 27/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 745ea18..3a05b7d 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,11 @@ Download the dataset by specifying the name of dataset: edudata download assistment-2009-2010-skill ``` +In order to change the storage directory, use the following command: +```shell +edudata download assistment-2009-2010-skill $dir +``` + #### Task Specified Tools #####
Knowledge Tracing