From 1d52272c9306d5a280793eeebbd61cd30a55b940 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 15:21:07 +0800
Subject: [PATCH 01/27] fix bug

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b7117cc..dc9033f 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,7 @@ In order to better verify the effectiveness of model,
 the dataset is usually divided into `train/valid/test` or using `kfold` method.
 
 ```shell
-edudata longling train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1
-longling kfold $filename1 $filename2 -- --n_splits 5
+edudata train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1
+edudata kfold $filename1 $filename2 -- --n_splits 5
 ```
 Refer to [longling](https://longling.readthedocs.io/zh/latest/#dataset) for more tools and detailed information.

From 326d7444ee345441f2d99a9c7cecbab6c457e1d3 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 16:24:45 +0800
Subject: [PATCH 02/27] =?UTF-8?q?=E9=87=8D=E6=9E=84=E4=B8=8B=E8=BD=BD?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_download.py | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/tests/test_download.py b/tests/test_download.py
index d02f6ce..e1a1b8c 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -1,26 +1,12 @@
 # coding: utf-8
 # create by tongshiwei on 2019/7/2
 
-import time
-
-import pytest
-
 from EduData import get_data
-from EduData.DataSet.download_data import url_dict
 
 
 def test_download(tmp_path):
-    for url in url_dict:
-        get_data(url, tmp_path, override=True)
-        time.sleep(1)
-
-
-if __name__ == '__main__':
-    from EduData.DataSet.download_data.utils import reporthook4urlretrieve
-    from urllib.request import urlretrieve
-
-    urlretrieve(
-        "http://base.ustc.edu.cn/data/ASSISTment/2015_100_skill_builders_main_problems.zip",
-        "../data/temp",
-        reporthook=reporthook4urlretrieve
-    )
+    try:
+        get_data("toy", tmp_path, override=True)
+        assert True
+    except Exception as e:
+        raise e

From 17c350df9675f7674c2044d90b296d3a66af8b31 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 16:25:02 +0800
Subject: [PATCH 03/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index dc9033f..b16311f 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,15 @@ Before downloading dataset, first check the available dataset:
 ```shell
 edudata ls
 ```
+and get:
+```text
+assistment-2009-2010-skill
+assistment-2012-2013-non-skill
+assistment-2015
+junyi
+KDD-CUP-2010
+slepemapy.cz
+```
 
 Download the dataset by specifying the name of dataset:
 ```shell

From 5032006326db5820e8a03cfb529f584e74ff3be8 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 16:25:56 +0800
Subject: [PATCH 04/27] =?UTF-8?q?=E9=87=8D=E6=9E=84=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E9=9B=86=E5=88=92=E5=88=86=E5=B7=A5=E5=85=B7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../DataSet/download_data/download_data.py    |  6 ++-
 EduData/Tools/__init__.py                     |  2 -
 EduData/Tools/train_valid_test.py             | 39 -------------------
 EduData/main.py                               | 18 +++++++--
 4 files changed, 19 insertions(+), 46 deletions(-)
 delete mode 100644 EduData/Tools/__init__.py
 delete mode 100644 EduData/Tools/train_valid_test.py

diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py
index 275cd12..62fd704 100644
--- a/EduData/DataSet/download_data/download_data.py
+++ b/EduData/DataSet/download_data/download_data.py
@@ -34,7 +34,11 @@
     "KDD-CUP-2010":
         "http://base.ustc.edu.cn/data/KDD_Cup_2010/",
     "slepemapy.cz":
-        "http://base.ustc.edu.cn/data/slepemapy.cz/"
+        "http://base.ustc.edu.cn/data/slepemapy.cz/",
+    "synthetic":
+        "http://base.ustc.edu.cn/data/synthetic/",
+    "toy":
+        "http://base.ustc.edu.cn/data/toy.csv",
 }
 
 
diff --git a/EduData/Tools/__init__.py b/EduData/Tools/__init__.py
deleted file mode 100644
index 19781bf..0000000
--- a/EduData/Tools/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# coding: utf-8
-# 2019/8/23 @ tongshiwei
\ No newline at end of file
diff --git a/EduData/Tools/train_valid_test.py b/EduData/Tools/train_valid_test.py
deleted file mode 100644
index 34c6386..0000000
--- a/EduData/Tools/train_valid_test.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# coding: utf-8
-# create by tongshiwei on 2019-7-5
-
-__all__ = ["train_valid_test", "KFold"]
-
-import io
-from longling.ML.toolkit.dataset import train_valid_test
-import random
-import math
-from tqdm import tqdm
-
-
-def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
-    with open(filename) as f:
-        indices = [idx for idx, _ in enumerate(f)]
-        sample_num = indices[-1]
-    if shuffle is True:
-        random.shuffle(indices)
-
-    step = math.ceil(sample_num / n_splits)
-    indices_buckets = [
-        (i, i + step) for i in range(0, sample_num, step)
-    ]
-    train_wfs = [
-        io.open(train_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
-    ]
-    valid_wfs = [
-        io.open(valid_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
-    ]
-    with open(filename) as f:
-        for line_no, line in tqdm(enumerate(f), "splitting dataset"):
-            for idx, (start, end) in enumerate(indices_buckets):
-                if start <= line_no < end:
-                    print(line, end="", file=valid_wfs[idx])
-                else:
-                    print(line, end="", file=train_wfs[idx])
-
-    for wf in train_wfs + valid_wfs:
-        wf.close()
diff --git a/EduData/main.py b/EduData/main.py
index 902925a..19d7d68 100644
--- a/EduData/main.py
+++ b/EduData/main.py
@@ -3,14 +3,24 @@
 
 import fire
 
-from EduData.DataSet.download_data.download_data import get_data as download, list_resources as ls
+from EduData.DataSet.download_data.download_data import get_data, list_resources
 from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
-from EduData.Task.KnowledgeTracing.statistics import analysis_records as kt_stat
-from EduData.Tools.train_valid_test import train_valid_test, KFold as kfold
+from EduData.Task.KnowledgeTracing.statistics import analysis_records
+from longling.ML.toolkit.dataset import train_valid_test, kfold
 
 
 def cli():
-    fire.Fire()
+    fire.Fire(
+        {
+            "download": get_data,
+            "ls": list_resources,
+            "tl2json": tl2json,
+            "json2tl": json2tl,
+            "kt_stat": analysis_records,
+            "train_valid_test": train_valid_test,
+            "kfold": kfold,
+        }
+    )
 
 
 if __name__ == '__main__':

From d2eb3d29921fc4bfb7a7b5ac0639fc5d3937b299 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 21:26:30 +0800
Subject: [PATCH 05/27] =?UTF-8?q?=E5=AE=8C=E6=88=90=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E9=9B=86=E6=A6=82=E8=A7=88=E5=92=8C=E4=B8=8B=E8=BD=BD=E7=9A=84?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_download.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test_download.py b/tests/test_download.py
index e1a1b8c..d33f6cd 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -1,7 +1,7 @@
 # coding: utf-8
 # create by tongshiwei on 2019/7/2
 
-from EduData import get_data
+from EduData import get_data, list_resources
 
 
 def test_download(tmp_path):
@@ -10,3 +10,8 @@ def test_download(tmp_path):
         assert True
     except Exception as e:
         raise e
+
+
+def test_list_resources():
+    list_resources()
+    assert True

From 793872bf84d75054862cb8ecbe3d5a9a0076623f Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 21:26:51 +0800
Subject: [PATCH 06/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Elist=5Fresources?=
 =?UTF-8?q?=E7=9A=84=E5=AF=BC=E5=87=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/EduData/__init__.py b/EduData/__init__.py
index 2f6f192..c1c8130 100644
--- a/EduData/__init__.py
+++ b/EduData/__init__.py
@@ -1,4 +1,4 @@
 # coding: utf-8
 # create by tongshiwei on 2019/7/2
 
-from .DataSet import get_data
+from .DataSet import get_data, list_resources

From ecef8d7de5c9e4ba4b04d0e302fa009521cf8351 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 21:27:18 +0800
Subject: [PATCH 07/27] =?UTF-8?q?=E5=8F=96=E6=B6=88=E9=83=A8=E5=88=86?=
 =?UTF-8?q?=E5=87=BD=E6=95=B0=E7=9A=84=E6=A3=80=E6=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/download_data/download_data.py | 11 +++++------
 EduData/DataSet/download_data/utils.py         | 10 +++++-----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py
index 62fd704..9b23f50 100644
--- a/EduData/DataSet/download_data/download_data.py
+++ b/EduData/DataSet/download_data/download_data.py
@@ -2,6 +2,7 @@
 # create by tongshiwei on 2019/7/2
 
 __all__ = ["url_dict", "get_data", "list_resources"]
+
 import os
 from urllib.request import urlretrieve
 
@@ -9,11 +10,9 @@
 from bs4 import BeautifulSoup
 from longling import config_logging, LogLevel, path_append
 
-# from longling.spider import download_data
-
 try:
     from .utils import decompress, reporthook4urlretrieve
-except (SystemError, ModuleNotFoundError):
+except (SystemError, ModuleNotFoundError):  # pragma: no cover
     from utils import decompress, reporthook4urlretrieve
 
 DEFAULT_DATADIR = path_append("./", "", to_str=True)
@@ -42,7 +41,7 @@
 }
 
 
-def get_dataset_name():
+def get_dataset_name():  # pragma: no cover
     urls = []
     for i in url_dict.values():
         if i not in urls:
@@ -70,7 +69,7 @@ def get_dataset_name():
 def download_data(url, data_dir, override):
     urls = []
     os.makedirs(data_dir, exist_ok=True)
-    if url.endswith('/'):
+    if url.endswith('/'):  # pragma: no cover
         # 以/结尾是文件夹，其余是文件
         file_path = path_append(data_dir, url.split('/')[-2], to_str=True)
         os.makedirs(file_path, exist_ok=True)
@@ -124,7 +123,7 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
     """
     try:
         return download_data(url_dict[dataset], data_dir, override)
-    except FileExistsError:
+    except FileExistsError:  # pragma: no cover
         return path_append(data_dir, url_dict[dataset].split('/')[-1], to_str=True)
 
 
diff --git a/EduData/DataSet/download_data/utils.py b/EduData/DataSet/download_data/utils.py
index df75fe0..cbba83d 100644
--- a/EduData/DataSet/download_data/utils.py
+++ b/EduData/DataSet/download_data/utils.py
@@ -7,7 +7,7 @@
 from longling import flush_print
 
 
-def decompress(file):
+def decompress(file):  # pragma: no cover
     for z in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
         if file.endswith(z):
             if z == ".zip":
@@ -18,14 +18,14 @@ def decompress(file):
                 un_tar(file)
 
 
-def get_path(file):
+def get_path(file):  # pragma: no cover
     #  返回解压缩后的文件名
     for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
         file = file.replace(i, "")
     return file
 
 
-def un_zip(file):
+def un_zip(file):  # pragma: no cover
     zip_file = zipfile.ZipFile(file)
     uz_path = get_path(file)
     print(file + " is unzip to " + uz_path)
@@ -34,14 +34,14 @@ def un_zip(file):
     zip_file.close()
 
 
-def un_rar(file):
+def un_rar(file):  # pragma: no cover
     rar_file = rarfile.RarFile(file)
     uz_path = get_path(file)
     print(file + " is unrar to " + uz_path)
     rar_file.extractall(uz_path)
 
 
-def un_tar(file):
+def un_tar(file):  # pragma: no cover
     tar_file = tarfile.open(file)
     uz_path = get_path(file)
     print(file + " is untar to " + uz_path)

From 9d9af0e8e0a847041721f06cf99c3fa52133d508 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Wed, 13 Nov 2019 21:42:28 +0800
Subject: [PATCH 08/27] =?UTF-8?q?=E8=A7=A3=E5=86=B3=20marker=20warning=20?=
 =?UTF-8?q?=E5=8F=8A=20=E6=B7=BB=E5=8A=A0=20doctest?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pytest.ini | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pytest.ini b/pytest.ini
index 717e7dd..097a4e6 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,7 +1,16 @@
 [pytest]
+# For pytest usage, refer to https://hb4dsai.readthedocs.io/zh/latest/Architecture/Test.html
 norecursedirs = docs *build* trash dev
 
+# Deal with marker warnings
+markers =
+    pep8: pep8
+
 # Enable line length testing with maximum line length of 85
 pep8maxlinelength = 120
 
-addopts = --cov --cov-report=term-missing --pep8
+# Ignore module level import not at top of file
+pep8ignore = E402
+
+# --doctest-modules is used for unit tests
+addopts = --doctest-modules --cov --cov-report=term-missing --pep8

From abda6fd8369e5c1c9e2f80ff4c017ac5823d2f05 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Thu, 14 Nov 2019 08:53:46 +0800
Subject: [PATCH 09/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Ekt=E4=BB=BB=E5=8A=A1ben?=
 =?UTF-8?q?chmark=E6=95=B0=E6=8D=AE=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/download_data/download_data.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py
index 9b23f50..823c4a7 100644
--- a/EduData/DataSet/download_data/download_data.py
+++ b/EduData/DataSet/download_data/download_data.py
@@ -38,6 +38,8 @@
         "http://base.ustc.edu.cn/data/synthetic/",
     "toy":
         "http://base.ustc.edu.cn/data/toy.csv",
+    "ktbs":
+        "http://base.ustc.edu.cn/data/ktbs",
 }
 
 

From 6b5d252a051d554a1914dc2ee3a9f8fb60b97dc5 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Thu, 14 Nov 2019 08:54:01 +0800
Subject: [PATCH 10/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Ekt=E4=BB=BB=E5=8A=A1ben?=
 =?UTF-8?q?chmark=E6=95=B0=E6=8D=AE=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/download_data/download_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py
index 823c4a7..75473f6 100644
--- a/EduData/DataSet/download_data/download_data.py
+++ b/EduData/DataSet/download_data/download_data.py
@@ -39,7 +39,7 @@
     "toy":
         "http://base.ustc.edu.cn/data/toy.csv",
     "ktbs":
-        "http://base.ustc.edu.cn/data/ktbs",
+        "http://base.ustc.edu.cn/data/ktbs/",
 }
 
 

From bc6ec7f7b503cdd24c6157aec882d6e640fed208 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Thu, 14 Nov 2019 20:30:11 +0800
Subject: [PATCH 11/27] =?UTF-8?q?=E9=87=8D=E6=9E=84download=5Fdata?=
 =?UTF-8?q?=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../DataSet/download_data/download_data.py    | 86 +++++++++----------
 1 file changed, 41 insertions(+), 45 deletions(-)

diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py
index 75473f6..11f1025 100644
--- a/EduData/DataSet/download_data/download_data.py
+++ b/EduData/DataSet/download_data/download_data.py
@@ -1,7 +1,7 @@
 # coding: utf-8
 # create by tongshiwei on 2019/7/2
 
-__all__ = ["url_dict", "get_data", "list_resources"]
+__all__ = ["URL_DICT", "get_data", "list_resources"]
 
 import os
 from urllib.request import urlretrieve
@@ -21,7 +21,7 @@
 
 prefix = 'http://base.ustc.edu.cn/data/'
 
-url_dict = {
+URL_DICT = {
     "assistment-2009-2010-skill":
         "http://base.ustc.edu.cn/data/ASSISTment/2009_skill_builder_data_corrected.zip",
     "assistment-2012-2013-non-skill":
@@ -36,16 +36,14 @@
         "http://base.ustc.edu.cn/data/slepemapy.cz/",
     "synthetic":
         "http://base.ustc.edu.cn/data/synthetic/",
-    "toy":
-        "http://base.ustc.edu.cn/data/toy.csv",
-    "ktbs":
-        "http://base.ustc.edu.cn/data/ktbs/",
+    "ktbd":
+        "http://base.ustc.edu.cn/data/ktbd/",
 }
 
 
 def get_dataset_name():  # pragma: no cover
     urls = []
-    for i in url_dict.values():
+    for i in URL_DICT.values():
         if i not in urls:
             urls.append(i)
     url = prefix
@@ -65,51 +63,50 @@ def get_dataset_name():  # pragma: no cover
                 # 避免ASSISTment和junyi的重复
                 if temp not in ['http://base.ustc.edu.cn/data/ASSISTment/',
                                 'http://base.ustc.edu.cn/data/JunyiAcademy_Math_Practicing_Log/']:
-                    url_dict[h[:-1]] = temp
+                    URL_DICT[h[:-1]] = temp
 
 
-def download_data(url, data_dir, override):
-    urls = []
-    os.makedirs(data_dir, exist_ok=True)
-    if url.endswith('/'):  # pragma: no cover
-        # 以/结尾是文件夹，其余是文件
-        file_path = path_append(data_dir, url.split('/')[-2], to_str=True)
-        os.makedirs(file_path, exist_ok=True)
+def download_file(url, save_path, override):
+    logger.info(url + ' is saved as ' + save_path)
+    urlretrieve(url, save_path, reporthook=reporthook4urlretrieve)
+    print()
+    decompress(save_path)
+    if override:
+        os.remove(save_path)
+        logger.info(save_path + ' is deleted.')
+
+
+def download_data(url, data_dir, override, bloom_filter: set = None):
+    bloom_filter = set() if bloom_filter is None else bloom_filter
+
+    if url in bloom_filter:
+        return
+
+    if url.endswith("/"):  # 以/结尾是文件夹，其余是文件
+        _data_dir = path_append(data_dir, url.split('/')[-2], to_str=True)
+
         r = requests.get(url, timeout=30)
         r.raise_for_status()
         r.encoding = r.apparent_encoding
         soup = BeautifulSoup(r.text, "lxml")
         al = soup.find_all('a')
         for a in al:
-            # 获得文件名
+            # 获得链接名
             h = a.get('href')
             if h[0] != '.':
-                temp = url + h
-                # 避免重复
-                if temp not in urls:
-                    urls.append(temp)
-                    temp_path = path_append(file_path, h, to_str=True)
-                    logger.info(temp + ' is saved as ' + temp_path)
-                    # 下载
-                    urlretrieve(temp, temp_path, reporthook=reporthook4urlretrieve)
-                    print()
-                    # 解压
-                    decompress(temp_path)
-                    if override:
-                        os.remove(temp_path)
-                        logger.info(temp_path + ' is deleted.')
+                url_h = url + h
+                if url_h not in bloom_filter:
+                    download_data(url_h, _data_dir, override, bloom_filter)
+        bloom_filter.add(url)
+
     else:
-        file_path = path_append(data_dir, url.split('/')[-1], to_str=True)
-        logger.info(url + ' is saved as ' + file_path)
-        urlretrieve(url, file_path, reporthook=reporthook4urlretrieve)
-        print()
-        decompress(file_path)
-        if override:
-            os.remove(file_path)
-            logger.info(file_path + ' is deleted.')
+        os.makedirs(data_dir, exist_ok=True)
+        save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
+        download_file(url, save_path, override)
+        bloom_filter.add(url)
 
 
-def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
+def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
     """
     Parameters
     ----------
@@ -119,10 +116,14 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
         数据存储目录
     override: bool
         是否覆盖已存在的文件
+    url_dict:
+        链接名称与链接映射
+
     Returns
     -------
 
     """
+    url_dict = URL_DICT if not url_dict else url_dict
     try:
         return download_data(url_dict[dataset], data_dir, override)
     except FileExistsError:  # pragma: no cover
@@ -130,9 +131,4 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
 
 
 def list_resources():
-    print("\n".join(url_dict))
-
-
-if __name__ == '__main__':
-    list_resources()
-    get_data("assistment-2009-2010-skill")
+    print("\n".join(URL_DICT))

From b333fef7cec092e2584ebfecc81c4e45885ff922 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 08:57:58 +0800
Subject: [PATCH 12/27] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=20url=5Fdict?=
 =?UTF-8?q?=20=E5=88=B0=20URL=5FDICT?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/download_data/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/EduData/DataSet/download_data/__init__.py b/EduData/DataSet/download_data/__init__.py
index e5fb102..c8e15d2 100644
--- a/EduData/DataSet/download_data/__init__.py
+++ b/EduData/DataSet/download_data/__init__.py
@@ -2,4 +2,4 @@
 # create by tongshiwei on 2019-8-16
 
 
-from .download_data import url_dict
+from .download_data import URL_DICT

From 37a81034e1c38c528959b51b4aa06447430e775b Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 08:58:12 +0800
Subject: [PATCH 13/27] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=AF=BC=E5=87=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/junyi/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/EduData/DataSet/junyi/__init__.py b/EduData/DataSet/junyi/__init__.py
index c33243e..5ec63ef 100644
--- a/EduData/DataSet/junyi/__init__.py
+++ b/EduData/DataSet/junyi/__init__.py
@@ -1,2 +1,5 @@
 # coding: utf-8
 # create by tongshiwei on 2019-7-5
+
+from .junyi import build_knowledge_graph
+from .KnowledgeTracing import select_n_most_frequent_students

From 24d9e2b7f31cfdc1e5dbc42cb2c82ff05b130d46 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 08:58:36 +0800
Subject: [PATCH 14/27] =?UTF-8?q?=E9=A1=B6=E5=B1=82=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E5=B0=81=E8=A3=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/junyi/junyi.py | 53 ++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/EduData/DataSet/junyi/junyi.py b/EduData/DataSet/junyi/junyi.py
index 58be4e4..3716d0e 100644
--- a/EduData/DataSet/junyi/junyi.py
+++ b/EduData/DataSet/junyi/junyi.py
@@ -1,13 +1,18 @@
 # coding: utf-8
 # create by tongshiwei on 2019/7/2
 
+"""
+This script is used to build the map dict (ku_name -> idx) and extract some relations from the original junyi dataset.
+"""
+__all__ = ["build_knowledge_graph"]
+
 import codecs
 import csv
 import json
 
 import networkx as nx
 import pandas
-from longling import wf_open, config_logging
+from longling import wf_open, config_logging, path_append
 from tqdm import tqdm
 
 logger = config_logging(logger="junyi", console_log_level="info")
@@ -66,6 +71,7 @@ def merge_relationship_annotation(sources, target):
             f.readline()
             for line in f:
                 wf.write(line)
+    return target
 
 
 def extract_similarity(source, target, ku_dict):
@@ -104,20 +110,31 @@ def extract_difficulty(source, target, ku_dict):
         json.dump(difficulty, wf, indent=2)
 
 
-if __name__ == '__main__':
-    root = "../../../"
-    raw_file = root + "data/junyi/junyi_Exercise_table.csv"
-    ku_dict_file = root + "data/junyi/graph_vertex.json"
-    prerequisite_file = root + "data/junyi/prerequisite.json"
-    similarity_raw_files = [
-        root + "data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
-    ]
-    similarity_raw_file = root + "raw_data/junyi/relationship_annotation.csv"
-    similarity_file = root + "data/junyi/similarity.json"
-    difficulty_file = root + "data/junyi/difficulty.json"
-
-    # merge_relationship_annotation(similarity_raw_files, similarity_raw_file)
-    # build_ku_dict(raw_file, ku_dict_file)
-    # extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
-    extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
-    # extract_difficulty(similarity_raw_file, difficulty_file, ku_dict_file)
+def build_knowledge_graph(src_root: str, tar_root: (str, None) = None,
+                          ku_dict_path: str = None,
+                          prerequisite_path: (str, None) = None,
+                          similarity_path: (str, None) = None,
+                          difficulty_path: (str, None) = None):
+    tar_root = tar_root if tar_root is not None else src_root
+    exercise_src = path_append(src_root, "junyi_Exercise_table.csv")
+
+    assert ku_dict_path is not None
+
+    relation_src = merge_relationship_annotation(
+        [path_append(src_root, "relationship_annotation_{}.csv".format(name)) for name in ["testing", "training"]],
+        path_append(src_root, "relationship_annotation.csv")
+    )
+    ku_dict_path = path_append(tar_root, ku_dict_path)
+    build_ku_dict(exercise_src, ku_dict_path)
+
+    if prerequisite_path is not None:
+        prerequisite_path = path_append(tar_root, prerequisite_path)
+        extract_prerequisite(exercise_src, prerequisite_path, ku_dict_path)
+
+    if similarity_path is not None:
+        similarity_path = path_append(tar_root, "similarity.json")
+        extract_similarity(relation_src, similarity_path, ku_dict_path)
+
+    if difficulty_path is not None:
+        difficulty_path = path_append(tar_root, "difficulty.json")
+        extract_difficulty(relation_src, difficulty_path, ku_dict_path)

From c2a09a64bfd3ce8ffdf6657d7144d922ec5c0fad Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 08:59:03 +0800
Subject: [PATCH 15/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/junyi/KnowledgeTracing.py | 26 +++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py
index 65b5746..395af8f 100644
--- a/EduData/DataSet/junyi/KnowledgeTracing.py
+++ b/EduData/DataSet/junyi/KnowledgeTracing.py
@@ -1,6 +1,12 @@
 # coding: utf-8
 # create by tongshiwei on 2019-7-5
 
+"""
+This script is used to convert the original junyi dataset into json sequences, which can be used in the KT task.
+"""
+
+__all__ = ["select_n_most_frequent_students"]
+
 import csv
 import json
 
@@ -9,8 +15,15 @@
 from tqdm import tqdm
 
 
-def _read(source, ku_dict):
-    """require big memory to run this function"""
+def _read(source: str, ku_dict: str) -> dict:
+    """
+    Read the learners' interaction records and classify them by user id and session id.
+    At the same time, the exercise name will be converted to id.
+
+    Notes
+    -----
+    Require big memory to run this function.
+    """
 
     outcome = {
         "INCORRECT": 0,
@@ -26,8 +39,8 @@ def _read(source, ku_dict):
     with open(source) as f:
         f.readline()
         for line in tqdm(csv.reader(f, delimiter='\t'), "reading data"):
-            student, session, exercise, correct, timestamp = line[0], line[1], ku_dict[line[-5]], \
-                                                             outcome[line[10]], line[8]
+            student, session, exercise = line[0], line[1], ku_dict[line[-5]],
+            correct, timestamp = outcome[line[10]], line[8]
             if student not in students:
                 students[student] = {}
             if session not in students[student]:
@@ -58,7 +71,7 @@ def _frequency(students):
     return sorted(frequency.items(), key=lambda x: x[1], reverse=True)
 
 
-def get_n_most_frequent_students(students, n=None, frequency=None):
+def get_n_most_frequent_students(students, n=None, frequency: list = None):
     frequency = _frequency(students) if frequency is None else frequency
     __frequency = frequency if n is None else frequency[:n]
     _students = {}
@@ -67,7 +80,8 @@ def get_n_most_frequent_students(students, n=None, frequency=None):
     return _students
 
 
-def select_n_most_frequent_students(source, target_prefix, ku_dict, n):
+def select_n_most_frequent_students(source, target_prefix, ku_dict, n: (int, list)):
+    """None in n means select all students"""
     n_list = as_list(n)
     students = _read(source, ku_dict)
     frequency = _frequency(students)

From 84d7faa9b22395d316f2ecc8b8517e55c68c015c Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 08:59:17 +0800
Subject: [PATCH 16/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/junyi/README.md | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 EduData/DataSet/junyi/README.md

diff --git a/EduData/DataSet/junyi/README.md b/EduData/DataSet/junyi/README.md
new file mode 100644
index 0000000..1b58f43
--- /dev/null
+++ b/EduData/DataSet/junyi/README.md
@@ -0,0 +1,4 @@
+# Junyi Dataset
+
+For detailed annotations of each file and field, download the dataset from our datashop
+and see its `README.md` file.
\ No newline at end of file

From 97a9ab3c07d3ac6d05cae0da413a89af20091d88 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 08:59:42 +0800
Subject: [PATCH 17/27] =?UTF-8?q?=E6=96=B0=E5=A2=9Edemo=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 script/junyi_kt.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 script/junyi_kt.py

diff --git a/script/junyi_kt.py b/script/junyi_kt.py
new file mode 100644
index 0000000..c4bbf7a
--- /dev/null
+++ b/script/junyi_kt.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+# 2019/11/14 @ tongshiwei
+
+__all__ = ["extract_relations", "build_json_sequence"]
+
+from longling import path_append
+from EduData.DataSet.junyi import build_knowledge_graph, select_n_most_frequent_students
+
+
+def extract_relations(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/"):
+    build_knowledge_graph(
+        src_root, tar_root,
+        ku_dict_path="graph_vertex.json",
+        prerequisite_path="prerequisite.json",
+        similarity_path="similarity.json",
+        difficulty_path="difficulty.json",
+    )
+
+
+def build_json_sequence(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/",
+                        ku_dict_path="../data/junyi/data/graph_vertex.json", n=1000):
+    select_n_most_frequent_students(
+        path_append(src_root, "junyi_ProblemLog_for_PSLC.txt", to_str=True),
+        path_append(tar_root, "student_log_kt_", to_str=True),
+        ku_dict_path,
+        n,
+    )

From 8da274b00ec4153ffc57f0972e6046a8cf7e2051 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Fri, 15 Nov 2019 09:00:03 +0800
Subject: [PATCH 18/27] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/Task/KnowledgeTracing/format.py | 34 ++++++++++---------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/EduData/Task/KnowledgeTracing/format.py b/EduData/Task/KnowledgeTracing/format.py
index aca69a6..861b6f8 100644
--- a/EduData/Task/KnowledgeTracing/format.py
+++ b/EduData/Task/KnowledgeTracing/format.py
@@ -9,38 +9,30 @@
 __all__ = ["tl2json", "json2tl"]
 
 
-def tl2json(src, tar):
+def tl2json(src: str, tar: str):
     """
+    convert the dataset in `tl` sequence into `json` sequence
+
     .tl format
-    The first line is the number of exercises a student attempted. The second line is the exercise tag sequence.
-    The third line is the response sequence.
+    The first line is the number of exercises a student attempted.
+    The second line is the exercise tag sequence.
+    The third line is the response sequence. ::
 
-    Examples
-    --------
-    15
-    1,1,1,1,7,7,9,10,10,10,10,11,11,45,54
-    0,1,1,1,1,1,0,0,1,1,1,1,1,0,0
+        15
+        1,1,1,1,7,7,9,10,10,10,10,11,11,45,54
+        0,1,1,1,1,1,0,0,1,1,1,1,1,0,0
 
     .json format
     Each sample contains several response elements, and each element is a two-element list.
-    The first is the exercise tag and the second is the response
-    Examples
-    --------
-    [[1,0],[1,1],[1,1],[1,1],[7,1],[7,1],[9,0],[10,0],[10,1],[10,1],[10,1],[11,1],[11,1],[45,0],[54,0]]
-
-    Parameters
-    ----------
-    src
-    tar
+    The first is the exercise tag and the second is the response. ::
 
-    Examples
-    -------
+        [[1,0],[1,1],[1,1],[1,1],[7,1],[7,1],[9,0],[10,0],[10,1],[10,1],[10,1],[11,1],[11,1],[45,0],[54,0]]
 
     """
     with open(src) as f, io.open(tar, "w", encoding="utf-8") as wf:
         for _ in tqdm(f):
-            exercise_tags = f.readline().strip().split(",")
-            response_sequence = f.readline().strip().split(",")
+            exercise_tags = f.readline().strip().strip(",").split(",")
+            response_sequence = f.readline().strip().strip(",").split(",")
             responses = list(zip(exercise_tags, response_sequence))
             print(json.dumps(responses), file=wf)
 

From 2e834f1f98a039b9adf93093e865120bcb69cd90 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:12:59 +0800
Subject: [PATCH 19/27] =?UTF-8?q?=E5=AE=8C=E6=88=90=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E9=83=A8=E5=88=86=E7=BC=96=E5=86=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/conftest.py      | 25 +++++++++++++++++++++++++
 tests/test_download.py |  7 ++++---
 tests/test_format.py   |  9 +--------
 tests/test_junyi.py    | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 65 insertions(+), 11 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_junyi.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..3bd9242
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,25 @@
+# coding: utf-8
+# 2019/11/14 @ tongshiwei
+
+from EduData import get_data
+from longling import path_append
+import functools
+import pytest
+
+test_url_dict = {
+    "tests":
+        "http://base.ustc.edu.cn/data/tests/",
+    "junyi":
+        "http://base.ustc.edu.cn/data/tests/junyi/",
+}
+
+get_data = functools.partial(get_data, url_dict=test_url_dict)
+
+
+@pytest.fixture(scope="session")
+def shared_data_dir(tmp_path_factory):
+    tmpdir = tmp_path_factory.mktemp("data")
+    try:
+        return path_append(get_data("tests", tmpdir, override=True), "tests")
+    except Exception as e:
+        raise e
diff --git a/tests/test_download.py b/tests/test_download.py
index d33f6cd..f56911d 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -1,12 +1,13 @@
 # coding: utf-8
 # create by tongshiwei on 2019/7/2
 
-from EduData import get_data, list_resources
+from EduData import list_resources
+from .conftest import get_data, test_url_dict
 
 
-def test_download(tmp_path):
+def test_download(shared_data_dir):
     try:
-        get_data("toy", tmp_path, override=True)
+        get_data("tests", shared_data_dir, url_dict=test_url_dict)
         assert True
     except Exception as e:
         raise e
diff --git a/tests/test_format.py b/tests/test_format.py
index 9f46865..5516f32 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -1,12 +1,5 @@
 # coding: utf-8
 # create by tongshiwei on 2019-8-14
 
-from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
 
-
-def test_json2tl(tmp_path):
-    src = "../data/junyi/student_log_kt.json.small.test"
-    tl_tar = tmp_path / "student_log_kt.json.small.test.tl"
-    json_tar = tmp_path / "student_log_kt.json.small.test.json"
-    json2tl(src, tl_tar)
-    tl2json(tl_tar, json_tar)
+# redirect to test_junyi.py
diff --git a/tests/test_junyi.py b/tests/test_junyi.py
new file mode 100644
index 0000000..bcfe32f
--- /dev/null
+++ b/tests/test_junyi.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+# 2019/11/14 @ tongshiwei
+
+from longling import path_append
+from script.junyi_kt import extract_relations, build_json_sequence
+from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
+from EduData.Task.KnowledgeTracing.statistics import analysis_records
+
+
+def test_junyi(shared_data_dir):
+    src_root = path_append(shared_data_dir, "junyi", to_str=True)
+    extract_relations(src_root, path_append(src_root, "data"))
+    assert True
+
+
+def test_junyi_kt(shared_data_dir):
+    src_root = path_append(shared_data_dir, "junyi", to_str=True)
+    ku_dict_path = path_append(shared_data_dir, "junyi", "data", "graph_vertex.json")
+    build_json_sequence(src_root, path_append(src_root, "data", to_str=True), ku_dict_path)
+    assert True
+
+
+def test_json2tl(shared_data_dir):
+    src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True)
+    tl_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.tl", to_str=True)
+    json_tar = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000.json", to_str=True)
+    json2tl(src, tl_tar)
+    tl2json(tl_tar, json_tar)
+    assert True
+
+
+def test_analysis(shared_data_dir):
+    src = path_append(shared_data_dir, "junyi", "data", "student_log_kt_1000", to_str=True)
+    analysis_records(src)
+    assert True

From e67be68050247fa4ec334ac97bcbf10b3ff99632 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:13:22 +0800
Subject: [PATCH 20/27] =?UTF-8?q?=E9=80=82=E9=85=8D=E5=8D=95=E5=85=83?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../DataSet/download_data/download_data.py    | 11 +++--
 EduData/DataSet/junyi/KnowledgeTracing.py     | 45 -------------------
 EduData/main.py                               |  6 +--
 3 files changed, 8 insertions(+), 54 deletions(-)

diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py
index 11f1025..8853ca3 100644
--- a/EduData/DataSet/download_data/download_data.py
+++ b/EduData/DataSet/download_data/download_data.py
@@ -67,19 +67,20 @@ def get_dataset_name():  # pragma: no cover
 
 
 def download_file(url, save_path, override):
+    if os.path.exists(save_path) and override:  # pragma: no cover
+        os.remove(save_path)
+        logger.info(save_path + ' will be overridden.')
+
     logger.info(url + ' is saved as ' + save_path)
     urlretrieve(url, save_path, reporthook=reporthook4urlretrieve)
     print()
     decompress(save_path)
-    if override:
-        os.remove(save_path)
-        logger.info(save_path + ' is deleted.')
 
 
 def download_data(url, data_dir, override, bloom_filter: set = None):
     bloom_filter = set() if bloom_filter is None else bloom_filter
 
-    if url in bloom_filter:
+    if url in bloom_filter:  # pragma: no cover
         return
 
     if url.endswith("/"):  # 以/结尾是文件夹，其余是文件
@@ -105,6 +106,8 @@ def download_data(url, data_dir, override, bloom_filter: set = None):
         download_file(url, save_path, override)
         bloom_filter.add(url)
 
+    return data_dir
+
 
 def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
     """
diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py
index 395af8f..1d9445d 100644
--- a/EduData/DataSet/junyi/KnowledgeTracing.py
+++ b/EduData/DataSet/junyi/KnowledgeTracing.py
@@ -59,11 +59,6 @@ def _write(students, target):
                 print(json.dumps(exercise_response), file=wf)
 
 
-def extract_students_log(source, target, ku_dict):
-    students = _read(source, ku_dict)
-    _write(students, target)
-
-
 def _frequency(students):
     frequency = {}
     for student_id, sessions in tqdm(students.items(), "calculating frequency"):
@@ -87,43 +82,3 @@ def select_n_most_frequent_students(source, target_prefix, ku_dict, n: (int, lis
     frequency = _frequency(students)
     for _n in n_list:
         _write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)
-
-
-if __name__ == '__main__':
-    root = "../../../"
-    student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt"
-    # student_log_file = root + "data/junyi/student_log_kt.json"
-    ku_dict_file = root + "data/junyi/graph_vertex.json"
-
-    select_n_most_frequent_students(
-        student_log_raw_file,
-        root + "data/junyi/student_log_kt_",
-        ku_dict_file,
-        [None]
-    )
-
-    # select_n_most_frequent_students(
-    #     student_log_raw_file,
-    #     root + "data/junyi/student_log_kt_",
-    #     ku_dict_file,
-    #     [100, 200, 300]
-    # )
-    # [500, 1000, 2000]
-
-    # extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)
-
-    # student_log_file_small = student_log_file + ".small"
-    #
-    # with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
-    #     for i, line in tqdm(enumerate(f)):
-    #         if i > 50000:
-    #             break
-    #         print(line, end="", file=wf)
-    #
-    # print(train_valid_test(
-    #     student_log_file_small,
-    #     valid_ratio=0.,
-    #     test_ratio=0.2,
-    #     root_dir=root + "data/junyi/",
-    #     silent=False,
-    # ))
diff --git a/EduData/main.py b/EduData/main.py
index 19d7d68..bbc461a 100644
--- a/EduData/main.py
+++ b/EduData/main.py
@@ -9,7 +9,7 @@
 from longling.ML.toolkit.dataset import train_valid_test, kfold
 
 
-def cli():
+def cli():  # pragma: no cover
     fire.Fire(
         {
             "download": get_data,
@@ -21,7 +21,3 @@ def cli():
             "kfold": kfold,
         }
     )
-
-
-if __name__ == '__main__':
-    cli()

From 4a0c934d70fe5ae4921b94d8abcd1248bd12ffde Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:19:16 +0800
Subject: [PATCH 21/27] VNR: ku_dict -> ku_dict_path

---
 EduData/DataSet/junyi/KnowledgeTracing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py
index 1d9445d..3267294 100644
--- a/EduData/DataSet/junyi/KnowledgeTracing.py
+++ b/EduData/DataSet/junyi/KnowledgeTracing.py
@@ -75,10 +75,10 @@ def get_n_most_frequent_students(students, n=None, frequency: list = None):
     return _students
 
 
-def select_n_most_frequent_students(source, target_prefix, ku_dict, n: (int, list)):
+def select_n_most_frequent_students(source, target_prefix, ku_dict_path, n: (int, list)):
     """None in n means select all students"""
     n_list = as_list(n)
-    students = _read(source, ku_dict)
+    students = _read(source, ku_dict_path)
     frequency = _frequency(students)
     for _n in n_list:
         _write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)

From 3a870b2e226c0c59691ca4ed9fc161604f005c15 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:38:16 +0800
Subject: [PATCH 22/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=96=B0=E7=9A=84?=
 =?UTF-8?q?=E5=91=BD=E4=BB=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/main.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/EduData/main.py b/EduData/main.py
index bbc461a..837ea55 100644
--- a/EduData/main.py
+++ b/EduData/main.py
@@ -7,6 +7,7 @@
 from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
 from EduData.Task.KnowledgeTracing.statistics import analysis_records
 from longling.ML.toolkit.dataset import train_valid_test, kfold
+from script.junyi_kt import extract_relations, build_json_sequence
 
 
 def cli():  # pragma: no cover
@@ -19,5 +20,13 @@ def cli():  # pragma: no cover
             "kt_stat": analysis_records,
             "train_valid_test": train_valid_test,
             "kfold": kfold,
+            "dataset": {
+                "junyi": {
+                    "kt": {
+                        "extract_relations": extract_relations,
+                        "build_json_sequence": build_json_sequence,
+                    }
+                }
+            }
         }
     )

From d7f2e74c763c240be1b4662e13c36817a30460d4 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:44:15 +0800
Subject: [PATCH 23/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=A4=BA=E4=BE=8B?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b16311f..5ae793f 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,8 @@ we offer another one format, named `json sequence` to represent the interaction
 ```
 
 Each item in the sequence represent one interaction. The first element of the item is the exercise 
-(some works call it knowledge unit or knowledge item) id 
+id (in some works, the exercise id is not one-to-one mapped to one knowledge unit(ku)/concept, 
+but in junyi, one exercise contains one ku) 
 and the second one indicates whether the learner correctly answer the exercise, 0 for wrongly while 1 for correctly  
 One line, one `json` record, which is corresponded to a learner's interaction sequence.
 
@@ -102,7 +103,18 @@ The cli tools to quickly convert the "raw" data of the dataset into "mature" dat
 The "mature" data is in `json sequence` format 
 and can be modeled by [XKT](https://github.com/bigdata-ustc/XKT) and TKT(TBA)
 
-TBA
+###### junyi
+```
+# download junyi dataset to junyi/
+>>> edudata download junyi
+# build knowledge graph
+>>> edudata dataset junyi kt extract_relations junyi/ junyi/data/
+# prepare dataset for knowledge tracing task, which is represented in json sequence
+>>> edudata dataset junyi kt build_json_sequence junyi/ junyi/data/ junyi/data/graph_vertex.json 1000
+# after preprocessing, a json sequence file, named student_log_kt_1000, can be found in junyi/data/
+# further preprocessing like splitting dataset into train and test can be performed
+>>> edudata train_valid_test junyi/data/student_log_kt_1000 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1
+```  
 
 ###### Analysis Dataset
 This tool only supports the `json sequence` format. To check the following statical indexes of the dataset:

From 0e432134e7f7ab5f9c7e32c142c25af82cbdfed6 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:47:57 +0800
Subject: [PATCH 24/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=B1=BB=E5=9E=8B?=
 =?UTF-8?q?=E6=A3=80=E6=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 script/junyi_kt.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/script/junyi_kt.py b/script/junyi_kt.py
index c4bbf7a..af0792e 100644
--- a/script/junyi_kt.py
+++ b/script/junyi_kt.py
@@ -7,7 +7,7 @@
 from EduData.DataSet.junyi import build_knowledge_graph, select_n_most_frequent_students
 
 
-def extract_relations(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/"):
+def extract_relations(src_root: str = "../raw_data/junyi/", tar_root: str = "../data/junyi/data/"):
     build_knowledge_graph(
         src_root, tar_root,
         ku_dict_path="graph_vertex.json",
@@ -17,8 +17,8 @@ def extract_relations(src_root="../raw_data/junyi/", tar_root="../data/junyi/dat
     )
 
 
-def build_json_sequence(src_root="../raw_data/junyi/", tar_root="../data/junyi/data/",
-                        ku_dict_path="../data/junyi/data/graph_vertex.json", n=1000):
+def build_json_sequence(src_root: str = "../raw_data/junyi/", tar_root: str = "../data/junyi/data/",
+                        ku_dict_path: str = "../data/junyi/data/graph_vertex.json", n: int = 1000):
     select_n_most_frequent_students(
         path_append(src_root, "junyi_ProblemLog_for_PSLC.txt", to_str=True),
         path_append(tar_root, "student_log_kt_", to_str=True),

From 3920a7337b7090d5a91e0dec112b581c204fcf38 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:48:17 +0800
Subject: [PATCH 25/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AF=B4=E6=98=8E?=
 =?UTF-8?q?=E5=92=8C=E4=BE=8B=E5=AD=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5ae793f..745ea18 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,11 @@ The dataset includes:
 
 * [synthetic](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/synthetic)
 
-Your can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get those mentioned-above (most of them) dataset. 
+You can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get the above-mentioned datasets (most of them).
+
+Besides the above-mentioned datasets, we also provide some benchmark datasets for specific tasks, which are listed as follows:
+
+* [knowledge tracing benchmark dataset](http://base.ustc.edu.cn/data/ktbd/)
 
 ## Tutorial
 

From d6cb940312a976e3b5576fc6bf998019728f95ae Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 09:48:33 +0800
Subject: [PATCH 26/27] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=B1=BB=E5=9E=8B?=
 =?UTF-8?q?=E6=A3=80=E6=9F=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 EduData/DataSet/junyi/KnowledgeTracing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/EduData/DataSet/junyi/KnowledgeTracing.py b/EduData/DataSet/junyi/KnowledgeTracing.py
index 3267294..16ea159 100644
--- a/EduData/DataSet/junyi/KnowledgeTracing.py
+++ b/EduData/DataSet/junyi/KnowledgeTracing.py
@@ -75,7 +75,7 @@ def get_n_most_frequent_students(students, n=None, frequency: list = None):
     return _students
 
 
-def select_n_most_frequent_students(source, target_prefix, ku_dict_path, n: (int, list)):
+def select_n_most_frequent_students(source: str, target_prefix: str, ku_dict_path: str, n: (int, list)):
     """None in n means select all students"""
     n_list = as_list(n)
     students = _read(source, ku_dict_path)

From a1ba92052bc77f3984b36ebb77e54c3a8679cde3 Mon Sep 17 00:00:00 2001
From: tswsxk <tongsw@mail.ustc.edu.cn>
Date: Tue, 19 Nov 2019 13:35:22 +0800
Subject: [PATCH 27/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 745ea18..3a05b7d 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,11 @@ Download the dataset by specifying the name of dataset:
 edudata download assistment-2009-2010-skill
 ```
 
+To change the storage directory, use the following command:
+```shell
+edudata download assistment-2009-2010-skill $dir
+```
+
 #### Task Specified Tools
 
 ##### Knowledge Tracing