Commit ba766ce

Merge pull request #5 from tswsxk/master

tests done!

tswsxk authored Nov 19, 2019
2 parents 692de7a + a1ba920
Showing 19 changed files with 302 additions and 228 deletions.
2 changes: 1 addition & 1 deletion EduData/DataSet/download_data/__init__.py
@@ -2,4 +2,4 @@
# create by tongshiwei on 2019-8-16


from .download_data import url_dict
from .download_data import URL_DICT
98 changes: 51 additions & 47 deletions EduData/DataSet/download_data/download_data.py
@@ -1,19 +1,18 @@
# coding: utf-8
# create by tongshiwei on 2019/7/2

__all__ = ["url_dict", "get_data", "list_resources"]
__all__ = ["URL_DICT", "get_data", "list_resources"]

import os
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
from longling import config_logging, LogLevel, path_append

# from longling.spider import download_data

try:
from .utils import decompress, reporthook4urlretrieve
except (SystemError, ModuleNotFoundError):
except (SystemError, ModuleNotFoundError): # pragma: no cover
from utils import decompress, reporthook4urlretrieve

DEFAULT_DATADIR = path_append("./", "", to_str=True)
@@ -22,7 +21,7 @@

prefix = 'http://base.ustc.edu.cn/data/'

url_dict = {
URL_DICT = {
"assistment-2009-2010-skill":
"http://base.ustc.edu.cn/data/ASSISTment/2009_skill_builder_data_corrected.zip",
"assistment-2012-2013-non-skill":
@@ -34,13 +33,17 @@
"KDD-CUP-2010":
"http://base.ustc.edu.cn/data/KDD_Cup_2010/",
"slepemapy.cz":
"http://base.ustc.edu.cn/data/slepemapy.cz/"
"http://base.ustc.edu.cn/data/slepemapy.cz/",
"synthetic":
"http://base.ustc.edu.cn/data/synthetic/",
"ktbd":
"http://base.ustc.edu.cn/data/ktbd/",
}


def get_dataset_name():
def get_dataset_name(): # pragma: no cover
urls = []
for i in url_dict.values():
for i in URL_DICT.values():
if i not in urls:
urls.append(i)
url = prefix
@@ -60,51 +63,53 @@ def get_dataset_name():
# avoid duplicate entries for ASSISTment and junyi
if temp not in ['http://base.ustc.edu.cn/data/ASSISTment/',
'http://base.ustc.edu.cn/data/JunyiAcademy_Math_Practicing_Log/']:
url_dict[h[:-1]] = temp
URL_DICT[h[:-1]] = temp


def download_data(url, data_dir, override):
urls = []
os.makedirs(data_dir, exist_ok=True)
if url.endswith('/'):
# a URL ending with '/' is a directory; anything else is a file
file_path = path_append(data_dir, url.split('/')[-2], to_str=True)
os.makedirs(file_path, exist_ok=True)
def download_file(url, save_path, override):
if os.path.exists(save_path) and override: # pragma: no cover
os.remove(save_path)
logger.info(save_path + ' will be overridden.')

logger.info(url + ' is saved as ' + save_path)
urlretrieve(url, save_path, reporthook=reporthook4urlretrieve)
print()
decompress(save_path)


def download_data(url, data_dir, override, bloom_filter: set = None):
bloom_filter = set() if bloom_filter is None else bloom_filter

if url in bloom_filter: # pragma: no cover
return

if url.endswith("/"): # 以/结尾是文件夹,其余是文件
_data_dir = path_append(data_dir, url.split('/')[-2], to_str=True)

r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "lxml")
al = soup.find_all('a')
for a in al:
# get the file name
# get the link name
h = a.get('href')
if h[0] != '.':
temp = url + h
# avoid duplicates
if temp not in urls:
urls.append(temp)
temp_path = path_append(file_path, h, to_str=True)
logger.info(temp + ' is saved as ' + temp_path)
# download
urlretrieve(temp, temp_path, reporthook=reporthook4urlretrieve)
print()
# decompress
decompress(temp_path)
if override:
os.remove(temp_path)
logger.info(temp_path + ' is deleted.')
url_h = url + h
if url_h not in bloom_filter:
download_data(url_h, _data_dir, override, bloom_filter)
bloom_filter.add(url)

else:
file_path = path_append(data_dir, url.split('/')[-1], to_str=True)
logger.info(url + ' is saved as ' + file_path)
urlretrieve(url, file_path, reporthook=reporthook4urlretrieve)
print()
decompress(file_path)
if override:
os.remove(file_path)
logger.info(file_path + ' is deleted.')
os.makedirs(data_dir, exist_ok=True)
save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
download_file(url, save_path, override)
bloom_filter.add(url)

return data_dir
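
The rewritten download_data recurses into directory listings and threads bloom_filter through every call as a visited set, so a URL reachable from several listings is fetched at most once. A minimal sketch of the guard, reusing the ktbd listing from URL_DICT (network access assumed):

visited = set()
download_data("http://base.ustc.edu.cn/data/ktbd/", "./data/", False, visited)
# the listing URL is now in the visited set, so a repeated call
# returns immediately without touching the network again
download_data("http://base.ustc.edu.cn/data/ktbd/", "./data/", False, visited)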


def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
"""
Parameters
----------
@@ -114,20 +119,19 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
the directory in which the data is stored
override: bool
whether to override existing files
url_dict:
a mapping from dataset name to download URL
Returns
-------
"""
url_dict = URL_DICT if not url_dict else url_dict
try:
return download_data(url_dict[dataset], data_dir, override)
except FileExistsError:
except FileExistsError: # pragma: no cover
return path_append(data_dir, url_dict[dataset].split('/')[-1], to_str=True)


def list_resources():
print("\n".join(url_dict))


if __name__ == '__main__':
list_resources()
get_data("assistment-2009-2010-skill")
print("\n".join(URL_DICT))
10 changes: 5 additions & 5 deletions EduData/DataSet/download_data/utils.py
@@ -7,7 +7,7 @@
from longling import flush_print


def decompress(file):
def decompress(file): # pragma: no cover
for z in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
if file.endswith(z):
if z == ".zip":
@@ -18,14 +18,14 @@ def decompress(file):
un_tar(file)


def get_path(file):
def get_path(file): # pragma: no cover
# return the file name after decompression
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
file = file.replace(i, "")
return file


def un_zip(file):
def un_zip(file): # pragma: no cover
zip_file = zipfile.ZipFile(file)
uz_path = get_path(file)
print(file + " is unzip to " + uz_path)
@@ -34,14 +34,14 @@ def un_zip(file):
zip_file.close()


def un_rar(file):
def un_rar(file): # pragma: no cover
rar_file = rarfile.RarFile(file)
uz_path = get_path(file)
print(file + " is unrar to " + uz_path)
rar_file.extractall(uz_path)


def un_tar(file):
def un_tar(file): # pragma: no cover
tar_file = tarfile.open(file)
uz_path = get_path(file)
print(file + " is untar to " + uz_path)
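
These helpers dispatch purely on the file suffix: decompress picks un_zip, un_rar, or un_tar from the extension, and get_path derives the output path by stripping the archive suffix. A small sketch of the expected behaviour (the file names are made up):

from EduData.DataSet.download_data.utils import decompress, get_path

get_path("data/junyi.tar.gz")    # -> "data/junyi"
decompress("data/junyi.tar.gz")  # ".tar.gz" dispatches to un_tar
decompress("data/ktbd.zip")      # ".zip" dispatches to un_zip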
73 changes: 21 additions & 52 deletions EduData/DataSet/junyi/KnowledgeTracing.py
@@ -1,6 +1,12 @@
# coding: utf-8
# create by tongshiwei on 2019-7-5

"""
This script converts the original junyi dataset into json sequences that can be used in the knowledge tracing (kt) task.
"""

__all__ = ["select_n_most_frequent_students"]

import csv
import json

@@ -9,8 +15,8 @@
from tqdm import tqdm


def _read(source, ku_dict):
"""require big memory to run this function"""
def _read(source: str, ku_dict: str) -> dict:
"""
Read the learners' interaction records and group them by user id and session id.
At the same time, exercise names are converted to ids.
Notes
-----
This function requires a large amount of memory to run.
"""

outcome = {
"INCORRECT": 0,
@@ -26,8 +39,8 @@ def _read(source, ku_dict):
with open(source) as f:
f.readline()
for line in tqdm(csv.reader(f, delimiter='\t'), "reading data"):
student, session, exercise, correct, timestamp = line[0], line[1], ku_dict[line[-5]], \
outcome[line[10]], line[8]
student, session, exercise = line[0], line[1], ku_dict[line[-5]],
correct, timestamp = outcome[line[10]], line[8]
if student not in students:
students[student] = {}
if session not in students[student]:
@@ -46,19 +59,14 @@ def _write(students, target):
print(json.dumps(exercise_response), file=wf)


def extract_students_log(source, target, ku_dict):
students = _read(source, ku_dict)
_write(students, target)


def _frequency(students):
frequency = {}
for student_id, sessions in tqdm(students.items(), "calculating frequency"):
frequency[student_id] = sum([len(session) for session in sessions])
return sorted(frequency.items(), key=lambda x: x[1], reverse=True)


def get_n_most_frequent_students(students, n=None, frequency=None):
def get_n_most_frequent_students(students, n=None, frequency: list = None):
frequency = _frequency(students) if frequency is None else frequency
__frequency = frequency if n is None else frequency[:n]
_students = {}
@@ -67,49 +75,10 @@ def get_n_most_frequent_students(students, n=None, frequency=None):
return _students


def select_n_most_frequent_students(source, target_prefix, ku_dict, n):
def select_n_most_frequent_students(source: str, target_prefix: str, ku_dict_path: str, n: (int, list)):
"""None in n means select all students"""
n_list = as_list(n)
students = _read(source, ku_dict)
students = _read(source, ku_dict_path)
frequency = _frequency(students)
for _n in n_list:
_write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)


if __name__ == '__main__':
root = "../../../"
student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt"
# student_log_file = root + "data/junyi/student_log_kt.json"
ku_dict_file = root + "data/junyi/graph_vertex.json"

select_n_most_frequent_students(
student_log_raw_file,
root + "data/junyi/student_log_kt_",
ku_dict_file,
[None]
)

# select_n_most_frequent_students(
# student_log_raw_file,
# root + "data/junyi/student_log_kt_",
# ku_dict_file,
# [100, 200, 300]
# )
# [500, 1000, 2000]

# extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)

# student_log_file_small = student_log_file + ".small"
#
# with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
# for i, line in tqdm(enumerate(f)):
# if i > 50000:
# break
# print(line, end="", file=wf)
#
# print(train_valid_test(
# student_log_file_small,
# valid_ratio=0.,
# test_ratio=0.2,
# root_dir=root + "data/junyi/",
# silent=False,
# ))
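
With the __main__ driver removed, the module's public entry point is select_n_most_frequent_students; a sketch of an equivalent call, with the example paths taken from the deleted block:

from EduData.DataSet.junyi.KnowledgeTracing import select_n_most_frequent_students

select_n_most_frequent_students(
    source="../../../data/junyi/junyi_ProblemLog_for_PSLC.txt",
    target_prefix="../../../data/junyi/student_log_kt_",
    ku_dict_path="../../../data/junyi/graph_vertex.json",
    n=[None],  # None selects all students; an int selects the top-n most active
)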
4 changes: 4 additions & 0 deletions EduData/DataSet/junyi/README.md
@@ -0,0 +1,4 @@
# Junyi Dataset

For detailed annotations of each file and field, you can download the dataset from our datashop
and see the `README.md` file.
3 changes: 3 additions & 0 deletions EduData/DataSet/junyi/__init__.py
@@ -1,2 +1,5 @@
# coding: utf-8
# create by tongshiwei on 2019-7-5

from .junyi import build_knowledge_graph
from .KnowledgeTracing import select_n_most_frequent_students