Commit ba766ce

Merge pull request #5 from tswsxk/master

tests done!

tswsxk authored Nov 19, 2019
2 parents 692de7a + a1ba920
Showing 19 changed files with 302 additions and 228 deletions.
2 changes: 1 addition & 1 deletion EduData/DataSet/download_data/__init__.py
@@ -2,4 +2,4 @@
# create by tongshiwei on 2019-8-16


from .download_data import url_dict
from .download_data import URL_DICT
98 changes: 51 additions & 47 deletions EduData/DataSet/download_data/download_data.py
@@ -1,19 +1,18 @@
# coding: utf-8
# create by tongshiwei on 2019/7/2

__all__ = ["url_dict", "get_data", "list_resources"]
__all__ = ["URL_DICT", "get_data", "list_resources"]

import os
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
from longling import config_logging, LogLevel, path_append

# from longling.spider import download_data

try:
from .utils import decompress, reporthook4urlretrieve
except (SystemError, ModuleNotFoundError):
except (SystemError, ModuleNotFoundError): # pragma: no cover
from utils import decompress, reporthook4urlretrieve

DEFAULT_DATADIR = path_append("./", "", to_str=True)
@@ -22,7 +21,7 @@

prefix = 'http://base.ustc.edu.cn/data/'

url_dict = {
URL_DICT = {
"assistment-2009-2010-skill":
"http://base.ustc.edu.cn/data/ASSISTment/2009_skill_builder_data_corrected.zip",
"assistment-2012-2013-non-skill":
@@ -34,13 +33,17 @@
"KDD-CUP-2010":
"http://base.ustc.edu.cn/data/KDD_Cup_2010/",
"slepemapy.cz":
"http://base.ustc.edu.cn/data/slepemapy.cz/"
"http://base.ustc.edu.cn/data/slepemapy.cz/",
"synthetic":
"http://base.ustc.edu.cn/data/synthetic/",
"ktbd":
"http://base.ustc.edu.cn/data/ktbd/",
}


def get_dataset_name():
def get_dataset_name(): # pragma: no cover
urls = []
for i in url_dict.values():
for i in URL_DICT.values():
if i not in urls:
urls.append(i)
url = prefix
@@ -60,51 +63,53 @@ def get_dataset_name():
# avoid duplicate entries for ASSISTment and junyi
if temp not in ['http://base.ustc.edu.cn/data/ASSISTment/',
'http://base.ustc.edu.cn/data/JunyiAcademy_Math_Practicing_Log/']:
url_dict[h[:-1]] = temp
URL_DICT[h[:-1]] = temp


def download_data(url, data_dir, override):
urls = []
os.makedirs(data_dir, exist_ok=True)
if url.endswith('/'):
# a URL ending with '/' is a directory; anything else is a file
file_path = path_append(data_dir, url.split('/')[-2], to_str=True)
os.makedirs(file_path, exist_ok=True)
def download_file(url, save_path, override):
if os.path.exists(save_path) and override: # pragma: no cover
os.remove(save_path)
logger.info(save_path + ' will be overridden.')

logger.info(url + ' is saved as ' + save_path)
urlretrieve(url, save_path, reporthook=reporthook4urlretrieve)
print()
decompress(save_path)


def download_data(url, data_dir, override, bloom_filter: set = None):
bloom_filter = set() if bloom_filter is None else bloom_filter

if url in bloom_filter: # pragma: no cover
return

if url.endswith("/"): # 以/结尾是文件夹,其余是文件
_data_dir = path_append(data_dir, url.split('/')[-2], to_str=True)

r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "lxml")
al = soup.find_all('a')
for a in al:
# get the file name
# get the link name
h = a.get('href')
if h[0] != '.':
temp = url + h
# avoid duplicates
if temp not in urls:
urls.append(temp)
temp_path = path_append(file_path, h, to_str=True)
logger.info(temp + ' is saved as ' + temp_path)
# download
urlretrieve(temp, temp_path, reporthook=reporthook4urlretrieve)
print()
# decompress
decompress(temp_path)
if override:
os.remove(temp_path)
logger.info(temp_path + ' is deleted.')
url_h = url + h
if url_h not in bloom_filter:
download_data(url_h, _data_dir, override, bloom_filter)
bloom_filter.add(url)

else:
file_path = path_append(data_dir, url.split('/')[-1], to_str=True)
logger.info(url + ' is saved as ' + file_path)
urlretrieve(url, file_path, reporthook=reporthook4urlretrieve)
print()
decompress(file_path)
if override:
os.remove(file_path)
logger.info(file_path + ' is deleted.')
os.makedirs(data_dir, exist_ok=True)
save_path = path_append(data_dir, url.split('/')[-1], to_str=True)
download_file(url, save_path, override)
bloom_filter.add(url)

return data_dir
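
The rewritten download_data recurses into directory listings and threads bloom_filter through every call as a visited set, so a URL reachable from several listings is fetched at most once. A minimal sketch of the guard, reusing the ktbd listing from URL_DICT (network access assumed):

visited = set()
download_data("http://base.ustc.edu.cn/data/ktbd/", "./data/", False, visited)
# the listing URL is now in the visited set, so a repeated call
# returns immediately without touching the network again
download_data("http://base.ustc.edu.cn/data/ktbd/", "./data/", False, visited)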


def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False, url_dict: dict = None):
"""
Parameters
----------
@@ -114,20 +119,19 @@ def get_data(dataset, data_dir=DEFAULT_DATADIR, override=False):
the directory in which the data is stored
override: bool
whether to override existing files
url_dict:
a mapping from dataset name to download URL
Returns
-------
"""
url_dict = URL_DICT if not url_dict else url_dict
try:
return download_data(url_dict[dataset], data_dir, override)
except FileExistsError:
except FileExistsError: # pragma: no cover
return path_append(data_dir, url_dict[dataset].split('/')[-1], to_str=True)


def list_resources():
print("\n".join(url_dict))


if __name__ == '__main__':
list_resources()
get_data("assistment-2009-2010-skill")
print("\n".join(URL_DICT))
10 changes: 5 additions & 5 deletions EduData/DataSet/download_data/utils.py
@@ -7,7 +7,7 @@
from longling import flush_print


def decompress(file):
def decompress(file): # pragma: no cover
for z in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
if file.endswith(z):
if z == ".zip":
@@ -18,14 +18,14 @@ def decompress(file):
un_tar(file)


def get_path(file):
def get_path(file): # pragma: no cover
# return the file name after decompression
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
file = file.replace(i, "")
return file


def un_zip(file):
def un_zip(file): # pragma: no cover
zip_file = zipfile.ZipFile(file)
uz_path = get_path(file)
print(file + " is unzip to " + uz_path)
@@ -34,14 +34,14 @@ def un_zip(file):
zip_file.close()


def un_rar(file):
def un_rar(file): # pragma: no cover
rar_file = rarfile.RarFile(file)
uz_path = get_path(file)
print(file + " is unrar to " + uz_path)
rar_file.extractall(uz_path)


def un_tar(file):
def un_tar(file): # pragma: no cover
tar_file = tarfile.open(file)
uz_path = get_path(file)
print(file + " is untar to " + uz_path)
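
These helpers dispatch purely on the file suffix: decompress picks un_zip, un_rar, or un_tar from the extension, and get_path derives the output path by stripping the archive suffix. A small sketch of the expected behaviour (the file names are made up):

from EduData.DataSet.download_data.utils import decompress, get_path

get_path("data/junyi.tar.gz")    # -> "data/junyi"
decompress("data/junyi.tar.gz")  # ".tar.gz" dispatches to un_tar
decompress("data/ktbd.zip")      # ".zip" dispatches to un_zip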
73 changes: 21 additions & 52 deletions EduData/DataSet/junyi/KnowledgeTracing.py
@@ -1,6 +1,12 @@
# coding: utf-8
# create by tongshiwei on 2019-7-5

"""
This script converts the original junyi dataset into json sequences that can be used in the knowledge tracing (kt) task.
"""

__all__ = ["select_n_most_frequent_students"]

import csv
import json

@@ -9,8 +15,8 @@
from tqdm import tqdm


def _read(source, ku_dict):
"""require big memory to run this function"""
def _read(source: str, ku_dict: str) -> dict:
"""
Read the learners' interaction records and group them by user id and session id.
At the same time, exercise names are converted to ids.
Notes
-----
This function requires a large amount of memory to run.
"""

outcome = {
"INCORRECT": 0,
@@ -26,8 +39,8 @@ def _read(source, ku_dict):
with open(source) as f:
f.readline()
for line in tqdm(csv.reader(f, delimiter='\t'), "reading data"):
student, session, exercise, correct, timestamp = line[0], line[1], ku_dict[line[-5]], \
outcome[line[10]], line[8]
student, session, exercise = line[0], line[1], ku_dict[line[-5]],
correct, timestamp = outcome[line[10]], line[8]
if student not in students:
students[student] = {}
if session not in students[student]:
@@ -46,19 +59,14 @@ def _write(students, target):
print(json.dumps(exercise_response), file=wf)


def extract_students_log(source, target, ku_dict):
students = _read(source, ku_dict)
_write(students, target)


def _frequency(students):
frequency = {}
for student_id, sessions in tqdm(students.items(), "calculating frequency"):
frequency[student_id] = sum([len(session) for session in sessions])
return sorted(frequency.items(), key=lambda x: x[1], reverse=True)


def get_n_most_frequent_students(students, n=None, frequency=None):
def get_n_most_frequent_students(students, n=None, frequency: list = None):
frequency = _frequency(students) if frequency is None else frequency
__frequency = frequency if n is None else frequency[:n]
_students = {}
@@ -67,49 +75,10 @@ def get_n_most_frequent_students(students, n=None, frequency=None):
return _students


def select_n_most_frequent_students(source, target_prefix, ku_dict, n):
def select_n_most_frequent_students(source: str, target_prefix: str, ku_dict_path: str, n: (int, list)):
"""None in n means select all students"""
n_list = as_list(n)
students = _read(source, ku_dict)
students = _read(source, ku_dict_path)
frequency = _frequency(students)
for _n in n_list:
_write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)


if __name__ == '__main__':
root = "../../../"
student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt"
# student_log_file = root + "data/junyi/student_log_kt.json"
ku_dict_file = root + "data/junyi/graph_vertex.json"

select_n_most_frequent_students(
student_log_raw_file,
root + "data/junyi/student_log_kt_",
ku_dict_file,
[None]
)

# select_n_most_frequent_students(
# student_log_raw_file,
# root + "data/junyi/student_log_kt_",
# ku_dict_file,
# [100, 200, 300]
# )
# [500, 1000, 2000]

# extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)

# student_log_file_small = student_log_file + ".small"
#
# with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
# for i, line in tqdm(enumerate(f)):
# if i > 50000:
# break
# print(line, end="", file=wf)
#
# print(train_valid_test(
# student_log_file_small,
# valid_ratio=0.,
# test_ratio=0.2,
# root_dir=root + "data/junyi/",
# silent=False,
# ))
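
With the __main__ driver removed, the module's public entry point is select_n_most_frequent_students; a sketch of an equivalent call, with the example paths taken from the deleted block:

from EduData.DataSet.junyi.KnowledgeTracing import select_n_most_frequent_students

select_n_most_frequent_students(
    source="../../../data/junyi/junyi_ProblemLog_for_PSLC.txt",
    target_prefix="../../../data/junyi/student_log_kt_",
    ku_dict_path="../../../data/junyi/graph_vertex.json",
    n=[None],  # None selects all students; an int selects the top-n most active
)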
4 changes: 4 additions & 0 deletions EduData/DataSet/junyi/README.md
@@ -0,0 +1,4 @@
# Junyi Dataset

For detailed annotations of each file and field, you can download the dataset from our datashop
and see the `README.md` file.
3 changes: 3 additions & 0 deletions EduData/DataSet/junyi/__init__.py
@@ -1,2 +1,5 @@
# coding: utf-8
# create by tongshiwei on 2019-7-5

from .junyi import build_knowledge_graph
from .KnowledgeTracing import select_n_most_frequent_students