Skip to content

Commit 692de7a

Browse files
authored
Merge pull request #4 from tswsxk/master
update
2 parents b7adfd6 + 9bf7eae commit 692de7a

File tree

7 files changed

+243
-46
lines changed

7 files changed

+243
-46
lines changed

EduData/DataSet/junyi/KnowledgeTracing.py

Lines changed: 66 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
import json
66

77
from longling import wf_open
8+
from longling.lib.candylib import as_list
89
from tqdm import tqdm
910

10-
from EduData.Tools import train_valid_test
1111

12-
13-
def extract_students_log(source, target, ku_dict):
12+
def _read(source, ku_dict):
1413
"""require big memory to run this function"""
1514

1615
outcome = {
@@ -35,39 +34,82 @@ def extract_students_log(source, target, ku_dict):
3534
students[student][session] = []
3635

3736
students[student][session].append([int(timestamp), exercise, correct])
37+
return students
38+
3839

40+
def _write(students, target):
3941
with wf_open(target) as wf:
40-
for student_id, sessions in tqdm(students.items(), "sorting"):
42+
for student_id, sessions in tqdm(students.items(), "writing -> %s" % target):
4143
for session_id, exercises in sessions.items():
4244
exercises.sort(key=lambda x: x[0])
4345
exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
4446
print(json.dumps(exercise_response), file=wf)
4547

4648

47-
def select_n_most_frequent_students(source, target, n=1000):
49+
def extract_students_log(source, target, ku_dict):
50+
students = _read(source, ku_dict)
51+
_write(students, target)
52+
53+
54+
def _frequency(students):
55+
frequency = {}
56+
for student_id, sessions in tqdm(students.items(), "calculating frequency"):
57+
frequency[student_id] = sum([len(session) for session in sessions])
58+
return sorted(frequency.items(), key=lambda x: x[1], reverse=True)
4859

49-
pass
60+
61+
def get_n_most_frequent_students(students, n=None, frequency=None):
62+
frequency = _frequency(students) if frequency is None else frequency
63+
__frequency = frequency if n is None else frequency[:n]
64+
_students = {}
65+
for _id, _ in __frequency:
66+
_students[_id] = students[_id]
67+
return _students
68+
69+
70+
def select_n_most_frequent_students(source, target_prefix, ku_dict, n):
71+
n_list = as_list(n)
72+
students = _read(source, ku_dict)
73+
frequency = _frequency(students)
74+
for _n in n_list:
75+
_write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)
5076

5177

5278
if __name__ == '__main__':
53-
root = "../../"
54-
student_log_raw_file = root + "raw_data/junyi/junyi_ProblemLog_for_PSLC.txt"
55-
student_log_file = root + "data/junyi/student_log_kt.json"
79+
root = "../../../"
80+
student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt"
81+
# student_log_file = root + "data/junyi/student_log_kt.json"
5682
ku_dict_file = root + "data/junyi/graph_vertex.json"
83+
84+
select_n_most_frequent_students(
85+
student_log_raw_file,
86+
root + "data/junyi/student_log_kt_",
87+
ku_dict_file,
88+
[None]
89+
)
90+
91+
# select_n_most_frequent_students(
92+
# student_log_raw_file,
93+
# root + "data/junyi/student_log_kt_",
94+
# ku_dict_file,
95+
# [100, 200, 300]
96+
# )
97+
# [500, 1000, 2000]
98+
5799
# extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)
58100

59-
student_log_file_small = student_log_file + ".small"
60-
61-
with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
62-
for i, line in tqdm(enumerate(f)):
63-
if i > 50000:
64-
break
65-
print(line, end="", file=wf)
66-
67-
print(train_valid_test(
68-
student_log_file_small,
69-
valid_ratio=0.,
70-
test_ratio=0.2,
71-
root_dir=root + "data/junyi/",
72-
silent=False,
73-
))
101+
# student_log_file_small = student_log_file + ".small"
102+
#
103+
# with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
104+
# for i, line in tqdm(enumerate(f)):
105+
# if i > 50000:
106+
# break
107+
# print(line, end="", file=wf)
108+
#
109+
# print(train_valid_test(
110+
# student_log_file_small,
111+
# valid_ratio=0.,
112+
# test_ratio=0.2,
113+
# root_dir=root + "data/junyi/",
114+
# silent=False,
115+
# ))

EduData/DataSet/junyi/junyi.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,19 +86,38 @@ def extract_similarity(source, target, ku_dict):
8686
json.dump(similarity, wf, indent=2)
8787

8888

89+
def extract_difficulty(source, target, ku_dict):
90+
"""
91+
In target: (A, B, v) means A is similar with B in v degree.
92+
If v is small, A and B should be considered as not similar.
93+
"""
94+
difficulty = []
95+
with codecs.open(source, encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
96+
f.readline()
97+
ku_dict = json.load(kf)
98+
for line in csv.reader(f):
99+
difficulty.append((ku_dict[line[0]], ku_dict[line[1]], float(line[4])))
100+
101+
logger.info("edges: %s" % len(difficulty))
102+
103+
logger.info(pandas.Series([sim[-1] for sim in difficulty]).describe())
104+
json.dump(difficulty, wf, indent=2)
105+
106+
89107
if __name__ == '__main__':
90-
root = "../../"
91-
raw_file = root + "raw_data/junyi/junyi_Exercise_table.csv"
108+
root = "../../../"
109+
raw_file = root + "data/junyi/junyi_Exercise_table.csv"
92110
ku_dict_file = root + "data/junyi/graph_vertex.json"
93111
prerequisite_file = root + "data/junyi/prerequisite.json"
94112
similarity_raw_files = [
95-
root + "raw_data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
113+
root + "data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
96114
]
97115
similarity_raw_file = root + "raw_data/junyi/relationship_annotation.csv"
98116
similarity_file = root + "data/junyi/similarity.json"
117+
difficulty_file = root + "data/junyi/difficulty.json"
99118

100119
# merge_relationship_annotation(similarity_raw_files, similarity_raw_file)
101-
102120
# build_ku_dict(raw_file, ku_dict_file)
103-
extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
104-
# extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
121+
# extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
122+
extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
123+
# extract_difficulty(similarity_raw_file, difficulty_file, ku_dict_file)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# coding: utf-8
2+
# 2019/8/24 @ tongshiwei
3+
4+
__all__ = ["analysis_records"]
5+
6+
from tqdm import tqdm
7+
import json
8+
9+
10+
def analysis_records(source):
11+
ku_set = set()
12+
records_num = 0
13+
seq_count = 0
14+
correct_num = 0
15+
with open(source) as f:
16+
for line in tqdm(f, "doing statistics"):
17+
seq_count += 1
18+
responses = json.loads(line)
19+
records_num += len(responses)
20+
correct_num += len([r[1] for r in responses if int(r[1]) == 1])
21+
ku_set.update(set([_id for _id, _ in responses]))
22+
23+
print("in %s" % source)
24+
print("knowledge units number: %s" % len(ku_set))
25+
print("records number: %s" % records_num)
26+
print("correct records number: %s" % correct_num)
27+
print("the number of sequence: %s" % seq_count)

EduData/Tools/train_valid_test.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
# coding: utf-8
22
# create by tongshiwei on 2019-7-5
33

4-
__all__ = ["train_valid_test"]
4+
__all__ = ["train_valid_test", "KFold"]
55

66
import io
7-
from longling.ML.toolkit.dataset.splitter import train_valid_test
7+
from longling.ML.toolkit.dataset import train_valid_test
88
import random
99
import math
10+
from tqdm import tqdm
1011

1112

1213
def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
@@ -16,19 +17,18 @@ def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
1617
if shuffle is True:
1718
random.shuffle(indices)
1819

19-
proportion = sample_num / n_splits
20-
step = math.floor(proportion * sample_num)
20+
step = math.ceil(sample_num / n_splits)
2121
indices_buckets = [
2222
(i, i + step) for i in range(0, sample_num, step)
2323
]
2424
train_wfs = [
25-
io.open(train_prefix + index, "w", encoding="utf-8") for index in range(n_splits)
25+
io.open(train_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
2626
]
2727
valid_wfs = [
28-
io.open(valid_prefix + index, "w", encoding="utf-8") for index in range(n_splits)
28+
io.open(valid_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
2929
]
3030
with open(filename) as f:
31-
for line_no, line in enumerate(f):
31+
for line_no, line in tqdm(enumerate(f), "splitting dataset"):
3232
for idx, (start, end) in enumerate(indices_buckets):
3333
if start <= line_no < end:
3434
print(line, end="", file=valid_wfs[idx])

EduData/main.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55

66
from EduData.DataSet.download_data.download_data import get_data as download, list_resources as ls
77
from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
8+
from EduData.Task.KnowledgeTracing.statistics import analysis_records as kt_stat
9+
from EduData.Tools.train_valid_test import train_valid_test, KFold as kfold
810

9-
if __name__ == '__main__':
11+
12+
def cli():
1013
fire.Fire()
14+
15+
16+
if __name__ == '__main__':
17+
cli()

README.md

Lines changed: 109 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,117 @@
11
# EduData
2-
Convenient interface for downloading and preprocessing dataset in education
2+
Convenient interface for downloading and preprocessing dataset in education.
33

4-
# Tutorial
4+
The dataset includes:
55

6-
## Installation
7-
```bash
8-
python setup.py install
6+
* [KDD Cup 2010](https://pslcdatashop.web.cmu.edu/KDDCup/downloads.jsp)
7+
8+
* [ASSISTments](https://sites.google.com/site/assistmentsdata/)
9+
10+
* [OLI Engineering Statics 2011](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=507)
11+
12+
* [JunyiAcademy Math Practicing Log](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=1198)
13+
14+
* [slepemapy.cz](https://www.fi.muni.cz/adaptivelearning/?a=data)
15+
16+
* [synthetic](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/synthetic)
17+
18+
You can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get most of the datasets mentioned above.
19+
20+
## Tutorial
21+
22+
### Installation
23+
Get the source code and install it with `pip`:
24+
25+
```shell
26+
pip install -e .
927
```
1028

11-
## DataSet
29+
### CLI
30+
```shell
31+
edudata $subcommand $parameters1 $parameters2
32+
```
33+
34+
To see the `help` information:
35+
```shell
36+
edudata -- --help
37+
edudata $subcommand --help
38+
```
39+
40+
The CLI tool is built on [fire](https://github.com/google/python-fire).
41+
Refer to the [documentation](https://github.com/google/python-fire/blob/master/docs/using-cli.md) for detailed usage.
42+
43+
#### Download Dataset
44+
Before downloading a dataset, first check which datasets are available:
45+
```shell
46+
edudata ls
47+
```
48+
49+
Download the dataset by specifying the name of dataset:
50+
```shell
51+
edudata download assistment-2009-2010-skill
52+
```
53+
54+
#### Task Specified Tools
55+
56+
##### Knowledge Tracing
57+
58+
###### Format converter
59+
In Knowledge Tracing task, there is a popular format (we named it `triple line (tl)` format) to represent the interaction sequence records:
60+
```text
61+
5
62+
419,419,419,665,665
63+
1,1,1,0,0
64+
```
65+
which can be found in [Deep Knowledge Tracing](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/assistments).
66+
In this format, every three lines make up one interaction sequence.
67+
The first line indicates the length of the interaction sequence,
68+
and the second line lists the exercise ids, followed by the third line,
69+
where each element stands for a correct answer (i.e., 1) or a wrong answer (i.e., 0)
70+
71+
72+
In order to deal with the issue that some special symbols are hard to store in the above format,
73+
we offer another format, named `json sequence`, to represent the interaction sequence records:
74+
```json
75+
[[419, 1], [419, 1], [419, 1], [665, 0], [665, 0]]
76+
```
77+
78+
Each item in the sequence represents one interaction. The first element of the item is the exercise
79+
(some works call it knowledge unit or knowledge item) id
80+
and the second one indicates whether the learner answered the exercise correctly: 0 for wrong and 1 for correct.
81+
Each line holds one `json` record, corresponding to one learner's interaction sequence.
82+
83+
We provide tools for converting between the two formats:
84+
```shell
85+
# convert tl sequence to json sequence
86+
edudata tl2json $src $tar
87+
# convert json sequence to tl sequence
88+
edudata json2tl $src $tar
89+
```
90+
91+
###### Dataset Preprocess
92+
The CLI tools quickly convert the "raw" data of the dataset into "mature" data for the knowledge tracing task.
93+
The "mature" data is in `json sequence` format
94+
and can be modeled by [XKT](https://github.com/bigdata-ustc/XKT) and TKT(TBA)
95+
96+
TBA
97+
98+
###### Analysis Dataset
99+
This tool only supports the `json sequence` format. It reports the following statistical measures of the dataset:
100+
101+
* knowledge units number
102+
* correct records number
103+
* the number of sequence
104+
105+
```shell
106+
edudata kt_stat $filename
107+
```
12108

13-
```bash
109+
#### Evaluation
110+
In order to better verify the effectiveness of a model,
111+
the dataset is usually divided into `train/valid/test` sets, or split with the `kfold` method.
14112

113+
```shell
114+
edudata longling train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1
115+
longling kfold $filename1 $filename2 -- --n_splits 5
15116
```
117+
Refer to [longling](https://longling.readthedocs.io/zh/latest/#dataset) for more tools and detailed information.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
], # And any other dependencies foo needs
2828
entry_points={
2929
"console_scripts": [
30-
"edudata = EduData.main:__main__",
30+
"edudata = EduData.main:cli",
3131
],
3232
},
3333
)

0 commit comments

Comments
 (0)