
Merge pull request #4 from tswsxk/master
update
tswsxk authored Nov 13, 2019
2 parents b7adfd6 + 9bf7eae commit 692de7a
Showing 7 changed files with 243 additions and 46 deletions.
90 changes: 66 additions & 24 deletions EduData/DataSet/junyi/KnowledgeTracing.py
@@ -5,12 +5,11 @@
import json

from longling import wf_open
from longling.lib.candylib import as_list
from tqdm import tqdm

from EduData.Tools import train_valid_test


def extract_students_log(source, target, ku_dict):
def _read(source, ku_dict):
"""require big memory to run this function"""

outcome = {
@@ -35,39 +34,82 @@ def extract_students_log(source, target, ku_dict):
students[student][session] = []

students[student][session].append([int(timestamp), exercise, correct])
return students


def _write(students, target):
with wf_open(target) as wf:
for student_id, sessions in tqdm(students.items(), "sorting"):
for student_id, sessions in tqdm(students.items(), "writing -> %s" % target):
for session_id, exercises in sessions.items():
exercises.sort(key=lambda x: x[0])
exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
print(json.dumps(exercise_response), file=wf)


def select_n_most_frequent_students(source, target, n=1000):
def extract_students_log(source, target, ku_dict):
students = _read(source, ku_dict)
_write(students, target)


def _frequency(students):
frequency = {}
for student_id, sessions in tqdm(students.items(), "calculating frequency"):
frequency[student_id] = sum([len(records) for records in sessions.values()])  # total records across the student's sessions
return sorted(frequency.items(), key=lambda x: x[1], reverse=True)

pass

def get_n_most_frequent_students(students, n=None, frequency=None):
frequency = _frequency(students) if frequency is None else frequency
__frequency = frequency if n is None else frequency[:n]
_students = {}
for _id, _ in __frequency:
_students[_id] = students[_id]
return _students


def select_n_most_frequent_students(source, target_prefix, ku_dict, n):
n_list = as_list(n)
students = _read(source, ku_dict)
frequency = _frequency(students)
for _n in n_list:
_write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)


if __name__ == '__main__':
root = "../../"
student_log_raw_file = root + "raw_data/junyi/junyi_ProblemLog_for_PSLC.txt"
student_log_file = root + "data/junyi/student_log_kt.json"
root = "../../../"
student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt"
# student_log_file = root + "data/junyi/student_log_kt.json"
ku_dict_file = root + "data/junyi/graph_vertex.json"

select_n_most_frequent_students(
student_log_raw_file,
root + "data/junyi/student_log_kt_",
ku_dict_file,
[None]
)

# select_n_most_frequent_students(
# student_log_raw_file,
# root + "data/junyi/student_log_kt_",
# ku_dict_file,
# [100, 200, 300]
# )
# [500, 1000, 2000]

# extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)

student_log_file_small = student_log_file + ".small"

with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
for i, line in tqdm(enumerate(f)):
if i > 50000:
break
print(line, end="", file=wf)

print(train_valid_test(
student_log_file_small,
valid_ratio=0.,
test_ratio=0.2,
root_dir=root + "data/junyi/",
silent=False,
))
# student_log_file_small = student_log_file + ".small"
#
# with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
# for i, line in tqdm(enumerate(f)):
# if i > 50000:
# break
# print(line, end="", file=wf)
#
# print(train_valid_test(
# student_log_file_small,
# valid_ratio=0.,
# test_ratio=0.2,
# root_dir=root + "data/junyi/",
# silent=False,
# ))
31 changes: 25 additions & 6 deletions EduData/DataSet/junyi/junyi.py
@@ -86,19 +86,38 @@ def extract_similarity(source, target, ku_dict):
json.dump(similarity, wf, indent=2)


def extract_difficulty(source, target, ku_dict):
"""
In target: (A, B, v) means A is similar to B with degree v.
If v is small, A and B should not be considered similar.
"""
difficulty = []
with codecs.open(source, encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
f.readline()
ku_dict = json.load(kf)
for line in csv.reader(f):
difficulty.append((ku_dict[line[0]], ku_dict[line[1]], float(line[4])))

logger.info("edges: %s" % len(difficulty))

logger.info(pandas.Series([sim[-1] for sim in difficulty]).describe())
json.dump(difficulty, wf, indent=2)


if __name__ == '__main__':
root = "../../"
raw_file = root + "raw_data/junyi/junyi_Exercise_table.csv"
root = "../../../"
raw_file = root + "data/junyi/junyi_Exercise_table.csv"
ku_dict_file = root + "data/junyi/graph_vertex.json"
prerequisite_file = root + "data/junyi/prerequisite.json"
similarity_raw_files = [
root + "raw_data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
root + "data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
]
similarity_raw_file = root + "raw_data/junyi/relationship_annotation.csv"
similarity_file = root + "data/junyi/similarity.json"
difficulty_file = root + "data/junyi/difficulty.json"

# merge_relationship_annotation(similarity_raw_files, similarity_raw_file)

# build_ku_dict(raw_file, ku_dict_file)
extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
# extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
# extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
# extract_difficulty(similarity_raw_file, difficulty_file, ku_dict_file)
27 changes: 27 additions & 0 deletions EduData/Task/KnowledgeTracing/statistics.py
@@ -0,0 +1,27 @@
# coding: utf-8
# 2019/8/24 @ tongshiwei

__all__ = ["analysis_records"]

from tqdm import tqdm
import json


def analysis_records(source):
ku_set = set()
records_num = 0
seq_count = 0
correct_num = 0
with open(source) as f:
for line in tqdm(f, "doing statistics"):
seq_count += 1
responses = json.loads(line)
records_num += len(responses)
correct_num += len([r[1] for r in responses if int(r[1]) == 1])
ku_set.update(set([_id for _id, _ in responses]))

print("in %s" % source)
print("knowledge units number: %s" % len(ku_set))
print("records number: %s" % records_num)
print("correct records number: %s" % correct_num)
print("the number of sequence: %s" % seq_count)
14 changes: 7 additions & 7 deletions EduData/Tools/train_valid_test.py
@@ -1,12 +1,13 @@
# coding: utf-8
# create by tongshiwei on 2019-7-5

__all__ = ["train_valid_test"]
__all__ = ["train_valid_test", "KFold"]

import io
from longling.ML.toolkit.dataset.splitter import train_valid_test
from longling.ML.toolkit.dataset import train_valid_test
import random
import math
from tqdm import tqdm


def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
@@ -16,19 +17,18 @@ def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
if shuffle is True:
random.shuffle(indices)

proportion = sample_num / n_splits
step = math.floor(proportion * sample_num)
step = math.ceil(sample_num / n_splits)
indices_buckets = [
(i, i + step) for i in range(0, sample_num, step)
]
train_wfs = [
io.open(train_prefix + index, "w", encoding="utf-8") for index in range(n_splits)
io.open(train_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
]
valid_wfs = [
io.open(valid_prefix + index, "w", encoding="utf-8") for index in range(n_splits)
io.open(valid_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
]
with open(filename) as f:
for line_no, line in enumerate(f):
for line_no, line in tqdm(enumerate(f), "splitting dataset"):
for idx, (start, end) in enumerate(indices_buckets):
if start <= line_no < end:
print(line, end="", file=valid_wfs[idx])
9 changes: 8 additions & 1 deletion EduData/main.py
@@ -5,6 +5,13 @@

from EduData.DataSet.download_data.download_data import get_data as download, list_resources as ls
from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
from EduData.Task.KnowledgeTracing.statistics import analysis_records as kt_stat
from EduData.Tools.train_valid_test import train_valid_test, KFold as kfold

if __name__ == '__main__':

def cli():
fire.Fire()


if __name__ == '__main__':
cli()
116 changes: 109 additions & 7 deletions README.md
@@ -1,15 +1,117 @@
# EduData
Convenient interface for downloading and preprocessing dataset in education
Convenient interface for downloading and preprocessing datasets in education.

# Tutorial
The datasets include:

## Installation
```bash
python setup.py install
* [KDD Cup 2010](https://pslcdatashop.web.cmu.edu/KDDCup/downloads.jsp)

* [ASSISTments](https://sites.google.com/site/assistmentsdata/)

* [OLI Engineering Statics 2011](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=507)

* [JunyiAcademy Math Practicing Log](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=1198)

* [slepemapy.cz](https://www.fi.muni.cz/adaptivelearning/?a=data)

* [synthetic](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/synthetic)

You can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get most of the datasets mentioned above.

## Tutorial

### Installation
Clone the repository and install it with `pip`:

```shell
pip install -e .
```

## DataSet
### CLI
```shell
edudata $subcommand $parameters1 $parameters2
```

To see the `help` information:
```shell
edudata -- --help
edudata $subcommand --help
```

The CLI tool is built on [fire](https://github.com/google/python-fire).
Refer to the [documentation](https://github.com/google/python-fire/blob/master/docs/using-cli.md) for detailed usage.

#### Download Dataset
Before downloading a dataset, first check which datasets are available:
```shell
edudata ls
```

Download a dataset by specifying its name:
```shell
edudata download assistment-2009-2010-skill
```

#### Task-Specific Tools

##### Knowledge Tracing

###### Format converter
In the Knowledge Tracing task, there is a popular format (we call it the `triple line (tl)` format) for representing interaction sequence records:
```text
5
419,419,419,665,665
1,1,1,0,0
```
which can be found in [Deep Knowledge Tracing](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/assistments).
In this format, every three lines compose one interaction sequence.
The first line gives the length of the interaction sequence,
the second line lists the exercise ids,
and the third line gives the corresponding responses, where each element stands for a correct answer (i.e., 1) or a wrong answer (i.e., 0).


To deal with the issue that some special symbols are hard to store in the format mentioned above,
we offer another format, named `json sequence`, to represent interaction sequence records:
```json
[[419, 1], [419, 1], [419, 1], [665, 0], [665, 0]]
```

Each item in the sequence represents one interaction. The first element of the item is the exercise id
(some works call it a knowledge unit or knowledge item),
and the second one indicates whether the learner answered the exercise correctly: 0 for wrong, 1 for correct.
Each line holds one `json` record, corresponding to one learner's interaction sequence.
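
Each record line can be parsed with the standard `json` module; a quick sketch (the file name below is a placeholder):

```python
import json

# read one learner's interaction sequence per line from a json-sequence file
with open("student_log_kt.json") as f:
    for line in f:
        sequence = json.loads(line)          # e.g. [[419, 1], [419, 1], [665, 0]]
        exercises = [item[0] for item in sequence]
        responses = [item[1] for item in sequence]
```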

We provide tools for converting between the two formats:
```shell
# convert tl sequence to json sequence
edudata tl2json $src $tar
# convert json sequence to tl sequence
edudata json2tl $src $tar
```
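
For illustration, a minimal sketch of what the `tl` → `json sequence` conversion might look like (this is not the packaged implementation, which lives in `EduData.Task.KnowledgeTracing.format`; the function name and file paths are placeholders):

```python
import json


def tl_to_json(src, tar):
    # convert tl-format records (three lines per sequence) into json-sequence records
    with open(src) as f, open(tar, "w") as wf:
        while True:
            length_line = f.readline()
            if not length_line.strip():      # end of file (or trailing blank line)
                break
            # the sequence length on the first line is implied by the next two lines
            exercises = f.readline().strip().split(",")
            responses = f.readline().strip().split(",")
            sequence = [[int(e), int(r)] for e, r in zip(exercises, responses)]
            print(json.dumps(sequence), file=wf)


tl_to_json("assistments_tl.txt", "assistments_kt.json")
```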

###### Dataset Preprocess
CLI tools to quickly convert the "raw" data of a dataset into "mature" data for the knowledge tracing task.
The "mature" data is in `json sequence` format
and can be modeled by [XKT](https://github.com/bigdata-ustc/XKT) and TKT (TBA).

TBA

###### Analysis Dataset
This tool only supports the `json sequence` format. It reports the following statistical indicators of the dataset:

* number of knowledge units
* number of correct records
* number of sequences

```shell
edudata kt_stat $filename
```
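
The same statistics can also be gathered from Python, since the CLI simply forwards to `analysis_records` (shown in the diff above); the file path below is a placeholder:

```python
from EduData.Task.KnowledgeTracing.statistics import analysis_records

# prints the knowledge-unit, record, correct-record and sequence counts for the file
analysis_records("data/junyi/student_log_kt.json")
```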

#### Evaluation
In order to better verify the effectiveness of a model,
the dataset is usually divided into `train/valid/test` splits or evaluated with the `kfold` method.

```shell
edudata longling train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1
longling kfold $filename1 $filename2 -- --n_splits 5
```
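
Both utilities can also be imported directly; a sketch of calling `KFold` from Python (the paths and argument values are illustrative):

```python
from EduData.Tools.train_valid_test import KFold

# split the json-sequence file into 5 folds,
# writing train_0 .. train_4 and valid_0 .. valid_4
KFold(
    "data/junyi/student_log_kt.json",
    train_prefix="data/junyi/train_",
    valid_prefix="data/junyi/valid_",
    n_splits=5,
    shuffle=True,
)
```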
Refer to [longling](https://longling.readthedocs.io/zh/latest/#dataset) for more tools and detailed information.
2 changes: 1 addition & 1 deletion setup.py
@@ -27,7 +27,7 @@
], # And any other dependencies foo needs
entry_points={
"console_scripts": [
"edudata = EduData.main:__main__",
"edudata = EduData.main:cli",
],
},
)
