
Merge pull request #4 from tswsxk/master
update
tswsxk authored Nov 13, 2019
2 parents b7adfd6 + 9bf7eae commit 692de7a
Showing 7 changed files with 243 additions and 46 deletions.
90 changes: 66 additions & 24 deletions EduData/DataSet/junyi/KnowledgeTracing.py
@@ -5,12 +5,11 @@
import json

from longling import wf_open
from longling.lib.candylib import as_list
from tqdm import tqdm

from EduData.Tools import train_valid_test


def extract_students_log(source, target, ku_dict):
def _read(source, ku_dict):
"""require big memory to run this function"""

outcome = {
@@ -35,39 +34,82 @@ def extract_students_log(source, target, ku_dict):
students[student][session] = []

students[student][session].append([int(timestamp), exercise, correct])
return students


def _write(students, target):
with wf_open(target) as wf:
for student_id, sessions in tqdm(students.items(), "sorting"):
for student_id, sessions in tqdm(students.items(), "writing -> %s" % target):
for session_id, exercises in sessions.items():
exercises.sort(key=lambda x: x[0])
exercise_response = [(exercise[1], exercise[2]) for exercise in exercises]
print(json.dumps(exercise_response), file=wf)


def select_n_most_frequent_students(source, target, n=1000):
def extract_students_log(source, target, ku_dict):
students = _read(source, ku_dict)
_write(students, target)


def _frequency(students):
frequency = {}
for student_id, sessions in tqdm(students.items(), "calculating frequency"):
frequency[student_id] = sum([len(records) for records in sessions.values()])  # total records across the student's sessions
return sorted(frequency.items(), key=lambda x: x[1], reverse=True)

pass

def get_n_most_frequent_students(students, n=None, frequency=None):
frequency = _frequency(students) if frequency is None else frequency
__frequency = frequency if n is None else frequency[:n]
_students = {}
for _id, _ in __frequency:
_students[_id] = students[_id]
return _students


def select_n_most_frequent_students(source, target_prefix, ku_dict, n):
n_list = as_list(n)
students = _read(source, ku_dict)
frequency = _frequency(students)
for _n in n_list:
_write(get_n_most_frequent_students(students, _n, frequency), target_prefix + "%s" % _n)


if __name__ == '__main__':
root = "../../"
student_log_raw_file = root + "raw_data/junyi/junyi_ProblemLog_for_PSLC.txt"
student_log_file = root + "data/junyi/student_log_kt.json"
root = "../../../"
student_log_raw_file = root + "data/junyi/junyi_ProblemLog_for_PSLC.txt"
# student_log_file = root + "data/junyi/student_log_kt.json"
ku_dict_file = root + "data/junyi/graph_vertex.json"

select_n_most_frequent_students(
student_log_raw_file,
root + "data/junyi/student_log_kt_",
ku_dict_file,
[None]
)

# select_n_most_frequent_students(
# student_log_raw_file,
# root + "data/junyi/student_log_kt_",
# ku_dict_file,
# [100, 200, 300]
# )
# [500, 1000, 2000]

# extract_students_log(student_log_raw_file, student_log_file, ku_dict_file)

student_log_file_small = student_log_file + ".small"

with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
for i, line in tqdm(enumerate(f)):
if i > 50000:
break
print(line, end="", file=wf)

print(train_valid_test(
student_log_file_small,
valid_ratio=0.,
test_ratio=0.2,
root_dir=root + "data/junyi/",
silent=False,
))
# student_log_file_small = student_log_file + ".small"
#
# with open(student_log_file) as f, wf_open(student_log_file_small) as wf:
# for i, line in tqdm(enumerate(f)):
# if i > 50000:
# break
# print(line, end="", file=wf)
#
# print(train_valid_test(
# student_log_file_small,
# valid_ratio=0.,
# test_ratio=0.2,
# root_dir=root + "data/junyi/",
# silent=False,
# ))
31 changes: 25 additions & 6 deletions EduData/DataSet/junyi/junyi.py
@@ -86,19 +86,38 @@ def extract_similarity(source, target, ku_dict):
json.dump(similarity, wf, indent=2)


def extract_difficulty(source, target, ku_dict):
"""
In target: (A, B, v) means A is similar to B with degree v.
If v is small, A and B should not be considered similar.
"""
difficulty = []
with codecs.open(source, encoding="utf-8") as f, open(ku_dict) as kf, wf_open(target) as wf:
f.readline()
ku_dict = json.load(kf)
for line in csv.reader(f):
difficulty.append((ku_dict[line[0]], ku_dict[line[1]], float(line[4])))

logger.info("edges: %s" % len(difficulty))

logger.info(pandas.Series([sim[-1] for sim in difficulty]).describe())
json.dump(difficulty, wf, indent=2)


if __name__ == '__main__':
root = "../../"
raw_file = root + "raw_data/junyi/junyi_Exercise_table.csv"
root = "../../../"
raw_file = root + "data/junyi/junyi_Exercise_table.csv"
ku_dict_file = root + "data/junyi/graph_vertex.json"
prerequisite_file = root + "data/junyi/prerequisite.json"
similarity_raw_files = [
root + "raw_data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
root + "data/junyi/relationship_annotation_{}.csv".format(name) for name in ["testing", "training"]
]
similarity_raw_file = root + "raw_data/junyi/relationship_annotation.csv"
similarity_file = root + "data/junyi/similarity.json"
difficulty_file = root + "data/junyi/difficulty.json"

# merge_relationship_annotation(similarity_raw_files, similarity_raw_file)

# build_ku_dict(raw_file, ku_dict_file)
extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
# extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
# extract_prerequisite(raw_file, prerequisite_file, ku_dict_file)
extract_similarity(similarity_raw_file, similarity_file, ku_dict_file)
# extract_difficulty(similarity_raw_file, difficulty_file, ku_dict_file)
27 changes: 27 additions & 0 deletions EduData/Task/KnowledgeTracing/statistics.py
@@ -0,0 +1,27 @@
# coding: utf-8
# 2019/8/24 @ tongshiwei

__all__ = ["analysis_records"]

from tqdm import tqdm
import json


def analysis_records(source):
ku_set = set()
records_num = 0
seq_count = 0
correct_num = 0
with open(source) as f:
for line in tqdm(f, "doing statistics"):
seq_count += 1
responses = json.loads(line)
records_num += len(responses)
correct_num += len([r[1] for r in responses if int(r[1]) == 1])
ku_set.update(set([_id for _id, _ in responses]))

print("in %s" % source)
print("knowledge units number: %s" % len(ku_set))
print("records number: %s" % records_num)
print("correct records number: %s" % correct_num)
print("the number of sequence: %s" % seq_count)
14 changes: 7 additions & 7 deletions EduData/Tools/train_valid_test.py
@@ -1,12 +1,13 @@
# coding: utf-8
# create by tongshiwei on 2019-7-5

__all__ = ["train_valid_test"]
__all__ = ["train_valid_test", "KFold"]

import io
from longling.ML.toolkit.dataset.splitter import train_valid_test
from longling.ML.toolkit.dataset import train_valid_test
import random
import math
from tqdm import tqdm


def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
@@ -16,19 +17,18 @@ def KFold(filename, train_prefix, valid_prefix, n_splits=5, shuffle=False):
if shuffle is True:
random.shuffle(indices)

proportion = sample_num / n_splits
step = math.floor(proportion * sample_num)
step = math.ceil(sample_num / n_splits)
indices_buckets = [
(i, i + step) for i in range(0, sample_num, step)
]
train_wfs = [
io.open(train_prefix + index, "w", encoding="utf-8") for index in range(n_splits)
io.open(train_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
]
valid_wfs = [
io.open(valid_prefix + index, "w", encoding="utf-8") for index in range(n_splits)
io.open(valid_prefix + str(index), "w", encoding="utf-8") for index in range(n_splits)
]
with open(filename) as f:
for line_no, line in enumerate(f):
for line_no, line in tqdm(enumerate(f), "splitting dataset"):
for idx, (start, end) in enumerate(indices_buckets):
if start <= line_no < end:
print(line, end="", file=valid_wfs[idx])
9 changes: 8 additions & 1 deletion EduData/main.py
@@ -5,6 +5,13 @@

from EduData.DataSet.download_data.download_data import get_data as download, list_resources as ls
from EduData.Task.KnowledgeTracing.format import tl2json, json2tl
from EduData.Task.KnowledgeTracing.statistics import analysis_records as kt_stat
from EduData.Tools.train_valid_test import train_valid_test, KFold as kfold

if __name__ == '__main__':

def cli():
fire.Fire()


if __name__ == '__main__':
cli()
116 changes: 109 additions & 7 deletions README.md
@@ -1,15 +1,117 @@
# EduData
Convenient interface for downloading and preprocessing dataset in education
Convenient interface for downloading and preprocessing datasets in education.

# Tutorial
The datasets include:

## Installation
```bash
python setup.py install
* [KDD Cup 2010](https://pslcdatashop.web.cmu.edu/KDDCup/downloads.jsp)

* [ASSISTments](https://sites.google.com/site/assistmentsdata/)

* [OLI Engineering Statics 2011](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=507)

* [JunyiAcademy Math Practicing Log](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=1198)

* [slepemapy.cz](https://www.fi.muni.cz/adaptivelearning/?a=data)

* [synthetic](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/synthetic)

You can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get most of the datasets mentioned above.

## Tutorial

### Installation
Clone the repository and install it with `pip`:

```shell
pip install -e .
```

## DataSet
### CLI
```shell
edudata $subcommand $parameters1 $parameters2
```

To see the `help` information:
```shell
edudata -- --help
edudata $subcommand --help
```

The CLI tool is built on [fire](https://github.com/google/python-fire).
Refer to the [documentation](https://github.com/google/python-fire/blob/master/docs/using-cli.md) for detailed usage.

#### Download Dataset
Before downloading a dataset, first check which datasets are available:
```shell
edudata ls
```

Download a dataset by specifying its name:
```shell
edudata download assistment-2009-2010-skill
```

#### Task-Specific Tools

##### Knowledge Tracing

###### Format converter
In the Knowledge Tracing task, there is a popular format (we call it the `triple line (tl)` format) for representing interaction sequence records:
```text
5
419,419,419,665,665
1,1,1,0,0
```
which can be found in [Deep Knowledge Tracing](https://github.com/chrispiech/DeepKnowledgeTracing/tree/master/data/assistments).
In this format, every three lines compose one interaction sequence.
The first line gives the length of the interaction sequence,
the second line lists the exercise ids,
and the third line gives the corresponding responses, where each element stands for a correct answer (i.e., 1) or a wrong answer (i.e., 0).


To deal with the issue that some special symbols are hard to store in the format mentioned above,
we offer another format, named `json sequence`, to represent interaction sequence records:
```json
[[419, 1], [419, 1], [419, 1], [665, 0], [665, 0]]
```

Each item in the sequence represents one interaction. The first element of the item is the exercise id
(some works call it a knowledge unit or knowledge item),
and the second one indicates whether the learner answered the exercise correctly: 0 for wrong, 1 for correct.
Each line holds one `json` record, corresponding to one learner's interaction sequence.
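
Each record line can be parsed with the standard `json` module; a quick sketch (the file name below is a placeholder):

```python
import json

# read one learner's interaction sequence per line from a json-sequence file
with open("student_log_kt.json") as f:
    for line in f:
        sequence = json.loads(line)          # e.g. [[419, 1], [419, 1], [665, 0]]
        exercises = [item[0] for item in sequence]
        responses = [item[1] for item in sequence]
```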

We provide tools for converting between the two formats:
```shell
# convert tl sequence to json sequence
edudata tl2json $src $tar
# convert json sequence to tl sequence
edudata json2tl $src $tar
```
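
For illustration, a minimal sketch of what the `tl` → `json sequence` conversion might look like (this is not the packaged implementation, which lives in `EduData.Task.KnowledgeTracing.format`; the function name and file paths are placeholders):

```python
import json


def tl_to_json(src, tar):
    # convert tl-format records (three lines per sequence) into json-sequence records
    with open(src) as f, open(tar, "w") as wf:
        while True:
            length_line = f.readline()
            if not length_line.strip():      # end of file (or trailing blank line)
                break
            # the sequence length on the first line is implied by the next two lines
            exercises = f.readline().strip().split(",")
            responses = f.readline().strip().split(",")
            sequence = [[int(e), int(r)] for e, r in zip(exercises, responses)]
            print(json.dumps(sequence), file=wf)


tl_to_json("assistments_tl.txt", "assistments_kt.json")
```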

###### Dataset Preprocess
CLI tools to quickly convert the "raw" data of a dataset into "mature" data for the knowledge tracing task.
The "mature" data is in `json sequence` format
and can be modeled by [XKT](https://github.com/bigdata-ustc/XKT) and TKT (TBA).

TBA

###### Analysis Dataset
This tool only supports the `json sequence` format. It reports the following statistical indicators of the dataset:

* number of knowledge units
* number of correct records
* number of sequences

```shell
edudata kt_stat $filename
```
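
The same statistics can also be gathered from Python, since the CLI simply forwards to `analysis_records` (shown in the diff above); the file path below is a placeholder:

```python
from EduData.Task.KnowledgeTracing.statistics import analysis_records

# prints the knowledge-unit, record, correct-record and sequence counts for the file
analysis_records("data/junyi/student_log_kt.json")
```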

#### Evaluation
In order to better verify the effectiveness of a model,
the dataset is usually divided into `train/valid/test` splits or evaluated with the `kfold` method.

```shell
edudata longling train_valid_test $filename1 $filename2 -- --train_ratio 0.8 --valid_ratio 0.1 --test_ratio 0.1
longling kfold $filename1 $filename2 -- --n_splits 5
```
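
Both utilities can also be imported directly; a sketch of calling `KFold` from Python (the paths and argument values are illustrative):

```python
from EduData.Tools.train_valid_test import KFold

# split the json-sequence file into 5 folds,
# writing train_0 .. train_4 and valid_0 .. valid_4
KFold(
    "data/junyi/student_log_kt.json",
    train_prefix="data/junyi/train_",
    valid_prefix="data/junyi/valid_",
    n_splits=5,
    shuffle=True,
)
```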
Refer to [longling](https://longling.readthedocs.io/zh/latest/#dataset) for more tools and detailed information.
2 changes: 1 addition & 1 deletion setup.py
@@ -27,7 +27,7 @@
], # And any other dependencies foo needs
entry_points={
"console_scripts": [
"edudata = EduData.main:__main__",
"edudata = EduData.main:cli",
],
},
)
