Merge pull request #91 from FengZiYjun/master

Merge Preprocessor into DataSet.
fastnlp · Oct 1, 2018 · 8b6d082 · 8b6d082
2 parents 281b567 + 81790d7
commit 8b6d082
Show file tree

Hide file tree

Showing 52 changed files with 2,249 additions and 67,893 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,7 +5,6 @@ python:
 install:
   - pip install --quiet -r requirements.txt
   - pip install pytest pytest-cov
-  - pip install -U scikit-learn
 # command to run tests
 script:
   - pytest --cov=./

diff --git a/README.md b/README.md
@@ -30,77 +30,36 @@ Run the following commands to install fastNLP package.
 pip install fastNLP
 ```
 
-### Cloning From GitHub
-
-If you just want to use fastNLP, use:
-```shell
-git clone https://github.com/fastnlp/fastNLP
-cd fastNLP
-```
-
-### PyTorch Installation
-
-Visit the [PyTorch official website] for installation instructions based on your system. In general, you could use:
-```shell
-# using conda
-conda install pytorch torchvision -c pytorch
-# or using pip
-pip3 install torch torchvision
-```
-
-### TensorboardX Installation
-
-```shell
-pip3 install tensorboardX
-```
 
 ## Project Structure
 
-```
-FastNLP
-├── docs
-├── fastNLP
-│   ├── core
-│   │   ├── action.py
-│   │   ├── __init__.py
-│   │   ├── loss.py
-│   │   ├── metrics.py
-│   │   ├── optimizer.py
-│   │   ├── predictor.py
-│   │   ├── preprocess.py
-│   │   ├── README.md
-│   │   ├── tester.py
-│   │   └── trainer.py
-│   ├── fastnlp.py
-│   ├── __init__.py
-│   ├── loader
-│   │   ├── base_loader.py
-│   │   ├── config_loader.py
-│   │   ├── dataset_loader.py
-│   │   ├── embed_loader.py
-│   │   ├── __init__.py
-│   │   └── model_loader.py
-│   ├── models
-│   ├── modules
-│   │   ├── aggregation
-│   │   ├── decoder
-│   │   ├── encoder
-│   │   ├── __init__.py
-│   │   ├── interaction
-│   │   ├── other_modules.py
-│   │   └── utils.py
-│   └── saver
-├── LICENSE
-├── README.md
-├── reproduction
-├── requirements.txt
-├── setup.py
-└── test
-    ├── core
-    ├── data_for_tests
-    ├── __init__.py
-    ├── loader
-    ├── modules
-    └── readme_example.py
-
-```
+<table>
+<tr>
+    <td><b> fastNLP </b></td>
+    <td> an open-source NLP library </td>
+</tr>
+<tr>
+    <td><b> fastNLP.core </b></td>
+    <td> trainer, tester, predictor </td>
+</tr>
+<tr>
+    <td><b> fastNLP.loader </b></td>
+    <td> all kinds of loaders/readers </td>
+</tr>
+<tr>
+    <td><b> fastNLP.models </b></td>
+    <td> a collection of NLP models </td>
+</tr>
+<tr>
+    <td><b> fastNLP.modules </b></td>
+    <td> a collection of PyTorch sub-models/components/wheels </td>
+</tr>
+<tr>
+    <td><b> fastNLP.saver </b></td>
+    <td> all kinds of savers/writers </td>
+</tr>
+<tr>
+    <td><b> fastNLP.fastnlp </b></td>
+    <td> a high-level interface for prediction </td>
+</tr>
+</table>
diff --git a/docs/source/user/quickstart.rst b/docs/source/user/quickstart.rst
@@ -18,7 +18,7 @@ pre-processing data, constructing model and training model.
    from fastNLP.modules import aggregation
    from fastNLP.modules import decoder
 
-   from fastNLP.loader.dataset_loader import ClassDatasetLoader
+   from fastNLP.loader.dataset_loader import ClassDataSetLoader
    from fastNLP.loader.preprocess import ClassPreprocess
    from fastNLP.core.trainer import ClassificationTrainer
    from fastNLP.core.inference import ClassificationInfer
@@ -50,7 +50,7 @@ pre-processing data, constructing model and training model.
    train_path = 'test/data_for_tests/text_classify.txt'  # training set file
 
    # load dataset
-   ds_loader = ClassDatasetLoader("train", train_path)
+   ds_loader = ClassDataSetLoader("train", train_path)
    data = ds_loader.load()
 
    # pre-process dataset

diff --git a/examples/readme_example.py b/examples/readme_example.py
@@ -3,7 +3,7 @@
 from fastNLP.core.predictor import ClassificationInfer
 from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.core.trainer import ClassificationTrainer
-from fastNLP.loader.dataset_loader import ClassDatasetLoader
+from fastNLP.loader.dataset_loader import ClassDataSetLoader
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import aggregator
 from fastNLP.modules import decoder
@@ -36,7 +36,7 @@ def forward(self, x):
 train_path = './data_for_tests/text_classify.txt'  # training set file
 
 # load dataset
-ds_loader = ClassDatasetLoader(train_path)
+ds_loader = ClassDataSetLoader()
 data = ds_loader.load()
 
 # pre-process dataset

diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py
@@ -17,7 +17,7 @@ def __init__(self, dataset, batch_size, sampler, use_cuda):
         :param dataset: a DataSet object
         :param batch_size: int, the size of the batch
         :param sampler: a Sampler object
-        :param use_cuda: bool, whetjher to use GPU
+        :param use_cuda: bool, whether to use GPU
 
         """
         self.dataset = dataset
@@ -37,15 +37,12 @@ def __next__(self):
         """
 
         :return batch_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [batch_size, padding_length])
-                         batch_x also contains an item (str: list of int) about origin lengths,
-                         which means ("field_name_origin_len": origin lengths).
                          E.g.
                          ::
                          {'text': tensor([[ 0,  1,  2,  3,  0,  0,  0], [ 4,  5,  2,  6,  7,  8,  9]]), 'text_origin_len': [4, 7]}
 
                 batch_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [batch_size, padding_length])
                 All tensors in both batch_x and batch_y will be cuda tensors if use_cuda is True.
-                The names of fields are defined in preprocessor's convert_to_dataset method.
 
         """
         if self.curidx >= len(self.idx_list):
@@ -54,34 +51,24 @@ def __next__(self):
             endidx = min(self.curidx + self.batch_size, len(self.idx_list))
             padding_length = {field_name: max(field_length[self.curidx: endidx])
                               for field_name, field_length in self.lengths.items()}
-            origin_lengths = {field_name: field_length[self.curidx: endidx]
-                              for field_name, field_length in self.lengths.items()}
-
             batch_x, batch_y = defaultdict(list), defaultdict(list)
+
+            # transform index to tensor and do padding for sequences
             for idx in range(self.curidx, endidx):
                 x, y = self.dataset.to_tensor(idx, padding_length)
                 for name, tensor in x.items():
                     batch_x[name].append(tensor)
                 for name, tensor in y.items():
                     batch_y[name].append(tensor)
 
-            batch_origin_length = {}
-            # combine instances into a batch
+            # combine instances to form a batch
             for batch in (batch_x, batch_y):
                 for name, tensor_list in batch.items():
                     if self.use_cuda:
                         batch[name] = torch.stack(tensor_list, dim=0).cuda()
                     else:
                         batch[name] = torch.stack(tensor_list, dim=0)
 
-            # add origin lengths in batch_x
-            for name, tensor in batch_x.items():
-                if self.use_cuda:
-                    batch_origin_length[name + "_origin_len"] = torch.LongTensor(origin_lengths[name]).cuda()
-                else:
-                    batch_origin_length[name + "_origin_len"] = torch.LongTensor(origin_lengths[name])
-            batch_x.update(batch_origin_length)
-
             self.curidx = endidx
             return batch_x, batch_y