Commit
Merge pull request #132 from FengZiYjun/v0.3.1
fastNLP V0.3.1
FengZiYjun authored Feb 6, 2019
2 parents 3fa95b6 + b66d7b8 commit 13faa2b
Showing 70 changed files with 3,957 additions and 4,832 deletions.
5 changes: 5 additions & 0 deletions codecov.yml
@@ -0,0 +1,5 @@
ignore:
- "reproduction" # ignore folders and all its contents
- "setup.py"
- "docs"
- "tutorials"
5 changes: 3 additions & 2 deletions docs/source/tutorials/fastnlp_10tmin_tutorial.rst
@@ -1,7 +1,8 @@

fastNLP Hands-on Tutorial
fastNLP 10-Minute Tutorial
==========================

The original tutorial is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_10min_tutorial.ipynb

fastNLP provides convenient utilities for data preprocessing and for training and testing models.

DataSet & Instance
2 changes: 2 additions & 0 deletions docs/source/tutorials/fastnlp_1_minute_tutorial.rst
@@ -2,6 +2,8 @@
FastNLP 1-Minute Tutorial
=========================

The original tutorial is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_1min_tutorial.ipynb

step 1
------

5 changes: 5 additions & 0 deletions docs/source/tutorials/fastnlp_advanced_tutorial.rst
@@ -0,0 +1,5 @@
fastNLP Advanced Tutorial
=========================

The original tutorial is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb

5 changes: 5 additions & 0 deletions docs/source/tutorials/fastnlp_developer_guide.rst
@@ -0,0 +1,5 @@
fastNLP Developer Guide
=======================

The original document is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/tutorial_for_developer.md

1 change: 1 addition & 0 deletions docs/source/user/installation.rst
@@ -5,6 +5,7 @@ Installation
.. contents::
:local:

Make sure your environment meets the requirements listed in https://github.com/fastnlp/fastNLP/blob/master/requirements.txt .

Run the following commands to install the fastNLP package:

2 changes: 2 additions & 0 deletions docs/source/user/quickstart.rst
@@ -6,4 +6,6 @@ Quickstart

../tutorials/fastnlp_1_minute_tutorial
../tutorials/fastnlp_10tmin_tutorial
../tutorials/fastnlp_advanced_tutorial
../tutorials/fastnlp_developer_guide

21 changes: 11 additions & 10 deletions fastNLP/api/README.md
@@ -18,26 +18,27 @@ print(cws.predict(text))
# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?']
```

### Chinese word segmentation + POS tagging
### POS tagging
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
# Input: pre-segmented word sequences
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
        '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
       ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import POS
pos = POS(device='cpu')
print(pos.predict(text))
# [['编者/NN', '按/P', ':/PU', '7月/NT', '12日/NR', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一/OD', '款高/NN', '科技/NN', '隐形/NN', '无/VE', '人机/NN', '雷电/NN', '之/DEG', '神/NN', '。/PU'], ['这/DT', '款/NN', '飞行/VV', '从/P', '外型/NN', '上/LC', '来/MSP', '看/VV', '酷似/VV', '电影/NN', '中/LC', '的/DEG', '太空/NN', '飞行器/NN', ',/PU', '据/P', '英国/NR', '方面/NN', '介绍/VV', ',/PU', '可以/VV', '实现/VV', '洲际/NN', '远程/NN', '打击/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无/VE', '人机/NN', '到底/AD', '有/VE', '多/CD', '厉害/NN', '?/PU']]
# [['编者/NN', '按:/NN', '7月/NT', '12日/NT', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '?/PU']]
```

### Chinese word segmentation + POS tagging + syntactic parsing
### Syntactic parsing
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
        '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
       ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import Parser
parser = Parser(device='cpu')
print(parser.predict(text))
# [['12/nsubj', '12/prep', '2/punct', '5/nn', '2/pobj', '12/punct', '11/nn', '11/nn', '11/nn', '11/nn', '2/pobj', '0/root', '12/asp', '15/det', '16/nsubj', '21/rcmod', '16/cpm', '21/nummod', '21/nn', '21/nn', '22/top', '12/ccomp', '24/nn', '26/assmod', '24/assm', '22/dobj', '12/punct'], ['2/det', '8/xsubj', '8/mmod', '8/prep', '6/lobj', '4/plmod', '8/prtmod', '0/root', '8/ccomp', '11/lobj', '14/assmod', '11/assm', '14/nn', '9/dobj', '8/punct', '22/prep', '18/nn', '19/nsubj', '16/pccomp', '22/punct', '22/mmod', '8/dep', '25/nn', '25/nn', '22/dobj', '8/punct'], ['4/advmod', '3/det', '4/nsubj', '0/root', '4/dobj', '7/advmod', '4/conj', '9/nummod', '7/dobj', '4/punct']]
# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']]
```

See `examples.py` for complete examples.
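Taken together, the three APIs chain naturally: `CWS.predict` returns space-separated strings, which, once split, are exactly the pre-segmented input that `POS` and `Parser` now expect. A minimal sketch of that chaining (model downloads assumed to succeed; outputs shown are illustrative):

```python
from fastNLP.api import CWS, POS, Parser

raw = ['那么这款无人机到底有多厉害?']

cws = CWS(device='cpu')
segmented = [sent.split() for sent in cws.predict(raw)]  # list of word lists

pos = POS(device='cpu')
print(pos.predict(segmented))     # e.g. [['那么/AD', '这/DT', ...]]

parser = Parser(device='cpu')
print(parser.predict(segmented))  # e.g. [['4/advmod', '3/det', ...]]
```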
129 changes: 86 additions & 43 deletions fastNLP/api/api.py
@@ -9,19 +9,17 @@

from fastNLP.api.utils import load_url
from fastNLP.api.processor import ModelProcessor
from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.io.dataset_loader import ConllCWSReader, ConllxDataLoader
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
from fastNLP.api.processor import IndexerProcessor

# TODO add pretrain urls
model_urls = {
"cws": "http://123.206.98.91:8888/download/cws_crf_1_11-457fc899.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190108-f3c60ee5.pkl",
"parser": "http://123.206.98.91:8888/download/biaffine_parser-3a2f052c.pkl"
"cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl",
"parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl"
}


@@ -31,6 +29,16 @@ def __init__(self):
self._dict = None

def predict(self, *args, **kwargs):
"""Do prediction for the given input.
"""
raise NotImplementedError

def test(self, file_path):
"""Test performance over the given data set.
:param str file_path: path to the test data set
:return: a dictionary of metric values
"""
raise NotImplementedError

def load(self, path, device):
@@ -69,12 +77,11 @@ def predict(self, content):
if not hasattr(self, "pipeline"):
raise ValueError("You have to load model first.")

sentence_list = []
sentence_list = content
# 1. Check the type of sentence
if isinstance(content, str):
sentence_list.append(content)
elif isinstance(content, list):
sentence_list = content
for sentence in sentence_list:
if not all((type(obj) == str for obj in sentence)):
raise ValueError("Input must be list of list of string.")

# 2. Build the dataset
dataset = DataSet()
@@ -83,36 +90,28 @@
# 3. 使用pipeline
self.pipeline(dataset)

def decode_tags(ins):
pred_tags = ins["tag"]
chars = ins["words"]
words = []
start_idx = 0
for idx, tag in enumerate(pred_tags):
if tag[0] == "S":
words.append(chars[start_idx:idx + 1] + "/" + tag[2:])
start_idx = idx + 1
elif tag[0] == "E":
words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
start_idx = idx + 1
return words

dataset.apply(decode_tags, new_field_name="tag_output")

output = dataset.field_arrays["tag_output"].content
def merge_tag(words_list, tags_list):
rtn = []
for words, tags in zip(words_list, tags_list):
rtn.append([w + "/" + t for w, t in zip(words, tags)])
return rtn

output = dataset.field_arrays["tag"].content
if isinstance(content, str):
return output[0]
elif isinstance(content, list):
return output
return merge_tag(content, output)

def test(self, file_path):
test_data = ZhConllPOSReader().load(file_path)
test_data = ConllxDataLoader().load(file_path)

tag_vocab = self._dict["tag_vocab"]
pipeline = self._dict["pipeline"]
save_dict = self._dict
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline

test_data.rename_field("pos_tags", "tag")
pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
@@ -226,7 +225,7 @@ def test(self, filepath):
rec = eval_res['BMESF1PreRecMetric']['rec']
# print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

return f1, pre, rec
return {"F1": f1, "precision": pre, "recall": rec}


class Parser(API):
@@ -251,6 +250,7 @@ def predict(self, content):
dataset.add_field('wp', pos_out)
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']], new_field_name='words')
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']], new_field_name='pos')
dataset.rename_field("words", "raw_words")

# 3. Run the pipeline
self.pipeline(dataset)
@@ -260,39 +260,82 @@ def predict(self, content):
# output like: [['2/top', '0/root', '4/nn', '2/dep']]
return dataset.field_arrays['output'].content

def test(self, filepath):
data = ConllxDataLoader().load(filepath)
ds = DataSet()
for ins1, ins2 in zip(add_seg_tag(data), data):
ds.append(Instance(words=ins1[0], tag=ins1[1],
gold_words=ins2[0], gold_pos=ins2[1],
gold_heads=ins2[2], gold_head_tags=ins2[3]))
def load_test_file(self, path):
def get_one(sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]

datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

data = [get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))
return data_list
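For reference, `load_test_file` reads CoNLL-X style rows: tab-separated columns per token, of which it keeps column 1 (the word), column 3 (the POS tag), column 6 (the head index), and column 7 (the dependency relation), discarding samples whose relation column is `_`. A tiny sketch of that column mapping on a single made-up row (hypothetical data, for illustration only):

```python
row = "2\t英国\t_\tNR\tNR\t_\t3\tnn\t_\t_".split('\t')  # one hypothetical CoNLL-X token row
word, pos, head, rel = row[1], row[3], int(row[6]), row[7]
print(word, pos, head, rel)  # 英国 NR 3 nn
```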

def test(self, filepath):
data = self.load_test_file(filepath)

def convert(data):
BOS = '<BOS>'
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + sample[2]
head_tags = [BOS] + sample[3]
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
tags=head_tags))
return dataset

ds = convert(data)
pp = self.pipeline
for p in pp:
if p.field_name == 'word_list':
p.field_name = 'gold_words'
elif p.field_name == 'pos_list':
p.field_name = 'gold_pos'
# ds.rename_field("words", "raw_words")
# ds.rename_field("tag", "pos")
pp(ds)
head_cor, label_cor, total = 0, 0, 0
for ins in ds:
head_gold = ins['gold_heads']
head_pred = ins['heads']
head_pred = ins['arc_pred']
length = len(head_gold)
total += length
for i in range(length):
head_cor += 1 if head_pred[i] == head_gold[i] else 0
uas = head_cor / total
print('uas:{:.2f}'.format(uas))
# print('uas:{:.2f}'.format(uas))

for p in pp:
if p.field_name == 'gold_words':
p.field_name = 'word_list'
elif p.field_name == 'gold_pos':
p.field_name = 'pos_list'

return uas
return {"USA": round(uas, 5)}


class Analyzer:
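The UAS that `Parser.test` computes above is simply the fraction of tokens whose predicted head matches the gold head. A self-contained sketch of that calculation, on made-up head arrays:

```python
def uas(gold_heads, pred_heads):
    """Unlabeled attachment score: the share of tokens assigned the correct head."""
    correct = sum(1 for g, p in zip(gold_heads, pred_heads) if g == p)
    return correct / len(gold_heads)

# Hypothetical example: 3 of the 4 predicted heads match the gold heads.
print(round(uas([2, 0, 2, 3], [2, 0, 2, 2]), 5))  # 0.75
```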
27 changes: 27 additions & 0 deletions fastNLP/api/examples.py
@@ -15,15 +15,42 @@ def chinese_word_segmentation():
print(cws.predict(text))


def chinese_word_segmentation_test():
cws = CWS(device='cpu')
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))


def pos_tagging():
# Input: pre-segmented word sequences
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
pos = POS(device='cpu')
print(pos.predict(text))


def pos_tagging_test():
pos = POS(device='cpu')
print(pos.test("../../test/data_for_tests/zh_sample.conllx"))


def syntactic_parsing():
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
parser = Parser(device='cpu')
print(parser.predict(text))


def syntactic_parsing_test():
parser = Parser(device='cpu')
print(parser.test("../../test/data_for_tests/zh_sample.conllx"))


if __name__ == "__main__":
# chinese_word_segmentation()
# chinese_word_segmentation_test()
# pos_tagging()
# pos_tagging_test()
syntactic_parsing()
# syntactic_parsing_test()
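Each `*_test` helper above evaluates its model on the same bundled sample file and prints a metric dict. A small driver sketch, assuming the pretrained models download successfully and the script runs from `fastNLP/api/` so the relative path resolves:

```python
# Hypothetical driver: run all three evaluations in sequence.
for run_test in (chinese_word_segmentation_test, pos_tagging_test, syntactic_parsing_test):
    run_test()  # prints e.g. {"F1": ..., "precision": ..., "recall": ...} or {"UAS": ...}
```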
