myLTP.py
# -*- coding: UTF-8 -*-
import pandas as pd
import os
import jieba
import triple_ie
from pyltp import SentenceSplitter, Segmentor, Postagger, NamedEntityRecognizer, Parser, SementicRoleLabeller

LTP_DATA_DIR = r'D:\LTP\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # word segmentation model, named `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # part-of-speech tagging model, named `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model, named `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, named `parser.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')           # semantic role labelling model directory `srl`; note this path is a directory, not a file
# Build the stopword list (the file is assumed to be UTF-8 text with one stopword per line)
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords
# Sentence splitting, i.e. splitting a piece of text into individual sentences
def sentence_splitter(sentence):
    sents = SentenceSplitter.split(sentence)  # split into sentences
    # print('\n'.join(sents))
    return sents
# Word segmentation
def segmentor(sentence):
    segmentor = Segmentor()              # initialise the instance
    segmentor.load(cws_model_path)       # load the model
    # segmentor.load_with_lexicon(cws_model_path, 'D:\\pyprojects\\LTP\\ltp_data\\dict.txt')  # load the model with a user-defined lexicon for customised segmentation
    words = segmentor.segment(sentence)  # segment
    # the default output can be printed like this:
    # print('/'.join(words))
    # or converted to a list:
    words_list = list(words)
    segmentor.release()                  # release the model
    return words_list
# Part-of-speech tagging
def posttagger(words):
    postagger = Postagger()                   # initialise the instance
    postagger.load(pos_model_path)            # load the model
    postags = list(postagger.postag(words))   # POS tagging; copy to a list before releasing the model
    # for word, tag in zip(words, postags):
    #     print(word + '/' + tag)
    postagger.release()                       # release the model
    return postags
# Named entity recognition
def e_recognize(words, postags):
    recognizer = NamedEntityRecognizer()                  # initialise the instance
    recognizer.load(ner_model_path)                       # load the model
    netags = list(recognizer.recognize(words, postags))   # NER; copy to a list before releasing the model
    # for word, ntag in zip(words, netags):
    #     print(word + '/' + ntag)
    recognizer.release()                                  # release the model
    return netags
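# A minimal usage sketch chaining the helpers above (kept as a comment so that importing
# this module has no side effects; the sample sentence is an illustrative placeholder):
#
#   words = segmentor('苹果树喜光,适宜在排水良好的土壤中种植。')
#   postags = posttagger(words)
#   netags = e_recognize(words, postags)
#   for w, p, n in zip(words, postags, netags):
#       print(w, p, n)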
# Use LTP to perform sentence splitting, word segmentation, POS tagging, named entity
# recognition and triple extraction on the input .csv file
def myltp(input_path, output_path):
    records = pd.read_csv(input_path)
    titles = records['title']
    openTypeLists = records['openTypeList']
    details = records['detail']
    # for i in range(20):
    #     # sentence splitting
    #     sents = SentenceSplitter.split(details[i])
    #     # print(sents)
    #     sub_sents = list(sents)
    #
    #     # initialise instances
    #     segmentor = Segmentor()
    #     segmentor.load_with_lexicon(cws_model_path, 'lexicon')
    #
    #     postagger = Postagger()
    #     postagger.load(pos_model_path)
    #
    #     recognizer = NamedEntityRecognizer()
    #     recognizer.load(ner_model_path)
    #
    #     for i in range(len(sub_sents)):
    #         # print('sub_sents[i]=' + sub_sents[i])
    #
    #         # word segmentation
    #         words = segmentor.segment(sub_sents[i])
    #         # print(type(words))
    #         # print('\t'.join(words))
    #         old_sub_words = list(words)
    #         print(old_sub_words)
    #
    #         # forceSegmentor = ForceSegmentor()
    #         # forceSegmentor.load('D:\\Python\\MyTest\\KG_Agriculture\\lexicon.txt')
    #         # words = forceSegmentor.merge(sub_sents[i], old_sub_words)  # result after forced segmentation
    #         # new_sub_words = list(words)
    #         # print(new_sub_words)
    #
    #         # POS tagging
    #         postags = postagger.postag(old_sub_words)
    #         postags_result = list(postags)
    #
    #         print(old_sub_words)
    #         print(postags_result)
    #
    #         # named entity recognition
    #         netags = recognizer.recognize(old_sub_words, postags_result)
    #         netags_result = list(netags)
    #         print(netags_result)
    #
    #     segmentor.release()
    #     postagger.release()
    #     recognizer.release()
    stopwords = stopwordslist('D:\\Python\\MyTest\\KG_Agriculture\\stopwords\\ltp_stopwords.txt')
    jieba.load_userdict('D:\\Python\\MyTest\\KG_Agriculture\\Plants\\lexicon.txt')
    file1 = open(output_path, 'w', encoding='UTF-8')
    triple_path = 'D:\\Python\\MyTest\\KG_Agriculture\\Plants\\triple_results_plants2.txt'
    for i in range(20):
        sentences = SentenceSplitter.split(details[i])
        file2 = open(triple_path, 'a', encoding='UTF-8')
        file2.write(titles[i] + ': \t')
        file1.write(titles[i] + ': \t')
        file2.close()
        for sentence in sentences:
            final = []
            segs = jieba.cut(sentence, cut_all=False)
            for seg in segs:
                # print(seg + '/ ')
                # if seg not in stopwords:
                final.append(seg)
            # print(final)
            # words = segmentor(details[i])
            # print(words)
            postags = posttagger(final)
            netags = e_recognize(final, postags)
            IE = triple_ie.TripleIE(sentence, triple_path, final, postags, netags)
            IE.run()
            tags = []
            tag_types = []  # distinct entity tags seen in this sentence (renamed from `dict`, which shadowed the builtin)
            # file.write(str(i) + '\n')
            # tmp = ""
            for word, ntag in zip(final, netags):
                if ntag != 'O':  # filter out non-named entities ('O' marks tokens outside any entity)
                    # print(word, ntag)
                    tags.append(ntag)
                    if ntag not in tag_types:
                        tag_types.append(ntag)
                    file1.write(word + ':' + ntag + ' ')
        file2 = open(triple_path, 'a', encoding='UTF-8')
        file1.write('\n')
        file2.write('\n')
        file2.close()
        print(i)
    file1.close()
    print('end...')
def main():
    input_path = 'D:\\Python\\MyTest\\KG_Agriculture\\Plants\\my_datas_plants.csv'
    output_path = 'D:\\Python\\MyTest\\KG_Agriculture\\Plants\\ner_results_plants2.txt'
    myltp(input_path=input_path, output_path=output_path)

if __name__ == '__main__':
    main()
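# Input/output sketch (inferred from the code above; the row below is an illustrative
# placeholder, not real data):
#
#   my_datas_plants.csv columns: title, openTypeList, detail
#     e.g.  <plant name>, <category list>, <free-text description split into sentences>
#
#   ner_results_plants2.txt: one line per record, the title followed by word:netag pairs
#   triple_results_plants2.txt: triples written by triple_ie.TripleIE for each sentence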