-
Notifications
You must be signed in to change notification settings - Fork 0
/
P3_3.py
122 lines (110 loc) · 5.95 KB
/
P3_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import P3_4
import P3_1
def calc_score(std_seg_path, my_seg_path, std_seg_encoding, my_seg_encoding, k=1):
std_seg_num, model_seg_num, right_num = 0, 0, 0
standard_lines = readseg_tList(std_seg_path, std_seg_encoding)
my_lines = readseg_tList(my_seg_path, my_seg_encoding)
for index, this_line in enumerate(standard_lines):
line_words = this_line.split('/ ')[:-1] # 取出标准的分词文本中每行的词语
my_line_words = my_lines[index].split('/ ')[:-1] # 取对比文本中每行的词语
std_this_num = len(line_words) # 该行标准切分词数
model_this_num = len(my_line_words) # 该行模型切分词数
std_seg_num += std_this_num # 总标准切分数中加入该行的标准切分数
model_seg_num += model_this_num # 总模型切分数中加入该行的模型切分数
i = j = 0
std_char_num, model_char_num = len(line_words[0]), len(my_line_words[0])
while i < std_this_num and j < model_this_num:
if std_char_num == model_char_num:
right_num += 1
if i == std_this_num - 1 or j == model_this_num - 1:
break
i += 1
j += 1
std_char_num += len(line_words[i])
model_char_num += len(my_line_words[j])
else:
while True:
if std_char_num < model_char_num:
i += 1
std_char_num += len(line_words[i])
elif std_char_num > model_char_num:
j += 1
model_char_num += len(my_line_words[j])
else:
if i < std_this_num - 1 and j < model_this_num -1:
std_char_num += len(line_words[i + 1])
model_char_num += len(my_line_words[j + 1])
i += 1
j += 1
break
precision = right_num / float(model_seg_num)
recall = right_num / float(std_seg_num)
f_value = (k * k + 1) * precision * recall / (k * k * precision + recall)
return precision, recall, f_value
def readseg_tList(seg_path, encoding='utf-8'):
"""
从给定路径中用给定编码,将所有非空行处理后读入返回的list中
:param seg_path: 路径
:param encoding: 编码方式
:return: 非空行处理后形成的list
"""
result_list = [] # 保存结果
with open(seg_path, 'r', encoding=encoding) as file:
for line in file:
if line == '\n':
continue
handled_line = '' # 保存处理过后的一行
for word in line.split():
handled_line += word[1 if word[0] == '[' else 0:word.index('/')] + '/ '
result_list.append(handled_line)
return result_list
def write_ave_score(writo_path, std_encoding='utf-8', model_encoding='utf-8'):
total_pre, total_recall, total_F = 0, 0, 0
for i in range(1, 11):
train_path = 'TrainFiles/train_' + str(i) + '.txt' # 生成词典的训练样本
std_path = 'TestFiles/std_' + str(i) + '.txt' # 标准切分文件
toseg_path = 'TestFiles/test_' + str(i) + '.txt' # 待切分文件
fmm_file = 'fmm_result_' + str(i) + '.txt' # 将切分结果写入的文件
dic_path = 'dic_' + str(i) + '.txt' # 将词典写入的路径
P3_1.gene_dic(train_path, dic_path, encoding='utf-8') # 利用该训练集生成词典
trie_root = P3_4.TrieDic.gene_fmm_trie(dic_path, encoding='utf-8') # 通过词典生成对应trie树
P3_4.optimized_fmm(toseg_path, fmm_file, trie_root, encoding='utf-8') # 获得切分结果
precision, recall, F = calc_score(std_path, fmm_file, std_encoding, model_encoding) # FMM
total_pre += precision
total_recall += recall
total_F += F
score = ''
score += 'FMM\n'
# 求均值(*100 /10)
score += ' 准确率: ' + str(total_pre * 10) + '%\n' + ' 召回率: ' + str(total_recall * 10) + '%\n' + ' F值:' + str(
total_F * 10) + '%\n\n'
total_pre, total_recall, total_F = 0, 0, 0
for i in range(1, 11):
train_path = 'TrainFiles/train_' + str(i) + '.txt'
std_path = 'TestFiles/std_' + str(i) + '.txt' # 标准切分文件
toseg_path = 'TestFiles/test_' + str(i) + '.txt'
bmm_file = 'bmm_result_' + str(i) + '.txt' # 将切分结果写入的文件
dic_path = 'dic_' + str(i) + '.txt'
P3_1.gene_dic(train_path, dic_path, encoding='utf-8') # 利用该训练集生成词典
trie_root = P3_4.TrieDic.gene_bmm_trie(dic_path, encoding='utf-8') # 通过词典生成对应trie树
P3_4.optimized_bmm(toseg_path, bmm_file, trie_root, encoding='utf-8') # 获得切分结果
precision, recall, F = calc_score(std_path, bmm_file, std_encoding, model_encoding) # FMM
total_pre += precision
total_recall += recall
total_F += F
score += 'BMM\n'
score += ' 准确率: ' + str(total_pre * 10) + '%\n' + ' 召回率: ' + str(total_recall * 10) + '%\n' + ' F值:' + str(
total_F * 10) + '%\n\n'
open(writo_path, 'w', encoding='UTF-8').write(score)
def write_score(std_path = '199801_seg&pos.txt',writo_path='score.txt', std_encoding='gbk', model_encoding='utf-8'):
precision, recall, F = calc_score(std_path, 'seg_FMM.txt', std_encoding, model_encoding) # FMM
score = ''
score += 'FMM\n'
# 求均值(*100 /10)
score += ' 准确率: '+ str(precision*100)+'%\n'+' 召回率: '+str(recall*100)+'%\n'+' F值:'+str(F*100)+'%\n\n'
precision, recall, F = calc_score(std_path, 'seg_BMM.txt', std_encoding, model_encoding) # FMM
score += 'BMM\n'
score += ' 准确率: '+ str(precision*100)+'%\n'+' 召回率: '+str(recall*100)+'%\n'+' F值:'+str(F*100)+'%\n\n'
open(writo_path, 'w', encoding='UTF-8').write(score)
#客户端
# write_score() # 如检查请取消该行注释