-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathextract-sougou-dict.py
84 lines (70 loc) · 2.92 KB
/
extract-sougou-dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# @author: [email protected]
# @date: 2014-09-03
# @depends: python2.7+
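'''
Extract Chinese words and phrases from Sogou (搜狗) dictionary files.
The implementation is based on the code in [this article](http://blog.csdn.net/zhangzhenhu/article/details/7014271).
Sogou dictionary files are little-endian and use UTF-16 encoding; the word list starts at offset 0x2628 and runs to the end of the file.
'''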
# Interpreter compatibility: the script runs on Python 2.7+ and Python 3.
import sys

if sys.version_info[0] == 2:
    # Anything older than 2.7 is rejected up front.
    if sys.version_info[1] < 7:
        print("Require python version 2.7+")
        sys.exit(1)
    # On Python 2, use the lazy xrange wherever the code says range.
    range = xrange
import argparse
import struct
import codecs
gWordsOffset = 0x2628 # byte offset in the file at which the word list starts
def extract_sougou_words(data):
    '''Extract the words from the word-list section of a Sogou dictionary.

    ``data`` is the raw byte string of the word list. It is a sequence of
    homophone groups; every integer is an unsigned little-endian 16-bit
    value and every length is a byte count:

        <number of homophones (2 bytes)>
        <pinyin table length (2 bytes)>
        <pinyin table>
        then, once per homophone:
            <word length (2 bytes)><word (UTF-16, little-endian)>
            <extension info length (2 bytes)><extension info>

    The pinyin table and the extension info are skipped, only the words
    are decoded.

    Returns a list of the decoded words in file order (duplicates kept).

    Raises struct.error if the data ends in the middle of a header or a
    word, and UnicodeDecodeError if a word is not valid UTF-16.
    '''
    offset = 0
    dataLen = len(data)
    wordList = []
    while offset < dataLen:
        # Group header: homophone count, then the size of the pinyin table.
        numTongYinCi, pinYinTableLen = struct.unpack_from('<HH', data, offset)
        offset += 4 + pinYinTableLen
        for _ in range(numTongYinCi):
            wordLen = struct.unpack_from('<H', data, offset)[0]
            offset += 2
            # Unpacking with an explicit size keeps the struct.error on
            # truncated input that a plain slice would silently hide.
            rawWord = struct.unpack_from('<%ds' % wordLen, data, offset)[0]
            offset += wordLen
            # The format is little-endian by definition.  The generic
            # 'UTF-16' codec would use the host byte order and would treat a
            # leading FE FF / FF FE code unit as a byte-order mark.
            wordList.append(rawWord.decode('UTF-16-LE'))
            extInfoLen = struct.unpack_from('<H', data, offset)[0]
            offset += 2 + extInfoLen
    return wordList
def extract_sougou_dict_files(pathList):
    '''
    Extract the words from several Sogou dictionary files and merge them.

    Each path in ``pathList`` is read in binary mode, the part of the file
    starting at ``gWordsOffset`` is parsed, and the words of all files are
    collected into one set, so a word that occurs more than once is kept
    only once.

    Returns the set of extracted words.
    '''
    mergedWords = set()
    for dictPath in pathList:
        with open(dictPath, 'rb') as dictFile:
            content = dictFile.read()
            # Everything before gWordsOffset is skipped; only the word list
            # is handed to the parser.
            wordSection = content[gWordsOffset:]
            mergedWords.update(extract_sougou_words(wordSection))
    return mergedWords
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, merge the words of all
    # input dictionaries and write them to the output file.
    argParser = argparse.ArgumentParser(description=u'从搜狗词库中提取中文词组,如果输入文件不止一个,重复的词组会被合并。')
    argParser.add_argument('dictfile', nargs='+', help=u'搜狗词库文件的路径,可以有多个')
    argParser.add_argument('-o', dest='output', required=True, help=u'输出文件的路径,默认情况下每个词组占一行')
    argParser.add_argument('-mmseg', dest='mmseg', action='store_true', help=u'按libmmseg字典文件的格式生成输出文件')
    args = argParser.parse_args()

    wordSet = extract_sougou_dict_files(args.dictfile)
    # A set iterates in arbitrary order, which can differ from run to run
    # when string hashing is randomized.  Sorting makes the output file
    # reproducible for the same input files.
    sortedWords = sorted(wordSet)
    with codecs.open(args.output, 'w', encoding='UTF-8') as f:
        if not args.mmseg:
            # Plain format: one word per line.
            f.write(u'\n'.join(sortedWords))
        else:
            # mmseg format: two lines per word, "<word>\t1" followed by "x:1".
            for word in sortedWords:
                f.write(u'{}\t1\nx:1\n'.format(word))
    print(u'成功从{}个搜狗词库中提取出{}个词组 => {}'.format(len(args.dictfile), len(wordSet), args.output))