forked from youzan/YZSpamFilter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
51 lines (42 loc) · 1.13 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# -*- coding: utf-8 -*-
import re
import jieba
# Matches one or more consecutive characters in the range U+4E00..U+9FA5
# (CJK unified ideographs) and captures the whole run as group 1.
# A plain ``u""`` literal is used instead of ``ur""``: the ``\u`` escapes
# produce the same pattern on Python 2, and ``ur`` is a syntax error on
# Python 3.
chinese_character_pattern = re.compile(u"([\u4e00-\u9fa5]+)")
# abbr.
CCP = chinese_character_pattern
def extract_chinese(buf):
    """
    Extract every run of consecutive Chinese characters from ``buf``.

    Byte strings are decoded as UTF-8 first; text that is already unicode
    is scanned as-is.  Anything the module-level pattern ``CCP`` does not
    match is discarded.

    :param buf: text to scan, either a UTF-8 encoded byte string or unicode.
    :return: list of the matched unicode segments in order of appearance
        (empty when nothing matches).
    :raises UnicodeDecodeError: if ``buf`` is a byte string that is not
        valid UTF-8.
    """
    text = buf
    # ``bytes`` is the same type as ``str`` on Python 2 and the real byte
    # type on Python 3, so this decodes exactly the inputs that need it.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # The pattern has a single capture group, so findall() returns the
    # non-overlapping matched runs from left to right.
    return CCP.findall(text)
def ClearAndSegment(mes):
    """
    Keep only the Chinese characters of ``mes`` and segment them into words.

    The Chinese runs returned by ``extract_chinese`` are concatenated and
    passed to ``jieba.cut`` with ``cut_all=False``; duplicate tokens are
    then removed.

    :param mes: message text (UTF-8 byte string or unicode), or None.
    :return: list of distinct tokens in order of first appearance; an
        empty list when ``mes`` is None.
    """
    if mes is None:
        # Return the same type as the normal path so callers can always
        # treat the result as a list.
        return []
    chinese_only = ''.join(extract_chinese(mes))
    # De-duplicate while keeping first-seen order so the output is
    # reproducible (a bare set() would yield an arbitrary order).
    seen = set()
    tokens = []
    for token in jieba.cut(chinese_only, cut_all=False):
        if token not in seen:
            seen.add(token)
            tokens.append(token)
    return tokens
def u(x):
    """
    Return ``x`` as unicode text.

    Values that are already unicode are returned unchanged; anything else
    is decoded with its ``decode('utf-8')`` method.

    :param x: unicode text or a UTF-8 encoded byte string.
    :return: the unicode form of ``x``.
    :raises UnicodeDecodeError: if ``x`` is a byte string that is not
        valid UTF-8.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, which
    # avoids referencing the ``unicode`` builtin that Python 3 lacks.
    if isinstance(x, type(u'')):
        return x
    return x.decode('utf-8')
if __name__ == '__main__':
    # Manual smoke test: segment a sample string that mixes Chinese text
    # with ASCII letters and punctuation, then show the resulting tokens.
    liststr = ClearAndSegment(u"赚钱test宝妈tes日赚学生兼职*.@打字员")
    # The parenthesised single-argument form is valid both as a Python 2
    # print statement and as a Python 3 print() call.
    print(liststr)