-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathprocess.py
72 lines (59 loc) · 1.85 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project :NLP
@File :process.py
@Author :JackHCC
@Date :2022/6/12 14:11
@Desc :process raw data and cut sentences
'''
import os
import pandas as pd
import re
import jieba
import jieba.posseg as psg
# Paths are resolved relative to this file so the script works from any CWD.
ROOT_PATH = os.path.dirname(__file__)
CONFIG_PATH = os.path.join(ROOT_PATH, 'config')
DATA_PATH = os.path.join(ROOT_PATH, 'data')
JIEBA_USER_DICT = os.path.join(CONFIG_PATH, 'jieba_user_dict.txt')
STOP_WORDS = os.path.join(CONFIG_PATH, 'stop_words.txt')
POS_DICT = os.path.join(CONFIG_PATH, 'POS_dict.txt')
# Load the part-of-speech whitelist used to filter cut words.
with open(POS_DICT, 'r', encoding='utf-8') as f:
    FLAG_LIST = [line.strip() for line in f]
print(FLAG_LIST)
def chinese_word_cut(article):
    """Cut *article* into words with jieba and return them space-joined.

    A word is kept only if it is pure Chinese (non-CJK characters are
    stripped), at least two characters long, not a stop word, and its
    part-of-speech flag is listed in FLAG_LIST.

    :param article: raw text string to segment.
    :return: filtered words joined by single spaces.
    """
    jieba.load_userdict(JIEBA_USER_DICT)
    jieba.initialize()
    # Load stop words into a set for O(1) membership tests; a missing or
    # unreadable file degrades to "no stop-word filtering" as before.
    try:
        with open(STOP_WORDS, encoding='utf-8') as f:
            stop_set = {re.sub(u'\n|\\r', '', line) for line in f}
    except OSError:
        stop_set = set()
        print("Error in stop_words file")
    word_list = []
    # jieba segmentation with POS tags
    for seg_word in psg.cut(article):
        # Keep only CJK unified ideographs.
        word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)
        # Bug fix: the length check used to live inside the stop-word loop,
        # so with an empty stop list short/empty words slipped through.
        if len(word) < 2 or word in stop_set:
            continue
        if seg_word.flag in FLAG_LIST:
            word_list.append(word)
    return ' '.join(word_list)
def data_word_cut(data_path=os.path.join(DATA_PATH, 'data.xlsx')):
    """Load the Excel corpus and append a segmented-text column.

    :param data_path: path to the Excel file; defaults to data/data.xlsx.
    :return: DataFrame with a new "content_cut" column holding the
             space-joined, filtered words for each row's ``content``.
    """
    frame = pd.read_excel(data_path)
    frame["content_cut"] = frame.content.apply(chinese_word_cut)
    print("Data process get columns info: ", frame.columns)
    return frame
if __name__ == "__main__":
    # Script entry point: segment the default corpus (data/data.xlsx).
    data_word_cut()