Commit b0794ed

liushuchunsloth2012 authored and committed on Jun 22, 2018
add the ml code
1 parent 42025ba commit b0794ed

13 files changed: +18,730 -0 lines
 

‎chapter-9/README.md

+3
@@ -0,0 +1,3 @@
# Machine learning related
* Classification
* Clustering
+225
@@ -0,0 +1,225 @@
"""
author: liushuchun
"""
import numpy as np
# train_test_split is provided by sklearn.model_selection
from sklearn.model_selection import train_test_split


def get_data():
    '''
    Load the data.
    :return: the text corpus and the corresponding labels
    '''
    with open("data/ham_data.txt", encoding="utf8") as ham_f, open("data/spam_data.txt", encoding="utf8") as spam_f:
        ham_data = ham_f.readlines()
        spam_data = spam_f.readlines()

        ham_label = np.ones(len(ham_data)).tolist()
        spam_label = np.zeros(len(spam_data)).tolist()

        corpus = ham_data + spam_data

        labels = ham_label + spam_label

    return corpus, labels


def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    '''
    :param corpus: text data
    :param labels: label data
    :param test_data_proportion: fraction of the data held out as the test set
    :return: training data, test data, training labels, test labels
    '''
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion, random_state=42)
    return train_X, test_X, train_Y, test_Y


def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)

    return filtered_corpus, filtered_labels


from sklearn import metrics


def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', np.round(
        metrics.accuracy_score(true_labels,
                               predicted_labels),
        2))
    print('Precision:', np.round(
        metrics.precision_score(true_labels,
                                predicted_labels,
                                average='weighted'),
        2))
    print('Recall:', np.round(
        metrics.recall_score(true_labels,
                             predicted_labels,
                             average='weighted'),
        2))
    print('F1 score:', np.round(
        metrics.f1_score(true_labels,
                         predicted_labels,
                         average='weighted'),
        2))


def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    # build the model
    classifier.fit(train_features, train_labels)
    # predict using the model
    predictions = classifier.predict(test_features)
    # evaluate model prediction performance
    get_metrics(true_labels=test_labels,
                predicted_labels=predictions)
    return predictions


def main():
    corpus, labels = get_data()  # load the dataset

    print("Total number of documents:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    print('One sample document:', corpus[10])
    print('Its label:', labels[10])
    label_name_map = ["spam mail", "normal mail"]
    print('Actual type:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])

    # split the data
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                            labels,
                                                                            test_data_proportion=0.3)

    from normalization import normalize_corpus

    # normalize the corpus
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tf-idf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text)
                       for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text)
                      for text in norm_test_corpus]
    # build a word2vec model (gensim 3.x API; gensim 4.x renames size to vector_size)
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    # max_iter is the current name of the older n_iter parameter
    svm = SGDClassifier(loss='hinge', max_iter=100)
    lr = LogisticRegression()

    # multinomial naive Bayes on bag-of-words features
    print("Naive Bayes classifier on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)

    # logistic regression on bag-of-words features
    print("Logistic regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(classifier=lr,
                                                      train_features=bow_train_features,
                                                      train_labels=train_labels,
                                                      test_features=bow_test_features,
                                                      test_labels=test_labels)

    # support vector machine on bag-of-words features
    print("Support vector machine on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)

    # multinomial naive Bayes on tf-idf features
    print("Naive Bayes classifier on tf-idf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)

    # logistic regression on tf-idf features
    print("Logistic regression on tf-idf features")
    lr_tfidf_predictions = train_predict_evaluate_model(classifier=lr,
                                                        train_features=tfidf_train_features,
                                                        train_labels=train_labels,
                                                        test_features=tfidf_test_features,
                                                        test_labels=test_labels)

    # support vector machine on tf-idf features
    print("Support vector machine on tf-idf features")
    svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)

    import re

    # show a few spam messages that were classified correctly
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Mail type:', label_name_map[int(label)])
            print('Predicted mail type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break

    # show a few normal messages that were misclassified as spam
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Mail type:', label_name_map[int(label)])
            print('Predicted mail type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break


if __name__ == "__main__":
    main()
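Note: main() trains a Word2Vec model (`model`) on the tokenized training corpus but never feeds it to any of the classifiers. Below is a minimal sketch, not part of the commit, of how those word vectors could become fixed-length document features by averaging; the helper name is hypothetical and the gensim 3.x API used above (vectors in `model.wv`, dimensionality equal to `size`) is assumed.

def average_word_vectors(tokens, w2v_model, num_features=500):
    # mean of the vectors of all in-vocabulary tokens of one document
    feature_vec = np.zeros(num_features, dtype="float64")
    in_vocab = [t for t in tokens if t in w2v_model.wv.vocab]
    if in_vocab:
        feature_vec = np.mean([w2v_model.wv[t] for t in in_vocab], axis=0)
    return feature_vec

# w2v_train_features = np.array([average_word_vectors(doc, model) for doc in tokenized_train])
# w2v_test_features = np.array([average_word_vectors(doc, model) for doc in tokenized_test])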

‎chapter-9/classification/data/ham_data.txt

+5,000
Large diffs are not rendered by default.

‎chapter-9/classification/data/spam_data.txt

+5,001
Large diffs are not rendered by default.

‎chapter-9/classification/dict/stop_words.utf8

+1,534
Large diffs are not rendered by default.
@@ -0,0 +1,41 @@
"""
@author: liushuchun
"""

from sklearn.feature_extraction.text import CountVectorizer


def bow_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features


from sklearn.feature_extraction.text import TfidfTransformer


def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix


from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
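Illustrative usage of the two extractors above (the sample corpus is made up and not part of the commit): each function returns the fitted vectorizer together with the training feature matrix, so unseen documents can later be transformed with the same learned vocabulary.

corpus = ["the cat sat on the mat", "the dog ate my homework"]

bow_vectorizer, bow_features = bow_extractor(corpus)
print(bow_vectorizer.vocabulary_)       # term -> column index mapping learned from the corpus
print(bow_features.toarray())           # raw term counts per document

tfidf_vectorizer, tfidf_features = tfidf_extractor(corpus)
print(tfidf_features.toarray())         # l2-normalized tf-idf weights for the same documents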
+46
@@ -0,0 +1,46 @@
"""
@author: liushuchun
"""
import re
import string
import jieba

# load the stopword list (strip trailing newlines so membership tests work)
with open("dict/stop_words.utf8", encoding="utf8") as f:
    stopword_list = [word.strip() for word in f.readlines()]


def tokenize_text(text):
    tokens = jieba.cut(text)
    tokens = [token.strip() for token in tokens]
    return tokens


def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ''.join(filtered_tokens)
    return filtered_text


def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            # return token lists instead of normalized strings
            text = tokenize_text(text)
        normalized_corpus.append(text)

    return normalized_corpus
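A small illustration, not part of the commit, of what normalize_corpus does to one document; it assumes the script is run from the chapter-9/classification directory so dict/stop_words.utf8 is found. jieba segments the text, ASCII punctuation is stripped, and stopwords are dropped.

sample = ["今天的天气真好, 我们一起去公园散步吧."]
print(normalize_corpus(sample))   # exact output depends on the stopword list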

‎chapter-9/classification/plot.py

+7
@@ -0,0 +1,7 @@
import matplotlib.pyplot as plt
import numpy as np

# plot a steep sigmoid curve on [-1, 1]
x = np.linspace(-1, 1, 50)
y = 1.0 / (1.0 + np.exp(-5 * x))
plt.figure()
plt.plot(x, y)
plt.show()

‎chapter-9/cluster/cluster.py

+265
@@ -0,0 +1,265 @@
"""
@author: liushuchun
"""
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    feature_type = feature_type.lower().strip()

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        # pass the requested parameters on to the vectorizer
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix


book_data = pd.read_csv('data/data.csv')  # read the file

print(book_data.head())

book_titles = book_data['title'].tolist()
book_content = book_data['content'].tolist()

print('Title:', book_titles[0])
print('Content:', book_content[0][:10])

from normalization import normalize_corpus

# normalize corpus
norm_book_content = normalize_corpus(book_content)


# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                  feature_type='tfidf',
                                                  min_df=0.2, max_df=0.90,
                                                  ngram_range=(1, 2))
# inspect the number of features
print(feature_matrix.shape)

# get the feature names
feature_names = vectorizer.get_feature_names()

# print some of the features
print(feature_names[:10])

from sklearn.cluster import KMeans


def k_means(feature_matrix, num_clusters=10):
    km = KMeans(n_clusters=num_clusters,
                max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters


num_clusters = 10
km_obj, clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)
book_data['Cluster'] = clusters

from collections import Counter

# count the number of books per cluster
c = Counter(clusters)
print(c.items())


def get_cluster_data(clustering_obj, book_data,
                     feature_names, num_clusters,
                     topn_features=10):
    cluster_details = {}
    # get the cluster centers
    ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]
    # get the key features of each cluster
    # get the books belonging to each cluster
    for cluster_num in range(num_clusters):
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster_num'] = cluster_num
        key_features = [feature_names[index]
                        for index
                        in ordered_centroids[cluster_num, :topn_features]]
        cluster_details[cluster_num]['key_features'] = key_features

        books = book_data[book_data['Cluster'] == cluster_num]['title'].values.tolist()
        cluster_details[cluster_num]['books'] = books

    return cluster_details


def print_cluster_data(cluster_data):
    # print cluster details
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details:'.format(cluster_num))
        print('-' * 20)
        print('Key features:', cluster_details['key_features'])
        print('Books in this cluster:')
        print(', '.join(cluster_details['books']))
        print('=' * 40)


import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import random
from matplotlib.font_manager import FontProperties


def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, book_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and books
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': book_data['Cluster'].values.tolist(),
                                       'title': book_data['title'].values.tolist()
                                       })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and book titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
        ax.set_aspect('auto')
        ax.tick_params(axis='x', which='both', bottom='off', top='off',
                       labelbottom='off')
        ax.tick_params(axis='y', which='both', left='off', top='off',
                       labelleft='off')
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the book titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)
    # show the plot
    plt.show()


cluster_data = get_cluster_data(clustering_obj=km_obj,
                                book_data=book_data,
                                feature_names=feature_names,
                                num_clusters=num_clusters,
                                topn_features=5)

print_cluster_data(cluster_data)

plot_clusters(num_clusters=num_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              book_data=book_data,
              plot_size=(16, 8))

from sklearn.cluster import AffinityPropagation


def affinity_propagation(feature_matrix):
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters


# get clusters using affinity propagation
ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
book_data['Cluster'] = clusters

# get the total number of books per cluster
c = Counter(clusters)
print(c.items())

# get total clusters
total_clusters = len(c)
print('Total Clusters:', total_clusters)

cluster_data = get_cluster_data(clustering_obj=ap_obj,
                                book_data=book_data,
                                feature_names=feature_names,
                                num_clusters=total_clusters,
                                topn_features=5)

print_cluster_data(cluster_data)

plot_clusters(num_clusters=num_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              book_data=book_data,
              plot_size=(16, 8))

from scipy.cluster.hierarchy import ward, dendrogram


def ward_hierarchical_clustering(feature_matrix):
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    linkage_matrix = ward(cosine_distance)
    return linkage_matrix


def plot_hierarchical_clusters(linkage_matrix, book_data, figure_size=(8, 12)):
    # set size
    fig, ax = plt.subplots(figsize=figure_size)
    book_titles = book_data['title'].values.tolist()
    # plot dendrogram
    ax = dendrogram(linkage_matrix, orientation="left", labels=book_titles)
    plt.tick_params(axis='x',
                    which='both',
                    bottom='off',
                    top='off',
                    labelbottom='off')
    plt.tight_layout()
    plt.savefig('ward_hierachical_clusters.png', dpi=200)


# build ward's linkage matrix
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
# plot the dendrogram
plot_hierarchical_clusters(linkage_matrix=linkage_matrix,
                           book_data=book_data,
                           figure_size=(8, 10))
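A short, hypothetical sketch (not part of the commit) of reusing the fitted tf-idf vectorizer and the k-means model above to assign a new, unseen book description to one of the ten clusters; the sample text is made up.

new_doc = normalize_corpus(["一本关于机器学习与数据挖掘的入门书"])   # "an introductory book on machine learning and data mining"
new_features = vectorizer.transform(new_doc)
print('Predicted cluster:', km_obj.predict(new_features)[0])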

‎chapter-9/cluster/data/data.csv

+4,926
Large diffs are not rendered by default.

‎chapter-9/cluster/dict/stop_words.utf8

+1,534
Large diffs are not rendered by default.

‎chapter-9/cluster/douban_spider.py

+106
@@ -0,0 +1,106 @@
import ssl
import bs4
import re
import requests
import csv
import codecs
import time

from urllib import request, error

context = ssl._create_unverified_context()


class DouBanSpider:
    def __init__(self):
        self.userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        self.headers = {"User-Agent": self.userAgent}

    # fetch the category tags for douban books
    def getBookCategroies(self):
        try:
            url = "https://book.douban.com/tag/?view=type&icn=index-sorttags-all"
            response = request.urlopen(url, context=context)
            content = response.read().decode("utf-8")
            return content
        except error.HTTPError as identifier:
            print("errorCode: " + str(identifier.code) + " errorReason: " + identifier.reason)
            return None

    # find the tag names on the page
    def getCategroiesContent(self):
        content = self.getBookCategroies()
        if not content:
            print("Failed to fetch the page...")
            return None
        soup = bs4.BeautifulSoup(content, "lxml")
        categroyMatch = re.compile(r"^/tag/*")
        categroies = []
        for categroy in soup.find_all("a", {"href": categroyMatch}):
            if categroy:
                categroies.append(categroy.string)
        return categroies

    # build the link for every tag
    def getCategroyLink(self):
        categroies = self.getCategroiesContent()
        categroyLinks = []
        for item in categroies:
            link = "https://book.douban.com/tag/" + str(item)
            categroyLinks.append(link)
        return categroyLinks

    def getBookInfo(self, categroyLinks):
        self.setCsvTitle()
        categroies = categroyLinks
        try:
            for link in categroies:
                print("Crawling: " + link)
                bookList = []
                response = requests.get(link)
                soup = bs4.BeautifulSoup(response.text, 'lxml')
                bookCategroy = soup.h1.string
                for book in soup.find_all("li", {"class": "subject-item"}):
                    bookSoup = bs4.BeautifulSoup(str(book), "lxml")
                    bookTitle = bookSoup.h2.a["title"]
                    bookAuthor = bookSoup.find("div", {"class": "pub"})
                    bookComment = bookSoup.find("span", {"class": "pl"})
                    bookContent = bookSoup.li.p
                    # print(bookContent)
                    if bookTitle and bookAuthor and bookComment and bookContent:
                        bookList.append([bookTitle.strip(), bookCategroy.strip(), bookAuthor.string.strip(),
                                         bookComment.string.strip(), bookContent.string.strip()])
                self.saveBookInfo(bookList)
                time.sleep(3)

            print("Crawling finished....")

        except error.HTTPError as identifier:
            print("errorCode: " + str(identifier.code) + " errorReason: " + identifier.reason)
            return None

    def setCsvTitle(self):
        csvFile = codecs.open("data/data.csv", 'a', 'utf_8_sig')
        try:
            writer = csv.writer(csvFile)
            writer.writerow(['title', 'tag', 'info', 'comments', 'content'])
        finally:
            csvFile.close()

    def saveBookInfo(self, bookList):
        csvFile = codecs.open("data/data.csv", 'a', 'utf_8_sig')
        try:
            writer = csv.writer(csvFile)
            for book in bookList:
                writer.writerow(book)
        finally:
            csvFile.close()

    def start(self):
        categroyLink = self.getCategroyLink()
        self.getBookInfo(categroyLink)


douBanSpider = DouBanSpider()
douBanSpider.start()

‎chapter-9/cluster/normalization.py

+42
@@ -0,0 +1,42 @@
"""
@author: liushuchun
"""
import re
import string
import jieba

# load the stopword list (strip trailing newlines so membership tests work)
with open("dict/stop_words.utf8", encoding="utf8") as f:
    stopword_list = [word.strip() for word in f.readlines()]


def tokenize_text(text):
    tokens = jieba.lcut(text)
    tokens = [token.strip() for token in tokens]
    return tokens


def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ''.join(filtered_tokens)
    return filtered_text


def normalize_corpus(corpus):
    normalized_corpus = []
    for text in corpus:
        # segment with jieba and re-join with spaces so the vectorizers can split on whitespace
        text = " ".join(jieba.lcut(text))
        normalized_corpus.append(text)

    return normalized_corpus
