forked from norryfan93/system_design
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsim.py
106 lines (91 loc) · 2.98 KB
/
sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 4 17:58:07 2021
@author: Administrator
"""
import csv
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack
import numpy as np
import scipy
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import jieba
import time
corpus=[]
alltime=[]
annid=[]
with open('Control_Date.csv','r') as file:
file.readline()
reader=csv.reader(file)
for line in reader:
cutc=(jieba.cut(line[5],cut_all=False))
temp=[]
for i in cutc:
temp.append(i)
res=' '.join(temp)
corpus.append(res)
timeStruct = time.strptime(line[3], "%Y-%m-%d")
ts = int(time.mktime(timeStruct))
alltime.append(ts)
annid.append(line[4])
control=len(corpus)
treat=[]
with open('Treat_Date.csv','r') as file:
file.readline()
reader=csv.reader(file)
for line in reader:
cutc=(jieba.cut(line[5],cut_all=False))
temp=[]
for i in cutc:
temp.append(i)
res=' '.join(temp)
corpus.append(res)
treat.append(res)
timeStruct = time.strptime(line[3], "%Y-%m-%d")
ts = int(time.mktime(timeStruct))
alltime.append(ts)
annid.append(line[4])
centroids = []
vectorizer = CountVectorizer() # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
transformer = TfidfTransformer() # 该类会统计每个词语的tf-idf权值
vfit=vectorizer.fit_transform(corpus)
tfidf=transformer.fit_transform(vfit)#第一个fit_transform是计算tf-idf,第二个fit_transform是将文本转为词频矩阵
for i in range(len(corpus)):
if i%100==0:
print(i)
test_count=vectorizer.transform([corpus[i]])
test_tfidf = transformer.transform(test_count)
a=sparse.csr_matrix(test_tfidf.toarray())
c = a.astype('float16')
centroids.append(c)
c = vstack(centroids, format="csr")
similarities = cosine_similarity(c)
result=[]
for i in range(control,len(corpus)):
smaxm=similarities[i,0:control]
smax=max(smaxm)
temp=[]
for j,k in enumerate(smaxm):
if k==smax:
temp.append(j)
if len(temp)>1:
thr=9000000000
zzz=0
for a in temp:
print(corpus[a])
print(alltime[a])
diff=abs(alltime[a]-alltime[i])
if diff<=thr:
thr=diff
zzz=a
result.append((annid[i],annid[zzz]))
print(zzz)
else:
result.append((annid[i],annid[temp[0]]))
out1 = open('ressss.csv','w', newline='',encoding='utf-8-sig')
csv_write=csv.writer(out1,dialect='excel')
for i in result:
csv_write.writerow([i[0],i[1]])
out1.close()