-
Notifications
You must be signed in to change notification settings - Fork 0
/
TF-IDF.py
73 lines (59 loc) · 2.33 KB
/
TF-IDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Find the most similar pair of lines in a text by comparing their bag-of-words TF-IDF representations.
import operator as op
import math
import numpy as np
# Sample corpus passed to main() below; main() splits it on newlines and
# treats every line as one document.
text = '''Humpty Dumpty sat on a wall
Humpty Dumpty had a great fall
all the king's horses and all the king's men
couldn't put Humpty together again'''
def main(text):
    """Print the pair of lines in *text* whose TF-IDF vectors are closest.

    Every newline-separated line is one document. Words are lower-cased and
    split on whitespace. For each term, tf is its count in the line divided
    by the line's word count, and idf is log10(number of lines / number of
    lines containing the term). Lines are compared with the Manhattan (L1)
    distance between their TF-IDF vectors.

    Blank (or whitespace-only) lines carry no words and are skipped, but the
    printed indices still refer to 0-based line numbers of the original text.

    Args:
        text: The corpus, one document per line.

    Returns:
        An empty tuple (kept for backward compatibility). The result itself
        is written to stdout as a tuple of two plain ints, e.g. ``(0, 1)``.
        On ties the pair that comes first in row-major order wins.

    Raises:
        ValueError: If *text* has fewer than two non-blank lines, because no
            pair of documents exists to compare.
    """
    # 1. Tokenise each line. The original line number is kept next to the
    #    words so that dropping blank lines does not shift reported indices.
    docs = []
    for line_no, line in enumerate(text.split('\n')):
        words = line.lower().split()
        if words:  # an empty document would divide by zero in the tf step
            docs.append((line_no, words))
    if len(docs) < 2:
        raise ValueError("text must contain at least two non-blank lines")

    # Sorted so the layout of the vectors is reproducible between runs
    # (iteration order of a set of strings is not).
    vocab = sorted({word for _, words in docs for word in words})

    # 2. Inverse document frequency (base-10 log) of every term.
    n_docs = len(docs)
    word_sets = [set(words) for _, words in docs]
    idf = {
        term: math.log(n_docs / sum(term in seen for seen in word_sets), 10)
        for term in vocab
    }

    # 3. One TF-IDF row vector per document, columns ordered like `vocab`.
    vectors = np.array(
        [
            [words.count(term) / len(words) * idf[term] for term in vocab]
            for _, words in docs
        ],
        dtype=float,
    )

    # 4. Pairwise Manhattan distances, computed one row at a time to keep
    #    memory at O(n_docs * len(vocab)).
    dist = np.empty((n_docs, n_docs), dtype=float)
    for row, vector in enumerate(vectors):
        dist[row] = np.abs(vectors - vector).sum(axis=1)
    # A line is trivially identical to itself; exclude those pairs.
    np.fill_diagonal(dist, np.inf)

    first, second = np.unravel_index(np.argmin(dist), dist.shape)
    # Map back to original line numbers. These are plain ints, so the output
    # reads "(0, 1)" regardless of how the installed NumPy prints its scalars.
    ans = (docs[first][0], docs[second][0])
    print(ans)
    return ()
# Run the analysis on the sample text. There is no __main__ guard, so this
# call also executes when the file is imported.
main(text)