-
Notifications
You must be signed in to change notification settings - Fork 0
/
bag of words.py
33 lines (27 loc) · 1.25 KB
/
bag of words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import numpy as np
# Bag-of-words counts for the poem 'The little piggy rhyme':
# one inner list per line of the poem, one position per word, and each
# entry is the number of occurrences of that word in that line.
data = [
    [1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
    [1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
    [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1],
    [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 0, 1, 3, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1],
]
def find_nearest_pair(data):
    """Find the two most similar rows of a bag-of-words count table.

    Similarity is measured with the Manhattan (L1) distance: for two rows,
    the sum over all words of the absolute difference of their counts.
    The pair with the smallest such sum is the nearest pair.

    Args:
        data: Rectangular 2-D sequence of numbers, one row per line of
            text and one column per word.

    Returns:
        The ``(row, other_row)`` index pair with the smallest distance, as
        produced by ``np.unravel_index``. Ties are resolved in favour of
        the first pair in row-major order, so ``row < other_row``.

    Raises:
        ValueError: If ``data`` is not a rectangular 2-D table with at
            least two rows (no pair exists otherwise).
    """
    counts = np.asarray(data, dtype=float)
    if counts.ndim != 2 or counts.shape[0] < 2:
        raise ValueError("data must be a 2-D table with at least two rows")

    # Broadcasting compares every row with every other row in one step:
    # dist[a, b] = sum_k |counts[a, k] - counts[b, k]|.
    dist = np.abs(counts[:, np.newaxis, :] - counts[np.newaxis, :, :]).sum(axis=2)

    # Every row is at distance 0 from itself; mask the diagonal with
    # infinity so a row is never reported as its own nearest neighbour.
    np.fill_diagonal(dist, np.inf)

    # argmin returns a position in the flattened matrix; unravel_index
    # converts it back into (row, column) coordinates of the 2-D matrix.
    nearest = np.unravel_index(np.argmin(dist), dist.shape)
    print(nearest)
    return nearest
# Run the nearest-pair search on the sample counts defined above; the
# function prints the index pair it finds.
find_nearest_pair(data)