-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathword_join_red.py
executable file
·36 lines (28 loc) · 1.22 KB
/
word_join_red.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python
from __future__ import print_function
import sys
import map_reduce_utils as mru
def reduce_word_join(input=None, output=None):
    """
    (word) (file_name tfidf) --> (word) (file1 file2 tfidf1*tfidf2)

    For each word, if two distinct documents both contain that word,
    a line is emitted containing the product of the tfidf scores of that
    word in both documents.

    This is the first step in computing the pairwise dot product of the
    tf-idf vectors between all documents, where the corresponding elements
    for every pair of documents are multiplied together.

    Every ordered pair of values with differing filenames is emitted, so
    each unordered pair of documents appears twice: once as (file1, file2)
    and once with the two swapped.

    Args:
        input: iterable of (key, value_stream) pairs, where key is a dict
            with a 'word' entry and each value is a dict with 'filename'
            and 'tfidf' entries. Defaults to mru.reducer_stream().
        output: stream handed to mru.reducer_emit for each emitted record.
            Defaults to sys.stdout.
    """
    # Resolve the defaults at call time rather than in the signature:
    # defaults in a `def` are evaluated once at import, which would create
    # the reducer stream as an import side effect, share that single stream
    # between every default call, and pin whatever sys.stdout was at import.
    if input is None:
        input = mru.reducer_stream()
    if output is None:
        output = sys.stdout

    for in_key, key_stream in input:
        # The values are walked in a nested loop below, so the stream has
        # to be materialized before it can be iterated more than once.
        values = list(key_stream)
        for val1 in values:
            for val2 in values:
                # Skip a document paired with itself.
                if val1['filename'] != val2['filename']:
                    out_key = {'word': in_key['word']}
                    out_value = {'file1': val1['filename'],
                                 'file2': val2['filename'],
                                 'product': val1['tfidf'] * val2['tfidf']}
                    mru.reducer_emit(out_key, out_value, output)
# Script entry point: run the reducer with its default arguments when this
# file is executed directly rather than imported.
if __name__ == '__main__':
    reduce_word_join()