forked from karpathy/arxiv-sanity-preserver
-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_cache.py
103 lines (88 loc) · 3.49 KB
/
make_cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
computes various cache things on top of db.py so that the server
(running from serve.py) can start up and serve faster when restarted.
this script should be run whenever db.p is updated, and
creates db2.p, which can be read by the server.
"""
import os
import json
import time
import pickle
import dateutil.parser
from sqlite3 import dbapi2 as sqlite3
from utils import safe_pickle_dump, Config
# Open the sqlite database; its `library` table is read further down to
# count how often each paper was saved.
sqldb = sqlite3.connect(Config.database_path)
sqldb.row_factory = sqlite3.Row  # rows allow access by column name (row['paper_id'])

# Everything precomputed for the server is collected in this dict and
# pickled at the end of the script.
CACHE = {}

print('loading the paper database', Config.db_path)
# `with` closes the file handle as soon as the pickle has been read
# (the bare open() used previously left it to the garbage collector).
with open(Config.db_path, 'rb') as db_file:
    db = pickle.load(db_file)

print('loading tfidf_meta', Config.meta_path)
with open(Config.meta_path, "rb") as meta_file:
    meta = pickle.load(meta_file)
vocab = meta['vocab']  # word -> index into `idf` (used as idf[vocab[w]] by makedict)
idf = meta['idf']
print('decorating the database with additional information...')
for pid, p in db.items():
    # Store integer epoch seconds in the paper struct for future convenience.
    #
    # time.mktime(timetuple()) is used instead of strftime("%s"): "%s" is a
    # platform-specific C library extension that Python does not document or
    # guarantee, so it fails or misbehaves on some systems. mktime interprets
    # the broken-down time as local time, which is what "%s" does where it
    # is supported, and it matches how the tscore computation below converts
    # the same 'updated' field.
    timestruct = dateutil.parser.parse(p['updated'])
    p['time_updated'] = int(time.mktime(timestruct.timetuple()))
    timestruct = dateutil.parser.parse(p['published'])
    p['time_published'] = int(time.mktime(timestruct.timetuple()))
print('computing min/max time for all papers...')
# Convert each paper's 'updated' stamp to epoch seconds exactly once and
# reuse the value for both the min/max scan and the per-paper score.
updated_secs = {
    pid: time.mktime(dateutil.parser.parse(p['updated']).timetuple())
    for pid, p in db.items()
}
tts = list(updated_secs.values())
# default=0.0 keeps an empty database from raising ValueError in min()/max()
ttmin = min(tts, default=0.0) * 1.0
ttmax = max(tts, default=0.0) * 1.0
ttspan = ttmax - ttmin
for pid, p in db.items():
    # tscore is the paper's position in [0, 1] between the oldest and the
    # most recently updated paper. When every paper shares one timestamp
    # (e.g. a single-paper database) the span is 0; score 0.0 in that case
    # instead of dividing by zero.
    p['tscore'] = (updated_secs[pid] - ttmin) / ttspan if ttspan > 0 else 0.0
print('precomputing papers date sorted...')
# Paper ids ordered newest-first by their 'time_updated' stamp. The sort is
# stable, so papers with equal stamps keep the database's iteration order.
CACHE['date_sorted_pids'] = sorted(
    db,
    key=lambda paper_id: db[paper_id]['time_updated'],
    reverse=True,
)
# rank papers by how many library rows reference them
print('computing top papers...')
library_rows = sqldb.execute('''select * from library''').fetchall()
save_counts = {}
for row in library_rows:
    paper = row['paper_id']
    if paper in save_counts:
        save_counts[paper] += 1
    else:
        save_counts[paper] = 1
# (count, paper id) tuples sorted in descending order: highest count first,
# ties broken by the larger paper id first
ranked = [(n, paper) for paper, n in save_counts.items()]
ranked.sort(reverse=True)
CACHE['top_sorted_pids'] = [paper for _, paper in ranked]
# helpers for building a per-paper word index used for faster search
punc = "'!\"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'" # removed hyphen from string.punctuation
# translation table that deletes every character listed in `punc`
trans_table = str.maketrans('', '', punc)


def makedict(s, forceidf=None, scale=1.0):
    """
    Map every distinct word of `s` to a search weight.

    `s` is lowercased, stripped of the characters in `punc`, and split on
    whitespace. If `forceidf` is given, every word receives exactly that
    weight and `scale` is not applied. Otherwise a word present in `vocab`
    receives its idf value times `scale`, and an unknown word receives
    1.0 * scale (i.e. it is assumed to have a low idf).

    Returns a dict of word -> weight.
    """
    tokens = set(s.lower().translate(trans_table).strip().split())
    if forceidf is not None:
        return {tok: forceidf for tok in tokens}
    # todo: if we're using bigrams in vocab then this won't search over them
    return {
        tok: (idf[vocab[tok]] if tok in vocab else 1.0) * scale
        for tok in tokens
    }
def merge_dicts(dlist):
    """
    Combine the dicts in `dlist` into one new dict, summing the values of
    keys that occur in more than one of them. The inputs are not modified.
    """
    merged = {}
    for mapping in dlist:
        for key, weight in mapping.items():
            # start every key at 0 so each value is added the same way
            merged.setdefault(key, 0)
            merged[key] = merged[key] + weight
    return merged
print('building an index for faster search...')
search_dict = {}
for pid, p in db.items():
    # Title and author words get a fixed weight of 5 (makedict does not apply
    # `scale` when forceidf is set); summary words are weighted by their idf.
    title_weights = makedict(p['title'], forceidf=5, scale=3)
    author_names = ' '.join(author['name'] for author in p['authors'])
    author_weights = makedict(author_names, forceidf=5)
    # special case for "and" handling in authors list: never index it
    author_weights.pop('and', None)
    category_weights = {tag['term'].lower(): 5 for tag in p['tags']}
    summary_weights = makedict(p['summary'])
    # a word that appears in several fields ends up with the sum of its weights
    search_dict[pid] = merge_dicts(
        [title_weights, author_weights, category_weights, summary_weights]
    )
CACHE['search_dict'] = search_dict
# write out the cache first, then the decorated paper database
for payload, out_path in (
    (CACHE, Config.serve_cache_path),
    (db, Config.db_serve_path),
):
    print('writing', out_path)
    safe_pickle_dump(payload, out_path)