-
Notifications
You must be signed in to change notification settings - Fork 0
/
Utils.py
78 lines (60 loc) · 2.21 KB
/
Utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'''
Utility functions
'''
from ElasticSearchServer import ElasticSearchServer
import json, re
from datetime import date, datetime
class Utils:
    '''
    Helpers that reshape raw paper metadata into index-ready JSON lines and
    hand those lines to a search server.
    '''

    def __init__(self):
        pass

    def __clean(self, t):
        '''
        Replace every newline character in a string with a single space
        @params
        t: text to clean
        @returns
        the cleaned text
        '''
        return re.sub("\n", " ", t)

    def __author_parser__(self, authors):
        '''
        Convert positional author records into dictionaries
        @params
        authors: iterable of sequences read as [last name, first name, third field]
        @returns
        list of dicts with keys "fname", "lname", "full_name", "affiliation"
        '''
        # NOTE(review): element 2 is stored under "affiliation"; confirm against
        # the upstream data that this position really holds an affiliation.
        return [
            {
                "fname": author[1],
                "lname": author[0],
                "full_name": author[1] + " " + author[0],
                "affiliation": author[2],
            }
            for author in authors
        ]

    def __date_parser__(self, d):
        '''
        Parse a timestamp such as "Mon, 2 Apr 2007 19:18:42 GMT"
        @params
        d: timestamp string in the format "%a, %d %b %Y %H:%M:%S %Z"
        @returns
        tuple (ISO date string, {"year": ..., "month": ..., "day": ...});
        the time-of-day part of the input is discarded
        '''
        parsed = datetime.strptime(d, "%a, %d %b %Y %H:%M:%S %Z").date()
        parts = {"year": parsed.year, "month": parsed.month, "day": parsed.day}
        return parsed.isoformat(), parts

    def wrap_in_json(self, datapath, outpath="data/data.json"):
        '''
        Wrap data in JSON format ready to be ingested into an index

        For every input document two lines are written: an action line
        {"index": {"_id": <position in the input list>}} followed by the
        reshaped document itself.
        @params
        datapath: path to data (a JSON file holding a list of documents with
                  the keys "id", "title", "journal-ref", "authors_parsed",
                  "categories", "license", "abstract" and "versions")
        outpath: path of the file to write; defaults to "data/data.json",
                 whose parent directory must already exist
        @raises
        KeyError if a document lacks one of the keys listed above
        '''
        # JSON interchange files are UTF-8; read explicitly so the result does
        # not depend on the platform's default encoding.
        with open(datapath, "r", encoding="utf-8") as rf:
            data = json.load(rf)

        # The context manager guarantees the output is flushed and closed even
        # if a malformed document raises part-way through.
        with open(outpath, "w", encoding="utf-8") as wf:
            for i, doc in enumerate(data):
                # Parse the first version's creation date once, use both forms.
                iso_date, date_parts = self.__date_parser__(
                    doc["versions"][0]["created"]
                )
                obj = {
                    "pid": doc["id"],
                    "title": self.__clean(doc["title"]),
                    "journal": doc["journal-ref"],
                    # NOTE(review): despite the key name this holds the PDF
                    # URL built from the id; kept unchanged because the index
                    # schema is not visible from this file.
                    "doi": "https://arxiv.org/pdf/" + doc["id"] + ".pdf",
                    "authors": self.__author_parser__(doc["authors_parsed"]),
                    "categories": doc["categories"].split(" "),
                    "license": doc["license"],
                    "abstract": self.__clean(doc["abstract"]),
                    "date": iso_date,
                    "date_parsed": date_parts,
                }
                json.dump({"index": {"_id": i}}, wf)
                wf.write("\n")
                json.dump(obj, wf)
                wf.write("\n")

    def load_to_index(self, datapath, server):
        '''
        Load the JSON formatted data to index
        @params
        datapath: path to json file
        server: elastic search server; its load() method is called once with
                the list of lines read from the file (line endings included)
        '''
        # Undecodable bytes are dropped rather than raising (errors='ignore').
        with open(datapath, encoding='utf-8', errors='ignore') as f:
            data = f.readlines()
        server.load(data)