-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzenodo.py
70 lines (55 loc) · 2.16 KB
/
zenodo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os,json,time
import numpy as np
import pandas as pd
from utils import glob_colors as colors
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
records_json='static/spg/records.json'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# options.add_argument("--window-size=1920,1080")
# options.add_argument("--start-maximized")
# browser = webdriver.Firefox(options=options)
browser = webdriver.Chrome()
def get_records():
browser.get("https://zenodo.org/communities/microed/search?page=1&size=1000")
elts = browser.find_elements("class name", "record-elem")
d = {os.path.basename(e.get_dom_attribute("href")): e.text
for e in map(lambda e: e.find_elements("tag name", "a")[1], elts)}
# print(d)
# records = pd.DataFrame.from_dict(d,orient='index',columns=['title'])
return d
def get_files(records):
all_files = dict()
for record in records:
print(record)
browser.get("https://zenodo.org/record/%s" %record)
files = browser.find_element("id", "files").find_elements("tag name", "tr")
df = pd.DataFrame(columns=['size','link'])
for file in map(lambda e: e.find_elements("tag name", "td"),files[1:]):
a = file[0].find_element('tag name','a')
df.loc[a.text] = [file[1].text,a.get_dom_attribute("href")]
# print(df)
all_files[record]=df.to_dict('index')
time.sleep(np.random.rand()*2)
return all_files
records = get_records()
with open(records_json,'r') as f:
all_records = json.load(f)
new_records=np.setdiff1d(
list(records.keys()),
list(all_records.keys())
)
if any(new_records):
print(colors.green,'new records :')
print(colors.blue, ' '.join(new_records),colors.black)
new_files = get_files(new_records)
for record in new_records:
all_records[record]['files'] = new_files[record]
all_records[record]['title'] = records[record]
with open(records_json,'w') as f:
f.write(json.dumps(all_records))
print(colors.green,'zenodo updated ',colors.black)
else:
print(colors.green,'Already up to date with zenodo',colors.black)
browser.quit()