Commit d5b15fa: Initial commit
0 parents; 27 files changed, +1328 −0 lines

.gitignore

+3 lines

__pycache__/
site_credentials.py
.DS_Store

.gitmodules

+3 lines

[submodule "BiblioWikidata"]
    path = BiblioWikidata
    url = https://github.com/harej/BiblioWikidata

BiblioWikidata

Submodule BiblioWikidata added at 6cc6931

README.md

+3 lines

Various scripts I've written in support of my bibliographic metadata work on Wikidata.

Most of these scripts are lousy and rushed. I do not recommend using them. I'm mostly putting this up so that I have access to the files when I'm not on my computer.

__init__.py

Whitespace-only changes.

crossref_issn_associator.py

+54 lines

import requests
import threading


class AskCrossref(threading.Thread):
    def __init__(self, threadID, name, package, issn_to_wikidata, doi_to_wikidata):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.package = package
        self.issn_to_wikidata = issn_to_wikidata
        self.doi_to_wikidata = doi_to_wikidata

    def run(self):
        for doi in self.package:
            try:
                r = requests.get("https://doi.org/" + doi, headers={"Accept": "application/json"})
            except requests.exceptions.RequestException:  # skip DOIs that fail to resolve
                continue
            if r.status_code != 200:
                continue
            try:
                blob = r.json()
            except ValueError:
                continue
            if "ISSN" in blob:
                issn_item_list = []
                for issn in blob["ISSN"]:
                    if issn in self.issn_to_wikidata:
                        issn_item_list.append(self.issn_to_wikidata[issn])
                issn_item_list = list(set(issn_item_list))
                for issn_item in issn_item_list:
                    article_item = self.doi_to_wikidata[doi]
                    if issn_item != article_item:
                        # QuickStatements v1: published in (P1433) with a stated-in (S248) reference
                        print(article_item + "\tP1433\t" + issn_item + "\tS248\tQ5188229")


def main():
    issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%3Fissn%20%7D"
    issn_seed = requests.get(issn_query_url).json()["results"]["bindings"]
    issn_to_wikidata = {x["issn"]["value"]: x["i"]["value"].replace("http://www.wikidata.org/entity/", "") for x in issn_seed}

    doi_seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fitem%20%3Fdoi%20where%20%7B%20%0A%20%20%3Fitem%20wdt%3AP356%20%3Fdoi%20.%0A%20%20optional%20%7B%3Fitem%20wdt%3AP1433%20%3Fx%7D%0A%20%20filter%28%21bound%28%3Fx%29%29%0A%7D%0Aorder%20by%20%3Fitem").json()
    doi_to_wikidata = {x["doi"]["value"]: x["item"]["value"].replace("http://www.wikidata.org/entity/", "") for x in doi_seed["results"]["bindings"]}
    doi_list = list(doi_to_wikidata.keys())
    doi_packages = [doi_list[x:x+1000] for x in range(0, len(doi_list), 1000)]

    thread_counter = 0
    for package in doi_packages:
        thread = AskCrossref(thread_counter, "thread-" + str(thread_counter), package, issn_to_wikidata, doi_to_wikidata)
        thread_counter += 1
        thread.start()


if __name__ == '__main__':
    main()
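
The SPARQL in this and the following scripts is URL-encoded and hard to read in place. A minimal sketch (not part of the commit) that decodes one of the queries with the standard library:

from urllib.parse import unquote

issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%3Fissn%20%7D"
print(unquote(issn_query_url.split("query=", 1)[1]))
# prints: select ?i ?issn where { ?i wdt:P236 ?issn }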

crossref_issn_itemizer.py

+71 lines

import requests
import threading


already_processed = []


class AskCrossref(threading.Thread):
    def __init__(self, threadID, name, package, issn_in_wikidata):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.package = package
        self.issn_in_wikidata = issn_in_wikidata

    def run(self):
        global already_processed  # shared by every thread without a lock; see the note below
        for doi in self.package:
            try:
                r = requests.get("https://dx.doi.org/" + doi, headers={"Accept": "application/json"})
            except requests.exceptions.RequestException:
                continue
            if r.status_code != 200:
                continue

            try:
                blob = r.json()
            except ValueError:
                continue
            if "ISSN" in blob:
                for issn in blob["ISSN"]:
                    if issn in self.issn_in_wikidata:
                        continue
                    if issn in already_processed:
                        continue
                    worldcat = requests.get("http://xissn.worldcat.org/webservices/xid/issn/{0}?format=json&method=getMetadata&fl=title".format(issn))
                    if worldcat.status_code == 200:
                        try:
                            worldcat = worldcat.json()
                        except ValueError:
                            continue
                        # Guard each level of the nested WorldCat response before indexing into it.
                        if "group" in worldcat:
                            if len(worldcat["group"]) == 1:
                                if worldcat["group"][0].get("list"):
                                    if "issn" in worldcat["group"][0]["list"][0]:
                                        # QuickStatements v1: create a new journal item carrying this ISSN
                                        output_string = ""
                                        output_string += "CREATE\n"
                                        output_string += "LAST\tP236\t\"" + issn + "\"\n"
                                        if "title" in worldcat["group"][0]["list"][0]:
                                            output_string += "LAST\tLen\t\"" + worldcat["group"][0]["list"][0]["title"] + "\"\n"
                                        output_string += "LAST\tDen\t\"journal\"\n"
                                        output_string += "LAST\tP31\tQ5633421"
                                        print(output_string)
                                        already_processed.append(issn)


def main():
    issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%3Fissn%20%7D"
    issn_seed = requests.get(issn_query_url).json()["results"]["bindings"]
    issn_in_wikidata = [x["issn"]["value"] for x in issn_seed]

    doi_seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fdoi%20where%20%7B%20%0A%20%20%3Fitem%20wdt%3AP356%20%3Fdoi%20.%0A%20%20optional%20%7B%3Fitem%20wdt%3AP1433%20%3Fx%7D%0A%20%20filter%28%21bound%28%3Fx%29%29%0A%7D").json()
    doi_list = [x["doi"]["value"] for x in doi_seed["results"]["bindings"]]
    doi_packages = [doi_list[x:x+1000] for x in range(0, len(doi_list), 1000)]

    thread_counter = 0
    for package in doi_packages:
        thread = AskCrossref(thread_counter, "thread-" + str(thread_counter), package, issn_in_wikidata)
        thread_counter += 1
        thread.start()


if __name__ == "__main__":
    main()
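
Note: already_processed is a plain list mutated by every thread at once, so two threads can claim the same ISSN or interleave appends. A minimal thread-safe variant, sketched under the assumption that the rest of the script is unchanged (mark_processed is a hypothetical helper, not in the commit):

import threading

already_processed = set()
already_processed_lock = threading.Lock()

def mark_processed(issn):
    # Returns True only for the first thread to claim this ISSN.
    with already_processed_lock:
        if issn in already_processed:
            return False
        already_processed.add(issn)
        return True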

doi_normalizer.py

+83 lines

import requests


def main():
    # Canonical DOI format: all uppercase letters.
    # Four scenarios:
    #
    # 1. One DOI on an item that is identical to the canonical format:
    #    No action needed.
    #
    # 2. One DOI on an item that is not identical to the canonical format:
    #    Convert to canonical format.
    #
    # 3. Two or more DOIs on an item that match the canonical format when converted to uppercase:
    #    Check if a canonically formatted DOI is already on the item and keep that. Remove the non-matching ones.
    #    If none match, create an entry in the canonical format and delete the others.
    #
    # 4. Two or more DOIs that are different even when normalized to uppercase:
    #    Do nothing. This is a special case and requires manual intervention.

    url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fdoi%20where%20%7B%20%3Fi%20wdt%3AP356%20%3Fdoi%20%7D%20order%20by%20%3Fi"
    seed = requests.get(url).json()["results"]["bindings"]

    manifest = {}  # maps each Wikidata item to its list of DOIs

    for result in seed:
        wikidata_item = result["i"]["value"].replace("http://www.wikidata.org/entity/", "")
        doi = result["doi"]["value"]

        if wikidata_item not in manifest:
            manifest[wikidata_item] = []

        manifest[wikidata_item].append(doi)

    lines_to_print = []

    for wikidata_item, doi_list in manifest.items():

        canonical = doi_list[0].upper()

        if len(doi_list) > 1:
            # Test whether all DOIs are the same when converted to uppercase.
            # If not, this is case 4 and must be skipped.

            requires_manual_intervention = False
            for doi in doi_list[1:]:
                if doi.upper() != canonical:
                    requires_manual_intervention = True

            if requires_manual_intervention:
                continue

            # Next: find out if the canonical form is already in this list.

            canonical_is_present = False
            for doi in doi_list:
                if doi == canonical:
                    canonical_is_present = True
                else:
                    lines_to_print.append("-" + wikidata_item + "|P356|\"" + doi + "\"||")

            if not canonical_is_present:
                lines_to_print.append(wikidata_item + "|P356|\"" + canonical + "\"||")

        else:
            if doi_list[0] != canonical:
                lines_to_print.append("-" + wikidata_item + "|P356|\"" + doi_list[0] + "\"||")
                lines_to_print.append(wikidata_item + "|P356|\"" + canonical + "\"||")

    packages = [lines_to_print[x:x+20000] for x in range(0, len(lines_to_print), 20000)]

    counter = 0
    for package in packages:
        with open('normalized-' + str(counter).zfill(2) + '.txt', 'w') as f:
            to_write = ''
            for line in package:
                to_write += line + '\n'  # one statement per line
            f.write(to_write)
        counter += 1


if __name__ == '__main__':
    main()
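
A small worked illustration of the four scenarios (hypothetical items and DOIs, not in the commit; comments show what the loop above would emit for each):

manifest = {
    "Q101": ["10.1000/ABC"],                 # case 1: already canonical -> nothing
    "Q102": ["10.1000/abc"],                 # case 2: -Q102|P356|"10.1000/abc"|| then Q102|P356|"10.1000/ABC"||
    "Q103": ["10.1000/abc", "10.1000/ABC"],  # case 3: -Q103|P356|"10.1000/abc"|| (the canonical one is kept)
    "Q104": ["10.1000/abc", "10.1001/xyz"],  # case 4: skipped for manual review
}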

doi_to_pmcid.py

+38 lines

import requests


def main():
    # Wikidata items that have a DOI (P356) but no PMCID (P932)
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fitem%20%3Fdoi%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP356%20%3Fdoi%20.%0A%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP932%20%3Fdummy1%20%7D%0A%20%20FILTER%28%21bound%28%3Fdummy1%29%29%0A%7D"
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?format=json&tool=wikidata_worker&[email protected]&ids="

    r = requests.get(seed)
    blob = r.json()

    wikidata_items = {}  # key: doi; value: wikidata item
    doi = []
    for result in blob["results"]["bindings"]:
        wikidata_items[result["doi"]["value"]] = result["item"]["value"].replace("http://www.wikidata.org/entity/", "")
        doi.append(result["doi"]["value"])

    # Query the ID Converter in batches of 200 DOIs
    packages = [doi[x:x+200] for x in range(0, len(doi), 200)]

    for package in packages:
        query_string = ""
        for item in package:
            query_string += item + ","
        query_string = query_string[:-1]  # Remove trailing comma

        s = requests.get(url + query_string)
        try:
            blob = s.json()
        except ValueError:
            continue

        if "records" in blob:
            for response in blob["records"]:
                if "pmcid" in response:
                    print(wikidata_items[response["doi"]] + "\tP932\t\"" + response["pmcid"].replace("PMC", "") + "\"")
                if "pmid" in response:
                    print(wikidata_items[response["doi"]] + "\tP698\t\"" + response["pmid"] + "\"")


if __name__ == '__main__':
    main()
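
A minimal single-DOI request against the same ID Converter endpoint, sketched for inspection (the DOI is an arbitrary published example, and the tool/email parameters are omitted for brevity):

import requests

r = requests.get(
    "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
    params={"format": "json", "ids": "10.1093/nar/gks1195"},
)
for record in r.json().get("records", []):
    print(record.get("doi"), record.get("pmid"), record.get("pmcid"))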

doi_to_pmid.py

+41 lines

import requests
import threading


class AskPubMed(threading.Thread):
    def __init__(self, threadID, name, package):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.package = package

    def run(self):
        for item in self.package:
            wikidata_item = item[0]
            doi = item[1]
            try:
                r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmode=json&db=pubmed&term=" + doi).json()
            except (OSError, ValueError):
                continue

            # Only accept an unambiguous match: exactly one result and no errors.
            if "esearchresult" in r:
                if "count" in r["esearchresult"]:
                    if r["esearchresult"]["count"] == "1":
                        if "errorlist" not in r["esearchresult"]:
                            pmid = r["esearchresult"]["idlist"][0]
                            print(wikidata_item + "\tP698\t\"" + pmid + "\"")


def main():
    prefix = "http://www.wikidata.org/entity/"
    # Wikidata items that have a DOI (P356) but no PubMed ID (P698)
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fi%20%3Fd%20WHERE%20%7B%0A%20%20%3Fi%20wdt%3AP356%20%3Fd%20.%0A%20%20OPTIONAL%20%7B%20%3Fi%20wdt%3AP698%20%3Fp%20%7D%0A%20%20FILTER%28%21bound%28%3Fp%29%29%0A%7D%0AORDER%20BY%20%3Fi"
    r = requests.get(seed).json()
    items = [(x["i"]["value"].replace(prefix, ""), x["d"]["value"]) for x in r["results"]["bindings"]]
    packages = [items[x:x+3000] for x in range(0, len(items), 3000)]
    thread_counter = 0
    for package in packages:
        thread = AskPubMed(thread_counter, "thread-" + str(thread_counter), package)
        thread_counter += 1
        thread.start()


if __name__ == '__main__':
    main()
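
The same esearch lookup, sketched as a standalone function with explicit query parameters (assumption: behavior matches the string-concatenation form above for well-formed DOIs):

import requests

def pmid_for_doi(doi):
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"retmode": "json", "db": "pubmed", "term": doi},
    ).json()
    result = r.get("esearchresult", {})
    # Accept only an unambiguous, error-free single hit, as the script does.
    if result.get("count") == "1" and "errorlist" not in result and result.get("idlist"):
        return result["idlist"][0]
    return None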

isbn_associator.py

+36 lines

import csv
import requests

# Items that already have a venue (P1433), an ISBN-13 (P212), or an ISBN-10 (P957);
# rows for these items are skipped below.
seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20where%20%7B%0A%20%20%7B%0A%20%20%20%20%3Fi%20wdt%3AP2880%20%3Fn%20.%0A%20%20%20%20%3Fi%20wdt%3AP1433%20%3Fpublishedin%20.%0A%20%20%7D%20UNION%20%7B%0A%20%20%20%20%3Fi%20wdt%3AP2880%20%3Fn%20.%0A%20%20%20%20%3Fi%20wdt%3AP212%20%3Fisbn13%20.%0A%20%20%7D%20UNION%20%7B%0A%20%20%20%20%3Fi%20wdt%3AP2880%20%3Fn%20.%0A%20%20%20%20%3Fi%20wdt%3AP957%20%3Fisbn10%20.%0A%20%20%7D%0A%7D").json()["results"]["bindings"]
do_not_generate = [x["i"]["value"].replace("http://www.wikidata.org/entity/", "") for x in seed]

isbn_bank = {}  # maps ISBNs to their putative titles, to prevent duplicate lookups

with open("nioshtic_isbn.csv") as f:
    spreadsheet = csv.reader(f)
    for row in spreadsheet:
        wikidata_item = row[0]
        original_title = row[1]
        isbn = row[2]

        if wikidata_item in do_not_generate:
            continue

        if len(isbn) < 10:
            isbn = isbn.zfill(10)  # restore leading zeros stripped by the spreadsheet

        if isbn not in isbn_bank:
            get_title = requests.get("http://xissn.worldcat.org/webservices/xid/isbn/" + isbn + "?format=json&method=getMetadata&fl=title")
            if get_title.status_code != 200:
                continue
            try:
                blob = get_title.json()
            except ValueError:
                continue
            if "list" in blob:
                if len(blob["list"]) == 1:
                    if "title" in blob["list"][0]:
                        print(wikidata_item + "\t" + original_title + "\t" + isbn + "\t" + blob["list"][0]["title"])
                        isbn_bank[isbn] = blob["list"][0]["title"]
        else:
            print(wikidata_item + "\t" + original_title + "\t" + isbn + "\t" + isbn_bank[isbn])
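
Note: zfill(10) restores leading zeros but cannot catch other damage in the CSV. A hedged pre-filter using the standard ISBN-10 check digit (not in the commit; adding it would only skip malformed rows before the WorldCat lookup):

def isbn10_is_valid(isbn):
    # ISBN-10 checksum: sum of digit * weight (10 down to 1) must be 0 mod 11;
    # a final "X" stands for the value 10.
    if len(isbn) != 10:
        return False
    total = 0
    for i, ch in enumerate(isbn):
        if ch == "X" and i == 9:
            value = 10
        elif ch.isdigit():
            value = int(ch)
        else:
            return False
        total += (10 - i) * value
    return total % 11 == 0

assert isbn10_is_valid("0306406152")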

isbn_associator_2.py

+21 lines

import csv
import requests

# All ISBN-13s (P212) on Wikidata, hyphens stripped, mapped to their items
seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fitem%20%3Fisbn13%20where%20%7B%20%3Fitem%20wdt%3AP212%20%3Fisbn13%20%7D").json()["results"]["bindings"]
isbn13_to_wikidata = {x["isbn13"]["value"].replace("-", ""): x["item"]["value"].replace("http://www.wikidata.org/entity/", "") for x in seed}

# All ISBN-10s (P957) on Wikidata, hyphens stripped, mapped to their items
seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fitem%20%3Fisbn10%20where%20%7B%20%3Fitem%20wdt%3AP957%20%3Fisbn10%20%7D").json()["results"]["bindings"]
isbn10_to_wikidata = {x["isbn10"]["value"].replace("-", ""): x["item"]["value"].replace("http://www.wikidata.org/entity/", "") for x in seed}

with open("isbn_associator.csv") as f:
    spreadsheet = csv.reader(f)
    for row in spreadsheet:
        item = row[0].strip()
        isbn = row[1].strip()  # the item is published in the work carrying this ISBN

        # QuickStatements v1: published in (P1433) with a stated-in (S248) reference
        if len(isbn) == 13:
            if isbn in isbn13_to_wikidata:
                print(item + "\tP1433\t" + isbn13_to_wikidata[isbn] + "\tS248\tQ26822184")
        elif len(isbn) == 10:
            if isbn in isbn10_to_wikidata:
                print(item + "\tP1433\t" + isbn10_to_wikidata[isbn] + "\tS248\tQ26822184")
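
Note: the two lookup tables are disjoint, so a CSV row carrying an ISBN-10 never matches an item that only records the ISBN-13 (P212) form, and vice versa. A sketch of the standard ISBN-10 to ISBN-13 conversion that could bridge them (assumption: such cross-matching is desirable; this helper is not in the commit):

def isbn10_to_isbn13(isbn10):
    # Prefix 978, drop the old check digit, recompute the EAN-13 check digit.
    core = "978" + isbn10[:9]
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)

assert isbn10_to_isbn13("0306406152") == "9780306406157"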
