Commit d5b15fa: Initial commit
0 parents; 27 files changed, +1328 −0 lines

.gitignore

+3 lines

__pycache__/
site_credentials.py
.DS_Store

.gitmodules

+3 lines

[submodule "BiblioWikidata"]
    path = BiblioWikidata
    url = https://github.com/harej/BiblioWikidata

BiblioWikidata

Submodule BiblioWikidata added at 6cc6931

README.md

+3 lines

Various scripts I've written in support of my bibliographic metadata work on Wikidata.

Most of these scripts are lousy and rushed. I do not recommend using them. I'm mostly putting this up so that I have access to the files when I'm not on my computer.

__init__.py

Whitespace-only changes.

crossref_issn_associator.py

+54 lines

import requests
import threading


class AskCrossref(threading.Thread):
    def __init__(self, threadID, name, package, issn_to_wikidata, doi_to_wikidata):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.package = package
        self.issn_to_wikidata = issn_to_wikidata
        self.doi_to_wikidata = doi_to_wikidata

    def run(self):
        for doi in self.package:
            try:
                r = requests.get("https://doi.org/" + doi, headers={"Accept": "application/json"})
            except requests.exceptions.RequestException:  # skip DOIs that fail to resolve
                continue
            if r.status_code != 200:
                continue
            try:
                blob = r.json()
            except ValueError:
                continue
            if "ISSN" in blob:
                issn_item_list = []
                for issn in blob["ISSN"]:
                    if issn in self.issn_to_wikidata:
                        issn_item_list.append(self.issn_to_wikidata[issn])
                issn_item_list = list(set(issn_item_list))
                for issn_item in issn_item_list:
                    article_item = self.doi_to_wikidata[doi]
                    if issn_item != article_item:
                        # QuickStatements v1: published in (P1433) with a stated-in (S248) reference
                        print(article_item + "\tP1433\t" + issn_item + "\tS248\tQ5188229")


def main():
    issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%3Fissn%20%7D"
    issn_seed = requests.get(issn_query_url).json()["results"]["bindings"]
    issn_to_wikidata = {x["issn"]["value"]: x["i"]["value"].replace("http://www.wikidata.org/entity/", "") for x in issn_seed}

    doi_seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fitem%20%3Fdoi%20where%20%7B%20%0A%20%20%3Fitem%20wdt%3AP356%20%3Fdoi%20.%0A%20%20optional%20%7B%3Fitem%20wdt%3AP1433%20%3Fx%7D%0A%20%20filter%28%21bound%28%3Fx%29%29%0A%7D%0Aorder%20by%20%3Fitem").json()
    doi_to_wikidata = {x["doi"]["value"]: x["item"]["value"].replace("http://www.wikidata.org/entity/", "") for x in doi_seed["results"]["bindings"]}
    doi_list = list(doi_to_wikidata.keys())
    doi_packages = [doi_list[x:x+1000] for x in range(0, len(doi_list), 1000)]

    thread_counter = 0
    for package in doi_packages:
        thread = AskCrossref(thread_counter, "thread-" + str(thread_counter), package, issn_to_wikidata, doi_to_wikidata)
        thread_counter += 1
        thread.start()


if __name__ == '__main__':
    main()
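
The SPARQL in this and the following scripts is URL-encoded and hard to read in place. A minimal sketch (not part of the commit) that decodes one of the queries with the standard library:

from urllib.parse import unquote

issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%3Fissn%20%7D"
print(unquote(issn_query_url.split("query=", 1)[1]))
# prints: select ?i ?issn where { ?i wdt:P236 ?issn }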

crossref_issn_itemizer.py

+71 lines

import requests
import threading


already_processed = []


class AskCrossref(threading.Thread):
    def __init__(self, threadID, name, package, issn_in_wikidata):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.package = package
        self.issn_in_wikidata = issn_in_wikidata

    def run(self):
        global already_processed  # shared by every thread without a lock; see the note below
        for doi in self.package:
            try:
                r = requests.get("https://dx.doi.org/" + doi, headers={"Accept": "application/json"})
            except requests.exceptions.RequestException:
                continue
            if r.status_code != 200:
                continue

            try:
                blob = r.json()
            except ValueError:
                continue
            if "ISSN" in blob:
                for issn in blob["ISSN"]:
                    if issn in self.issn_in_wikidata:
                        continue
                    if issn in already_processed:
                        continue
                    worldcat = requests.get("http://xissn.worldcat.org/webservices/xid/issn/{0}?format=json&method=getMetadata&fl=title".format(issn))
                    if worldcat.status_code == 200:
                        try:
                            worldcat = worldcat.json()
                        except ValueError:
                            continue
                        # Guard each level of the nested WorldCat response before indexing into it.
                        if "group" in worldcat:
                            if len(worldcat["group"]) == 1:
                                if worldcat["group"][0].get("list"):
                                    if "issn" in worldcat["group"][0]["list"][0]:
                                        # QuickStatements v1: create a new journal item carrying this ISSN
                                        output_string = ""
                                        output_string += "CREATE\n"
                                        output_string += "LAST\tP236\t\"" + issn + "\"\n"
                                        if "title" in worldcat["group"][0]["list"][0]:
                                            output_string += "LAST\tLen\t\"" + worldcat["group"][0]["list"][0]["title"] + "\"\n"
                                        output_string += "LAST\tDen\t\"journal\"\n"
                                        output_string += "LAST\tP31\tQ5633421"
                                        print(output_string)
                                        already_processed.append(issn)


def main():
    issn_query_url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fissn%20where%20%7B%20%3Fi%20wdt%3AP236%20%3Fissn%20%7D"
    issn_seed = requests.get(issn_query_url).json()["results"]["bindings"]
    issn_in_wikidata = [x["issn"]["value"] for x in issn_seed]

    doi_seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fdoi%20where%20%7B%20%0A%20%20%3Fitem%20wdt%3AP356%20%3Fdoi%20.%0A%20%20optional%20%7B%3Fitem%20wdt%3AP1433%20%3Fx%7D%0A%20%20filter%28%21bound%28%3Fx%29%29%0A%7D").json()
    doi_list = [x["doi"]["value"] for x in doi_seed["results"]["bindings"]]
    doi_packages = [doi_list[x:x+1000] for x in range(0, len(doi_list), 1000)]

    thread_counter = 0
    for package in doi_packages:
        thread = AskCrossref(thread_counter, "thread-" + str(thread_counter), package, issn_in_wikidata)
        thread_counter += 1
        thread.start()


if __name__ == "__main__":
    main()
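
Note: already_processed is a plain list mutated by every thread at once, so two threads can claim the same ISSN or interleave appends. A minimal thread-safe variant, sketched under the assumption that the rest of the script is unchanged (mark_processed is a hypothetical helper, not in the commit):

import threading

already_processed = set()
already_processed_lock = threading.Lock()

def mark_processed(issn):
    # Returns True only for the first thread to claim this ISSN.
    with already_processed_lock:
        if issn in already_processed:
            return False
        already_processed.add(issn)
        return True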

doi_normalizer.py

+83 lines

import requests


def main():
    # Canonical DOI format: all uppercase letters.
    # Four scenarios:
    #
    # 1. One DOI on an item that is identical to the canonical format:
    #    No action needed.
    #
    # 2. One DOI on an item that is not identical to the canonical format:
    #    Convert to canonical format.
    #
    # 3. Two or more DOIs on an item that match the canonical format when converted to uppercase:
    #    Check if a canonically formatted DOI is already on the item and keep that. Remove the non-matching ones.
    #    If none match, create an entry in the canonical format and delete the others.
    #
    # 4. Two or more DOIs that are different even when normalized to uppercase:
    #    Do nothing. This is a special case and requires manual intervention.

    url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fdoi%20where%20%7B%20%3Fi%20wdt%3AP356%20%3Fdoi%20%7D%20order%20by%20%3Fi"
    seed = requests.get(url).json()["results"]["bindings"]

    manifest = {}  # maps each Wikidata item to its list of DOIs

    for result in seed:
        wikidata_item = result["i"]["value"].replace("http://www.wikidata.org/entity/", "")
        doi = result["doi"]["value"]

        if wikidata_item not in manifest:
            manifest[wikidata_item] = []

        manifest[wikidata_item].append(doi)

    lines_to_print = []

    for wikidata_item, doi_list in manifest.items():

        canonical = doi_list[0].upper()

        if len(doi_list) > 1:
            # Test whether all DOIs are the same when converted to uppercase.
            # If not, this is case 4 and must be skipped.

            requires_manual_intervention = False
            for doi in doi_list[1:]:
                if doi.upper() != canonical:
                    requires_manual_intervention = True

            if requires_manual_intervention:
                continue

            # Next: find out if the canonical form is already in this list.

            canonical_is_present = False
            for doi in doi_list:
                if doi == canonical:
                    canonical_is_present = True
                else:
                    lines_to_print.append("-" + wikidata_item + "|P356|\"" + doi + "\"||")

            if not canonical_is_present:
                lines_to_print.append(wikidata_item + "|P356|\"" + canonical + "\"||")

        else:
            if doi_list[0] != canonical:
                lines_to_print.append("-" + wikidata_item + "|P356|\"" + doi_list[0] + "\"||")
                lines_to_print.append(wikidata_item + "|P356|\"" + canonical + "\"||")

    packages = [lines_to_print[x:x+20000] for x in range(0, len(lines_to_print), 20000)]

    counter = 0
    for package in packages:
        with open('normalized-' + str(counter).zfill(2) + '.txt', 'w') as f:
            to_write = ''
            for line in package:
                to_write += line + '\n'  # one statement per line
            f.write(to_write)
        counter += 1


if __name__ == '__main__':
    main()
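
A small worked illustration of the four scenarios (hypothetical items and DOIs, not in the commit; comments show what the loop above would emit for each):

manifest = {
    "Q101": ["10.1000/ABC"],                 # case 1: already canonical -> nothing
    "Q102": ["10.1000/abc"],                 # case 2: -Q102|P356|"10.1000/abc"|| then Q102|P356|"10.1000/ABC"||
    "Q103": ["10.1000/abc", "10.1000/ABC"],  # case 3: -Q103|P356|"10.1000/abc"|| (the canonical one is kept)
    "Q104": ["10.1000/abc", "10.1001/xyz"],  # case 4: skipped for manual review
}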

doi_to_pmcid.py

+38 lines

import requests


def main():
    # Wikidata items that have a DOI (P356) but no PMCID (P932)
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fitem%20%3Fdoi%20WHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP356%20%3Fdoi%20.%0A%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP932%20%3Fdummy1%20%7D%0A%20%20FILTER%28%21bound%28%3Fdummy1%29%29%0A%7D"
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?format=json&tool=wikidata_worker&[email protected]&ids="

    r = requests.get(seed)
    blob = r.json()

    wikidata_items = {}  # key: doi; value: wikidata item
    doi = []
    for result in blob["results"]["bindings"]:
        wikidata_items[result["doi"]["value"]] = result["item"]["value"].replace("http://www.wikidata.org/entity/", "")
        doi.append(result["doi"]["value"])

    # Query the ID Converter in batches of 200 DOIs
    packages = [doi[x:x+200] for x in range(0, len(doi), 200)]

    for package in packages:
        query_string = ""
        for item in package:
            query_string += item + ","
        query_string = query_string[:-1]  # Remove trailing comma

        s = requests.get(url + query_string)
        try:
            blob = s.json()
        except ValueError:
            continue

        if "records" in blob:
            for response in blob["records"]:
                if "pmcid" in response:
                    print(wikidata_items[response["doi"]] + "\tP932\t\"" + response["pmcid"].replace("PMC", "") + "\"")
                if "pmid" in response:
                    print(wikidata_items[response["doi"]] + "\tP698\t\"" + response["pmid"] + "\"")


if __name__ == '__main__':
    main()
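
A minimal single-DOI request against the same ID Converter endpoint, sketched for inspection (the DOI is an arbitrary published example, and the tool/email parameters are omitted for brevity):

import requests

r = requests.get(
    "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
    params={"format": "json", "ids": "10.1093/nar/gks1195"},
)
for record in r.json().get("records", []):
    print(record.get("doi"), record.get("pmid"), record.get("pmcid"))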

doi_to_pmid.py

+41 lines

import requests
import threading


class AskPubMed(threading.Thread):
    def __init__(self, threadID, name, package):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.package = package

    def run(self):
        for item in self.package:
            wikidata_item = item[0]
            doi = item[1]
            try:
                r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmode=json&db=pubmed&term=" + doi).json()
            except (OSError, ValueError):
                continue

            # Only accept an unambiguous match: exactly one result and no errors.
            if "esearchresult" in r:
                if "count" in r["esearchresult"]:
                    if r["esearchresult"]["count"] == "1":
                        if "errorlist" not in r["esearchresult"]:
                            pmid = r["esearchresult"]["idlist"][0]
                            print(wikidata_item + "\tP698\t\"" + pmid + "\"")


def main():
    prefix = "http://www.wikidata.org/entity/"
    # Wikidata items that have a DOI (P356) but no PubMed ID (P698)
    seed = "https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Fi%20%3Fd%20WHERE%20%7B%0A%20%20%3Fi%20wdt%3AP356%20%3Fd%20.%0A%20%20OPTIONAL%20%7B%20%3Fi%20wdt%3AP698%20%3Fp%20%7D%0A%20%20FILTER%28%21bound%28%3Fp%29%29%0A%7D%0AORDER%20BY%20%3Fi"
    r = requests.get(seed).json()
    items = [(x["i"]["value"].replace(prefix, ""), x["d"]["value"]) for x in r["results"]["bindings"]]
    packages = [items[x:x+3000] for x in range(0, len(items), 3000)]
    thread_counter = 0
    for package in packages:
        thread = AskPubMed(thread_counter, "thread-" + str(thread_counter), package)
        thread_counter += 1
        thread.start()


if __name__ == '__main__':
    main()
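
The same esearch lookup, sketched as a standalone function with explicit query parameters (assumption: behavior matches the string-concatenation form above for well-formed DOIs):

import requests

def pmid_for_doi(doi):
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"retmode": "json", "db": "pubmed", "term": doi},
    ).json()
    result = r.get("esearchresult", {})
    # Accept only an unambiguous, error-free single hit, as the script does.
    if result.get("count") == "1" and "errorlist" not in result and result.get("idlist"):
        return result["idlist"][0]
    return None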

isbn_associator.py

+36 lines

import csv
import requests

# Items that already have a venue (P1433), an ISBN-13 (P212), or an ISBN-10 (P957);
# rows for these items are skipped below.
seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20where%20%7B%0A%20%20%7B%0A%20%20%20%20%3Fi%20wdt%3AP2880%20%3Fn%20.%0A%20%20%20%20%3Fi%20wdt%3AP1433%20%3Fpublishedin%20.%0A%20%20%7D%20UNION%20%7B%0A%20%20%20%20%3Fi%20wdt%3AP2880%20%3Fn%20.%0A%20%20%20%20%3Fi%20wdt%3AP212%20%3Fisbn13%20.%0A%20%20%7D%20UNION%20%7B%0A%20%20%20%20%3Fi%20wdt%3AP2880%20%3Fn%20.%0A%20%20%20%20%3Fi%20wdt%3AP957%20%3Fisbn10%20.%0A%20%20%7D%0A%7D").json()["results"]["bindings"]
do_not_generate = [x["i"]["value"].replace("http://www.wikidata.org/entity/", "") for x in seed]

isbn_bank = {}  # maps ISBNs to their putative titles, to prevent duplicate lookups

with open("nioshtic_isbn.csv") as f:
    spreadsheet = csv.reader(f)
    for row in spreadsheet:
        wikidata_item = row[0]
        original_title = row[1]
        isbn = row[2]

        if wikidata_item in do_not_generate:
            continue

        if len(isbn) < 10:
            isbn = isbn.zfill(10)  # restore leading zeros stripped by the spreadsheet

        if isbn not in isbn_bank:
            get_title = requests.get("http://xissn.worldcat.org/webservices/xid/isbn/" + isbn + "?format=json&method=getMetadata&fl=title")
            if get_title.status_code != 200:
                continue
            try:
                blob = get_title.json()
            except ValueError:
                continue
            if "list" in blob:
                if len(blob["list"]) == 1:
                    if "title" in blob["list"][0]:
                        print(wikidata_item + "\t" + original_title + "\t" + isbn + "\t" + blob["list"][0]["title"])
                        isbn_bank[isbn] = blob["list"][0]["title"]
        else:
            print(wikidata_item + "\t" + original_title + "\t" + isbn + "\t" + isbn_bank[isbn])
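
Note: zfill(10) restores leading zeros but cannot catch other damage in the CSV. A hedged pre-filter using the standard ISBN-10 check digit (not in the commit; adding it would only skip malformed rows before the WorldCat lookup):

def isbn10_is_valid(isbn):
    # ISBN-10 checksum: sum of digit * weight (10 down to 1) must be 0 mod 11;
    # a final "X" stands for the value 10.
    if len(isbn) != 10:
        return False
    total = 0
    for i, ch in enumerate(isbn):
        if ch == "X" and i == 9:
            value = 10
        elif ch.isdigit():
            value = int(ch)
        else:
            return False
        total += (10 - i) * value
    return total % 11 == 0

assert isbn10_is_valid("0306406152")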

isbn_associator_2.py

+21 lines

import csv
import requests

# All ISBN-13s (P212) on Wikidata, hyphens stripped, mapped to their items
seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fitem%20%3Fisbn13%20where%20%7B%20%3Fitem%20wdt%3AP212%20%3Fisbn13%20%7D").json()["results"]["bindings"]
isbn13_to_wikidata = {x["isbn13"]["value"].replace("-", ""): x["item"]["value"].replace("http://www.wikidata.org/entity/", "") for x in seed}

# All ISBN-10s (P957) on Wikidata, hyphens stripped, mapped to their items
seed = requests.get("https://query.wikidata.org/sparql?format=json&query=select%20%3Fitem%20%3Fisbn10%20where%20%7B%20%3Fitem%20wdt%3AP957%20%3Fisbn10%20%7D").json()["results"]["bindings"]
isbn10_to_wikidata = {x["isbn10"]["value"].replace("-", ""): x["item"]["value"].replace("http://www.wikidata.org/entity/", "") for x in seed}

with open("isbn_associator.csv") as f:
    spreadsheet = csv.reader(f)
    for row in spreadsheet:
        item = row[0].strip()
        isbn = row[1].strip()  # the item is published in the work carrying this ISBN

        # QuickStatements v1: published in (P1433) with a stated-in (S248) reference
        if len(isbn) == 13:
            if isbn in isbn13_to_wikidata:
                print(item + "\tP1433\t" + isbn13_to_wikidata[isbn] + "\tS248\tQ26822184")
        elif len(isbn) == 10:
            if isbn in isbn10_to_wikidata:
                print(item + "\tP1433\t" + isbn10_to_wikidata[isbn] + "\tS248\tQ26822184")
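
Note: the two lookup tables are disjoint, so a CSV row carrying an ISBN-10 never matches an item that only records the ISBN-13 (P212) form, and vice versa. A sketch of the standard ISBN-10 to ISBN-13 conversion that could bridge them (assumption: such cross-matching is desirable; this helper is not in the commit):

def isbn10_to_isbn13(isbn10):
    # Prefix 978, drop the old check digit, recompute the EAN-13 check digit.
    core = "978" + isbn10[:9]
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)

assert isbn10_to_isbn13("0306406152") == "9780306406157"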
