|
| 1 | +import requests |
| 2 | + |
def _group_dois_by_item(bindings):
    """Group DOI values by Wikidata item ID.

    ``bindings`` is the ``results.bindings`` list of a SPARQL JSON response
    whose rows carry an ``i`` (item URI) and a ``doi`` value.  Returns a dict
    mapping the bare item ID (entity URI prefix stripped) to the list of its
    DOI strings, in the order the rows were received.
    """
    manifest = {}
    for result in bindings:
        wikidata_item = result["i"]["value"].replace("http://www.wikidata.org/entity/", "")
        manifest.setdefault(wikidata_item, []).append(result["doi"]["value"])
    return manifest


def _commands_for_item(wikidata_item, doi_list):
    """Return the P356 edit commands needed to normalize one item's DOIs.

    A command prefixed with ``-`` removes a value; an unprefixed command adds
    one.  An empty list means the item needs no change or must be skipped.
    """
    canonical = doi_list[0].upper()

    # Case 4: the DOIs differ even after uppercasing.  This needs manual
    # intervention, so nothing is emitted for the item.
    if any(doi.upper() != canonical for doi in doi_list[1:]):
        return []

    # Cases 1-3: drop every spelling that is not already canonical...
    commands = [
        '-{}|P356|"{}"||'.format(wikidata_item, doi)
        for doi in doi_list
        if doi != canonical
    ]
    # ...and add the canonical spelling only when the item does not have it yet.
    if canonical not in doi_list:
        commands.append('{}|P356|"{}"||'.format(wikidata_item, canonical))
    return commands


def _write_packages(lines, package_size=20000):
    """Write ``lines`` to ``normalized-NN.txt`` files, ``package_size`` per file.

    Commands are concatenated without newlines because each one already ends
    in ``||``, which appears to act as the command separator (QuickStatements
    URL-style syntax -- TODO confirm).  No file is created when ``lines`` is
    empty.
    """
    for counter, start in enumerate(range(0, len(lines), package_size)):
        filename = "normalized-" + str(counter).zfill(2) + ".txt"
        # Explicit UTF-8 so a non-ASCII DOI value cannot fail to encode under
        # a platform-specific default encoding.
        with open(filename, "w", encoding="utf-8") as f:
            f.write("".join(lines[start:start + package_size]))


def main(bindings=None):
    """Generate commands that normalize Wikidata DOI (P356) values to uppercase.

    Canonical DOI format: all uppercase letters.
    Four scenarios:

    1. One DOI on an item that is identical with the canonical format:
       no action needed.

    2. One DOI on an item that is not identical with the canonical format:
       convert to canonical format.

    3. Two or more DOIs on an item that match the canonical format when
       converted to uppercase: keep the canonically formatted DOI if it is
       already present and remove the non-matching ones.  If none is in the
       canonical format, add an entry in that format and remove the others.

    4. Two or more DOIs that are different even when normalized to uppercase:
       do nothing.  This is a special case and requires manual intervention.

    Args:
        bindings: Optional pre-fetched ``results.bindings`` list from the
            SPARQL JSON response.  When ``None`` (the default) the data is
            fetched live from the Wikidata query service.

    Raises:
        requests.HTTPError: If the live query returns an HTTP error status.

    The commands are written to ``normalized-NN.txt`` files in the current
    working directory, 20000 commands per file.
    """
    if bindings is None:
        url = "https://query.wikidata.org/sparql?format=json&query=select%20%3Fi%20%3Fdoi%20where%20%7B%20%3Fi%20wdt%3AP356%20%3Fdoi%20%7D%20order%20by%20%3Fi"
        # (connect, read) timeouts: without them a stalled connection would
        # hang the script forever.  The read timeout bounds the gap between
        # received bytes, not the total download time.
        response = requests.get(url, timeout=(30, 300))
        # Fail with the real HTTP error instead of a confusing JSON decode
        # error when the service answers with an error page.
        response.raise_for_status()
        bindings = response.json()["results"]["bindings"]

    lines_to_print = []
    for wikidata_item, doi_list in _group_dois_by_item(bindings).items():
        lines_to_print.extend(_commands_for_item(wikidata_item, doi_list))

    _write_packages(lines_to_print)
| 79 | + |
| 80 | + |
| 81 | + |
# Run the normalization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments