From 3016f2c812b21a907b54068fdad4f06c2466bcdf Mon Sep 17 00:00:00 2001
From: David Scotson
Date: Tue, 27 Jun 2023 22:26:48 +0100
Subject: [PATCH] Add ISBN 10 to 13 converter bot

---
 isbn10to13bot/README.md                       | 13 ++++
 .../find_editions_with_isbn_10_not_13.sh      | 17 +++++
 isbn10to13bot/isbn_10_to_13.py                | 75 +++++++++++++++++++
 isbn10to13bot/requirements.txt                |  2 +
 4 files changed, 107 insertions(+)
 create mode 100644 isbn10to13bot/README.md
 create mode 100755 isbn10to13bot/find_editions_with_isbn_10_not_13.sh
 create mode 100644 isbn10to13bot/isbn_10_to_13.py
 create mode 100644 isbn10to13bot/requirements.txt

diff --git a/isbn10to13bot/README.md b/isbn10to13bot/README.md
new file mode 100644
index 00000000..f57cda53
--- /dev/null
+++ b/isbn10to13bot/README.md
@@ -0,0 +1,13 @@
+A set of scripts to add isbn_13 values to editions with valid isbn_10.
+### How To Use
+```bash
+# Find Editions with ISBN 10, but no ISBN 13
+./find_editions_with_isbn_10_not_13.sh /path/to/ol_dump.txt.gz /path/to/filtered_dump.txt.gz
+# Add ISBN 13s converted from the ISBN 10
+python isbn_10_to_13.py --file=/path/to/filtered_dump.txt.gz --dry_run=<True|False> --limit=<number>
+```
+If `dry_run` is True, the script will run as normal, but no changes will be saved to OpenLibrary.
+This is for debugging purposes. By default, `dry_run` is `True`.
+`limit` is the maximum number of changes to OpenLibrary that will occur before the script quits.
+By default, `limit` is set to `1`. Setting `limit` to `0` allows unlimited edits.
+A log is automatically generated whenever `isbn_10_to_13.py` executes.
diff --git a/isbn10to13bot/find_editions_with_isbn_10_not_13.sh b/isbn10to13bot/find_editions_with_isbn_10_not_13.sh
new file mode 100755
index 00000000..4d1bacea
--- /dev/null
+++ b/isbn10to13bot/find_editions_with_isbn_10_not_13.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Usage: find_editions_with_isbn_10_not_13.sh /path/to/ol_dump.txt.gz /path/to/filtered_dump.txt.gz
+
+if [[ -z $1 ]]
+  then
+    echo "No dump file provided"
+    exit 1
+fi
+if [[ -z $2 ]]
+  then
+    echo "No output file provided"
+    exit 1
+fi
+
+OL_DUMP=$1
+OUTPUT=$2
+
+# Keep edition rows that have an isbn_10 key but no isbn_13 key.
+# Paths are quoted so that file names containing spaces work.
+zgrep '^/type/edition' "$OL_DUMP" | grep -E '"isbn_10":' | grep -v -E '"isbn_13":' | pv | gzip > "$OUTPUT"
diff --git a/isbn10to13bot/isbn_10_to_13.py b/isbn10to13bot/isbn_10_to_13.py
new file mode 100644
index 00000000..f0bd0079
--- /dev/null
+++ b/isbn10to13bot/isbn_10_to_13.py
@@ -0,0 +1,93 @@
+"""
+isbn 10 to isbn 13
+NOTE: This script ideally works on an Open Library Dump that only contains editions with an isbn_10 and no isbn_13
+"""
+import gzip
+import json
+
+import isbnlib
+import olclient
+
+
+class ConvertISBN10to13Job(olclient.AbstractBotJob):
+    """Bot job that adds isbn_13 values derived from existing isbn_10 values."""
+
+    def run(self) -> None:
+        """Looks for any ISBN 10s to convert to 13
+
+        Reads the gzipped, tab-separated dump at ``self.args.file`` and, for
+        every edition row with an ``isbn_10`` key and no ``isbn_13`` key, sets
+        ``isbn_13`` on the fetched edition and saves it via ``self.save``.
+        Editions for which no valid ISBN 13 can be derived are left untouched.
+        """
+        self.write_changes_declaration()
+        header = {"type": 0, "key": 1, "revision": 2, "last_modified": 3, "JSON": 4}
+        comment = "convert ISBN 10 to 13 using isbnlib"
+        with gzip.open(self.args.file, "rb") as fin:
+            for row in fin:
+                row = row.decode().split("\t")
+                _json = json.loads(row[header["JSON"]])
+                if _json["type"]["key"] != "/type/edition":
+                    continue
+
+                if "isbn_10" not in _json:
+                    # we only update editions with existing isbn 10s
+                    continue
+                if "isbn_13" in _json:
+                    # we only update editions with no existing isbn 13s (for now at least)
+                    continue
+                isbns_10 = _json["isbn_10"]
+
+                olid = _json["key"].split("/")[-1]
+                edition = self.ol.Edition.get(olid)
+                # re-check the type on the freshly fetched record
+                if edition.type["key"] != "/type/edition":
+                    continue
+
+                isbns_13 = []
+                for isbn in isbns_10:
+                    canonical = isbnlib.canonical(isbn)
+                    # reset for every value: otherwise an invalid ISBN would reuse
+                    # the previous result (or raise NameError on the first value)
+                    isbn_13 = None
+                    if isbnlib.is_isbn10(canonical):
+                        isbn_13 = isbnlib.to_isbn13(canonical)
+                    elif isbnlib.is_isbn13(canonical):
+                        # the isbn_10 field holds an ISBN 13: keep it as it is
+                        isbn_13 = canonical
+                    if isbn_13:
+                        isbns_13.append(isbn_13)
+
+                if not isbns_13:
+                    # nothing valid to add: don't save an empty isbn_13 list
+                    continue
+                if len(isbns_13) > 1:
+                    isbns_13 = dedupe(isbns_13)  # remove duplicates, shouldn't normally be necessary
+
+                setattr(edition, 'isbn_13', isbns_13)
+                self.logger.info(
+                    "\t".join([olid, str(isbns_10), str(isbns_13)])
+                )
+                self.save(lambda: edition.save(comment=comment))
+
+
+def dedupe(input_list: list) -> list:
+    """Remove duplicate elements in a list and return the new list
+
+    The first occurrence of each element is kept, so order is preserved.
+    """
+    output = []
+    for i in input_list:
+        if i not in output:
+            output.append(i)
+    return output
+
+
+if __name__ == "__main__":
+    job = ConvertISBN10to13Job()
+
+    try:
+        job.run()
+    except Exception as e:
+        job.logger.exception(e)
+        raise
diff --git a/isbn10to13bot/requirements.txt b/isbn10to13bot/requirements.txt
new file mode 100644
index 00000000..c9202c4b
--- /dev/null
+++ b/isbn10to13bot/requirements.txt
@@ -0,0 +1,2 @@
+openlibrary-client==0.0.30
+isbnlib==3.10.14