From 749bb20d3a902a5c865c537875ca55f132ab7045 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:44:34 +1100 Subject: [PATCH] Script renamings --- ...-orphans.py => find-fixable-s3-orphans.py} | 35 +++++++++++++------ ...rphans.py => repair-fixable-s3-orphans.py} | 12 +++++++ 2 files changed, 37 insertions(+), 10 deletions(-) rename bin/{find-fixable-orphans.py => find-fixable-s3-orphans.py} (88%) rename bin/{repair-fixable-orphans.py => repair-fixable-s3-orphans.py} (95%) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-s3-orphans.py similarity index 88% rename from bin/find-fixable-orphans.py rename to bin/find-fixable-s3-orphans.py index 90555cfc..0b886714 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -23,8 +23,6 @@ # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application @@ -72,11 +70,14 @@ # Other globals CSV_DELIMITER = "," -FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +# Hack to handle FULL JOIN +# See get_nzsl_raw_keys_dict() +FAKEKEY_PREFIX = "this_is_not_a_key_" + def pg_cli(args_list): try: @@ -146,7 +147,23 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # Hack to handle FULL JOIN + """ + Hack to handle FULL JOIN. + We are storing data rows in a dictionary, indexed by video_key. 
+ Because we are doing a FULL JOIN on the NZSL Signbank database, + we also get rows where there are gloss entries that do not have + a corresponding video_glossvideo. + (These are erroneous and one of the reasons this script exists, + to find them.) + Consequently there is no video_key, and we cannot use it to index + the data row. + Instead, we create a fake video_key that is unique and, theoretically, + impossible for anything else to try to use. It also has a 'safe', + easily filtered prefix, which means later code can easily tell + a fake key from a real key. + Always having a key, in this way, means that code, e.g. loops, + that depends on there being a dictionary key axis will not break. + """ video_key = maybe_fakekey(video_key.strip()) # This sets the initial field ordering in the all_keys dictionary row @@ -194,6 +211,9 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): this_all_keys_dict = {} # Find S3 keys that are present in NZSL, or absent + # TODO This could be changed to use pop(), so that on each pass we are left + # with a smaller subset of the rows, which we can search faster. If the + # database becomes very large in the future this could save a lot of processing. 
for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: @@ -252,19 +272,14 @@ def find_orphans(): # This Signbank record already has an S3 object, all is well continue - # Business rule - if int(gloss_id) < 8000: - continue - # The gloss_id is the only reliable retrieval key at the Signbank end gloss = Gloss.objects.get(id=gloss_id) gloss_name = gloss.idgloss.split(":")[0].strip() - video_path = gloss.get_video_path() # Skip any that already have a video path # These should have an S3 object but don't: For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: + if gloss.glossvideo_set.exists(): continue # We try to find the orphaned S3 object, if it exists diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-s3-orphans.py similarity index 95% rename from bin/repair-fixable-orphans.py rename to bin/repair-fixable-s3-orphans.py index ce948be2..84648b49 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -66,6 +66,13 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) +parser.add_argument( + "--dryrun", + default=False, + required=False, + action="store_true", + help=f"Don't actually make any changes, just output what would happen", +) args = parser.parse_args() # Keep synced with other scripts @@ -151,6 +158,11 @@ def process_csv(): ) print(gloss) print(gloss_video) + + if args.dryrun: + print("Dry run, no changes") + continue + # At this point we complete the repair # We use bulk_create() because we cannot allow save() to run if len(GlossVideo.objects.bulk_create([gloss_video])) < 1: