Repair script stripped
jonholdsworth committed Nov 1, 2024
1 parent f16ca20 commit 4951b52
Showing 1 changed file with 0 additions and 194 deletions.
194 changes: 0 additions & 194 deletions bin/repair-orphaned-videos.py
@@ -125,203 +125,9 @@ def aws_cli(args_list):
return output


# Fake key is a hack to handle FULL JOIN
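# A gloss with no video comes out of the FULL JOIN with an empty videofile,
# so it is given a unique placeholder key to keep it addressable in the
# dictionaries keyed on video_key; filter_fakekey() maps such placeholders
# back to an empty string.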
def maybe_fakekey(instring):
return instring if instring else FAKEKEY_PREFIX + str(uuid4())


def filter_fakekey(instring):
return "" if instring.startswith(FAKEKEY_PREFIX) else instring


# Get the video files info from NZSL Signbank
def get_nzsl_raw_keys_dict():
print(
f"Getting raw list of video file info from NZSL Signbank ...",
file=sys.stderr,
)
this_nzsl_raw_keys_dict = {}
# Column renaming is for readability
# Special delimiter because columns might contain commas
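# FULL JOIN so that glosses with no video row, and video rows with no gloss,
# both come back from the query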
result = pg_cli(
[
"COPY ("
"SELECT "
"dg.id AS gloss_id, "
"dg.idgloss AS gloss_idgloss, "
"dg.created_at AS gloss_created_at, "
"dg.published AS gloss_public, "
"vg.is_public AS video_public, "
"vg.id AS video_id, "
"vg.videofile AS video_key "
"FROM dictionary_gloss AS dg "
"FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id"
") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')",
]
)

# Separate the NZSL db columns
# Write them to a dictionary, so we can do fast operations
for rawl in result.stdout.split("\n"):
rawl = rawl.strip()
if not rawl:
continue
[
gloss_id,
gloss_idgloss,
gloss_created_at,
gloss_public,
video_public,
video_id,
video_key,
] = rawl.split("|")

# Hack to handle FULL JOIN
video_key = maybe_fakekey(video_key.strip())

# This sets the initial field ordering in the all_keys dictionary row
this_nzsl_raw_keys_dict[video_key] = [
gloss_idgloss.replace(CSV_DELIMITER, ""),
gloss_created_at,
gloss_id,
video_id,
gloss_public.lower() == "t",
video_public.lower() == "t",
]

print(
f"{len(this_nzsl_raw_keys_dict)} rows retrieved",
file=sys.stderr,
)

return this_nzsl_raw_keys_dict


# Get all keys from AWS S3
def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET):
print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr)
result = aws_cli(
[
"s3",
"ls",
f"s3://{s3_bucket}",
"--recursive",
],
)

# Separate out just the key from date, time, size, key
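# Illustrative `aws s3 ls --recursive` line (not real data):
#   2024-11-01 09:30:12    1234567 glossvideo/example.mp4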
this_s3_bucket_raw_keys_list = []
for line in result.stdout.split("\n"):
if line:
this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, maxsplit=3)[3])

print(
f"{len(this_s3_bucket_raw_keys_list)} rows retrieved",
file=sys.stderr,
)

return this_s3_bucket_raw_keys_list


# Get the keys present and absent across NZSL Signbank and S3, to dictionary
def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list):
print(
"Getting keys present and absent across NZSL Signbank and S3 ...",
file=sys.stderr,
)
this_all_keys_dict = {}

# Find S3 keys that are present in NZSL, or absent
for video_key in this_s3_bucket_raw_keys_list:
dict_row = this_nzsl_raw_keys_dict.get(video_key, None)
if dict_row:
# NZSL glossvideo record for this S3 key
this_all_keys_dict[video_key] = [
True, # NZSL PRESENT
True, # S3 PRESENT
] + dict_row
else:
# S3 key with no corresponding NZSL glossvideo record
this_all_keys_dict[video_key] = [
False, # NZSL Absent
True, # S3 PRESENT
] + [""] * 6

# Find NZSL keys that are absent from S3 (present in both handled above)
for video_key, dict_row in this_nzsl_raw_keys_dict.items():
if video_key not in this_s3_bucket_raw_keys_list:
# gloss/glossvideo record with no corresponding S3 key
# Either:
# video_key is real, but the S3 object is missing
# video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object
this_all_keys_dict[video_key] = [
True, # NZSL PRESENT
False, # S3 Absent
] + dict_row

return this_all_keys_dict


def find_orphans():
all_keys_dict = create_all_keys_dict(
get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
)

print("Gloss ID,Gloss,Suggested Video key")

# Traverse all the NZSL Signbank glosses that are missing S3 objects
for video_key, [
key_in_nzsl,
key_in_s3,
gloss_idgloss,
gloss_created_at,
gloss_id,
video_id,
gloss_public,
video_public,
] in all_keys_dict.items():

if not key_in_nzsl:
# This is an S3 object, not a Signbank record
continue

if key_in_s3:
# This Signbank record already has an S3 object, all is well
continue

# Business rule: only glosses with id >= 8000 are considered
if int(gloss_id) < 8000:
continue

# The gloss_id is the only reliable retrieval key at the Signbank end
gloss = Gloss.objects.get(id=gloss_id)
gloss_name = gloss.idgloss.split(":")[0].strip()
video_path = gloss.get_video_path()

# Skip any that already have a video path
# These should have an S3 object but don't: For some reason the video never made it to S3
# These will have to have their videos reinstated (separate operation)
if len(video_path) > 0:
continue

# We try to find the orphaned S3 object, if it exists
# TODO We could improve on brute-force by installing new libraries eg. rapidfuzz
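# A candidate key must contain both the gloss name and the gloss id;
# candidates already attached to an NZSL record, or themselves absent from S3,
# are reported as anomalies rather than suggested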
for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items():
if gloss_name in test_key:
if str(gloss_id) in test_key:
if key_nzsl_yes:
print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr)
continue
if not key_s3_yes:
print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr)
continue
print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key]))


print(f"Env: {args.env}", file=sys.stderr)
print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr)
print(f"AWSCLI: {AWSCLI}", file=sys.stderr)
print(f"PGCLI: {PGCLI}", file=sys.stderr)
print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr)

find_orphans()
