Skip to content

Commit

Permalink
OD-1728: Perform steps O, P, Q, R automatically (#75)
Browse files Browse the repository at this point in the history
  • Loading branch information
hlieberman authored Jan 21, 2024
1 parent 518a32d commit 48d1828
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 0 deletions.
147 changes: 147 additions & 0 deletions 08-Check-ODAP-Tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/usr/bin/env python3

import sys
from shared_python.Args import Args
from shared_python.Sql import Sql

# Limits enforced by the Archive. A value exactly equal to a limit is accepted;
# only values strictly above it are reported.
MAX_TAGS_PER_WORK = 75
MAX_SUMMARY_CHARS = 1250
MAX_NOTES_CHARS = 5000
# NB: chapters are measured in bytes, not characters, for performance reasons.
MAX_CHAPTER_BYTES = 500000
MAX_CHAPTERS_PER_STORY = 200

# Comma-count columns returned by the tag query, one per tag-bearing field.
TAG_COMMA_COLUMNS = ("cat_num", "chr_num", "fnd_num", "tag_num")


def count_story_tags(row) -> int:
    """Return a worst-case tag total for one row of the tag query.

    Each column in ``TAG_COMMA_COLUMNS`` holds the number of commas in a
    comma-separated field, so one is added per field to turn separators into
    items (the worst case where every field has at least one entry).  A NULL
    field comes back as ``None`` and is treated as having no commas instead of
    raising ``TypeError``.
    """
    return sum((row[column] or 0) + 1 for column in TAG_COMMA_COLUMNS)


def abort_on_findings(log, findings, summary, exit_code) -> None:
    """Log every finding, then the summary, and exit with ``exit_code``.

    Does nothing when ``findings`` is empty, so the audit continues with the
    next check.

    Raises:
        SystemExit: with ``exit_code`` when at least one finding exists.
    """
    if not findings:
        return
    for finding in findings:
        log.error(finding)
    log.error(summary)
    sys.exit(exit_code)


def main() -> None:
    """Perform various audits to ensure data quality on the ODAP tables prior
    to export.

    The checks run in a fixed order and the script stops at the first check
    that finds a problem, exiting with that check's code:
    1 = too many tags, 2 = summary too long, 3 = notes too long,
    4 = chapter too long, 5 = too many chapters.
    """
    args_obj = Args()
    args = args_obj.args_for_08()

    log = args_obj.logger_with_filename()
    sql = Sql(args, log)
    db: str = args.output_database

    log.info(f"Performing quality checks on {db}")
    sql.execute(f"USE {db}")

    ##
    ## Check for too many tags
    ##

    # The Archive will error if a work has more than 75 total tags.
    log.debug("Checking for too many tags.")

    tag_rows: list[dict] = sql.execute_dict(
        """SELECT
            id,
            length(categories) - length(replace(categories, ",", "")) as cat_num,
            length(characters) - length(replace(characters, ",", "")) as chr_num,
            length(fandoms) - length(replace(fandoms, ",", "")) as fnd_num,
            length(tags) - length(replace(tags, ",", "")) as tag_num
        FROM
            stories""",
    )
    abort_on_findings(
        log,
        [
            f"Found story {row['id']} with too many tags!"
            # `or []` keeps an empty result from crashing the loop.
            for row in tag_rows or []
            if count_story_tags(row) > MAX_TAGS_PER_WORK
        ],
        "Found at least one story with too many tags; ending audit here.",
        1,
    )

    ##
    ## Check for excessively long summaries
    ##

    # The Archive does not support summaries longer than 1250 characters, so a
    # summary of exactly 1250 characters is fine (strict comparison).
    log.debug("Checking for excessively long summaries.")

    long_sums = sql.execute_dict(
        "SELECT id, char_length(summary) as len FROM stories "
        f"HAVING len > {MAX_SUMMARY_CHARS}"
    )
    abort_on_findings(
        log,
        [
            f"Found story {row['id']} with too long summary!"
            for row in long_sums or []
        ],
        "Found at least one story with too long summary; ending audit here.",
        2,
    )

    ##
    ## Check for excessively long notes
    ##

    # The Archive does not support notes longer than 5000 characters.
    log.debug("Checking for excessively long notes.")

    long_notes = sql.execute_dict(
        "SELECT id, char_length(notes) as len FROM stories "
        f"HAVING len > {MAX_NOTES_CHARS}"
    )
    abort_on_findings(
        log,
        [
            f"Found story {row['id']} with too long notes!"
            for row in long_notes or []
        ],
        "Found at least one story with excessively long notes; ending audit here.",
        3,
    )

    ##
    ## Check for too-long chapters
    ##

    # Chapters longer than 500,000 characters should be split.
    # NB: This is counting bytes, not characters, for performance reasons, so
    # a flagged chapter still needs a manual look.
    log.debug("Checking for too long chapters.")

    long_chapters = sql.execute_dict(
        "SELECT id as chap, story_id as sid, length(text) as len FROM chapters "
        f"HAVING len > {MAX_CHAPTER_BYTES}",
    )
    abort_on_findings(
        log,
        [
            f"Found chapter {row['chap']} in story {row['sid']} that could be too long. Check to make sure it's under 500k characters."
            for row in long_chapters or []
        ],
        "Found at least one too-long chapter; ending audit here.",
        4,
    )

    ##
    ## Check for stories with too many chapters
    ##

    # The Archive does not allow importing a story with more than 200
    # chapters, so a story with exactly 200 is fine (strict comparison).
    log.debug("Checking for stories with too many chapters")

    many_chapters = sql.execute_dict(
        "SELECT story_id as sid, count(id) as len FROM chapters GROUP BY sid "
        f"HAVING len > {MAX_CHAPTERS_PER_STORY}",
    )
    abort_on_findings(
        log,
        [
            f"Found story {row['sid']} that has too many chapters ({row['len']})!"
            for row in many_chapters or []
        ],
        "Found at least one story with too many chapters; ending audit here.",
        5,
    )

    log.info("All checks completed successfully.")


if __name__ == "__main__":
    main()
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ story link information into spreadsheets used for searching. (all)
- 04 - Map the tags in the `tags` table to AO3 tags suggested by wranglers. (all)
- 05 - Create the final tables that will be used for the temp site and copy all the authors, stories and story links. (all)
- 06 - Copy the AO3 tags into the final story and story link rows. (all)
- 08 - Audit the final tables to find common problems. (all)

At this point, the final database is ready to be loaded into a [temporary website](https://github.com/otwcode/open-doors-temp-site) that will be used to feed the works into
the Archive using its mass import API.
Expand Down Expand Up @@ -185,6 +186,12 @@ fields in the `stories` or `bookmarks` databases.
- The output for this command (eg "Getting all tags per story...429/429 stories") will report the number of stories in
the tag table, which may be more than the number of stories you have after removing DNI in the previous stage.

### Step 08 - Audit final tables for common problems

| python 08-Check-ODAP-Tables.py -p <archive name>.yml

This script checks for common reasons the Archive rejects an import, including chapters that are too long, stories that have too many chapters, etc. It makes no attempt to fix the problems it finds; you must do that manually. Also note that the checks (aligned to the old JIRA tickets O, P, Q, R, and S) run in order and the script stops at the first check that finds a problem, so each check only runs if every earlier check passed. If you encounter errors in any check, you will need to fix them and re-run the script, repeating until it exits cleanly.

### Common problems to look out for when processing chapters

*Tip*: Some of these problems might be easier to fix by loading the chapters into MySQL and then exporting the `chapters`
Expand Down
12 changes: 12 additions & 0 deletions shared_python/Args.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,15 @@ def args_for_07(self):
)
self._print_args(self.args)
return self.args


def args_for_08(self):
    """Resolve the output database name for step 08 and return the arguments.

    When no output database was supplied, the user is prompted for one; an
    empty value (typed or supplied) falls back to "od_sgf".  The final
    arguments are passed to ``_print_args`` before being returned.
    """
    parsed = self.args
    if parsed.output_database is None:
        parsed.output_database = input(
            'Name of the database the final tables should be created in (default "od_sgf"):'
        )
    # An empty answer means "use the default".
    if parsed.output_database == "":
        parsed.output_database = "od_sgf"
    self._print_args(self.args)
    return self.args

0 comments on commit 48d1828

Please sign in to comment.