OD-1728: Perform steps O, P, Q, R automatically (#75)

otwcode · Jan 21, 2024 · 48d1828 · 48d1828
1 parent 518a32d
commit 48d1828
Show file tree

Hide file tree

Showing 3 changed files with 166 additions and 0 deletions.
diff --git a/08-Check-ODAP-Tables.py b/08-Check-ODAP-Tables.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+
+import sys
+from shared_python.Args import Args
+from shared_python.Sql import Sql
+
+if __name__ == "__main__":
+    # Audit the ODAP tables for data-quality problems before they are exported.
+    arg_parser = Args()
+    args = arg_parser.args_for_08()
+    log = arg_parser.logger_with_filename()
+
+    sql = Sql(args, log)
+    found_error = False
+
+    # Select the output database so the queries below can use bare table names.
+    db: str = args.output_database
+    log.info(f"Performing quality checks on {db}")
+    sql.execute(f"USE {db}")
+
+    ##
+    ## Check for too many tags
+    ##
+
+    # The Archive will error if a work has more than 75 total tags.
+
+    log.debug("Checking for too many tags.")
+
+    # One row per story; each *_num column is the comma count of that field.
+    tag_counts: list[dict[str, int]] = sql.execute_dict(
+        """SELECT
+        id,
+        length(categories) - length(replace(categories, ",", "")) as cat_num,
+        length(characters) - length(replace(characters, ",", "")) as chr_num,
+        length(fandoms) - length(replace(fandoms, ",", "")) as fnd_num,
+        length(tags) - length(replace(tags, ",", "")) as tag_num
+        FROM
+        stories""",
+    )
+
+    for story in tag_counts:
+        # We're counting commas, not items, so we need to add 4 to the total to
+        # account for the worst-case of each type having one entry.
+        # length(NULL) is NULL in SQL, which presumably reaches us as None; treat
+        # it as zero commas instead of failing on None + int.
+        total = 4 + sum(
+            story[column] or 0
+            for column in ("cat_num", "chr_num", "fnd_num", "tag_num")
+        )
+        if total > 75:
+            log.error(f"Found story {story['id']} with too many tags!")
+            found_error = True
+
+    if found_error:
+        log.error("Found at least one story with too many tags; ending audit here.")
+        sys.exit(1)
+
+    ##
+    ## Check for excessively long summaries
+    ##
+
+    # The Archive does not support summaries longer than 1250 characters, so a
+    # summary of exactly 1250 characters is still acceptable (strict ">").
+    log.debug("Checking for excessively long summaries.")
+    found_error = False
+
+    long_sums = sql.execute_dict(
+        "SELECT id, char_length(summary) as len FROM stories HAVING len > 1250"
+    )
+    if long_sums:
+        found_error = True
+        for story in long_sums:
+            log.error(f"Found story {story['id']} with too long summary!")
+
+    if found_error:
+        log.error("Found at least one story with too long summary; ending audit here.")
+        sys.exit(2)
+
+    ##
+    ## Check for excessively long notes
+    ##
+
+    # The Archive does not support notes longer than 5000 characters, so notes
+    # of exactly 5000 characters are still acceptable (strict ">").
+    log.debug("Checking for excessively long notes.")
+    found_error = False
+
+    long_notes = sql.execute_dict(
+        "SELECT id, char_length(notes) as len FROM stories HAVING len > 5000"
+    )
+    if long_notes:
+        found_error = True
+        for story in long_notes:
+            log.error(f"Found story {story['id']} with too long notes!")
+
+    if found_error:
+        log.error(
+            "Found at least one story with excessively long notes; ending audit here."
+        )
+        sys.exit(3)
+
+    ##
+    ## Audit: chapters that are too long
+    ##
+
+    # A chapter should be split once it exceeds 500,000 characters. The query
+    # measures bytes rather than characters (cheaper), so multi-byte text can
+    # be flagged even though its character count is within the limit.
+    log.debug("Checking for too long chapters.")
+
+    chapter_query = (
+        "SELECT id as chap, story_id as sid, length(text) as len FROM chapters HAVING len > 500000"
+    )
+    long_chap = sql.execute_dict(chapter_query)
+    found_error = bool(long_chap)
+    for chapter in long_chap or ():
+        log.error(
+            f"Found chapter {chapter['chap']} in story {chapter['sid']} that could be too long.  Check to make sure it's under 500k characters."
+        )
+
+    if found_error:
+        log.error("Found at least one too-long chapter; ending audit here.")
+        sys.exit(4)
+
+    ##
+    ## Check for stories with too many chapters
+    ##
+
+    # The Archive rejects stories with more than 200 chapters; exactly 200 is fine.
+    log.debug("Checking for stories with too many chapters")
+    found_error = False
+
+    many_chap = sql.execute_dict(
+        "SELECT story_id as sid, count(id) as len FROM chapters GROUP BY sid HAVING len > 200",
+    )
+    if many_chap:
+        found_error = True
+        for story in many_chap:
+            log.error(
+                f"Found story {story['sid']} that has too many chapters ({story['len']})!"
+            )
+
+    if found_error:
+        log.error("Found at least one story with too many chapters; ending audit here.")
+        sys.exit(5)
+
+    log.info("All checks completed successfully.")
diff --git a/README.md b/README.md
@@ -49,6 +49,7 @@ story link information into spreadsheets used for searching. (all)
 - 04 - Map the tags in the `tags` table to AO3 tags suggested by wranglers. (all)
 - 05 - Create the final tables that will be used for the temp site and copy all the authors, stories and story links. (all)
 - 06 - Copy the AO3 tags into the final story and story link rows. (all)
+- 08 - Audit the final tables to find common problems. (all)
 
 At this point, the final database is ready to be loaded into a [temporary website](https://github.com/otwcode/open-doors-temp-site) that will be used to feed the works into
 the Archive using its mass import API.
@@ -185,6 +186,12 @@ fields in the `stories` or `bookmarks` databases.
 - The output for this command  (eg "Getting all tags per story...429/429 stories") will report the number of stories in 
 the tag table, which may be more than the number of stories you have after removing DNI in the previous stage.
 
+### Step 08 - Audit final tables for common problems
+
+|   python 08-Check-ODAP-Tables.py -p <archive name>.yml
+
+This script checks for common reasons the Archive rejects an import, such as chapters that are too long or stories that have too many chapters. It makes no attempt to fix the problems it finds; you must do that manually. Note that the checks (aligned to the old JIRA tickets O, P, Q, R, and S) run in order and the script exits at the first check that fails, so later checks only run once all earlier ones pass. If you encounter errors, fix them and re-run the script until it exits cleanly.
+
 ### Common problems to look out for when processing chapters
 
 *Tip*: Some of these problems might be easier to fix by loading the chapters into MySQL and then exporting the `chapters`

diff --git a/shared_python/Args.py b/shared_python/Args.py
@@ -256,3 +256,15 @@ def args_for_07(self):
             )
         self._print_args(self.args)
         return self.args
+
+
+    def args_for_08(self):
+        """Return args for step 08, prompting for the output database if unset."""
+        if self.args.output_database is None:
+            self.args.output_database = input(
+                'Name of the database the final tables should be created in (default "od_sgf"):'
+            )
+            # An empty answer selects the default database name.
+            self.args.output_database = self.args.output_database or "od_sgf"
+        self._print_args(self.args)
+        return self.args