
Commit

Uncompress SARS2 feed file on the fly to save disk space (#623)
* Uncompress feed file on the fly to save disk space

* Fix typo in read mode
atc3 authored Jul 27, 2023
1 parent 4598e5a commit 1b52c3a
Showing 2 changed files with 5 additions and 6 deletions.
8 changes: 3 additions & 5 deletions workflow_sars2_gisaid_ingest/Snakefile
@@ -34,14 +34,12 @@ rule all:
 
 
 rule download:
-    """Download the data feed JSON object from the GISAID database, using our data feed credentials. The resulting file will need to be decompressed by `decompress_data_feed`
-    """
+    """Download the data feed JSON object from the GISAID database, using our data feed credentials."""
     output:
-        feed = temp(os.path.join(data_folder, "feed.json")),
+        feed = temp(os.path.join(data_folder, "feed.json.xz")),
        status = touch(rules.all.input.download_status)
-    threads: workflow.cores
    shell:
-        "scripts/download.sh | unxz --threads={threads} -c - > {output.feed}"
+        "scripts/download.sh > {output.feed}"
 
 
 rule process_feed:
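
For context, the net effect of this Snakefile change is that the feed now stays xz-compressed on disk and is only decompressed as it is read downstream. A minimal sketch of that streaming pattern (not part of the commit; the default path and the helper name iter_isolates are hypothetical):

import json
import lzma

def iter_isolates(feed_path="feed.json.xz"):
    # lzma.open in "rt" mode decompresses incrementally, so the
    # uncompressed JSON-lines feed never has to exist on disk.
    with lzma.open(feed_path, "rt") as fp:
        for line in fp:
            line = line.strip()
            if line:
                yield json.loads(line)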
3 changes: 2 additions & 1 deletion workflow_sars2_gisaid_ingest/scripts/process_feed.py
@@ -14,6 +14,7 @@
 import gzip
 import hashlib
 import json
+import lzma
 import multiprocessing as mp
 import os
 import pandas as pd
@@ -172,7 +173,7 @@ def error_callback(e):
 
     # Get fields for each isolate
     fields = []
-    with open(args.data_feed, "r") as fp_in:
+    with lzma.open(args.data_feed, "rt") as fp_in:
        isolate = json.loads(fp_in.readline().strip())
        for i, key in enumerate(isolate.keys()):
            # Skip the special sequence column
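
The "Fix typo in read mode" part of the commit message refers to this mode string: lzma.open follows the same mode conventions as the built-in open(), so "rt" reads text, while "xt" requests exclusive creation and raises FileExistsError when the file already exists. A small sketch (the file name is a throwaway example) showing the difference:

import lzma

with lzma.open("demo.json.xz", "wt") as fp:  # create a tiny test file
    fp.write('{"key": "value"}\n')

with lzma.open("demo.json.xz", "rt") as fp:  # correct read mode
    print(fp.readline().strip())

try:
    lzma.open("demo.json.xz", "xt")  # "x" = exclusive create, not read
except FileExistsError:
    print("'xt' fails on an existing file")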

