
Commit

Uncompress SARS2 feed file on the fly to save disk space (#623)
* Uncompress feed file on the fly to save disk space

* Fix typo in read mode
atc3 authored Jul 27, 2023
1 parent 4598e5a commit 1b52c3a
Showing 2 changed files with 5 additions and 6 deletions.
8 changes: 3 additions & 5 deletions workflow_sars2_gisaid_ingest/Snakefile
@@ -34,14 +34,12 @@ rule all:
 
 
 rule download:
-    """Download the data feed JSON object from the GISAID database, using our data feed credentials. The resulting file will need to be decompressed by `decompress_data_feed`
-    """
+    """Download the data feed JSON object from the GISAID database, using our data feed credentials."""
     output:
-        feed = temp(os.path.join(data_folder, "feed.json")),
+        feed = temp(os.path.join(data_folder, "feed.json.xz")),
        status = touch(rules.all.input.download_status)
-    threads: workflow.cores
    shell:
-        "scripts/download.sh | unxz --threads={threads} -c - > {output.feed}"
+        "scripts/download.sh > {output.feed}"
 
 
 rule process_feed:
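
For context, the net effect of this Snakefile change is that the feed now stays xz-compressed on disk and is only decompressed as it is read downstream. A minimal sketch of that streaming pattern (not part of the commit; the default path and the helper name iter_isolates are hypothetical):

import json
import lzma

def iter_isolates(feed_path="feed.json.xz"):
    # lzma.open in "rt" mode decompresses incrementally, so the
    # uncompressed JSON-lines feed never has to exist on disk.
    with lzma.open(feed_path, "rt") as fp:
        for line in fp:
            line = line.strip()
            if line:
                yield json.loads(line)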
3 changes: 2 additions & 1 deletion workflow_sars2_gisaid_ingest/scripts/process_feed.py
@@ -14,6 +14,7 @@
 import gzip
 import hashlib
 import json
+import lzma
 import multiprocessing as mp
 import os
 import pandas as pd
@@ -172,7 +173,7 @@ def error_callback(e):
 
     # Get fields for each isolate
     fields = []
-    with open(args.data_feed, "r") as fp_in:
+    with lzma.open(args.data_feed, "rt") as fp_in:
        isolate = json.loads(fp_in.readline().strip())
        for i, key in enumerate(isolate.keys()):
            # Skip the special sequence column
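
The "Fix typo in read mode" part of the commit message refers to this mode string: lzma.open follows the same mode conventions as the built-in open(), so "rt" reads text, while "xt" requests exclusive creation and raises FileExistsError when the file already exists. A small sketch (the file name is a throwaway example) showing the difference:

import lzma

with lzma.open("demo.json.xz", "wt") as fp:  # create a tiny test file
    fp.write('{"key": "value"}\n')

with lzma.open("demo.json.xz", "rt") as fp:  # correct read mode
    print(fp.readline().strip())

try:
    lzma.open("demo.json.xz", "xt")  # "x" = exclusive create, not read
except FileExistsError:
    print("'xt' fails on an existing file")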

