From a33265fedeba8ed6a9b9fb99e8e78ce63701cef2 Mon Sep 17 00:00:00 2001
From: Albert Tian Chen
Date: Thu, 23 May 2024 13:30:00 -0400
Subject: [PATCH] Adjust GenBank download chunks to not surpass today's date

---
 workflow_flu_genbank_ingest/Snakefile   | 7 +++++--
 workflow_rsv_genbank_ingest/Snakefile   | 7 +++++--
 workflow_sars2_genbank_ingest/Snakefile | 8 ++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/workflow_flu_genbank_ingest/Snakefile b/workflow_flu_genbank_ingest/Snakefile
index 744fb5cf..7c6d461d 100644
--- a/workflow_flu_genbank_ingest/Snakefile
+++ b/workflow_flu_genbank_ingest/Snakefile
@@ -19,9 +19,12 @@ static_data_folder = os.path.join("..", config["static_data_folder"])
 min_date = pd.to_datetime(config.get('min_date', '2019-12-01'))
 if min_date is None:
     min_date = '2019-12-01'
-max_date = pd.to_datetime(datetime.date.today().isoformat())
+max_date = pd.to_datetime(
+    config.get('end_date_cutoff',
+    (datetime.date.today() - datetime.timedelta(days=14)).isoformat()
+))
 if max_date is None:
-    max_date = datetime.date.today().isoformat()
+    max_date = (datetime.date.today() - datetime.timedelta(days=14)).isoformat()
 
 chunks = [d for d in pd.period_range(start=min_date, end=max_date, freq=config.get('dl_chunk_period', 'W'))]
 DL_CHUNKS = [i for i in range(len(chunks))]
diff --git a/workflow_rsv_genbank_ingest/Snakefile b/workflow_rsv_genbank_ingest/Snakefile
index a6adc5e2..6077734e 100644
--- a/workflow_rsv_genbank_ingest/Snakefile
+++ b/workflow_rsv_genbank_ingest/Snakefile
@@ -19,9 +19,12 @@ static_data_folder = os.path.join("..", config["static_data_folder"])
 min_date = pd.to_datetime(config.get('min_date', '2019-12-01'))
 if min_date is None:
     min_date = '2019-12-01'
-max_date = pd.to_datetime(datetime.date.today().isoformat())
+max_date = pd.to_datetime(
+    config.get('end_date_cutoff',
+    (datetime.date.today() - datetime.timedelta(days=14)).isoformat()
+))
 if max_date is None:
-    max_date = datetime.date.today().isoformat()
+    max_date = (datetime.date.today() - datetime.timedelta(days=14)).isoformat()
 
 chunks = [d for d in pd.period_range(start=min_date, end=max_date, freq=config.get('dl_chunk_period', 'W'))]
 DL_CHUNKS = [i for i in range(len(chunks))]
diff --git a/workflow_sars2_genbank_ingest/Snakefile b/workflow_sars2_genbank_ingest/Snakefile
index d0cee6cc..0d291ec4 100644
--- a/workflow_sars2_genbank_ingest/Snakefile
+++ b/workflow_sars2_genbank_ingest/Snakefile
@@ -34,13 +34,17 @@ rule all:
 min_date = pd.to_datetime(config.get('min_date', '2019-12-01'))
 if min_date is None:
     min_date = '2019-12-01'
-max_date = pd.to_datetime(config.get('end_date_cutoff', datetime.date.today().isoformat()))
+max_date = pd.to_datetime(
+    config.get('end_date_cutoff',
+    (datetime.date.today() - datetime.timedelta(days=14)).isoformat()
+))
 if max_date is None:
-    max_date = datetime.date.today().isoformat()
+    max_date = (datetime.date.today() - datetime.timedelta(days=14)).isoformat()
 
 chunks = [d for d in pd.period_range(start=min_date, end=max_date, freq=config.get('dl_chunk_period', 'W'))]
 DL_CHUNKS = [i for i in range(len(chunks))]
 
+
 rule download_metadata_chunk:
     """Download the data feed in chunks, to avoid timeouts.
     """
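
Note: the three Snakefiles above share the same chunking logic. The following is a minimal standalone sketch of that logic, not part of the patch, assuming the pandas/datetime imports already used in the Snakefiles and a plain dict standing in for Snakemake's config object; the keys end_date_cutoff, min_date, and dl_chunk_period come from the diff itself. It illustrates how the 14-day default cutoff keeps the last weekly chunk from reaching today's date.

import datetime
import pandas as pd

# Hypothetical stand-in for the Snakemake `config` mapping used in the Snakefiles.
config = {"min_date": "2019-12-01", "dl_chunk_period": "W"}

min_date = pd.to_datetime(config.get("min_date", "2019-12-01"))
if min_date is None:
    min_date = "2019-12-01"

# Default cutoff: 14 days before today, so chunks never surpass today's date.
max_date = pd.to_datetime(
    config.get(
        "end_date_cutoff",
        (datetime.date.today() - datetime.timedelta(days=14)).isoformat(),
    )
)
if max_date is None:
    max_date = (datetime.date.today() - datetime.timedelta(days=14)).isoformat()

# One pandas Period per download chunk, weekly by default.
chunks = list(pd.period_range(start=min_date, end=max_date,
                              freq=config.get("dl_chunk_period", "W")))
DL_CHUNKS = list(range(len(chunks)))

# Each chunk's start/end dates would bound one metadata download request.
for i in DL_CHUNKS[:3]:
    print(i, chunks[i].start_time.date(), chunks[i].end_time.date())
print("last chunk ends:", chunks[-1].end_time.date())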