-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_genomes.py
executable file
·570 lines (514 loc) · 25.9 KB
/
fetch_genomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
#!/usr/bin/env python3
# Created by Michal Bukowski ([email protected]) under GPL-3.0 license
# USAGE, example (for more see ./fetch_genomes.py -h):
# Download genomes for taxid 1279 (Staphylococcus) and 1350 (Enterococcus) and
# all subtaxa to a default directory (genomes) in the current location:
# ./fetch_genomes.py -t 1279 1350
# Resume previous downloading based on saved filtered assembly summary:
# ./fetch_genomes.py -a assembly_summary_copy.tsv
# Retrieve filtered assembly summary only:
# ./fetch_genomes.py -t 1279 1350 -s
# You may find desirable taxids here: https://www.ncbi.nlm.nih.gov/taxonomy
import os, sys, argparse
import requests, urllib
import hashlib
from time import sleep
import pandas as pd
# URL of the TSV file on the NCBI server that describes genomic assemblies.
assembly_summary = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt'

# Names of the columns every assembly summary has to provide.
summary_cols = [
    'assembly_accession',
    'taxid',
    'assembly_level',
    'asm_name',
    'ftp_path',
]

# Default location of the filtered summary copy.
summary_copy = 'assembly_summary_copy.tsv'

# Kinds of data obtainable from NCBI GenBank for a genomic assembly, keyed by
# the short names accepted on the command line and mapped to file suffixes
# (see parse_args function for more information).
assembly_formats = dict(
    fna='genomic.fna.gz',
    gbff='genomic.gbff.gz',
    gff='genomic.gff.gz',
    rna='rna_from_genomic.fna.gz',
    cds='cds_from_genomic.fna.gz',
    prot='translated_cds.faa.gz',
)

# Selectable assembly levels, keyed by the short names accepted on the command
# line and mapped to the values used in the summary (see parse_args function).
assembly_levels = dict([
    ('chr', 'Chromosome'),
    ('scff', 'Scaffold'),
    ('cmpl', 'Complete Genome'),
    ('ctg', 'Contig'),
])

# Default directory for downloaded genomes.
gen_dir = 'genomes'

# Name of the MD5 checksum file kept in each remote assembly directory.
md5sums_fname = 'md5checksums.txt'

# Upper limit of records requested from NCBI Taxonomy in one call.
esearch_retmax = 100000

# Template of a request to the NCBI Taxonomy database.
esearch_path = (
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?'
    'db=taxonomy&term=txid{taxid}[orgn]&retmode=json&'
    'retmax={retmax}&retstart={retstart}'
)
def parse_args(argv=None):
    '''Parse command line arguments and return the resulting namespace.

    Recognised options:
    -a, --assembly-summary -- a path to a custom local file in TSV format that
        contains information on assemblies that are to be downloaded,
        default: assembly summary will be fetched from NCBI GenBank FTP site
    -c, --summary-copy -- a path to a TSV file where to save the filtered
        assembly summary for chosen taxids in TSV format,
        default: the value of summary_copy
    -t, --taxids -- space-separated IDs of taxa to retrieve genomic sequences
        for, default: all existing(!)
    -l, --assembly-levels -- space-separated assembly levels that will be taken
        into consideration: chromosome (chr), scaffold (scff), complete (cmpl),
        contig (ctg), default: all levels
    -o, --output-dir -- a path to the directory for downloaded genomes,
        default: the value of gen_dir
    -f, --formats -- formats of data to be downloaded: genomic sequences in
        nucleotide fasta format (fna), genomic sequences in GenBank format
        (gbff), annotation table (gff), RNA sequences in nucleotide fasta
        format (rna), coding sequences (CDS) in nucleotide fasta format (cds),
        translations of CDS in protein fasta format (prot), default: fna
    -n, --non-interactive -- do not ask questions and overwrite existing data
        (be absolutely sure what you do)
    -s, --summary-only -- for given taxids or all, only download assembly summary

    Arguments:
    argv -- an optional list of argument strings to parse; when None,
            argparse reads the arguments from sys.argv
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-a', '--assembly-summary', type=str, default=None, metavar='file_path',
        help='A path to a custom local file in TSV format that contains' +
             ' information on assemblies that are to be downloaded, default: ' +
             f'assembly summary will be fetched from NCBI, "{assembly_summary}"'
    )
    parser.add_argument(
        '-c', '--summary-copy', type=str, default=summary_copy, metavar='file_path',
        help='A path to a TSV file where to save the filtered assembly ' +
             f'summary for chosen taxids in TSV format, default: {summary_copy}'
    )
    parser.add_argument(
        '-t', '--taxids', type=int, nargs='*', default=None, metavar='taxid',
        help='Space-separated IDs of taxa to retrieve genomic sequences for, ' +
             'default: all existing(!)'
    )
    parser.add_argument(
        '-l', '--assembly-levels', type=str, nargs='+', default=None,
        choices=list(assembly_levels), metavar='level',
        # The trailing space after "into" was missing, which glued two words
        # together in the rendered help.
        help='Space-separated assembly levels that will be taken into ' +
             'consideration: chromosome (chr), scaffold (scff), ' +
             'complete (cmpl), contig (ctg), default: all levels'
    )
    parser.add_argument(
        '-o', '--output-dir', type=str, default=gen_dir, metavar='dir_path',
        # Report the actual default instead of a hard-coded copy of it.
        help=f'Path to the directory for downloaded genomes, default: "{gen_dir}"'
    )
    parser.add_argument(
        '-f', '--formats', type=str, nargs='+', default=['fna'],
        choices=list(assembly_formats), metavar='format',
        help='Formats of data to be downloaded: ' +
             'genomic sequences in nucleotide fasta format (fna), ' +
             'genomic sequences in GenBank format (gbff), ' +
             'annotation table (gff), ' +
             'RNA sequences in nucleotide fasta format (rna), ' +
             'coding sequences (CDS) in nucleotide fasta format (cds), ' +
             'translations of CDS in protein fasta format (prot), ' +
             'default: only fna'
    )
    parser.add_argument(
        '-n', '--non-interactive', action='store_true',
        help='Do not ask questions and overwrite existing data ' +
             '(be absolutely sure what you do)'
    )
    parser.add_argument(
        '-s', '--summary-only', action='store_true',
        help='For given taxids or all, only download assembly summary'
    )
    return parser.parse_args(argv)
def interrogate(msg):
    '''In interactive mode, shows a yes/no question (msg) and retrieves an answer,
    not used when non-interactive mode (-n, --non-interactive) is on.
    The question is repeated until the answer is "yes" or "no"; surrounding
    whitespace and letter case of the answer are ignored. Returns True for
    "yes" and False for "no". When the input stream ends before a valid answer
    is given, the question is treated as declined (False). Arguments:
    msg -- a message/question to be communicated
    '''
    while True:
        try:
            ans = input(msg + ' (yes/no)\n')
        except EOFError:
            # No more input can arrive (e.g. stdin closed or piped and
            # exhausted); refusing is the safe choice for a confirmation.
            return False
        ans = ans.strip().lower()
        if ans in ('yes', 'no'):
            return ans == 'yes'
def setup_env(args):
    '''Shows information on planned actions and asks for confirmation, unless
    non-interactive mode (-n, --non-interactive) is on. Validates the paths
    given on the command line and creates the output directory when genomes
    are to be downloaded. Returns True when the script may proceed and False
    when the user declined or a path is unusable (an [ERROR] message is
    printed in the latter case). Arguments:
    args -- the object returned by parse_args function containing values
            of command line arguments
    '''
    unattended = args.non_interactive

    # No taxid or pre-filtered assembly summary is provided, ask whether to
    # download all genomes from NCBI GenBank. "-t" given without any value
    # yields an empty list, which selects everything as well, so it is treated
    # the same way as a missing option.
    if args.assembly_summary is None and not args.taxids:
        if not unattended and not interrogate(
            'The script will run at default values for parameters ' +
            '--assembly-summary and --taxids, which means that ' +
            'all genomic sequences from GenBank will be downloaded. ' +
            'You may narrow down the number of genomes by providing ' +
            'either your own assembly summary file or specific taxids. ' +
            'Do you really want to download all genomic sequences?'
        ):
            return False
        print('[WARNING] proceeding to download ALL genomic sequences from GenBank!')

    # Check whether the indicated assembly summary path exists, if not or it is
    # not a file, show an error message and return False.
    if args.assembly_summary is not None:
        if not os.path.exists(args.assembly_summary):
            print(f'[ERROR] Assembly summary path "{args.assembly_summary}" ' +
                  'does not exist')
            return False
        if not os.path.isfile(args.assembly_summary):
            print(f'[ERROR] Assembly summary path "{args.assembly_summary}" ' +
                  'points to an existing directory')
            return False

    # Confirm whether to overwrite an existing assembly summary copy or show
    # an error message and return False if the path points to a directory.
    if os.path.exists(args.summary_copy):
        if not os.path.isfile(args.summary_copy):
            print(f'[ERROR] Assembly summary copy path "{args.summary_copy}" ' +
                  'points to an existing directory')
            return False
        if not unattended and not interrogate(
            f'Assembly summary copy "{args.summary_copy}" exists. ' +
            'Do you want to overwrite?'
        ):
            return False

    # Nothing more to prepare when only the summary is to be fetched.
    if args.summary_only:
        return True

    # Confirm whether to download genomes to an existing directory, show
    # an error message and return False if the path points to a file.
    if os.path.exists(args.output_dir):
        if not os.path.isdir(args.output_dir):
            print(f'[ERROR] Output directory path "{args.output_dir}" ' +
                  'points to an existing file')
            return False
        if not unattended and not interrogate(
            f'Output directory "{args.output_dir}" exists. ' +
            'Genomes will be saved alongside existing data. ' +
            'Do you want to continue?'
        ):
            return False

    # Create the output directory for downloaded genomes if it does not exist;
    # report a failure (e.g. missing permissions) instead of crashing.
    try:
        os.makedirs(args.output_dir, exist_ok=True)
    except OSError:
        print(f'[ERROR] Output directory "{args.output_dir}" cannot be created')
        return False
    return True
def fetch_taxids(taxids):
    '''Fetches IDs for all subtaxa for provided taxids from NCBI Taxonomy in chunks
    of at most esearch_retmax records by sending HTTPS requests with GET method
    and obtaining responses in JSON format, sleeps 0.5 sec after each request
    to avoid being blocked by NCBI server. Returns a pair (taxid list, message);
    the list is None when any request fails, returns an unexpected payload or
    yields no record at all for one of the taxids. Arguments:
    taxids -- a list of taxids of interest, may be None or empty if all
              taxids ought to be taken into account
    '''
    if not taxids:
        return [], '[INFO] No taxid provided, assembly summary will not be filtered'
    all_taxids = list(taxids)
    for taxid in taxids:
        subtaxa = []
        retstart = 0
        while True:
            url = esearch_path.format(
                taxid=taxid, retmax=esearch_retmax, retstart=retstart
            )
            # Connection problems and time-outs are reported in the same way
            # as a non-200 answer instead of ending with a traceback.
            try:
                res = requests.get(url, timeout=60)
            except requests.RequestException:
                return None, f'[ERROR] A problem occured while running NCBI esearch for taxid {taxid}'
            if res.status_code != 200:
                return None, f'[ERROR] A problem occured while running NCBI esearch for taxid {taxid}'
            # A body that is not JSON, lacks the expected keys or holds
            # non-numeric IDs is an unexpected format.
            try:
                id_list = res.json()['esearchresult']['idlist']
                chunk = [int(found) for found in id_list]
            except (ValueError, KeyError, TypeError):
                return None, f'[ERROR] Unexpected result format returned by NCBI esearch for taxid {taxid}'
            subtaxa.extend(chunk)
            # Advance by what was actually received, so that no record is
            # skipped should the server return fewer IDs than requested.
            retstart += len(chunk)
            sleep(0.5)
            # An empty chunk means there is nothing more to fetch.
            if not chunk:
                break
        if not subtaxa:
            return None, f'[ERROR] NCBI esearch returned no results for taxid {taxid}'
        all_taxids.extend(subtaxa)
    return all_taxids, f'[INFO] Fetched total number of {len(all_taxids)} taxids'
def fetch_summary(summary_path, required_cols=None):
    '''Fetches assembly summary from NCBI GenBank FTP server if a path to an
    existing one is not given, returns a pair (DataFrame, message); the
    DataFrame is None and the message describes the error when the summary
    cannot be obtained or lacks required columns. Arguments:
    summary_path  -- a path to a custom local file in TSV format that
                     contains information on assemblies that are to be
                     downloaded, if None, the summary will be fetched from
                     NCBI server
    required_cols -- column names that must be present in the summary,
                     if None, the module-level summary_cols list is used
    '''
    # Imported here so that the urllib.request submodule is guaranteed to be
    # loaded; a plain "import urllib" does not import it.
    import urllib.request

    if summary_path is None:
        try:
            # The context manager closes the connection once the table is read.
            with urllib.request.urlopen(assembly_summary, timeout=60) as res:
                summary_df = pd.read_csv(res, skiprows=1, index_col=None, sep='\t')
            # The header line starts with a comment mark, remove it from
            # the name of the first column.
            asm_col_name = summary_df.columns[0]
            summary_df.rename(columns={asm_col_name: asm_col_name.lstrip('# ')}, inplace=True)
        except Exception:
            return None, f'[ERROR] Assembly summary cannot be fetched from "{assembly_summary}"'
        action = 'Fetched'
    else:
        try:
            summary_df = pd.read_csv(summary_path, index_col=None, sep='\t')
        except Exception:
            return None, f'[ERROR] Assembly summary cannot be loaded from "{summary_path}"'
        action = 'Loaded'

    # Both a downloaded and a local summary must provide the columns that
    # later stages read.
    if required_cols is None:
        required_cols = summary_cols
    if not set(required_cols).issubset(summary_df.columns):
        return None, '[ERROR] Assembly summary does not contain required columns: ' + ', '.join(required_cols)
    return summary_df, '[INFO] {} assembly summary of {} rows and {} columns'.format(action, *summary_df.shape)
def filter_taxids(summary_df, taxids):
    '''Keeps only those rows of the assembly summary DataFrame whose taxid is
    on the complete list of requested taxids and their subtaxa; an empty list
    leaves the summary untouched. Returns a pair (DataFrame, message).
    Arguments:
    summary_df -- a Pandas DataFrame with the assembly summary
    taxids     -- a list of taxids of interest
    '''
    if len(taxids) > 0:
        selected = summary_df[summary_df['taxid'].isin(taxids)]
        n_selected = selected.shape[0]
        if n_selected > 0:
            return selected, f'[INFO] There is {n_selected} assemblies for the provided taxids'
        return selected, '[WARNING] No assemblies in the summary for the provided taxids'
    # Nothing to filter by, every assembly stays.
    n_all = summary_df.shape[0]
    return summary_df, f'[INFO] No taxid provided, all {n_all} assemblies will be processed'
def filter_levels(summary_df, levels):
    '''Keeps only those rows of the assembly summary DataFrame whose assembly
    level is among the indicated ones; when no levels are indicated (None),
    the summary is left untouched. Returns a pair (DataFrame, message).
    Arguments:
    summary_df -- a Pandas DataFrame with the assembly summary
    levels     -- a list of assembly levels of interest (keys of
                  assembly_levels) or None
    '''
    if levels is None:
        n_all = summary_df.shape[0]
        return summary_df, f'[INFO] No assembly levels provided, all {n_all} assemblies will be processed'
    # Translate command line short names to the values used in the summary.
    wanted = [assembly_levels[short_name] for short_name in levels]
    selected = summary_df[summary_df['assembly_level'].isin(wanted)]
    n_selected = selected.shape[0]
    if n_selected > 0:
        return selected, f'[INFO] There is {n_selected} assemblies for the provided assembly levels'
    return selected, '[WARNING] No assemblies in the summary for the provided assembly levels'
def save_summary(summary_df, fpath):
    '''Saves the filtered assembly summary as a TSV file without the index.
    Returns a pair (status, message), where status is True on success and
    None when the file cannot be written. Arguments:
    summary_df -- a Pandas DataFrame with the final assembly summary
    fpath      -- a path where the summary ought to be saved as a TSV file
    '''
    try:
        summary_df.to_csv(fpath, index=False, sep='\t')
    except Exception:
        # Any failure to write is reported to the caller; unlike a bare
        # "except", this lets KeyboardInterrupt and SystemExit pass through.
        return None, f'[ERROR] Cannot save a copy of filtered summary to "{fpath}"'
    return True, f'[INFO] Filtered summary successfully saved to "{fpath}"'
def _read_url(url, timeout=60):
    '''Returns the raw bytes served at url and closes the connection.'''
    # Imported here so that the urllib.request submodule is guaranteed to be
    # loaded; a plain "import urllib" does not import it.
    import urllib.request
    with urllib.request.urlopen(url, timeout=timeout) as res:
        return res.read()


def _parse_md5sums(text):
    '''Turns the text of a checksum file ("<md5> <name>" per line) into
    a dictionary mapping file names to MD5 digests.'''
    md5sums = {}
    for line in text.splitlines():
        fields = line.split()
        # Blank or malformed lines carry no usable checksum.
        if len(fields) < 2:
            continue
        digest, fname = fields[0], fields[1]
        # Drop only the "./" prefix; str.lstrip('./') would also eat
        # leading dots that belong to the file name itself.
        if fname.startswith('./'):
            fname = fname[2:]
        md5sums[fname] = digest
    return md5sums


def fetch_genomes(summary_df, formats, output_dir):
    '''Having filtered assembly summary, fetches finally selected genomes by sending
    FTP requests via urllib. This is a generator that yields progress, warning
    and error messages; a problem with one assembly or file never stops
    the processing of the remaining ones. Each file is verified against
    the MD5 checksum published in the remote assembly directory and written
    to a hidden temporary file that is renamed once complete. The summary
    DataFrame passed in is not modified. Arguments:
    summary_df -- a Pandas DataFrame with the final assembly summary
    formats    -- formats of data to be retrieved (keys of assembly_formats)
    output_dir -- a directory for the data to be saved to
    '''
    not_found = 0
    fetched = 0
    existing = 0
    # File name suffixes are the same for every assembly.
    suffixes = [assembly_formats[fmt] for fmt in formats]
    # Re-index a copy rather than the caller's object.
    summary_df = summary_df.reset_index(drop=True)
    n_assemblies = summary_df.shape[0]
    # Iterate over assembly accession numbers and corresponding FTP paths
    # in the assembly summary DataFrame, if an error appears, show a message
    # and proceed to next genome.
    pairs = zip(summary_df['assembly_accession'], summary_df['ftp_path'])
    for number, (asm_acc, ftp_path) in enumerate(pairs, start=1):
        # A missing value (e.g. NaN) cannot be turned into a location.
        if not isinstance(ftp_path, str):
            yield f'[ERROR] No valid FTP path for assembly {asm_acc}'
            yield f'[WARNING] Skipping assembly {asm_acc}...'
            continue
        if ftp_path.startswith('https://'):
            ftp_path = 'ftp://' + ftp_path[8:]
        asm_full_name = ftp_path[ftp_path.rfind('/') + 1:]
        # Check whether all requested files are already downloaded, if so,
        # yield a proper message and continue to next iteration/genome.
        if all(
            os.path.isfile(f'{output_dir}/{asm_acc}_{suffix}')
            for suffix in suffixes
        ):
            existing += len(formats)
            yield f'[INFO] All files requested for {asm_acc} exist and are files, considered done'
            yield f'[INFO] Skipping {asm_acc}, already fetched'
            continue
        yield f'\n[INFO] Fetching files for assembly {asm_acc} ' + \
              f'({number}/{n_assemblies})...'
        # Fetch file list from the genome directory, if unsuccessful, yield
        # a proper message and continue to next iteration/genome.
        # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still
        # stops the script here and in the handlers below.
        try:
            listing = _read_url(ftp_path).decode().rstrip().split('\n')
            # The file name is the last field of a listing line.
            flist = [line.split()[-1] for line in listing if line.strip()]
        except Exception:
            yield f'[ERROR] Cannot fetch file list from "{ftp_path}"'
            yield f'[WARNING] Skipping assembly {asm_acc}...'
            continue
        yield f'[INFO] There is {len(flist)} files at "{ftp_path}"'
        # Fetch the file with MD5 sums for genome files, if unsuccessful, yield
        # a proper message and continue to next iteration/genome.
        full_path = f'{ftp_path}/{md5sums_fname}'
        try:
            md5sums = _parse_md5sums(_read_url(full_path).decode())
        except Exception:
            yield f'[ERROR] Info on MD5 checksums cannot be fetched from "{full_path}"'
            yield f'[WARNING] Skipping assembly {asm_acc}...'
            continue
        yield f'[INFO] MD5 checksums for {asm_acc} successfully fetched'
        # Iterate over per-genome requested files (data formats) and
        # fetch those files.
        for suffix in suffixes:
            fnamein = f'{asm_full_name}_{suffix}'
            fpathin = f'{ftp_path}/{fnamein}'
            # Look up whether requested file exists on the server, if not,
            # yield a proper message and continue to next file.
            if fnamein not in flist:
                yield f'[ERROR] No such file for {asm_acc} assembly: "{fpathin}"'
                yield f'[WARNING] Skipping {asm_acc} assembly file: "{fpathin}"'
                not_found += 1
                continue
            # Continue to next file if requested file already exists
            # in the output directory, yield a proper message if the local
            # path points to something that is not a file.
            fpathout = f'{output_dir}/{asm_acc}_{suffix}'
            if os.path.exists(fpathout):
                if os.path.isfile(fpathout):
                    yield f'[INFO] The output path "{fpathout}" exists and is a file, considered done'
                    yield f'[INFO] Skipping {asm_acc} assembly file: "{fpathin}", already fetched'
                    existing += 1
                else:
                    yield f'[ERROR] The output path "{fpathout}" exists and is not a file'
                    yield f'[WARNING] Skipping {asm_acc} assembly file: "{fpathin}"'
                continue
            # Check if there is a MD5 sum for the file to be downloaded,
            # if not, yield an error message and continue to next file.
            if fnamein not in md5sums:
                yield f'[ERROR] Cannot find MD5 checksum for {asm_acc} assembly file: "{fpathin}"'
                yield f'[WARNING] Skipping {asm_acc} assembly file: "{fpathin}"'
                continue
            # Fetch the content of the file to be downloaded, yield a proper
            # message if it goes wrong and continue to next file.
            try:
                content = _read_url(fpathin)
            except Exception:
                yield f'[ERROR] {asm_acc} assembly file cannot be fetched from: "{fpathin}"'
                yield f'[WARNING] Skipping {asm_acc} assembly file: "{fpathin}"'
                continue
            yield f'[INFO] {asm_acc} assembly file "{fpathin}" successfully fetched'
            # Generate an MD5 sum for the downloaded content and compare to the
            # one retrieved from the server, if these do not agree, yield
            # a proper message and continue to next file.
            md5sum = hashlib.md5(content).hexdigest()
            if md5sum != md5sums[fnamein]:
                yield f'[ERROR] Incorrect MD5 checksum ({md5sum}) ' + \
                      f'for {asm_acc} assembly file ({md5sums[fnamein]}): "{fpathin}"'
                yield f'[WARNING] Skipping {asm_acc} assembly file: "{fpathin}"'
                continue
            yield f'[INFO] Correct MD5 checksum ({md5sum}) for {asm_acc} assembly file: "{fpathin}"'
            # Save content to a temporary file and rename it to the final
            # name, so that an interrupted write never leaves a file that
            # would be considered done on the next run.
            tmpfpathout = f'{output_dir}/.{asm_acc}_{suffix}'
            try:
                with open(tmpfpathout, 'wb') as f:
                    f.write(content)
                os.rename(tmpfpathout, fpathout)
            except OSError:
                saved = False
            else:
                saved = True
            if not saved:
                # Best-effort removal of a partial temporary file.
                try:
                    os.remove(tmpfpathout)
                except OSError:
                    pass
                yield f'[ERROR] Cannot save to "{fpathout}" the {asm_acc} assembly file: "{fpathin}"'
                yield f'[WARNING] Skipping {asm_acc} assembly file: "{fpathin}"'
                continue
            yield f'[INFO] {asm_acc} assembly file "{fpathin}" successfully saved to "{fpathout}"'
            fetched += 1
    # Show the summary, especially how many files already existed or were
    # successfully fetched as well as fetching of how many failed and require
    # a rerun.
    total = n_assemblies * len(formats)
    left = total - existing - not_found - fetched
    yield f'\n[INFO] Fetched {fetched} files out of {total} inferred ' + \
          f'(already existing: {existing}, not found on site: {not_found})'
    if left > 0:
        yield f'\n[WARNING] {left} files are still to be fetched'
    else:
        yield '[INFO] All files have been successfully fetched'
    yield '[INFO] Fetching genomes has been completed'
def main():
    '''The entry point function: runs all stages one after another, prints
    the messages handed back by the stage-executing functions and leaves
    the script as soon as a stage reports a critical error.
    '''
    def say(text):
        # Every message is flushed right away to keep the output in step
        # with the progress.
        print(text, flush=True)

    def unwrap(outcome):
        # A stage hands back (value, message); a None value means failure.
        value, text = outcome
        say(text)
        if value is None:
            sys.exit(1)
        return value

    # Command line arguments first.
    args = parse_args()

    # Environment check-up; setup_env talks to the user unless
    # non-interactive mode is on (-n, --non-interactive) and is the only
    # stage that prints its messages on its own.
    if not setup_env(args):
        sys.exit('[INFO] Exiting...')

    # Assembly summary: downloaded from NCBI GenBank FTP server unless a path
    # to an existing one has been provided.
    if args.assembly_summary is None:
        extra_msg = ' from NCBI (it may take a while...)'
    else:
        extra_msg = ''
    say(f'[INFO] Fetching assembly summary{extra_msg}')
    summary_df = unwrap(fetch_summary(args.assembly_summary))

    # Taxids of subtaxa of the provided taxids; fetch_taxids copes with
    # no taxids being given.
    say('[INFO] Fetching taxids...')
    taxids = unwrap(fetch_taxids(args.taxids))

    # Narrow the summary down to the fetched taxids and then to the chosen
    # assembly levels; both filters cope with nothing to filter by.
    summary_df = unwrap(filter_taxids(summary_df, taxids))
    summary_df = unwrap(filter_levels(summary_df, args.assembly_levels))

    # Keep a copy of the filtered summary, i.e. of the list of genomes that
    # are to be fetched, and stop here when nothing more has been requested.
    unwrap(save_summary(summary_df, args.summary_copy))
    if args.summary_only:
        say('[INFO] Only assembly summary requested, exiting...')
        sys.exit(0)

    # fetch_genomes is a generator that reports on the progress while
    # fetching the selected genomes.
    for progress in fetch_genomes(summary_df, args.formats, args.output_dir):
        say(progress)


# Entry point.
if __name__ == '__main__':
    main()