[MRG] Switch to using picklists for storing matches, as opposed to saving a copy of the signatures. (#202)

* update sourmash version

* use picklists + prefetch matches csv instead of saving redundant matches

* fix test_compare_taxonomy

* add matches_csv, oops

* fixed test_contigs_list_contaminants.py

* fixed test_contigs_search.py

* fix snakemake tests.

* update {filename} to just use {g} throughout

* re-add abund test, but now with actual comments/appropriate test name :)
ctb authored Dec 30, 2021
1 parent a2b428d commit b9c2b8d
Showing 14 changed files with 172 additions and 67 deletions.
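
The gist of the change: rather than having `sourmash prefetch` save a redundant .matches.zip copy of every matched signature, each downstream step now re-loads the matches from the original databases, using the prefetch CSV as a picklist. A minimal standalone sketch of the pattern adopted throughout the diffs below — the CSV and database paths here are placeholders, not files from this repository:

import sourmash
from sourmash.picklist import SignaturePicklist

matches_csv = "stage1/example.fa.gz.matches.csv"   # written by `sourmash prefetch -o`
databases = ["example-database.zip"]               # placeholder database path

# key a picklist on the match columns of the prefetch CSV
picklist = SignaturePicklist('prefetch')
picklist.load(matches_csv, picklist.column_name)

# select only the picked signatures from each database, then materialize them
siglist = []
for filename in databases:
    db = sourmash.load_file_as_index(filename)
    db = db.select(picklist=picklist)
    siglist += list(db.signatures())

print(f"loaded {len(siglist)} matches from '{matches_csv}'")

This trades disk space for load time: matches are no longer duplicated on disk, at the cost of reopening the databases at each step (and, as the in-diff comments note, loading all selected signatures into memory).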
90 changes: 55 additions & 35 deletions charcoal/Snakefile
@@ -204,7 +204,7 @@ class Checkpoint_HitListPairs:

         # wait for the results of 'hitlist_make_contigs_matches';
         # this will trigger exception until that rule has been run.
-        checkpoints.hitlist_make_contigs_matches.get(g=g)
+        checkpoints.hitlist_make_contigs_matches_all.get(g=g)

         match_accs = get_hitlist_match_accs(g)
         p = expand(self.pattern, g=g, acc=match_accs)
@@ -230,12 +230,12 @@ rule all:
 @toplevel
 rule clean:
     input:
-        expand(output_dir + '/{f}.clean.fa.gz', f=genome_list)
+        expand(output_dir + '/{g}.clean.fa.gz', g=genome_list)

 # CTB: alias for 'clean'; remove later after making sure it's not in docs :)
 rule all_clean_contigs:
     input:
-        expand(output_dir + '/{f}.clean.fa.gz', f=genome_list)
+        expand(output_dir + '/{g}.clean.fa.gz', g=genome_list)

 # check config files only
 @toplevel
@@ -294,11 +294,11 @@ def make_moltype_compute_args(moltype):

 # generate a signature for a query genome
 # CTB TODO: update for sourmash sketch
-rule contigs_sig:
+rule contigs_sig_wc:
     input:
-        genome_dir + '/{filename}'
+        genome_dir + '/{g}'
     output:
-        stage1_dir + '/{filename}.sig'
+        stage1_dir + '/{g}.sig'
     conda: 'conf/env-sourmash.yml'
     params:
         scaled = config['scaled'],
@@ -309,15 +309,16 @@ rule contigs_sig:
             {params.moltype} {input} -o {output}
     """

+
+
 # run a search, query.x.database.
-rule prefetch_all:
+rule prefetch_all_matches_wc:
     input:
-        query = stage1_dir + '/{filename}.sig',
+        query = stage1_dir + '/{g}.sig',
         databases = config['gather_db']
     output:
-        csv = stage1_dir + '/{filename}.matches.csv',
-        matches = stage1_dir + '/{filename}.matches.zip',
-        txt = stage1_dir + '/{filename}.matches.txt'
+        csv = stage1_dir + '/{g}.matches.csv',
+        txt = stage1_dir + '/{g}.matches.txt'
     params:
         moltype = "--{}".format(config['moltype'].lower()),
         gather_scaled = config['gather_scaled'],
@@ -326,43 +327,54 @@ rule prefetch_all:
     shell: """
        sourmash prefetch {input.query} {input.databases} -o {output.csv} \
            {params.moltype} --scaled {params.gather_scaled} \
-            --save-matches {output.matches} \
            --threshold-bp {params.threshold_bp} >& {output.txt}
        cat {output.txt}
-        touch {output.csv} {output.matches}
+        touch {output.csv}
     """

-# generate contigs taxonomy
-rule make_contigs_taxonomy_json:
+@toplevel
+rule prefetch_all_matches:
     input:
-        genome = genome_dir + '/{f}',
-        genome_sig = stage1_dir + '/{f}.sig',
-        matches = stage1_dir + '/{f}.matches.zip',
+        expand(stage1_dir + '/{g}.matches.csv', g=genome_list)
+
-        lineages = config['lineages_csv']
+# generate contigs taxonomy
+rule make_contigs_search_taxonomy_wc:
+    input:
+        genome = genome_dir + '/{g}',
+        genome_sig = stage1_dir + '/{g}.sig',
+        matches_csv = stage1_dir + '/{g}.matches.csv',
+        lineages = config['lineages_csv'],
+        databases = config['gather_db']
     output:
-        json = stage1_dir + '/{f}.contigs-tax.json',
+        json = stage1_dir + '/{g}.contigs-tax.json',
     conda: 'conf/env-sourmash.yml'
     params:
         match_rank = default_match_rank,
     shell: """
        python -m charcoal.contigs_search_taxonomy \
            --genome {input.genome} --lineages-csv {input.lineages} \
            --genome-sig {input.genome_sig} \
-            --matches-sig {input.matches} \
+            --matches-csv {input.matches_csv} \
+            --databases {input.databases} \
            --json-out {output.json} \
            --match-rank {params.match_rank}
     """

+@toplevel
+rule make_contigs_search_taxonomy:
+    input:
+        expand(stage1_dir + '/{g}.contigs-tax.json', g=genome_list)
+
 # compare taxonomy for contigs in a genome; this generates hit list and
 # genome summary outputs.
-rule compare_taxonomy_single:
+rule compare_taxonomy_single_wc:
     input:
         json = stage1_dir + '/{g}.contigs-tax.json',
-        sig = stage1_dir + '/{g}.matches.zip',
+        matches_csv = stage1_dir + '/{g}.matches.csv',
         lineages = config['lineages_csv'],
         provided_lineages = provided_lineages_file,
-        genome_list = genome_list_file
+        databases = config['gather_db'],
+        genome_list = genome_list_file,
     output:
         hit_list_csv = stage1_dir + '/{g}.hitlist_for_filtering.csv',
         summary_csv = stage1_dir + '/{g}.genome_summary.csv',
@@ -384,7 +396,8 @@ rule compare_taxonomy_single:
            --min_f_ident={params.min_f_ident} \
            --min_f_major={params.min_f_major} \
            --match-rank={params.match_rank} \
-            {wildcards.g}
+            {wildcards.g} \
+            --databases {input.databases}
     """

 # combine all of the individual hit lists into a single hitlist summary file.
@@ -466,11 +479,12 @@ rule make_hitlist_matches_info_csv:

 # generates list of contaminant & non-contaminant accessions for genomes
 # on the hitlist.
-checkpoint hitlist_make_contigs_matches:
+checkpoint hitlist_make_contigs_matches_all:
     input:
         genome = genome_dir + '/{g}',
         genome_sig = stage1_dir + '/{g}.sig',
-        matches = stage1_dir + '/{g}.matches.zip',
+        matches_csv = stage1_dir + '/{g}.matches.csv',
+        databases = config['gather_db'],
         lineages = config['lineages_csv'],
         hitlist = output_dir + '/stage1_hitlist.csv'
     output:
@@ -482,20 +496,26 @@ checkpoint hitlist_make_contigs_matches:
        python -m charcoal.contigs_list_contaminants \
            --genome {input.genome} --lineages-csv {input.lineages} \
            --genome-sig {input.genome_sig} \
-            --matches-sig {input.matches} \
+            --matches-csv {input.matches_csv} \
+            --databases {input.databases} \
            --hitlist {input.hitlist} \
            --json-out {output.matches_json} \
            --match-rank {params.match_rank}
     """

+@toplevel
+rule hitlist_make_contigs_matches:
+    input:
+        expand(stage2_dir + '/{g}.matches.json', g=genome_list)
+
 # run a mashmap comparison of two genomes.
 rule mashmap_compare:
     input:
-        query = genome_dir + '/{f}',
+        query = genome_dir + '/{g}',
         target = ancient('genbank_genomes/{acc}_genomic.fna.gz'),
     output:
-        cmpfile = stage2_dir + '/{f}.x.{acc}.mashmap.align',
-        outfile = stage2_dir + '/{f}.x.{acc}.mashmap.out',
+        cmpfile = stage2_dir + '/{g}.x.{acc}.mashmap.align',
+        outfile = stage2_dir + '/{g}.x.{acc}.mashmap.out',
     conda: 'conf/env-mashmap.yml'
     shell: """
        mashmap -q {input.query} -r {input.target} -o {output.cmpfile} \
@@ -549,12 +569,12 @@ rule combine_postprocessing_csv:
 # actually do cleaning. @CTB out of date, fixme later!
 rule clean_contigs:
     input:
-        genome = genome_dir + '/{f}',
-        json = stage1_dir + '/{f}.contigs-tax.json',
+        genome = genome_dir + '/{g}',
+        json = stage1_dir + '/{g}.contigs-tax.json',
         hit_list = output_dir + '/stage1_hitlist.csv',
     output:
-        clean = output_dir + '/{f}.clean.fa.gz',
-        dirty = output_dir + '/{f}.dirty.fa.gz',
+        clean = output_dir + '/{g}.clean.fa.gz',
+        dirty = output_dir + '/{g}.dirty.fa.gz',
     conda: 'conf/env-sourmash.yml'
     shell: """
        python -m charcoal.clean_genome \
26 changes: 20 additions & 6 deletions charcoal/compare_taxonomy.py
@@ -120,12 +120,23 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank,
     return genome_lineage, comment, needs_lineage


-def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage,
+def get_genome_taxonomy(matches_filename, database_list,
+                        genome_sig_filename, provided_lineage,
                         tax_assign, match_rank, min_f_ident, min_f_major):
-    try:
-        siglist = list(sourmash.load_file_as_signatures(matches_filename))
-    except (ValueError, AssertionError) as e:
-        siglist = None
+    # load the matches from prefetch as a picklist
+    picklist = sourmash.picklist.SignaturePicklist('prefetch')
+    picklist.load(matches_filename, picklist.column_name)
+
+    # load all of the matches in the database, as found by prefetch;
+    # select on them; and then aggregate into MultiIndex.
+    # CTB note: currently, this loads all the signatures into memory.
+    # Alternatively we could do something with LazyLoadedIndex maybe?
+
+    siglist = []
+    for filename in database_list:
+        db = sourmash.load_file_as_index(filename)
+        db = db.select(picklist=picklist)
+        siglist += list(db.signatures())

     if not siglist:
         comment = 'no matches for this genome.'
@@ -248,12 +259,13 @@ def main(args):
     detected_contam = {}

     summary_d = {}
-    matches_filename = os.path.join(dirname, genome_name + '.matches.zip')
+    matches_filename = os.path.join(dirname, genome_name + '.matches.csv')
     genome_sig = os.path.join(dirname, genome_name + '.sig')
     lineage = provided_lineages.get(genome_name, '')
     contigs_json = os.path.join(dirname, genome_name + '.contigs-tax.json')

     x = get_genome_taxonomy(matches_filename,
+                            args.databases,
                             genome_sig,
                             lineage,
                             tax_assign, match_rank,
@@ -430,6 +442,8 @@ def cmdline(sys_args):
     p.add_argument('--min_f_ident', type=float, default=F_IDENT_THRESHOLD)
     p.add_argument('--min_f_major', type=float, default=F_MAJOR_THRESHOLD)
     p.add_argument('--match-rank', required=True)
+    p.add_argument('--databases', help='sourmash databases', required=True,
+                   nargs='+')
     p.add_argument('genome')
     args = p.parse_args()
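
With the new signature, callers of get_genome_taxonomy pass the prefetch CSV plus the database list instead of a .matches.zip. A hedged sketch of a call — paths, rank, and thresholds are illustrative placeholders, and tax_assign is assumed to be loaded elsewhere, as in main():

x = get_genome_taxonomy(
    "stage1/example.fa.gz.matches.csv",  # prefetch CSV (was a .matches.zip)
    ["example-database.zip"],            # database_list: the new argument
    "stage1/example.fa.gz.sig",          # genome signature
    "",                                  # provided lineage; may be empty
    tax_assign,                          # taxonomy assignments (loaded elsewhere)
    "genus",                             # match_rank (placeholder)
    0.1,                                 # min_f_ident (placeholder)
    0.2)                                 # min_f_major (placeholder)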
2 changes: 1 addition & 1 deletion charcoal/conf/env-reporting.yml
@@ -9,7 +9,7 @@ dependencies:
   - notebook>=6,<7
   - plotly>=4.9.0,<5
   - ipykernel
-  - sourmash>=4.2.0,<5
+  - sourmash>=4.2.2,<5
   - pip
   - pip:
     - git+https://github.com/dib-lab/charcoal.git
4 changes: 2 additions & 2 deletions charcoal/conf/env-sourmash.yml
@@ -4,11 +4,11 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - python>=3.6
+  - python>=3.7,<3.10
   - pytest>6,<7
   - pytest-dependency>=0.5.1
   - pyyaml>5.3,<6
-  - sourmash>=4.2.0,<5
+  - sourmash>=4.2.2,<5
   - pip
   - pip:
     - git+https://github.com/dib-lab/charcoal.git
24 changes: 17 additions & 7 deletions charcoal/contigs_list_contaminants.py
@@ -82,12 +82,20 @@ def main(args):
     # load the genome signature
     genome_sig = sourmash.load_one_signature(args.genome_sig)

-    # load all of the matches from search --containment in the database
-    try:
-        siglist = list(sourmash.load_file_as_signatures(args.matches_sig))
-    except ValueError:
-        siglist = []
-    print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")
+    # load the matches from prefetch as a picklist
+    picklist = sourmash.picklist.SignaturePicklist('prefetch')
+    picklist.load(args.matches_csv, picklist.column_name)
+
+    # load all of the matches in the database, as found by prefetch;
+    # select on them; and then aggregate into MultiIndex.
+    # CTB note: currently, this loads all the signatures into memory.
+    # Alternatively we could do something with LazyLoadedIndex maybe?
+
+    siglist = []
+    for filename in args.databases:
+        db = sourmash.load_file_as_index(filename)
+        db = db.select(picklist=picklist)
+        siglist += list(db.signatures())

     # Hack for examining members of our search database: remove exact matches.
     new_siglist = []
@@ -192,7 +200,9 @@ def cmdline(sys_args):
     p = argparse.ArgumentParser(sys_args)
     p.add_argument('--genome', help='genome file', required=True)
     p.add_argument('--genome-sig', help='genome sig', required=True)
-    p.add_argument('--matches-sig', help='all relevant matches', required=True)
+    p.add_argument('--matches-csv', help='all relevant matches', required=True)
+    p.add_argument('--databases', help='sourmash databases', required=True,
+                   nargs='+')
     p.add_argument('--lineages-csv', help='lineage spreadsheet', required=True)
     p.add_argument('--hitlist', help='hitlist spreadsheet', required=True)
     p.add_argument('--force', help='continue past survivable errors',
26 changes: 19 additions & 7 deletions charcoal/contigs_search_taxonomy.py
@@ -31,12 +31,22 @@ def main(args):
     # load the genome signature
     genome_sig = sourmash.load_one_signature(args.genome_sig)

-    # load all of the matches from search --containment in the database
-    try:
-        siglist = list(sourmash.load_file_as_signatures(args.matches_sig))
-    except (ValueError, AssertionError) as e:
-        siglist = []
-    print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")
+    # load the matches from prefetch as a picklist
+    picklist = sourmash.picklist.SignaturePicklist('prefetch')
+    picklist.load(args.matches_csv, picklist.column_name)
+
+    # load all of the matches in the database, as found by prefetch;
+    # select on them; and then aggregate into MultiIndex.
+    # CTB note: currently, this loads all the signatures into memory.
+    # Alternatively we could do something with LazyLoadedIndex maybe?
+
+    siglist = []
+    for filename in args.databases:
+        db = sourmash.load_file_as_index(filename)
+        db = db.select(picklist=picklist)
+        siglist += list(db.signatures())
+
+    print(f"loaded {len(siglist)} matches from '{args.matches_csv}'")

     # Hack for examining members of our search database: remove exact matches.
     new_siglist = []
@@ -120,7 +130,9 @@ def cmdline(sys_args):
     p = argparse.ArgumentParser(sys_args)
     p.add_argument('--genome', help='genome file', required=True)
     p.add_argument('--genome-sig', help='genome sig', required=True)
-    p.add_argument('--matches-sig', help='all relevant matches', required=True)
+    p.add_argument('--matches-csv', help='all relevant matches', required=True)
+    p.add_argument('--databases', help='sourmash databases', required=True,
+                   nargs='+')
     p.add_argument('--lineages-csv', help='lineage spreadsheet', required=True)
     p.add_argument('--force', help='continue past survivable errors',
                    action='store_true')
4 changes: 2 additions & 2 deletions environment.yml
@@ -4,10 +4,10 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - python>=3.7
+  - python>=3.7,<3.10
   - snakemake-minimal=6.5.1
   - screed
   - click>=7,<8
   - pip
   - mamba
-  - sourmash>=4.2.0,<5
+  - sourmash>=4.2.2,<5
2 changes: 2 additions & 0 deletions tests/test-data/2.fa.gz.gather-matches.csv
@@ -0,0 +1,2 @@
+intersect_bp,jaccard,max_containment,f_query_match,f_match_query,match_filename,match_name,match_md5,match_bp,query_filename,query_name,query_md5,query_bp
+2740000,1.0,1.0,1.0,1.0,,"CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome",318839c2,2740000,tests/test-data/genomes/2.fa.gz,"CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome",318839c2,2740000
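
This two-line CSV is the prefetch-format test fixture the new code paths consume; judging by the picklist.column_name usage in the diffs above, a 'prefetch'-style SignaturePicklist selects on the match identity columns of this file (match_md5 in particular). A quick sketch of inspecting those columns with Python's csv module:

import csv

# print the identifying columns a 'prefetch' picklist selects on
with open("tests/test-data/2.fa.gz.gather-matches.csv", newline="") as fp:
    for row in csv.DictReader(fp):
        print(row["match_md5"], row["match_name"])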