Skip to content

Commit

Permalink
prepare_for_supplement: Expand its scope.
Browse files Browse the repository at this point in the history
  • Loading branch information
wwood committed Apr 24, 2024
1 parent 0e5f187 commit 760ec5c
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 2 deletions.
55 changes: 53 additions & 2 deletions extras/prepare_for_supplement/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ prodigal_runner_path = config['prodigal_runner_path'] #'~/git/prodigal-runner/bi
# Required configuration values.
mag_paths = config['mag_paths']
gtdbtk_db_path = config['GTDBTK_DATA_PATH']
checkm2_db = config['CHECKM2DB']
# Thread count used as the `threads:` value of multi-threaded rules.
num_threads = 8

# Optional configuration values.
# The output directory may be given under either 'output' or
# 'output_directory': previously the presence of one key was tested while the
# other was read, so supplying only one of them either raised a KeyError or
# was silently ignored.
output_directory = config.get('output', config.get('output_directory', 'supplement_preparation'))
genome_fasta_extension = config.get('genome_fasta_extension', '.fna')
Expand All @@ -27,11 +28,61 @@ print("Found {} genome files and {} groups".format(len(genomes_input), len(group
# Integer identifier of each group, 0 .. len(groups) - 1.
group_ids = list(range(len(groups)))

# Default target: requests the marker files of every aggregate step.
rule all:
    input:
        f'{output_directory}/done/all-prodigal-runner.done',
        f'{output_directory}/done/all-gtdbtk.done',
        f'{output_directory}/done/all-checkm2.done',
        f'{output_directory}/done/galah.done'

# Run `galah cluster` on the genomes listed in `mag_paths`, passing it the
# merged CheckM2 quality report, and write the list of representatives.
rule galah:
    input:
        checkm2_report=f'{output_directory}/checkm2_quality_report.tsv',
    output:
        representatives=f'{output_directory}/galah_representatives.tsv',
        done=touch(f'{output_directory}/done/galah.done'),
    conda:
        "envs/galah.yml"
    threads: num_threads
    shell:
        "galah cluster "
        "--genome-fasta-list {mag_paths} "
        "--checkm2-quality-report {input.checkm2_report} "
        "--output-representative-list {output.representatives} "
        "-t {threads}"

# Merge the per-group CheckM2 quality reports into a single table once every
# group has finished: the header line is copied from group 0, then the data
# rows (everything after line 1) of each report found are appended.
rule all_checkm2:
    input:
        expand(f'{output_directory}/done/checkm2-{{group}}.done', group=range(len(groups))),
    output:
        report=f'{output_directory}/checkm2_quality_report.tsv',
        done=touch(f'{output_directory}/done/all-checkm2.done'),
    shell:
        """
        head -1 {output_directory}/checkm2/0/quality_report.tsv > {output.report} && \
        find {output_directory}/checkm2/ |grep quality_report.tsv | parallel -j1 -k tail -n+2 {{}} >> {output.report}
        """

# Merge the per-group GTDB-Tk summaries into one table per domain
# ({output_directory}/gtdbtk/gtdbtk.<domain>.summary.tsv) once every group has
# finished.
#
# The header is taken from the first per-group summary that exists rather than
# from a hard-coded group 0, so the rule no longer fails when group 0 has no
# summary for a domain (NOTE(review): GTDB-Tk is understood to omit the summary
# of a domain with no classified genomes - confirm). When no group has a summary
# for a domain, no merged file is written for it. Summaries are processed in
# sorted path order so the merged row order is reproducible.
rule all_gtdbtk:
    input:
        expand(f'{output_directory}/done/gtdbtk-{{group}}.done', group=range(len(groups))),
        expand(f'{output_directory}/done/prodigal-runner-{{group}}.done', group=range(len(groups)))
    output:
        done = touch(f'{output_directory}/done/all-gtdbtk.done')
    shell:
        """
        for domain in ar53 bac120; do
            merged="{output_directory}/gtdbtk/gtdbtk.$domain.summary.tsv"
            # Start from scratch so a re-run never appends to a stale merge.
            rm -f "$merged"
            # -mindepth 2 skips the merged file itself, which sits directly
            # inside the gtdbtk directory.
            find "{output_directory}/gtdbtk" -mindepth 2 -type f -name "gtdbtk.$domain.summary.tsv" | sort | while read -r summary; do
                if [ ! -e "$merged" ]; then
                    head -1 "$summary" > "$merged"
                fi
                tail -n+2 "$summary" >> "$merged"
            done
        done
        """

# Once prodigal-runner has finished for every group, run the helper script
# that writes the gene definitions table. The script reads the output
# directory and the list of input genomes from `params`.
rule all_prodigal_runner:
    input:
        expand(f'{output_directory}/done/prodigal-runner-{{group}}.done', group=range(len(groups))),
    output:
        gene_definitions=f'{output_directory}/gene_definitions.tsv',
        done=touch(f'{output_directory}/done/all-prodigal-runner.done'),
    params:
        output_directory=output_directory,
        genomes_input=genomes_input,
    script:
        "bin/combine-prodigal-runner.py"

rule checkm2:
output:
Expand Down
73 changes: 73 additions & 0 deletions extras/prepare_for_supplement/bin/combine-prodigal-runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3

import re
import os

# Tab-separated file of genome_fasta<TAB>transcript_fasta<TAB>protein_fasta
# [default: undefined, call genes using Prodigal]

# Values supplied by Snakemake through the injected `snakemake` object.
genomes_input = snakemake.params['genomes_input']
output_file = snakemake.output['gene_definitions']
output_dir = snakemake.params['output_directory']

# Directory under the output directory that is scanned for per-group results.
prodigal_runner_output_directory = f"{output_dir}/prodigal-runner"

# Matches names consisting solely of digits; used to select which
# sub-directories of the prodigal-runner directory are scanned.
r = re.compile(r'^\d+$')

# genome name -> {'protein': path, 'transcripts': path}
genome_to_paths = {}
def file_to_genome(filename):
    """Return the genome name for *filename*: everything before its last '.'.

    The match is greedy, so only the final extension is removed
    ("a.b.fna" -> "a.b").

    Raises:
        Exception: if *filename* has no '.' followed by at least one
            character.
    """
    matched = re.match(r'^(.*)\..+', filename)
    if matched is None:
        raise Exception("Error: file name {} does not match expected format".format(filename))
    return matched.group(1)

# Cache genome name -> original genome fasta path.
# Genome names are derived from the file name alone (directory and final
# extension dropped), so two different inputs can map to the same name. Fail
# loudly in that case instead of silently keeping whichever path came last.
genome_to_fasta = {}
for fasta in genomes_input:
    genome = file_to_genome(os.path.basename(fasta))
    if genome in genome_to_fasta and genome_to_fasta[genome] != fasta:
        raise Exception("Error: multiple genome fasta files found for genome {}: {} and {}".format(
            genome, genome_to_fasta[genome], fasta))
    genome_to_fasta[genome] = fasta

# Get paths of transcripts and proteins.
# Only sub-directories whose name is all digits are scanned. Listings are
# sorted because os.listdir() returns entries in arbitrary order, which would
# otherwise make the row order of the output file vary between runs.
for directory in sorted(os.listdir(prodigal_runner_output_directory)):
    if not r.match(directory):
        continue

    group_directory = prodigal_runner_output_directory + "/" + directory
    for file in sorted(os.listdir(group_directory)):
        genome = file_to_genome(file)
        if genome not in genome_to_paths:
            genome_to_paths[genome] = {}
        paths = genome_to_paths[genome]
        path = group_directory + "/" + file

        if file.endswith('.faa'):
            if 'protein' in paths:
                raise Exception("Error: multiple protein fasta files found for genome {}".format(genome))
            paths['protein'] = path
        elif file.endswith('.fna'):
            if 'transcripts' in paths:
                raise Exception("Error: multiple transcript fasta files found for genome {}".format(genome))
            paths['transcripts'] = path
        elif file.endswith('.gff'):
            # GFF files are expected alongside the fasta files but not recorded.
            pass
        else:
            # Report both the file and the directory it was found in.
            raise Exception("Error: unexpected file {} found in directory {}".format(file, group_directory))

# Write out the gene definitions file: a header row followed by one
# tab-separated row per genome.
num_genomes = len(genome_to_paths)
with open(output_file, 'w') as out:
    # Columns: genome_fasta, transcript_fasta, protein_fasta
    out.write("\t".join(['genome_fasta', 'transcript_fasta', 'protein_fasta']))
    out.write("\n")

    for genome, paths in genome_to_paths.items():
        if 'protein' not in paths:
            raise Exception("Error: no protein fasta file found for genome {}".format(genome))
        if 'transcripts' not in paths:
            raise Exception("Error: no transcript fasta file found for genome {}".format(genome))
        # A genome seen in the prodigal-runner output but absent from the input
        # list would otherwise surface as a bare KeyError.
        if genome not in genome_to_fasta:
            raise Exception("Error: no input genome fasta file found for genome {}".format(genome))

        out.write("\t".join([
            genome_to_fasta[genome],
            paths['transcripts'],
            paths['protein'],
        ]))
        out.write("\n")

print("Wrote {} gene definitions to {}".format(num_genomes, output_file))
8 changes: 8 additions & 0 deletions extras/prepare_for_supplement/envs/galah.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Conda environment referenced by the `galah` rule of the
# prepare_for_supplement Snakefile (conda: "envs/galah.yml").
channels:
- conda-forge
- bioconda
- defaults
dependencies:
# galah is pinned to an exact version.
- galah==0.4.0
- parallel

1 change: 1 addition & 0 deletions singlem/supplement.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def generate_taxonomy_for_new_genomes(**kwargs):
# Open the taxonomy output file and write its header row.
# (A leftover interactive `IPython.embed()` debugging breakpoint that followed
# the header write has been removed: it would halt every run at this point.)
logging.info("Writing new genome taxonomies to {}".format(output_taxonomies_file))
output_taxonomies_fh = open(output_taxonomies_file, 'w')
output_taxonomies_fh.write('genome\ttaxonomy\n')

# For loop
for genome_name, taxonomy_str in taxonomies_to_process:
Expand Down

0 comments on commit 760ec5c

Please sign in to comment.