From c14a50f3d286bc3f2e4410ba3208b96337cc8211 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Wed, 9 Aug 2023 13:43:51 +0200 Subject: [PATCH 1/2] fix: sort reads before subsample for repeatability --- bin/rules/clean_fastq.smk | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/bin/rules/clean_fastq.smk b/bin/rules/clean_fastq.smk index 3ea9897..cc7c3e2 100644 --- a/bin/rules/clean_fastq.smk +++ b/bin/rules/clean_fastq.smk @@ -2,8 +2,8 @@ rule clean_fastq: input: lambda wildcards: (SAMPLES[wildcards.sample][i] for i in ["R1", "R2"]), output: - r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz", - r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz", + r1=temp(OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz"), + r2=temp(OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz"), unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz", html=OUT + "/clean_fastq/{sample}_fastp.html", json=OUT + "/clean_fastq/{sample}_fastp.json", @@ -41,3 +41,27 @@ rule clean_fastq: --correction \ --length_required {params.min_length} > {log} 2>&1 """ + +rule sort_paired_fastq: + input: + r1=OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz", + r2=OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz", + output: + r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz", + r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz", + message: + "Sorting cleaned paired reads to increase repeatability" + conda: + "../../envs/scaffold_analyses.yaml" + container: + "docker://staphb/bbtools:38.86" + threads: int(config["threads"]["pileup"]) + resources: + mem_gb=config["mem_gb"]["pileup"], + log: + OUT + "/log/sort_paired_fastq/sort_paired_fastq_{sample}.log", + shell: + """ +sortbyname.sh in={input.r1} out={output.r1} +sortbyname.sh in={input.r2} out={output.r2} + """ \ No newline at end of file From c907f7927944f412f93cf69ad0ce51c9e534ef04 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 10 Aug 2023 08:54:32 +0200 Subject: [PATCH 2/2] style: black and snakefmt formatting --- bin/parse_bbtools.py | 5 +- bin/parse_bbtools_summary.py | 5 +- bin/rules/clean_fastq.smk | 3 +- bin/rules/de_novo_assembly.smk | 2 +- bin/rules/identify_species.smk | 8 ++- bin/rules/multiqc.smk | 3 +- bin/rules/run_checkm.smk | 2 + bin/rules/subsample_fastq.smk | 8 +-- bin/select_genus_checkm.py | 65 ++++++++++++------- bin/subsample_reads.py | 111 ++++++++++++++++++++------------- juno_assembly.py | 2 +- 11 files changed, 138 insertions(+), 76 deletions(-) diff --git a/bin/parse_bbtools.py b/bin/parse_bbtools.py index 5fc05fa..8a36c23 100644 --- a/bin/parse_bbtools.py +++ b/bin/parse_bbtools.py @@ -1,6 +1,7 @@ import pandas import re + def parse_bbtools_perScaffold(input_bbtools, output_bbtools): # create an empty dataframe with the right headers bbtools_headers_file = open(input_bbtools[0], "r") @@ -11,7 +12,9 @@ def parse_bbtools_perScaffold(input_bbtools, output_bbtools): # loop over the bbtools files (perscaffold) and add them to the dataframe for input_file in str(input_bbtools).split(): # get the sample name from the file name - sample_name = re.sub("_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1]) + sample_name = re.sub( + "_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1] + ) # read the data into a pandas dataframe sample_dataframe = pandas.read_csv(input_file, sep="\t") diff --git a/bin/parse_bbtools_summary.py b/bin/parse_bbtools_summary.py index 1823d0c..0e8e200 100644 --- a/bin/parse_bbtools_summary.py +++ b/bin/parse_bbtools_summary.py @@ -1,12 +1,15 @@ import argparse import re + def parse_bbtools_summary(input_bbtools, output_bbtools): summary_dict = {} for input_file in input_bbtools: # get the sample name from the file name - sample_name = re.sub("_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1]) + sample_name = re.sub( + "_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1] + ) variable_name_list = [] value_list = [] diff --git a/bin/rules/clean_fastq.smk b/bin/rules/clean_fastq.smk index cc7c3e2..8fdf410 100644 --- a/bin/rules/clean_fastq.smk +++ b/bin/rules/clean_fastq.smk @@ -42,6 +42,7 @@ rule clean_fastq: --length_required {params.min_length} > {log} 2>&1 """ + rule sort_paired_fastq: input: r1=OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz", @@ -64,4 +65,4 @@ rule sort_paired_fastq: """ sortbyname.sh in={input.r1} out={output.r1} sortbyname.sh in={input.r2} out={output.r2} - """ \ No newline at end of file + """ diff --git a/bin/rules/de_novo_assembly.smk b/bin/rules/de_novo_assembly.smk index 391ce74..b36b988 100644 --- a/bin/rules/de_novo_assembly.smk +++ b/bin/rules/de_novo_assembly.smk @@ -13,7 +13,7 @@ rule de_novo_assembly: r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz", r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz", fastq_unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz", - cov_cutoff_file = OUT + "/subsampling/{sample}.txt", + cov_cutoff_file=OUT + "/subsampling/{sample}.txt", output: scaffolds=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta", contigs=temp(OUT + "/de_novo_assembly/{sample}/contigs.fasta"), diff --git a/bin/rules/identify_species.smk b/bin/rules/identify_species.smk index 019e96f..1d3e131 100644 --- a/bin/rules/identify_species.smk +++ b/bin/rules/identify_species.smk @@ -40,12 +40,16 @@ rule identify_species_reads: """ + rule identify_species: input: OUT + "/de_novo_assembly_filtered/{sample}.fasta", output: - kraken2_kreport=temp(OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"), - bracken_s=OUT + "/identify_species/contigs/{sample}/{sample}_species_content.txt", + kraken2_kreport=temp( + OUT + "/identify_species/contigs/{sample}/{sample}.kreport2" + ), + bracken_s=OUT + + "/identify_species/contigs/{sample}/{sample}_species_content.txt", bracken_kreport=OUT + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2", message: diff --git a/bin/rules/multiqc.smk b/bin/rules/multiqc.smk index 229eefc..94ffddd 100644 --- a/bin/rules/multiqc.smk +++ b/bin/rules/multiqc.smk @@ -24,7 +24,8 @@ rule multiqc: sample=SAMPLES, ), expand( - OUT + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2", + OUT + + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2", sample=SAMPLES, ), output: diff --git a/bin/rules/run_checkm.smk b/bin/rules/run_checkm.smk index cf8947a..7886833 100644 --- a/bin/rules/run_checkm.smk +++ b/bin/rules/run_checkm.smk @@ -2,6 +2,7 @@ ##### Scaffold analyses: QUAST, CheckM, picard, bbmap and QC-metrics ##### ############################################################################# + rule select_genus_checkm: input: genus_bracken=OUT @@ -24,6 +25,7 @@ rule select_genus_checkm: --output {output.selected_genus} 2>&1>{log} """ + rule checkm: input: assembly=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta", diff --git a/bin/rules/subsample_fastq.smk b/bin/rules/subsample_fastq.smk index 6f299a8..e3aa137 100644 --- a/bin/rules/subsample_fastq.smk +++ b/bin/rules/subsample_fastq.smk @@ -3,9 +3,9 @@ rule subsample_fastq: r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz", r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz", output: - r1 = OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz", - r2 = OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz", - cov_cutoff_file = OUT + "/subsampling/{sample}.txt" + r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz", + r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz", + cov_cutoff_file=OUT + "/subsampling/{sample}.txt", message: "Subsampling reads for {wildcards.sample}." conda: @@ -28,4 +28,4 @@ python bin/subsample_reads.py --input {input.r1} {input.r2} \ --cov-cutoff-in {params.cov_cutoff} \ --cov-cutoff-out {output.cov_cutoff_file} \ --threads {threads} 2>&1>{log} - """ \ No newline at end of file + """ diff --git a/bin/select_genus_checkm.py b/bin/select_genus_checkm.py index 0f816f5..a69c9ba 100644 --- a/bin/select_genus_checkm.py +++ b/bin/select_genus_checkm.py @@ -6,30 +6,45 @@ def read_bracken_report(path_to_report: Path) -> pd.DataFrame: - df = pd.read_csv(path_to_report, sep='\t', header=None, names=['pct', 'count', 'count_unique', 'rank', 'taxid', 'name']) + df = pd.read_csv( + path_to_report, + sep="\t", + header=None, + names=["pct", "count", "count_unique", "rank", "taxid", "name"], + ) df["name"] = df["name"].str.strip() return df def get_top_microbial_hit(bracken_result: pd.DataFrame) -> str: df_genera = bracken_result[bracken_result["rank"] == "G"] - top_hit = df_genera.sort_values("count", ascending=False).reset_index().loc[0, "name"] + top_hit = ( + df_genera.sort_values("count", ascending=False).reset_index().loc[0, "name"] + ) if top_hit == "Homo": - logging.warning(f"The top species is the Homo genus, indicating contamination with an eukaryote.") - top_hit_microbial = df_genera.sort_values("count", ascending=False).reset_index().loc[1, "name"] + logging.warning( + f"The top species is the Homo genus, indicating contamination with an eukaryote." + ) + top_hit_microbial = ( + df_genera.sort_values("count", ascending=False).reset_index().loc[1, "name"] + ) else: top_hit_microbial = top_hit return top_hit_microbial -def check_if_top_hit_is_supported(top_hit: str, path_to_list_accepted_genera: Path) -> str: +def check_if_top_hit_is_supported( + top_hit: str, path_to_list_accepted_genera: Path +) -> str: with open(path_to_list_accepted_genera, "r") as f: lines = f.readlines() list_accepted_genera = [accepted_genus.strip() for accepted_genus in lines] if top_hit in list_accepted_genera: selected_genus = top_hit else: - logging.warning(f"The selected species is not supported by this version of CheckM.") + logging.warning( + f"The selected species is not supported by this version of CheckM." + ) selected_genus = "NOT_SUPPORTED" return selected_genus @@ -42,7 +57,9 @@ def save_selected_genus(genus_name: str, output_path: Path) -> None: def main(args): # str "None" is provided on command line if args.genus == "None": - logging.warning(f"No genus was provided, this will be guessed from Kraken2+bracken analysis.") + logging.warning( + f"No genus was provided, this will be guessed from Kraken2+bracken analysis." + ) bracken_result = read_bracken_report(args.bracken_output) top_hit = get_top_microbial_hit(bracken_result) else: @@ -50,26 +67,30 @@ def main(args): selected_genus = check_if_top_hit_is_supported(top_hit, args.list_accepted_genera) save_selected_genus(selected_genus, args.output) + if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() - parser.add_argument("--genus", - type=str.capitalize, - help="Genus supplied through metadata, overwriting Kraken2 analysis", - default="None") - parser.add_argument("--bracken-output", - type=Path, - help="Path to bracken output report") - parser.add_argument("--list-accepted-genera", - default=Path("files/accepted_genera_checkm.txt"), - type=Path) - parser.add_argument("--output", - type=Path, - help="Path to output file", - required=True) + parser.add_argument( + "--genus", + type=str.capitalize, + help="Genus supplied through metadata, overwriting Kraken2 analysis", + default="None", + ) + parser.add_argument( + "--bracken-output", type=Path, help="Path to bracken output report" + ) + parser.add_argument( + "--list-accepted-genera", + default=Path("files/accepted_genera_checkm.txt"), + type=Path, + ) + parser.add_argument( + "--output", type=Path, help="Path to output file", required=True + ) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/bin/subsample_reads.py b/bin/subsample_reads.py index e23d0a0..5a9d5fc 100644 --- a/bin/subsample_reads.py +++ b/bin/subsample_reads.py @@ -6,6 +6,7 @@ from pathlib import Path import math + def estimate_genome_size(input: list) -> float: """ Estimate genome size of paired-end read set using mash sketch. @@ -14,13 +15,14 @@ def estimate_genome_size(input: list) -> float: with tempfile.TemporaryDirectory() as tmpdir: cmd_string = f"mash sketch -o {tmpdir}/tmpfile.msh -k 21 -r -m 3 {input[0]}" result = subprocess.run(cmd_string, capture_output=True, shell=True, check=True) - decoded_result = result.stderr.decode('utf-8') - genome_size = int(float(decoded_result.split('\n')[0].split(' ')[-1])) + decoded_result = result.stderr.decode("utf-8") + genome_size = int(float(decoded_result.split("\n")[0].split(" ")[-1])) # mash depth estimation underestimates actual cov often # coverage = float(decoded_result.split('\n')[1].split(' ')[-1]) * 2 logging.info(f"Genome size is estimated to be {genome_size}") return genome_size + def estimate_depth(input: list, genome_size) -> float: """ Estimate depth of paired-end reads using seqtk size and genome size. @@ -29,12 +31,13 @@ def estimate_depth(input: list, genome_size) -> float: with tempfile.TemporaryDirectory() as tmpdir: cmd_string = f"seqtk size {input[0]}" result = subprocess.run(cmd_string, capture_output=True, shell=True, check=True) - decoded_result = result.stdout.decode('utf-8') - total_nt_fw = int(decoded_result.split('\t')[1].rstrip('\n')) + decoded_result = result.stdout.decode("utf-8") + total_nt_fw = int(decoded_result.split("\t")[1].rstrip("\n")) coverage = (total_nt_fw * 2) / genome_size logging.info(f"Depth of coverage is estimated to be {coverage}") return coverage + def calculate_fraction(estimated_depth: float, target_depth: int) -> float: """ Calculate fraction of subsampling. @@ -45,9 +48,12 @@ def calculate_fraction(estimated_depth: float, target_depth: int) -> float: logging.info(f"Estimated depth is higher than target depth, will subsample") logging.info(f"Subsampling using fraction {subsample_fraction}") else: - logging.info(f"Estimated depth is lower than or approx. equal to target depth, will not subsample") + logging.info( + f"Estimated depth is lower than or approx. equal to target depth, will not subsample" + ) return subsample_fraction + def subsample_reads(input: list, output: list, fraction: float, n_threads: int): """ Subsample reads based on calculated fraction. @@ -55,7 +61,9 @@ def subsample_reads(input: list, output: list, fraction: float, n_threads: int): if fraction < 1: cmd_string_r1 = f"seqtk seq -f {fraction} -s 1704 {input[0]} | pigz -p {n_threads} > {output[0]}" cmd_string_r2 = f"seqtk seq -f {fraction} -s 1704 {input[1]} | pigz -p {n_threads} > {output[1]}" - logging.info(f"Subsampling files {[str(x) for x in input]} to {[str(x) for x in output]}, resp.") + logging.info( + f"Subsampling files {[str(x) for x in input]} to {[str(x) for x in output]}, resp." + ) subprocess.run(cmd_string_r1, shell=True, check=True) subprocess.run(cmd_string_r2, shell=True, check=True) logging.info("Finished subsampling reads") @@ -67,6 +75,7 @@ def subsample_reads(input: list, output: list, fraction: float, n_threads: int): subprocess.run(cmd_cp_string_r2, shell=True, check=True) logging.info(f"Finished copying files") + def calculate_coverage_cutoff(estimated_depth: float, target_depth: int): if (target_depth / estimated_depth) < 1: logging.info(f"Basing coverage cutoff on target depth ({target_depth})") @@ -77,60 +86,78 @@ def calculate_coverage_cutoff(estimated_depth: float, target_depth: int): logging.info(f"Coverage cutoff set to {cov_cutoff}") return cov_cutoff + def output_cov_cutoff(cov_cutoff, cov_cutoff_out): with open(cov_cutoff_out, "w") as file: file.write(str(cov_cutoff)) + def main(args): genome_size = estimate_genome_size(args.input) coverage = estimate_depth(args.input, genome_size) fraction = calculate_fraction(coverage, args.depth) subsample_reads(args.input, args.output, fraction, args.threads) if args.cov_cutoff_in == "calculate": - logging.info(f"Coverage cutoff was not specified on command line, will calculate value to use") + logging.info( + f"Coverage cutoff was not specified on command line, will calculate value to use" + ) cov_cutoff = calculate_coverage_cutoff(coverage, args.depth) else: - logging.info(f"Coverage cutoff was set to \"{str(args.cov_cutoff_in)}\" on command line, will pass on this value") + logging.info( + f'Coverage cutoff was set to "{str(args.cov_cutoff_in)}" on command line, will pass on this value' + ) cov_cutoff = args.cov_cutoff_in output_cov_cutoff(cov_cutoff, args.cov_cutoff_out) - - + + if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", - help="Paired input FASTQ files", - type=Path, - nargs=2, - metavar="INPUT_FILE", - required=True) - parser.add_argument("-d", "--depth", - help="Target depth [100]", - default=100, - metavar="INT", - type=int) - parser.add_argument("--cov-cutoff-in", - help="Input argument for coverage cutoff setting in SPAdes") - parser.add_argument("--cov-cutoff-out", - help="Output file for new coverage cutoff", - type=Path, - metavar="STR") - parser.add_argument("-o", "--output", - help="Paired output FASTQ files", - type=Path, - nargs=2, - metavar="STR") - parser.add_argument("-t", "--threads", - help="Number of threads to use for pigz [1]", - default=1, - type=int) - + parser.add_argument( + "-i", + "--input", + help="Paired input FASTQ files", + type=Path, + nargs=2, + metavar="INPUT_FILE", + required=True, + ) + parser.add_argument( + "-d", "--depth", help="Target depth [100]", default=100, metavar="INT", type=int + ) + parser.add_argument( + "--cov-cutoff-in", help="Input argument for coverage cutoff setting in SPAdes" + ) + parser.add_argument( + "--cov-cutoff-out", + help="Output file for new coverage cutoff", + type=Path, + metavar="STR", + ) + parser.add_argument( + "-o", + "--output", + help="Paired output FASTQ files", + type=Path, + nargs=2, + metavar="STR", + ) + parser.add_argument( + "-t", + "--threads", + help="Number of threads to use for pigz [1]", + default=1, + type=int, + ) + args = parser.parse_args() - logging.basicConfig(level=logging.INFO, - format='[%(asctime)s] %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') - - main(args) \ No newline at end of file + logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", + ) + + main(args) diff --git a/juno_assembly.py b/juno_assembly.py index 5bb0f48..384cea4 100644 --- a/juno_assembly.py +++ b/juno_assembly.py @@ -123,7 +123,7 @@ def __call__(self, *args, **kwargs) -> None: # type: ignore metavar="STR/INT", default="calculate", help="SPAdes k-mer coverage cut-off to use. Can be calculate, off, or a specified integer. " - "\"Calculate\" lets the script calculate a sample-specific value that works for most use cases.", + '"Calculate" lets the script calculate a sample-specific value that works for most use cases.', ) self.add_argument( "-cl",