From c14a50f3d286bc3f2e4410ba3208b96337cc8211 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Wed, 9 Aug 2023 13:43:51 +0200
Subject: [PATCH 1/2] fix: sort reads before subsample for repeatability

---
 bin/rules/clean_fastq.smk | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/bin/rules/clean_fastq.smk b/bin/rules/clean_fastq.smk
index 3ea9897..cc7c3e2 100644
--- a/bin/rules/clean_fastq.smk
+++ b/bin/rules/clean_fastq.smk
@@ -2,8 +2,8 @@ rule clean_fastq:
     input:
         lambda wildcards: (SAMPLES[wildcards.sample][i] for i in ["R1", "R2"]),
     output:
-        r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
-        r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
+        r1=temp(OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz"),
+        r2=temp(OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz"),
         unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz",
         html=OUT + "/clean_fastq/{sample}_fastp.html",
         json=OUT + "/clean_fastq/{sample}_fastp.json",
@@ -41,3 +41,27 @@ rule clean_fastq:
             --correction \
             --length_required {params.min_length} > {log} 2>&1
         """
+
+rule sort_paired_fastq:
+    input:
+        r1=OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz",
+        r2=OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz",
+    output:
+        r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
+        r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
+    message:
+        "Sorting cleaned paired reads to increase repeatability"
+    conda:
+        "../../envs/scaffold_analyses.yaml"
+    container:
+        "docker://staphb/bbtools:38.86"
+    threads: int(config["threads"]["pileup"])
+    resources:
+        mem_gb=config["mem_gb"]["pileup"],
+    log:
+        OUT + "/log/sort_paired_fastq/sort_paired_fastq_{sample}.log",
+    shell:
+        """
+sortbyname.sh in={input.r1} out={output.r1}
+sortbyname.sh in={input.r2} out={output.r2}
+        """
\ No newline at end of file

From c907f7927944f412f93cf69ad0ce51c9e534ef04 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Thu, 10 Aug 2023 08:54:32 +0200
Subject: [PATCH 2/2] style: black and snakefmt formatting

---
 bin/parse_bbtools.py           |   5 +-
 bin/parse_bbtools_summary.py   |   5 +-
 bin/rules/clean_fastq.smk      |   3 +-
 bin/rules/de_novo_assembly.smk |   2 +-
 bin/rules/identify_species.smk |   8 ++-
 bin/rules/multiqc.smk          |   3 +-
 bin/rules/run_checkm.smk       |   2 +
 bin/rules/subsample_fastq.smk  |   8 +--
 bin/select_genus_checkm.py     |  65 ++++++++++++-------
 bin/subsample_reads.py         | 111 ++++++++++++++++++++-------------
 juno_assembly.py               |   2 +-
 11 files changed, 138 insertions(+), 76 deletions(-)

diff --git a/bin/parse_bbtools.py b/bin/parse_bbtools.py
index 5fc05fa..8a36c23 100644
--- a/bin/parse_bbtools.py
+++ b/bin/parse_bbtools.py
@@ -1,6 +1,7 @@
 import pandas
 import re
 
+
 def parse_bbtools_perScaffold(input_bbtools, output_bbtools):
     # create an empty dataframe with the right headers
     bbtools_headers_file = open(input_bbtools[0], "r")
@@ -11,7 +12,9 @@ def parse_bbtools_perScaffold(input_bbtools, output_bbtools):
     # loop over the bbtools files (perscaffold) and add them to the dataframe
     for input_file in str(input_bbtools).split():
         # get the sample name from the file name
-        sample_name = re.sub("_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1])
+        sample_name = re.sub(
+            "_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1]
+        )
 
         # read the data into a pandas dataframe
         sample_dataframe = pandas.read_csv(input_file, sep="\t")
diff --git a/bin/parse_bbtools_summary.py b/bin/parse_bbtools_summary.py
index 1823d0c..0e8e200 100644
--- a/bin/parse_bbtools_summary.py
+++ b/bin/parse_bbtools_summary.py
@@ -1,12 +1,15 @@
 import argparse
 import re
 
+
 def parse_bbtools_summary(input_bbtools, output_bbtools):
     summary_dict = {}
 
     for input_file in input_bbtools:
         # get the sample name from the file name
-        sample_name = re.sub("_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1])
+        sample_name = re.sub(
+            "_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1]
+        )
         variable_name_list = []
         value_list = []
 
diff --git a/bin/rules/clean_fastq.smk b/bin/rules/clean_fastq.smk
index cc7c3e2..8fdf410 100644
--- a/bin/rules/clean_fastq.smk
+++ b/bin/rules/clean_fastq.smk
@@ -42,6 +42,7 @@ rule clean_fastq:
             --length_required {params.min_length} > {log} 2>&1
         """
 
+
 rule sort_paired_fastq:
     input:
         r1=OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz",
@@ -64,4 +65,5 @@ rule sort_paired_fastq:
         """
-sortbyname.sh in={input.r1} out={output.r1}
-sortbyname.sh in={input.r2} out={output.r2}
-        """
\ No newline at end of file
+# Cap BBTools at the scheduled thread count and keep its console output in the rule log.
+sortbyname.sh in={input.r1} out={output.r1} threads={threads} > {log} 2>&1
+sortbyname.sh in={input.r2} out={output.r2} threads={threads} >> {log} 2>&1
+        """
diff --git a/bin/rules/de_novo_assembly.smk b/bin/rules/de_novo_assembly.smk
index 391ce74..b36b988 100644
--- a/bin/rules/de_novo_assembly.smk
+++ b/bin/rules/de_novo_assembly.smk
@@ -13,7 +13,7 @@ rule de_novo_assembly:
         r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
         r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
         fastq_unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz",
-        cov_cutoff_file = OUT + "/subsampling/{sample}.txt",
+        cov_cutoff_file=OUT + "/subsampling/{sample}.txt",
     output:
         scaffolds=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta",
         contigs=temp(OUT + "/de_novo_assembly/{sample}/contigs.fasta"),
diff --git a/bin/rules/identify_species.smk b/bin/rules/identify_species.smk
index 019e96f..1d3e131 100644
--- a/bin/rules/identify_species.smk
+++ b/bin/rules/identify_species.smk
@@ -40,12 +40,16 @@ rule identify_species_reads:
 
         """
 
+
 rule identify_species:
     input:
         OUT + "/de_novo_assembly_filtered/{sample}.fasta",
     output:
-        kraken2_kreport=temp(OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"),
-        bracken_s=OUT + "/identify_species/contigs/{sample}/{sample}_species_content.txt",
+        kraken2_kreport=temp(
+            OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"
+        ),
+        bracken_s=OUT
+        + "/identify_species/contigs/{sample}/{sample}_species_content.txt",
         bracken_kreport=OUT
         + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
     message:
diff --git a/bin/rules/multiqc.smk b/bin/rules/multiqc.smk
index 229eefc..94ffddd 100644
--- a/bin/rules/multiqc.smk
+++ b/bin/rules/multiqc.smk
@@ -24,7 +24,8 @@ rule multiqc:
             sample=SAMPLES,
         ),
         expand(
-            OUT + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
+            OUT
+            + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
             sample=SAMPLES,
         ),
     output:
diff --git a/bin/rules/run_checkm.smk b/bin/rules/run_checkm.smk
index cf8947a..7886833 100644
--- a/bin/rules/run_checkm.smk
+++ b/bin/rules/run_checkm.smk
@@ -2,6 +2,7 @@
 ##### Scaffold analyses: QUAST, CheckM, picard, bbmap and QC-metrics    #####
 #############################################################################
 
+
 rule select_genus_checkm:
     input:
         genus_bracken=OUT
@@ -24,6 +25,7 @@ rule select_genus_checkm:
         --output {output.selected_genus} 2>&1>{log}
         """
 
+
 rule checkm:
     input:
         assembly=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta",
diff --git a/bin/rules/subsample_fastq.smk b/bin/rules/subsample_fastq.smk
index 6f299a8..e3aa137 100644
--- a/bin/rules/subsample_fastq.smk
+++ b/bin/rules/subsample_fastq.smk
@@ -3,9 +3,9 @@ rule subsample_fastq:
         r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
         r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
     output:
-        r1 = OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
-        r2 = OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
-        cov_cutoff_file = OUT + "/subsampling/{sample}.txt"
+        r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
+        r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
+        cov_cutoff_file=OUT + "/subsampling/{sample}.txt",
     message:
         "Subsampling reads for {wildcards.sample}."
     conda:
@@ -28,4 +28,4 @@ python bin/subsample_reads.py --input {input.r1} {input.r2} \
     --cov-cutoff-in {params.cov_cutoff} \
     --cov-cutoff-out {output.cov_cutoff_file} \
     --threads {threads} 2>&1>{log}
-        """
\ No newline at end of file
+        """
diff --git a/bin/select_genus_checkm.py b/bin/select_genus_checkm.py
index 0f816f5..a69c9ba 100644
--- a/bin/select_genus_checkm.py
+++ b/bin/select_genus_checkm.py
@@ -6,30 +6,45 @@
 
 
 def read_bracken_report(path_to_report: Path) -> pd.DataFrame:
-    df = pd.read_csv(path_to_report, sep='\t', header=None, names=['pct', 'count', 'count_unique', 'rank', 'taxid', 'name'])
+    df = pd.read_csv(
+        path_to_report,
+        sep="\t",
+        header=None,
+        names=["pct", "count", "count_unique", "rank", "taxid", "name"],
+    )
     df["name"] = df["name"].str.strip()
     return df
 
 
 def get_top_microbial_hit(bracken_result: pd.DataFrame) -> str:
     df_genera = bracken_result[bracken_result["rank"] == "G"]
-    top_hit = df_genera.sort_values("count", ascending=False).reset_index().loc[0, "name"]
+    top_hit = (
+        df_genera.sort_values("count", ascending=False).reset_index().loc[0, "name"]
+    )
     if top_hit == "Homo":
-        logging.warning(f"The top species is the Homo genus, indicating contamination with an eukaryote.")
-        top_hit_microbial = df_genera.sort_values("count", ascending=False).reset_index().loc[1, "name"]
+        logging.warning(
+            f"The top species is the Homo genus, indicating contamination with an eukaryote."
+        )
+        top_hit_microbial = (
+            df_genera.sort_values("count", ascending=False).reset_index().loc[1, "name"]
+        )
     else:
         top_hit_microbial = top_hit
     return top_hit_microbial
 
 
-def check_if_top_hit_is_supported(top_hit: str, path_to_list_accepted_genera: Path) -> str:
+def check_if_top_hit_is_supported(
+    top_hit: str, path_to_list_accepted_genera: Path
+) -> str:
     with open(path_to_list_accepted_genera, "r") as f:
         lines = f.readlines()
     list_accepted_genera = [accepted_genus.strip() for accepted_genus in lines]
     if top_hit in list_accepted_genera:
         selected_genus = top_hit
     else:
-        logging.warning(f"The selected species is not supported by this version of CheckM.")
+        logging.warning(
+            f"The selected species is not supported by this version of CheckM."
+        )
         selected_genus = "NOT_SUPPORTED"
     return selected_genus
 
@@ -42,7 +57,9 @@ def save_selected_genus(genus_name: str, output_path: Path) -> None:
 def main(args):
     # str "None" is provided on command line
     if args.genus == "None":
-        logging.warning(f"No genus was provided, this will be guessed from Kraken2+bracken analysis.")
+        logging.warning(
+            f"No genus was provided, this will be guessed from Kraken2+bracken analysis."
+        )
         bracken_result = read_bracken_report(args.bracken_output)
         top_hit = get_top_microbial_hit(bracken_result)
     else:
@@ -50,26 +67,30 @@ def main(args):
     selected_genus = check_if_top_hit_is_supported(top_hit, args.list_accepted_genera)
     save_selected_genus(selected_genus, args.output)
 
+
 if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--genus",
-                        type=str.capitalize,
-                        help="Genus supplied through metadata, overwriting Kraken2 analysis",
-                        default="None")
-    parser.add_argument("--bracken-output",
-                        type=Path,
-                        help="Path to bracken output report")
-    parser.add_argument("--list-accepted-genera",
-                        default=Path("files/accepted_genera_checkm.txt"),
-                        type=Path)
-    parser.add_argument("--output",
-                        type=Path,
-                        help="Path to output file",
-                        required=True)
+    parser.add_argument(
+        "--genus",
+        type=str.capitalize,
+        help="Genus supplied through metadata, overwriting Kraken2 analysis",
+        default="None",
+    )
+    parser.add_argument(
+        "--bracken-output", type=Path, help="Path to bracken output report"
+    )
+    parser.add_argument(
+        "--list-accepted-genera",
+        default=Path("files/accepted_genera_checkm.txt"),
+        type=Path,
+    )
+    parser.add_argument(
+        "--output", type=Path, help="Path to output file", required=True
+    )
 
     args = parser.parse_args()
 
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/bin/subsample_reads.py b/bin/subsample_reads.py
index e23d0a0..5a9d5fc 100644
--- a/bin/subsample_reads.py
+++ b/bin/subsample_reads.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 import math
 
+
 def estimate_genome_size(input: list) -> float:
     """
     Estimate genome size of paired-end read set using mash sketch.
@@ -14,13 +15,14 @@ def estimate_genome_size(input: list) -> float:
     with tempfile.TemporaryDirectory() as tmpdir:
         cmd_string = f"mash sketch -o {tmpdir}/tmpfile.msh -k 21 -r -m 3 {input[0]}"
         result = subprocess.run(cmd_string, capture_output=True, shell=True, check=True)
-    decoded_result = result.stderr.decode('utf-8')
-    genome_size = int(float(decoded_result.split('\n')[0].split(' ')[-1]))
+    decoded_result = result.stderr.decode("utf-8")
+    genome_size = int(float(decoded_result.split("\n")[0].split(" ")[-1]))
     # mash depth estimation underestimates actual cov often
     # coverage = float(decoded_result.split('\n')[1].split(' ')[-1]) * 2
     logging.info(f"Genome size is estimated to be {genome_size}")
     return genome_size
 
+
 def estimate_depth(input: list, genome_size) -> float:
     """
     Estimate depth of paired-end reads using seqtk size and genome size.
@@ -29,12 +31,13 @@ def estimate_depth(input: list, genome_size) -> float:
     with tempfile.TemporaryDirectory() as tmpdir:
         cmd_string = f"seqtk size {input[0]}"
         result = subprocess.run(cmd_string, capture_output=True, shell=True, check=True)
-    decoded_result = result.stdout.decode('utf-8')
-    total_nt_fw = int(decoded_result.split('\t')[1].rstrip('\n'))
+    decoded_result = result.stdout.decode("utf-8")
+    total_nt_fw = int(decoded_result.split("\t")[1].rstrip("\n"))
     coverage = (total_nt_fw * 2) / genome_size
     logging.info(f"Depth of coverage is estimated to be {coverage}")
     return coverage
 
+
 def calculate_fraction(estimated_depth: float, target_depth: int) -> float:
     """
     Calculate fraction of subsampling.
@@ -45,9 +48,12 @@ def calculate_fraction(estimated_depth: float, target_depth: int) -> float:
         logging.info(f"Estimated depth is higher than target depth, will subsample")
         logging.info(f"Subsampling using fraction {subsample_fraction}")
     else:
-        logging.info(f"Estimated depth is lower than or approx. equal to target depth, will not subsample")
+        logging.info(
+            f"Estimated depth is lower than or approx. equal to target depth, will not subsample"
+        )
     return subsample_fraction
 
+
 def subsample_reads(input: list, output: list, fraction: float, n_threads: int):
     """
     Subsample reads based on calculated fraction.
@@ -55,7 +61,9 @@ def subsample_reads(input: list, output: list, fraction: float, n_threads: int):
     if fraction < 1:
         cmd_string_r1 = f"seqtk seq -f {fraction} -s 1704 {input[0]} | pigz -p {n_threads} > {output[0]}"
         cmd_string_r2 = f"seqtk seq -f {fraction} -s 1704 {input[1]} | pigz -p {n_threads} > {output[1]}"
-        logging.info(f"Subsampling files {[str(x) for x in input]} to {[str(x) for x in output]}, resp.")
+        logging.info(
+            f"Subsampling files {[str(x) for x in input]} to {[str(x) for x in output]}, resp."
+        )
         subprocess.run(cmd_string_r1, shell=True, check=True)
         subprocess.run(cmd_string_r2, shell=True, check=True)
         logging.info("Finished subsampling reads")
@@ -67,6 +75,7 @@ def subsample_reads(input: list, output: list, fraction: float, n_threads: int):
         subprocess.run(cmd_cp_string_r2, shell=True, check=True)
         logging.info(f"Finished copying files")
 
+
 def calculate_coverage_cutoff(estimated_depth: float, target_depth: int):
     if (target_depth / estimated_depth) < 1:
         logging.info(f"Basing coverage cutoff on target depth ({target_depth})")
@@ -77,60 +86,78 @@ def calculate_coverage_cutoff(estimated_depth: float, target_depth: int):
     logging.info(f"Coverage cutoff set to {cov_cutoff}")
     return cov_cutoff
 
+
 def output_cov_cutoff(cov_cutoff, cov_cutoff_out):
     with open(cov_cutoff_out, "w") as file:
         file.write(str(cov_cutoff))
 
+
 def main(args):
     genome_size = estimate_genome_size(args.input)
     coverage = estimate_depth(args.input, genome_size)
     fraction = calculate_fraction(coverage, args.depth)
     subsample_reads(args.input, args.output, fraction, args.threads)
     if args.cov_cutoff_in == "calculate":
-        logging.info(f"Coverage cutoff was not specified on command line, will calculate value to use")
+        logging.info(
+            f"Coverage cutoff was not specified on command line, will calculate value to use"
+        )
         cov_cutoff = calculate_coverage_cutoff(coverage, args.depth)
     else:
-        logging.info(f"Coverage cutoff was set to \"{str(args.cov_cutoff_in)}\" on command line, will pass on this value")
+        logging.info(
+            f'Coverage cutoff was set to "{str(args.cov_cutoff_in)}" on command line, will pass on this value'
+        )
         cov_cutoff = args.cov_cutoff_in
     output_cov_cutoff(cov_cutoff, args.cov_cutoff_out)
-    
-    
+
+
 if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-i", "--input",
-                        help="Paired input FASTQ files",
-                        type=Path,
-                        nargs=2,
-                        metavar="INPUT_FILE",
-                        required=True)
-    parser.add_argument("-d", "--depth",
-                        help="Target depth [100]",
-                        default=100,
-                        metavar="INT",
-                        type=int)
-    parser.add_argument("--cov-cutoff-in",
-                        help="Input argument for coverage cutoff setting in SPAdes")
-    parser.add_argument("--cov-cutoff-out",
-                        help="Output file for new coverage cutoff",
-                        type=Path,
-                        metavar="STR")
-    parser.add_argument("-o", "--output",
-                        help="Paired output FASTQ files",
-                        type=Path,
-                        nargs=2,
-                        metavar="STR")
-    parser.add_argument("-t", "--threads",
-                        help="Number of threads to use for pigz [1]",
-                        default=1,
-                        type=int)
-    
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Paired input FASTQ files",
+        type=Path,
+        nargs=2,
+        metavar="INPUT_FILE",
+        required=True,
+    )
+    parser.add_argument(
+        "-d", "--depth", help="Target depth [100]", default=100, metavar="INT", type=int
+    )
+    parser.add_argument(
+        "--cov-cutoff-in", help="Input argument for coverage cutoff setting in SPAdes"
+    )
+    parser.add_argument(
+        "--cov-cutoff-out",
+        help="Output file for new coverage cutoff",
+        type=Path,
+        metavar="STR",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Paired output FASTQ files",
+        type=Path,
+        nargs=2,
+        metavar="STR",
+    )
+    parser.add_argument(
+        "-t",
+        "--threads",
+        help="Number of threads to use for pigz [1]",
+        default=1,
+        type=int,
+    )
+
     args = parser.parse_args()
 
-    logging.basicConfig(level=logging.INFO,
-                            format='[%(asctime)s] %(message)s',
-                            datefmt='%Y/%m/%d %H:%M:%S')
-    
-    main(args)
\ No newline at end of file
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s] %(message)s",
+        datefmt="%Y/%m/%d %H:%M:%S",
+    )
+
+    main(args)
diff --git a/juno_assembly.py b/juno_assembly.py
index 5bb0f48..384cea4 100644
--- a/juno_assembly.py
+++ b/juno_assembly.py
@@ -123,7 +123,7 @@ def __call__(self, *args, **kwargs) -> None:  # type: ignore
             metavar="STR/INT",
             default="calculate",
             help="SPAdes k-mer coverage cut-off to use. Can be calculate, off, or a specified integer. "
-            "\"Calculate\" lets the script calculate a sample-specific value that works for most use cases.",
+            '"Calculate" lets the script calculate a sample-specific value that works for most use cases.',
         )
         self.add_argument(
             "-cl",