Merge pull request #33 from RIVM-bioinformatics/increase_repeatability

Increase repeatability
RIVM-bioinformatics · Aug 10, 2023 · c6b7546
2 parents 8d4bbff + c907f79
commit c6b7546
Show file tree

Hide file tree

Showing 11 changed files with 163 additions and 77 deletions.
diff --git a/bin/parse_bbtools.py b/bin/parse_bbtools.py
@@ -1,6 +1,7 @@
 import pandas
 import re
 
+
 def parse_bbtools_perScaffold(input_bbtools, output_bbtools):
     # create an empty dataframe with the right headers
     bbtools_headers_file = open(input_bbtools[0], "r")
@@ -11,7 +12,9 @@ def parse_bbtools_perScaffold(input_bbtools, output_bbtools):
     # loop over the bbtools files (perscaffold) and add them to the dataframe
     for input_file in str(input_bbtools).split():
         # get the sample name from the file name
-        sample_name = re.sub("_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1])
+        sample_name = re.sub(
+            "_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1]
+        )
 
         # read the data into a pandas dataframe
         sample_dataframe = pandas.read_csv(input_file, sep="\t")

diff --git a/bin/parse_bbtools_summary.py b/bin/parse_bbtools_summary.py
@@ -1,12 +1,15 @@
 import argparse
 import re
 
+
 def parse_bbtools_summary(input_bbtools, output_bbtools):
     summary_dict = {}
 
     for input_file in input_bbtools:
         # get the sample name from the file name
-        sample_name = re.sub("_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1])
+        sample_name = re.sub(
+            "_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1]
+        )
         variable_name_list = []
         value_list = []
 

diff --git a/bin/rules/clean_fastq.smk b/bin/rules/clean_fastq.smk
@@ -2,8 +2,8 @@ rule clean_fastq:
     input:
         lambda wildcards: (SAMPLES[wildcards.sample][i] for i in ["R1", "R2"]),
     output:
-        r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
-        r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
+        r1=temp(OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz"),
+        r2=temp(OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz"),
         unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz",
         html=OUT + "/clean_fastq/{sample}_fastp.html",
         json=OUT + "/clean_fastq/{sample}_fastp.json",
@@ -41,3 +41,28 @@ rule clean_fastq:
             --correction \
             --length_required {params.min_length} > {log} 2>&1
         """
+
+
+rule sort_paired_fastq:
+    input:
+        r1=OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz",  # temp() outputs of clean_fastq
+        r2=OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz",
+    output:
+        r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
+        r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
+    message:
+        "Sorting cleaned paired reads to increase repeatability"
+    conda:
+        "../../envs/scaffold_analyses.yaml"
+    container:
+        "docker://staphb/bbtools:38.86"
+    threads: int(config["threads"]["pileup"])
+    resources:
+        mem_gb=config["mem_gb"]["pileup"],
+    log:
+        OUT + "/log/sort_paired_fastq/sort_paired_fastq_{sample}.log",  # R1 run truncates, R2 run appends
+    shell:
+        """
+sortbyname.sh in={input.r1} out={output.r1} > {log} 2>&1
+sortbyname.sh in={input.r2} out={output.r2} >> {log} 2>&1
+        """
diff --git a/bin/rules/de_novo_assembly.smk b/bin/rules/de_novo_assembly.smk
@@ -13,7 +13,7 @@ rule de_novo_assembly:
         r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
         r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
         fastq_unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz",
-        cov_cutoff_file = OUT + "/subsampling/{sample}.txt",
+        cov_cutoff_file=OUT + "/subsampling/{sample}.txt",
     output:
         scaffolds=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta",
         contigs=temp(OUT + "/de_novo_assembly/{sample}/contigs.fasta"),

diff --git a/bin/rules/identify_species.smk b/bin/rules/identify_species.smk
@@ -40,12 +40,16 @@ rule identify_species_reads:
 
         """
 
+
 rule identify_species:
     input:
         OUT + "/de_novo_assembly_filtered/{sample}.fasta",
     output:
-        kraken2_kreport=temp(OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"),
-        bracken_s=OUT + "/identify_species/contigs/{sample}/{sample}_species_content.txt",
+        kraken2_kreport=temp(
+            OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"
+        ),
+        bracken_s=OUT
+        + "/identify_species/contigs/{sample}/{sample}_species_content.txt",
         bracken_kreport=OUT
         + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
     message:

diff --git a/bin/rules/multiqc.smk b/bin/rules/multiqc.smk
@@ -24,7 +24,8 @@ rule multiqc:
             sample=SAMPLES,
         ),
         expand(
-            OUT + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
+            OUT
+            + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
             sample=SAMPLES,
         ),
     output:

diff --git a/bin/rules/run_checkm.smk b/bin/rules/run_checkm.smk
@@ -2,6 +2,7 @@
 ##### Scaffold analyses: QUAST, CheckM, picard, bbmap and QC-metrics    #####
 #############################################################################
 
+
 rule select_genus_checkm:
     input:
         genus_bracken=OUT
@@ -24,6 +25,7 @@ rule select_genus_checkm:
         --output {output.selected_genus} 2>&1>{log}
         """
 
+
 rule checkm:
     input:
         assembly=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta",

diff --git a/bin/rules/subsample_fastq.smk b/bin/rules/subsample_fastq.smk
@@ -3,9 +3,9 @@ rule subsample_fastq:
         r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
         r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
     output:
-        r1 = OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
-        r2 = OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
-        cov_cutoff_file = OUT + "/subsampling/{sample}.txt"
+        r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
+        r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
+        cov_cutoff_file=OUT + "/subsampling/{sample}.txt",
     message:
         "Subsampling reads for {wildcards.sample}."
     conda:
@@ -28,4 +28,4 @@ python bin/subsample_reads.py --input {input.r1} {input.r2} \
     --cov-cutoff-in {params.cov_cutoff} \
     --cov-cutoff-out {output.cov_cutoff_file} \
     --threads {threads} 2>&1>{log}
-        """
+        """
diff --git a/bin/select_genus_checkm.py b/bin/select_genus_checkm.py
@@ -6,30 +6,45 @@
 
 
 def read_bracken_report(path_to_report: Path) -> pd.DataFrame:
-    df = pd.read_csv(path_to_report, sep='\t', header=None, names=['pct', 'count', 'count_unique', 'rank', 'taxid', 'name'])
+    df = pd.read_csv(
+        path_to_report,
+        sep="\t",
+        header=None,
+        names=["pct", "count", "count_unique", "rank", "taxid", "name"],
+    )
     df["name"] = df["name"].str.strip()
     return df
 
 
 def get_top_microbial_hit(bracken_result: pd.DataFrame) -> str:
     df_genera = bracken_result[bracken_result["rank"] == "G"]
-    top_hit = df_genera.sort_values("count", ascending=False).reset_index().loc[0, "name"]
+    top_hit = (
+        df_genera.sort_values("count", ascending=False).reset_index().loc[0, "name"]
+    )
     if top_hit == "Homo":
-        logging.warning(f"The top species is the Homo genus, indicating contamination with an eukaryote.")
-        top_hit_microbial = df_genera.sort_values("count", ascending=False).reset_index().loc[1, "name"]
+        logging.warning(
+            f"The top species is the Homo genus, indicating contamination with an eukaryote."
+        )
+        top_hit_microbial = (
+            df_genera.sort_values("count", ascending=False).reset_index().loc[1, "name"]
+        )
     else:
         top_hit_microbial = top_hit
     return top_hit_microbial
 
 
-def check_if_top_hit_is_supported(top_hit: str, path_to_list_accepted_genera: Path) -> str:
+def check_if_top_hit_is_supported(
+    top_hit: str, path_to_list_accepted_genera: Path
+) -> str:
     with open(path_to_list_accepted_genera, "r") as f:
         lines = f.readlines()
     list_accepted_genera = [accepted_genus.strip() for accepted_genus in lines]
     if top_hit in list_accepted_genera:
         selected_genus = top_hit
     else:
-        logging.warning(f"The selected species is not supported by this version of CheckM.")
+        logging.warning(
+            f"The selected species is not supported by this version of CheckM."
+        )
         selected_genus = "NOT_SUPPORTED"
     return selected_genus
 
@@ -42,34 +57,40 @@ def save_selected_genus(genus_name: str, output_path: Path) -> None:
 def main(args):
     # str "None" is provided on command line
     if args.genus == "None":
-        logging.warning(f"No genus was provided, this will be guessed from Kraken2+bracken analysis.")
+        logging.warning(
+            f"No genus was provided, this will be guessed from Kraken2+bracken analysis."
+        )
         bracken_result = read_bracken_report(args.bracken_output)
         top_hit = get_top_microbial_hit(bracken_result)
     else:
         top_hit = args.genus
     selected_genus = check_if_top_hit_is_supported(top_hit, args.list_accepted_genera)
     save_selected_genus(selected_genus, args.output)
 
+
 if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--genus",
-                        type=str.capitalize,
-                        help="Genus supplied through metadata, overwriting Kraken2 analysis",
-                        default="None")
-    parser.add_argument("--bracken-output",
-                        type=Path,
-                        help="Path to bracken output report")
-    parser.add_argument("--list-accepted-genera",
-                        default=Path("files/accepted_genera_checkm.txt"),
-                        type=Path)
-    parser.add_argument("--output",
-                        type=Path,
-                        help="Path to output file",
-                        required=True)
+    parser.add_argument(
+        "--genus",
+        type=str.capitalize,
+        help="Genus supplied through metadata, overwriting Kraken2 analysis",
+        default="None",
+    )
+    parser.add_argument(
+        "--bracken-output", type=Path, help="Path to bracken output report"
+    )
+    parser.add_argument(
+        "--list-accepted-genera",
+        default=Path("files/accepted_genera_checkm.txt"),
+        type=Path,
+    )
+    parser.add_argument(
+        "--output", type=Path, help="Path to output file", required=True
+    )
 
     args = parser.parse_args()
 
-    main(args)
+    main(args)