Skip to content

Commit

Permalink
Merge pull request #33 from RIVM-bioinformatics/increase_repeatability
Browse files Browse the repository at this point in the history
Increase repeatability
  • Loading branch information
boasvdp authored Aug 10, 2023
2 parents 8d4bbff + c907f79 commit c6b7546
Show file tree
Hide file tree
Showing 11 changed files with 163 additions and 77 deletions.
5 changes: 4 additions & 1 deletion bin/parse_bbtools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas
import re


def parse_bbtools_perScaffold(input_bbtools, output_bbtools):
# create an empty dataframe with the right headers
bbtools_headers_file = open(input_bbtools[0], "r")
Expand All @@ -11,7 +12,9 @@ def parse_bbtools_perScaffold(input_bbtools, output_bbtools):
# loop over the bbtools files (perscaffold) and add them to the dataframe
for input_file in str(input_bbtools).split():
# get the sample name from the file name
sample_name = re.sub("_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1])
sample_name = re.sub(
"_perMinLenFiltScaffold.tsv", "", str(input_file).split("sample/")[1]
)

# read the data into a pandas dataframe
sample_dataframe = pandas.read_csv(input_file, sep="\t")
Expand Down
5 changes: 4 additions & 1 deletion bin/parse_bbtools_summary.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import argparse
import re


def parse_bbtools_summary(input_bbtools, output_bbtools):
summary_dict = {}

for input_file in input_bbtools:
# get the sample name from the file name
sample_name = re.sub("_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1])
sample_name = re.sub(
"_MinLenFiltSummary.tsv", "", str(input_file).split("sample/")[1]
)
variable_name_list = []
value_list = []

Expand Down
29 changes: 27 additions & 2 deletions bin/rules/clean_fastq.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ rule clean_fastq:
input:
lambda wildcards: (SAMPLES[wildcards.sample][i] for i in ["R1", "R2"]),
output:
r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
r1=temp(OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz"),
r2=temp(OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz"),
unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz",
html=OUT + "/clean_fastq/{sample}_fastp.html",
json=OUT + "/clean_fastq/{sample}_fastp.json",
Expand Down Expand Up @@ -41,3 +41,28 @@ rule clean_fastq:
--correction \
--length_required {params.min_length} > {log} 2>&1
"""


rule sort_paired_fastq:
    # Sort the cleaned paired reads by read name so that downstream steps
    # always receive the reads in the same order.
    # Inputs are the unsorted R1/R2 files under clean_unsorted_fastq; the
    # sorted copies are written to clean_fastq.
    input:
        r1=OUT + "/clean_unsorted_fastq/{sample}_pR1.fastq.gz",
        r2=OUT + "/clean_unsorted_fastq/{sample}_pR2.fastq.gz",
    output:
        r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
        r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
    message:
        "Sorting cleaned paired reads to increase repeatability"
    conda:
        "../../envs/scaffold_analyses.yaml"
    container:
        "docker://staphb/bbtools:38.86"
    # Thread and memory settings reuse the "pileup" entries of the config.
    threads: int(config["threads"]["pileup"])
    resources:
        mem_gb=config["mem_gb"]["pileup"],
    log:
        OUT + "/log/sort_paired_fastq/sort_paired_fastq_{sample}.log",
    shell:
        # Both invocations write to the declared log (the second appends), and
        # the tool is limited to the threads reserved for this job.
        """
sortbyname.sh in={input.r1} out={output.r1} threads={threads} > {log} 2>&1
sortbyname.sh in={input.r2} out={output.r2} threads={threads} >> {log} 2>&1
        """
2 changes: 1 addition & 1 deletion bin/rules/de_novo_assembly.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule de_novo_assembly:
r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
fastq_unpaired=OUT + "/clean_fastq/{sample}_unpaired_joined.fastq.gz",
cov_cutoff_file = OUT + "/subsampling/{sample}.txt",
cov_cutoff_file=OUT + "/subsampling/{sample}.txt",
output:
scaffolds=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta",
contigs=temp(OUT + "/de_novo_assembly/{sample}/contigs.fasta"),
Expand Down
8 changes: 6 additions & 2 deletions bin/rules/identify_species.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,16 @@ rule identify_species_reads:
"""


rule identify_species:
input:
OUT + "/de_novo_assembly_filtered/{sample}.fasta",
output:
kraken2_kreport=temp(OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"),
bracken_s=OUT + "/identify_species/contigs/{sample}/{sample}_species_content.txt",
kraken2_kreport=temp(
OUT + "/identify_species/contigs/{sample}/{sample}.kreport2"
),
bracken_s=OUT
+ "/identify_species/contigs/{sample}/{sample}_species_content.txt",
bracken_kreport=OUT
+ "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
message:
Expand Down
3 changes: 2 additions & 1 deletion bin/rules/multiqc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ rule multiqc:
sample=SAMPLES,
),
expand(
OUT + "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
OUT
+ "/identify_species/contigs/{sample}/{sample}_bracken_species.kreport2",
sample=SAMPLES,
),
output:
Expand Down
2 changes: 2 additions & 0 deletions bin/rules/run_checkm.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
##### Scaffold analyses: QUAST, CheckM, picard, bbmap and QC-metrics #####
#############################################################################


rule select_genus_checkm:
input:
genus_bracken=OUT
Expand All @@ -24,6 +25,7 @@ rule select_genus_checkm:
--output {output.selected_genus} 2>&1>{log}
"""


rule checkm:
input:
assembly=OUT + "/de_novo_assembly/{sample}/scaffolds.fasta",
Expand Down
8 changes: 4 additions & 4 deletions bin/rules/subsample_fastq.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ rule subsample_fastq:
r1=OUT + "/clean_fastq/{sample}_pR1.fastq.gz",
r2=OUT + "/clean_fastq/{sample}_pR2.fastq.gz",
output:
r1 = OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
r2 = OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
cov_cutoff_file = OUT + "/subsampling/{sample}.txt"
r1=OUT + "/subsampled_fastq/{sample}_pR1.fastq.gz",
r2=OUT + "/subsampled_fastq/{sample}_pR2.fastq.gz",
cov_cutoff_file=OUT + "/subsampling/{sample}.txt",
message:
"Subsampling reads for {wildcards.sample}."
conda:
Expand All @@ -28,4 +28,4 @@ python bin/subsample_reads.py --input {input.r1} {input.r2} \
--cov-cutoff-in {params.cov_cutoff} \
--cov-cutoff-out {output.cov_cutoff_file} \
--threads {threads} 2>&1>{log}
"""
"""
65 changes: 43 additions & 22 deletions bin/select_genus_checkm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,45 @@


def read_bracken_report(path_to_report: Path) -> pd.DataFrame:
    """Load a bracken report into a DataFrame.

    The report is read as a tab-separated file without a header row; the
    columns are named ``pct``, ``count``, ``count_unique``, ``rank``,
    ``taxid`` and ``name``.

    Args:
        path_to_report: Path to the bracken report file.

    Returns:
        DataFrame with one row per report line. Surrounding whitespace is
        removed from the ``name`` column so names can be compared directly.
    """
    df = pd.read_csv(
        path_to_report,
        sep="\t",
        header=None,
        names=["pct", "count", "count_unique", "rank", "taxid", "name"],
    )
    # Names may carry padding in the report; strip it before any comparison.
    df["name"] = df["name"].str.strip()
    return df


def get_top_microbial_hit(bracken_result: pd.DataFrame) -> str:
    """Return the most abundant genus that is not ``Homo``.

    Only rows whose ``rank`` equals ``"G"`` are considered. They are ordered
    by ``count`` (highest first). If the top genus is ``Homo`` a warning is
    logged and the second genus is returned instead.

    Args:
        bracken_result: DataFrame with at least the columns ``rank``,
            ``count`` and ``name``.

    Returns:
        Name of the selected genus.

    Raises:
        KeyError: If there is no genus-level row, or if ``Homo`` is the only
            genus-level row.
    """
    df_genera = bracken_result[bracken_result["rank"] == "G"]
    # Sort once, with a stable algorithm, so that genera with equal counts
    # keep their report order and the selection is repeatable.
    ranked_names = df_genera.sort_values(
        "count", ascending=False, kind="mergesort"
    )["name"].tolist()

    if not ranked_names:
        raise KeyError("No genus-level (rank 'G') rows found in bracken result.")

    top_hit = ranked_names[0]
    if top_hit != "Homo":
        return top_hit

    logging.warning(
        "The top species is the Homo genus, indicating contamination with an eukaryote."
    )
    if len(ranked_names) < 2:
        raise KeyError(
            "Homo is the only genus in bracken result; no microbial genus found."
        )
    return ranked_names[1]


def check_if_top_hit_is_supported(
    top_hit: str, path_to_list_accepted_genera: Path
) -> str:
    """Check a genus name against the list of accepted genera.

    Args:
        top_hit: Genus name to look up.
        path_to_list_accepted_genera: Text file with one accepted genus per
            line; surrounding whitespace on each line is ignored.

    Returns:
        ``top_hit`` if it is listed in the file, otherwise the string
        ``"NOT_SUPPORTED"`` (a warning is logged in that case).
    """
    with open(path_to_list_accepted_genera, "r") as f:
        accepted_genera = {line.strip() for line in f}
    # A blank line in the list is not a genus; without this an empty
    # top_hit would be reported as supported.
    accepted_genera.discard("")

    if top_hit in accepted_genera:
        return top_hit

    logging.warning(
        "The selected species is not supported by this version of CheckM."
    )
    return "NOT_SUPPORTED"

Expand All @@ -42,34 +57,40 @@ def save_selected_genus(genus_name: str, output_path: Path) -> None:
def main(args):
    """Select the genus to use and write it to the output file.

    If ``args.genus`` is the string ``"None"``, the genus is taken from the
    bracken report at ``args.bracken_output``; otherwise ``args.genus`` is
    used as given. The chosen genus is then checked against
    ``args.list_accepted_genera`` and the result is saved to ``args.output``.

    Args:
        args: Parsed command-line arguments providing ``genus``,
            ``bracken_output``, ``list_accepted_genera`` and ``output``.

    Raises:
        ValueError: If no genus was supplied and ``args.bracken_output`` is
            missing as well.
    """
    # str "None" is provided on command line
    if args.genus == "None":
        logging.warning(
            "No genus was provided, this will be guessed from Kraken2+bracken analysis."
        )
        # Fail with a clear message instead of handing None to the reader.
        if args.bracken_output is None:
            raise ValueError(
                "--bracken-output is required when no genus is provided."
            )
        bracken_result = read_bracken_report(args.bracken_output)
        top_hit = get_top_microbial_hit(bracken_result)
    else:
        top_hit = args.genus
    selected_genus = check_if_top_hit_is_supported(top_hit, args.list_accepted_genera)
    save_selected_genus(selected_genus, args.output)


if __name__ == "__main__":
    # Command-line entry point: parse the options and run main() once.
    import argparse

    parser = argparse.ArgumentParser()

    # The value is capitalised on parsing, so the default "None" marks
    # "no genus supplied".
    parser.add_argument(
        "--genus",
        type=str.capitalize,
        help="Genus supplied through metadata, overwriting Kraken2 analysis",
        default="None",
    )
    parser.add_argument(
        "--bracken-output", type=Path, help="Path to bracken output report"
    )
    parser.add_argument(
        "--list-accepted-genera",
        default=Path("files/accepted_genera_checkm.txt"),
        type=Path,
    )
    parser.add_argument(
        "--output", type=Path, help="Path to output file", required=True
    )

    args = parser.parse_args()

    main(args)
Loading

0 comments on commit c6b7546

Please sign in to comment.