-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapollo_mapping.py
161 lines (140 loc) · 5.42 KB
/
apollo_mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Apollo mapping, based on Juno template
Authors: Roxanne Wolthuis, Boas van der Putten
Organization: Rijksinstituut voor Volksgezondheid en Milieu (RIVM)
Department: Infektieziekteonderzoek, Diagnostiek en Laboratorium
Surveillance (IDS), Bacteriologie (BPD)
Date: 07-04-2023
"""
from pathlib import Path
import pathlib
import yaml
import argparse
import sys
from dataclasses import dataclass, field
from juno_library import Pipeline
from typing import Optional
from version import __package_name__, __version__, __description__
def main() -> None:
    """Build the Apollo mapping pipeline object and run it."""
    ApolloMapping().run()
@dataclass
class ApolloMapping(Pipeline):
    """Pipeline wrapper for reference mapping analysis of fungal genomes.

    Extends the ``Pipeline`` base class with the Apollo-specific command-line
    arguments (species, reference genome, Kraken2 database directory and the
    fastq cleaning thresholds) and gathers the resulting values in
    ``self.user_parameters`` during ``setup``.
    """

    pipeline_name: str = __package_name__
    pipeline_version: str = __version__
    input_type: str = "fastq"

    # Default reference genome per supported species. Deliberately left
    # un-annotated (like ``species_options``) so that @dataclass does not turn
    # it into an instance field with a mutable default.
    _default_references = {
        "candida_auris": Path(
            "/mnt/db/apollo/mapping/candida_auris/GCF_003013715.1.fasta"
        ),
        "aspergillus_fumigatus": Path(
            "/mnt/db/apollo/mapping/aspergillus_fumigatus/CEA10.fasta"
        ),
    }
    # Derived from the table above so the accepted --species choices can never
    # drift away from the species that actually have a default reference.
    species_options = list(_default_references)

    def _add_args_to_parser(self) -> None:
        """Register the Apollo-specific arguments on top of the base parser."""
        super()._add_args_to_parser()

        self.parser.description = (
            "Apollo mapping pipelines for reference mapping analysis of fungal genomes."
        )

        self.add_argument(
            "-s",
            "--species",
            type=str.lower,  # makes the species choice case-insensitive
            metavar="STR",
            help=f"Species to use, choose from: {self.species_options}",
            required=True,
            dest="species",
            choices=self.species_options,
        )
        self.add_argument(
            "--reference",
            type=Path,
            metavar="FILE",
            dest="custom_reference",
            help="Reference genome to use default is chosen based on species argument, defaults per species can be found in: /mnt/db/apollo/mapping/[species]",
            required=False,
        )
        self.add_argument(
            "--db-dir",
            type=Path,
            default="/mnt/db/apollo/kraken_db_apollo",
            metavar="DIR",
            help="Kraken2 database directory (should include fungi!).",
        )
        self.add_argument(
            "-mpt",
            "--mean-quality-threshold",
            type=int,
            metavar="INT",
            default=28,
            help="Phred score to be used as threshold for cleaning (filtering) fastq files.",
        )
        self.add_argument(
            "-ws",
            "--window-size",
            type=int,
            metavar="INT",
            default=5,
            help="Window size to use for cleaning (filtering) fastq files.",
        )
        self.add_argument(
            "-ml",
            "--minimum-length",
            type=int,
            metavar="INT",
            default=50,
            help="Minimum length for fastq reads to be kept after trimming.",
        )

    def _parse_args(self) -> argparse.Namespace:
        """Parse the command line and copy the Apollo options onto ``self``.

        Returns:
            The namespace produced by the base class parser.
        """
        args = super()._parse_args()

        # Optional arguments are loaded into self here
        self.db_dir: Path = args.db_dir
        self.mean_quality_threshold: int = args.mean_quality_threshold
        self.window_size: int = args.window_size
        self.min_read_length: int = args.minimum_length
        # The effective reference is only decided in setup().
        self.reference: Optional[Path] = None
        # None when --reference was not given on the command line.
        self.custom_reference: Optional[Path] = args.custom_reference
        self.time_limit: int = args.time_limit
        self.species: str = args.species

        return args

    def setup(self) -> None:
        """Finalise the configuration after argument parsing.

        Extends the singularity bind arguments with the database directory,
        enforces a minimum time limit, selects the reference genome, loads
        ``config/pipeline_parameters.yaml`` into ``self.snakemake_config`` and
        fills ``self.user_parameters``.
        """
        super().setup()

        if self.snakemake_args["use_singularity"]:
            self.snakemake_args["singularity_args"] = " ".join(
                [
                    self.snakemake_args["singularity_args"],
                    f"--bind {self.db_dir}:{self.db_dir}",
                ]  # paths that singularity should be able to read from can be bound by adding to the above list
            )

        # Raise time_limit to at least 300; a larger value given on the
        # command line is kept as is.
        self.time_limit = max(self.time_limit, 300)

        # Select the default reference based on species; a user-supplied
        # reference takes precedence over it.
        if self.species in self._default_references:
            self.reference = self._default_references[self.species]

        if self.custom_reference is not None:
            print(
                "A reference genome was specified by the user, which may not be the default reference genome for this species."
            )
            self.reference = self.custom_reference

        print(f"Running pipeline for {self.species} with reference: {self.reference}.")

        parameters_file = Path(__file__).parent.joinpath(
            "config/pipeline_parameters.yaml"
        )
        with open(parameters_file, encoding="utf-8") as f:
            # safe_load returns None for an empty document; fall back to an
            # empty mapping so the update below does not raise a TypeError.
            parameters_dict = yaml.safe_load(f) or {}
        self.snakemake_config.update(parameters_dict)

        self.user_parameters = {
            "input_dir": str(self.input_dir),
            "output_dir": str(self.output_dir),
            "exclusion_file": str(self.exclusion_file),
            "db_dir": str(self.db_dir),
            "mean_quality_threshold": int(self.mean_quality_threshold),
            "window_size": int(self.window_size),
            "min_read_length": int(self.min_read_length),
            "reference": str(self.reference),
            "use_singularity": str(self.snakemake_args["use_singularity"]),
            "time-limit": str(self.time_limit),
            "species": str(self.species),
        }
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()