diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 0000000..97ff8f8 --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,93 @@ +name: build containers and run tests + +on: + pull_request: + branches: + - 'main' + workflow_dispatch: + + +jobs: + Setup_and_build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + with: + cache-environment: true + post-cleanup: 'all' + environment-file: env.yml + init-shell: bash + + - name: Install local python package + run: | + pip install . --no-deps + shell: micromamba-shell {0} + + - name: build containers + run: | + python containers/build_containers.py + env: + TOKEN: ${{ secrets.GITHUB_TOKEN }} + shell: micromamba-shell {0} + + - name: zip built containers + run: | + cd ./containers/ + tar -czvf containers.tar.gz builtcontainers.json $(find . -type f -name "*.tar" -printf '%f ') + + + - name: Upload container artifacts + uses: actions/upload-artifact@v3 + with: + name: built_containers + path: ./containers/containers.tar.gz + + Test: + runs-on: ubuntu-latest + needs: Setup_and_build + steps: + - uses: actions/checkout@v3 + + - uses: actions/download-artifact@v3 + with: + name: built_containers + + - name: move artifact + run: | + mv ./containers.tar.gz ./containers/containers.tar.gz + + - name: unzip built containers + run: | + cd ./containers/ + tar -xzvf containers.tar.gz + cd .. + + - name: Setup Apptainer + uses: eWaterCycle/setup-apptainer@v2 + + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + with: + cache-environment: true + post-cleanup: 'all' + environment-file: env.yml + init-shell: bash + + - name: Install local python package + run: | + pip install . --no-deps + shell: micromamba-shell {0} + + - name: convert containers + run: | + python containers/convert_artifact_containers_for_apptainer.py + + - name: download existing containers + run: | + python containers/pull_published_containers.py + shell: micromamba-shell {0} + + ## rest of the testing suite here \ No newline at end of file diff --git a/.github/workflows/publish_containers.yaml b/.github/workflows/publish_containers.yaml new file mode 100644 index 0000000..2bf870c --- /dev/null +++ b/.github/workflows/publish_containers.yaml @@ -0,0 +1,63 @@ +name: Publish containers + +on: + release: + types: + - published + workflow_dispatch: + + +jobs: + Upload: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Download artifact + id: download-artifact + uses: dawidd6/action-download-artifact@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + workflow: build_and_test.yml + name: built_containers + skip_unpack: false + + - name: move artifact + run: | + mv ./containers.tar.gz ./containers/containers.tar.gz + + - name: unzip built containers + run: | + cd ./containers/ + tar -xzvf containers.tar.gz + cd .. + + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + with: + cache-environment: true + post-cleanup: 'all' + environment-file: env.yml + init-shell: bash + + - name: Install local python package + run: | + pip install . 
--no-deps + shell: micromamba-shell {0} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Add artifacted containers to docker daemon + run: | + python containers/add_OCI_to_docker_engine.py + shell: micromamba-shell {0} + + - name: tag and push containers + run: | + python containers/tag_and_push_containers.py + shell: micromamba-shell {0} \ No newline at end of file diff --git a/ViroConstrictor/__main__.py b/ViroConstrictor/__main__.py index 91d6be4..f05919f 100644 --- a/ViroConstrictor/__main__.py +++ b/ViroConstrictor/__main__.py @@ -23,6 +23,10 @@ from ViroConstrictor.runconfigs import GetSnakemakeRunDetails, WriteYaml from ViroConstrictor.runreport import WriteReport from ViroConstrictor.update import update +from ViroConstrictor.workflow.containers import ( + construct_container_bind_args, + download_containers, +) def get_preset_warning_list( @@ -67,7 +71,13 @@ def get_preset_warning_list( This applies to the following samples:\n{''.join(samples)}""" preset_score_warnings.append(warn) - p_fallbackwarning_df = sample_info_df.loc[sample_info_df["PRESET_SCORE"] == 0.0] + # check if the preset score is greater than or equal to 0.0 and smaller than 0.000001 (1e-6) + # We do this because the preset score is a float and we want to check whether it falls within a certain range, as floating-point equality checks are not reliable + p_fallbackwarning_df = sample_info_df.loc[ + (sample_info_df["PRESET_SCORE"] >= 0.0) + & (sample_info_df["PRESET_SCORE"] < 1e-6) + ] + targets, presets = ( ( list(x) @@ -151,7 +161,19 @@ def main() -> NoReturn: inputs_obj=parsed_input, samplesheetfilename="samples_main" ) + # if configured to use containers, check if they are available and download them if necessary + # TODO: add the verbosity flag to the download_containers function and update the log message to reflect this + if ( + snakemake_run_details.snakemake_run_conf["use-singularity"] + and download_containers(snakemake_run_details.snakemake_run_conf) != 0 + ): + log.error( + "Failed to download containers required for workflow.\nPlease check the logs and your settings for more information and try again later." 
+ ) + sys.exit(1) + log.info(f"{'='*20} [bold yellow] Starting Main Workflow [/bold yellow] {'='*20}") + status: bool = False if parsed_input.user_config["COMPUTING"]["compmode"] == "local": status = snakemake.snakemake( @@ -160,22 +182,29 @@ def main() -> NoReturn: cores=snakemake_run_details.snakemake_run_conf["cores"], use_conda=snakemake_run_details.snakemake_run_conf["use-conda"], conda_frontend="mamba", + use_singularity=snakemake_run_details.snakemake_run_conf["use-singularity"], + singularity_args=construct_container_bind_args(parsed_input.samples_dict), jobname=snakemake_run_details.snakemake_run_conf["jobname"], latency_wait=snakemake_run_details.snakemake_run_conf["latency-wait"], dryrun=snakemake_run_details.snakemake_run_conf["dryrun"], + force_incomplete=snakemake_run_details.snakemake_run_conf["force-incomplete"], configfiles=[ WriteYaml( snakemake_run_details.snakemake_run_parameters, f"{parsed_input.workdir}/config/run_params.yaml", - ) + ), + WriteYaml( + snakemake_run_details.snakemake_run_conf, + f"{parsed_input.workdir}/config/run_configs.yaml", + ), ], - restart_times=3, - keepgoing=True, + restart_times=snakemake_run_details.snakemake_run_conf["restart-times"], + keepgoing=snakemake_run_details.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=parsed_input.logfile) ], - printshellcmds=False, + printshellcmds=snakemake_run_details.snakemake_run_conf["printshellcmds"], ) if parsed_input.user_config["COMPUTING"]["compmode"] == "grid": status = snakemake.snakemake( @@ -185,23 +214,31 @@ def main() -> NoReturn: nodes=snakemake_run_details.snakemake_run_conf["cores"], use_conda=snakemake_run_details.snakemake_run_conf["use-conda"], conda_frontend="mamba", + use_singularity=snakemake_run_details.snakemake_run_conf["use-singularity"], + singularity_args=construct_container_bind_args(parsed_input.samples_dict), jobname=snakemake_run_details.snakemake_run_conf["jobname"], latency_wait=snakemake_run_details.snakemake_run_conf["latency-wait"], drmaa=snakemake_run_details.snakemake_run_conf["drmaa"], drmaa_log_dir=snakemake_run_details.snakemake_run_conf["drmaa-log-dir"], dryrun=snakemake_run_details.snakemake_run_conf["dryrun"], + force_incomplete=snakemake_run_details.snakemake_run_conf["force-incomplete"], configfiles=[ WriteYaml( snakemake_run_details.snakemake_run_parameters, f"{parsed_input.workdir}/config/run_params.yaml", - ) + ), + WriteYaml( + snakemake_run_details.snakemake_run_conf, + f"{parsed_input.workdir}/config/run_configs.yaml", + ), ], - restart_times=3, - keepgoing=True, + restart_times=snakemake_run_details.snakemake_run_conf["restart-times"], + keepgoing=snakemake_run_details.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=parsed_input.logfile) ], + printshellcmds=snakemake_run_details.snakemake_run_conf["printshellcmds"], ) if snakemake_run_details.snakemake_run_conf["dryrun"] is False and status is True: @@ -213,7 +250,11 @@ def main() -> NoReturn: WriteYaml( snakemake_run_details.snakemake_run_parameters, f"{parsed_input.workdir}/config/run_params.yaml", - ) + ), + WriteYaml( + snakemake_run_details.snakemake_run_conf, + f"{parsed_input.workdir}/config/run_configs.yaml", + ), ], quiet=["all"], # type: ignore log_handler=[ diff --git a/ViroConstrictor/functions.py b/ViroConstrictor/functions.py index 5fd05fc..e09f952 100644 --- a/ViroConstrictor/functions.py +++ b/ViroConstrictor/functions.py @@ -83,7 
+83,7 @@ def _split_paragraphs(self, text: str) -> list[str]: """Split text in to paragraphs of like-indented lines.""" text = textwrap.dedent(text).strip() - text = re.sub("\n\n[\n]+", "\n\n", text) + text = re.sub("\n\n\n+", "\n\n", text) last_sub_indent: Optional[int] = None paragraphs: list[str] = [] @@ -157,7 +157,8 @@ def pathCompleter(self, text: str, state: int) -> str: if os.path.isdir(text): text += "/" - return list(glob.glob(f"{text}*"))[state] + # we explicitly use a list comprehension here instead of a call to the list constructor, as this would otherwise break the autocompletion functionality of paths. + return [x for x in glob.glob(f"{text}*")][state] def createListCompleter(self, ll: list[str]) -> None: """ diff --git a/ViroConstrictor/logging.py b/ViroConstrictor/logging.py index d3b3fcf..9f0b1c9 100644 --- a/ViroConstrictor/logging.py +++ b/ViroConstrictor/logging.py @@ -3,7 +3,7 @@ import os import pathlib import re -from typing import Any +from typing import Any, Callable from rich.color import ANSI_COLOR_NAMES from rich.default_styles import DEFAULT_STYLES @@ -219,8 +219,9 @@ def print_jobstatistics_logmessage(msg: dict) -> None: log.info(f"Job statistics:\n[yellow]{logmessage}[/yellow]") -logmessage_strings_info: dict[str, Any] = { +logmessage_strings_info: dict[str, Callable] = { "Activating conda environment": ColorizeLogMessagePath, + "Activating singularity image": ColorizeLogMessagePath, "Building DAG of jobs": BaseLogMessage, "Creating conda environment": ColorizeLogMessagePath, "Removing incomplete Conda environment": ColorizeLogMessagePath, @@ -262,15 +263,11 @@ def log_handler(msg: dict) -> None: loglevel = msg.get("level") logmessage = msg.get("msg") - if loglevel == "dag_debug": - return None - if loglevel == "debug": - return None - if loglevel == "shellcmd": + if loglevel in ["dag_debug", "debug", "shellcmd"]: return None if logmessage is not None and any( - x for x in logmessage_suppressed_strings_warning if x in logmessage + x in logmessage for x in logmessage_suppressed_strings_warning ): return None diff --git a/ViroConstrictor/match_ref.py b/ViroConstrictor/match_ref.py index fcffc02..5b3d104 100644 --- a/ViroConstrictor/match_ref.py +++ b/ViroConstrictor/match_ref.py @@ -1,4 +1,5 @@ import copy +import sys from typing import Literal import pandas as pd @@ -9,6 +10,10 @@ from ViroConstrictor.parser import CLIparser from ViroConstrictor.runconfigs import GetSnakemakeRunDetails, WriteYaml from ViroConstrictor.runreport import WriteReport +from ViroConstrictor.workflow.containers import ( + construct_container_bind_args, + download_containers, +) def run_snakemake( @@ -36,23 +41,30 @@ def run_snakemake( cores=snakemakedetails.snakemake_run_conf["cores"], use_conda=snakemakedetails.snakemake_run_conf["use-conda"], conda_frontend="mamba", + use_singularity=snakemakedetails.snakemake_run_conf["use-singularity"], + singularity_args=construct_container_bind_args(inputs_obj.samples_dict), jobname=snakemakedetails.snakemake_run_conf["jobname"], latency_wait=snakemakedetails.snakemake_run_conf["latency-wait"], dryrun=snakemakedetails.snakemake_run_conf["dryrun"], + force_incomplete=snakemakedetails.snakemake_run_conf["force-incomplete"], configfiles=[ WriteYaml( snakemakedetails.snakemake_run_parameters, f"{inputs_obj.workdir}/config/run_params_MR.yaml", - ) + ), + WriteYaml( + snakemakedetails.snakemake_run_conf, + f"{inputs_obj.workdir}/config/run_configs_MR.yaml", + ), ], - restart_times=3, - keepgoing=True, + 
restart_times=snakemakedetails.snakemake_run_conf["restart-times"], + keepgoing=snakemakedetails.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=inputs_obj.logfile), ], - printshellcmds=False, - scheduler="greedy", + printshellcmds=snakemakedetails.snakemake_run_conf["printshellcmds"], + scheduler=snakemakedetails.snakemake_run_conf["scheduler"], ) return snakemake( @@ -61,25 +73,32 @@ def run_snakemake( cores=snakemakedetails.snakemake_run_conf["cores"], use_conda=snakemakedetails.snakemake_run_conf["use-conda"], conda_frontend="mamba", + use_singularity=snakemakedetails.snakemake_run_conf["use-singularity"], + singularity_args=construct_container_bind_args(inputs_obj.samples_dict), jobname=snakemakedetails.snakemake_run_conf["jobname"], latency_wait=snakemakedetails.snakemake_run_conf["latency-wait"], drmaa=snakemakedetails.snakemake_run_conf["drmaa"], drmaa_log_dir=snakemakedetails.snakemake_run_conf["drmaa-log-dir"], dryrun=snakemakedetails.snakemake_run_conf["dryrun"], + force_incomplete=snakemakedetails.snakemake_run_conf["force-incomplete"], configfiles=[ WriteYaml( snakemakedetails.snakemake_run_parameters, f"{inputs_obj.workdir}/config/run_params_MR.yaml", - ) + ), + WriteYaml( + snakemakedetails.snakemake_run_conf, + f"{inputs_obj.workdir}/config/run_configs_MR.yaml", + ), ], - restart_times=3, - keepgoing=True, + restart_times=snakemakedetails.snakemake_run_conf["restart-times"], + keepgoing=snakemakedetails.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=inputs_obj.logfile), ], - printshellcmds=False, - scheduler="greedy", + printshellcmds=snakemakedetails.snakemake_run_conf["printshellcmds"], + scheduler=snakemakedetails.snakemake_run_conf["scheduler"], ) @@ -154,11 +173,19 @@ def replacement_merge_dataframe_on_cols( pd.DataFrame The merged dataframe with updated values from the override dataframe. """ - for i in zip(cols_left, cols_right): - original_df[i[0]] = original_df.apply( - lambda x: override_df[i[1]][override_df["sample"] == x["SAMPLE"]].values[0] - if x["SAMPLE"] in override_df["sample"].values and x[i[0]] != "NONE" - else x[i[0]], + + # set sample columns to str type to avoid issues with merging + original_df["SAMPLE"] = original_df["SAMPLE"].astype(str) + override_df["sample"] = override_df["sample"].astype(str) + + + for replacement_columns in zip(cols_left, cols_right): + original_df[replacement_columns[0]] = original_df.apply( + lambda x, replacement_columns=replacement_columns: ( + override_df[replacement_columns[1]][override_df["sample"] == x["SAMPLE"]].values[0] + if x["SAMPLE"] in override_df["sample"].values and x[replacement_columns[0]] != "NONE" + else x[replacement_columns[0]] + ), axis=1, ) return original_df @@ -185,6 +212,16 @@ def process_match_ref(parsed_inputs: CLIparser) -> CLIparser: log.info( f"{'='*20} [bold orange_red1] Starting Match-reference process [/bold orange_red1] {'='*20}" ) + + if ( + snakemakedetails.snakemake_run_conf["use-singularity"] is True + and download_containers(snakemakedetails.snakemake_run_conf) != 0 + ): + log.error( + "Failed to download containers required for workflow.\nPlease check the logs and your settings for more information and try again later." 
+ ) + sys.exit(1) + status = run_snakemake(inputs_obj_match_ref, snakemakedetails) workflow_state: Literal["Failed", "Success"] = ( diff --git a/ViroConstrictor/parser.py b/ViroConstrictor/parser.py index 4821011..1c6d9e3 100644 --- a/ViroConstrictor/parser.py +++ b/ViroConstrictor/parser.py @@ -530,8 +530,6 @@ def _get_paths_for_workflow( if not os.path.exists(working_directory): os.makedirs(working_directory) - if os.getcwd() != working_directory: - os.chdir(working_directory) return ( input_path, diff --git a/ViroConstrictor/runconfigs.py b/ViroConstrictor/runconfigs.py index 2e510e0..c77f4e0 100644 --- a/ViroConstrictor/runconfigs.py +++ b/ViroConstrictor/runconfigs.py @@ -52,28 +52,40 @@ def _snakemake_run_config(self) -> dict[str, Any]: cores = self._set_cores(self.inputs.flags.threads) configuration = self.inputs.user_config compmode = configuration["COMPUTING"]["compmode"] + reproduction_mode = configuration["REPRODUCTION"]["repro_method"] + reproduction_cache_path = configuration["REPRODUCTION"]["container_cache_path"] - if compmode == "grid": - queuename = configuration["COMPUTING"]["queuename"] - # threads = "{threads}" - # mem = "{resources.mem_mb}" - self.snakemake_run_conf = { - "cores": 300, - "latency-wait": 60, - "use-conda": True, - "dryrun": self.inputs.flags.dryrun, - "jobname": "ViroConstrictor_{name}.jobid{jobid}", - "drmaa": f' -q {queuename} -n {{threads}} -R "span[hosts=1]" -M {{resources.mem_mb}}', - "drmaa-log-dir": "logs/drmaa", - } - return self.snakemake_run_conf - self.snakemake_run_conf = { - "cores": cores, + base_config = { "latency-wait": 60, - "use-conda": True, "dryrun": self.inputs.flags.dryrun, "jobname": "ViroConstrictor_{name}.jobid{jobid}", + "use-conda": reproduction_mode == "conda", + "use-singularity": reproduction_mode == "containers", + "container_cache": reproduction_cache_path, + "restart-times": 3, + "keep-going": True, + "printshellcmds": False, + "scheduler": "greedy", + "force-incomplete": True, } + + if compmode == "grid": + queuename = configuration["COMPUTING"]["queuename"] + base_config.update( + { + "cores": 300, + "drmaa": f' -q {queuename} -n {{threads}} -R "span[hosts=1]" -M {{resources.mem_mb}}', + "drmaa-log-dir": "logs/drmaa", + } + ) + self.snakemake_run_conf = base_config + return self.snakemake_run_conf + base_config.update( + { + "cores": cores, + } + ) + self.snakemake_run_conf = base_config return self.snakemake_run_conf def _snakemake_run_params(self) -> dict[str, Any]: diff --git a/ViroConstrictor/userprofile.py b/ViroConstrictor/userprofile.py index df1885a..c8b253c 100644 --- a/ViroConstrictor/userprofile.py +++ b/ViroConstrictor/userprofile.py @@ -10,12 +10,14 @@ import readline import subprocess import sys +from typing import Any from rich import print from rich.console import Console from ViroConstrictor.functions import tabCompleter from ViroConstrictor.logging import log +from ViroConstrictor.workflow.containers import containerization_installed def FileExists(file: pathlib.Path) -> bool: @@ -43,7 +45,11 @@ def FileIsPopulated(file: pathlib.Path) -> bool: def AskPrompts( - intro: str, prompt: str, options: list, fixedchoices: bool = False + intro: str, + prompt: str, + options: list, + fixedchoices: bool = False, + default: Any = None, ) -> str | None: """This function is used to ask the user a question and provide a list of options to choose from. A free-text user reply is also possible. @@ -72,13 +78,15 @@ def AskPrompts( the reply variable. 
""" + completer = tabCompleter() if fixedchoices: - completer = tabCompleter() completer.createListCompleter(options) - readline.set_completer_delims("\t") readline.parse_and_bind("tab: complete") readline.set_completer(completer.listCompleter) + else: + readline.parse_and_bind("tab: complete") + readline.set_completer(completer.pathCompleter) subprocess.call("/bin/clear", shell=False) @@ -101,8 +109,8 @@ def AskPrompts( reply = input(prompt).strip() if reply == "quit": sys.exit(-1) - else: - return reply + # if reply is empty and a default value is given, return the default value + return default if not reply and default else reply def BuildConfig(file: pathlib.Path) -> None: @@ -133,7 +141,7 @@ def BuildConfig(file: pathlib.Path) -> None: if conf_object["COMPUTING"]["compmode"] == "grid": conf_object["COMPUTING"]["queuename"] = AskPrompts( # type: ignore - f"""Grid mode has been chosen. Please enter the name of computing-queue that you wish to use on your grid/HPC cluster.\nThis is necessary so ViroConstrictor will send all the various tasks to the correct (remote) computers.\n\n[bold underline yellow]Please note that this is case-sensitive[/bold underline yellow]\n""", + """Grid mode has been chosen. Please enter the name of computing-queue that you wish to use on your grid/HPC cluster.\nThis is necessary so ViroConstrictor will send all the various tasks to the correct (remote) computers.\n\n[bold underline yellow]Please note that this is case-sensitive[/bold underline yellow]\n""", "Please specify the name of the Queue on your grid/HPC cluster that you wish to use. [free text] ", [], fixedchoices=False, @@ -143,7 +151,7 @@ def BuildConfig(file: pathlib.Path) -> None: "auto_update": AskPrompts( """ViroConstrictor can check and update itself everytime you run it. Please specify whether you wish to enable the auto-update feature.""", - f"""Do you wish to enable the auto-update feature? \[yes/no] """, + """Do you wish to enable the auto-update feature? \[yes/no] """, ["yes", "no"], fixedchoices=True, ) @@ -157,7 +165,22 @@ def BuildConfig(file: pathlib.Path) -> None: fixedchoices=True, ) - subprocess.call("/bin/clear", shell=False) + if containerization_installed is False: + conf_object["REPRODUCTION"] = { + "repro_method": "conda", + "container_cache_path": None, + } + else: + conf_object["REPRODUCTION"] = {"repro_method": "containers"} + conf_object["REPRODUCTION"]["container_cache_path"] = AskPrompts( + f"""ViroConstrictor will use containers to run the various analysis steps in a reproducible manner.\nHowever, to speed up the workflow and to allow offline-use, ViroConstrictor will cache the containers on your local machine.\nThis directory will be used to locally store the containers.\nIf you do not provide a path, the default path will be used. ({str(pathlib.Path.home())}/.viroconstrictor/containers)""", + """Please specify the path to the container cache directory. 
[free text] """, + [], + fixedchoices=False, + default=f"{str(pathlib.Path.home())}/.viroconstrictor/containers", + ) + + # subprocess.call("/bin/clear", shell=False) with open(file, "w") as conffile: conf_object.write(conffile) @@ -202,6 +225,17 @@ def AllOptionsGiven(config: configparser.ConfigParser) -> bool: else: all_present = False + if config.has_section("REPRODUCTION") is True: + if ( + config.has_option("REPRODUCTION", "repro_method") is True + and config["REPRODUCTION"]["repro_method"] == "containers" + and config.has_option("REPRODUCTION", "container_cache_path") is False + or config.has_option("REPRODUCTION", "repro_method") is not True + ): + all_present = False + else: + all_present = False + return all_present diff --git a/ViroConstrictor/workflow/__init__.py b/ViroConstrictor/workflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ViroConstrictor/workflow/containers.py b/ViroConstrictor/workflow/containers.py new file mode 100644 index 0000000..bc00cab --- /dev/null +++ b/ViroConstrictor/workflow/containers.py @@ -0,0 +1,341 @@ +import hashlib +import os +import subprocess +from typing import Any, Dict, List, Tuple + +from ViroConstrictor import __prog__ +from ViroConstrictor.logging import log + +upstream_registry = "ghcr.io/rivm-bioinformatics" + + +def fetch_recipes(recipe_folder: str) -> List[str]: + """ + Fetches a list of absolute file paths for all YAML files in the given recipe folder. + + Parameters + ---------- + recipe_folder : str + The path to the folder containing the recipe files. + + Returns + ------- + List[str] + A list of absolute file paths for all YAML files in the recipe folder. + """ + return [ + os.path.abspath(os.path.join(recipe_folder, file)) + for file in os.listdir(recipe_folder) + if file.endswith(".yaml") + ] + + +def fetch_scripts(script_folder: str) -> List[str]: + """ + Fetches all Python script files with the extension '.py' from the specified folder. + + Parameters + ---------- + script_folder : str + The path to the folder containing the script files. + + Returns + ------- + List[str] + A list of absolute paths to the fetched script files. + """ + script_files = [] + for root, dirs, files in os.walk(script_folder): + script_files.extend( + os.path.abspath(os.path.join(root, file)) + for file in files + if file.endswith(".py") + ) + return script_files + + +def fetch_files(file_folder: str) -> List[str]: + """ + Fetches a list of absolute file paths from the given file folder. + + Parameters + ---------- + file_folder : str + The path to the folder containing the files. + + Returns + ------- + List[str] + A list of absolute file paths. + + """ + return [ + os.path.abspath(os.path.join(file_folder, file)) + for file in os.listdir(file_folder) + ] + + +def calculate_hashes(file_list: List[str]) -> Dict[str, str]: + """ + Parameters + ---------- + file_list : List[str] + A list of file paths for which to calculate the hashes. + + Returns + ------- + Dict[str, str] + A dictionary where the keys are file paths and the values are the first 6 characters of the SHA-256 hash of the file contents. + """ + hashdict = {} + for file in file_list: + with open(file, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest()[:6] + hashdict[file] = file_hash + return hashdict + + +def fetch_hashes() -> Dict[str, str]: + """ + Fetches and returns the hashes of recipe files, script files, and config files. 
+ + Returns + ------- + Dict[str, str] + A dictionary containing the file paths as keys and their corresponding hashes as values. + """ + # Fetch the recipe files, script files, and config files + recipe_files = sorted( + fetch_recipes(f"{os.path.dirname(os.path.realpath(__file__))}/envs/") + ) + script_files = sorted( + fetch_scripts(f"{os.path.dirname(os.path.realpath(__file__))}/scripts/") + ) + config_files = sorted( + fetch_files(f"{os.path.dirname(os.path.realpath(__file__))}/files/") + ) + + # Calculate hashes for script files + script_hashes = calculate_hashes(script_files) + + # Calculate hashes for config files + config_hashes = calculate_hashes(config_files) + + # Sort the hashes of the scripts and the configs + script_hashes = dict(sorted(script_hashes.items())) + config_hashes = dict(sorted(config_hashes.items())) + + # Join the hashes of the scripts and the configs, and create a new hash of the joined hashes + merged_hashes = hashlib.sha256( + "".join(list(script_hashes.values()) + list(config_hashes.values())).encode() + ).hexdigest()[:6] + + # Calculate hashes for recipe files + hashes = {} + for recipe_file in recipe_files: + with open(recipe_file, "rb") as f: + recipe_hash = hashlib.sha256(f.read()).hexdigest()[:6] + # if the recipe file is not named 'Scripts', then add the hash to the dictionary and continue + if os.path.basename(recipe_file).split(".")[0] != "Scripts": + hashes[recipe_file] = recipe_hash + continue + # if the recipe file *is* named 'Scripts', then combine the recipe hash with the merged hash of the scripts and configs + # Subsequently, create a new hash of the joined hashes to avoid conflicts and end up with a singular tracker. + file_hash = hashlib.sha256( + (recipe_hash + merged_hashes).encode() + ).hexdigest()[:6] + hashes[recipe_file] = file_hash + + return hashes + + +def get_hash(target_container: str) -> str | None: + """ + Get the hash for the target container. + + Parameters + ---------- + target_container : str + The name of the target container. + + Returns + ------- + str or None + The hash of the target container if found, None otherwise. + """ + hashes = fetch_hashes() + return next( + (hash for recipe, hash in hashes.items() if target_container in recipe), + None, + ) + + +def containerization_installed() -> bool: + """ + Check if containerization tools are installed. + + Returns + ------- + bool + True if either 'apptainer' or 'singularity' is installed, False otherwise. + """ + if os.system("which apptainer") == 0: + return True + return os.system("which singularity") == 0 + + +def containers_to_download(config: Dict[str, Any]) -> List[str]: + """ + Returns a list of containers that need to be downloaded for the workflow. + + Parameters + ---------- + config : Dict[str, Any] + A dictionary containing the configuration parameters. + + Returns + ------- + List[str] + A list of container names that need to be downloaded. + + """ + recipes = fetch_hashes() + + # check the recipes dict and create a list of required containers for the workflow. the list must contain strings that look like "viroconstrictor_alignment_{hash}" "viroconstrictor_clean_{hash}" etc. 
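+ # Illustrative example (hypothetical hash values, not part of this changeset): if
+ # fetch_hashes() returned {".../envs/Alignment.yaml": "a1b2c3", ".../envs/Scripts.yaml": "d4e5f6"},
+ # the derived names would be "viroconstrictor_alignment_a1b2c3" and
+ # "viroconstrictor_scripts_d4e5f6", given __prog__ == "ViroConstrictor".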
+ required_containers = [] + for key, val in recipes.items(): + recipe_basename = os.path.basename(key).replace(".yaml", "") + required_containers.append(f"{__prog__}_{recipe_basename}_{val}".lower()) + + # check if the folder exists, if not create it + if not os.path.exists(config["container_cache"]): + os.makedirs(config["container_cache"], exist_ok=True) + containers_present = os.listdir(config["container_cache"]) + + # remove the .sif extension from the container names + containers_present = [item.split(".")[0] for item in containers_present] + + # loop through the required_containers list and check if they are present in the containers_present list + # if they are not present, add them to the containers_to_download list + return [ + container + for container in required_containers + if container not in containers_present + ] + + +def containerization_executable() -> str: + """ + Determines the containerization executable to be used. + + Returns + ------- + str + The name of the containerization executable ('apptainer' or 'singularity'). + """ + return ( + "apptainer" + if subprocess.call( + "which apptainer", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + == 0 + else "singularity" + ) + + +def download_containers(config: Dict[str, Any], verbose=False) -> int: + """ + Download containers specified in the configuration. + + Parameters + ---------- + config : Dict[str, Any] + The configuration containing container information. + verbose : bool, optional + Whether to display verbose output. Defaults to False. + + Returns + ------- + int + 0 if all containers were downloaded successfully, 1 otherwise. + """ + to_download = containers_to_download(config) + to_download = [x.rsplit("_", 1)[0] + ":" + x.rsplit("_", 1)[1] for x in to_download] + + if config["dryrun"]: + log.info( + f"Container(s) [magenta]{', '.join(to_download)}[/magenta] will be downloaded" + ) + return 0 + # I thought this would be a great place to use concurrent.futures to download the containers in parallel + # however this has unintended consequences for the various OCI layers resulting in at least one container not being downloaded correctly. + # Thus it's better for now to download the containers sequentially. + for container in to_download: + log.info( + f"Downloading container: [magenta]'{container}'[/magenta] to local cache" + ) + executable = containerization_executable() + status = subprocess.call( + f"{executable} pull --dir {config['container_cache']} docker://{upstream_registry}/{container}", + shell=True, + stderr=subprocess.PIPE if verbose is False else None, + stdout=subprocess.PIPE if verbose is False else None, + ) + if status != 0: + log.error(f"Failed to download container: [magenta]'{container}'[/magenta]") + return 1 + log.info( + f"Successfully downloaded container: [magenta]'{container}'[/magenta] to local cache" + ) + + return 0 + + +def construct_container_bind_args(samples_dict: Dict) -> str: + """ + Constructs the bind arguments for container execution based on the given samples dictionary. + + Parameters + ---------- + samples_dict : Dict + A dictionary containing sample information. + + Returns + ------- + str + A string representing the bind arguments for container execution. + + Notes + ----- + This function iterates over the given samples dictionary and extracts the paths of all files that exist. + It then removes any duplicate paths and constructs a string of bind arguments for container execution. + + Examples + -------- + >>> samples = { + ... 'sample1': { + ... 
'file1': '/path/to/file1', + ... 'file2': '/path/to/file2' + ... }, + ... 'sample2': { + ... 'file3': '/path/to/file3', + ... 'file4': '/path/to/file4' + ... } + ... } + >>> construct_container_bind_args(samples) + '--bind /path/to' + """ + paths = [] + for keys, nested_dict in samples_dict.items(): + paths.extend( + f"{os.path.dirname(value)}" + for value in nested_dict.values() + if isinstance(value, str) and os.path.exists(value) + ) + # remove all duplicates from the paths list by making it a set + # for every item in the set, add '--bind ' + # join all the items together to make a long string + return " ".join([f"--bind {path}" for path in set(paths)]) diff --git a/ViroConstrictor/workflow/envs/ORF_analysis.yaml b/ViroConstrictor/workflow/envs/ORF_analysis.yaml index 65a02c2..81d9d50 100644 --- a/ViroConstrictor/workflow/envs/ORF_analysis.yaml +++ b/ViroConstrictor/workflow/envs/ORF_analysis.yaml @@ -5,7 +5,7 @@ channels: - nodefaults dependencies: - conda-forge::python>=3.10 - - conda-forge::pandas + - conda-forge::pandas==2.0.* - conda-forge::biopython==1.79 - bioconda::prodigal==2.6.3 diff --git a/ViroConstrictor/workflow/envs/Scripts.yaml b/ViroConstrictor/workflow/envs/Scripts.yaml index d976979..4eafb86 100644 --- a/ViroConstrictor/workflow/envs/Scripts.yaml +++ b/ViroConstrictor/workflow/envs/Scripts.yaml @@ -5,9 +5,10 @@ channels: - nodefaults dependencies: - conda-forge::python>=3.10 - - conda-forge::pandas - - conda-forge::biopython + - conda-forge::pandas>=2.0 + - conda-forge::numpy>=2.0 + - conda-forge::biopython==1.79 - - bioconda::pysam - - bioconda::fastqc + - bioconda::pysam==0.21 + - bioconda::fastqc==0.11.9 - bioconda::aminoextract==0.3.1 \ No newline at end of file diff --git a/ViroConstrictor/workflow/match_ref.smk b/ViroConstrictor/workflow/match_ref.smk index 902d52e..e2da785 100644 --- a/ViroConstrictor/workflow/match_ref.smk +++ b/ViroConstrictor/workflow/match_ref.smk @@ -6,15 +6,19 @@ import logging import pandas as pd from presets import get_preset_parameter +from containers import get_hash from Bio import SeqIO, SeqRecord from snakemake.utils import Paramspace, min_version +import snakemake from directories import * +from rich import print min_version("7.15") + logger = logging.getLogger() logger.handlers.clear() @@ -157,21 +161,21 @@ rule filter_references: temp(f"{datadir}{matchref}{wc_folder}" "{sample}_refs.fasta"), resources: mem=low_memory_job, + threads: 1 log: f"{logdir}prepare_refs" "{Virus}.{segment}.{sample}.log", - run: - from Bio import SeqIO - - records_to_keep = [] - for record in SeqIO.parse(str(input), "fasta"): - if ( - wildcards.segment != "None" - ): # wildcards.segment is a string instead of a NoneType - if record.description.split(" ")[1].split("|")[0] == wildcards.segment: - records_to_keep.append(record) - else: - records_to_keep.append(record) - SeqIO.write(records_to_keep, str(output), "fasta") + benchmark: + f"{logdir}{bench}MR_prepare_refs" "{Virus}.{segment}.{sample}.txt" + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" + params: + script=srcdir("scripts/match_ref/filter_references.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_references.py", + shell: + """ + python {params.script} {input} {output} {wildcards.segment} >> {log} 2>&1 + """ if config["platform"] in ["nanopore", "iontorrent"]: @@ -188,6 +192,8 @@ if config["platform"] in ["nanopore", "iontorrent"]: 
index=temp(f"{datadir}{matchref}{wc_folder}" "{sample}.bam.bai"), conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}AlignMR_" "{Virus}.{segment}.{sample}.log", benchmark: @@ -236,6 +242,8 @@ if config["platform"] == "illumina": index=temp(f"{datadir}{matchref}{wc_folder}" "{sample}.bam.bai"), conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}" "AlignMR_{Virus}.{segment}.{sample}.log", benchmark: @@ -278,14 +286,16 @@ rule count_mapped_reads: output: temp(f"{datadir}{matchref}{wc_folder}" "{sample}_count.csv"), conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 resources: mem=low_memory_job, log: f"{logdir}CountMR_" "{Virus}.{segment}.{sample}.log", params: - script=srcdir("scripts/match_ref/count_mapped_reads.py"), + script=srcdir("scripts/match_ref/count_mapped_reads.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/count_mapped_reads.py", shell: """ python {params.script} {input.bam} {output} >> {log} 2>&1 @@ -300,14 +310,16 @@ rule filter_best_matching_ref: filtref=temp(f"{datadir}{matchref}{wc_folder}" "{sample}_best_ref.fasta"), filtcount=temp(f"{datadir}{matchref}{wc_folder}" "{sample}_best_ref.csv"), conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 resources: mem=low_memory_job, log: f"{logdir}FilterBR_" "{Virus}.{segment}.{sample}.log", params: - script=srcdir("scripts/match_ref/filter_best_matching_ref.py"), + script=srcdir("scripts/match_ref/filter_best_matching_ref.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_best_matching_ref.py", shell: """ python {params.script} {input.stats} {input.ref} {output.filtref} {output.filtcount} >> {log} 2>&1 @@ -342,14 +354,16 @@ rule group_and_rename_refs: groupedrefs=f"{datadir}{matchref}" "{sample}_refs.fasta", groupedstats=temp(f"{datadir}{matchref}" "{sample}_refs.csv"), conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 resources: mem=low_memory_job, log: f"{logdir}GroupRefs_" "{sample}.log", params: - script=srcdir("scripts/match_ref/group_refs.py"), + script=srcdir("scripts/match_ref/group_refs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/group_refs.py", shell: """ python {params.script} "{input.ref}" "{input.stats}" {output.groupedrefs} {output.groupedstats} {wildcards.sample} >> {log} 2>&1 @@ -368,8 +382,12 @@ rule filter_gff: mem=low_memory_job, log: f"{logdir}FilterGFF_" "{sample}.log", + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: - script=srcdir("scripts/match_ref/filter_gff.py"), + script=srcdir("scripts/match_ref/filter_gff.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_gff.py", shell: """ python {params.script} {input.refdata} {input.gff} {output.gff} {output.groupedstats} >> {log} 2>&1 @@ -392,11 +410,42 @@ rule touch_gff: touch {output.gff} """ +rule 
filter_fasta2bed: + input: + ref=rules.group_and_rename_refs.output.groupedrefs, + refdata=rules.filter_gff.output.groupedstats, + prm=lambda wc: "" if SAMPLES[wc.sample]["PRIMERS"].endswith(".bed") else SAMPLES[wc.sample]["PRIMERS"], + output: + bed=f"{datadir}{matchref}" "{sample}_primers.bed", + groupedstats=temp(f"{datadir}{matchref}" "{sample}_data2.csv"), + conda: + f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" + threads: 1 + resources: + mem_mb=low_memory_job, + log: + f"{logdir}Fasta2Bed_" "{sample}.log", + params: + pr_mm_rate=lambda wc: SAMPLES[wc.sample]["PRIMER-MISMATCH-RATE"], + shell: + """ + python -m AmpliGone.fasta2bed \ + --primers {input.prm} \ + --reference {input.ref} \ + --output {output.bed} \ + --primer-mismatch-rate {params.pr_mm_rate} > {log} + awk -F ',' -v OFS=',' '{{ $(NF+1) = (NR==1 ? "Primer_file" : "{output.bed}"); print }}' {input.refdata} > {output.groupedstats} + """ + +ruleorder: filter_fasta2bed > filter_bed > touch_primers + + rule filter_bed: input: prm=lambda wc: SAMPLES[wc.sample]["PRIMERS"], refdata=rules.filter_gff.output.groupedstats, - ref=rules.group_and_rename_refs.output.groupedrefs, output: bed=f"{datadir}{matchref}" "{sample}_primers.bed", groupedstats=temp(f"{datadir}{matchref}" "{sample}_data2.csv"), @@ -406,25 +455,16 @@ rule filter_bed: log: f"{logdir}FilterBed_" "{sample}.log", params: - pr_mm_rate=lambda wc: SAMPLES[wc.sample]["PRIMER-MISMATCH-RATE"], - script=srcdir("scripts/match_ref/filter_bed.py"), + script=srcdir("scripts/match_ref/filter_bed.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_bed.py", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" shell: """ - if [[ {input.prm} == *.bed ]]; then - python {params.script} {input.prm} {input.refdata} {output.bed} {output.groupedstats} - else - python -m AmpliGone.fasta2bed \ - --primers {input.prm} \ - --reference {input.ref} \ - --output {output.bed} \ - --primer-mismatch-rate {params.pr_mm_rate} > {log} - awk -F"," 'BEGIN {{ OFS = "," }} {{$6="{output.bed}"; print}}' {input.refdata} > {output.groupedstats} - fi + python {params.script} {input.prm} {input.refdata} {output.bed} {output.groupedstats} """ -ruleorder: filter_bed > touch_primers rule touch_primers: input: diff --git a/ViroConstrictor/workflow/preset_params.json b/ViroConstrictor/workflow/preset_params.json new file mode 100644 index 0000000..df22963 --- /dev/null +++ b/ViroConstrictor/workflow/preset_params.json @@ -0,0 +1,97 @@ +{ + "DEFAULT": { + "MatchRef_AdditionalAlignmentSettings": "--secondary=no", + "MatchRef_AdditionalBamFilters": "-q 10", + "RawAlign_AdditionalSettings": "", + "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", + "ClipperSettings_nanopore": "", + "ClipperSettings_illumina": "", + "ClipperSettings_iontorrent": "", + "Fastp_PhredScore_cutoff_illumina": 20, + "Fastp_PhredScore_cutoff_nanopore": 7, + "Fastp_PhredScore_cutoff_iontorrent": 20, + "Fastp_WindowSize_illumina": 5, + "Fastp_WindowSize_nanopore": 20, + "Fastp_WindowSize_iontorrent": 15, + "Fastp_MinReadLength": 100, + "CleanAlign_AdditionalSettings_nanopore": "", + "CleanAlign_AdditionalSettings_illumina": "", + "CleanAlign_AdditionalSettings_iontorrent": "" + }, + "SARSCOV2": { + "MatchRef_AdditionalAlignmentSettings": "--secondary=no", + "MatchRef_AdditionalBamFilters": "-q 10", + 
"RawAlign_AdditionalSettings": "", + "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", + "ClipperSettings_nanopore": "", + "ClipperSettings_illumina": "", + "ClipperSettings_iontorrent": "", + "Fastp_PhredScore_cutoff_illumina": 20, + "Fastp_PhredScore_cutoff_nanopore": 7, + "Fastp_PhredScore_cutoff_iontorrent": 20, + "Fastp_WindowSize_illumina": 5, + "Fastp_WindowSize_nanopore": 20, + "Fastp_WindowSize_iontorrent": 15, + "Fastp_MinReadLength": 100, + "CleanAlign_AdditionalSettings_nanopore": "-E2,0 -O8,24 -A4 -B4", + "CleanAlign_AdditionalSettings_illumina": "", + "CleanAlign_AdditionalSettings_iontorrent": "" + }, + "INFLUENZA": { + "MatchRef_AdditionalAlignmentSettings": "--secondary=no", + "MatchRef_AdditionalBamFilters": "-q 5", + "RawAlign_AdditionalSettings": "--splice --frag=no", + "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", + "ClipperSettings_nanopore": "--exclude-spliced --spliced-length-threshold 50 --min-aligned-length 0.5", + "ClipperSettings_illumina": "--exclude-spliced --spliced-length-threshold 50", + "ClipperSettings_iontorrent": "--exclude-spliced --spliced-length-threshold 50 --min-aligned-length 0.5", + "Fastp_PhredScore_cutoff_illumina": 20, + "Fastp_PhredScore_cutoff_nanopore": 7, + "Fastp_PhredScore_cutoff_iontorrent": 20, + "Fastp_WindowSize_illumina": 5, + "Fastp_WindowSize_nanopore": 20, + "Fastp_WindowSize_iontorrent": 15, + "Fastp_MinReadLength": 100, + "CleanAlign_AdditionalSettings_nanopore": "", + "CleanAlign_AdditionalSettings_illumina": "", + "CleanAlign_AdditionalSettings_iontorrent": "" + }, + "MEASLES": { + "MatchRef_AdditionalAlignmentSettings": "--secondary=no", + "MatchRef_AdditionalBamFilters": "-q 10", + "RawAlign_AdditionalSettings": "", + "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", + "ClipperSettings_nanopore": "", + "ClipperSettings_illumina": "", + "ClipperSettings_iontorrent": "", + "Fastp_PhredScore_cutoff_illumina": 20, + "Fastp_PhredScore_cutoff_nanopore": 7, + "Fastp_PhredScore_cutoff_iontorrent": 20, + "Fastp_WindowSize_illumina": 5, + "Fastp_WindowSize_nanopore": 20, + "Fastp_WindowSize_iontorrent": 15, + "Fastp_MinReadLength": 100, + "CleanAlign_AdditionalSettings_nanopore": "", + "CleanAlign_AdditionalSettings_illumina": "", + "CleanAlign_AdditionalSettings_iontorrent": "" + }, + "HPV": { + "MatchRef_AdditionalAlignmentSettings": "--secondary=no", + "MatchRef_AdditionalBamFilters": "-q 10", + "RawAlign_AdditionalSettings": "", + "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", + "ClipperSettings_nanopore": "", + "ClipperSettings_illumina": "", + "ClipperSettings_iontorrent": "", + "Fastp_PhredScore_cutoff_illumina": 20, + "Fastp_PhredScore_cutoff_nanopore": 7, + "Fastp_PhredScore_cutoff_iontorrent": 20, + "Fastp_WindowSize_illumina": 5, + "Fastp_WindowSize_nanopore": 20, + "Fastp_WindowSize_iontorrent": 15, + "Fastp_MinReadLength": 100, + "CleanAlign_AdditionalSettings_nanopore": "", + "CleanAlign_AdditionalSettings_illumina": "", + "CleanAlign_AdditionalSettings_iontorrent": "" + } +} \ No newline at end of file diff --git a/ViroConstrictor/workflow/presets.py b/ViroConstrictor/workflow/presets.py index bb7f127..f9f69c3 100644 --- a/ViroConstrictor/workflow/presets.py +++ b/ViroConstrictor/workflow/presets.py @@ -1,4 +1,6 @@ import difflib +import json +import os import re from typing import Tuple @@ -24,103 +26,10 @@ # "HPV": ["PAPILLOMA_VIRUS", "HPV"] } -presets = { - "DEFAULT": { - "MatchRef_AdditionalAlignmentSettings": "--secondary=no", - "MatchRef_AdditionalBamFilters": "-q 10", - "RawAlign_AdditionalSettings": "", - 
"BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", - "ClipperSettings_nanopore": "", - "ClipperSettings_illumina": "", - "ClipperSettings_iontorrent": "", - "Fastp_PhredScore_cutoff_illumina": 20, - "Fastp_PhredScore_cutoff_nanopore": 7, - "Fastp_PhredScore_cutoff_iontorrent": 20, - "Fastp_WindowSize_illumina": 5, - "Fastp_WindowSize_nanopore": 20, - "Fastp_WindowSize_iontorrent": 15, - "Fastp_MinReadLength": 100, - "CleanAlign_AdditionalSettings_nanopore": "", - "CleanAlign_AdditionalSettings_illumina": "", - "CleanAlign_AdditionalSettings_iontorrent": "", - }, - "SARSCOV2": { - "MatchRef_AdditionalAlignmentSettings": "--secondary=no", - "MatchRef_AdditionalBamFilters": "-q 10", - "RawAlign_AdditionalSettings": "", - "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", - "ClipperSettings_nanopore": "", - "ClipperSettings_illumina": "", - "ClipperSettings_iontorrent": "", - "Fastp_PhredScore_cutoff_illumina": 20, - "Fastp_PhredScore_cutoff_nanopore": 7, - "Fastp_PhredScore_cutoff_iontorrent": 20, - "Fastp_WindowSize_illumina": 5, - "Fastp_WindowSize_nanopore": 20, - "Fastp_WindowSize_iontorrent": 15, - "Fastp_MinReadLength": 100, - "CleanAlign_AdditionalSettings_nanopore": "-E2,0 -O8,24 -A4 -B4", - "CleanAlign_AdditionalSettings_illumina": "", - "CleanAlign_AdditionalSettings_iontorrent": "", - }, - "INFLUENZA": { - "MatchRef_AdditionalAlignmentSettings": "--secondary=no", - "MatchRef_AdditionalBamFilters": "-q 5", - "RawAlign_AdditionalSettings": "--splice --frag=no", - "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", - "ClipperSettings_nanopore": "--exclude-spliced --spliced-length-threshold 50 --min-aligned-length 0.5", - "ClipperSettings_illumina": "--exclude-spliced --spliced-length-threshold 50", - "ClipperSettings_iontorrent": "--exclude-spliced --spliced-length-threshold 50 --min-aligned-length 0.5", - "Fastp_PhredScore_cutoff_illumina": 20, - "Fastp_PhredScore_cutoff_nanopore": 7, - "Fastp_PhredScore_cutoff_iontorrent": 20, - "Fastp_WindowSize_illumina": 5, - "Fastp_WindowSize_nanopore": 20, - "Fastp_WindowSize_iontorrent": 15, - "Fastp_MinReadLength": 100, - "CleanAlign_AdditionalSettings_nanopore": "", - "CleanAlign_AdditionalSettings_illumina": "", - "CleanAlign_AdditionalSettings_iontorrent": "", - }, - "MEASLES": { - "MatchRef_AdditionalAlignmentSettings": "--secondary=no", - "MatchRef_AdditionalBamFilters": "-q 10", - "RawAlign_AdditionalSettings": "", - "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", - "ClipperSettings_nanopore": "", - "ClipperSettings_illumina": "", - "ClipperSettings_iontorrent": "", - "Fastp_PhredScore_cutoff_illumina": 20, - "Fastp_PhredScore_cutoff_nanopore": 7, - "Fastp_PhredScore_cutoff_iontorrent": 20, - "Fastp_WindowSize_illumina": 5, - "Fastp_WindowSize_nanopore": 20, - "Fastp_WindowSize_iontorrent": 15, - "Fastp_MinReadLength": 100, - "CleanAlign_AdditionalSettings_nanopore": "", - "CleanAlign_AdditionalSettings_illumina": "", - "CleanAlign_AdditionalSettings_iontorrent": "", - }, - "HPV": { - "MatchRef_AdditionalAlignmentSettings": "--secondary=no", - "MatchRef_AdditionalBamFilters": "-q 10", - "RawAlign_AdditionalSettings": "", - "BaseBAMFilters": "-F 256 -F 512 -F 4 -F 2048", - "ClipperSettings_nanopore": "", - "ClipperSettings_illumina": "", - "ClipperSettings_iontorrent": "", - "Fastp_PhredScore_cutoff_illumina": 20, - "Fastp_PhredScore_cutoff_nanopore": 7, - "Fastp_PhredScore_cutoff_iontorrent": 20, - "Fastp_WindowSize_illumina": 5, - "Fastp_WindowSize_nanopore": 20, - "Fastp_WindowSize_iontorrent": 15, - "Fastp_MinReadLength": 100, - 
"CleanAlign_AdditionalSettings_nanopore": "", - "CleanAlign_AdditionalSettings_illumina": "", - "CleanAlign_AdditionalSettings_iontorrent": "", - }, -} +# Load the preset parameters from a JSON file (preset_params.json) instead of having a giant dict here +presets = json.load( + open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "preset_params.json")) +) def get_key_from_value(d: dict, value: str) -> str | None: diff --git a/ViroConstrictor/workflow/scripts/clipper.py b/ViroConstrictor/workflow/scripts/clipper.py index 85790c7..02d64c4 100644 --- a/ViroConstrictor/workflow/scripts/clipper.py +++ b/ViroConstrictor/workflow/scripts/clipper.py @@ -46,6 +46,24 @@ default=0, ) +arg.add_argument( + "--max-aligned-length", + metavar="FLOAT", + type=float, + required=False, + help="Filter reads based on the maximum length of the aligned part of the read", + default=0, +) + +arg.add_argument( + "--only-include-region", + metavar="STRING", + type=str, + required=False, + help="Only include reads where the aligned section of the read start and end within the specified region", + default=None, +) + arg.add_argument( "--threads", metavar="Number", @@ -59,6 +77,26 @@ def split_cigar(cigar: str) -> list: + """ + Split a CIGAR string into a list of tuples. + + Parameters: + ----------- + cigar : str + The CIGAR string to be split. + + Returns: + -------- + list + A list of tuples, where each tuple contains two elements: + - The number of consecutive characters. + - The character itself. + + Example: + -------- + >>> split_cigar("3M1I2D") + [(3, 'M'), (1, 'I'), (2, 'D')] + """ cigar_tuples = [] current_number = "" for char in cigar: @@ -71,10 +109,42 @@ def split_cigar(cigar: str) -> list: def is_spliced(cigar_tuples: list) -> bool: + """ + Check if the given list of cigar tuples contains any spliced regions. + + Parameters + ---------- + cigar_tuples : list + A list of cigar tuples representing the alignment. + + Returns + ------- + bool + True if there are spliced regions, False otherwise. + """ return any(cigar_tuple[1] == "N" for cigar_tuple in cigar_tuples) def get_largest_spliced_len(cigar_tuples: list) -> int: + """ + Calculate the largest spliced length from a list of cigar tuples. + + Parameters: + ----------- + cigar_tuples : list + A list of cigar tuples representing the cigar string. + + Returns: + -------- + int + The largest spliced length found in the cigar tuples. 
+ + Examples: + --------- + >>> cigar_tuples = [(10, 'M'), (5, 'N'), (20, 'M'), (15, 'N'), (30, 'M')] + >>> get_largest_spliced_len(cigar_tuples) + 15 + """ largest_spliced_len = 0 current_spliced_len = 0 for cigar_tuple in cigar_tuples: @@ -91,12 +161,27 @@ def get_largest_spliced_len(cigar_tuples: list) -> int: bamfile = pysam.AlignmentFile(flags.input, "rb", threads=flags.threads) reflength = bamfile.lengths[0] - minimal_read_length = int(reflength * flags.min_aligned_length) + minimal_read_length = int(flags.min_aligned_length) + maximum_read_length = int( + reflength if flags.max_aligned_length == 0 else flags.max_aligned_length + ) + + include_region_start = ( + flags.only_include_region.split(":")[0] + if flags.only_include_region is not None + else None + ) + include_region_end = ( + flags.only_include_region.split(":")[1] + if flags.only_include_region is not None + else None + ) for read in bamfile: read_start = read.query_alignment_start read_end = read.query_alignment_end - + ref_start = read.reference_start + ref_end = read.reference_end trimmed_seq = read.query_alignment_sequence trimmed_qual = read.qual[read_start:read_end] @@ -113,16 +198,23 @@ def get_largest_spliced_len(cigar_tuples: list) -> int: if flags.exclude_spliced: cigartuples = split_cigar(read.cigarstring) - if is_spliced(cigartuples): - if ( - get_largest_spliced_len(cigartuples) - < flags.spliced_length_threshold - ): - continue + if is_spliced(cigartuples) and ( + get_largest_spliced_len(cigartuples) > flags.spliced_length_threshold + ): + continue if read.query_alignment_length <= minimal_read_length: continue + if read.query_alignment_length >= maximum_read_length: + continue + + if include_region_start is not None and include_region_end is not None: + if ref_start < int(include_region_start) or ref_end > int( + include_region_end + ): + continue + fileout.write( "@" + str(read.query_name) diff --git a/ViroConstrictor/workflow/scripts/match_ref/filter_references.py b/ViroConstrictor/workflow/scripts/match_ref/filter_references.py new file mode 100644 index 0000000..9c9c064 --- /dev/null +++ b/ViroConstrictor/workflow/scripts/match_ref/filter_references.py @@ -0,0 +1,15 @@ +import sys +from Bio import SeqIO + +_, input, output, wildcard_segment = sys.argv + +records_to_keep = [] +for record in SeqIO.parse(str(input), "fasta"): + if ( + wildcard_segment != "None" + ): # wildcard_segment is a string instead of a NoneType + if record.description.split(" ")[1].split("|")[0] == wildcard_segment: + records_to_keep.append(record) + else: + records_to_keep.append(record) +SeqIO.write(records_to_keep, str(output), "fasta") \ No newline at end of file diff --git a/ViroConstrictor/workflow/scripts/prepare_refs.py b/ViroConstrictor/workflow/scripts/prepare_refs.py new file mode 100644 index 0000000..66fb05d --- /dev/null +++ b/ViroConstrictor/workflow/scripts/prepare_refs.py @@ -0,0 +1,9 @@ +import sys +from Bio import SeqIO + +_, input, output, reference_id = sys.argv + +for record in SeqIO.parse(str(input), "fasta"): + if reference_id in record.id: + record.seq = record.seq.upper() + SeqIO.write(record, str(output), "fasta") \ No newline at end of file diff --git a/ViroConstrictor/workflow/workflow.smk b/ViroConstrictor/workflow/workflow.smk index 658ec35..670e605 100644 --- a/ViroConstrictor/workflow/workflow.smk +++ b/ViroConstrictor/workflow/workflow.smk @@ -9,6 +9,7 @@ import numpy as np from directories import * from presets import get_preset_parameter +from containers import get_hash from Bio import SeqIO import 
AminoExtract from snakemake.utils import Paramspace, min_version @@ -161,18 +162,28 @@ rule prepare_refs: lambda wc: SAMPLES[wc.sample]["REFERENCE"], output: f"{datadir}{wc_folder}" "{sample}_reference.fasta", - run: - from Bio import SeqIO - - for record in SeqIO.parse(str(input), "fasta"): - if wildcards.RefID in record.id: - record.seq = record.seq.upper() - SeqIO.write(record, str(output), "fasta") + resources: + mem_mb=low_memory_job, + threads: 1 + log: + f"{logdir}prepare_refs_" "{Virus}.{RefID}.{sample}.log", + benchmark: + f"{logdir}{bench}prepare_refs_" "{Virus}.{RefID}.{sample}.txt" + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" + params: + script=srcdir("scripts/prepare_refs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/prepare_refs.py", + shell: + """ + python {params.script} {input} {output} {wildcards.RefID} > {log} + """ rule prepare_primers: input: - prm=lambda wc: SAMPLES[wc.sample]["PRIMERS"], + prm=lambda wc: "" if SAMPLES[wc.sample]["PRIMERS"].endswith(".bed") else SAMPLES[wc.sample]["PRIMERS"], ref=rules.prepare_refs.output, output: bed=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed", @@ -184,22 +195,42 @@ rule prepare_primers: f"{logdir}{bench}prepare_primers_" "{Virus}.{RefID}.{sample}.txt" params: pr_mm_rate=lambda wc: SAMPLES[wc.sample]["PRIMER-MISMATCH-RATE"], - script=srcdir("scripts/filter_bed_input.py"), conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" shell: """ - if [[ {input.prm} == *.bed ]]; then - python {params.script} {input.prm} {output.bed} {wildcards.RefID} - else - python -m AmpliGone.fasta2bed \ - --primers {input.prm} \ - --reference {input.ref} \ - --output {output.bed} \ - --primer-mismatch-rate {params.pr_mm_rate} > {log} - fi + python -m AmpliGone.fasta2bed \ + --primers {input.prm} \ + --reference {input.ref} \ + --output {output.bed} \ + --primer-mismatch-rate {params.pr_mm_rate} > {log} """ +ruleorder: prepare_primers > filter_primer_bed + +rule filter_primer_bed: + input: + prm=lambda wc: SAMPLES[wc.sample]["PRIMERS"], + output: + bed=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed", + resources: + mem_mb=low_memory_job, + log: + f"{logdir}prepare_primers_" "{Virus}.{RefID}.{sample}.log", + benchmark: + f"{logdir}{bench}prepare_primers_" "{Virus}.{RefID}.{sample}.txt" + params: + script=srcdir("scripts/filter_bed_input.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/filter_bed_input.py", + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" + shell: + """ + python {params.script} {input.prm} {output.bed} {wildcards.RefID} + """ rule prepare_gffs: input: @@ -212,11 +243,13 @@ rule prepare_gffs: benchmark: f"{logdir}{bench}prepare_gffs_" "{Virus}.{RefID}.{sample}.txt" conda: - f"{conda_envs}ORF_analysis.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" resources: mem_mb=low_memory_job, params: - script=srcdir("scripts/extract_gff.py"), + script=srcdir("scripts/extract_gff.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/extract_gff.py", shell: """ python {params.script} {input.feats} {output.gff} {wildcards.RefID} @@ -240,6 +273,8 @@ rule prodigal: threads: config["threads"]["Index"] 
conda: f"{conda_envs}ORF_analysis.yaml" + container: + f"{config['container_cache']}/viroconstrictor_orf_analysis_{get_hash('ORF_analysis')}.sif" resources: mem_mb=medium_memory_job, params: @@ -268,7 +303,9 @@ if config["platform"] in ["nanopore", "iontorrent"]: html=f"{datadir}{qc_pre}" "{sample}_fastqc.html", zip=f"{datadir}{qc_pre}" "{sample}_fastqc.zip", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" log: f"{logdir}QC_raw_data_" "{sample}.log", benchmark: @@ -278,7 +315,7 @@ if config["platform"] in ["nanopore", "iontorrent"]: mem_mb=low_memory_job, params: output_dir=f"{datadir}{qc_pre}", - script=srcdir("scripts/fastqc_wrapper.sh"), + script=srcdir("wrappers/fastqc_wrapper.sh") if config["use-conda"] is True and config["use-singularity"] is False else "/wrappers/fastqc_wrapper.sh", shell: """ bash {params.script} {input} {params.output_dir} {output.html} {output.zip} {log} @@ -293,6 +330,8 @@ if config["platform"] in ["nanopore", "iontorrent"]: index=f"{datadir}{wc_folder}{cln}{raln}" "{sample}.bam.bai", conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}RemoveAdapters_p1_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -329,7 +368,9 @@ if config["platform"] == "illumina": html=f"{datadir}{qc_pre}" "{sample}_{read}_fastqc.html", zip=f"{datadir}{qc_pre}" "{sample}_{read}_fastqc.zip", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" log: f"{logdir}" "QC_raw_data_{sample}_{read}.log", benchmark: @@ -339,7 +380,7 @@ if config["platform"] == "illumina": mem_mb=low_memory_job, params: output_dir=f"{datadir}{qc_pre}", - script=srcdir("scripts/fastqc_wrapper.sh"), + script=srcdir("wrappers/fastqc_wrapper.sh") if config["use-conda"] is True and config["use-singularity"] is False else "/wrappers/fastqc_wrapper.sh", shell: """ bash {params.script} {input} {params.output_dir} {output.html} {output.zip} {log} @@ -355,6 +396,8 @@ if config["platform"] == "illumina": index=f"{datadir}{wc_folder}{cln}{raln}" "{sample}.bam.bai", conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}" "RemoveAdapters_p1_{Virus}.{RefID}.{sample}.log", benchmark: @@ -387,12 +430,14 @@ rule remove_adapters_p2: output: f"{datadir}{wc_folder}{cln}{noad}" "{sample}.fastq", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: config["threads"]["AdapterRemoval"] resources: mem_mb=low_memory_job, params: - script=srcdir("scripts/clipper.py"), + script=srcdir("scripts/clipper.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/clipper.py", clipper_settings=lambda wc: get_preset_parameter( preset_name=SAMPLES[wc.sample]["PRESET"], parameter_name=f"ClipperSettings_{config['platform']}", @@ -412,6 +457,8 @@ rule qc_filter: json=f"{datadir}{wc_folder}{cln}{qcfilt}{json}" "{sample}_fastp.json", conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" log: f"{logdir}QC_filter_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -453,6 +500,8 @@ rule ampligone: ep=f"{datadir}{wc_folder}{prim}" 
"{sample}_removedprimers.bed", conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" log: f"{logdir}" "AmpliGone_{Virus}.{RefID}.{sample}.log", benchmark: @@ -488,6 +537,7 @@ rule move_fastq: output: fq=f"{datadir}{wc_folder}{cln}{prdir}" "{sample}.fastq", ep=touch(f"{datadir}{wc_folder}{prim}" "{sample}_removedprimers.bed"), + # pr=touch(f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed"), resources: mem_mb=low_memory_job, shell: @@ -503,7 +553,9 @@ rule qc_clean: html=f"{datadir}{wc_folder}{qc_post}" "{sample}_fastqc.html", zip=f"{datadir}{wc_folder}{qc_post}" "{sample}_fastqc.zip", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" log: f"{logdir}QC_clean_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -513,7 +565,7 @@ rule qc_clean: mem_mb=low_memory_job, params: outdir=f"{datadir}{wc_folder}{qc_post}", - script=srcdir("scripts/fastqc_wrapper.sh"), + script=srcdir("wrappers/fastqc_wrapper.sh") if config["use-conda"] is True and config["use-singularity"] is False else "/wrappers/fastqc_wrapper.sh", shell: """ bash {params.script} {input} {params.outdir} {output.html} {output.zip} {log} @@ -538,6 +590,8 @@ rule align_before_trueconsense: index=f"{datadir}{wc_folder}{aln}{bf}" "{sample}.bam.bai", conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}Alignment_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -579,6 +633,8 @@ rule trueconsense: mincov=lambda wc: SAMPLES[wc.sample]["MIN-COVERAGE"], conda: f"{conda_envs}Consensus.yaml" + container: + f"{config['container_cache']}/viroconstrictor_consensus_{get_hash('Consensus')}.sif" log: f"{logdir}Consensus_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -632,6 +688,8 @@ rule Translate_AminoAcids: f"{datadir}{wc_folder}{amino}" "{sample}/aa.faa", conda: f"{conda_envs}ORF_analysis.yaml" + container: + f"{config['container_cache']}/viroconstrictor_orf_analysis_{get_hash('ORF_analysis')}.sif" resources: mem_mb=low_memory_job, log: @@ -680,6 +738,7 @@ def group_aminoacids_inputs(wildcards): return file_list +# this rule cannot and should not run in a separate environment/container as the sole purpose is to transfer data of the paramspace into something that can then be used in a later rule. 
rule make_pickle: output: temp(f"{datadir}sampleinfo.pkl"), @@ -703,10 +762,12 @@ rule concat_aminoacids: resources: mem_mb=low_memory_job, conda: - f"{conda_envs}ORF_analysis.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 params: - script=srcdir("scripts/group_aminoacids.py"), + script=srcdir("scripts/group_aminoacids.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/group_aminoacids.py", shell: 'python {params.script} "{input.files}" "{output}" {input.sampleinfo}' @@ -717,14 +778,16 @@ rule vcf_to_tsv: output: tsv=temp(f"{datadir}{wc_folder}{aln}{vf}" "{sample}.tsv"), conda: - f"{conda_envs}Consensus.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: config["threads"]["Index"] resources: mem_mb=low_memory_job, log: f"{logdir}" "vcf_to_tsv_{Virus}.{RefID}.{sample}.log", params: - script=srcdir("scripts/vcf_to_tsv.py"), + script=srcdir("scripts/vcf_to_tsv.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/vcf_to_tsv.py", shell: """ python {params.script} {input.vcf} {output.tsv} {wildcards.sample} >> {log} 2>&1 @@ -755,6 +818,10 @@ rule get_breadth_of_coverage: temp(f"{datadir}{wc_folder}{boc}" "{sample}.tsv"), resources: mem_mb=low_memory_job, + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: script=srcdir("scripts/boc.py"), shell: @@ -792,9 +859,11 @@ rule calculate_amplicon_cov: resources: mem_mb=low_memory_job, conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: - script=srcdir("scripts/amplicon_covs.py"), + script=srcdir("scripts/amplicon_covs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/amplicon_covs.py", shell: """ python {params.script} \ @@ -817,9 +886,11 @@ rule concat_amplicon_cov: resources: mem_mb=low_memory_job, conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: - script=srcdir("scripts/concat_amplicon_covs.py"), + script=srcdir("scripts/concat_amplicon_covs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/concat_amplicon_covs.py", shell: """ python {params.script} --output {output} --input {input} @@ -870,6 +941,8 @@ rule multiqc_report: expand(f"{res}{mqc_data}multiqc_" "{program}.txt", program="fastqc"), conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" log: f"{logdir}MultiQC_report.log", benchmark: @@ -877,7 +950,7 @@ rule multiqc_report: resources: mem_mb=high_memory_job, params: - conffile=srcdir("files/multiqc_config.yaml"), + conffile=srcdir("files/multiqc_config.yaml") if config["use-conda"] is True and config["use-singularity"] is False else "/files/multiqc_config.yaml", outdir=res, shell: """ diff --git a/ViroConstrictor/workflow/scripts/fastqc_wrapper.sh b/ViroConstrictor/workflow/wrappers/fastqc_wrapper.sh similarity index 100% rename from ViroConstrictor/workflow/scripts/fastqc_wrapper.sh rename to ViroConstrictor/workflow/wrappers/fastqc_wrapper.sh diff --git a/build_local_containers.py b/build_local_containers.py new 
file mode 100644 index 0000000..22adcac --- /dev/null +++ b/build_local_containers.py @@ -0,0 +1,42 @@ +import os +import shutil +import sys +import pathlib +import subprocess +import glob + +from ViroConstrictor.userprofile import ReadConfig + +userconf = ReadConfig(pathlib.Path("~/.ViroConstrictor_defaultprofile.ini").expanduser()) + +containerpath = None +try: + containerpath = userconf["REPRODUCTION"]["container_cache_path"] +except KeyError as e: + raise KeyError( + "The container_cache_path option is missing from the REPRODUCTION section of your configuration file." + ) from e + +if not os.path.exists(containerpath): + os.makedirs(containerpath) + +subprocess.run( + "python ./containers/build_containers.py", + shell=True, + check=True, +) + +subprocess.run( + "python containers/convert_artifact_containers_for_apptainer.py", + shell=True, + check=True, +) + +for file in glob.glob("./containers/*.sif"): + print(file) + shutil.move(file, containerpath) if containerpath else None +for file in glob.glob("./containers/*.tar"): + print(file) + os.remove(file) + +os.remove("./containers/builtcontainers.json") \ No newline at end of file diff --git a/containers/Alignment.dockerfile b/containers/Alignment.dockerfile new file mode 100644 index 0000000..e5fd7fb --- /dev/null +++ b/containers/Alignment.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Alignment.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="Sequence alignment processes and tools for the ViroConstrictor workflow." + +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/Clean.dockerfile b/containers/Clean.dockerfile new file mode 100644 index 0000000..1d41945 --- /dev/null +++ b/containers/Clean.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Clean.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="Data cleaning processes and tools for the ViroConstrictor workflow." + +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/Consensus.dockerfile b/containers/Consensus.dockerfile new file mode 100644 index 0000000..45a0380 --- /dev/null +++ b/containers/Consensus.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Consensus.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="Consensus sequence generation processes and tools for the ViroConstrictor workflow." 
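# Note: this preamble is identical across all of the image definitions in this
# directory: the tools never run as root but as a fixed-UID, no-login appuser,
# and each environment is built from the very same YAML file that the conda
# execution mode uses, keeping the two execution modes in sync.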
+ +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/ORF_analysis.dockerfile b/containers/ORF_analysis.dockerfile new file mode 100644 index 0000000..8d2486c --- /dev/null +++ b/containers/ORF_analysis.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/ORF_analysis.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="ORF analysis processes and tools for the ViroConstrictor workflow." + +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/Scripts.dockerfile b/containers/Scripts.dockerfile new file mode 100644 index 0000000..53e7e92 --- /dev/null +++ b/containers/Scripts.dockerfile @@ -0,0 +1,30 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Scripts.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ +COPY ./ViroConstrictor/workflow/scripts/ /scripts/ + +LABEL org.opencontainers.image.description="Supplementary scripts for the ViroConstrictor workflow." 
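# Unlike the other images, this one also copies workflow/scripts/ into /scripts,
# which is why the workflow rules above fall back to absolute paths such as
# /scripts/prepare_refs.py whenever use-singularity is enabled.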
+ +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/add_OCI_to_docker_engine.py b/containers/add_OCI_to_docker_engine.py new file mode 100644 index 0000000..3eac792 --- /dev/null +++ b/containers/add_OCI_to_docker_engine.py @@ -0,0 +1,21 @@ +import json +import subprocess +from typing import List + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + print("Adding OCI containers to Docker Engine from Artifact") + + builtcontainers: List = [] + with open(f"{base_path_to_container_defs}/builtcontainers.json", "r") as f: + builtcontainers: List = json.load(f) + + for container in builtcontainers: + print(f"Adding {container} to local Docker Engine") + subprocess.run( + f"docker image load -i {base_path_to_container_defs}/{container}.tar", + shell=True, + ) + + print("Done adding OCI containers to Docker Engine") diff --git a/containers/build_containers.py b/containers/build_containers.py new file mode 100644 index 0000000..5244db2 --- /dev/null +++ b/containers/build_containers.py @@ -0,0 +1,128 @@ +import json +import os +import subprocess +import tempfile + +import requests + +from ViroConstrictor import __prog__ +from ViroConstrictor.workflow.containers import fetch_hashes, upstream_registry + +base_path_to_container_defs = "./containers" +upstream_api_endpoint = ( + "https://api.github.com/orgs/RIVM-bioinformatics/packages/container/" +) +upstream_api_authtoken = os.environ.get("TOKEN") +upstream_api_responsetype = "application/vnd.github+json" +upstream_api_version = "2022-11-28" +upstream_api_headers = { + "Accept": f"{upstream_api_responsetype}", + "X-GitHub-Api-Version": f"{upstream_api_version}", + "Authorization": f"Bearer {upstream_api_authtoken}", +} + + +#TODO: break up this script into smaller functions +if __name__ == "__main__": + print("Start of container building process for ViroConstrictor") + recipe_hashes = fetch_hashes() + + builtcontainers = [] + for recipe, VersionHash in recipe_hashes.items(): + # strip the name of the recipe to only get the name of the environment + recipe_basename = os.path.basename(recipe).replace(".yaml", "") + container_basename = f"{__prog__}_{recipe_basename}".lower() + + associated_container_dock_file = os.path.join( + base_path_to_container_defs, f"{recipe_basename}.dockerfile" + ) + upstream_registry_url = f"{upstream_registry}/{recipe_basename}:{VersionHash}" + upstream_existing_containers = ( + f"{upstream_api_endpoint}{__prog__}_{recipe_basename}/versions" + ) + print( + f"Checking if container '{container_basename}' with hash '{VersionHash}' exists in the upstream registry" + ) + json_response = requests.get( + upstream_existing_containers, headers=upstream_api_headers + ).json() + + tags = [] + + # if the container exists at all in the upstream registry, the json response will be a list. + # If the container does not exist, the json response will be a dict with a message that the container does not exist. + # You can therefore check if the json response is a list or a dict to see if the container exists or not. 
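# Roughly the two response shapes being distinguished (payloads illustrative,
# not verbatim GitHub API output):
exists = [{"metadata": {"container": {"tags": ["0a1b2c3"]}}}]  # versions found -> list
missing = {"message": "Package not found."}                    # unknown container -> dict
assert isinstance(exists, list) and not isinstance(missing, list)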
+ if isinstance(json_response, list): + tags = [ + version["metadata"]["container"]["tags"] for version in json_response + ] + # flatten the list of tags + tags = [tag for sublist in tags for tag in sublist] + + if VersionHash in tags: + print( + f"Container '{container_basename}' with hash '{VersionHash}' already exists in the upstream registry" + ) + continue + + print( + f"Container '{container_basename}' with hash '{VersionHash}' does not exist in the upstream registry" + ) + print( + f"Starting Docker build process for container '{container_basename}:{VersionHash}'" + ) + + # create a temporary file to write the container definition to, copy the contents of {recipe_basename}.dockerfile to it and then append the labels section to it including the version hash + # then use the temporary file as the container definition file for the docker build process + # the docker build process will build the container file also in a temporary directory + # after the container is built, the built container will be saved in the docker artifact database (local). + # This is necessary to transform the container from docker format to apptainer format in a separate script. + # the container file will not be pushed to the upstream registry yet, this will be done in a separate script after all containers have been built and tested. + with tempfile.NamedTemporaryFile( + mode="w", delete=False + ) as tmp, tempfile.TemporaryDirectory() as tmpdir: + with open(associated_container_dock_file, "r") as f: + tmp.write(f.read()) + tmp.write( + f""" + +LABEL Author="RIVM-bioinformatics team" +LABEL Maintainer="RIVM-bioinformatics team" +LABEL Associated_pipeline="{__prog__}" +LABEL version="{VersionHash}" +LABEL org.opencontainers.image.authors="ids-bioinformatics@rivm.nl" +LABEL org.opencontainers.image.source=https://github.com/RIVM-bioinformatics/{__prog__} + + """ + ) + tmp.flush() # flush the temporary file to make sure the contents are written to disk + subprocess.run( + [ + "docker", + "build", + "-t", + f"{container_basename}:{VersionHash}", + "-f", + f"{tmp.name}", + ".", + "--network", + "host", + "--no-cache", + ], + check=True, + ) + # move the container file to the current working directory + subprocess.run( + [ + "docker", + "save", + "-o", + f"{base_path_to_container_defs}/{container_basename}:{VersionHash}.tar", + f"{container_basename}:{VersionHash}", + ] + ) + + builtcontainers.append(f"{container_basename}:{VersionHash}") + + with open(f"{base_path_to_container_defs}/builtcontainers.json", "w") as f: + json.dump(builtcontainers, f, indent=4) diff --git a/containers/convert_artifact_containers_for_apptainer.py b/containers/convert_artifact_containers_for_apptainer.py new file mode 100644 index 0000000..0af5568 --- /dev/null +++ b/containers/convert_artifact_containers_for_apptainer.py @@ -0,0 +1,30 @@ +import json +import shutil +import subprocess +from typing import List + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + print("Renaming OCI containers and converting to Apptainer .sif format") + + builtcontainers: List = [] + with open(f"{base_path_to_container_defs}/builtcontainers.json", "r") as f: + builtcontainers: List = json.load(f) + + builtcontainers_trimmed = [ + container.replace(":", "_") for container in builtcontainers + ] + + for original_name, trimmed_name in zip(builtcontainers, builtcontainers_trimmed): + print(f"Renaming {original_name} to {trimmed_name}") + shutil.move( + f"{base_path_to_container_defs}/{original_name}.tar", + 
f"{base_path_to_container_defs}/{trimmed_name}.tar", + ) + trimmed_name_sif = trimmed_name.replace(":", "_") + print(f"Converting {original_name} to Apptainer .sif format") + subprocess.run( + f"apptainer build {base_path_to_container_defs}/{trimmed_name_sif}.sif docker-archive://{base_path_to_container_defs}/{trimmed_name}.tar", + shell=True, + ) diff --git a/containers/pull_published_containers.py b/containers/pull_published_containers.py new file mode 100644 index 0000000..ce3b920 --- /dev/null +++ b/containers/pull_published_containers.py @@ -0,0 +1,26 @@ +import os +import shutil +import sys + +from ViroConstrictor.workflow.containers import download_containers + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + config = {"container_cache": f"{os.getcwd()}/test_containers", "dryrun": False} + os.makedirs(config["container_cache"], exist_ok=True) + + # move .sif files from ./containers to ./test_containers + for file in os.listdir(base_path_to_container_defs): + if file.endswith(".sif"): + shutil.move( + f"{base_path_to_container_defs}/{file}", + f"{config['container_cache']}/{file}", + ) + + download_status = download_containers(config, verbose=True) + if download_status == 1: + print( + "Failed to download all necessary containers. Please check the logs for more information." + ) + sys.exit(1) diff --git a/containers/tag_and_push_containers.py b/containers/tag_and_push_containers.py new file mode 100644 index 0000000..5ffb5e0 --- /dev/null +++ b/containers/tag_and_push_containers.py @@ -0,0 +1,20 @@ +import json +import subprocess +from typing import List + +from ViroConstrictor.workflow.containers import upstream_registry + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + + built_containers: List[str] = [] + with open(f"{base_path_to_container_defs}/builtcontainers.json", "r") as f: + built_containers: List[str] = json.load(f) + + for container in built_containers: + print(f"Tagging and pushing {container}") + subprocess.run( + f"docker tag {container} {upstream_registry}/{container}", shell=True + ) + subprocess.run(f"docker push {upstream_registry}/{container}", shell=True) diff --git a/env.yml b/env.yml index 3ebba74..31147b1 100644 --- a/env.yml +++ b/env.yml @@ -5,7 +5,7 @@ channels: - nodefaults dependencies: - conda-forge::python>=3.10 - - conda-forge::mamba>=1.0.0 + - conda-forge::mamba>=1.0.0,<2.0.0 - conda-forge::drmaa==0.7.9 - conda-forge::biopython==1.81 - conda-forge::pandas==2.0.* @@ -16,5 +16,6 @@ dependencies: - conda-forge::python-magic==0.4.27 - conda-forge::rich==13.* - conda-forge::pip + - conda-forge::pyopenssl==24.* - bioconda::snakemake-minimal==7.25.* - bioconda::aminoextract==0.3.1 \ No newline at end of file diff --git a/setup.py b/setup.py index fa57f3b..a678acc 100644 --- a/setup.py +++ b/setup.py @@ -64,17 +64,19 @@ "ViroConstrictor": [ "workflow/envs/*", "workflow/scripts/*", + "workflow/wrappers/*", "workflow/files/*", "workflow/scripts/match_ref/*", + "workflow/*", ] }, install_requires=[ - "urllib3", - "biopython>=1.79", + "urllib3==1.26.*", + "biopython==1.81", "drmaa==0.7.9", - "fpdf2", + "fpdf2==2.5.1", "pandas>=1.4.2", - "openpyxl", + "openpyxl==3.1.*", "pyyaml==6.0", "rich==13.*", "AminoExtract==0.3.1",