From fabbd8a23a6525204898e5ea178ee9d558829e1d Mon Sep 17 00:00:00 2001 From: Brian Loyal Date: Fri, 3 Jan 2025 14:24:42 -0600 Subject: [PATCH] v2.12.0 --- .gitignore | 2 + CHANGELOG.md | 12 + assets/containers/alphafold-data/Dockerfile | 68 ++++-- .../alphafold-data/update_locations.py | 52 ++++- .../containers/alphafold-predict/Dockerfile | 83 ++++--- assets/containers/mmseqs2/Dockerfile | 4 + assets/containers/protein-utils/Dockerfile | 28 ++- .../protein-utils/code/resources.json | 1 + .../protein-utils/code/seq_info.json | 1 + assets/containers/protein-utils/code/setup.py | 2 +- .../src/putils/check_and_validate_inputs.py | 45 ++-- .../code/src/putils/split_fasta.py | 221 ------------------ .../containers/protein-utils/requirements.txt | 7 - assets/data/swissprot.txt | 1 + assets/data/uniref100.txt | 1 + assets/data/uniref50.txt | 1 + assets/workflows/abodybuilder3/main.nf | 3 +- .../workflows/abodybuilder3/nextflow.config | 2 +- .../workflows/alphafold2-multimer/README.md | 33 +-- .../alphafold2-multimer/build_containers.sh | 27 +++ .../workflows/alphafold2-multimer/config.yaml | 3 - assets/workflows/alphafold2-multimer/main.nf | 209 ++++++++++------- .../alphafold2-multimer/nextflow.config | 6 +- .../parameter-template.json | 6 + .../workflows/alphafold2-multimer/params.json | 3 + .../workflows/alphafold2-multimer/searches.nf | 160 +++++++------ .../workflows/alphafold2-multimer/unpack.nf | 20 +- assets/workflows/mmseqs2/README.md | 5 + assets/workflows/mmseqs2/config.yaml | 15 ++ assets/workflows/mmseqs2/main.nf | 85 +++++++ assets/workflows/mmseqs2/nextflow.config | 12 + build/buildspec/buildspec_data.yaml | 8 +- scripts/testrun.sh | 6 +- 33 files changed, 590 insertions(+), 542 deletions(-) create mode 100644 assets/containers/mmseqs2/Dockerfile create mode 100644 assets/containers/protein-utils/code/resources.json create mode 100644 assets/containers/protein-utils/code/seq_info.json delete mode 100644 assets/containers/protein-utils/code/src/putils/split_fasta.py delete mode 100644 assets/containers/protein-utils/requirements.txt create mode 100644 assets/data/swissprot.txt create mode 100644 assets/data/uniref100.txt create mode 100644 assets/data/uniref50.txt create mode 100755 assets/workflows/alphafold2-multimer/build_containers.sh create mode 100644 assets/workflows/alphafold2-multimer/parameter-template.json create mode 100644 assets/workflows/alphafold2-multimer/params.json create mode 100644 assets/workflows/mmseqs2/README.md create mode 100644 assets/workflows/mmseqs2/config.yaml create mode 100644 assets/workflows/mmseqs2/main.nf create mode 100644 assets/workflows/mmseqs2/nextflow.config diff --git a/.gitignore b/.gitignore index 46fe94f..ad4468b 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ tmp/ .nextflow* stack-outputs.json test_data +linter-rules-for-nextflow +build/cloudformation/packaged.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 0acd5fc..50b9c76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 --- +## [2.12.0] - 2025-01-03 + +### 2.12.0 Added + +- Added MMseqs2 workflow + +### 2.12.0 Changed + +- Updated AlphaFold2-Multimer workflow to support multiple input fasta files + +--- + ## [2.11.0] - 2024-12-18 ### 2.11.0 Added diff --git a/assets/containers/alphafold-data/Dockerfile b/assets/containers/alphafold-data/Dockerfile index 19970db..4eaeca7 100644 --- a/assets/containers/alphafold-data/Dockerfile +++ b/assets/containers/alphafold-data/Dockerfile @@ 
-1,7 +1,7 @@ # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 -FROM public.ecr.aws/amazonlinux/amazonlinux:latest as build +FROM public.ecr.aws/amazonlinux/amazonlinux:latest AS build RUN yum upgrade -y \ && yum install -y \ @@ -19,26 +19,52 @@ RUN yum upgrade -y \ wget \ zstd \ && yum clean all \ - && rm -rf /var/cache/yum \ - && pushd /tmp \ - && git clone https://github.com/soedinglab/hh-suite.git \ - && cd hh-suite && mkdir build && cd build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ - && make -j 4 && make install \ - && popd \ - && pushd /tmp \ - && wget http://msa.sbc.su.se/downloads/kalign/current.tar.gz --no-check-certificate \ - && mkdir -p /tmp/kalign2/build \ - && tar -xvzf current.tar.gz -C /tmp/kalign2 \ - && pushd /tmp/kalign2 \ - && ./configure \ - && make && make install \ - && popd \ - && rm -rf /tmp/kalign2 \ - && popd \ - && mkdir -p /tmp/hmmer && wget -O hmmer.tar.gz http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz \ - && tar xvzf hmmer.tar.gz -C /tmp/hmmer \ - && pushd /tmp/hmmer/hmmer-* \ + && rm -rf /var/cache/yum + +# ADD hh-suite.tar.gz /tmp/hh-suite +# RUN pushd /tmp/hh-suite \ +# && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite . \ +# && make && make install \ +# && popd +RUN pushd /tmp && \ +git clone https://github.com/soedinglab/hh-suite.git && \ +cd hh-suite && mkdir build && cd build && \ +cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. && \ +make -j 4 && make install && \ +popd + +# ADD kalign.tar.gz /tmp/kalign-3.4.3 +# RUN pushd /tmp/kalign2 \ +# && ./configure \ +# && make && make install \ +# && popd +# RUN pushd /tmp && \ +# wget https://github.com/TimoLassmann/kalign/archive/refs/tags/v3.4.0.tar.gz && \ +# tar -xvzf v3.4.0.tar.gz && \ +# cd kalign-3.4.0 && \ +# mkdir build && \ +# cd build && \ +# cmake3 .. 
&& \ +# make -j 4 && make test && \ +# make install && \ +# popd + +# Compile kalign2 from source +RUN pushd /tmp && \ +wget http://msa.sbc.su.se/downloads/kalign/current.tar.gz --no-check-certificate \ +&& mkdir -p /tmp/kalign2/build \ +&& tar -xvzf current.tar.gz -C /tmp/kalign2 \ +&& pushd /tmp/kalign2 \ +&& ./configure \ +&& make && make install \ +&& popd \ +&& rm -rf /tmp/kalign2 && \ +popd + +# ADD hmmer.tar.gz /tmp/hmmer +RUN mkdir -p /tmp/hmmer && wget -O hmmer.tar.gz http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz \ +&& tar xvzf hmmer.tar.gz -C /tmp/hmmer +RUN pushd /tmp/hmmer/hmmer-* \ && ./configure \ && make && make install \ && popd diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index 35cfa43..8a76baf 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -8,22 +8,50 @@ from alphafold.data.pipeline_multimer import int_id_to_str_id - -def update_locations(target_dir, file_list): +# Example file_lists: +# +# [4ZQK.1_uniref90_hits.sto 4ZQK.2_uniref90_hits.sto] +# [4ZQK.2_mgnify_hits.sto 4ZQK.1_mgnify_hits.sto] +# [4ZQK.1_uniprot_hits.sto 4ZQK.2_uniprot_hits.sto] +# [4ZQK.1_bfd_hits.a3m 4ZQK.2_bfd_hits.a3m] +# [4ZQK.1_pdb_hits.sto 4ZQK.2_pdb_hits.sto] +# or +# 4ZQK_simple.1_uniref90_hits.sto 4ZQK_simple.2_uniref90_hits.sto +# 4ZQK_simple.1_mgnify_hits.sto 4ZQK_simple.2_mgnify_hits.sto +# 4ZQK_simple.2_uniprot_hits.sto 4ZQK_simple.1_uniprot_hits.sto +# 4ZQK_simple.2_bfd_hits.a3m 4ZQK_simple.1_bfd_hits.a3m +# 4ZQK_simple.1_pdb_hits.sto 4ZQK_simple.2_pdb_hits.sto + +def strip_suffix_str(s: str, suffix: str): + if s.endswith(suffix): + return s[:-len(suffix)] + return None + +# target_dir = msa +def update_locations(target_dir, strip_suffix, file_list): for filename in file_list: - index, _null, outfile = filename.partition("_") - index = index.split(".")[1] + # filename = 4ZQK_simple.1_uniref90_hits.sto + # strip_suffix = _uniref90_hits.sto + + stripped_filename = strip_suffix_str(filename, strip_suffix) + if stripped_filename == None: + raise Exception(f"Suffix {strip_suffix} not in {filename}") - chain = int_id_to_str_id(int(index)) - print(f'file: {filename} index: {index} chain: {chain} outfile:{outfile}') - chain = os.path.join(target_dir, chain) - path = pathlib.Path(chain) + # stripped_filename = 4ZQK_simple.1 + record_inx = int(stripped_filename[-1]) # 1 + outfile = strip_suffix[1:] # uniref90_hits.sto + chain = int_id_to_str_id(record_inx) - if not path.exists(): - path.mkdir(parents=True) - shutil.copy(filename, os.path.join(chain, outfile), follow_symlinks=True) + chain_dir_path = pathlib.Path(os.path.join(target_dir, chain)) + + if not chain_dir_path.exists(): + chain_dir_path.mkdir(parents=True) + target = os.path.join(chain_dir_path, outfile) + print(f"COPY {filename} -> {target}") + shutil.copy(filename, target, follow_symlinks=True) + if __name__ == "__main__": - update_locations(sys.argv[1], sys.argv[2:]) + update_locations(sys.argv[1], sys.argv[2], sys.argv[3:]) diff --git a/assets/containers/alphafold-predict/Dockerfile b/assets/containers/alphafold-predict/Dockerfile index 30644af..cc483f5 100644 --- a/assets/containers/alphafold-predict/Dockerfile +++ b/assets/containers/alphafold-predict/Dockerfile @@ -3,10 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 # ARG CUDA=11.1.1 -ARG CUDA=12.2.2 -# ARG ALPHAFOLD2_VERSION=v2.3.2 -ARG ALPHAFOLD2_VERSION=f251de6613cb478207c732bf9627b1e853c99c2f -FROM 
nvcr.io/nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu20.04 +ARG CUDA=11.6.0 +ARG ALPHAFOLD2_VERSION=v2.3.2 +FROM nvcr.io/nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu18.04 # FROM directive resets ARGS, so we specify again (the value is retained if # previously set). ARG CUDA @@ -15,19 +14,18 @@ ARG ALPHAFOLD2_VERSION # Use bash to support string substitution. SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - build-essential \ - cmake \ - cuda-command-line-tools-$(cut -f1,2 -d- <<< ${CUDA//./-}) \ - git \ - hmmer \ - kalign \ - tzdata \ - wget \ - awscli \ - jq \ - unzip \ +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + build-essential \ + cmake \ + cuda-command-line-tools-$(cut -f1,2 -d- <<< ${CUDA//./-}) \ + git \ + hmmer \ + kalign \ + tzdata \ + wget \ + awscli \ + jq \ && rm -rf /var/lib/apt/lists/* \ && apt-get autoremove -y \ && apt-get clean @@ -36,7 +34,7 @@ RUN apt-get update \ RUN git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ && mkdir /tmp/hh-suite/build \ && pushd /tmp/hh-suite/build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ + && cmake -DHAVE_AVX2=1 -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ && make -j 4 && make install \ && ln -s /opt/hhsuite/bin/* /usr/bin \ && popd \ @@ -50,18 +48,18 @@ RUN wget -q -P /tmp \ # Install conda packages. ENV PATH="/opt/conda/bin:$PATH" -ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH" -RUN conda install -qy conda==24.5.0 pip python=3.11 \ - && conda install -y -c nvidia/label/cuda-${CUDA} cuda \ - && conda install -y -c conda-forge openmm=8.0.0 pdbfixer \ - && conda clean --all --force-pkgs-dirs --yes +# RUN conda install -qy conda==4.13.0 +# && conda install -y -c conda-forge +RUN conda install -y -c conda-forge \ + openmm=7.5.1 \ + cudatoolkit=${CUDA_VERSION} \ + pdbfixer=1.7 \ + pip \ + python=3.9.16 \ + && conda clean --all --force-pkgs-dirs --yes -# Install AlphaFold -RUN wget -q -P /tmp \ - https://github.com/google-deepmind/alphafold/archive/${ALPHAFOLD2_VERSION}.zip \ - && mkdir -p /app/alphafold \ - && unzip /tmp/f251de6613cb478207c732bf9627b1e853c99c2f.zip -d /tmp \ - && mv /tmp/alphafold-f251de6613cb478207c732bf9627b1e853c99c2f/* /app/alphafold + +RUN git clone --branch ${ALPHAFOLD2_VERSION} --depth 1 https://github.com/deepmind/alphafold.git /app/alphafold RUN wget -q -P /app/alphafold/alphafold/common/ \ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt @@ -70,20 +68,33 @@ RUN wget -q -P /app/alphafold/alphafold/common/ \ RUN pip3 install --upgrade pip --no-cache-dir \ && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \ && pip3 install --upgrade --no-cache-dir \ - jax==0.4.26 \ - jaxlib==0.4.26+cuda12.cudnn89 \ + jax==0.3.25 \ + jaxlib==0.3.25+cuda11.cudnn805 \ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html RUN pip3 install --upgrade --no-cache-dir \ - matplotlib==3.9.2 + matplotlib==3.6.3 \ + numpy==1.24.3 + +# Apply OpenMM patch. +WORKDIR /opt/conda/lib/python3.9/site-packages +RUN patch -p0 < /app/alphafold/docker/openmm.patch # Add SETUID bit to the ldconfig binary so that non-root users can run it. RUN chmod u+s /sbin/ldconfig.real -# Currently needed to avoid undefined_symbol error. 
-RUN ln -sf /usr/lib/x86_64-linux-gnu/libffi.so.7 /opt/conda/lib/libffi.so.7 - +# We need to run `ldconfig` first to ensure GPUs are visible, due to some quirk +# with Debian. See https://github.com/NVIDIA/nvidia-docker/issues/1399 for +# details. +# ENTRYPOINT does not support easily running multiple commands, so instead we +# write a shell script to wrap them up. WORKDIR /app/alphafold COPY predict.py /app/alphafold/ +# COPY run.sh /app/alphafold/run.sh +# RUN echo $'#!/bin/bash\n\ +# ldconfig\n\ +# python /app/alphafold/run_alphafold.py "$@"' > /app/run_alphafold.sh \ +# && chmod +x /app/run_alphafold.sh /app/alphafold/run.sh -ENTRYPOINT [] +# ENTRYPOINT ["bash", "/app/alphafold/run.sh"] +ENTRYPOINT ["bash"] diff --git a/assets/containers/mmseqs2/Dockerfile b/assets/containers/mmseqs2/Dockerfile new file mode 100644 index 0000000..4e862f4 --- /dev/null +++ b/assets/containers/mmseqs2/Dockerfile @@ -0,0 +1,4 @@ +FROM ghcr.io/soedinglab/mmseqs2:master-cuda12 + +# Run with /usr/local/bin/entrypoint +ENTRYPOINT [] \ No newline at end of file diff --git a/assets/containers/protein-utils/Dockerfile b/assets/containers/protein-utils/Dockerfile index 4ccee87..32eb420 100644 --- a/assets/containers/protein-utils/Dockerfile +++ b/assets/containers/protein-utils/Dockerfile @@ -1,27 +1,25 @@ -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 as build +FROM public.ecr.aws/amazonlinux/amazonlinux:2 as build -WORKDIR /home - -COPY code /home/putils -COPY requirements.txt /home +COPY code /tmp/putils # Install python and other dependencies -RUN yum update \ +RUN amazon-linux-extras install python3.8 \ && yum upgrade -y \ && yum install -y \ - python3.11 \ unzip-6.0 \ - wget-1.21.3 \ - && python3.11 -m venv /opt/venv \ + wget-1.14 \ + && python3.8 -m venv /opt/venv \ && source /opt/venv/bin/activate \ - && pip install -U pip \ - && pip install -q --no-cache-dir -r /home/requirements.txt \ - && pip install -q --no-cache-dir /home/putils \ - && yum autoremove -y \ + && pip install -q --no-cache-dir \ + pandas==2.0.0 \ + numpy==1.24.2 \ + biopython==1.81 \ + /tmp/putils \ && yum clean all \ - && rm -rf /var/cache/yum + && rm -rf /var/cache/yum \ + && rm -rf /tmp/putils ENV VIRTUAL_ENV="/opt/venv" ENV PATH="$VIRTUAL_ENV/bin:$PATH" -ENTRYPOINT [] \ No newline at end of file +WORKDIR /home \ No newline at end of file diff --git a/assets/containers/protein-utils/code/resources.json b/assets/containers/protein-utils/code/resources.json new file mode 100644 index 0000000..a69de4e --- /dev/null +++ b/assets/containers/protein-utils/code/resources.json @@ -0,0 +1 @@ +{"id": "3D06", "seq_length": 200, "seq_count": 1, "template_search_resources": {"vcpu": 2, "memory": "4 GiB", "gpu": "False"}, "feature_gen_resources": {"vcpu": 2, "memory": "4 GiB", "gpu": "False"}, "predict_resources": {"vcpu": 8, "memory": "32 GiB", "gpu": "True"}, "uniref90_msa_resources": {"vcpu": 8, "memory": "16 GiB", "gpu": "False"}, "mgnify_msa_resources": {"vcpu": 8, "memory": "16 GiB", "gpu": "False"}, "bfd_msa_resources": {"vcpu": 16, "memory": "32 GiB", "gpu": "False"}} \ No newline at end of file diff --git a/assets/containers/protein-utils/code/seq_info.json b/assets/containers/protein-utils/code/seq_info.json new file mode 100644 index 0000000..9104a31 --- /dev/null +++ b/assets/containers/protein-utils/code/seq_info.json @@ -0,0 +1 @@ +{"id": "2022", "seq_length": "100", "seq_count": "1"} \ No newline at end of file diff --git a/assets/containers/protein-utils/code/setup.py b/assets/containers/protein-utils/code/setup.py index 
de9ab47..852f912 100644 --- a/assets/containers/protein-utils/code/setup.py +++ b/assets/containers/protein-utils/code/setup.py @@ -1,5 +1,5 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 +# SPDX-License-Identifier: Apache-2.0 from setuptools import setup, find_packages diff --git a/assets/containers/protein-utils/code/src/putils/check_and_validate_inputs.py b/assets/containers/protein-utils/code/src/putils/check_and_validate_inputs.py index 3828a00..fd735d4 100644 --- a/assets/containers/protein-utils/code/src/putils/check_and_validate_inputs.py +++ b/assets/containers/protein-utils/code/src/putils/check_and_validate_inputs.py @@ -1,9 +1,7 @@ import argparse -import logging -# from numpy.polynomial import Polynomial from Bio import SeqIO import json -import re +import logging logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", @@ -11,26 +9,22 @@ level=logging.INFO, ) + def write_seq_file(seq, filename): with open(filename, "w") as out_fh: SeqIO.write(seq, out_fh, "fasta") -def split_and_get_sequence_metrics(target_id, seq_list, output_prefix): + +def split_and_get_sequence_metrics(seq_list, output_prefix="input"): seq_length = 0 seq_count = 0 total_length = 0 - if output_prefix: - output_prefix = output_prefix + "_" - else: - output_prefix = "input_" - for seq_record in seq_list: seq_length += len(seq_record.seq) seq_count += 1 - # id = seq_record.id - write_seq_file(seq_list, "inputs.fasta") + write_seq_file(seq_list, f"{output_prefix}.fasta") total_length += seq_length return seq_count, total_length @@ -40,46 +34,37 @@ def check_inputs(target_id, fasta_path, output_prefix): with open(fasta_path, "r") as in_fh: seq_list = list(SeqIO.parse(in_fh, "fasta")) - seq_count, total_length = split_and_get_sequence_metrics(target_id, seq_list, output_prefix) + seq_count, total_length = split_and_get_sequence_metrics(seq_list, output_prefix) seq_info = { "target_id": str(target_id), "total_length": str(total_length), - "seq_count": str(seq_count) + "seq_count": str(seq_count), } - # write the sequence info to a json file + # write the sequence info to a json file with open("seq_info.json", "w") as out_fh: json.dump(seq_info, out_fh) - # return seq_info - # return f'{total_length}\n{seq_count}\n' return total_length if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument( - "--target_id", - help="The ID of the target", - type=str, - required=True - ) + "--target_id", help="The ID of the target", type=str, required=True + ) parser.add_argument( - "--fasta_path", - help="Path to input FASTA file", - type=str, - required=True - ) + "--fasta_path", help="Path to input FASTA file", type=str, required=True + ) parser.add_argument( "--output_prefix", help="(Optional) file name prefix for the sequence files", - default=None, + default="input", type=str, - required=False + required=False, ) args = parser.parse_args() output = check_inputs(args.target_id, args.fasta_path, args.output_prefix) - print(output) + print(f"Total length is {output}") diff --git a/assets/containers/protein-utils/code/src/putils/split_fasta.py b/assets/containers/protein-utils/code/src/putils/split_fasta.py deleted file mode 100644 index a5a8396..0000000 --- a/assets/containers/protein-utils/code/src/putils/split_fasta.py +++ /dev/null @@ -1,221 +0,0 @@ -import argparse -import logging -import os -import pyfastx -import random -import shutil -import tempfile -import tqdm -from urllib.parse import urlparse - 
-logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) - - -def parse_args(): - """Parse the arguments.""" - logging.info("Parsing arguments") - parser = argparse.ArgumentParser() - - parser.add_argument( - "source", - type=str, - help="Path to input .fasta or .fasta.gz file, e.g. s3://myfasta.fa, http://myfasta.fasta.gz, ~/myfasta.fasta, etc", - ) - - parser.add_argument( - "--max_records_per_partition", - type=int, - default=2000000, - help="Max number of sequence records per csv partition", - ) - parser.add_argument( - "--output_dir", - type=str, - default=os.getcwd(), - help="Output dir for processed files", - ) - parser.add_argument( - "--save_csv", - "-c", - action="store_true", - default=False, - help="Save csv files to output dir?", - ) - parser.add_argument( - "-f", - "--save_fasta", - action="store_true", - default=False, - help="Save FASTA file to output dir?", - ) - parser.add_argument( - "--shuffle", - "-s", - action="store_true", - default=True, - help="Shuffle the records in each csv partition?", - ) - - args, _ = parser.parse_known_args() - return args - - -def main(args): - """Transform fasta file into dataset""" - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - - tmp_dir = tempfile.TemporaryDirectory(dir=os.getcwd()) - input_file = os.path.join(tmp_dir.name, "input.fa") - input_path = download(args.source, input_file) - - output_path = split_fasta( - fasta_file=input_path, - output_dir=args.output_dir, - max_records_per_partition=args.max_records_per_partition, - shuffle=args.shuffle, - save_fasta=args.save_fasta, - save_csv=args.save_csv, - ) - - tmp_dir.cleanup() - logging.info(f"Files saved to {args.output_dir}") - - return output_path - - -def download(source: str, filename: str) -> str: - output_dir = os.path.dirname(filename) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - if source.startswith("s3"): - import boto3 - - logging.info(f"Downloading {source} to {filename}") - s3 = boto3.client("s3") - parsed = urlparse(source, allow_fragments=False) - bucket = parsed.netloc - key = parsed.path[1:] - total = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - tqdm_params = { - "desc": source, - "total": total, - "miniters": 1, - "unit": "B", - "unit_scale": True, - "unit_divisor": 1024, - } - with tqdm.tqdm(**tqdm_params) as pb: - s3.download_file( - parsed.netloc, - parsed.path[1:], - filename, - Callback=lambda bytes_transferred: pb.update(bytes_transferred), - ) - elif source.startswith("http"): - import requests - - logging.info(f"Downloading {source} to {filename}") - - with open(filename, "wb") as f: - with requests.get(source, stream=True, timeout=60) as r: - r.raise_for_status() - total = int(r.headers.get("content-length", 0)) - - tqdm_params = { - "desc": source, - "total": total, - "miniters": 1, - "unit": "B", - "unit_scale": True, - "unit_divisor": 1024, - } - with tqdm.tqdm(**tqdm_params) as pb: - for chunk in r.iter_content(chunk_size=8192): - pb.update(len(chunk)) - f.write(chunk) - elif os.path.isfile(source): - logging.info(f"Copying {source} to {filename}") - shutil.copyfile(source, filename) - else: - raise ValueError(f"Invalid source: {source}") - - return filename - - -def split_fasta( - fasta_file: str, - output_dir: str = os.getcwd(), - max_records_per_partition=2000000, - shuffle=True, - save_fasta: bool = True, - save_csv: bool = False, -) -> list: - """Split a .fasta or .fasta.gz file into multiple files.""" - - # if 
save_fasta and not os.path.exists(os.path.join(output_dir, "fasta")): - # os.makedirs(os.path.join(output_dir, "fasta")) - - # if save_csv and not os.path.exists(os.path.join(output_dir, "csv")): - # os.makedirs(os.path.join(output_dir, "csv")) - - print(f"Splitting {fasta_file}") - fasta_list = [] - fasta_idx = 0 - - for i, seq in tqdm.tqdm( - enumerate( - pyfastx.Fasta(fasta_file, build_index=False, uppercase=True, full_name=True) - ) - ): - fasta_list.append(seq) - - if (i + 1) % max_records_per_partition == 0: - if shuffle: - random.shuffle(fasta_list) - fasta_idx = int(i / max_records_per_partition) - if save_fasta: - write_seq_record_to_fasta(fasta_list, output_dir, fasta_idx) - if save_csv: - write_seq_record_to_csv(fasta_list, output_dir, fasta_idx) - fasta_list = [] - else: - if save_fasta: - write_seq_record_to_fasta(fasta_list, output_dir, fasta_idx + 1) - if save_csv: - write_seq_record_to_csv(fasta_list, output_dir, fasta_idx + 1) - return output_dir - - -def write_seq_record_to_fasta(content_list, output_dir, index): - output_path = os.path.join( - output_dir, - f"x{str(index).rjust(3, '0')}.fasta", - ) - logging.info(f"Writing {output_path}") - - with open(output_path, "w") as f: - for record in content_list: - f.write(f">{record[0]}\n{record[1]}\n") - return output_path - - -def write_seq_record_to_csv(content_list, output_dir, index): - output_path = os.path.join(output_dir, f"x{str(index).rjust(3, '0')}.csv") - logging.info(f"Writing {output_path}") - with open(output_path, "w") as f: - f.write(f"id,text\n") - for record in content_list: - f.write(f"{record[0].replace(',','')},{record[1].replace(',','')}\n") - return output_path - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/assets/containers/protein-utils/requirements.txt b/assets/containers/protein-utils/requirements.txt deleted file mode 100644 index 2f03224..0000000 --- a/assets/containers/protein-utils/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -biopython -biotite -jsonlines -numpy -pandas -pyfastx -tqdm \ No newline at end of file diff --git a/assets/data/swissprot.txt b/assets/data/swissprot.txt new file mode 100644 index 0000000..b53ca11 --- /dev/null +++ b/assets/data/swissprot.txt @@ -0,0 +1 @@ +https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz diff --git a/assets/data/uniref100.txt b/assets/data/uniref100.txt new file mode 100644 index 0000000..5ca09c0 --- /dev/null +++ b/assets/data/uniref100.txt @@ -0,0 +1 @@ +https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz diff --git a/assets/data/uniref50.txt b/assets/data/uniref50.txt new file mode 100644 index 0000000..a747a35 --- /dev/null +++ b/assets/data/uniref50.txt @@ -0,0 +1 @@ +https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz diff --git a/assets/workflows/abodybuilder3/main.nf b/assets/workflows/abodybuilder3/main.nf index 488745e..dffa3c2 100644 --- a/assets/workflows/abodybuilder3/main.nf +++ b/assets/workflows/abodybuilder3/main.nf @@ -37,9 +37,8 @@ process ABodyBuilder3Task { """ set -euxo pipefail mkdir output - tar -xzvf $model_parameters /opt/conda/bin/python /home/scripts/abb3_inference.py $fasta_path \ - --model_path plddt-loss/best_second_stage.ckpt + --model_path $model_parameters """ } diff --git a/assets/workflows/abodybuilder3/nextflow.config b/assets/workflows/abodybuilder3/nextflow.config index ce78f68..2621110 100644 --- a/assets/workflows/abodybuilder3/nextflow.config +++ 
b/assets/workflows/abodybuilder3/nextflow.config @@ -1,5 +1,5 @@ params { - model_parameters = 's3://{{S3_BUCKET_NAME}}/ref-data/abodybuilder3_parameters/output.tar.gz' + model_parameters = 's3://{{S3_BUCKET_NAME}}/ref-data/abodybuilder3_parameters/plddt-loss/best_second_stage.ckpt' } process { diff --git a/assets/workflows/alphafold2-multimer/README.md b/assets/workflows/alphafold2-multimer/README.md index fce0b4f..c42d638 100644 --- a/assets/workflows/alphafold2-multimer/README.md +++ b/assets/workflows/alphafold2-multimer/README.md @@ -1,56 +1,38 @@ # AlphaFold Multimer -This repository helps you set up and run AlphaFold Multimer on AWS HealthOmics. At the end of the configuration, you should be able to run a full end-to-end inference. +This repository helps you set up and run AlphaFold Multimer on AWS HealthOmics. -AlphaFold-Multimer requires several steps: at a high level they bundle into: - -1. Download and prepare the data -2. Multisequence alignment (MSA) -3. Inference - -Traditionally, the download and prepare data stage will download `tar.gz` files and unpack. This workflow has a series of optimizations that are designed to improve data staging times and reduce the time and cost of inference while improving scale (>2500 residues). All corresponding reference data is hosted by AWS HealthOmics, so there is no charge to customers to host that data. +The setup steps below assume you are starting from scratch and prefer to use the command line. This repository will also have 1-click build capabilities at the root of the repo. ## Running a workflow -Pick your favorite small fasta file to run your fist end-to-end test. The following command can be done from the terminal or you can navigate to the AWS console. - -### Inputs - -`target_id`: The ID of the target you wish to predict -`fasta_path`: S3 URI to a single FASTA file that is in multi-FASTA format. Currently supports 1-chain per record. +Pick your favorite small fasta file to run your first end-to-end test. The following command can be run from the terminal, or you can navigate to the AWS console. Note that AlphaFold likely will work best using `STATIC` run storage due to low data volumes and faster startup times. ### Example params.json -``` - +```json { - "fasta_path":"s3://mybucket/input/multimer/7unl.fasta", - "target_id": "7unl" + "fasta_path":"s3://mybucket/alphafold-multimer/" } ``` - ### Running the Workflow Replace `$ROLEARN`, `$OUTPUTLOC`, `$PARAMS`, `$WFID` as appropriate. Also modify the `params.json` to point to where your FASTA resides. -``` - +```bash WFID=1234567 ROLEARN=arn:aws:iam::0123456789012:role/omics-workflow-role-0123456789012-us-east-1 -OUTPUTLOC=s3://mybuckets/run_outputs/alphafold +OUTPUTLOC=s3://mybuckets/run_outputs/alphafold2-multimer PARAMS=./params.json aws omics start-run --workflow-id $WFID --role-arn $ROLEARN --output-uri $OUTPUTLOC --storage-type STATIC --storage-capacity 4800 --parameters file://$PARAMS --name alphafold-multimer ``` - All results are written to a location defined within `$OUTPUTLOC` above. To get to the root directory of the outputs, you can use the `GetRun` API, which provides the path as `runOutputUri`. Alternatively, this location is available in the console. ## Citation - AlphaFold Multimer was developed by DeepMind. The original source code can be found [here](https://github.com/google-deepmind/alphafold). The algorithm is presented in the following papers. 
``` - @Article{AlphaFold2021, author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A A and Ballard, Andrew J and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis}, journal = {Nature}, @@ -64,7 +46,6 @@ AlphaFold Multimer was developed by DeepMind. The original source code can be fo ``` ``` - @article {AlphaFold-Multimer2021, author = {Evans, Richard and O{\textquoteright}Neill, Michael and Pritzel, Alexander and Antropova, Natasha and Senior, Andrew and Green, Tim and {\v{Z}}{\'\i}dek, Augustin and Bates, Russ and Blackwell, Sam and Yim, Jason and Ronneberger, Olaf and Bodenstein, Sebastian and Zielinski, Michal and Bridgland, Alex and Potapenko, Anna and Cowie, Andrew and Tunyasuvunakool, Kathryn and Jain, Rishub and Clancy, Ellen and Kohli, Pushmeet and Jumper, John and Hassabis, Demis}, journal = {bioRxiv}, diff --git a/assets/workflows/alphafold2-multimer/build_containers.sh b/assets/workflows/alphafold2-multimer/build_containers.sh new file mode 100755 index 0000000..4658921 --- /dev/null +++ b/assets/workflows/alphafold2-multimer/build_containers.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -ex + +REGION=$1 +ACCOUNT=$2 +TAG=${3:-latest} + +aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.$REGION.amazonaws.com + +# build protein-utils +cd protein-utils +docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/protein-utils:$TAG . +docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/protein-utils:$TAG +cd .. + +# build alphafold-data +cd alphafold-data +docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-data:$TAG . +docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-data:$TAG +cd .. + +# build alphafold-predict +cd alphafold-predict +docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-predict:$TAG . +docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-predict:$TAG +cd .. diff --git a/assets/workflows/alphafold2-multimer/config.yaml b/assets/workflows/alphafold2-multimer/config.yaml index 1db3c7d..3e38011 100644 --- a/assets/workflows/alphafold2-multimer/config.yaml +++ b/assets/workflows/alphafold2-multimer/config.yaml @@ -3,9 +3,6 @@ description: "Predict multi-chain protein structures with AlphaFold2-Multimer" engine: NEXTFLOW main: main.nf parameterTemplate: - target_id: - description: "The ID of the target being run." - optional: false fasta_path: description: "Input file in multi-FASTA format." 
optional: false diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index 49da065..4e96410 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -1,34 +1,61 @@ -/* groovylint-disable DuplicateNumberLiteral */ nextflow.enable.dsl = 2 -params.fasta_path = '' +params.fasta_path = "" // static data files are in nextflow.config include { - SearchUniref90 - SearchMgnify - SearchBFD - SearchTemplatesTask - SearchUniprot - CombineSearchResults -} from './searches' + SearchUniref90; + SearchMgnify; + SearchBFD; + SearchTemplatesTask; + SearchUniprot; + CombineSearchResults; +} from './searches.nf' include { - UnpackBFD - UnpackPdb70nSeqres - UnpackMMCIF -} from './unpack' + UnpackBFD; + UnpackPdb70nSeqres; + UnpackMMCIF; +} from './unpack.nf' -workflow AlphaFold2Multimer { - CheckAndValidateInputsTask(params.target_id, params.fasta_path) - // split fasta run parallel searches (Scatter) - split_seqs = CheckAndValidateInputsTask.out.fasta - .splitFasta(file: true) - .map { filename -> tuple(filename.toString().split('/')[-1].split('.fasta')[0], filename) } +workflow { + + // Convert to one or many files + if (params.fasta_path[-1] == "/") { + fasta_path = params.fasta_path + "*" + } else { + fasta_path = params.fasta_path + } + + // [5nl6, 5nl6.fasta] + // [5mlq, 5mlq.fasta] + fasta_files = Channel + .fromPath(fasta_path) + .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fa")[0], filename) } + + // 5nl6.fasta + // 5mlq.fasta + CheckAndValidateInputsTask(fasta_files) + + // Explode/scatter the fasta files into channel items per contained record ID + // Write the exploded fasta records to their own file, include in tuple that contains original fasta file basename + // [5nl6, 5nl6.1, 5nl6.1.fasta] + // [5nl6, 5nl6.2, 5nl6.2.fasta] + // [5mlq, 5mlq.1, 5mlq.1.fasta] + // [5mlq, 5mlq.2, 5mlq.2.fasta] + split_seqs = CheckAndValidateInputsTask.out.fasta.map { fastaFile -> + def fastaBaseName = fastaFile.baseName + def records = fastaFile.splitFasta( file: true ) + + def fastaRecordTupleList = [] + records.forEach { record -> + fastaRecordTupleList.add(tuple (fastaBaseName, record.getBaseName(), record)) + } + return fastaRecordTupleList + } | flatMap - uniref30 = Channel.fromPath(params.uniref30_database_src).first() alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() // Unpack the databases @@ -39,88 +66,107 @@ workflow AlphaFold2Multimer { params.bfd_database_hhm_ffdata, params.bfd_database_hhm_ffindex) UnpackPdb70nSeqres(params.pdb70_src, params.pdb_seqres_src, params.db_pathname) - UnpackMMCIF(params.pdb_mmcif_src1, - params.pdb_mmcif_src2, - params.pdb_mmcif_src3, - params.pdb_mmcif_src4, - params.pdb_mmcif_src5, - params.pdb_mmcif_src6, - params.pdb_mmcif_src7, - params.pdb_mmcif_src8, - params.pdb_mmcif_src9, + UnpackMMCIF(params.pdb_mmcif_src1, + params.pdb_mmcif_src2, + params.pdb_mmcif_src3, + params.pdb_mmcif_src4, + params.pdb_mmcif_src5, + params.pdb_mmcif_src6, + params.pdb_mmcif_src7, + params.pdb_mmcif_src8, + params.pdb_mmcif_src9, params.pdb_mmcif_obsolete) + // Searches are call for each fastas * records SearchUniref90(split_seqs, params.uniref90_database_src) SearchMgnify(split_seqs, params.mgnify_database_src) SearchUniprot(split_seqs, params.uniprot_database_src) SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src) - SearchTemplatesTask(SearchUniref90.out.msa_with_id, 
UnpackPdb70nSeqres.out.db_folder) - - // Gather - CombineSearchResults(SearchUniref90.out.msa.collect(), - SearchUniprot.out.msa.collect(), - SearchMgnify.out.msa.collect(), - SearchBFD.out.msa.collect(), - SearchTemplatesTask.out.msa.collect()) - - GenerateFeaturesTask(CheckAndValidateInputsTask.out.fasta, - CombineSearchResults.out.msa_path, - UnpackMMCIF.out.db_folder, - UnpackMMCIF.out.db_obsolete) + SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) + + // [5nl6, 5nl6.fasta, [output_5nl6.1/5nl6.1_uniref90_hits.sto, output_5nl6.2/5nl6.2_uniref90_hits.sto], [output_5nl6.2/5nl6.2_mgnify_hits.sto, output_5nl6.1/5nl6.1_mgnify_hits.sto], ...] + // [5mlq, 5mlq.fasta, [output_5mlq.1/5mlq.1_uniref90_hits.sto, output_5mlq.2/5mlq.2_uniref90_hits.sto], [output_5mlq.1/5mlq.1_mgnify_hits.sto, output_5mlq.2/5mlq.2_mgnify_hits.sto], ...] + // + // Combine/gather the search results into channels per original fasta file + msa_tuples = fasta_files + .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) + .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) + .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) + .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) + .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) + + // Per original fasta file, move all of the search result files (ArrayList of files) into single directory structure: msa/A, msa/B, ... + // Emit the first two elements of msa_tuples, and a single merged msa/ directory + CombineSearchResults(msa_tuples) + + // Called per original fasta input file + GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, + UnpackMMCIF.out.db_folder, + UnpackMMCIF.out.db_obsolete) + // Predict. 
Five separate models - model_nums = Channel.of(0, 1, 2, 3, 4) - AlphaFoldMultimerInference(params.target_id, - GenerateFeaturesTask.out.features, - params.alphafold_model_parameters, - model_nums, params.random_seed, - params.run_relax) - - MergeRankings(AlphaFoldMultimerInference.out.results.collect()) + model_nums = Channel.of(0,1,2,3,4) + features = GenerateFeaturesTask.out.fasta_basename_with_features.combine(model_nums) + AlphaFoldMultimerInference(features, alphafold_model_parameters, params.random_seed, params.run_relax) + + MergeRankings(AlphaFoldMultimerInference.out.results.groupTuple(by: 0)) } // Check the inputs and get size etc process CheckAndValidateInputsTask { + tag "${fasta_basename}" label 'protutils' cpus 2 memory '4 GB' - publishDir '/mnt/workflow/pubdir/inputs' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/inputs" input: - val target_id - path fasta_path + tuple val(fasta_basename), path(fasta_path) output: stdout - path 'seq_info.json', emit: seq_info - path 'inputs.fasta', emit: fasta + path "seq_info.json", emit: seq_info + path "${fasta_basename}.fasta", emit: fasta + val "${fasta_basename}", emit: fasta_basename script: """ set -euxo pipefail + + echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path + echo "<<<<<<<<<<<<<<<<<<<" + ls -alR + /opt/venv/bin/python \ - /home/putils/src/putils/check_and_validate_inputs.py \ - --target_id=$target_id --fasta_path=$fasta_path + /opt/venv/lib/python3.8/site-packages/putils/check_and_validate_inputs.py \ + --target_id=$fasta_basename --fasta_path=$fasta_path --output_prefix=$fasta_basename + + ls -alR + """ } // Generate features from the searches process GenerateFeaturesTask { + tag "${fasta_basename}" label 'data' cpus 4 memory '16 GB' - publishDir '/mnt/workflow/pubdir/features' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/features" input: - path fasta_paths - path msa_dir + tuple val(fasta_basename), path(fasta_path), path(msa_dir) path pdb_mmcif_folder path mmcif_obsolete_path output: - path 'output/features.pkl', emit: features - path 'output/generate_features_metrics.json', emit: metrics + tuple val(fasta_basename), path("output/features.pkl"), emit: fasta_basename_with_features + path "output/features.pkl", emit: features + path "output/generate_features_metrics.json", emit: metrics script: """ @@ -133,14 +179,18 @@ process GenerateFeaturesTask { echo "***********************" /opt/venv/bin/python /opt/generate_features.py \ - --fasta_paths=$fasta_paths \ + --fasta_paths=$fasta_path \ --msa_dir=$msa_dir \ --template_mmcif_dir="$pdb_mmcif_folder" \ --obsolete_pdbs_path="$mmcif_obsolete_path" \ --template_hits="$msa_dir/pdb_hits.sto" \ --model_preset=multimer \ --output_dir=output \ - --max_template_date=2023-01-01 + --max_template_date=2023-01-01 + + echo "***********************" + ls -alR output/ + echo "***********************" mv output/metrics.json output/generate_features_metrics.json """ @@ -148,24 +198,23 @@ process GenerateFeaturesTask { // AlphaFold Multimer process AlphaFoldMultimerInference { + tag "${fasta_basename}_${modelnum}" errorStrategy 'retry' label 'predict' cpus { 4 * Math.pow(2, task.attempt) } memory { 16.GB * Math.pow(2, task.attempt) } accelerator 1, type: 'nvidia-tesla-a10g' maxRetries 2 - publishDir '/mnt/workflow/pubdir' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/prediction_${modelnum}" input: - val target_id - path features + tuple val(fasta_basename), path (features), val(modelnum) path alphafold_model_parameters - val modelnum val random_seed val 
run_relax output: - path "output_model_${modelnum}/", emit: results - + tuple val(fasta_basename), path("output_model_${modelnum}/"), emit: results + script: """ set -euxo pipefail @@ -174,7 +223,7 @@ process AlphaFoldMultimerInference { export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 export TF_FORCE_UNIFIED_MEMORY=1 /opt/conda/bin/python /app/alphafold/predict.py \ - --target_id=$target_id --features_path=$features --model_preset=multimer \ + --target_id=$fasta_basename --features_path=$features --model_preset=multimer \ --model_dir=model --random_seed=$random_seed --output_dir=output_model_${modelnum} \ --run_relax=${run_relax} --use_gpu_relax=${run_relax} --model_num=$modelnum @@ -182,20 +231,22 @@ process AlphaFoldMultimerInference { """ } -//Merge Rankings + +// Merge Rankings process MergeRankings { + tag "${id}" cpus 2 memory 4.GB - publishDir '/mnt/workflow/pubdir' + publishDir "/mnt/workflow/pubdir/${id}" label 'data' input: - path results + tuple val(id), path(results) output: - path 'rankings.json', emit: rankings - path 'top_hit*', emit: top_hit - + path "rankings.json", emit: rankings + path "top_hit*", emit: top_hit + script: """ mkdir -p output @@ -206,7 +257,3 @@ process MergeRankings { mv output/rankings.json . """ } - -workflow { - AlphaFold2Multimer() -} diff --git a/assets/workflows/alphafold2-multimer/nextflow.config b/assets/workflows/alphafold2-multimer/nextflow.config index 16e3e32..c10b538 100644 --- a/assets/workflows/alphafold2-multimer/nextflow.config +++ b/assets/workflows/alphafold2-multimer/nextflow.config @@ -36,7 +36,7 @@ params { } process { - withLabel: protutils { container = "{{protein-utils:latest}}"} - withLabel: data { container = "{{alphafold-data:latest}}"} - withLabel: predict { container = "{{alphafold-predict:latest}}"} + withLabel: protutils { container = '{{protein-utils:latest}}'} + withLabel: data { container = '{{alphafold-data:latest}}'} + withLabel: predict { container = '{{alphafold-predict:latest}}'} } \ No newline at end of file diff --git a/assets/workflows/alphafold2-multimer/parameter-template.json b/assets/workflows/alphafold2-multimer/parameter-template.json new file mode 100644 index 0000000..a239744 --- /dev/null +++ b/assets/workflows/alphafold2-multimer/parameter-template.json @@ -0,0 +1,6 @@ +{ + "fasta_path": { + "description": "Input file in multi-FASTA format.", + "optional": false + } +} \ No newline at end of file diff --git a/assets/workflows/alphafold2-multimer/params.json b/assets/workflows/alphafold2-multimer/params.json new file mode 100644 index 0000000..46ffce8 --- /dev/null +++ b/assets/workflows/alphafold2-multimer/params.json @@ -0,0 +1,3 @@ +{ + "fasta_path":"s3://example-bucket/alphafold2-multimer/" +} \ No newline at end of file diff --git a/assets/workflows/alphafold2-multimer/searches.nf b/assets/workflows/alphafold2-multimer/searches.nf index dedc650..e430a37 100644 --- a/assets/workflows/alphafold2-multimer/searches.nf +++ b/assets/workflows/alphafold2-multimer/searches.nf @@ -1,201 +1,219 @@ nextflow.enable.dsl = 2 process SearchUniref90 { + tag "${record_id}" label 'data' - cpus 8 - memory '32 GB' - publishDir '/mnt/workflow/pubdir/msa' + cpus { 8 * Math.pow(2, task.attempt) } + memory { 32.GB * Math.pow(2, task.attempt) } + maxRetries 3 + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path database_path output: - tuple val(id), path("output/${id}_uniref90_hits.sto"), emit: msa_with_id - 
path "output/${id}_uniref90_hits.sto", emit: msa - path "output/${id}_uniref90_metrics.json", emit: metrics + tuple val(fasta_basename), val(record_id), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_record_id_and_msa + tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_uniref90_hits.sto", emit: msa + path "output_${record_id}/${record_id}_uniref90_metrics.json", emit: metrics script: """ set -euxo pipefail + cat $fasta_record_path - mkdir -p output + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ + --fasta_path=$fasta_record_path \ --database_type=uniref90 \ --database_path=$database_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/uniref90_hits.sto output/${id}_uniref90_hits.sto - mv output/metrics.json output/${id}_uniref90_metrics.json + mv output_${record_id}/uniref90_hits.sto output_${record_id}/${record_id}_uniref90_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniref90_metrics.json """ } -process SearchUniprot { +process SearchMgnify { + tag "${record_id}" label 'data' - cpus 8 - memory '32 GB' - publishDir '/mnt/workflow/pubdir/msa' + cpus { 8 * Math.pow(2, task.attempt) } + memory { 64.GB * Math.pow(2, task.attempt) } + maxRetries 3 + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path database_path output: - path "output/${id}_uniprot_hits.sto", emit: msa - path "output/${id}_uniprot_metrics.json", emit: metrics - val "$id", emit: id + tuple val(fasta_basename), path("output_${record_id}/${record_id}_mgnify_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_mgnify_hits.sto", emit: msa + path "output_${record_id}/${record_id}_mgnify_metrics.json", emit: metrics script: """ set -euxo pipefail - - mkdir -p output + cat $fasta_record_path + + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ - --database_type=uniprot \ + --fasta_path=$fasta_record_path \ + --database_type=mgnify \ --database_path=$database_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/uniprot_hits.sto output/${id}_uniprot_hits.sto - mv output/metrics.json output/${id}_uniprot_metrics.json + mv output_${record_id}/mgnify_hits.sto output_${record_id}/${record_id}_mgnify_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_mgnify_metrics.json """ } -process SearchMgnify { +process SearchUniprot { + tag "${record_id}" label 'data' cpus 8 - memory '64 GB' - publishDir '/mnt/workflow/pubdir/msa' + memory '32 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path database_path output: - path "output/${id}_mgnify_hits.sto", emit: msa - path "output/${id}_mgnify_metrics.json", emit: metrics + tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniprot_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_uniprot_hits.sto", emit: msa + path "output_${record_id}/${record_id}_uniprot_metrics.json", emit: metrics script: """ set -euxo pipefail + cat $fasta_record_path - mkdir -p output + mkdir -p 
output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ - --database_type=mgnify \ + --fasta_path=$fasta_record_path \ + --database_type=uniprot \ --database_path=$database_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/mgnify_hits.sto output/${id}_mgnify_hits.sto - mv output/metrics.json output/${id}_mgnify_metrics.json + mv output_${record_id}/uniprot_hits.sto output_${record_id}/${record_id}_uniprot_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniprot_metrics.json """ } process SearchBFD { + tag "${record_id}" label 'data' + cpus { 8 * Math.pow(2, task.attempt) } memory { 64.GB * Math.pow(2, task.attempt) } - maxRetries 1 + maxRetries 3 errorStrategy 'retry' - publishDir '/mnt/workflow/pubdir/msa' + + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path bfd_database_folder path uniref30_database_folder output: - path "output/${id}_bfd_uniref_hits.a3m", emit: msa - path "output/${id}_metrics.json", emit: metrics + tuple val(fasta_basename), path("output_${record_id}/${record_id}_bfd_hits.a3m"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_bfd_hits.a3m", emit: msa + path "output_${record_id}/${record_id}_bfd_metrics.json", emit: metrics script: """ set -euxo pipefail - - mkdir -p output + cat $fasta_record_path + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ + --fasta_path=$fasta_record_path \ --database_type=bfd \ --database_path=$bfd_database_folder \ --database_path_2=$uniref30_database_folder \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/bfd_hits.a3m output/${id}_bfd_uniref_hits.a3m - mv output/metrics.json output/${id}_metrics.json + mv output_${record_id}/bfd_hits.a3m output_${record_id}/${record_id}_bfd_hits.a3m + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_bfd_metrics.json """ } process SearchTemplatesTask { + tag "${record_id}" label 'data' cpus 2 memory '8 GB' - publishDir '/mnt/workflow/pubdir/msa' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(msa_path) + tuple val(fasta_basename), val(record_id), path(msa_path) path pdb_db_folder output: - path "output/${id}_pdb_hits.sto", emit: msa - path "output/${id}_metrics.json", emit: metrics + tuple val(fasta_basename), path("output_${record_id}/${record_id}_pdb_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_pdb_metrics.json", emit: metrics script: """ set -euxo pipefail - mkdir -p output + mkdir -p output_${record_id} /opt/venv/bin/python /opt/search_templates.py \ --msa_path=$msa_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --database_path=$pdb_db_folder \ --model_preset=multimer \ --cpu=$task.cpus - mv output/pdb_hits.sto output/${id}_pdb_hits.sto - mv output/metrics.json output/${id}_metrics.json + mv output_${record_id}/pdb_hits.sto output_${record_id}/${record_id}_pdb_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_pdb_metrics.json """ } // Combine/rename results from parallel searches as AlphaFold expects process CombineSearchResults { + tag "${fasta_basename}" label 'data' cpus 4 memory '8 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - path uniref90_msas - path uniprot_msas - 
path mgnify_msas - path bfd_msas - path template_hits - output: - path 'msa/', emit: msa_path + tuple val(fasta_basename), path(fasta_path), path(uniref90_msas), path(mgnify_msas), path(uniprot_msas), path(bfd_msas), path(template_hits) + + output: + tuple val(fasta_basename), path(fasta_path), path ("msa/"), emit: fasta_basename_fasta_and_msa_path + path "msa/", emit: msa_path script: """ echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path echo $uniref90_msas - echo $uniprot_msas echo $mgnify_msas + echo $uniprot_msas + echo $bfd_msas echo $template_hits echo "<<<<<<<<<<<<<<<<<<<" mkdir -p msa - /opt/venv/bin/python /opt/update_locations.py msa $uniref90_msas - /opt/venv/bin/python /opt/update_locations.py msa $uniprot_msas - /opt/venv/bin/python /opt/update_locations.py msa $mgnify_msas - /opt/venv/bin/python /opt/update_locations.py msa $bfd_msas - /opt/venv/bin/python /opt/update_locations.py msa $template_hits + /opt/venv/bin/python /opt/update_locations.py msa _uniref90_hits.sto $uniref90_msas + /opt/venv/bin/python /opt/update_locations.py msa _mgnify_hits.sto $mgnify_msas + /opt/venv/bin/python /opt/update_locations.py msa _uniprot_hits.sto $uniprot_msas + /opt/venv/bin/python /opt/update_locations.py msa _bfd_hits.a3m $bfd_msas + /opt/venv/bin/python /opt/update_locations.py msa _pdb_hits.sto $template_hits echo "***********************" ls -alR msa/ diff --git a/assets/workflows/alphafold2-multimer/unpack.nf b/assets/workflows/alphafold2-multimer/unpack.nf index b8211c0..146bc14 100644 --- a/assets/workflows/alphafold2-multimer/unpack.nf +++ b/assets/workflows/alphafold2-multimer/unpack.nf @@ -14,7 +14,7 @@ process UnpackBFD { path bfd_database_hhm_ffindex output: - path 'bfd/', emit: db_folder + path "bfd/", emit: db_folder script: """ @@ -31,6 +31,7 @@ process UnpackBFD { """ } + process UnpackUniprot { cpus 4 memory '8 GB' @@ -53,6 +54,7 @@ process UnpackUniprot { """ } + process UnpackPdb70nSeqres { label 'data' cpus 2 @@ -75,13 +77,14 @@ process UnpackPdb70nSeqres { # Templates - pdb70 and seqres mkdir -p $base_database_path/pdb mv $pdb70_src/* $base_database_path/pdb/ - + # filter strange sequences containing 0 /opt/venv/bin/python /opt/filter_pdb.py $pdb_seqres_src $base_database_path/pdb/pdb_seqres.txt ls -laR $base_database_path/pdb/ """ } + process UnpackMMCIF { cpus 2 memory '4 GB' @@ -98,10 +101,10 @@ process UnpackMMCIF { path pdb_mmcif_src8 path pdb_mmcif_src9 path pdb_mmcif_obsolete - + output: - path 'pdb_mmcif/mmcif_files/', emit: db_folder - path 'pdb_mmcif/obsolete.dat', emit: db_obsolete + path "pdb_mmcif/mmcif_files/", emit: db_folder + path "pdb_mmcif/obsolete.dat", emit: db_obsolete script: """ @@ -124,22 +127,23 @@ process UnpackMMCIF { """ } + process UnpackRecords { tag "${id}" label 'protutils' cpus 2 memory '4 GB' publishDir "/mnt/workflow/pubdir/${id}/input" - + input: tuple val(id), val(header), val(seqString) output: - tuple val(id), path('input.fasta'), emit: fasta + tuple val(id), path("input.fasta"), emit: fasta script: """ set -euxo pipefail echo -e ">${header}\n${seqString}" > input.fasta """ -} +} \ No newline at end of file diff --git a/assets/workflows/mmseqs2/README.md b/assets/workflows/mmseqs2/README.md new file mode 100644 index 0000000..973cee0 --- /dev/null +++ b/assets/workflows/mmseqs2/README.md @@ -0,0 +1,5 @@ +# Generate MSAs using MMSeqs2 + +## Summary + + MMseqs2: ultra fast and sensitive search and clustering suite. 
diff --git a/assets/workflows/mmseqs2/config.yaml b/assets/workflows/mmseqs2/config.yaml new file mode 100644 index 0000000..b5018a1 --- /dev/null +++ b/assets/workflows/mmseqs2/config.yaml @@ -0,0 +1,15 @@ +name: MMseqs2 +description: "MMseqs2: ultra fast and sensitive search and clustering suite" +engine: NEXTFLOW +main: main.nf +parameterTemplate: + fasta_path: + description: "FASTA file containing query sequence." + optional: false + database_path: + description: "FASTA file containing target database." + optional: true +storageCapacity: 1200 +tags: + Name: "MMseqs2" +accelerators: GPU diff --git a/assets/workflows/mmseqs2/main.nf b/assets/workflows/mmseqs2/main.nf new file mode 100644 index 0000000..8ef13de --- /dev/null +++ b/assets/workflows/mmseqs2/main.nf @@ -0,0 +1,85 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +workflow MMSeqs2 { + take: + fasta_path + database_path + + main: + + db_channel = Channel.fromPath(database_path) + db_channel.view() + MMSeqs2PrepareDatabaseTask(db_channel) + + // Convert to one or many files + if (params.fasta_path[-1] == "/") { + fasta_path = params.fasta_path + "*" + } else { + fasta_path = params.fasta_path + } + + fasta_channel = Channel.fromPath(fasta_path) + fasta_channel.view() + search_input = fasta_channel.combine(MMSeqs2PrepareDatabaseTask.out) + search_input.view() + MMSeqs2SearchTask( + search_input + ) + + emit: + MMSeqs2SearchTask.out +} + +process MMSeqs2PrepareDatabaseTask { + label 'mmseqs2' + cpus 16 + memory '32 GB' + maxRetries 1 + + input: + path database_path + + output: + path "db", emit: db + + script: + """ + set -euxo pipefail + mkdir db + /usr/local/bin/entrypoint createdb $database_path tmpDB + /usr/local/bin/entrypoint makepaddedseqdb tmpDB db/gpuDB + /usr/local/bin/entrypoint createindex db/gpuDB tmp --index-subset 2 + """ +} + +process MMSeqs2SearchTask { + label 'mmseqs2' + cpus 4 + memory '16 GB' + maxRetries 1 + accelerator 1, type: 'nvidia-tesla-a10g' + publishDir "/mnt/workflow/pubdir/${workflow.sessionId}/${task.process.replace(':', '/')}/${task.index}/${task.attempt}" + + input: + tuple path(fasta_path), path(database_path) + + output: + path "*.a3m", emit: msa + + script: + """ + set -euxo pipefail + /usr/local/bin/entrypoint createdb $fasta_path queryDB + /usr/local/bin/entrypoint search queryDB $database_path/gpuDB result tmp --gpu 1 + /usr/local/bin/entrypoint result2msa queryDB $database_path/gpuDB result ${fasta_path.baseName}.a3m --msa-format-mode 5 + """ +} + +workflow { + MMSeqs2( + params.fasta_path, + params.database_path + ) +} diff --git a/assets/workflows/mmseqs2/nextflow.config b/assets/workflows/mmseqs2/nextflow.config new file mode 100644 index 0000000..124a2e7 --- /dev/null +++ b/assets/workflows/mmseqs2/nextflow.config @@ -0,0 +1,12 @@ +params { + database_path = "s3://{{S3_BUCKET_NAME}}/ref-data/uniref100/uniref100.fasta" +} + +process { + withLabel: mmseqs2 { container = "{{mmseqs2:latest}}" } +} + +docker { + enabled = true + runOptions = "--gpus all" +} diff --git a/build/buildspec/buildspec_data.yaml b/build/buildspec/buildspec_data.yaml index 154b5c5..b616b7f 100644 --- a/build/buildspec/buildspec_data.yaml +++ b/build/buildspec/buildspec_data.yaml @@ -89,7 +89,13 @@ phases: else echo "Downloading ${SOURCE_URI}" wget -nc $SOURCE_URI -P $HOME/s3/$KEY --no-verbose --show-progress --progress=dot:giga - if [[ "$SOURCE_URI" =~ .*\.gz$ ]]; then + if [[ "$SOURCE_URI" =~ .*\.tar.gz$ ]]; then + echo "Extracting file" + tar -xzf $HOME/s3/$KEY/*.tar.gz -C $HOME/s3/$KEY + elif [[ 
"$SOURCE_URI" =~ .*\.tar$ ]]; then + echo "Extracting file" + tar -xf $HOME/s3/$KEY/*.tar -C $HOME/s3/$KEY + elif [[ "$SOURCE_URI" =~ .*\.gz$ ]]; then echo "Extracting file" gunzip -k $HOME/s3/$KEY/*.gz fi; diff --git a/scripts/testrun.sh b/scripts/testrun.sh index 7e38630..26d0b77 100755 --- a/scripts/testrun.sh +++ b/scripts/testrun.sh @@ -51,12 +51,11 @@ else fi # Package the workflow -mkdir -p tmp/assets/workflows/$WORKFLOW_NAME tmp/assets/modules +mkdir -p tmp/assets/workflows/$WORKFLOW_NAME pushd tmp cp -r ../assets/workflows/$WORKFLOW_NAME/* assets/workflows/$WORKFLOW_NAME -cp -r ../assets/modules/* assets/modules sed -i "" -E "s/[0-9]{12}\.dkr\.ecr\.(us-[a-z]*-[0-9])/$ACCOUNT_ID.dkr.ecr.$REGION/g" ./assets/workflows/$WORKFLOW_NAME/*.config assets/workflows/$WORKFLOW_NAME/*.wdl 2>/dev/null || true sed -i "" -E "s/[0-9]{12}\.dkr\.ecr\.(us-[a-z]*-[0-9])/$ACCOUNT_ID.dkr.ecr.$REGION/g" ./assets/workflows/$WORKFLOW_NAME/*.config assets/workflows/$WORKFLOW_NAME/*.nf 2>/dev/null || true @@ -73,7 +72,8 @@ aws omics wait workflow-active --region $REGION --id $workflow_id # Run the workflow start_run_command="aws omics start-run \ --retention-mode REMOVE \ - --storage-type DYNAMIC \ + --storage-type STATIC \ + --storage-capacity 9600 \ --workflow-id $workflow_id \ --name $WORKFLOW_NAME-dev-$TIMESTAMP \ --role-arn \"$OMICS_EXECUTION_ROLE\" \