From b2fd73c9e6fcd6acbe5d891f0cd99c16ad2eb271 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Mon, 16 Dec 2024 11:14:58 -0700 Subject: [PATCH 01/20] Committing relevant changes from alphafold-multimer work --- .gitignore | 2 + assets/containers/alphafold-data/Dockerfile | 68 +++-- .../alphafold-data/update_locations.py | 12 +- .../containers/alphafold-predict/Dockerfile | 83 +++--- assets/containers/protein-utils/Dockerfile | 28 +- .../protein-utils/code/resources.json | 1 + .../protein-utils/code/seq_info.json | 1 + assets/containers/protein-utils/code/setup.py | 2 +- .../code/src/putils/split_fasta.py | 221 ---------------- .../containers/protein-utils/requirements.txt | 7 - assets/workflows/alphafold-multimer/README.md | 59 +++++ .../alphafold-multimer/build_containers.sh | 27 ++ .../workflows/alphafold-multimer/config.yaml | 11 + assets/workflows/alphafold-multimer/main.nf | 244 ++++++++++++++++++ .../alphafold-multimer/nextflow.config | 45 ++++ .../parameter-template.json | 6 + .../workflows/alphafold-multimer/params.json | 3 + .../workflows/alphafold-multimer/searches.nf | 220 ++++++++++++++++ assets/workflows/alphafold-multimer/unpack.nf | 149 +++++++++++ 19 files changed, 883 insertions(+), 306 deletions(-) create mode 100644 assets/containers/protein-utils/code/resources.json create mode 100644 assets/containers/protein-utils/code/seq_info.json delete mode 100644 assets/containers/protein-utils/code/src/putils/split_fasta.py delete mode 100644 assets/containers/protein-utils/requirements.txt create mode 100644 assets/workflows/alphafold-multimer/README.md create mode 100755 assets/workflows/alphafold-multimer/build_containers.sh create mode 100644 assets/workflows/alphafold-multimer/config.yaml create mode 100644 assets/workflows/alphafold-multimer/main.nf create mode 100644 assets/workflows/alphafold-multimer/nextflow.config create mode 100644 assets/workflows/alphafold-multimer/parameter-template.json create mode 100644 assets/workflows/alphafold-multimer/params.json create mode 100644 assets/workflows/alphafold-multimer/searches.nf create mode 100644 assets/workflows/alphafold-multimer/unpack.nf diff --git a/.gitignore b/.gitignore index 46fe94f..ad4468b 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ tmp/ .nextflow* stack-outputs.json test_data +linter-rules-for-nextflow +build/cloudformation/packaged.yaml diff --git a/assets/containers/alphafold-data/Dockerfile b/assets/containers/alphafold-data/Dockerfile index 19970db..4eaeca7 100644 --- a/assets/containers/alphafold-data/Dockerfile +++ b/assets/containers/alphafold-data/Dockerfile @@ -1,7 +1,7 @@ # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 -FROM public.ecr.aws/amazonlinux/amazonlinux:latest as build +FROM public.ecr.aws/amazonlinux/amazonlinux:latest AS build RUN yum upgrade -y \ && yum install -y \ @@ -19,26 +19,52 @@ RUN yum upgrade -y \ wget \ zstd \ && yum clean all \ - && rm -rf /var/cache/yum \ - && pushd /tmp \ - && git clone https://github.com/soedinglab/hh-suite.git \ - && cd hh-suite && mkdir build && cd build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. 
\ - && make -j 4 && make install \ - && popd \ - && pushd /tmp \ - && wget http://msa.sbc.su.se/downloads/kalign/current.tar.gz --no-check-certificate \ - && mkdir -p /tmp/kalign2/build \ - && tar -xvzf current.tar.gz -C /tmp/kalign2 \ - && pushd /tmp/kalign2 \ - && ./configure \ - && make && make install \ - && popd \ - && rm -rf /tmp/kalign2 \ - && popd \ - && mkdir -p /tmp/hmmer && wget -O hmmer.tar.gz http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz \ - && tar xvzf hmmer.tar.gz -C /tmp/hmmer \ - && pushd /tmp/hmmer/hmmer-* \ + && rm -rf /var/cache/yum + +# ADD hh-suite.tar.gz /tmp/hh-suite +# RUN pushd /tmp/hh-suite \ +# && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite . \ +# && make && make install \ +# && popd +RUN pushd /tmp && \ +git clone https://github.com/soedinglab/hh-suite.git && \ +cd hh-suite && mkdir build && cd build && \ +cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. && \ +make -j 4 && make install && \ +popd + +# ADD kalign.tar.gz /tmp/kalign-3.4.3 +# RUN pushd /tmp/kalign2 \ +# && ./configure \ +# && make && make install \ +# && popd +# RUN pushd /tmp && \ +# wget https://github.com/TimoLassmann/kalign/archive/refs/tags/v3.4.0.tar.gz && \ +# tar -xvzf v3.4.0.tar.gz && \ +# cd kalign-3.4.0 && \ +# mkdir build && \ +# cd build && \ +# cmake3 .. && \ +# make -j 4 && make test && \ +# make install && \ +# popd + +# Compile kalign2 from source +RUN pushd /tmp && \ +wget http://msa.sbc.su.se/downloads/kalign/current.tar.gz --no-check-certificate \ +&& mkdir -p /tmp/kalign2/build \ +&& tar -xvzf current.tar.gz -C /tmp/kalign2 \ +&& pushd /tmp/kalign2 \ +&& ./configure \ +&& make && make install \ +&& popd \ +&& rm -rf /tmp/kalign2 && \ +popd + +# ADD hmmer.tar.gz /tmp/hmmer +RUN mkdir -p /tmp/hmmer && wget -O hmmer.tar.gz http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz \ +&& tar xvzf hmmer.tar.gz -C /tmp/hmmer +RUN pushd /tmp/hmmer/hmmer-* \ && ./configure \ && make && make install \ && popd diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index 35cfa43..fbb42ed 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -11,11 +11,13 @@ def update_locations(target_dir, file_list): for filename in file_list: - index, _null, outfile = filename.partition("_") - index = index.split(".")[1] - - chain = int_id_to_str_id(int(index)) - print(f'file: {filename} index: {index} chain: {chain} outfile:{outfile}') + # index, _null, outfile = filename.partition("_") + # index = index.split(".")[1] + # chain = int_id_to_str_id(int(index)) + [_null, chain, database, file] = filename.split("_") + outfile = "_".join([database, file]) + # print(f'file: {filename} index: {index} chain: {chain} outfile:{outfile}') + print(f'file: {filename} chain: {chain} outfile:{outfile}') chain = os.path.join(target_dir, chain) path = pathlib.Path(chain) diff --git a/assets/containers/alphafold-predict/Dockerfile b/assets/containers/alphafold-predict/Dockerfile index 30644af..cc483f5 100644 --- a/assets/containers/alphafold-predict/Dockerfile +++ b/assets/containers/alphafold-predict/Dockerfile @@ -3,10 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 # ARG CUDA=11.1.1 -ARG CUDA=12.2.2 -# ARG ALPHAFOLD2_VERSION=v2.3.2 -ARG ALPHAFOLD2_VERSION=f251de6613cb478207c732bf9627b1e853c99c2f -FROM nvcr.io/nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu20.04 +ARG CUDA=11.6.0 +ARG ALPHAFOLD2_VERSION=v2.3.2 +FROM nvcr.io/nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu18.04 # FROM directive resets 
ARGS, so we specify again (the value is retained if # previously set). ARG CUDA @@ -15,19 +14,18 @@ ARG ALPHAFOLD2_VERSION # Use bash to support string substitution. SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - build-essential \ - cmake \ - cuda-command-line-tools-$(cut -f1,2 -d- <<< ${CUDA//./-}) \ - git \ - hmmer \ - kalign \ - tzdata \ - wget \ - awscli \ - jq \ - unzip \ +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + build-essential \ + cmake \ + cuda-command-line-tools-$(cut -f1,2 -d- <<< ${CUDA//./-}) \ + git \ + hmmer \ + kalign \ + tzdata \ + wget \ + awscli \ + jq \ && rm -rf /var/lib/apt/lists/* \ && apt-get autoremove -y \ && apt-get clean @@ -36,7 +34,7 @@ RUN apt-get update \ RUN git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ && mkdir /tmp/hh-suite/build \ && pushd /tmp/hh-suite/build \ - && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ + && cmake -DHAVE_AVX2=1 -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \ && make -j 4 && make install \ && ln -s /opt/hhsuite/bin/* /usr/bin \ && popd \ @@ -50,18 +48,18 @@ RUN wget -q -P /tmp \ # Install conda packages. ENV PATH="/opt/conda/bin:$PATH" -ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH" -RUN conda install -qy conda==24.5.0 pip python=3.11 \ - && conda install -y -c nvidia/label/cuda-${CUDA} cuda \ - && conda install -y -c conda-forge openmm=8.0.0 pdbfixer \ - && conda clean --all --force-pkgs-dirs --yes +# RUN conda install -qy conda==4.13.0 +# && conda install -y -c conda-forge +RUN conda install -y -c conda-forge \ + openmm=7.5.1 \ + cudatoolkit=${CUDA_VERSION} \ + pdbfixer=1.7 \ + pip \ + python=3.9.16 \ + && conda clean --all --force-pkgs-dirs --yes -# Install AlphaFold -RUN wget -q -P /tmp \ - https://github.com/google-deepmind/alphafold/archive/${ALPHAFOLD2_VERSION}.zip \ - && mkdir -p /app/alphafold \ - && unzip /tmp/f251de6613cb478207c732bf9627b1e853c99c2f.zip -d /tmp \ - && mv /tmp/alphafold-f251de6613cb478207c732bf9627b1e853c99c2f/* /app/alphafold + +RUN git clone --branch ${ALPHAFOLD2_VERSION} --depth 1 https://github.com/deepmind/alphafold.git /app/alphafold RUN wget -q -P /app/alphafold/alphafold/common/ \ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt @@ -70,20 +68,33 @@ RUN wget -q -P /app/alphafold/alphafold/common/ \ RUN pip3 install --upgrade pip --no-cache-dir \ && pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \ && pip3 install --upgrade --no-cache-dir \ - jax==0.4.26 \ - jaxlib==0.4.26+cuda12.cudnn89 \ + jax==0.3.25 \ + jaxlib==0.3.25+cuda11.cudnn805 \ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html RUN pip3 install --upgrade --no-cache-dir \ - matplotlib==3.9.2 + matplotlib==3.6.3 \ + numpy==1.24.3 + +# Apply OpenMM patch. +WORKDIR /opt/conda/lib/python3.9/site-packages +RUN patch -p0 < /app/alphafold/docker/openmm.patch # Add SETUID bit to the ldconfig binary so that non-root users can run it. RUN chmod u+s /sbin/ldconfig.real -# Currently needed to avoid undefined_symbol error. -RUN ln -sf /usr/lib/x86_64-linux-gnu/libffi.so.7 /opt/conda/lib/libffi.so.7 - +# We need to run `ldconfig` first to ensure GPUs are visible, due to some quirk +# with Debian. See https://github.com/NVIDIA/nvidia-docker/issues/1399 for +# details. 
+# ENTRYPOINT does not support easily running multiple commands, so instead we +# write a shell script to wrap them up. WORKDIR /app/alphafold COPY predict.py /app/alphafold/ +# COPY run.sh /app/alphafold/run.sh +# RUN echo $'#!/bin/bash\n\ +# ldconfig\n\ +# python /app/alphafold/run_alphafold.py "$@"' > /app/run_alphafold.sh \ +# && chmod +x /app/run_alphafold.sh /app/alphafold/run.sh -ENTRYPOINT [] +# ENTRYPOINT ["bash", "/app/alphafold/run.sh"] +ENTRYPOINT ["bash"] diff --git a/assets/containers/protein-utils/Dockerfile b/assets/containers/protein-utils/Dockerfile index 4ccee87..32eb420 100644 --- a/assets/containers/protein-utils/Dockerfile +++ b/assets/containers/protein-utils/Dockerfile @@ -1,27 +1,25 @@ -FROM public.ecr.aws/amazonlinux/amazonlinux:2023 as build +FROM public.ecr.aws/amazonlinux/amazonlinux:2 as build -WORKDIR /home - -COPY code /home/putils -COPY requirements.txt /home +COPY code /tmp/putils # Install python and other dependencies -RUN yum update \ +RUN amazon-linux-extras install python3.8 \ && yum upgrade -y \ && yum install -y \ - python3.11 \ unzip-6.0 \ - wget-1.21.3 \ - && python3.11 -m venv /opt/venv \ + wget-1.14 \ + && python3.8 -m venv /opt/venv \ && source /opt/venv/bin/activate \ - && pip install -U pip \ - && pip install -q --no-cache-dir -r /home/requirements.txt \ - && pip install -q --no-cache-dir /home/putils \ - && yum autoremove -y \ + && pip install -q --no-cache-dir \ + pandas==2.0.0 \ + numpy==1.24.2 \ + biopython==1.81 \ + /tmp/putils \ && yum clean all \ - && rm -rf /var/cache/yum + && rm -rf /var/cache/yum \ + && rm -rf /tmp/putils ENV VIRTUAL_ENV="/opt/venv" ENV PATH="$VIRTUAL_ENV/bin:$PATH" -ENTRYPOINT [] \ No newline at end of file +WORKDIR /home \ No newline at end of file diff --git a/assets/containers/protein-utils/code/resources.json b/assets/containers/protein-utils/code/resources.json new file mode 100644 index 0000000..a69de4e --- /dev/null +++ b/assets/containers/protein-utils/code/resources.json @@ -0,0 +1 @@ +{"id": "3D06", "seq_length": 200, "seq_count": 1, "template_search_resources": {"vcpu": 2, "memory": "4 GiB", "gpu": "False"}, "feature_gen_resources": {"vcpu": 2, "memory": "4 GiB", "gpu": "False"}, "predict_resources": {"vcpu": 8, "memory": "32 GiB", "gpu": "True"}, "uniref90_msa_resources": {"vcpu": 8, "memory": "16 GiB", "gpu": "False"}, "mgnify_msa_resources": {"vcpu": 8, "memory": "16 GiB", "gpu": "False"}, "bfd_msa_resources": {"vcpu": 16, "memory": "32 GiB", "gpu": "False"}} \ No newline at end of file diff --git a/assets/containers/protein-utils/code/seq_info.json b/assets/containers/protein-utils/code/seq_info.json new file mode 100644 index 0000000..9104a31 --- /dev/null +++ b/assets/containers/protein-utils/code/seq_info.json @@ -0,0 +1 @@ +{"id": "2022", "seq_length": "100", "seq_count": "1"} \ No newline at end of file diff --git a/assets/containers/protein-utils/code/setup.py b/assets/containers/protein-utils/code/setup.py index de9ab47..852f912 100644 --- a/assets/containers/protein-utils/code/setup.py +++ b/assets/containers/protein-utils/code/setup.py @@ -1,5 +1,5 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: MIT-0 +# SPDX-License-Identifier: Apache-2.0 from setuptools import setup, find_packages diff --git a/assets/containers/protein-utils/code/src/putils/split_fasta.py b/assets/containers/protein-utils/code/src/putils/split_fasta.py deleted file mode 100644 index a5a8396..0000000 --- a/assets/containers/protein-utils/code/src/putils/split_fasta.py +++ /dev/null @@ -1,221 +0,0 @@ -import argparse -import logging -import os -import pyfastx -import random -import shutil -import tempfile -import tqdm -from urllib.parse import urlparse - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) - - -def parse_args(): - """Parse the arguments.""" - logging.info("Parsing arguments") - parser = argparse.ArgumentParser() - - parser.add_argument( - "source", - type=str, - help="Path to input .fasta or .fasta.gz file, e.g. s3://myfasta.fa, http://myfasta.fasta.gz, ~/myfasta.fasta, etc", - ) - - parser.add_argument( - "--max_records_per_partition", - type=int, - default=2000000, - help="Max number of sequence records per csv partition", - ) - parser.add_argument( - "--output_dir", - type=str, - default=os.getcwd(), - help="Output dir for processed files", - ) - parser.add_argument( - "--save_csv", - "-c", - action="store_true", - default=False, - help="Save csv files to output dir?", - ) - parser.add_argument( - "-f", - "--save_fasta", - action="store_true", - default=False, - help="Save FASTA file to output dir?", - ) - parser.add_argument( - "--shuffle", - "-s", - action="store_true", - default=True, - help="Shuffle the records in each csv partition?", - ) - - args, _ = parser.parse_known_args() - return args - - -def main(args): - """Transform fasta file into dataset""" - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - - tmp_dir = tempfile.TemporaryDirectory(dir=os.getcwd()) - input_file = os.path.join(tmp_dir.name, "input.fa") - input_path = download(args.source, input_file) - - output_path = split_fasta( - fasta_file=input_path, - output_dir=args.output_dir, - max_records_per_partition=args.max_records_per_partition, - shuffle=args.shuffle, - save_fasta=args.save_fasta, - save_csv=args.save_csv, - ) - - tmp_dir.cleanup() - logging.info(f"Files saved to {args.output_dir}") - - return output_path - - -def download(source: str, filename: str) -> str: - output_dir = os.path.dirname(filename) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - if source.startswith("s3"): - import boto3 - - logging.info(f"Downloading {source} to {filename}") - s3 = boto3.client("s3") - parsed = urlparse(source, allow_fragments=False) - bucket = parsed.netloc - key = parsed.path[1:] - total = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - tqdm_params = { - "desc": source, - "total": total, - "miniters": 1, - "unit": "B", - "unit_scale": True, - "unit_divisor": 1024, - } - with tqdm.tqdm(**tqdm_params) as pb: - s3.download_file( - parsed.netloc, - parsed.path[1:], - filename, - Callback=lambda bytes_transferred: pb.update(bytes_transferred), - ) - elif source.startswith("http"): - import requests - - logging.info(f"Downloading {source} to {filename}") - - with open(filename, "wb") as f: - with requests.get(source, stream=True, timeout=60) as r: - r.raise_for_status() - total = int(r.headers.get("content-length", 0)) - - tqdm_params = { - "desc": source, - "total": total, - "miniters": 1, - "unit": "B", - "unit_scale": True, - "unit_divisor": 1024, - } - with 
tqdm.tqdm(**tqdm_params) as pb:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        pb.update(len(chunk))
-                        f.write(chunk)
-    elif os.path.isfile(source):
-        logging.info(f"Copying {source} to {filename}")
-        shutil.copyfile(source, filename)
-    else:
-        raise ValueError(f"Invalid source: {source}")
-
-    return filename
-
-
-def split_fasta(
-    fasta_file: str,
-    output_dir: str = os.getcwd(),
-    max_records_per_partition=2000000,
-    shuffle=True,
-    save_fasta: bool = True,
-    save_csv: bool = False,
-) -> list:
-    """Split a .fasta or .fasta.gz file into multiple files."""
-
-    # if save_fasta and not os.path.exists(os.path.join(output_dir, "fasta")):
-    #     os.makedirs(os.path.join(output_dir, "fasta"))
-
-    # if save_csv and not os.path.exists(os.path.join(output_dir, "csv")):
-    #     os.makedirs(os.path.join(output_dir, "csv"))
-
-    print(f"Splitting {fasta_file}")
-    fasta_list = []
-    fasta_idx = 0
-
-    for i, seq in tqdm.tqdm(
-        enumerate(
-            pyfastx.Fasta(fasta_file, build_index=False, uppercase=True, full_name=True)
-        )
-    ):
-        fasta_list.append(seq)
-
-        if (i + 1) % max_records_per_partition == 0:
-            if shuffle:
-                random.shuffle(fasta_list)
-            fasta_idx = int(i / max_records_per_partition)
-            if save_fasta:
-                write_seq_record_to_fasta(fasta_list, output_dir, fasta_idx)
-            if save_csv:
-                write_seq_record_to_csv(fasta_list, output_dir, fasta_idx)
-            fasta_list = []
-    else:
-        if save_fasta:
-            write_seq_record_to_fasta(fasta_list, output_dir, fasta_idx + 1)
-        if save_csv:
-            write_seq_record_to_csv(fasta_list, output_dir, fasta_idx + 1)
-    return output_dir
-
-
-def write_seq_record_to_fasta(content_list, output_dir, index):
-    output_path = os.path.join(
-        output_dir,
-        f"x{str(index).rjust(3, '0')}.fasta",
-    )
-    logging.info(f"Writing {output_path}")
-
-    with open(output_path, "w") as f:
-        for record in content_list:
-            f.write(f">{record[0]}\n{record[1]}\n")
-    return output_path
-
-
-def write_seq_record_to_csv(content_list, output_dir, index):
-    output_path = os.path.join(output_dir, f"x{str(index).rjust(3, '0')}.csv")
-    logging.info(f"Writing {output_path}")
-    with open(output_path, "w") as f:
-        f.write(f"id,text\n")
-        for record in content_list:
-            f.write(f"{record[0].replace(',','')},{record[1].replace(',','')}\n")
-    return output_path
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
diff --git a/assets/containers/protein-utils/requirements.txt b/assets/containers/protein-utils/requirements.txt
deleted file mode 100644
index 2f03224..0000000
--- a/assets/containers/protein-utils/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-biopython
-biotite
-jsonlines
-numpy
-pandas
-pyfastx
-tqdm
\ No newline at end of file
diff --git a/assets/workflows/alphafold-multimer/README.md b/assets/workflows/alphafold-multimer/README.md
new file mode 100644
index 0000000..75ca5b4
--- /dev/null
+++ b/assets/workflows/alphafold-multimer/README.md
@@ -0,0 +1,59 @@
+# AlphaFold Multimer
+
+This repository helps you set up and run AlphaFold Multimer on AWS HealthOmics.
+
+The following setup steps assume you are starting from scratch and prefer to use the command line. This repository will also have 1-click build capabilities at the root of the repo.
+
+## Running a workflow
+
+Pick your favorite small FASTA file to run your first end-to-end test. The following command can be run from the terminal, or you can navigate to the AWS console. Note that this workflow will likely work best using `STATIC` run storage due to low data volumes and faster startup times.
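+
+If you don't yet have an input handy, the minimal sketch below stages a small two-chain test file to S3; the sequences, file name, and bucket are hypothetical placeholders. Note that the record IDs follow the `<basename>_<chain>` convention (for example `test1_A`) that this workflow's chain-splitting logic expects:
+
+```bash
+# Hypothetical two-chain multi-FASTA; replace the sequences and bucket with your own.
+cat > test1.fasta <<'EOF'
+>test1_A
+MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKR
+>test1_B
+MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSG
+EOF
+aws s3 cp test1.fasta s3://mybucket/alphafold-multimer/
+```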
+
+### Example params.json
+
+```json
+{
+    "fasta_path":"s3://mybucket/alphafold-multimer/"
+}
+```
+### Running the Workflow
+
+Replace `$ROLEARN`, `$OUTPUTLOC`, `$PARAMS`, `$WFID` as appropriate. Also modify the `params.json` to point to where your FASTA resides.
+
+```bash
+WFID=1234567
+ROLEARN=arn:aws:iam::0123456789012:role/omics-workflow-role-0123456789012-us-east-1
+OUTPUTLOC=s3://mybucket/run_outputs/alphafold-multimer
+PARAMS=./params.json
+
+aws omics start-run --workflow-id $WFID --role-arn $ROLEARN --output-uri $OUTPUTLOC --storage-type STATIC --storage-capacity 4800 --parameters file://$PARAMS --name alphafold-multimer
+```
+All results are written to a location defined within `$OUTPUTLOC` above. To get to the root directory of the outputs, you can use the `GetRun` API, which provides the path as `runOutputUri`. Alternatively, this location is available in the console.
+
+## Citation
+AlphaFold Multimer was developed by DeepMind. The original source code can be found [here](https://github.com/google-deepmind/alphafold). The algorithm is presented in the following papers.
+
+```
+@Article{AlphaFold2021,
+  author  = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A A and Ballard, Andrew J and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
+  journal = {Nature},
+  title   = {Highly accurate protein structure prediction with {AlphaFold}},
+  year    = {2021},
+  volume  = {596},
+  number  = {7873},
+  pages   = {583--589},
+  doi     = {10.1038/s41586-021-03819-2}
+}
+```
+
+```
+@article {AlphaFold-Multimer2021,
+  author = {Evans, Richard and O{\textquoteright}Neill, Michael and Pritzel, Alexander and Antropova, Natasha and Senior, Andrew and Green, Tim and {\v{Z}}{\'\i}dek, Augustin and Bates, Russ and Blackwell, Sam and Yim, Jason and Ronneberger, Olaf and Bodenstein, Sebastian and Zielinski, Michal and Bridgland, Alex and Potapenko, Anna and Cowie, Andrew and Tunyasuvunakool, Kathryn and Jain, Rishub and Clancy, Ellen and Kohli, Pushmeet and Jumper, John and Hassabis, Demis},
+  journal = {bioRxiv},
+  title = {Protein complex prediction with AlphaFold-Multimer},
+  year = {2021},
+  elocation-id = {2021.10.04.463034},
+  doi = {10.1101/2021.10.04.463034},
+  URL = {https://www.biorxiv.org/content/early/2021/10/04/2021.10.04.463034},
+  eprint = {https://www.biorxiv.org/content/early/2021/10/04/2021.10.04.463034.full.pdf},
+}
+```
diff --git a/assets/workflows/alphafold-multimer/build_containers.sh b/assets/workflows/alphafold-multimer/build_containers.sh
new file mode 100755
index 0000000..4658921
--- /dev/null
+++ b/assets/workflows/alphafold-multimer/build_containers.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+set -ex
+
+REGION=$1
+ACCOUNT=$2
+TAG=${3:-latest}
+
+aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.$REGION.amazonaws.com
+
+# build protein-utils
+cd protein-utils
+docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/protein-utils:$TAG .
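+
+# The pushes below assume the target ECR repositories already exist; this
+# one-time guard creates them if missing (repository names are assumed to
+# match the image names built in this script).
+for repo in protein-utils alphafold-data alphafold-predict; do
+    aws ecr describe-repositories --repository-names $repo --region $REGION >/dev/null 2>&1 \
+        || aws ecr create-repository --repository-name $repo --region $REGION
+done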
+docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/protein-utils:$TAG +cd .. + +# build alphafold-data +cd alphafold-data +docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-data:$TAG . +docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-data:$TAG +cd .. + +# build alphafold-predict +cd alphafold-predict +docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-predict:$TAG . +docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/alphafold-predict:$TAG +cd .. diff --git a/assets/workflows/alphafold-multimer/config.yaml b/assets/workflows/alphafold-multimer/config.yaml new file mode 100644 index 0000000..3e38011 --- /dev/null +++ b/assets/workflows/alphafold-multimer/config.yaml @@ -0,0 +1,11 @@ +name: AlphaFold2-Multimer +description: "Predict multi-chain protein structures with AlphaFold2-Multimer" +engine: NEXTFLOW +main: main.nf +parameterTemplate: + fasta_path: + description: "Input file in multi-FASTA format." + optional: false +storageCapacity: 4800 +tags: + Name: "AlphaFold2Multimer" \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/main.nf b/assets/workflows/alphafold-multimer/main.nf new file mode 100644 index 0000000..c92e4e1 --- /dev/null +++ b/assets/workflows/alphafold-multimer/main.nf @@ -0,0 +1,244 @@ +nextflow.enable.dsl = 2 + +params.fasta_path = "" + +// static data files are in nextflow.config + +include { + SearchUniref90; + SearchMgnify; + SearchBFD; + SearchTemplatesTask; + SearchUniprot; + CombineSearchResults; +} from './searches.nf' + +include { + UnpackBFD; + UnpackPdb70nSeqres; + UnpackMMCIF; +} from './unpack.nf' + + +workflow { + + // Convert to one or many files + if (params.fasta_path[-1] == "/") { + fasta_path = params.fasta_path + "*" + } else { + fasta_path = params.fasta_path + } + + // [5nl6, 5nl6.fasta] + // [5mlq, 5mlq.fasta] + fasta_files = Channel + .fromPath(fasta_path) + .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } + + // 5nl6.fasta + // 5mlq.fasta + CheckAndValidateInputsTask(fasta_files) + + // [5nl6, 5nl6_A, 5nl6_A.fasta] + // [5nl6, 5nl6_B, 5nl6_B.fasta] + // [5mlq, 5mlq_A, 5mlq_A.fasta] + // [5mlq, 5mlq_B, 5mlq_B.fasta] + split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> + def newRecordFile = file("${record.id}.fasta") + newRecordFile.setText(record.text) + return tuple (newRecordFile.getBaseName().split("_")[0], newRecordFile.getBaseName(), newRecordFile) + } + + uniref30 = Channel.fromPath(params.uniref30_database_src).first() + alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() + + // Unpack the databases + UnpackBFD(params.bfd_database_a3m_ffdata, + params.bfd_database_a3m_ffindex, + params.bfd_database_cs219_ffdata, + params.bfd_database_cs219_ffindex, + params.bfd_database_hhm_ffdata, + params.bfd_database_hhm_ffindex) + UnpackPdb70nSeqres(params.pdb70_src, params.pdb_seqres_src, params.db_pathname) + UnpackMMCIF(params.pdb_mmcif_src1, + params.pdb_mmcif_src2, + params.pdb_mmcif_src3, + params.pdb_mmcif_src4, + params.pdb_mmcif_src5, + params.pdb_mmcif_src6, + params.pdb_mmcif_src7, + params.pdb_mmcif_src8, + params.pdb_mmcif_src9, + params.pdb_mmcif_obsolete) + + SearchUniref90(split_seqs, params.uniref90_database_src) + SearchMgnify(split_seqs, params.mgnify_database_src) + SearchUniprot(split_seqs, params.uniprot_database_src) + SearchBFD(split_seqs, UnpackBFD.out.db_folder, 
params.uniref30_database_src) + SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) + + // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] + // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] + msa_tuples = fasta_files + .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) + .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) + .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) + .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) + .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) + + // Gather + CombineSearchResults(msa_tuples) + + GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, + UnpackMMCIF.out.db_folder, + UnpackMMCIF.out.db_obsolete) + + // Predict. Five separate models + model_nums = Channel.of(0,1,2,3,4) + features = GenerateFeaturesTask.out.fasta_basename_with_features.combine(model_nums) + AlphaFoldMultimerInference(features, alphafold_model_parameters, params.random_seed, params.run_relax) + + MergeRankings(AlphaFoldMultimerInference.out.results.groupTuple(by: 0)) +} + +// Check the inputs and get size etc +process CheckAndValidateInputsTask { + tag "${fasta_basename}" + label 'protutils' + cpus 2 + memory '4 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/inputs" + + input: + tuple val(fasta_basename), path(fasta_path) + + output: + stdout + path "seq_info.json", emit: seq_info + path "${fasta_basename}.fasta", emit: fasta + val "${fasta_basename}", emit: fasta_basename + + script: + """ + set -euxo pipefail + + echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path + echo "<<<<<<<<<<<<<<<<<<<" + + ls -alR + + /opt/venv/bin/python \ + /opt/venv/lib/python3.8/site-packages/putils/check_and_validate_inputs.py \ + --target_id=$fasta_basename --fasta_path=$fasta_path + """ +} + +// Generate features from the searches +process GenerateFeaturesTask { + tag "${fasta_basename}" + label 'data' + cpus 4 + memory '16 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/features" + + input: + tuple val(fasta_basename), path(fasta_path), path(msa_dir) + path pdb_mmcif_folder + path mmcif_obsolete_path + + output: + tuple val(fasta_basename), path("output/features.pkl"), emit: fasta_basename_with_features + path "output/features.pkl", emit: features + path "output/generate_features_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + + mkdir -p output + + echo "***********************" + ls -alR $msa_dir/ + echo "***********************" + + /opt/venv/bin/python /opt/generate_features.py \ + --fasta_paths=$fasta_path \ + --msa_dir=$msa_dir \ + --template_mmcif_dir="$pdb_mmcif_folder" \ + --obsolete_pdbs_path="$mmcif_obsolete_path" \ + --template_hits="$msa_dir/pdb_hits.sto" \ + --model_preset=multimer \ + --output_dir=output \ + --max_template_date=2023-01-01 + + echo "***********************" + ls -alR output/ + echo "***********************" + + mv output/metrics.json output/generate_features_metrics.json + """ +} + +// AlphaFold Multimer +process AlphaFoldMultimerInference { + tag "${fasta_basename}_${modelnum}" + errorStrategy 'retry' + label 'predict' + cpus { 4 * Math.pow(2, task.attempt) } + memory { 16.GB * Math.pow(2, 
task.attempt) } + accelerator 1, type: 'nvidia-tesla-a10g' + maxRetries 2 + publishDir "/mnt/workflow/pubdir/${fasta_basename}/prediction_${modelnum}" + input: + tuple val(fasta_basename), path (features), val(modelnum) + path alphafold_model_parameters + val random_seed + val run_relax + + output: + tuple val(fasta_basename), path("output_model_${modelnum}/"), emit: results + + script: + """ + set -euxo pipefail + mkdir -p model/params + tar -xvf $alphafold_model_parameters -C model/params + export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 + export TF_FORCE_UNIFIED_MEMORY=1 + /opt/conda/bin/python /app/alphafold/predict.py \ + --target_id=$fasta_basename --features_path=$features --model_preset=multimer \ + --model_dir=model --random_seed=$random_seed --output_dir=output_model_${modelnum} \ + --run_relax=${run_relax} --use_gpu_relax=${run_relax} --model_num=$modelnum + + rm -rf output_model_${modelnum}/msas + """ +} + + +// Merge Rankings +process MergeRankings { + tag "${id}" + cpus 2 + memory 4.GB + publishDir "/mnt/workflow/pubdir/${id}" + label 'data' + + input: + tuple val(id), path(results) + + output: + path "rankings.json", emit: rankings + path "top_hit*", emit: top_hit + + script: + """ + mkdir -p output + echo ${results} + # Create top hit + /opt/venv/bin/python /opt/merge_rankings.py --output_dir output/ --model_dirs ${results} + mv output/top_hit* . + mv output/rankings.json . + """ +} diff --git a/assets/workflows/alphafold-multimer/nextflow.config b/assets/workflows/alphafold-multimer/nextflow.config new file mode 100644 index 0000000..ba03fb6 --- /dev/null +++ b/assets/workflows/alphafold-multimer/nextflow.config @@ -0,0 +1,45 @@ +params { + aws_region = "us-east-1" // set default region + db_pathname = "database" + + src_bucket = "omics-us-east-1" + src_prefix = "alphafold_multimer" + + uniref90_database_src = "s3://${src_bucket}/${src_prefix}/uniref90/uniref90.fasta" + mgnify_database_src = "s3://${src_bucket}/${src_prefix}/mgy/mgy_clusters_2022_05.fa" + uniref30_database_src = "s3://${src_bucket}/${src_prefix}/uniref30/" + pdb70_src = "s3://${src_bucket}/${src_prefix}/pdb70/" + pdb_seqres_src = "s3://${src_bucket}/${src_prefix}/pdb_seqres.txt" + alphafold_model_parameters = "s3://${src_bucket}/${src_prefix}/alphafold_params_2022-12-06.tar" + uniprot_database_src = "s3://${src_bucket}/${src_prefix}/uniprot/uniprot.fasta" + + bfd_database_a3m_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata" + bfd_database_a3m_ffindex = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex" + bfd_database_cs219_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata" + bfd_database_cs219_ffindex = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex" + bfd_database_hhm_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata" + bfd_database_hhm_ffindex = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex" + + pdb_mmcif_src1 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/1.tar" + pdb_mmcif_src2 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/2.tar" + pdb_mmcif_src3 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/3.tar" + pdb_mmcif_src4 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/4.tar" + pdb_mmcif_src5 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/5.tar" + 
pdb_mmcif_src6 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/6.tar" + pdb_mmcif_src7 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/7.tar" + pdb_mmcif_src8 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/8.tar" + pdb_mmcif_src9 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/9.tar" + pdb_mmcif_obsolete = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/obsolete.dat" + + random_seed = 42 + run_relax = "false" +} + +process { +// withLabel: protutils { container = '{{protein-utils:latest}}'} + withLabel: protutils { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/protein-utils:develop'} +// withLabel: data { container = '{{alphafold-data:latest}}'} + withLabel: data { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-data:develop'} +// withLabel: predict { container = '{{alphafold-predict:latest}}'} + withLabel: predict { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-predict:develop'} +} \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/parameter-template.json b/assets/workflows/alphafold-multimer/parameter-template.json new file mode 100644 index 0000000..a239744 --- /dev/null +++ b/assets/workflows/alphafold-multimer/parameter-template.json @@ -0,0 +1,6 @@ +{ + "fasta_path": { + "description": "Input file in multi-FASTA format.", + "optional": false + } +} \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/params.json b/assets/workflows/alphafold-multimer/params.json new file mode 100644 index 0000000..fd4648f --- /dev/null +++ b/assets/workflows/alphafold-multimer/params.json @@ -0,0 +1,3 @@ +{ + "fasta_path":"s3://bioteam-compchem-test-inputs/alhpafold-multimer/" +} \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/searches.nf b/assets/workflows/alphafold-multimer/searches.nf new file mode 100644 index 0000000..b828d28 --- /dev/null +++ b/assets/workflows/alphafold-multimer/searches.nf @@ -0,0 +1,220 @@ +nextflow.enable.dsl = 2 + +process SearchUniref90 { + tag "${record_id}" + label 'data' + cpus 8 + memory '32 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" + + input: + tuple val(fasta_basename), val(record_id), path(fasta_record_path) + path database_path + + output: + tuple val(fasta_basename), val(record_id), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_record_id_and_msa + tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_uniref90_hits.sto", emit: msa + path "output_${record_id}/${record_id}_uniref90_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + cat $fasta_record_path + + mkdir -p output_${record_id} + + /opt/venv/bin/python /opt/create_msa_monomer.py \ + --fasta_path=$fasta_record_path \ + --database_type=uniref90 \ + --database_path=$database_path \ + --output_dir=output_${record_id} \ + --cpu=$task.cpus + + mv output_${record_id}/uniref90_hits.sto output_${record_id}/${record_id}_uniref90_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniref90_metrics.json + """ +} + +process SearchMgnify { + tag "${record_id}" + label 'data' + cpus 8 + memory '64 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" + + input: + tuple val(fasta_basename), val(record_id), path(fasta_record_path) + path database_path + + output: + tuple val(fasta_basename), path("output_${record_id}/${record_id}_mgnify_hits.sto"), emit: fasta_basename_with_msa + path 
"output_${record_id}/${record_id}_mgnify_hits.sto", emit: msa + path "output_${record_id}/${record_id}_mgnify_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + cat $fasta_record_path + + mkdir -p output_${record_id} + + /opt/venv/bin/python /opt/create_msa_monomer.py \ + --fasta_path=$fasta_record_path \ + --database_type=mgnify \ + --database_path=$database_path \ + --output_dir=output_${record_id} \ + --cpu=$task.cpus + + mv output_${record_id}/mgnify_hits.sto output_${record_id}/${record_id}_mgnify_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_mgnify_metrics.json + """ +} + +process SearchUniprot { + tag "${record_id}" + label 'data' + cpus 8 + memory '32 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" + + input: + tuple val(fasta_basename), val(record_id), path(fasta_record_path) + path database_path + + output: + tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniprot_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_uniprot_hits.sto", emit: msa + path "output_${record_id}/${record_id}_uniprot_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + cat $fasta_record_path + + mkdir -p output_${record_id} + + /opt/venv/bin/python /opt/create_msa_monomer.py \ + --fasta_path=$fasta_record_path \ + --database_type=uniprot \ + --database_path=$database_path \ + --output_dir=output_${record_id} \ + --cpu=$task.cpus + + mv output_${record_id}/uniprot_hits.sto output_${record_id}/${record_id}_uniprot_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniprot_metrics.json + """ +} + +process SearchBFD { + tag "${record_id}" + label 'data' + + cpus { 8 * Math.pow(2, task.attempt) } + memory { 64.GB * Math.pow(2, task.attempt) } + maxRetries 1 + errorStrategy 'retry' + + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" + + input: + tuple val(fasta_basename), val(record_id), path(fasta_record_path) + path bfd_database_folder + path uniref30_database_folder + + output: + tuple val(fasta_basename), path("output_${record_id}/${record_id}_bfd_hits.a3m"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_bfd_hits.a3m", emit: msa + path "output_${record_id}/${record_id}_bfd_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + cat $fasta_record_path + mkdir -p output_${record_id} + + /opt/venv/bin/python /opt/create_msa_monomer.py \ + --fasta_path=$fasta_record_path \ + --database_type=bfd \ + --database_path=$bfd_database_folder \ + --database_path_2=$uniref30_database_folder \ + --output_dir=output_${record_id} \ + --cpu=$task.cpus + + mv output_${record_id}/bfd_hits.a3m output_${record_id}/${record_id}_bfd_hits.a3m + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_bfd_metrics.json + """ +} + +process SearchTemplatesTask { + tag "${record_id}" + label 'data' + cpus 2 + memory '8 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" + + input: + tuple val(fasta_basename), val(record_id), path(msa_path) + path pdb_db_folder + + output: + tuple val(fasta_basename), path("output_${record_id}/${record_id}_pdb_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_pdb_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + + mkdir -p output_${record_id} + + /opt/venv/bin/python /opt/search_templates.py \ + --msa_path=$msa_path \ + --output_dir=output_${record_id} \ + --database_path=$pdb_db_folder \ + --model_preset=multimer \ + 
--cpu=$task.cpus + + mv output_${record_id}/pdb_hits.sto output_${record_id}/${record_id}_pdb_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_pdb_metrics.json + """ +} + +// Combine/rename results from parallel searches as AlphaFold expects +process CombineSearchResults { + tag "${fasta_basename}" + label 'data' + cpus 4 + memory '8 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" + + input: + tuple val(fasta_basename), path(fasta_path), path(uniref90_msas), path(mgnify_msas), path(uniprot_msas), path(bfd_msas), path(template_hits) + + output: + tuple val(fasta_basename), path(fasta_path), path ("msa/"), emit: fasta_basename_fasta_and_msa_path + path "msa/", emit: msa_path + + script: + """ + echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path + echo $uniref90_msas + echo $mgnify_msas + echo $uniprot_msas + echo $bfd_msas + echo $template_hits + echo "<<<<<<<<<<<<<<<<<<<" + + mkdir -p msa + /opt/venv/bin/python /opt/update_locations.py msa $uniref90_msas + /opt/venv/bin/python /opt/update_locations.py msa $mgnify_msas + /opt/venv/bin/python /opt/update_locations.py msa $uniprot_msas + /opt/venv/bin/python /opt/update_locations.py msa $bfd_msas + /opt/venv/bin/python /opt/update_locations.py msa $template_hits + + echo "***********************" + ls -alR msa/ + echo "***********************" + """ +} diff --git a/assets/workflows/alphafold-multimer/unpack.nf b/assets/workflows/alphafold-multimer/unpack.nf new file mode 100644 index 0000000..146bc14 --- /dev/null +++ b/assets/workflows/alphafold-multimer/unpack.nf @@ -0,0 +1,149 @@ +// Utilities to unpack/organize certain MSA databases + +process UnpackBFD { + cpus 2 + memory '4 GB' + // Don't publish - we don't want copies of the databases + + input: + path bfd_database_a3m_ffdata + path bfd_database_a3m_ffindex + path bfd_database_cs219_ffdata + path bfd_database_cs219_ffindex + path bfd_database_hhm_ffdata + path bfd_database_hhm_ffindex + + output: + path "bfd/", emit: db_folder + + script: + """ + set -euxo pipefail + + # BFD + mkdir -p bfd + mv $bfd_database_a3m_ffdata bfd/ + mv $bfd_database_a3m_ffindex bfd/ + mv $bfd_database_cs219_ffdata bfd/ + mv $bfd_database_cs219_ffindex bfd/ + mv $bfd_database_hhm_ffdata bfd/ + mv $bfd_database_hhm_ffindex bfd/ + """ +} + + +process UnpackUniprot { + cpus 4 + memory '8 GB' + // Don't publish - we don't want copies of the databases + + input: + path uniprot_database_src + path base_database_path + + output: + path "$base_database_path/uniprot/uniprot.fasta", emit: db + + script: + """ + set -euxo pipefail + + # Uniref30 + mkdir -p $base_database_path/uniprot + tar -xvf $uniprot_database_src -C $base_database_path/uniprot + """ +} + + +process UnpackPdb70nSeqres { + label 'data' + cpus 2 + memory '4 GB' + // Don't publish - we don't want copies of the databases + + input: + path pdb70_src + path pdb_seqres_src + val base_database_path + + output: + path "$base_database_path/pdb/", emit: db_folder + path "$base_database_path/pdb/pdb_seqres.txt", emit: db_seqres + + script: + """ + set -euxo pipefail + + # Templates - pdb70 and seqres + mkdir -p $base_database_path/pdb + mv $pdb70_src/* $base_database_path/pdb/ + + # filter strange sequences containing 0 + /opt/venv/bin/python /opt/filter_pdb.py $pdb_seqres_src $base_database_path/pdb/pdb_seqres.txt + ls -laR $base_database_path/pdb/ + """ +} + + +process UnpackMMCIF { + cpus 2 + memory '4 GB' + // Don't publish - we don't want copies of the databases + + input: + path pdb_mmcif_src1 + 
path pdb_mmcif_src2 + path pdb_mmcif_src3 + path pdb_mmcif_src4 + path pdb_mmcif_src5 + path pdb_mmcif_src6 + path pdb_mmcif_src7 + path pdb_mmcif_src8 + path pdb_mmcif_src9 + path pdb_mmcif_obsolete + + output: + path "pdb_mmcif/mmcif_files/", emit: db_folder + path "pdb_mmcif/obsolete.dat", emit: db_obsolete + + script: + """ + set -euxo pipefail + mkdir pdb_mmcif + mkdir pdb_mmcif/mmcif_files/ + mv $pdb_mmcif_obsolete pdb_mmcif/ + ls -alR + + # Features + tar -xf $pdb_mmcif_src1 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src2 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src3 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src4 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src5 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src6 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src7 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src8 -C pdb_mmcif/mmcif_files/ + tar -xf $pdb_mmcif_src9 -C pdb_mmcif/mmcif_files/ + """ +} + + +process UnpackRecords { + tag "${id}" + label 'protutils' + cpus 2 + memory '4 GB' + publishDir "/mnt/workflow/pubdir/${id}/input" + + input: + tuple val(id), val(header), val(seqString) + + output: + tuple val(id), path("input.fasta"), emit: fasta + + script: + """ + set -euxo pipefail + echo -e ">${header}\n${seqString}" > input.fasta + """ +} \ No newline at end of file From 302298743400d8446655470919467336b0724e95 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Mon, 16 Dec 2024 11:17:53 -0700 Subject: [PATCH 02/20] mv alphafold-multimer -> alphafold2-multimer --- assets/workflows/alphafold-multimer/README.md | 59 ----- .../workflows/alphafold-multimer/config.yaml | 11 - assets/workflows/alphafold-multimer/main.nf | 244 ------------------ .../alphafold-multimer/nextflow.config | 45 ---- .../workflows/alphafold-multimer/searches.nf | 220 ---------------- assets/workflows/alphafold-multimer/unpack.nf | 149 ----------- .../workflows/alphafold2-multimer/README.md | 33 +-- .../build_containers.sh | 0 .../workflows/alphafold2-multimer/config.yaml | 3 - assets/workflows/alphafold2-multimer/main.nf | 188 ++++++++------ .../alphafold2-multimer/nextflow.config | 19 +- .../parameter-template.json | 0 .../params.json | 0 .../workflows/alphafold2-multimer/searches.nf | 142 +++++----- .../workflows/alphafold2-multimer/unpack.nf | 20 +- 15 files changed, 219 insertions(+), 914 deletions(-) delete mode 100644 assets/workflows/alphafold-multimer/README.md delete mode 100644 assets/workflows/alphafold-multimer/config.yaml delete mode 100644 assets/workflows/alphafold-multimer/main.nf delete mode 100644 assets/workflows/alphafold-multimer/nextflow.config delete mode 100644 assets/workflows/alphafold-multimer/searches.nf delete mode 100644 assets/workflows/alphafold-multimer/unpack.nf rename assets/workflows/{alphafold-multimer => alphafold2-multimer}/build_containers.sh (100%) rename assets/workflows/{alphafold-multimer => alphafold2-multimer}/parameter-template.json (100%) rename assets/workflows/{alphafold-multimer => alphafold2-multimer}/params.json (100%) diff --git a/assets/workflows/alphafold-multimer/README.md b/assets/workflows/alphafold-multimer/README.md deleted file mode 100644 index 75ca5b4..0000000 --- a/assets/workflows/alphafold-multimer/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# AlphaFold Multimer - -This repository helps you set up and run AlphaFold Multimer on AWS HealthOmics. - -The following setup steps below assume you are starting from scratch and prefer to use the command line. 
This repository will also have 1-click build capabilities at the root of the repo. - -## Running a workflow - -Pick your favorite small fasta file to run your fist end-to-end test. The following command can be done from the terminal or you can navigate to the AWS console. Note that OpenFold likely will work best using `STATIC` run storage due to low data volumes and faster startup times. - -### Example params.json - -```json -{ - "fasta_path":"s3://mybucket/alhpafold-multimer/" -} -``` -### Running the Workflow - -Replace `$ROLEARN`, `$OUTPUTLOC`, `$PARAMS`, `$WFID` as appropriate. Also modify the `params.json` to point to where your FASTA resides. - -```bash -WFID=1234567 -ROLEARN=arn:aws:iam::0123456789012:role/omics-workflow-role-0123456789012-us-east-1 -OUTPUTLOC=s3://mybuckets/run_outputs/openfold -PARAMS=./params.json - -aws omics start-run --workflow-id $WFID --role-arn $ROLEARN --output-uri $OUTPUTLOC --storage-type STATIC --storage-capacity 4800 --parameters file://$PARAMS --name alphafold-multimer -``` -All results are written to a location defined within `$OUTPUTLOC` above. To get to the root directory of the ouputs, you can use the `GetRun` API, which provides the path as `runOutputUri`. Alternatively, this location is available in the console. - -## Citation -AlphaFold Multimer was developed by DeepMind. The original source code can be found [here](https://github.com/google-deepmind/alphafold). The algorithm is presented in the following papers. - -``` -@Article{AlphaFold2021, - author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A A and Ballard, Andrew J and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis}, - journal = {Nature}, - title = {Highly accurate protein structure prediction with {AlphaFold}}, - year = {2021}, - volume = {596}, - number = {7873}, - pages = {583--589}, - doi = {10.1038/s41586-021-03819-2} -} -``` - -``` -@article {AlphaFold-Multimer2021, - author = {Evans, Richard and O{\textquoteright}Neill, Michael and Pritzel, Alexander and Antropova, Natasha and Senior, Andrew and Green, Tim and {\v{Z}}{\'\i}dek, Augustin and Bates, Russ and Blackwell, Sam and Yim, Jason and Ronneberger, Olaf and Bodenstein, Sebastian and Zielinski, Michal and Bridgland, Alex and Potapenko, Anna and Cowie, Andrew and Tunyasuvunakool, Kathryn and Jain, Rishub and Clancy, Ellen and Kohli, Pushmeet and Jumper, John and Hassabis, Demis}, - journal = {bioRxiv}, - title = {Protein complex prediction with AlphaFold-Multimer}, - year = {2021}, - elocation-id = {2021.10.04.463034}, - doi = {10.1101/2021.10.04.463034}, - URL = {https://www.biorxiv.org/content/early/2021/10/04/2021.10.04.463034}, - eprint = {https://www.biorxiv.org/content/early/2021/10/04/2021.10.04.463034.full.pdf}, -} -``` diff --git a/assets/workflows/alphafold-multimer/config.yaml b/assets/workflows/alphafold-multimer/config.yaml deleted file mode 100644 index 3e38011..0000000 --- a/assets/workflows/alphafold-multimer/config.yaml +++ /dev/null 
@@ -1,11 +0,0 @@ -name: AlphaFold2-Multimer -description: "Predict multi-chain protein structures with AlphaFold2-Multimer" -engine: NEXTFLOW -main: main.nf -parameterTemplate: - fasta_path: - description: "Input file in multi-FASTA format." - optional: false -storageCapacity: 4800 -tags: - Name: "AlphaFold2Multimer" \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/main.nf b/assets/workflows/alphafold-multimer/main.nf deleted file mode 100644 index c92e4e1..0000000 --- a/assets/workflows/alphafold-multimer/main.nf +++ /dev/null @@ -1,244 +0,0 @@ -nextflow.enable.dsl = 2 - -params.fasta_path = "" - -// static data files are in nextflow.config - -include { - SearchUniref90; - SearchMgnify; - SearchBFD; - SearchTemplatesTask; - SearchUniprot; - CombineSearchResults; -} from './searches.nf' - -include { - UnpackBFD; - UnpackPdb70nSeqres; - UnpackMMCIF; -} from './unpack.nf' - - -workflow { - - // Convert to one or many files - if (params.fasta_path[-1] == "/") { - fasta_path = params.fasta_path + "*" - } else { - fasta_path = params.fasta_path - } - - // [5nl6, 5nl6.fasta] - // [5mlq, 5mlq.fasta] - fasta_files = Channel - .fromPath(fasta_path) - .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } - - // 5nl6.fasta - // 5mlq.fasta - CheckAndValidateInputsTask(fasta_files) - - // [5nl6, 5nl6_A, 5nl6_A.fasta] - // [5nl6, 5nl6_B, 5nl6_B.fasta] - // [5mlq, 5mlq_A, 5mlq_A.fasta] - // [5mlq, 5mlq_B, 5mlq_B.fasta] - split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> - def newRecordFile = file("${record.id}.fasta") - newRecordFile.setText(record.text) - return tuple (newRecordFile.getBaseName().split("_")[0], newRecordFile.getBaseName(), newRecordFile) - } - - uniref30 = Channel.fromPath(params.uniref30_database_src).first() - alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() - - // Unpack the databases - UnpackBFD(params.bfd_database_a3m_ffdata, - params.bfd_database_a3m_ffindex, - params.bfd_database_cs219_ffdata, - params.bfd_database_cs219_ffindex, - params.bfd_database_hhm_ffdata, - params.bfd_database_hhm_ffindex) - UnpackPdb70nSeqres(params.pdb70_src, params.pdb_seqres_src, params.db_pathname) - UnpackMMCIF(params.pdb_mmcif_src1, - params.pdb_mmcif_src2, - params.pdb_mmcif_src3, - params.pdb_mmcif_src4, - params.pdb_mmcif_src5, - params.pdb_mmcif_src6, - params.pdb_mmcif_src7, - params.pdb_mmcif_src8, - params.pdb_mmcif_src9, - params.pdb_mmcif_obsolete) - - SearchUniref90(split_seqs, params.uniref90_database_src) - SearchMgnify(split_seqs, params.mgnify_database_src) - SearchUniprot(split_seqs, params.uniprot_database_src) - SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src) - SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) - - // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] - // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] 
- msa_tuples = fasta_files - .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) - .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) - .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) - .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) - .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) - - // Gather - CombineSearchResults(msa_tuples) - - GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, - UnpackMMCIF.out.db_folder, - UnpackMMCIF.out.db_obsolete) - - // Predict. Five separate models - model_nums = Channel.of(0,1,2,3,4) - features = GenerateFeaturesTask.out.fasta_basename_with_features.combine(model_nums) - AlphaFoldMultimerInference(features, alphafold_model_parameters, params.random_seed, params.run_relax) - - MergeRankings(AlphaFoldMultimerInference.out.results.groupTuple(by: 0)) -} - -// Check the inputs and get size etc -process CheckAndValidateInputsTask { - tag "${fasta_basename}" - label 'protutils' - cpus 2 - memory '4 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/inputs" - - input: - tuple val(fasta_basename), path(fasta_path) - - output: - stdout - path "seq_info.json", emit: seq_info - path "${fasta_basename}.fasta", emit: fasta - val "${fasta_basename}", emit: fasta_basename - - script: - """ - set -euxo pipefail - - echo ">>>>>>>>>>>>>>>>>>>" - echo $fasta_basename - echo $fasta_path - echo "<<<<<<<<<<<<<<<<<<<" - - ls -alR - - /opt/venv/bin/python \ - /opt/venv/lib/python3.8/site-packages/putils/check_and_validate_inputs.py \ - --target_id=$fasta_basename --fasta_path=$fasta_path - """ -} - -// Generate features from the searches -process GenerateFeaturesTask { - tag "${fasta_basename}" - label 'data' - cpus 4 - memory '16 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/features" - - input: - tuple val(fasta_basename), path(fasta_path), path(msa_dir) - path pdb_mmcif_folder - path mmcif_obsolete_path - - output: - tuple val(fasta_basename), path("output/features.pkl"), emit: fasta_basename_with_features - path "output/features.pkl", emit: features - path "output/generate_features_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - - mkdir -p output - - echo "***********************" - ls -alR $msa_dir/ - echo "***********************" - - /opt/venv/bin/python /opt/generate_features.py \ - --fasta_paths=$fasta_path \ - --msa_dir=$msa_dir \ - --template_mmcif_dir="$pdb_mmcif_folder" \ - --obsolete_pdbs_path="$mmcif_obsolete_path" \ - --template_hits="$msa_dir/pdb_hits.sto" \ - --model_preset=multimer \ - --output_dir=output \ - --max_template_date=2023-01-01 - - echo "***********************" - ls -alR output/ - echo "***********************" - - mv output/metrics.json output/generate_features_metrics.json - """ -} - -// AlphaFold Multimer -process AlphaFoldMultimerInference { - tag "${fasta_basename}_${modelnum}" - errorStrategy 'retry' - label 'predict' - cpus { 4 * Math.pow(2, task.attempt) } - memory { 16.GB * Math.pow(2, task.attempt) } - accelerator 1, type: 'nvidia-tesla-a10g' - maxRetries 2 - publishDir "/mnt/workflow/pubdir/${fasta_basename}/prediction_${modelnum}" - input: - tuple val(fasta_basename), path (features), val(modelnum) - path alphafold_model_parameters - val random_seed - val run_relax - - output: - tuple val(fasta_basename), path("output_model_${modelnum}/"), emit: results - - script: - """ - set -euxo pipefail - mkdir -p model/params - tar -xvf $alphafold_model_parameters -C model/params - export 
XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 - export TF_FORCE_UNIFIED_MEMORY=1 - /opt/conda/bin/python /app/alphafold/predict.py \ - --target_id=$fasta_basename --features_path=$features --model_preset=multimer \ - --model_dir=model --random_seed=$random_seed --output_dir=output_model_${modelnum} \ - --run_relax=${run_relax} --use_gpu_relax=${run_relax} --model_num=$modelnum - - rm -rf output_model_${modelnum}/msas - """ -} - - -// Merge Rankings -process MergeRankings { - tag "${id}" - cpus 2 - memory 4.GB - publishDir "/mnt/workflow/pubdir/${id}" - label 'data' - - input: - tuple val(id), path(results) - - output: - path "rankings.json", emit: rankings - path "top_hit*", emit: top_hit - - script: - """ - mkdir -p output - echo ${results} - # Create top hit - /opt/venv/bin/python /opt/merge_rankings.py --output_dir output/ --model_dirs ${results} - mv output/top_hit* . - mv output/rankings.json . - """ -} diff --git a/assets/workflows/alphafold-multimer/nextflow.config b/assets/workflows/alphafold-multimer/nextflow.config deleted file mode 100644 index ba03fb6..0000000 --- a/assets/workflows/alphafold-multimer/nextflow.config +++ /dev/null @@ -1,45 +0,0 @@ -params { - aws_region = "us-east-1" // set default region - db_pathname = "database" - - src_bucket = "omics-us-east-1" - src_prefix = "alphafold_multimer" - - uniref90_database_src = "s3://${src_bucket}/${src_prefix}/uniref90/uniref90.fasta" - mgnify_database_src = "s3://${src_bucket}/${src_prefix}/mgy/mgy_clusters_2022_05.fa" - uniref30_database_src = "s3://${src_bucket}/${src_prefix}/uniref30/" - pdb70_src = "s3://${src_bucket}/${src_prefix}/pdb70/" - pdb_seqres_src = "s3://${src_bucket}/${src_prefix}/pdb_seqres.txt" - alphafold_model_parameters = "s3://${src_bucket}/${src_prefix}/alphafold_params_2022-12-06.tar" - uniprot_database_src = "s3://${src_bucket}/${src_prefix}/uniprot/uniprot.fasta" - - bfd_database_a3m_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata" - bfd_database_a3m_ffindex = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex" - bfd_database_cs219_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata" - bfd_database_cs219_ffindex = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex" - bfd_database_hhm_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata" - bfd_database_hhm_ffindex = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex" - - pdb_mmcif_src1 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/1.tar" - pdb_mmcif_src2 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/2.tar" - pdb_mmcif_src3 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/3.tar" - pdb_mmcif_src4 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/4.tar" - pdb_mmcif_src5 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/5.tar" - pdb_mmcif_src6 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/6.tar" - pdb_mmcif_src7 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/7.tar" - pdb_mmcif_src8 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/8.tar" - pdb_mmcif_src9 = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/9.tar" - pdb_mmcif_obsolete = "s3://${src_bucket}/${src_prefix}/pdb_mmcif/obsolete.dat" - - random_seed = 42 - run_relax = "false" -} - -process { -// withLabel: protutils { container = '{{protein-utils:latest}}'} - withLabel: 
protutils { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/protein-utils:develop'} -// withLabel: data { container = '{{alphafold-data:latest}}'} - withLabel: data { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-data:develop'} -// withLabel: predict { container = '{{alphafold-predict:latest}}'} - withLabel: predict { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-predict:develop'} -} \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/searches.nf b/assets/workflows/alphafold-multimer/searches.nf deleted file mode 100644 index b828d28..0000000 --- a/assets/workflows/alphafold-multimer/searches.nf +++ /dev/null @@ -1,220 +0,0 @@ -nextflow.enable.dsl = 2 - -process SearchUniref90 { - tag "${record_id}" - label 'data' - cpus 8 - memory '32 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" - - input: - tuple val(fasta_basename), val(record_id), path(fasta_record_path) - path database_path - - output: - tuple val(fasta_basename), val(record_id), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_record_id_and_msa - tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_msa - path "output_${record_id}/${record_id}_uniref90_hits.sto", emit: msa - path "output_${record_id}/${record_id}_uniref90_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - cat $fasta_record_path - - mkdir -p output_${record_id} - - /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_record_path \ - --database_type=uniref90 \ - --database_path=$database_path \ - --output_dir=output_${record_id} \ - --cpu=$task.cpus - - mv output_${record_id}/uniref90_hits.sto output_${record_id}/${record_id}_uniref90_hits.sto - mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniref90_metrics.json - """ -} - -process SearchMgnify { - tag "${record_id}" - label 'data' - cpus 8 - memory '64 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" - - input: - tuple val(fasta_basename), val(record_id), path(fasta_record_path) - path database_path - - output: - tuple val(fasta_basename), path("output_${record_id}/${record_id}_mgnify_hits.sto"), emit: fasta_basename_with_msa - path "output_${record_id}/${record_id}_mgnify_hits.sto", emit: msa - path "output_${record_id}/${record_id}_mgnify_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - cat $fasta_record_path - - mkdir -p output_${record_id} - - /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_record_path \ - --database_type=mgnify \ - --database_path=$database_path \ - --output_dir=output_${record_id} \ - --cpu=$task.cpus - - mv output_${record_id}/mgnify_hits.sto output_${record_id}/${record_id}_mgnify_hits.sto - mv output_${record_id}/metrics.json output_${record_id}/${record_id}_mgnify_metrics.json - """ -} - -process SearchUniprot { - tag "${record_id}" - label 'data' - cpus 8 - memory '32 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" - - input: - tuple val(fasta_basename), val(record_id), path(fasta_record_path) - path database_path - - output: - tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniprot_hits.sto"), emit: fasta_basename_with_msa - path "output_${record_id}/${record_id}_uniprot_hits.sto", emit: msa - path "output_${record_id}/${record_id}_uniprot_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - cat $fasta_record_path - - mkdir -p 
output_${record_id} - - /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_record_path \ - --database_type=uniprot \ - --database_path=$database_path \ - --output_dir=output_${record_id} \ - --cpu=$task.cpus - - mv output_${record_id}/uniprot_hits.sto output_${record_id}/${record_id}_uniprot_hits.sto - mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniprot_metrics.json - """ -} - -process SearchBFD { - tag "${record_id}" - label 'data' - - cpus { 8 * Math.pow(2, task.attempt) } - memory { 64.GB * Math.pow(2, task.attempt) } - maxRetries 1 - errorStrategy 'retry' - - publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" - - input: - tuple val(fasta_basename), val(record_id), path(fasta_record_path) - path bfd_database_folder - path uniref30_database_folder - - output: - tuple val(fasta_basename), path("output_${record_id}/${record_id}_bfd_hits.a3m"), emit: fasta_basename_with_msa - path "output_${record_id}/${record_id}_bfd_hits.a3m", emit: msa - path "output_${record_id}/${record_id}_bfd_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - cat $fasta_record_path - mkdir -p output_${record_id} - - /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_record_path \ - --database_type=bfd \ - --database_path=$bfd_database_folder \ - --database_path_2=$uniref30_database_folder \ - --output_dir=output_${record_id} \ - --cpu=$task.cpus - - mv output_${record_id}/bfd_hits.a3m output_${record_id}/${record_id}_bfd_hits.a3m - mv output_${record_id}/metrics.json output_${record_id}/${record_id}_bfd_metrics.json - """ -} - -process SearchTemplatesTask { - tag "${record_id}" - label 'data' - cpus 2 - memory '8 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" - - input: - tuple val(fasta_basename), val(record_id), path(msa_path) - path pdb_db_folder - - output: - tuple val(fasta_basename), path("output_${record_id}/${record_id}_pdb_hits.sto"), emit: fasta_basename_with_msa - path "output_${record_id}/${record_id}_pdb_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - - mkdir -p output_${record_id} - - /opt/venv/bin/python /opt/search_templates.py \ - --msa_path=$msa_path \ - --output_dir=output_${record_id} \ - --database_path=$pdb_db_folder \ - --model_preset=multimer \ - --cpu=$task.cpus - - mv output_${record_id}/pdb_hits.sto output_${record_id}/${record_id}_pdb_hits.sto - mv output_${record_id}/metrics.json output_${record_id}/${record_id}_pdb_metrics.json - """ -} - -// Combine/rename results from parallel searches as AlphaFold expects -process CombineSearchResults { - tag "${fasta_basename}" - label 'data' - cpus 4 - memory '8 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" - - input: - tuple val(fasta_basename), path(fasta_path), path(uniref90_msas), path(mgnify_msas), path(uniprot_msas), path(bfd_msas), path(template_hits) - - output: - tuple val(fasta_basename), path(fasta_path), path ("msa/"), emit: fasta_basename_fasta_and_msa_path - path "msa/", emit: msa_path - - script: - """ - echo ">>>>>>>>>>>>>>>>>>>" - echo $fasta_basename - echo $fasta_path - echo $uniref90_msas - echo $mgnify_msas - echo $uniprot_msas - echo $bfd_msas - echo $template_hits - echo "<<<<<<<<<<<<<<<<<<<" - - mkdir -p msa - /opt/venv/bin/python /opt/update_locations.py msa $uniref90_msas - /opt/venv/bin/python /opt/update_locations.py msa $mgnify_msas - /opt/venv/bin/python /opt/update_locations.py msa $uniprot_msas - /opt/venv/bin/python /opt/update_locations.py msa $bfd_msas - 
/opt/venv/bin/python /opt/update_locations.py msa $template_hits - - echo "***********************" - ls -alR msa/ - echo "***********************" - """ -} diff --git a/assets/workflows/alphafold-multimer/unpack.nf b/assets/workflows/alphafold-multimer/unpack.nf deleted file mode 100644 index 146bc14..0000000 --- a/assets/workflows/alphafold-multimer/unpack.nf +++ /dev/null @@ -1,149 +0,0 @@ -// Utilities to unpack/organize certain MSA databases - -process UnpackBFD { - cpus 2 - memory '4 GB' - // Don't publish - we don't want copies of the databases - - input: - path bfd_database_a3m_ffdata - path bfd_database_a3m_ffindex - path bfd_database_cs219_ffdata - path bfd_database_cs219_ffindex - path bfd_database_hhm_ffdata - path bfd_database_hhm_ffindex - - output: - path "bfd/", emit: db_folder - - script: - """ - set -euxo pipefail - - # BFD - mkdir -p bfd - mv $bfd_database_a3m_ffdata bfd/ - mv $bfd_database_a3m_ffindex bfd/ - mv $bfd_database_cs219_ffdata bfd/ - mv $bfd_database_cs219_ffindex bfd/ - mv $bfd_database_hhm_ffdata bfd/ - mv $bfd_database_hhm_ffindex bfd/ - """ -} - - -process UnpackUniprot { - cpus 4 - memory '8 GB' - // Don't publish - we don't want copies of the databases - - input: - path uniprot_database_src - path base_database_path - - output: - path "$base_database_path/uniprot/uniprot.fasta", emit: db - - script: - """ - set -euxo pipefail - - # Uniref30 - mkdir -p $base_database_path/uniprot - tar -xvf $uniprot_database_src -C $base_database_path/uniprot - """ -} - - -process UnpackPdb70nSeqres { - label 'data' - cpus 2 - memory '4 GB' - // Don't publish - we don't want copies of the databases - - input: - path pdb70_src - path pdb_seqres_src - val base_database_path - - output: - path "$base_database_path/pdb/", emit: db_folder - path "$base_database_path/pdb/pdb_seqres.txt", emit: db_seqres - - script: - """ - set -euxo pipefail - - # Templates - pdb70 and seqres - mkdir -p $base_database_path/pdb - mv $pdb70_src/* $base_database_path/pdb/ - - # filter strange sequences containing 0 - /opt/venv/bin/python /opt/filter_pdb.py $pdb_seqres_src $base_database_path/pdb/pdb_seqres.txt - ls -laR $base_database_path/pdb/ - """ -} - - -process UnpackMMCIF { - cpus 2 - memory '4 GB' - // Don't publish - we don't want copies of the databases - - input: - path pdb_mmcif_src1 - path pdb_mmcif_src2 - path pdb_mmcif_src3 - path pdb_mmcif_src4 - path pdb_mmcif_src5 - path pdb_mmcif_src6 - path pdb_mmcif_src7 - path pdb_mmcif_src8 - path pdb_mmcif_src9 - path pdb_mmcif_obsolete - - output: - path "pdb_mmcif/mmcif_files/", emit: db_folder - path "pdb_mmcif/obsolete.dat", emit: db_obsolete - - script: - """ - set -euxo pipefail - mkdir pdb_mmcif - mkdir pdb_mmcif/mmcif_files/ - mv $pdb_mmcif_obsolete pdb_mmcif/ - ls -alR - - # Features - tar -xf $pdb_mmcif_src1 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src2 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src3 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src4 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src5 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src6 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src7 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src8 -C pdb_mmcif/mmcif_files/ - tar -xf $pdb_mmcif_src9 -C pdb_mmcif/mmcif_files/ - """ -} - - -process UnpackRecords { - tag "${id}" - label 'protutils' - cpus 2 - memory '4 GB' - publishDir "/mnt/workflow/pubdir/${id}/input" - - input: - tuple val(id), val(header), val(seqString) - - output: - tuple val(id), path("input.fasta"), emit: fasta - - script: - """ - set -euxo 
pipefail
-    echo -e ">${header}\n${seqString}" > input.fasta
-    """
-}
\ No newline at end of file
diff --git a/assets/workflows/alphafold2-multimer/README.md b/assets/workflows/alphafold2-multimer/README.md
index fce0b4f..c42d638 100644
--- a/assets/workflows/alphafold2-multimer/README.md
+++ b/assets/workflows/alphafold2-multimer/README.md
@@ -1,56 +1,38 @@
 # AlphaFold Multimer
 
-This repository helps you set up and run AlphaFold Multimer on AWS HealthOmics. At the end of the configuration, you should be able to run a full end-to-end inference.
+This repository helps you set up and run AlphaFold Multimer on AWS HealthOmics.
 
-AlphaFold-Multimer requires several steps: at a high level they bundle into:
-
-1. Download and prepare the data
-2. Multisequence alignment (MSA)
-3. Inference
-
-Traditionally, the download and prepare data stage will download `tar.gz` files and unpack. This workflow has a series of optimizations that are designed to improve data staging times and reduce the time and cost of inference while improving scale (>2500 residues). All corresponding reference data is hosted by AWS HealthOmics, so there is no charge to customers to host that data.
+The following setup steps assume you are starting from scratch and prefer to use the command line. This repository will also have 1-click build capabilities at the root of the repo.
 
 ## Running a workflow
 
-Pick your favorite small fasta file to run your fist end-to-end test. The following command can be done from the terminal or you can navigate to the AWS console.
-
-### Inputs
-
-`target_id`: The ID of the target you wish to predict
-`fasta_path`: S3 URI to a single FASTA file that is in multi-FASTA format. Currently supports 1-chain per record.
+Pick your favorite small FASTA file to run your first end-to-end test. The following command can be run from the terminal, or you can navigate to the AWS console. Note that AlphaFold will likely work best using `STATIC` run storage due to low data volumes and faster startup times.
 
 ### Example params.json
 
-```
-
+```json
 {
-    "fasta_path":"s3://mybucket/input/multimer/7unl.fasta",
-    "target_id": "7unl"
+    "fasta_path":"s3://mybucket/alphafold-multimer/"
 }
 ```
-
 ### Running the Workflow
 
 Replace `$ROLEARN`, `$OUTPUTLOC`, `$PARAMS`, `$WFID` as appropriate. Also modify the `params.json` to point to where your FASTA resides.
 
-```
-
+```bash
 WFID=1234567
 ROLEARN=arn:aws:iam::0123456789012:role/omics-workflow-role-0123456789012-us-east-1
-OUTPUTLOC=s3://mybuckets/run_outputs/alphafold
+OUTPUTLOC=s3://mybuckets/run_outputs/alphafold2-multimer
 PARAMS=./params.json
 
 aws omics start-run --workflow-id $WFID --role-arn $ROLEARN --output-uri $OUTPUTLOC --storage-type STATIC --storage-capacity 4800 --parameters file://$PARAMS --name alphafold-multimer
 ```
-
 All results are written to a location defined within `$OUTPUTLOC` above. To get to the root directory of the outputs, you can use the `GetRun` API, which provides the path as `runOutputUri`. Alternatively, this location is available in the console.
 
 ## Citation
-
 AlphaFold Multimer was developed by DeepMind. The original source code can be found [here](https://github.com/google-deepmind/alphafold). The algorithm is presented in the following papers.
``` - @Article{AlphaFold2021, author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A A and Ballard, Andrew J and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis}, journal = {Nature}, @@ -64,7 +46,6 @@ AlphaFold Multimer was developed by DeepMind. The original source code can be fo ``` ``` - @article {AlphaFold-Multimer2021, author = {Evans, Richard and O{\textquoteright}Neill, Michael and Pritzel, Alexander and Antropova, Natasha and Senior, Andrew and Green, Tim and {\v{Z}}{\'\i}dek, Augustin and Bates, Russ and Blackwell, Sam and Yim, Jason and Ronneberger, Olaf and Bodenstein, Sebastian and Zielinski, Michal and Bridgland, Alex and Potapenko, Anna and Cowie, Andrew and Tunyasuvunakool, Kathryn and Jain, Rishub and Clancy, Ellen and Kohli, Pushmeet and Jumper, John and Hassabis, Demis}, journal = {bioRxiv}, diff --git a/assets/workflows/alphafold-multimer/build_containers.sh b/assets/workflows/alphafold2-multimer/build_containers.sh similarity index 100% rename from assets/workflows/alphafold-multimer/build_containers.sh rename to assets/workflows/alphafold2-multimer/build_containers.sh diff --git a/assets/workflows/alphafold2-multimer/config.yaml b/assets/workflows/alphafold2-multimer/config.yaml index 1db3c7d..3e38011 100644 --- a/assets/workflows/alphafold2-multimer/config.yaml +++ b/assets/workflows/alphafold2-multimer/config.yaml @@ -3,9 +3,6 @@ description: "Predict multi-chain protein structures with AlphaFold2-Multimer" engine: NEXTFLOW main: main.nf parameterTemplate: - target_id: - description: "The ID of the target being run." - optional: false fasta_path: description: "Input file in multi-FASTA format." 
optional: false diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index 49da065..c92e4e1 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -1,32 +1,53 @@ -/* groovylint-disable DuplicateNumberLiteral */ nextflow.enable.dsl = 2 -params.fasta_path = '' +params.fasta_path = "" // static data files are in nextflow.config include { - SearchUniref90 - SearchMgnify - SearchBFD - SearchTemplatesTask - SearchUniprot - CombineSearchResults -} from './searches' + SearchUniref90; + SearchMgnify; + SearchBFD; + SearchTemplatesTask; + SearchUniprot; + CombineSearchResults; +} from './searches.nf' include { - UnpackBFD - UnpackPdb70nSeqres - UnpackMMCIF -} from './unpack' + UnpackBFD; + UnpackPdb70nSeqres; + UnpackMMCIF; +} from './unpack.nf' -workflow AlphaFold2Multimer { - CheckAndValidateInputsTask(params.target_id, params.fasta_path) - // split fasta run parallel searches (Scatter) - split_seqs = CheckAndValidateInputsTask.out.fasta - .splitFasta(file: true) - .map { filename -> tuple(filename.toString().split('/')[-1].split('.fasta')[0], filename) } +workflow { + + // Convert to one or many files + if (params.fasta_path[-1] == "/") { + fasta_path = params.fasta_path + "*" + } else { + fasta_path = params.fasta_path + } + + // [5nl6, 5nl6.fasta] + // [5mlq, 5mlq.fasta] + fasta_files = Channel + .fromPath(fasta_path) + .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } + + // 5nl6.fasta + // 5mlq.fasta + CheckAndValidateInputsTask(fasta_files) + + // [5nl6, 5nl6_A, 5nl6_A.fasta] + // [5nl6, 5nl6_B, 5nl6_B.fasta] + // [5mlq, 5mlq_A, 5mlq_A.fasta] + // [5mlq, 5mlq_B, 5mlq_B.fasta] + split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> + def newRecordFile = file("${record.id}.fasta") + newRecordFile.setText(record.text) + return tuple (newRecordFile.getBaseName().split("_")[0], newRecordFile.getBaseName(), newRecordFile) + } uniref30 = Channel.fromPath(params.uniref30_database_src).first() alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() @@ -39,88 +60,98 @@ workflow AlphaFold2Multimer { params.bfd_database_hhm_ffdata, params.bfd_database_hhm_ffindex) UnpackPdb70nSeqres(params.pdb70_src, params.pdb_seqres_src, params.db_pathname) - UnpackMMCIF(params.pdb_mmcif_src1, - params.pdb_mmcif_src2, - params.pdb_mmcif_src3, - params.pdb_mmcif_src4, - params.pdb_mmcif_src5, - params.pdb_mmcif_src6, - params.pdb_mmcif_src7, - params.pdb_mmcif_src8, - params.pdb_mmcif_src9, + UnpackMMCIF(params.pdb_mmcif_src1, + params.pdb_mmcif_src2, + params.pdb_mmcif_src3, + params.pdb_mmcif_src4, + params.pdb_mmcif_src5, + params.pdb_mmcif_src6, + params.pdb_mmcif_src7, + params.pdb_mmcif_src8, + params.pdb_mmcif_src9, params.pdb_mmcif_obsolete) SearchUniref90(split_seqs, params.uniref90_database_src) SearchMgnify(split_seqs, params.mgnify_database_src) SearchUniprot(split_seqs, params.uniprot_database_src) SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src) - SearchTemplatesTask(SearchUniref90.out.msa_with_id, UnpackPdb70nSeqres.out.db_folder) + SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) - // Gather - CombineSearchResults(SearchUniref90.out.msa.collect(), - SearchUniprot.out.msa.collect(), - SearchMgnify.out.msa.collect(), - SearchBFD.out.msa.collect(), - 
SearchTemplatesTask.out.msa.collect()) + // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] + // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] + msa_tuples = fasta_files + .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) + .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) + .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) + .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) + .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) - GenerateFeaturesTask(CheckAndValidateInputsTask.out.fasta, - CombineSearchResults.out.msa_path, - UnpackMMCIF.out.db_folder, - UnpackMMCIF.out.db_obsolete) + // Gather + CombineSearchResults(msa_tuples) + GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, + UnpackMMCIF.out.db_folder, + UnpackMMCIF.out.db_obsolete) + // Predict. Five separate models - model_nums = Channel.of(0, 1, 2, 3, 4) - AlphaFoldMultimerInference(params.target_id, - GenerateFeaturesTask.out.features, - params.alphafold_model_parameters, - model_nums, params.random_seed, - params.run_relax) - - MergeRankings(AlphaFoldMultimerInference.out.results.collect()) + model_nums = Channel.of(0,1,2,3,4) + features = GenerateFeaturesTask.out.fasta_basename_with_features.combine(model_nums) + AlphaFoldMultimerInference(features, alphafold_model_parameters, params.random_seed, params.run_relax) + + MergeRankings(AlphaFoldMultimerInference.out.results.groupTuple(by: 0)) } // Check the inputs and get size etc process CheckAndValidateInputsTask { + tag "${fasta_basename}" label 'protutils' cpus 2 memory '4 GB' - publishDir '/mnt/workflow/pubdir/inputs' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/inputs" input: - val target_id - path fasta_path + tuple val(fasta_basename), path(fasta_path) output: stdout - path 'seq_info.json', emit: seq_info - path 'inputs.fasta', emit: fasta + path "seq_info.json", emit: seq_info + path "${fasta_basename}.fasta", emit: fasta + val "${fasta_basename}", emit: fasta_basename script: """ set -euxo pipefail + + echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path + echo "<<<<<<<<<<<<<<<<<<<" + ls -alR + /opt/venv/bin/python \ - /home/putils/src/putils/check_and_validate_inputs.py \ - --target_id=$target_id --fasta_path=$fasta_path + /opt/venv/lib/python3.8/site-packages/putils/check_and_validate_inputs.py \ + --target_id=$fasta_basename --fasta_path=$fasta_path """ } // Generate features from the searches process GenerateFeaturesTask { + tag "${fasta_basename}" label 'data' cpus 4 memory '16 GB' - publishDir '/mnt/workflow/pubdir/features' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/features" input: - path fasta_paths - path msa_dir + tuple val(fasta_basename), path(fasta_path), path(msa_dir) path pdb_mmcif_folder path mmcif_obsolete_path output: - path 'output/features.pkl', emit: features - path 'output/generate_features_metrics.json', emit: metrics + tuple val(fasta_basename), path("output/features.pkl"), emit: fasta_basename_with_features + path "output/features.pkl", emit: features + path "output/generate_features_metrics.json", emit: metrics script: """ @@ -133,14 +164,18 @@ process GenerateFeaturesTask { echo "***********************" /opt/venv/bin/python /opt/generate_features.py \ 
- --fasta_paths=$fasta_paths \ + --fasta_paths=$fasta_path \ --msa_dir=$msa_dir \ --template_mmcif_dir="$pdb_mmcif_folder" \ --obsolete_pdbs_path="$mmcif_obsolete_path" \ --template_hits="$msa_dir/pdb_hits.sto" \ --model_preset=multimer \ --output_dir=output \ - --max_template_date=2023-01-01 + --max_template_date=2023-01-01 + + echo "***********************" + ls -alR output/ + echo "***********************" mv output/metrics.json output/generate_features_metrics.json """ @@ -148,24 +183,23 @@ process GenerateFeaturesTask { // AlphaFold Multimer process AlphaFoldMultimerInference { + tag "${fasta_basename}_${modelnum}" errorStrategy 'retry' label 'predict' cpus { 4 * Math.pow(2, task.attempt) } memory { 16.GB * Math.pow(2, task.attempt) } accelerator 1, type: 'nvidia-tesla-a10g' maxRetries 2 - publishDir '/mnt/workflow/pubdir' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/prediction_${modelnum}" input: - val target_id - path features + tuple val(fasta_basename), path (features), val(modelnum) path alphafold_model_parameters - val modelnum val random_seed val run_relax output: - path "output_model_${modelnum}/", emit: results - + tuple val(fasta_basename), path("output_model_${modelnum}/"), emit: results + script: """ set -euxo pipefail @@ -174,7 +208,7 @@ process AlphaFoldMultimerInference { export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 export TF_FORCE_UNIFIED_MEMORY=1 /opt/conda/bin/python /app/alphafold/predict.py \ - --target_id=$target_id --features_path=$features --model_preset=multimer \ + --target_id=$fasta_basename --features_path=$features --model_preset=multimer \ --model_dir=model --random_seed=$random_seed --output_dir=output_model_${modelnum} \ --run_relax=${run_relax} --use_gpu_relax=${run_relax} --model_num=$modelnum @@ -182,20 +216,22 @@ process AlphaFoldMultimerInference { """ } -//Merge Rankings + +// Merge Rankings process MergeRankings { + tag "${id}" cpus 2 memory 4.GB - publishDir '/mnt/workflow/pubdir' + publishDir "/mnt/workflow/pubdir/${id}" label 'data' input: - path results + tuple val(id), path(results) output: - path 'rankings.json', emit: rankings - path 'top_hit*', emit: top_hit - + path "rankings.json", emit: rankings + path "top_hit*", emit: top_hit + script: """ mkdir -p output @@ -206,7 +242,3 @@ process MergeRankings { mv output/rankings.json . 
""" } - -workflow { - AlphaFold2Multimer() -} diff --git a/assets/workflows/alphafold2-multimer/nextflow.config b/assets/workflows/alphafold2-multimer/nextflow.config index 16e3e32..ba03fb6 100644 --- a/assets/workflows/alphafold2-multimer/nextflow.config +++ b/assets/workflows/alphafold2-multimer/nextflow.config @@ -1,16 +1,16 @@ params { aws_region = "us-east-1" // set default region db_pathname = "database" - - src_bucket = "{{S3_BUCKET_NAME}}" - src_prefix = "ref-data" + + src_bucket = "omics-us-east-1" + src_prefix = "alphafold_multimer" uniref90_database_src = "s3://${src_bucket}/${src_prefix}/uniref90/uniref90.fasta" mgnify_database_src = "s3://${src_bucket}/${src_prefix}/mgy/mgy_clusters_2022_05.fa" uniref30_database_src = "s3://${src_bucket}/${src_prefix}/uniref30/" pdb70_src = "s3://${src_bucket}/${src_prefix}/pdb70/" - pdb_seqres_src = "s3://${src_bucket}/${src_prefix}/pdb_seqres/pdb_seqres.txt" - alphafold_model_parameters = "s3://${src_bucket}/${src_prefix}/alphafold_parameters/alphafold_params_2022-12-06.tar" + pdb_seqres_src = "s3://${src_bucket}/${src_prefix}/pdb_seqres.txt" + alphafold_model_parameters = "s3://${src_bucket}/${src_prefix}/alphafold_params_2022-12-06.tar" uniprot_database_src = "s3://${src_bucket}/${src_prefix}/uniprot/uniprot.fasta" bfd_database_a3m_ffdata = "s3://${src_bucket}/${src_prefix}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata" @@ -36,7 +36,10 @@ params { } process { - withLabel: protutils { container = "{{protein-utils:latest}}"} - withLabel: data { container = "{{alphafold-data:latest}}"} - withLabel: predict { container = "{{alphafold-predict:latest}}"} +// withLabel: protutils { container = '{{protein-utils:latest}}'} + withLabel: protutils { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/protein-utils:develop'} +// withLabel: data { container = '{{alphafold-data:latest}}'} + withLabel: data { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-data:develop'} +// withLabel: predict { container = '{{alphafold-predict:latest}}'} + withLabel: predict { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-predict:develop'} } \ No newline at end of file diff --git a/assets/workflows/alphafold-multimer/parameter-template.json b/assets/workflows/alphafold2-multimer/parameter-template.json similarity index 100% rename from assets/workflows/alphafold-multimer/parameter-template.json rename to assets/workflows/alphafold2-multimer/parameter-template.json diff --git a/assets/workflows/alphafold-multimer/params.json b/assets/workflows/alphafold2-multimer/params.json similarity index 100% rename from assets/workflows/alphafold-multimer/params.json rename to assets/workflows/alphafold2-multimer/params.json diff --git a/assets/workflows/alphafold2-multimer/searches.nf b/assets/workflows/alphafold2-multimer/searches.nf index dedc650..b828d28 100644 --- a/assets/workflows/alphafold2-multimer/searches.nf +++ b/assets/workflows/alphafold2-multimer/searches.nf @@ -1,199 +1,215 @@ nextflow.enable.dsl = 2 process SearchUniref90 { + tag "${record_id}" label 'data' cpus 8 memory '32 GB' - publishDir '/mnt/workflow/pubdir/msa' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path database_path output: - tuple val(id), path("output/${id}_uniref90_hits.sto"), emit: msa_with_id - path "output/${id}_uniref90_hits.sto", emit: msa - path "output/${id}_uniref90_metrics.json", emit: metrics + 
tuple val(fasta_basename), val(record_id), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_record_id_and_msa + tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniref90_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_uniref90_hits.sto", emit: msa + path "output_${record_id}/${record_id}_uniref90_metrics.json", emit: metrics script: """ set -euxo pipefail + cat $fasta_record_path - mkdir -p output + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ + --fasta_path=$fasta_record_path \ --database_type=uniref90 \ --database_path=$database_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/uniref90_hits.sto output/${id}_uniref90_hits.sto - mv output/metrics.json output/${id}_uniref90_metrics.json + mv output_${record_id}/uniref90_hits.sto output_${record_id}/${record_id}_uniref90_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniref90_metrics.json """ } -process SearchUniprot { +process SearchMgnify { + tag "${record_id}" label 'data' cpus 8 - memory '32 GB' - publishDir '/mnt/workflow/pubdir/msa' + memory '64 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path database_path output: - path "output/${id}_uniprot_hits.sto", emit: msa - path "output/${id}_uniprot_metrics.json", emit: metrics - val "$id", emit: id + tuple val(fasta_basename), path("output_${record_id}/${record_id}_mgnify_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_mgnify_hits.sto", emit: msa + path "output_${record_id}/${record_id}_mgnify_metrics.json", emit: metrics script: """ set -euxo pipefail - - mkdir -p output + cat $fasta_record_path + + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ - --database_type=uniprot \ + --fasta_path=$fasta_record_path \ + --database_type=mgnify \ --database_path=$database_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/uniprot_hits.sto output/${id}_uniprot_hits.sto - mv output/metrics.json output/${id}_uniprot_metrics.json + mv output_${record_id}/mgnify_hits.sto output_${record_id}/${record_id}_mgnify_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_mgnify_metrics.json """ } -process SearchMgnify { +process SearchUniprot { + tag "${record_id}" label 'data' cpus 8 - memory '64 GB' - publishDir '/mnt/workflow/pubdir/msa' + memory '32 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path database_path output: - path "output/${id}_mgnify_hits.sto", emit: msa - path "output/${id}_mgnify_metrics.json", emit: metrics + tuple val(fasta_basename), path("output_${record_id}/${record_id}_uniprot_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_uniprot_hits.sto", emit: msa + path "output_${record_id}/${record_id}_uniprot_metrics.json", emit: metrics script: """ set -euxo pipefail + cat $fasta_record_path - mkdir -p output + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ - --database_type=mgnify \ + --fasta_path=$fasta_record_path \ + --database_type=uniprot \ 
--database_path=$database_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/mgnify_hits.sto output/${id}_mgnify_hits.sto - mv output/metrics.json output/${id}_mgnify_metrics.json + mv output_${record_id}/uniprot_hits.sto output_${record_id}/${record_id}_uniprot_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_uniprot_metrics.json """ } process SearchBFD { + tag "${record_id}" label 'data' + cpus { 8 * Math.pow(2, task.attempt) } memory { 64.GB * Math.pow(2, task.attempt) } maxRetries 1 errorStrategy 'retry' - publishDir '/mnt/workflow/pubdir/msa' + + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(fasta_path) + tuple val(fasta_basename), val(record_id), path(fasta_record_path) path bfd_database_folder path uniref30_database_folder output: - path "output/${id}_bfd_uniref_hits.a3m", emit: msa - path "output/${id}_metrics.json", emit: metrics + tuple val(fasta_basename), path("output_${record_id}/${record_id}_bfd_hits.a3m"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_bfd_hits.a3m", emit: msa + path "output_${record_id}/${record_id}_bfd_metrics.json", emit: metrics script: """ set -euxo pipefail - - mkdir -p output + cat $fasta_record_path + mkdir -p output_${record_id} /opt/venv/bin/python /opt/create_msa_monomer.py \ - --fasta_path=$fasta_path \ + --fasta_path=$fasta_record_path \ --database_type=bfd \ --database_path=$bfd_database_folder \ --database_path_2=$uniref30_database_folder \ - --output_dir=output \ + --output_dir=output_${record_id} \ --cpu=$task.cpus - mv output/bfd_hits.a3m output/${id}_bfd_uniref_hits.a3m - mv output/metrics.json output/${id}_metrics.json + mv output_${record_id}/bfd_hits.a3m output_${record_id}/${record_id}_bfd_hits.a3m + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_bfd_metrics.json """ } process SearchTemplatesTask { + tag "${record_id}" label 'data' cpus 2 memory '8 GB' - publishDir '/mnt/workflow/pubdir/msa' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - tuple val(id), path(msa_path) + tuple val(fasta_basename), val(record_id), path(msa_path) path pdb_db_folder output: - path "output/${id}_pdb_hits.sto", emit: msa - path "output/${id}_metrics.json", emit: metrics + tuple val(fasta_basename), path("output_${record_id}/${record_id}_pdb_hits.sto"), emit: fasta_basename_with_msa + path "output_${record_id}/${record_id}_pdb_metrics.json", emit: metrics script: """ set -euxo pipefail - mkdir -p output + mkdir -p output_${record_id} /opt/venv/bin/python /opt/search_templates.py \ --msa_path=$msa_path \ - --output_dir=output \ + --output_dir=output_${record_id} \ --database_path=$pdb_db_folder \ --model_preset=multimer \ --cpu=$task.cpus - mv output/pdb_hits.sto output/${id}_pdb_hits.sto - mv output/metrics.json output/${id}_metrics.json + mv output_${record_id}/pdb_hits.sto output_${record_id}/${record_id}_pdb_hits.sto + mv output_${record_id}/metrics.json output_${record_id}/${record_id}_pdb_metrics.json """ } // Combine/rename results from parallel searches as AlphaFold expects process CombineSearchResults { + tag "${fasta_basename}" label 'data' cpus 4 memory '8 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/msa" input: - path uniref90_msas - path uniprot_msas - path mgnify_msas - path bfd_msas - path template_hits - output: - path 'msa/', emit: msa_path + tuple val(fasta_basename), path(fasta_path), path(uniref90_msas), path(mgnify_msas), path(uniprot_msas), 
path(bfd_msas), path(template_hits) + + output: + tuple val(fasta_basename), path(fasta_path), path ("msa/"), emit: fasta_basename_fasta_and_msa_path + path "msa/", emit: msa_path script: """ echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path echo $uniref90_msas - echo $uniprot_msas echo $mgnify_msas + echo $uniprot_msas + echo $bfd_msas echo $template_hits echo "<<<<<<<<<<<<<<<<<<<" mkdir -p msa /opt/venv/bin/python /opt/update_locations.py msa $uniref90_msas - /opt/venv/bin/python /opt/update_locations.py msa $uniprot_msas /opt/venv/bin/python /opt/update_locations.py msa $mgnify_msas + /opt/venv/bin/python /opt/update_locations.py msa $uniprot_msas /opt/venv/bin/python /opt/update_locations.py msa $bfd_msas /opt/venv/bin/python /opt/update_locations.py msa $template_hits diff --git a/assets/workflows/alphafold2-multimer/unpack.nf b/assets/workflows/alphafold2-multimer/unpack.nf index b8211c0..146bc14 100644 --- a/assets/workflows/alphafold2-multimer/unpack.nf +++ b/assets/workflows/alphafold2-multimer/unpack.nf @@ -14,7 +14,7 @@ process UnpackBFD { path bfd_database_hhm_ffindex output: - path 'bfd/', emit: db_folder + path "bfd/", emit: db_folder script: """ @@ -31,6 +31,7 @@ process UnpackBFD { """ } + process UnpackUniprot { cpus 4 memory '8 GB' @@ -53,6 +54,7 @@ process UnpackUniprot { """ } + process UnpackPdb70nSeqres { label 'data' cpus 2 @@ -75,13 +77,14 @@ process UnpackPdb70nSeqres { # Templates - pdb70 and seqres mkdir -p $base_database_path/pdb mv $pdb70_src/* $base_database_path/pdb/ - + # filter strange sequences containing 0 /opt/venv/bin/python /opt/filter_pdb.py $pdb_seqres_src $base_database_path/pdb/pdb_seqres.txt ls -laR $base_database_path/pdb/ """ } + process UnpackMMCIF { cpus 2 memory '4 GB' @@ -98,10 +101,10 @@ process UnpackMMCIF { path pdb_mmcif_src8 path pdb_mmcif_src9 path pdb_mmcif_obsolete - + output: - path 'pdb_mmcif/mmcif_files/', emit: db_folder - path 'pdb_mmcif/obsolete.dat', emit: db_obsolete + path "pdb_mmcif/mmcif_files/", emit: db_folder + path "pdb_mmcif/obsolete.dat", emit: db_obsolete script: """ @@ -124,22 +127,23 @@ process UnpackMMCIF { """ } + process UnpackRecords { tag "${id}" label 'protutils' cpus 2 memory '4 GB' publishDir "/mnt/workflow/pubdir/${id}/input" - + input: tuple val(id), val(header), val(seqString) output: - tuple val(id), path('input.fasta'), emit: fasta + tuple val(id), path("input.fasta"), emit: fasta script: """ set -euxo pipefail echo -e ">${header}\n${seqString}" > input.fasta """ -} +} \ No newline at end of file From 1f1f3136b96bdc8187ea3da30db95dd93ead176b Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Mon, 16 Dec 2024 11:27:16 -0700 Subject: [PATCH 03/20] deprecate modules dir --- scripts/testrun.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/testrun.sh b/scripts/testrun.sh index 7e38630..bb1f7c9 100755 --- a/scripts/testrun.sh +++ b/scripts/testrun.sh @@ -51,12 +51,11 @@ else fi # Package the workflow -mkdir -p tmp/assets/workflows/$WORKFLOW_NAME tmp/assets/modules +mkdir -p tmp/assets/workflows/$WORKFLOW_NAME pushd tmp cp -r ../assets/workflows/$WORKFLOW_NAME/* assets/workflows/$WORKFLOW_NAME -cp -r ../assets/modules/* assets/modules sed -i "" -E "s/[0-9]{12}\.dkr\.ecr\.(us-[a-z]*-[0-9])/$ACCOUNT_ID.dkr.ecr.$REGION/g" ./assets/workflows/$WORKFLOW_NAME/*.config assets/workflows/$WORKFLOW_NAME/*.wdl 2>/dev/null || true sed -i "" -E "s/[0-9]{12}\.dkr\.ecr\.(us-[a-z]*-[0-9])/$ACCOUNT_ID.dkr.ecr.$REGION/g" ./assets/workflows/$WORKFLOW_NAME/*.config 
assets/workflows/$WORKFLOW_NAME/*.nf 2>/dev/null || true From c726195f0bb9530d7affe3cbe84e9fc72c765801 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Tue, 17 Dec 2024 08:24:01 -0700 Subject: [PATCH 04/20] Fixed storage config in testrun.sh --- scripts/testrun.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/testrun.sh b/scripts/testrun.sh index bb1f7c9..26d0b77 100755 --- a/scripts/testrun.sh +++ b/scripts/testrun.sh @@ -72,7 +72,8 @@ aws omics wait workflow-active --region $REGION --id $workflow_id # Run the workflow start_run_command="aws omics start-run \ --retention-mode REMOVE \ - --storage-type DYNAMIC \ + --storage-type STATIC \ + --storage-capacity 9600 \ --workflow-id $workflow_id \ --name $WORKFLOW_NAME-dev-$TIMESTAMP \ --role-arn \"$OMICS_EXECUTION_ROLE\" \ From 4ca440bddfd7b64d76fb8a1b54f20eb075609d60 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Tue, 17 Dec 2024 15:44:24 -0700 Subject: [PATCH 05/20] potential fix --- assets/workflows/alphafold2-multimer/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index c92e4e1..fd75b74 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -46,10 +46,10 @@ workflow { split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> def newRecordFile = file("${record.id}.fasta") newRecordFile.setText(record.text) - return tuple (newRecordFile.getBaseName().split("_")[0], newRecordFile.getBaseName(), newRecordFile) + return tuple (CheckAndValidateInputsTask.out.fasta.getBaseName(), newRecordFile.getBaseName(), newRecordFile) } - uniref30 = Channel.fromPath(params.uniref30_database_src).first() + // uniref30 = Channel.fromPath(params.uniref30_database_src).first() alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() // Unpack the databases From b01a02088c0df624bae932c04c14c87337c5e986 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Tue, 17 Dec 2024 17:00:10 -0700 Subject: [PATCH 06/20] baseName --- assets/workflows/alphafold2-multimer/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index fd75b74..a596fb0 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -46,7 +46,7 @@ workflow { split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> def newRecordFile = file("${record.id}.fasta") newRecordFile.setText(record.text) - return tuple (CheckAndValidateInputsTask.out.fasta.getBaseName(), newRecordFile.getBaseName(), newRecordFile) + return tuple (CheckAndValidateInputsTask.out.fasta.baseName, newRecordFile.getBaseName(), newRecordFile) } // uniref30 = Channel.fromPath(params.uniref30_database_src).first() From 02796810eda9cfa8f364a36bd180b212c86f9438 Mon Sep 17 00:00:00 2001 From: vanetten Date: Wed, 18 Dec 2024 13:52:57 -0500 Subject: [PATCH 07/20] revert to something more similar to previous split_seqs --- .../containers/alphafold-data/update_locations.py | 13 +++++-------- assets/workflows/alphafold2-multimer/main.nf | 12 +++++++----- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index 
fbb42ed..8dbb573 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -11,17 +11,14 @@ def update_locations(target_dir, file_list): for filename in file_list: - # index, _null, outfile = filename.partition("_") - # index = index.split(".")[1] - # chain = int_id_to_str_id(int(index)) - [_null, chain, database, file] = filename.split("_") - outfile = "_".join([database, file]) - # print(f'file: {filename} index: {index} chain: {chain} outfile:{outfile}') - print(f'file: {filename} chain: {chain} outfile:{outfile}') + index, _null, outfile = filename.partition("_") + index = index.split(".")[1] + + chain = int_id_to_str_id(int(index)) + print(f'file: {filename} index: {index} chain: {chain} outfile:{outfile}') chain = os.path.join(target_dir, chain) path = pathlib.Path(chain) - if not path.exists(): path.mkdir(parents=True) shutil.copy(filename, os.path.join(chain, outfile), follow_symlinks=True) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index a596fb0..cd5eb5d 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -29,11 +29,13 @@ workflow { fasta_path = params.fasta_path } - // [5nl6, 5nl6.fasta] - // [5mlq, 5mlq.fasta] - fasta_files = Channel - .fromPath(fasta_path) - .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } + // [5nl6, 5nl6.1, 5nl6.1.fasta] + // [5nl6, 5nl6.2, 5nl6.2.fasta] + // [5mlq, 5mlq.1, 5mlq.1.fasta] + // [5mlq, 5mlq.2, 5mlq.2.fasta] + split_seqs = CheckAndValidateInputsTask.out.fasta + .splitFasta( file: true ) + .map { filename -> tuple (filename.getBaseName().split("\\.")[0], filename.getBaseName(), filename) } // 5nl6.fasta // 5mlq.fasta From 3246bb08de502074563a34222b15d6a96440f457 Mon Sep 17 00:00:00 2001 From: vanetten Date: Wed, 18 Dec 2024 13:56:10 -0500 Subject: [PATCH 08/20] revert to something more similar to previous split_seqs --- assets/workflows/alphafold-multimer/main.nf | 242 ++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 assets/workflows/alphafold-multimer/main.nf diff --git a/assets/workflows/alphafold-multimer/main.nf b/assets/workflows/alphafold-multimer/main.nf new file mode 100644 index 0000000..96773f1 --- /dev/null +++ b/assets/workflows/alphafold-multimer/main.nf @@ -0,0 +1,242 @@ +nextflow.enable.dsl = 2 + +params.fasta_path = "" + +// static data files are in nextflow.config + +include { + SearchUniref90; + SearchMgnify; + SearchBFD; + SearchTemplatesTask; + SearchUniprot; + CombineSearchResults; +} from './searches.nf' + +include { + UnpackBFD; + UnpackPdb70nSeqres; + UnpackMMCIF; +} from './unpack.nf' + + +workflow { + + // Convert to one or many files + if (params.fasta_path[-1] == "/") { + fasta_path = params.fasta_path + "*" + } else { + fasta_path = params.fasta_path + } + + // [5nl6, 5nl6.fasta] + // [5mlq, 5mlq.fasta] + fasta_files = Channel + .fromPath(fasta_path) + .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } + + // 5nl6.fasta + // 5mlq.fasta + CheckAndValidateInputsTask(fasta_files) + + // [5nl6, 5nl6.1, 5nl6.1.fasta] + // [5nl6, 5nl6.2, 5nl6.2.fasta] + // [5mlq, 5mlq.1, 5mlq.1.fasta] + // [5mlq, 5mlq.2, 5mlq.2.fasta] + split_seqs = CheckAndValidateInputsTask.out.fasta + .splitFasta( file: true ) + .map { filename -> tuple (filename.getBaseName().split("\\.")[0], filename.getBaseName(), filename) } + + uniref30 = 
Channel.fromPath(params.uniref30_database_src).first() + alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() + + // Unpack the databases + UnpackBFD(params.bfd_database_a3m_ffdata, + params.bfd_database_a3m_ffindex, + params.bfd_database_cs219_ffdata, + params.bfd_database_cs219_ffindex, + params.bfd_database_hhm_ffdata, + params.bfd_database_hhm_ffindex) + UnpackPdb70nSeqres(params.pdb70_src, params.pdb_seqres_src, params.db_pathname) + UnpackMMCIF(params.pdb_mmcif_src1, + params.pdb_mmcif_src2, + params.pdb_mmcif_src3, + params.pdb_mmcif_src4, + params.pdb_mmcif_src5, + params.pdb_mmcif_src6, + params.pdb_mmcif_src7, + params.pdb_mmcif_src8, + params.pdb_mmcif_src9, + params.pdb_mmcif_obsolete) + + SearchUniref90(split_seqs, params.uniref90_database_src) + SearchMgnify(split_seqs, params.mgnify_database_src) + SearchUniprot(split_seqs, params.uniprot_database_src) + SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src) + SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) + + // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] + // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] + msa_tuples = fasta_files + .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) + .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) + .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) + .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) + .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) + + // Gather + CombineSearchResults(msa_tuples) + + GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, + UnpackMMCIF.out.db_folder, + UnpackMMCIF.out.db_obsolete) + + // Predict. 
Five separate models + model_nums = Channel.of(0,1,2,3,4) + features = GenerateFeaturesTask.out.fasta_basename_with_features.combine(model_nums) + AlphaFoldMultimerInference(features, alphafold_model_parameters, params.random_seed, params.run_relax) + + MergeRankings(AlphaFoldMultimerInference.out.results.groupTuple(by: 0)) +} + +// Check the inputs and get size etc +process CheckAndValidateInputsTask { + tag "${fasta_basename}" + label 'protutils' + cpus 2 + memory '4 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/inputs" + + input: + tuple val(fasta_basename), path(fasta_path) + + output: + stdout + path "seq_info.json", emit: seq_info + path "${fasta_basename}.fasta", emit: fasta + val "${fasta_basename}", emit: fasta_basename + + script: + """ + set -euxo pipefail + + echo ">>>>>>>>>>>>>>>>>>>" + echo $fasta_basename + echo $fasta_path + echo "<<<<<<<<<<<<<<<<<<<" + + ls -alR + + /opt/venv/bin/python \ + /opt/venv/lib/python3.8/site-packages/putils/check_and_validate_inputs.py \ + --target_id=$fasta_basename --fasta_path=$fasta_path + """ +} + +// Generate features from the searches +process GenerateFeaturesTask { + tag "${fasta_basename}" + label 'data' + cpus 4 + memory '16 GB' + publishDir "/mnt/workflow/pubdir/${fasta_basename}/features" + + input: + tuple val(fasta_basename), path(fasta_path), path(msa_dir) + path pdb_mmcif_folder + path mmcif_obsolete_path + + output: + tuple val(fasta_basename), path("output/features.pkl"), emit: fasta_basename_with_features + path "output/features.pkl", emit: features + path "output/generate_features_metrics.json", emit: metrics + + script: + """ + set -euxo pipefail + + mkdir -p output + + echo "***********************" + ls -alR $msa_dir/ + echo "***********************" + + /opt/venv/bin/python /opt/generate_features.py \ + --fasta_paths=$fasta_path \ + --msa_dir=$msa_dir \ + --template_mmcif_dir="$pdb_mmcif_folder" \ + --obsolete_pdbs_path="$mmcif_obsolete_path" \ + --template_hits="$msa_dir/pdb_hits.sto" \ + --model_preset=multimer \ + --output_dir=output \ + --max_template_date=2023-01-01 + + echo "***********************" + ls -alR output/ + echo "***********************" + + mv output/metrics.json output/generate_features_metrics.json + """ +} + +// AlphaFold Multimer +process AlphaFoldMultimerInference { + tag "${fasta_basename}_${modelnum}" + errorStrategy 'retry' + label 'predict' + cpus { 4 * Math.pow(2, task.attempt) } + memory { 16.GB * Math.pow(2, task.attempt) } + accelerator 1, type: 'nvidia-tesla-a10g' + maxRetries 2 + publishDir "/mnt/workflow/pubdir/${fasta_basename}/prediction_${modelnum}" + input: + tuple val(fasta_basename), path (features), val(modelnum) + path alphafold_model_parameters + val random_seed + val run_relax + + output: + tuple val(fasta_basename), path("output_model_${modelnum}/"), emit: results + + script: + """ + set -euxo pipefail + mkdir -p model/params + tar -xvf $alphafold_model_parameters -C model/params + export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 + export TF_FORCE_UNIFIED_MEMORY=1 + /opt/conda/bin/python /app/alphafold/predict.py \ + --target_id=$fasta_basename --features_path=$features --model_preset=multimer \ + --model_dir=model --random_seed=$random_seed --output_dir=output_model_${modelnum} \ + --run_relax=${run_relax} --use_gpu_relax=${run_relax} --model_num=$modelnum + + rm -rf output_model_${modelnum}/msas + """ +} + + +// Merge Rankings +process MergeRankings { + tag "${id}" + cpus 2 + memory 4.GB + publishDir "/mnt/workflow/pubdir/${id}" + label 'data' + + input: + tuple val(id), 
path(results) + + output: + path "rankings.json", emit: rankings + path "top_hit*", emit: top_hit + + script: + """ + mkdir -p output + echo ${results} + # Create top hit + /opt/venv/bin/python /opt/merge_rankings.py --output_dir output/ --model_dirs ${results} + mv output/top_hit* . + mv output/rankings.json . + """ +} From 37046350cdadd8ae3d3594dd8981aae0fe372aeb Mon Sep 17 00:00:00 2001 From: vanetten Date: Wed, 18 Dec 2024 14:46:24 -0500 Subject: [PATCH 09/20] fasta_files got missed in the merge --- assets/workflows/alphafold2-multimer/main.nf | 26 +++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index cd5eb5d..96773f1 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -29,6 +29,16 @@ workflow { fasta_path = params.fasta_path } + // [5nl6, 5nl6.fasta] + // [5mlq, 5mlq.fasta] + fasta_files = Channel + .fromPath(fasta_path) + .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } + + // 5nl6.fasta + // 5mlq.fasta + CheckAndValidateInputsTask(fasta_files) + // [5nl6, 5nl6.1, 5nl6.1.fasta] // [5nl6, 5nl6.2, 5nl6.2.fasta] // [5mlq, 5mlq.1, 5mlq.1.fasta] @@ -37,21 +47,7 @@ workflow { .splitFasta( file: true ) .map { filename -> tuple (filename.getBaseName().split("\\.")[0], filename.getBaseName(), filename) } - // 5nl6.fasta - // 5mlq.fasta - CheckAndValidateInputsTask(fasta_files) - - // [5nl6, 5nl6_A, 5nl6_A.fasta] - // [5nl6, 5nl6_B, 5nl6_B.fasta] - // [5mlq, 5mlq_A, 5mlq_A.fasta] - // [5mlq, 5mlq_B, 5mlq_B.fasta] - split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> - def newRecordFile = file("${record.id}.fasta") - newRecordFile.setText(record.text) - return tuple (CheckAndValidateInputsTask.out.fasta.baseName, newRecordFile.getBaseName(), newRecordFile) - } - - // uniref30 = Channel.fromPath(params.uniref30_database_src).first() + uniref30 = Channel.fromPath(params.uniref30_database_src).first() alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() // Unpack the databases From bd0486ffad322561c4e1ec2755f7c64652f98e72 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Wed, 18 Dec 2024 17:04:37 -0700 Subject: [PATCH 10/20] Logic fixes --- assets/workflows/alphafold2-multimer/main.nf | 31 ++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index a596fb0..7dcd24e 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -39,15 +39,30 @@ workflow { // 5mlq.fasta CheckAndValidateInputsTask(fasta_files) + splitFastaWithBasename = CheckAndValidateInputsTask.out.fasta.map { f -> + return tuple (f.baseName, f.splitFasta( record: [id: true, text: true] )) + } + + // Write fasta records and return expected tuple format: // [5nl6, 5nl6_A, 5nl6_A.fasta] // [5nl6, 5nl6_B, 5nl6_B.fasta] // [5mlq, 5mlq_A, 5mlq_A.fasta] // [5mlq, 5mlq_B, 5mlq_B.fasta] - split_seqs = CheckAndValidateInputsTask.out.fasta.splitFasta( record: [id: true, text: true] ).map { record -> - def newRecordFile = file("${record.id}.fasta") - newRecordFile.setText(record.text) - return tuple (CheckAndValidateInputsTask.out.fasta.baseName, newRecordFile.getBaseName(), newRecordFile) - } + // or + // [4ZQK_simple, 4ZQK2, 4ZQK2.fasta] + // 
[4ZQK_simple, 4ZQK1, 4ZQK1.fasta] + split_seqs = splitFastaWithBasename.map { t -> + def fastaBaseName = t[0] + def records = t[1] + + def recordList = [] + records.forEach { record -> + def newRecordFile = file("${record.id}.fasta") + newRecordFile.setText(record.text) + recordList.add(tuple (fastaBaseName, newRecordFile.getBaseName(), newRecordFile)) + } + return recordList + } | flatMap // uniref30 = Channel.fromPath(params.uniref30_database_src).first() alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() @@ -71,17 +86,23 @@ workflow { params.pdb_mmcif_src9, params.pdb_mmcif_obsolete) + // [4ZQK_simple, 4ZQK2, 4ZQK2.fasta] + // [4ZQK_simple, 4ZQK1, 4ZQK1.fasta] SearchUniref90(split_seqs, params.uniref90_database_src) SearchMgnify(split_seqs, params.mgnify_database_src) SearchUniprot(split_seqs, params.uniprot_database_src) SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src) + SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] msa_tuples = fasta_files + // groupTuple -> [4ZQK_simple, [record1file, record2file]] .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) + // groupTuple -> [4ZQK_simple, [record1file, record2file]] .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) + // .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) From 1f340b09cccfca300142667523a1ab4302ffbd20 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Wed, 18 Dec 2024 17:15:01 -0700 Subject: [PATCH 11/20] Another test with filename that contains fasta basename --- assets/workflows/alphafold2-multimer/main.nf | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index 7dcd24e..71dd073 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -57,7 +57,7 @@ workflow { def recordList = [] records.forEach { record -> - def newRecordFile = file("${record.id}.fasta") + def newRecordFile = file("${fastaBaseName}-${record.id}.fasta") newRecordFile.setText(record.text) recordList.add(tuple (fastaBaseName, newRecordFile.getBaseName(), newRecordFile)) } @@ -98,11 +98,8 @@ workflow { // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] 
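    // (each .join() below keys on the fasta basename emitted by the search processes;
    // groupTuple() first gathers the per-record search hits back into one list per original fasta file)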
msa_tuples = fasta_files - // groupTuple -> [4ZQK_simple, [record1file, record2file]] .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) - // groupTuple -> [4ZQK_simple, [record1file, record2file]] .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) - // .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) From 2f92c40838c2c1299e1f2ca52e20529eabd3d012 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Thu, 19 Dec 2024 10:22:14 -0700 Subject: [PATCH 12/20] fixes --- .../alphafold-data/update_locations.py | 95 ++++++++++++++++--- assets/workflows/alphafold2-multimer/main.nf | 2 +- .../alphafold2-multimer/nextflow.config | 6 +- scripts/testrun.sh | 8 +- 4 files changed, 90 insertions(+), 21 deletions(-) diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index fbb42ed..500307f 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -8,24 +8,91 @@ from alphafold.data.pipeline_multimer import int_id_to_str_id +# def int_id_to_str_id(num: int) -> str: +# """Encodes a number as a string, using reverse spreadsheet style naming. +# Args: +# num: A positive integer. + +# Returns: +# A string that encodes the positive integer using reverse spreadsheet style, +# naming e.g. 1 = A, 2 = B, ..., 27 = AA, 28 = BA, 29 = CA, ... This is the +# usual way to encode chain IDs in mmCIF files. +# """ +# if num <= 0: +# raise ValueError(f'Only positive integers allowed, got {num}.') + +# num = num - 1 # 1-based indexing. +# output = [] +# while num >= 0: +# output.append(chr(num % 26 + ord('A'))) +# num = num // 26 - 1 +# return ''.join(output) + + +# File lists +# NEW: +# [4ZQK2_uniref90_hits.sto 4ZQK1_uniref90_hits.sto] +# [4ZQK1_mgnify_hits.sto 4ZQK2_mgnify_hits.sto] +# [4ZQK1_uniprot_hits.sto 4ZQK2_uniprot_hits.sto] +# [4ZQK1_bfd_hits.a3m 4ZQK2_bfd_hits.a3m] +# [4ZQK2_pdb_hits.sto 4ZQK1_pdb_hits.sto] + +# OLD: +# 5nzz_A_uniref90_hits.sto 5nzz_B_uniref90_hits.sto +# 5nzz_A_mgnify_hits.sto 5nzz_B_mgnify_hits.sto +# 5nzz_A_uniprot_hits.sto 5nzz_B_uniprot_hits.sto +# 5nzz_A_bfd_hits.a3m 5nzz_B_bfd_hits.a3m +# 5nzz_A_pdb_hits.sto 5nzz_B_pdb_hits.sto + +STRIP_SUFFIXES = [ + "_uniref90_hits.sto", + "_mgnify_hits.sto", + "_uniprot_hits.sto", + "_bfd_hits.a3m", + "_pdb_hits.sto" +] + +def strip_suffixes(s: str, suffixes: list[str]): + for suffix in suffixes: + if s.endswith(suffix): + return (s[:-len(suffix)], suffix) + return (s, None) + +# target_dir = msa def update_locations(target_dir, file_list): for filename in file_list: - # index, _null, outfile = filename.partition("_") - # index = index.split(".")[1] - # chain = int_id_to_str_id(int(index)) - [_null, chain, database, file] = filename.split("_") - outfile = "_".join([database, file]) - # print(f'file: {filename} index: {index} chain: {chain} outfile:{outfile}') - print(f'file: {filename} chain: {chain} outfile:{outfile}') - chain = os.path.join(target_dir, chain) - path = pathlib.Path(chain) - - - if not path.exists(): - path.mkdir(parents=True) - shutil.copy(filename, os.path.join(chain, outfile), follow_symlinks=True) + + # strip suffixes from filename + # e.g. 
"5nzz_A_uniref90_hits.sto" -> + # stripped_filename = "5nzz_A" + # stripped_suffix = _uniref90_hits.sto + (stripped_filename, stripped_suffix) = strip_suffixes(filename, STRIP_SUFFIXES) + if stripped_suffix == None: + raise Exception(f"expected suffixes not found in filename: {filename}") + + # "_uniref90_hits.sto" -> "uniref90_hits.sto" + outfile = stripped_suffix[1:] + + if "_" in stripped_filename: + # assume 5nzz_A format + # chain = A + chain = stripped_filename[-1].upper() + else: + # assume 4ZQK2 format + # chain = B + chain = int_id_to_str_id(int(stripped_filename[-1])) + + chain_dir = os.path.join(target_dir, chain) + chain_dir_path = pathlib.Path(chain_dir) + + if not chain_dir_path.exists(): + chain_dir_path.mkdir(parents=True) + target = os.path.join(chain_dir_path, outfile) + print(f"COPY {filename} -> {target}") + shutil.copy(filename, target, follow_symlinks=True) + if __name__ == "__main__": update_locations(sys.argv[1], sys.argv[2:]) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index 71dd073..3cdc440 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -57,7 +57,7 @@ workflow { def recordList = [] records.forEach { record -> - def newRecordFile = file("${fastaBaseName}-${record.id}.fasta") + def newRecordFile = file("${record.id}.fasta") newRecordFile.setText(record.text) recordList.add(tuple (fastaBaseName, newRecordFile.getBaseName(), newRecordFile)) } diff --git a/assets/workflows/alphafold2-multimer/nextflow.config b/assets/workflows/alphafold2-multimer/nextflow.config index ba03fb6..e4c1491 100644 --- a/assets/workflows/alphafold2-multimer/nextflow.config +++ b/assets/workflows/alphafold2-multimer/nextflow.config @@ -37,9 +37,9 @@ params { process { // withLabel: protutils { container = '{{protein-utils:latest}}'} - withLabel: protutils { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/protein-utils:develop'} + withLabel: protutils { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/protein-utils:developjj'} // withLabel: data { container = '{{alphafold-data:latest}}'} - withLabel: data { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-data:develop'} + withLabel: data { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-data:developjj'} // withLabel: predict { container = '{{alphafold-predict:latest}}'} - withLabel: predict { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-predict:develop'} + withLabel: predict { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-predict:developjj'} } \ No newline at end of file diff --git a/scripts/testrun.sh b/scripts/testrun.sh index 26d0b77..c86f591 100755 --- a/scripts/testrun.sh +++ b/scripts/testrun.sh @@ -39,15 +39,15 @@ aws ecr get-login-password --region $REGION | docker login --username AWS --pass # rfdiffusion is the only workflow name that is 1:1 with container name if [ "$WORKFLOW_NAME" != "rfdiffusion" ]; then pushd assets/containers - bash ../workflows/$WORKFLOW_NAME/build_containers.sh $REGION $ACCOUNT_ID develop + bash ../workflows/$WORKFLOW_NAME/build_containers.sh $REGION $ACCOUNT_ID developjj popd else docker build \ --platform linux/amd64 \ - -t $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:develop \ + -t $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:developjj \ -f assets/containers/$WORKFLOW_NAME/Dockerfile assets/containers/$WORKFLOW_NAME - docker push 
$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:develop
+    docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:developjj
 fi
 
 # Package the workflow
@@ -72,6 +72,8 @@ aws omics wait workflow-active --region $REGION --id $workflow_id
 # Run the workflow
 start_run_command="aws omics start-run \
   --retention-mode REMOVE \
+  --cache-id "4556527" \
+  --cache-behavior "CACHE_ALWAYS" \
   --storage-type STATIC \
   --storage-capacity 9600 \
   --workflow-id $workflow_id \

From 2bf3ab476b62dddbbaa387d9ace7d522b2623c16 Mon Sep 17 00:00:00 2001
From: John Jacquay
Date: Thu, 19 Dec 2024 12:20:39 -0700
Subject: [PATCH 13/20] comments

---
 assets/workflows/alphafold2-multimer/main.nf | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf
index 3cdc440..1cd8bcd 100644
--- a/assets/workflows/alphafold2-multimer/main.nf
+++ b/assets/workflows/alphafold2-multimer/main.nf
@@ -39,6 +39,7 @@ workflow {
     // 5mlq.fasta
     CheckAndValidateInputsTask(fasta_files)
 
+    // Explode/scatter the fasta files into channels per contained record ID
     splitFastaWithBasename = CheckAndValidateInputsTask.out.fasta.map { f ->
         return tuple (f.baseName, f.splitFasta( record: [id: true, text: true] ))
     }
@@ -51,6 +52,8 @@ workflow {
     // or
     // [4ZQK_simple, 4ZQK2, 4ZQK2.fasta]
     // [4ZQK_simple, 4ZQK1, 4ZQK1.fasta]
+    //
+    // Write the exploded fasta records to their own file, include in tuple that contains original fasta file basename
     split_seqs = splitFastaWithBasename.map { t ->
         def fastaBaseName = t[0]
         def records = t[1]
@@ -88,6 +91,8 @@ workflow {
 
     // [4ZQK_simple, 4ZQK2, 4ZQK2.fasta]
     // [4ZQK_simple, 4ZQK1, 4ZQK1.fasta]
+    //
+    // Searches are called once per fasta record (fastas * records)
    SearchUniref90(split_seqs, params.uniref90_database_src)
    SearchMgnify(split_seqs, params.mgnify_database_src)
    SearchUniprot(split_seqs, params.uniprot_database_src)
@@ -97,6 +102,8 @@ workflow {
 
     // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...]
     // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...]
+    //
+    // Combine/gather the search results into channels per original fasta file
     msa_tuples = fasta_files
         .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple())
         .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple())
@@ -104,9 +111,11 @@ workflow {
         .join(SearchBFD.out.fasta_basename_with_msa.groupTuple())
         .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple())
 
-    // Gather
+    // Per original fasta file, move all of the search result files (ArrayList of files) into a single directory structure: msa/A, msa/B, ...
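+    // (the per-chain subdirectories msa/A, msa/B, ... are created by update_locations.py inside CombineSearchResults)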
+ // Emit the first two elements of msa_tuples, and a single merged msa/ directory CombineSearchResults(msa_tuples) + // Called per original fasta input file GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, UnpackMMCIF.out.db_folder, UnpackMMCIF.out.db_obsolete) From 676f04de14a520e0bf766b9b0b8e9d6fa1bfd0ef Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Thu, 19 Dec 2024 12:52:17 -0700 Subject: [PATCH 14/20] indexed record ids --- .../alphafold-data/update_locations.py | 39 ++++++++----------- assets/workflows/alphafold2-multimer/main.nf | 12 ++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index 500307f..995ed07 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -45,6 +45,14 @@ # 5nzz_A_bfd_hits.a3m 5nzz_B_bfd_hits.a3m # 5nzz_A_pdb_hits.sto 5nzz_B_pdb_hits.sto +# NEW NEW: +# [5nl6, 5nl6.1, /Users/john/Documents/Code/nextflow-learning/work/dd/64a9f556e8a9f924448cb6e0b35d88/5nl6.1.fasta] +# [5nl6, 5nl6.2, /Users/john/Documents/Code/nextflow-learning/work/dd/64a9f556e8a9f924448cb6e0b35d88/5nl6.2.fasta] +# [5od9, 5od9.1, /Users/john/Documents/Code/nextflow-learning/work/d4/47cd7ab0ebe06438f83aba70061876/5od9.1.fasta] +# [5od9, 5od9.2, /Users/john/Documents/Code/nextflow-learning/work/d4/47cd7ab0ebe06438f83aba70061876/5od9.2.fasta] + +# 5nl6.1_uniref90_hits.sto 5nl6.2_uniref90_hits.sto + STRIP_SUFFIXES = [ "_uniref90_hits.sto", "_mgnify_hits.sto", @@ -63,28 +71,15 @@ def strip_suffixes(s: str, suffixes: list[str]): def update_locations(target_dir, file_list): for filename in file_list: - # strip suffixes from filename - # e.g. 
"5nzz_A_uniref90_hits.sto" -> - # stripped_filename = "5nzz_A" - # stripped_suffix = _uniref90_hits.sto - (stripped_filename, stripped_suffix) = strip_suffixes(filename, STRIP_SUFFIXES) - if stripped_suffix == None: - raise Exception(f"expected suffixes not found in filename: {filename}") - - # "_uniref90_hits.sto" -> "uniref90_hits.sto" - outfile = stripped_suffix[1:] - - if "_" in stripped_filename: - # assume 5nzz_A format - # chain = A - chain = stripped_filename[-1].upper() - else: - # assume 4ZQK2 format - # chain = B - chain = int_id_to_str_id(int(stripped_filename[-1])) - - chain_dir = os.path.join(target_dir, chain) - chain_dir_path = pathlib.Path(chain_dir) + # Indexed format: 5nl6.1_uniref90_hits.sto + # record_id = 5nl6.1 + # outfile = uniref90_hits.sto + record_id, _null, outfile = filename.partition("_") + record_inx = int(record_id[-1]) + + chain = int_id_to_str_id(record_inx) + + chain_dir_path = pathlib.Path(os.path.join(target_dir, chain)) if not chain_dir_path.exists(): chain_dir_path.mkdir(parents=True) diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index 1cd8bcd..6b3b2be 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -41,7 +41,9 @@ workflow { // Explode/scatter the fasta files into channels per contained record ID splitFastaWithBasename = CheckAndValidateInputsTask.out.fasta.map { f -> - return tuple (f.baseName, f.splitFasta( record: [id: true, text: true] )) + // use indexed fasta names rather than record + return tuple (f.baseName, f.splitFasta( file: true )) + // return tuple (f.baseName, f.splitFasta( record: [id: true, text: true] )) } // Write fasta records and return expected tuple format: @@ -60,9 +62,11 @@ workflow { def recordList = [] records.forEach { record -> - def newRecordFile = file("${record.id}.fasta") - newRecordFile.setText(record.text) - recordList.add(tuple (fastaBaseName, newRecordFile.getBaseName(), newRecordFile)) + // def newRecordFile = file("${record.id}.fasta") + // newRecordFile.setText(record.text) + // + // use indexed fasta names rather than record + recordList.add(tuple (fastaBaseName, record.getBaseName(), record)) } return recordList } | flatMap From 1365ce07269015edb78abcb730b8e0df905ff104 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Thu, 19 Dec 2024 13:16:23 -0700 Subject: [PATCH 15/20] Cleanup --- .../alphafold-data/update_locations.py | 65 ++----------------- assets/workflows/alphafold2-multimer/main.nf | 44 ++++--------- 2 files changed, 20 insertions(+), 89 deletions(-) diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index 995ed07..3db54a4 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -8,64 +8,13 @@ from alphafold.data.pipeline_multimer import int_id_to_str_id -# def int_id_to_str_id(num: int) -> str: -# """Encodes a number as a string, using reverse spreadsheet style naming. - -# Args: -# num: A positive integer. - -# Returns: -# A string that encodes the positive integer using reverse spreadsheet style, -# naming e.g. 1 = A, 2 = B, ..., 27 = AA, 28 = BA, 29 = CA, ... This is the -# usual way to encode chain IDs in mmCIF files. -# """ -# if num <= 0: -# raise ValueError(f'Only positive integers allowed, got {num}.') - -# num = num - 1 # 1-based indexing. 
-# output = [] -# while num >= 0: -# output.append(chr(num % 26 + ord('A'))) -# num = num // 26 - 1 -# return ''.join(output) - - -# File lists -# NEW: -# [4ZQK2_uniref90_hits.sto 4ZQK1_uniref90_hits.sto] -# [4ZQK1_mgnify_hits.sto 4ZQK2_mgnify_hits.sto] -# [4ZQK1_uniprot_hits.sto 4ZQK2_uniprot_hits.sto] -# [4ZQK1_bfd_hits.a3m 4ZQK2_bfd_hits.a3m] -# [4ZQK2_pdb_hits.sto 4ZQK1_pdb_hits.sto] - -# OLD: -# 5nzz_A_uniref90_hits.sto 5nzz_B_uniref90_hits.sto -# 5nzz_A_mgnify_hits.sto 5nzz_B_mgnify_hits.sto -# 5nzz_A_uniprot_hits.sto 5nzz_B_uniprot_hits.sto -# 5nzz_A_bfd_hits.a3m 5nzz_B_bfd_hits.a3m -# 5nzz_A_pdb_hits.sto 5nzz_B_pdb_hits.sto - -# NEW NEW: -# [5nl6, 5nl6.1, /Users/john/Documents/Code/nextflow-learning/work/dd/64a9f556e8a9f924448cb6e0b35d88/5nl6.1.fasta] -# [5nl6, 5nl6.2, /Users/john/Documents/Code/nextflow-learning/work/dd/64a9f556e8a9f924448cb6e0b35d88/5nl6.2.fasta] -# [5od9, 5od9.1, /Users/john/Documents/Code/nextflow-learning/work/d4/47cd7ab0ebe06438f83aba70061876/5od9.1.fasta] -# [5od9, 5od9.2, /Users/john/Documents/Code/nextflow-learning/work/d4/47cd7ab0ebe06438f83aba70061876/5od9.2.fasta] - -# 5nl6.1_uniref90_hits.sto 5nl6.2_uniref90_hits.sto - -STRIP_SUFFIXES = [ - "_uniref90_hits.sto", - "_mgnify_hits.sto", - "_uniprot_hits.sto", - "_bfd_hits.a3m", - "_pdb_hits.sto" -] - -def strip_suffixes(s: str, suffixes: list[str]): - for suffix in suffixes: - if s.endswith(suffix): - return (s[:-len(suffix)], suffix) - return (s, None) +# Example file_lists: +# +# [4ZQK.1_uniref90_hits.sto 4ZQK.2_uniref90_hits.sto] +# [4ZQK.2_mgnify_hits.sto 4ZQK.1_mgnify_hits.sto] +# [4ZQK.1_uniprot_hits.sto 4ZQK.2_uniprot_hits.sto] +# [4ZQK.1_bfd_hits.a3m 4ZQK.2_bfd_hits.a3m] +# [4ZQK.1_pdb_hits.sto 4ZQK.2_pdb_hits.sto] # target_dir = msa def update_locations(target_dir, file_list): diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf index 6b3b2be..aa7f73d 100644 --- a/assets/workflows/alphafold2-multimer/main.nf +++ b/assets/workflows/alphafold2-multimer/main.nf @@ -40,35 +40,20 @@ workflow { CheckAndValidateInputsTask(fasta_files) // Explode/scatter the fasta files into channels per contained record ID - splitFastaWithBasename = CheckAndValidateInputsTask.out.fasta.map { f -> - // use indexed fasta names rather than record - return tuple (f.baseName, f.splitFasta( file: true )) - // return tuple (f.baseName, f.splitFasta( record: [id: true, text: true] )) - } - - // Write fasta records and return expected tuple format: - // [5nl6, 5nl6_A, 5nl6_A.fasta] - // [5nl6, 5nl6_B, 5nl6_B.fasta] - // [5mlq, 5mlq_A, 5mlq_A.fasta] - // [5mlq, 5mlq_B, 5mlq_B.fasta] - // or - // [4ZQK_simple, 4ZQK2, 4ZQK2.fasta] - // [4ZQK_simple, 4ZQK1, 4ZQK1.fasta] - // // Write the exploded fasta records to their own file, include in tuple that contains original fasta file basename - split_seqs = splitFastaWithBasename.map { t -> - def fastaBaseName = t[0] - def records = t[1] - - def recordList = [] + // [5nl6, 5nl6.1, 5nl6.1.fasta] + // [5nl6, 5nl6.2, 5nl6.2.fasta] + // [5mlq, 5mlq.1, 5mlq.1.fasta] + // [5mlq, 5mlq.2, 5mlq.2.fasta] + split_seqs = CheckAndValidateInputsTask.out.fasta.map { fastaFile -> + def fastaBaseName = fastaFile.baseName + def records = fastaFile.splitFasta( file: true ) + + def fastaRecordTupleList = [] records.forEach { record -> - // def newRecordFile = file("${record.id}.fasta") - // newRecordFile.setText(record.text) - // - // use indexed fasta names rather than record - recordList.add(tuple (fastaBaseName, record.getBaseName(), record)) + 
fastaRecordTupleList.add(tuple (fastaBaseName, record.getBaseName(), record))
         }
         return fastaRecordTupleList
     } | flatMap
 
     // uniref30 = Channel.fromPath(params.uniref30_database_src).first()
     alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first()
 
     // Unpack the databases
@@ -93,9 +78,6 @@ workflow {
         params.pdb_mmcif_src9,
         params.pdb_mmcif_obsolete)
 
-    // [4ZQK_simple, 4ZQK2, 4ZQK2.fasta]
-    // [4ZQK_simple, 4ZQK1, 4ZQK1.fasta]
-    //
     // Searches are called once per fasta record (fastas * records)
     SearchUniref90(split_seqs, params.uniref90_database_src)
     SearchMgnify(split_seqs, params.mgnify_database_src)
     SearchUniprot(split_seqs, params.uniprot_database_src)
@@ -104,8 +86,8 @@ workflow {
     SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src)
     SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder)
 
-    // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...]
-    // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...]
+    // [5nl6, 5nl6.fasta, [output_5nl6.1/5nl6.1_uniref90_hits.sto, output_5nl6.2/5nl6.2_uniref90_hits.sto], [output_5nl6.2/5nl6.2_mgnify_hits.sto, output_5nl6.1/5nl6.1_mgnify_hits.sto], ...]
+    // [5mlq, 5mlq.fasta, [output_5mlq.1/5mlq.1_uniref90_hits.sto, output_5mlq.2/5mlq.2_uniref90_hits.sto], [output_5mlq.1/5mlq.1_mgnify_hits.sto, output_5mlq.2/5mlq.2_mgnify_hits.sto], ...]
     //
     // Combine/gather the search results into channels per original fasta file
     msa_tuples = fasta_files

From 020ae6feecdaa4cf55dda3890353544f7b40a6b9 Mon Sep 17 00:00:00 2001
From: John Jacquay
Date: Thu, 19 Dec 2024 13:19:04 -0700
Subject: [PATCH 16/20] More cleanup

---
 assets/workflows/alphafold2-multimer/main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/assets/workflows/alphafold2-multimer/main.nf b/assets/workflows/alphafold2-multimer/main.nf
index aa7f73d..7c06771 100644
--- a/assets/workflows/alphafold2-multimer/main.nf
+++ b/assets/workflows/alphafold2-multimer/main.nf
@@ -39,7 +39,7 @@ workflow {
     // 5mlq.fasta
     CheckAndValidateInputsTask(fasta_files)
 
-    // Explode/scatter the fasta files into channels per contained record ID
+    // Explode/scatter the fasta files into channel items per contained record ID
     // Write the exploded fasta records to their own file, include in tuple that contains original fasta file basename
     // [5nl6, 5nl6.1, 5nl6.1.fasta]
@@ -56,7 +56,6 @@ workflow {
         return fastaRecordTupleList
     } | flatMap
 
-    // uniref30 = Channel.fromPath(params.uniref30_database_src).first()
     alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first()
 
     // Unpack the databases

From 676f04de14a520e0bf766b9b0b8e9d6fa1bfd0ef Mon Sep 17 00:00:00 2001
From: John Jacquay
Date: Thu, 19 Dec 2024 13:36:21 -0700
Subject: [PATCH 17/20] Cleanup for PR

---
 assets/workflows/alphafold2-multimer/nextflow.config | 9 +++------
 scripts/testrun.sh                                   | 6 +++---
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/assets/workflows/alphafold2-multimer/nextflow.config b/assets/workflows/alphafold2-multimer/nextflow.config
index e4c1491..9be7d61 100644
--- a/assets/workflows/alphafold2-multimer/nextflow.config
+++ b/assets/workflows/alphafold2-multimer/nextflow.config
@@ -36,10 +36,7 @@ params {
 }
 
 process {
-// withLabel: protutils { container = '{{protein-utils:latest}}'}
-    withLabel: protutils { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/protein-utils:developjj'}
-// withLabel: data { container = 
'{{alphafold-data:latest}}'} - withLabel: data { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-data:developjj'} -// withLabel: predict { container = '{{alphafold-predict:latest}}'} - withLabel: predict { container = '886436931557.dkr.ecr.us-east-1.amazonaws.com/alphafold-predict:developjj'} + withLabel: protutils { container = '{{protein-utils:latest}}'} + withLabel: data { container = '{{alphafold-data:latest}}'} + withLabel: predict { container = '{{alphafold-predict:latest}}'} } \ No newline at end of file diff --git a/scripts/testrun.sh b/scripts/testrun.sh index c86f591..503e275 100755 --- a/scripts/testrun.sh +++ b/scripts/testrun.sh @@ -39,15 +39,15 @@ aws ecr get-login-password --region $REGION | docker login --username AWS --pass # rfdiffusion is the only workflow name that is 1:1 with container name if [ "$WORKFLOW_NAME" != "rfdiffusion" ]; then pushd assets/containers - bash ../workflows/$WORKFLOW_NAME/build_containers.sh $REGION $ACCOUNT_ID developjj + bash ../workflows/$WORKFLOW_NAME/build_containers.sh $REGION $ACCOUNT_ID develop popd else docker build \ --platform linux/amd64 \ - -t $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:developjj \ + -t $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:develop \ -f assets/containers/$WORKFLOW_NAME/Dockerfile assets/containers/$WORKFLOW_NAME - docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:developjj + docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$WORKFLOW_NAME:develop fi # Package the workflow From f968aab91e43dfc361213de8ccb225f4abd7f194 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Thu, 19 Dec 2024 13:41:23 -0700 Subject: [PATCH 18/20] rm test branch --- assets/workflows/alphafold-multimer/main.nf | 242 -------------------- 1 file changed, 242 deletions(-) delete mode 100644 assets/workflows/alphafold-multimer/main.nf diff --git a/assets/workflows/alphafold-multimer/main.nf b/assets/workflows/alphafold-multimer/main.nf deleted file mode 100644 index 96773f1..0000000 --- a/assets/workflows/alphafold-multimer/main.nf +++ /dev/null @@ -1,242 +0,0 @@ -nextflow.enable.dsl = 2 - -params.fasta_path = "" - -// static data files are in nextflow.config - -include { - SearchUniref90; - SearchMgnify; - SearchBFD; - SearchTemplatesTask; - SearchUniprot; - CombineSearchResults; -} from './searches.nf' - -include { - UnpackBFD; - UnpackPdb70nSeqres; - UnpackMMCIF; -} from './unpack.nf' - - -workflow { - - // Convert to one or many files - if (params.fasta_path[-1] == "/") { - fasta_path = params.fasta_path + "*" - } else { - fasta_path = params.fasta_path - } - - // [5nl6, 5nl6.fasta] - // [5mlq, 5mlq.fasta] - fasta_files = Channel - .fromPath(fasta_path) - .map { filename -> tuple ( filename.toString().split("/")[-1].split(".fasta")[0], filename) } - - // 5nl6.fasta - // 5mlq.fasta - CheckAndValidateInputsTask(fasta_files) - - // [5nl6, 5nl6.1, 5nl6.1.fasta] - // [5nl6, 5nl6.2, 5nl6.2.fasta] - // [5mlq, 5mlq.1, 5mlq.1.fasta] - // [5mlq, 5mlq.2, 5mlq.2.fasta] - split_seqs = CheckAndValidateInputsTask.out.fasta - .splitFasta( file: true ) - .map { filename -> tuple (filename.getBaseName().split("\\.")[0], filename.getBaseName(), filename) } - - uniref30 = Channel.fromPath(params.uniref30_database_src).first() - alphafold_model_parameters = Channel.fromPath(params.alphafold_model_parameters).first() - - // Unpack the databases - UnpackBFD(params.bfd_database_a3m_ffdata, - params.bfd_database_a3m_ffindex, - params.bfd_database_cs219_ffdata, - params.bfd_database_cs219_ffindex, 
- params.bfd_database_hhm_ffdata, - params.bfd_database_hhm_ffindex) - UnpackPdb70nSeqres(params.pdb70_src, params.pdb_seqres_src, params.db_pathname) - UnpackMMCIF(params.pdb_mmcif_src1, - params.pdb_mmcif_src2, - params.pdb_mmcif_src3, - params.pdb_mmcif_src4, - params.pdb_mmcif_src5, - params.pdb_mmcif_src6, - params.pdb_mmcif_src7, - params.pdb_mmcif_src8, - params.pdb_mmcif_src9, - params.pdb_mmcif_obsolete) - - SearchUniref90(split_seqs, params.uniref90_database_src) - SearchMgnify(split_seqs, params.mgnify_database_src) - SearchUniprot(split_seqs, params.uniprot_database_src) - SearchBFD(split_seqs, UnpackBFD.out.db_folder, params.uniref30_database_src) - SearchTemplatesTask(SearchUniref90.out.fasta_basename_with_record_id_and_msa, UnpackPdb70nSeqres.out.db_folder) - - // [5nl6, 5nl6.fasta, [output_5nl6_A/5nl6_A_uniref90_hits.sto, output_5nl6_B/5nl6_B_uniref90_hits.sto], [output_5nl6_B/5nl6_B_mgnify_hits.sto, output_5nl6_A/5nl6_A_mgnify_hits.sto], ...] - // [5mlq, 5mlq.fasta, [output_5mlq_A/5mlq_A_uniref90_hits.sto, output_5mlq_B/5mlq_B_uniref90_hits.sto], [output_5mlq_A/5mlq_A_mgnify_hits.sto, output_5mlq_B/5mlq_B_mgnify_hits.sto], ...] - msa_tuples = fasta_files - .join(SearchUniref90.out.fasta_basename_with_msa.groupTuple()) - .join(SearchMgnify.out.fasta_basename_with_msa.groupTuple()) - .join(SearchUniprot.out.fasta_basename_with_msa.groupTuple()) - .join(SearchBFD.out.fasta_basename_with_msa.groupTuple()) - .join(SearchTemplatesTask.out.fasta_basename_with_msa.groupTuple()) - - // Gather - CombineSearchResults(msa_tuples) - - GenerateFeaturesTask(CombineSearchResults.out.fasta_basename_fasta_and_msa_path, - UnpackMMCIF.out.db_folder, - UnpackMMCIF.out.db_obsolete) - - // Predict. Five separate models - model_nums = Channel.of(0,1,2,3,4) - features = GenerateFeaturesTask.out.fasta_basename_with_features.combine(model_nums) - AlphaFoldMultimerInference(features, alphafold_model_parameters, params.random_seed, params.run_relax) - - MergeRankings(AlphaFoldMultimerInference.out.results.groupTuple(by: 0)) -} - -// Check the inputs and get size etc -process CheckAndValidateInputsTask { - tag "${fasta_basename}" - label 'protutils' - cpus 2 - memory '4 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/inputs" - - input: - tuple val(fasta_basename), path(fasta_path) - - output: - stdout - path "seq_info.json", emit: seq_info - path "${fasta_basename}.fasta", emit: fasta - val "${fasta_basename}", emit: fasta_basename - - script: - """ - set -euxo pipefail - - echo ">>>>>>>>>>>>>>>>>>>" - echo $fasta_basename - echo $fasta_path - echo "<<<<<<<<<<<<<<<<<<<" - - ls -alR - - /opt/venv/bin/python \ - /opt/venv/lib/python3.8/site-packages/putils/check_and_validate_inputs.py \ - --target_id=$fasta_basename --fasta_path=$fasta_path - """ -} - -// Generate features from the searches -process GenerateFeaturesTask { - tag "${fasta_basename}" - label 'data' - cpus 4 - memory '16 GB' - publishDir "/mnt/workflow/pubdir/${fasta_basename}/features" - - input: - tuple val(fasta_basename), path(fasta_path), path(msa_dir) - path pdb_mmcif_folder - path mmcif_obsolete_path - - output: - tuple val(fasta_basename), path("output/features.pkl"), emit: fasta_basename_with_features - path "output/features.pkl", emit: features - path "output/generate_features_metrics.json", emit: metrics - - script: - """ - set -euxo pipefail - - mkdir -p output - - echo "***********************" - ls -alR $msa_dir/ - echo "***********************" - - /opt/venv/bin/python /opt/generate_features.py \ - 
--fasta_paths=$fasta_path \ - --msa_dir=$msa_dir \ - --template_mmcif_dir="$pdb_mmcif_folder" \ - --obsolete_pdbs_path="$mmcif_obsolete_path" \ - --template_hits="$msa_dir/pdb_hits.sto" \ - --model_preset=multimer \ - --output_dir=output \ - --max_template_date=2023-01-01 - - echo "***********************" - ls -alR output/ - echo "***********************" - - mv output/metrics.json output/generate_features_metrics.json - """ -} - -// AlphaFold Multimer -process AlphaFoldMultimerInference { - tag "${fasta_basename}_${modelnum}" - errorStrategy 'retry' - label 'predict' - cpus { 4 * Math.pow(2, task.attempt) } - memory { 16.GB * Math.pow(2, task.attempt) } - accelerator 1, type: 'nvidia-tesla-a10g' - maxRetries 2 - publishDir "/mnt/workflow/pubdir/${fasta_basename}/prediction_${modelnum}" - input: - tuple val(fasta_basename), path (features), val(modelnum) - path alphafold_model_parameters - val random_seed - val run_relax - - output: - tuple val(fasta_basename), path("output_model_${modelnum}/"), emit: results - - script: - """ - set -euxo pipefail - mkdir -p model/params - tar -xvf $alphafold_model_parameters -C model/params - export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 - export TF_FORCE_UNIFIED_MEMORY=1 - /opt/conda/bin/python /app/alphafold/predict.py \ - --target_id=$fasta_basename --features_path=$features --model_preset=multimer \ - --model_dir=model --random_seed=$random_seed --output_dir=output_model_${modelnum} \ - --run_relax=${run_relax} --use_gpu_relax=${run_relax} --model_num=$modelnum - - rm -rf output_model_${modelnum}/msas - """ -} - - -// Merge Rankings -process MergeRankings { - tag "${id}" - cpus 2 - memory 4.GB - publishDir "/mnt/workflow/pubdir/${id}" - label 'data' - - input: - tuple val(id), path(results) - - output: - path "rankings.json", emit: rankings - path "top_hit*", emit: top_hit - - script: - """ - mkdir -p output - echo ${results} - # Create top hit - /opt/venv/bin/python /opt/merge_rankings.py --output_dir output/ --model_dirs ${results} - mv output/top_hit* . - mv output/rankings.json . 
- """ -} From dd70dde2f3432424ddb8b6e341a2c06f64f976a4 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Thu, 19 Dec 2024 13:44:26 -0700 Subject: [PATCH 19/20] Cleanup --- assets/workflows/alphafold2-multimer/params.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/workflows/alphafold2-multimer/params.json b/assets/workflows/alphafold2-multimer/params.json index fd4648f..46ffce8 100644 --- a/assets/workflows/alphafold2-multimer/params.json +++ b/assets/workflows/alphafold2-multimer/params.json @@ -1,3 +1,3 @@ { - "fasta_path":"s3://bioteam-compchem-test-inputs/alhpafold-multimer/" + "fasta_path":"s3://example-bucket/alphafold2-multimer/" } \ No newline at end of file From 7bb2edc24c178cd30445fcb72351126e2526faf2 Mon Sep 17 00:00:00 2001 From: John Jacquay Date: Thu, 19 Dec 2024 17:33:57 -0700 Subject: [PATCH 20/20] Fixes for update_locations.py edge case --- .../alphafold-data/update_locations.py | 29 ++++++++++++++----- .../workflows/alphafold2-multimer/searches.nf | 10 +++---- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/assets/containers/alphafold-data/update_locations.py b/assets/containers/alphafold-data/update_locations.py index 3db54a4..8a76baf 100644 --- a/assets/containers/alphafold-data/update_locations.py +++ b/assets/containers/alphafold-data/update_locations.py @@ -15,16 +15,31 @@ # [4ZQK.1_uniprot_hits.sto 4ZQK.2_uniprot_hits.sto] # [4ZQK.1_bfd_hits.a3m 4ZQK.2_bfd_hits.a3m] # [4ZQK.1_pdb_hits.sto 4ZQK.2_pdb_hits.sto] +# or +# 4ZQK_simple.1_uniref90_hits.sto 4ZQK_simple.2_uniref90_hits.sto +# 4ZQK_simple.1_mgnify_hits.sto 4ZQK_simple.2_mgnify_hits.sto +# 4ZQK_simple.2_uniprot_hits.sto 4ZQK_simple.1_uniprot_hits.sto +# 4ZQK_simple.2_bfd_hits.a3m 4ZQK_simple.1_bfd_hits.a3m +# 4ZQK_simple.1_pdb_hits.sto 4ZQK_simple.2_pdb_hits.sto + +def strip_suffix_str(s: str, suffix: str): + if s.endswith(suffix): + return s[:-len(suffix)] + return None # target_dir = msa -def update_locations(target_dir, file_list): +def update_locations(target_dir, strip_suffix, file_list): for filename in file_list: + # filename = 4ZQK_simple.1_uniref90_hits.sto + # strip_suffix = _uniref90_hits.sto - # Indexed format: 5nl6.1_uniref90_hits.sto - # record_id = 5nl6.1 - # outfile = uniref90_hits.sto - record_id, _null, outfile = filename.partition("_") - record_inx = int(record_id[-1]) + stripped_filename = strip_suffix_str(filename, strip_suffix) + if stripped_filename == None: + raise Exception(f"Suffix {strip_suffix} not in {filename}") + + # stripped_filename = 4ZQK_simple.1 + record_inx = int(stripped_filename[-1]) # 1 + outfile = strip_suffix[1:] # uniref90_hits.sto chain = int_id_to_str_id(record_inx) @@ -39,4 +54,4 @@ def update_locations(target_dir, file_list): if __name__ == "__main__": - update_locations(sys.argv[1], sys.argv[2:]) + update_locations(sys.argv[1], sys.argv[2], sys.argv[3:]) diff --git a/assets/workflows/alphafold2-multimer/searches.nf b/assets/workflows/alphafold2-multimer/searches.nf index b828d28..47d18bd 100644 --- a/assets/workflows/alphafold2-multimer/searches.nf +++ b/assets/workflows/alphafold2-multimer/searches.nf @@ -207,11 +207,11 @@ process CombineSearchResults { echo "<<<<<<<<<<<<<<<<<<<" mkdir -p msa - /opt/venv/bin/python /opt/update_locations.py msa $uniref90_msas - /opt/venv/bin/python /opt/update_locations.py msa $mgnify_msas - /opt/venv/bin/python /opt/update_locations.py msa $uniprot_msas - /opt/venv/bin/python /opt/update_locations.py msa $bfd_msas - /opt/venv/bin/python /opt/update_locations.py msa $template_hits + 
/opt/venv/bin/python /opt/update_locations.py msa _uniref90_hits.sto $uniref90_msas + /opt/venv/bin/python /opt/update_locations.py msa _mgnify_hits.sto $mgnify_msas + /opt/venv/bin/python /opt/update_locations.py msa _uniprot_hits.sto $uniprot_msas + /opt/venv/bin/python /opt/update_locations.py msa _bfd_hits.a3m $bfd_msas + /opt/venv/bin/python /opt/update_locations.py msa _pdb_hits.sto $template_hits echo "***********************" ls -alR msa/
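
For reference, the PATCH 20 filename-to-chain mapping can be sanity-checked standalone. The sketch below is illustrative only: `int_id_to_str_id` is normally imported from `alphafold.data.pipeline_multimer` (here it is inlined from the docstring quoted in the PATCH 15 diff), and the filenames are hypothetical examples taken from the comments above:

    # Sketch: reproduce update_locations.py's "<stem>.<N><suffix>" -> msa/<chain> mapping.
    def int_id_to_str_id(num: int) -> str:
        # Reverse spreadsheet-style naming: 1 = A, 2 = B, ..., 27 = AA, 28 = BA, ...
        if num <= 0:
            raise ValueError(f"Only positive integers allowed, got {num}.")
        num = num - 1  # 1-based indexing
        output = []
        while num >= 0:
            output.append(chr(num % 26 + ord("A")))
            num = num // 26 - 1
        return "".join(output)

    for filename, suffix in [
        ("4ZQK_simple.1_uniref90_hits.sto", "_uniref90_hits.sto"),
        ("4ZQK_simple.2_bfd_hits.a3m", "_bfd_hits.a3m"),
    ]:
        stem = filename[: -len(suffix)]          # e.g. "4ZQK_simple.1"
        # As in the patch, int(stem[-1]) assumes a single-digit record index.
        chain = int_id_to_str_id(int(stem[-1]))  # "A", "B", ...
        print(f"{filename} -> msa/{chain}/{suffix[1:]}")

    # Output:
    # 4ZQK_simple.1_uniref90_hits.sto -> msa/A/uniref90_hits.sto
    # 4ZQK_simple.2_bfd_hits.a3m -> msa/B/bfd_hits.a3m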