Release for OpenFold

aws-samples · Nov 12, 2024 · 93f878c · 93f878c
1 parent 6379657
commit 93f878c
Show file tree

Hide file tree

Showing 17 changed files with 418 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,5 @@ tmp/
 .nextflow*
 stack-outputs.json
 test_data
+
+build/cloudformation/packaged.yaml
diff --git a/README.md b/README.md
@@ -131,6 +131,7 @@ Example run with full argument list:
 -r us-east-1 \
 -o "arn:aws:iam::123456789012:role/healthomics-dev-role" \
 -b mybucket \
+-f mydeployrefbucket \
 -p file://testparams/rfdiffusion.params.json
 ```
 
@@ -141,12 +142,15 @@ ACCOUNT_ID=123456789012
 REGION=us-east-1
 OMICS_EXECUTION_ROLE=arn:aws:iam::123456789012:role/healthomics-dev-role
 OUTPUT_BUCKET=mybucket
+REF_DATA_BUCKET=my-deployment-bucket
 ```
 
+`REF_DATA_BUCKET` should be the same bucket you specified during the CloudFormation deployment: `deploy.sh -b "my-deployment-bucket"`.
+
 and then:
 
 ```sh
-./scripts/testrun.sh -w rfdiffusion -p testparams/rfdiffusion.params.json
+./scripts/testrun.sh -w rfdiffusion -p "file://testparams/rfdiffusion.params.json"
 ```
 
 `s3:<BUCKET NAME SPECIFIED IN CFN>/ref-data/<FILENAME WITHOUT EXTENSION>/...`

diff --git a/assets/containers/openfold/Dockerfile b/assets/containers/openfold/Dockerfile
@@ -0,0 +1,55 @@
+# Tagged version 2.1.0
+# https://github.com/aqlaboratory/openfold/blob/v.2.1.0/Dockerfile
+
+# Modifications have been made to work within the aws-samples/drug-discovery-workflows repository
+# Mostly addition of git clone and comment out COPY statements
+
+FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
+
+# metainformation (no spaces around "=": with them the legacy "LABEL key value" form applies and the value starts with "= ")
+LABEL org.opencontainers.image.version="1.0.0"
+LABEL org.opencontainers.image.authors="Gustaf Ahdritz"
+LABEL org.opencontainers.image.source="https://github.com/aqlaboratory/openfold"
+LABEL org.opencontainers.image.licenses="Apache License 2.0"
+LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04"
+
+RUN apt-key del 7fa2af80 \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+
+# Package lists are removed in the same layer that fetched them so they do not persist in the image
+RUN apt-get update \
+    && apt-get install -y wget libxml2 cuda-minimal-build-11-3 libcusparse-dev-11-3 libcublas-dev-11-3 libcusolver-dev-11-3 git tree gzip \
+    && rm -rf /var/lib/apt/lists/*
+RUN wget -P /tmp "https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh" \
+    && bash /tmp/Miniforge3-Linux-x86_64.sh -b -p /opt/conda \
+    && rm /tmp/Miniforge3-Linux-x86_64.sh
+ENV PATH=/opt/conda/bin:$PATH
+
+# Tagged version 2.1.0
+RUN git clone https://github.com/aqlaboratory/openfold.git /opt/openfold \
+    && git -C /opt/openfold checkout f434a2786b5a6b39171f358fb3470ad9f4fd2a58
+
+WORKDIR /opt/openfold
+
+# installing into the base environment since the docker container won't do anything other than run openfold
+# (environment.yml already comes from the git clone above, so the upstream COPY of it is not needed)
+RUN mamba env update -n base --file /opt/openfold/environment.yml && mamba clean --all --yes
+# ENV rather than "RUN export": an export ends with its RUN step, and CONDA_PREFIX is never set in this file (conda lives in /opt/conda)
+ENV LD_LIBRARY_PATH=/opt/conda/lib:${LD_LIBRARY_PATH}
+
+# The git clone does this already
+# COPY openfold /opt/openfold/openfold
+# COPY scripts /opt/openfold/scripts
+# COPY run_pretrained_openfold.py /opt/openfold/run_pretrained_openfold.py
+# COPY train_openfold.py /opt/openfold/train_openfold.py
+# COPY setup.py /opt/openfold/setup.py
+
+COPY ./prep_mmseqs_dbs.mod.sh /opt/openfold/scripts/prep_mmseqs_dbs.sh
+
+RUN ./scripts/install_third_party_dependencies.sh
+
+# from scripts/install_third_party_dependencies.sh
+# RUN wget -q -P /opt/openfold/openfold/resources \
+#     https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
+# WORKDIR /opt/openfold
+# RUN python3 setup.py install
diff --git a/assets/containers/openfold/prep_mmseqs_dbs.mod.sh b/assets/containers/openfold/prep_mmseqs_dbs.mod.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Copyright 2021 AlQuraishi Laboratory
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Extracts every *.tar* archive in DOWNLOAD_DIR into DOWNLOAD_DIR/mmseqs_dbs, deletes the archive, then builds and indexes an mmseqs database from it.
+# NOTE: mmseqs is run from inside DOWNLOAD_DIR/mmseqs_dbs, so a relative TMP_DIR is resolved against that directory.
+# Usage: bash prep_mmseqs_dbs.sh /path/to/download/directory /path/to/tmp/directory
+set -e
+
+DOWNLOAD_DIR="$1"
+TMP_DIR="$2"
+ROOT_DIR="${DOWNLOAD_DIR}/mmseqs_dbs"
+mkdir -p "${ROOT_DIR}"
+
+for f in "${DOWNLOAD_DIR}"/*.tar*
+do
+  [ -e "${f}" ] || continue  # an unmatched glob stays literal; skip it so an empty directory is a no-op
+  tar --extract --verbose --file="${f}" --directory="${ROOT_DIR}"
+  rm "${f}"
+  BASENAME="$(basename "${f}")"
+  BASENAME="${BASENAME%%.*}"  # strip extensions only after dropping the directory, so dots in DOWNLOAD_DIR cannot truncate the name
+  DB_NAME="${BASENAME}_db"
+  (  # subshell: the cd below does not leak into the next iteration; set -e still aborts the script on failure
+    cd "${ROOT_DIR}"
+    mmseqs tsv2exprofiledb "${BASENAME}" "${DB_NAME}"
+    mmseqs createindex "${DB_NAME}" "${TMP_DIR}"
+  )
+done
diff --git a/assets/data/colabfold_envdb.txt b/assets/data/colabfold_envdb.txt
@@ -0,0 +1 @@
+https://wwwuser.gwdguser.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz
diff --git a/assets/data/mgy.txt b/assets/data/mgy.txt
@@ -1 +1,2 @@
 s3://omics-{{REGION}}/alphafold_multimer/mgy/
+https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz
diff --git a/assets/data/openfold.txt b/assets/data/openfold.txt
@@ -0,0 +1 @@
+s3://openfold/openfold_params/
diff --git a/assets/data/uniclust30.txt b/assets/data/uniclust30.txt
@@ -0,0 +1 @@
+https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz
diff --git a/assets/workflows/openfold/README.md b/assets/workflows/openfold/README.md
@@ -0,0 +1 @@
+This repository helps you set up and run OpenFold Monomer on AWS HealthOmics. At the end of the configuration, you should be able to run a full end-to-end inference.
diff --git a/assets/workflows/openfold/build_containers.sh b/assets/workflows/openfold/build_containers.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -ex
+
+REGION=$1
+ACCOUNT=$2
+TAG=${3:-latest}
+
+aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.$REGION.amazonaws.com
+
+# build openfold
+cd openfold
+docker build --platform linux/amd64 -t $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/openfold:$TAG .
+docker push $ACCOUNT.dkr.ecr.$REGION.amazonaws.com/openfold:$TAG
+cd ..
diff --git a/assets/workflows/openfold/config.yaml b/assets/workflows/openfold/config.yaml
@@ -0,0 +1,39 @@
+name: OpenFold
+description: "Predict single-chain protein structures with OpenFold"
+engine: NEXTFLOW
+main: main.nf
+parameterTemplate:  # main.nf reads only mmseq_db, pdb70, fasta_dir, pdb_mmcif_files and openfold_checkpoint
+  mmseq_db:
+    optional: true
+    description: "mmseq db"
+  uniref90:  # not referenced by main.nf
+    optional: true
+    description: "uniref90"
+  uniref30:  # not referenced by main.nf
+    optional: true
+    description: "uniref30"
+  mgnify:  # not referenced by main.nf
+    optional: true
+    description: "mgnify"
+  pdb70:
+    optional: true
+    description: "pdb70"
+  uniclust30:  # not referenced by main.nf
+    optional: true
+    description: "uniclust30"
+  bfd:  # not referenced by main.nf
+    optional: true
+    description: "bfd"
+  openfold_checkpoint:
+    optional: true
+    description: "openfold_checkpoint"
+  fasta_dir:  # the only required parameter; nextflow.config defines no default for it
+    optional: false
+    description: "fasta_dir"
+  pdb_mmcif_files:
+    optional: true
+    description: "pdb_mmcif_files"
+
+storageCapacity: 4800  # presumably GiB of run storage — TODO confirm against the HealthOmics documentation
+tags:
+  Name: "OpenFold"
diff --git a/assets/workflows/openfold/main.nf b/assets/workflows/openfold/main.nf
@@ -0,0 +1,179 @@
+nextflow.enable.dsl = 2
+
+workflow {
+    // Prepare the MMseqs databases; the result is passed to the alignment step below
+    mmseqs_db = PrepMMseqDatabases(params.mmseq_db)
+
+    // Create a channel for all FASTA files in the directory
+    fasta_files = Channel.fromPath("${params.fasta_dir}/*.fasta")
+
+    // runs per .fasta file
+    alignment_dirs = PrecomputeAlignments(
+        fasta_files,
+        mmseqs_db.prepped_mmseq_db,
+        params.pdb70
+    )
+
+    // Define the target directory for merging files
+    merged_alignment_dir = file("merged_alignments")
+
+    // Create the target directory if it doesn't exist
+    merged_alignment_dir.mkdirs()
+
+    merged_aligned_dir_chan = alignment_dirs.alignments.flatMap { d -> 
+        // List the entries directly inside each task's "output" directory
+        d.listFiles()
+    } | map { d ->
+        // Copy the dir to our shared output
+        d.copyTo(merged_alignment_dir)
+        return merged_alignment_dir
+    // `last` emits only the final item, so the consumer below starts after every copy above has run
+    } | last
+
+    // Should be run once
+    PretrainedOpenFold(
+        params.fasta_dir,
+        params.pdb_mmcif_files,
+        merged_aligned_dir_chan,
+        // ref data
+        params.openfold_checkpoint
+    )
+
+}
+
+process PrepMMseqDatabases {  // runs the container's prep_mmseqs_dbs.sh over the staged data_dir
+    label 'openfold'  // matched by the withLabel selector in nextflow.config
+    cpus 32
+    memory '128 GB'
+    accelerator 1, type: 'nvidia-tesla-a10g'
+
+    input:
+        path data_dir  // passed to prep_mmseqs_dbs.sh as its first (download directory) argument
+
+    output:
+        path "${data_dir}/*", emit: prepped_mmseq_db  // whatever data_dir contains once the script has finished
+
+    script:  // the `tree` calls only print directory listings; prep-tmp/ is passed as the script's second (tmp dir) argument
+    """
+    set -euxo pipefail
+
+    tree .
+    tree ${data_dir}
+
+    mkdir -p prep-tmp/
+
+    bash /opt/openfold/scripts/prep_mmseqs_dbs.sh ${data_dir} prep-tmp/
+
+    tree .
+    tree ${data_dir}
+    """
+}
+
+process PrecomputeAlignments {  // runs precompute_alignments_mmseqs.py for one FASTA input, writing into ./output
+    label 'openfold'  // matched by the withLabel selector in nextflow.config
+    cpus 32
+    memory '128 GB'
+    accelerator 1, type: 'nvidia-tesla-a10g'
+
+    publishDir "/mnt/workflow/pubdir"  // the declared output is also published here
+
+    input:
+        path fasta
+        path prepped_mmseq_db  // the workflow passes the prepped_mmseq_db output of PrepMMseqDatabases
+        path pdb70  // directory; the script passes <pdb70>/pdb70 to --pdb70
+
+    output:
+        path 'output', emit: alignments  // the directory created by `mkdir -p ./output` in the script
+
+    script:  // inputs are made absolute with realpath before pushd switches the working directory to /opt/openfold
+    """
+    set -euxo pipefail
+
+    mkdir -p ./output
+
+    pwd
+
+    tree .
+
+    WORK="\$(realpath .)"
+    FASTA="\$(realpath ${fasta})"
+    MMSEQ_DB="\$(realpath ${prepped_mmseq_db})"
+    PDB_70="\$(realpath ${pdb70})"
+
+    pushd /opt/openfold
+
+    pwd
+    
+    python3 /opt/openfold/scripts/precompute_alignments_mmseqs.py \$FASTA \
+        \$MMSEQ_DB \
+        colabfold_envdb_202108_db \
+        \$WORK/output \
+        /opt/conda/bin/mmseqs \
+        --hhsearch_binary_path /opt/conda/bin/hhsearch \
+        --env_db colabfold_envdb_202108_db \
+        --pdb70 \$PDB_70/pdb70
+    
+    tree \$WORK
+    
+    pwd
+
+    popd
+
+    pwd
+    """
+}
+
+process PretrainedOpenFold {  // runs run_pretrained_openfold.py on fasta_dir using the precomputed alignments
+    label 'openfold'  // matched by the withLabel selector in nextflow.config
+    cpus 32
+    memory '128 GB'
+    accelerator 1, type: 'nvidia-tesla-a10g'
+
+    publishDir "/mnt/workflow/pubdir"  // the declared outputs are also published here
+
+    input:
+        // inputs
+        path fasta_dir
+        path pdb_mmcif_files  // any *.tar files inside are extracted in place before inference
+        path alignment_dir  // passed to --use_precomputed_alignments
+
+        // ref data
+        path openfold_checkpoint  // directory; the script reads finetuning_ptm_2.pt from it
+
+    output:
+        path 'output/*', emit: results  // everything found under ./output (the --output_dir) after the script ends
+
+    script:  // the `tree` calls only print directory listings
+    """
+    set -euxo pipefail
+
+    mkdir -p ./output
+
+    tree .
+
+    tree ${fasta_dir}
+    tree ${pdb_mmcif_files}
+    tree ${openfold_checkpoint}
+
+    pushd ${pdb_mmcif_files}
+
+    for file in *.tar; do
+    tar -xf "\$file"
+    done
+
+    popd
+
+    tree ${pdb_mmcif_files}
+
+    python3 /opt/openfold/run_pretrained_openfold.py \
+        ${fasta_dir} \
+        ${pdb_mmcif_files} \
+        --use_precomputed_alignments ${alignment_dir} \
+        --config_preset model_1_ptm \
+        --output_dir ./output \
+        --model_device cuda:0 \
+        --openfold_checkpoint_path ${openfold_checkpoint}/finetuning_ptm_2.pt
+    tree .
+
+    """
+}
diff --git a/assets/workflows/openfold/nextflow.config b/assets/workflows/openfold/nextflow.config
@@ -0,0 +1,15 @@
+params {  // default reference-data locations; {{S3_BUCKET_NAME}} looks like a placeholder filled in at deploy time — TODO confirm
+    mmseq_db = "s3://{{S3_BUCKET_NAME}}/ref-data/colabfold_envdb/"
+    uniref90 = "s3://{{S3_BUCKET_NAME}}/ref-data/uniref90/uniref90.fasta"  // not referenced by main.nf
+    uniref30 = "s3://{{S3_BUCKET_NAME}}/ref-data/uniref30/"  // not referenced by main.nf
+    mgnify = "s3://{{S3_BUCKET_NAME}}/ref-data/mgy/mgy_clusters_2022_05.fa.gz"  // not referenced by main.nf
+    pdb70 = "s3://{{S3_BUCKET_NAME}}/ref-data/pdb70/"
+    uniclust30 = "s3://{{S3_BUCKET_NAME}}/ref-data/uniclust30/uniclust30_2018_08_hhsuite.tar.gz"  // not referenced by main.nf
+    bfd = "s3://{{S3_BUCKET_NAME}}/ref-data/bfd/"  // not referenced by main.nf
+    openfold_checkpoint = "s3://{{S3_BUCKET_NAME}}/ref-data/openfold/"
+    pdb_mmcif_files = "s3://{{S3_BUCKET_NAME}}/ref-data/pdb_mmcif/"
+}
+
+process {
+    withLabel: openfold { container = "{{openfold:latest}}"}  // applies to every process labelled 'openfold'; the {{...}} value is presumably resolved to an image URI at deploy time — TODO confirm
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		https://wwwuser.gwdguser.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		s3://omics-{{REGION}}/alphafold_multimer/mgy/
		https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		This repository helps you set up and run OpenFold Monomer on AWS HealthOmics. At the end of the configuration, you should be able to run a full end-to-end inference.