Skip to content

Commit

Permalink
Release for OpenFold
Browse files Browse the repository at this point in the history
  • Loading branch information
jacquayj committed Nov 12, 2024
1 parent 6379657 commit 93f878c
Show file tree
Hide file tree
Showing 17 changed files with 418 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ tmp/
.nextflow*
stack-outputs.json
test_data

build/cloudformation/packaged.yaml
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ Example run with full argument list:
-r us-east-1 \
-o "arn:aws:iam::123456789012:role/healthomics-dev-role" \
-b mybucket \
-f mydeployrefbucket \
-p file://testparams/rfdiffusion.params.json
```

Expand All @@ -141,12 +142,15 @@ ACCOUNT_ID=123456789012
REGION=us-east-1
OMICS_EXECUTION_ROLE=arn:aws:iam::123456789012:role/healthomics-dev-role
OUTPUT_BUCKET=mybucket
REF_DATA_BUCKET=my-deployment-bucket
```

`REF_DATA_BUCKET` should be the same bucket you specified during the CloudFormation deployment: `deploy.sh -b "my-deployment-bucket"`.

and then:

```sh
./scripts/testrun.sh -w rfdiffusion -p testparams/rfdiffusion.params.json
./scripts/testrun.sh -w rfdiffusion -p "file://testparams/rfdiffusion.params.json"
```

`s3://<BUCKET NAME SPECIFIED IN CFN>/ref-data/<FILENAME WITHOUT EXTENSION>/...`
Expand Down
55 changes: 55 additions & 0 deletions assets/containers/openfold/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Tagged version 2.1.0
# https://github.com/aqlaboratory/openfold/blob/v.2.1.0/Dockerfile

# Modifications have been made so this file works within the aws-samples/drug-discovery-workflows repository:
# mostly the addition of a git clone and the commenting out of COPY statements.

# Base image: CUDA 11.3.1 / cuDNN 8 (devel variant) on Ubuntu 18.04.
FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04

# OCI image metadata.
# LABEL pairs must be written key=value with no spaces around "=". Written as
# `key = "value"`, the line is parsed in the legacy `LABEL key value` form and
# the "=" and quotes end up inside the label value.
# base.name is kept in sync with the FROM line above.
LABEL org.opencontainers.image.version="1.0.0" \
      org.opencontainers.image.authors="Gustaf Ahdritz" \
      org.opencontainers.image.source="https://github.com/aqlaboratory/openfold" \
      org.opencontainers.image.licenses="Apache License 2.0" \
      org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04"

# Refresh the NVIDIA CUDA repository signing keys: remove key 7fa2af80, then
# fetch the keys published for the ubuntu1804/x86_64 CUDA repository.
RUN apt-key del 7fa2af80 \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub

# OS packages: CUDA 11.3 build components plus the command-line tools used
# later in this image (wget, git) and by the workflow scripts (tree, gzip).
# update + install share one layer, and the apt lists are removed in that same
# layer so they are not stored in the image.
RUN apt-get update \
    && apt-get install -y \
        cuda-minimal-build-11-3 \
        git \
        gzip \
        libcublas-dev-11-3 \
        libcusolver-dev-11-3 \
        libcusparse-dev-11-3 \
        libxml2 \
        tree \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Miniforge 23.3.1-1 installed non-interactively (-b) into /opt/conda; the
# installer is deleted in the same layer that downloaded it.
RUN wget -P /tmp \
        "https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh" \
    && bash /tmp/Miniforge3-Linux-x86_64.sh -b -p /opt/conda \
    && rm /tmp/Miniforge3-Linux-x86_64.sh

# Put the conda binaries first on PATH (key=value form; the space-separated
# ENV form is deprecated).
ENV PATH=/opt/conda/bin:$PATH

# OpenFold source, pinned to the commit used for tagged version 2.1.0.
# git clone creates /opt/openfold itself, so no separate mkdir layer is needed.
RUN git clone https://github.com/aqlaboratory/openfold.git /opt/openfold \
    && git -C /opt/openfold checkout f434a2786b5a6b39171f358fb3470ad9f4fd2a58

WORKDIR /opt/openfold

# The git clone does this already
# COPY environment.yml /opt/openfold/environment.yml

# Install into the base environment, since the container does nothing other than run OpenFold.
# --yes makes the cache cleanup explicitly non-interactive.
RUN mamba env update -n base --file /opt/openfold/environment.yml && mamba clean --all --yes

# NOTE(review): an `export` inside RUN only affects the shell of that single RUN,
# so this line does not set LD_LIBRARY_PATH for later build steps or at runtime.
# CONDA_PREFIX is also not set by any instruction in this Dockerfile. The line is
# kept unchanged; if the conda libraries are meant to be on the loader path, an
# ENV instruction would be needed. Confirm the intent before changing it.
RUN export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}

# The git clone does this already
# COPY openfold /opt/openfold/openfold
# COPY scripts /opt/openfold/scripts
# COPY run_pretrained_openfold.py /opt/openfold/run_pretrained_openfold.py
# COPY train_openfold.py /opt/openfold/train_openfold.py
# COPY setup.py /opt/openfold/setup.py

# Overwrite the cloned scripts/prep_mmseqs_dbs.sh with this repository's modified copy.
COPY ./prep_mmseqs_dbs.mod.sh /opt/openfold/scripts/prep_mmseqs_dbs.sh

# Runs relative to WORKDIR (/opt/openfold).
RUN ./scripts/install_third_party_dependencies.sh

# from scripts/install_third_party_dependencies.sh
# RUN wget -q -P /opt/openfold/openfold/resources \
# https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
# WORKDIR /opt/openfold
# RUN python3 setup.py install
40 changes: 40 additions & 0 deletions assets/containers/openfold/prep_mmseqs_dbs.mod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Extracts every *.tar* archive found in a data directory into
# <data dir>/mmseqs_dbs and builds an indexed MMseqs2 profile database from each
# one (mmseqs tsv2exprofiledb + mmseqs createindex). Each archive is deleted
# after it has been extracted.
#
# Usage: bash prep_mmseqs_dbs.sh /path/to/data/directory /path/to/tmp/directory
set -e

DOWNLOAD_DIR="${1:?usage: prep_mmseqs_dbs.sh <data dir> <tmp dir>}"
TMP_DIR="${2:?usage: prep_mmseqs_dbs.sh <data dir> <tmp dir>}"
ROOT_DIR="${DOWNLOAD_DIR}/mmseqs_dbs"
mkdir -p "${ROOT_DIR}"

# Resolve the tmp dir to an absolute path up front: mmseqs is invoked from
# inside ROOT_DIR below, which would otherwise re-anchor a relative path there.
mkdir -p "${TMP_DIR}"
TMP_DIR="$(cd "${TMP_DIR}" && pwd)"

for f in "${DOWNLOAD_DIR}"/*.tar*
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${f}" ] || continue

    tar --extract --verbose --file="${f}" \
        --directory="${ROOT_DIR}"
    rm "${f}"

    # Strip the extensions from the file name only, so a dot in the directory
    # part of the path cannot truncate the database name.
    ARCHIVE_NAME="$(basename "${f}")"
    BASENAME="${ARCHIVE_NAME%%.*}"
    DB_NAME="${BASENAME}_db"

    # Run mmseqs from inside ROOT_DIR in a subshell so the caller's working
    # directory is left untouched; with `set -e` a failure still aborts the script.
    (
        cd "${ROOT_DIR}"
        mmseqs tsv2exprofiledb "${BASENAME}" "${DB_NAME}"
        mmseqs createindex "${DB_NAME}" "${TMP_DIR}"
    )
done
1 change: 1 addition & 0 deletions assets/data/colabfold_envdb.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://wwwuser.gwdguser.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz
1 change: 1 addition & 0 deletions assets/data/mgy.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
s3://omics-{{REGION}}/alphafold_multimer/mgy/
https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz
1 change: 1 addition & 0 deletions assets/data/openfold.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
s3://openfold/openfold_params/
1 change: 1 addition & 0 deletions assets/data/uniclust30.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz
1 change: 1 addition & 0 deletions assets/workflows/openfold/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This repository helps you set up and run OpenFold Monomer on AWS HealthOmics. At the end of the configuration, you should be able to run a full end-to-end inference.
15 changes: 15 additions & 0 deletions assets/workflows/openfold/build_containers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Builds the OpenFold container image and pushes it to Amazon ECR.
#
# Usage: build_containers.sh <aws region> <aws account id> [image tag]
#   image tag defaults to "latest".
#   Must be run from the directory that contains the openfold/ build context.

# pipefail: a failing `aws ecr get-login-password` must fail the login pipeline.
set -exo pipefail

# Fail early with a usage message; an empty region or account would otherwise
# produce a malformed registry host name.
REGION="${1:?usage: build_containers.sh <aws region> <aws account id> [image tag]}"
ACCOUNT="${2:?usage: build_containers.sh <aws region> <aws account id> [image tag]}"
TAG="${3:-latest}"

REGISTRY="${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com"
IMAGE="${REGISTRY}/openfold:${TAG}"

aws ecr get-login-password --region "${REGION}" | docker login --username AWS --password-stdin "${REGISTRY}"

# build openfold (the build context is the openfold/ directory)
docker build --platform linux/amd64 -t "${IMAGE}" openfold
docker push "${IMAGE}"
39 changes: 39 additions & 0 deletions assets/workflows/openfold/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Workflow definition for OpenFold (name, engine, entry point, parameter
# template, storage and tags).
# Parameter descriptions reflect how main.nf consumes each value; parameters
# marked "not currently used" are declared here but never read by main.nf.
name: OpenFold
description: "Predict single-chain protein structures with OpenFold"
engine: NEXTFLOW
main: main.nf
parameterTemplate:
  mmseq_db:
    optional: true
    description: "Directory holding the ColabFold environment database archive(s) (*.tar*) that are converted into MMseqs2 databases"
  uniref90:
    optional: true
    description: "UniRef90 FASTA file (not currently used by main.nf)"
  uniref30:
    optional: true
    description: "UniRef30 database directory (not currently used by main.nf)"
  mgnify:
    optional: true
    description: "MGnify clusters FASTA file (not currently used by main.nf)"
  pdb70:
    optional: true
    description: "Directory containing the pdb70 database files (prefix 'pdb70') searched with hhsearch"
  uniclust30:
    optional: true
    description: "UniClust30 archive (not currently used by main.nf)"
  bfd:
    optional: true
    description: "BFD database directory (not currently used by main.nf)"
  openfold_checkpoint:
    optional: true
    description: "Directory containing the OpenFold checkpoint file finetuning_ptm_2.pt"
  fasta_dir:
    optional: false
    description: "Directory of input sequences; every *.fasta file in it is processed"
  pdb_mmcif_files:
    optional: true
    description: "Directory of PDB mmCIF template data; any *.tar archives in it are extracted before inference"

storageCapacity: 4800
tags:
  Name: "OpenFold"
179 changes: 179 additions & 0 deletions assets/workflows/openfold/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
nextflow.enable.dsl = 2

workflow {

    // Prepare the MMseqs2 databases from the reference archives
    prepared = PrepMMseqDatabases(params.mmseq_db)

    // One channel item per *.fasta file found in the input directory
    sequences = Channel.fromPath("${params.fasta_dir}/*.fasta")

    // One alignment task per FASTA file
    aligned = PrecomputeAlignments(
        sequences,
        prepared.prepped_mmseq_db,
        params.pdb70
    )

    // Shared directory that collects the alignment results of every task;
    // created up front so the copies below have a destination
    all_alignments = file("merged_alignments")
    all_alignments.mkdirs()

    // For each task's "output" directory, copy its children into the shared
    // directory and pass the shared directory on. `last` keeps only the final
    // item, so downstream receives the directory after all copies have run.
    all_alignments_ch = aligned.alignments
        .flatMap { out_dir -> out_dir.listFiles() }
        .map { entry ->
            entry.copyTo(all_alignments)
            all_alignments
        }
        .last()

    // Single inference run over the merged alignments
    PretrainedOpenFold(
        params.fasta_dir,
        params.pdb_mmcif_files,
        all_alignments_ch,

        params.openfold_checkpoint
    )

}

// Runs the image's prep_mmseqs_dbs.sh on the staged `data_dir` and emits
// everything found in `data_dir` afterwards as `prepped_mmseq_db`.
//
// NOTE(review): this step only invokes the mmseqs database prep script;
// confirm whether the accelerator request is actually needed here.
process PrepMMseqDatabases {
    label 'openfold'
    cpus 32
    memory '128 GB'
    accelerator 1, type: 'nvidia-tesla-a10g'

    input:
    path data_dir

    output:
    path "${data_dir}/*", emit: prepped_mmseq_db

    // The tmp dir is passed as an absolute path: prep_mmseqs_dbs.sh changes
    // directory before invoking mmseqs, so a relative "prep-tmp/" would be
    // created inside the database directory and picked up by the output glob.
    script:
    """
    set -euxo pipefail
    tree .
    tree ${data_dir}
    mkdir -p prep-tmp/
    bash /opt/openfold/scripts/prep_mmseqs_dbs.sh ${data_dir} "\$PWD/prep-tmp"
    tree .
    tree ${data_dir}
    """
}

// Computes alignments for one FASTA file with OpenFold's
// precompute_alignments_mmseqs.py and emits the task's `output` directory.
//
// Inputs:
//   fasta            - a single FASTA file
//   prepped_mmseq_db - prepared MMseqs2 database directory
//   pdb70            - directory containing the pdb70 database (prefix "pdb70")
//
// NOTE(review): every task publishes a directory named "output" to the same
// publishDir; confirm that results of different FASTA files cannot overwrite
// each other there.
process PrecomputeAlignments {
    label 'openfold'
    cpus 32
    memory '128 GB'
    accelerator 1, type: 'nvidia-tesla-a10g'

    publishDir "/mnt/workflow/pubdir"

    input:
    path fasta
    path prepped_mmseq_db
    path pdb70

    output:
    path 'output', emit: alignments

    // Inputs are resolved to absolute paths before changing into /opt/openfold,
    // and every expansion is quoted so paths containing whitespace stay intact.
    script:
    """
    set -euxo pipefail
    mkdir -p ./output
    pwd
    tree .
    WORK="\$(realpath .)"
    FASTA="\$(realpath ${fasta})"
    MMSEQ_DB="\$(realpath ${prepped_mmseq_db})"
    PDB_70="\$(realpath ${pdb70})"
    pushd /opt/openfold
    pwd
    python3 /opt/openfold/scripts/precompute_alignments_mmseqs.py "\$FASTA" \
        "\$MMSEQ_DB" \
        colabfold_envdb_202108_db \
        "\$WORK/output" \
        /opt/conda/bin/mmseqs \
        --hhsearch_binary_path /opt/conda/bin/hhsearch \
        --env_db colabfold_envdb_202108_db \
        --pdb70 "\$PDB_70/pdb70"
    tree "\$WORK"
    pwd
    popd
    pwd
    """
}

// Runs OpenFold inference (run_pretrained_openfold.py, preset model_1_ptm) on
// the FASTA directory using precomputed alignments, and emits everything
// written to ./output.
//
// Inputs:
//   fasta_dir           - directory of input FASTA files
//   pdb_mmcif_files     - directory of mmCIF template data; *.tar archives in
//                         it are extracted in place before inference
//   alignment_dir       - directory of precomputed alignments
//   openfold_checkpoint - directory containing finetuning_ptm_2.pt
process PretrainedOpenFold {
    label 'openfold'
    cpus 32
    memory '128 GB'
    accelerator 1, type: 'nvidia-tesla-a10g'

    publishDir "/mnt/workflow/pubdir"

    input:
    // inputs
    path fasta_dir
    path pdb_mmcif_files
    path alignment_dir

    // ref data
    path openfold_checkpoint

    output:
    path 'output/*', emit: results

    // The extraction loop skips an unmatched "*.tar" glob: without the guard
    // the literal pattern would be handed to tar and fail the task under
    // `set -e` when the directory holds no archives.
    script:
    """
    set -euxo pipefail
    mkdir -p ./output
    tree .
    tree ${fasta_dir}
    tree ${pdb_mmcif_files}
    tree ${openfold_checkpoint}
    pushd ${pdb_mmcif_files}
    for file in *.tar; do
        [ -e "\$file" ] || continue
        tar -xf "\$file"
    done
    popd
    tree ${pdb_mmcif_files}
    python3 /opt/openfold/run_pretrained_openfold.py \
        ${fasta_dir} \
        ${pdb_mmcif_files} \
        --use_precomputed_alignments ${alignment_dir} \
        --config_preset model_1_ptm \
        --output_dir ./output \
        --model_device cuda:0 \
        --openfold_checkpoint_path ${openfold_checkpoint}/finetuning_ptm_2.pt
    tree .
    """
}
15 changes: 15 additions & 0 deletions assets/workflows/openfold/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Default values for the workflow parameters.
// {{S3_BUCKET_NAME}} is a template placeholder; presumably it is replaced with
// the reference-data bucket name at deployment time (verify against the deploy scripts).
// fasta_dir has no default here, so it has to be supplied for each run.
params {
mmseq_db = "s3://{{S3_BUCKET_NAME}}/ref-data/colabfold_envdb/"
uniref90 = "s3://{{S3_BUCKET_NAME}}/ref-data/uniref90/uniref90.fasta"
uniref30 = "s3://{{S3_BUCKET_NAME}}/ref-data/uniref30/"
mgnify = "s3://{{S3_BUCKET_NAME}}/ref-data/mgy/mgy_clusters_2022_05.fa.gz"
pdb70 = "s3://{{S3_BUCKET_NAME}}/ref-data/pdb70/"
uniclust30 = "s3://{{S3_BUCKET_NAME}}/ref-data/uniclust30/uniclust30_2018_08_hhsuite.tar.gz"
bfd = "s3://{{S3_BUCKET_NAME}}/ref-data/bfd/"
openfold_checkpoint = "s3://{{S3_BUCKET_NAME}}/ref-data/openfold/"
pdb_mmcif_files = "s3://{{S3_BUCKET_NAME}}/ref-data/pdb_mmcif/"
}

// Container assignment: every process carrying the 'openfold' label uses this image.
// {{openfold:latest}} is a template placeholder; presumably it is resolved to the
// pushed image URI at deployment time (TODO confirm).
process {
withLabel: openfold { container = "{{openfold:latest}}"}
}
Loading

0 comments on commit 93f878c

Please sign in to comment.