From 1e5173003c8d15fef95d90cba5b4579a15fe776a Mon Sep 17 00:00:00 2001 From: Brian Loyal Date: Fri, 13 Dec 2024 16:35:13 -0600 Subject: [PATCH] v2.10.0 --- CHANGELOG.md | 8 +++ README.md | 1 + assets/containers/temstapro/Dockerfile | 29 ++++++++ assets/containers/temstapro/requirements.txt | 2 + assets/data/temstapro | 1 + assets/workflows/temstapro/README.md | 5 ++ assets/workflows/temstapro/config.yaml | 18 +++++ assets/workflows/temstapro/main.nf | 69 ++++++++++++++++++++ assets/workflows/temstapro/nextflow.config | 15 +++++ 9 files changed, 148 insertions(+) create mode 100644 assets/containers/temstapro/Dockerfile create mode 100644 assets/containers/temstapro/requirements.txt create mode 100644 assets/data/temstapro create mode 100644 assets/workflows/temstapro/README.md create mode 100644 assets/workflows/temstapro/config.yaml create mode 100644 assets/workflows/temstapro/main.nf create mode 100644 assets/workflows/temstapro/nextflow.config diff --git a/CHANGELOG.md b/CHANGELOG.md index e4f26f1..086565f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 --- +## [2.10.0] - 2024-12-13 + +### 2.10.0 Added + +- Added TemStaPro prediction workflow. + +--- + ## [2.9.0] - 2024-12-13 ### 2.9.0 Added diff --git a/README.md b/README.md index a93b97c..fb5634a 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ A collection of AWS HealthOmics workflows to accelerate drug discovery. - [Generate Protein Sequence Embeddings](https://github.com/aws-samples/drug-discovery-workflows/tree/main/assets/workflows/generate-protein-seq-embeddings): From Meta. Generate ESM-2 vector embeddings for one or more protein amino acid sequences. - [NanobodyBuilder2](https://github.com/oxpig/ImmuneBuilder): From Oxford Protein Informatics Group. Predict the 3D structure of single-chain nanobodies. 
- [RFDiffusion-ProteinMPNN](https://github.com/aws-samples/drug-discovery-workflows/tree/main/assets/workflows/rfdiffusion-proteinmpnn): From the Institute for Protein Design at the University of Washington. Generate protein backbone structures and sequences given a binding target or other structural context. +- [TemStaPro](https://github.com/ievapudz/TemStaPro): From Institute of Biotechnology, Life Sciences Center, Vilnius University. Predict protein thermostability using sequence representations from a protein language model. - [ThermoMPNN](https://github.com/Kuhlman-Lab/ThermoMPNN): From the University of North Carolina School of Medicine. Predict changes in thermodynamic stability for protein point mutants. ### E2E Workflows diff --git a/assets/containers/temstapro/Dockerfile b/assets/containers/temstapro/Dockerfile new file mode 100644 index 0000000..8678bf4 --- /dev/null +++ b/assets/containers/temstapro/Dockerfile @@ -0,0 +1,29 @@ +ARG AWS_DEFAULT_REGION=us-east-1 + +FROM 763104351884.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/pytorch-inference:2.2.0-gpu-py310-cu118-ubuntu20.04-ec2 +ARG COMMIT=db001d2b2479131bad2d0ee170b75001e9fad076 + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install nano -y \ + && apt-get autoremove -y \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Download the TemStaPro source at the pinned COMMIT, unpack it into /home/TemStaPro, then delete the archive and scratch dir +RUN wget -q -P /tmp "https://github.com/ievapudz/TemStaPro/archive/${COMMIT}.zip" \ + && mkdir -p /home/TemStaPro \ + && unzip /tmp/${COMMIT}.zip -d /tmp \ + && mv /tmp/TemStaPro-${COMMIT}/* /home/TemStaPro \ + && rm -rf /tmp/TemStaPro-${COMMIT} /tmp/${COMMIT}.zip + +WORKDIR /home/TemStaPro + +COPY requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -U -r /tmp/requirements.txt \ + && rm /tmp/requirements.txt + +ENV TMPDIR="/tmp" +ENV PYTHONPATH="/home/TemStaPro:${PYTHONPATH}" + +ENTRYPOINT [] \ No newline at end of file diff --git a/assets/containers/temstapro/requirements.txt b/assets/containers/temstapro/requirements.txt new file
mode 100644 index 0000000..ceed438 --- /dev/null +++ b/assets/containers/temstapro/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.24.0 +sentencepiece==0.1.96 diff --git a/assets/data/temstapro b/assets/data/temstapro new file mode 100644 index 0000000..d46555b --- /dev/null +++ b/assets/data/temstapro @@ -0,0 +1 @@ +Rostlab/prot_t5_xl_half_uniref50-enc diff --git a/assets/workflows/temstapro/README.md b/assets/workflows/temstapro/README.md new file mode 100644 index 0000000..91e1815 --- /dev/null +++ b/assets/workflows/temstapro/README.md @@ -0,0 +1,5 @@ +# Predict protein thermostability using sequence representations from a protein language model + +## Summary + +Predict protein thermostability with [TemStaPro](https://github.com/ievapudz/TemStaPro), which uses sequence representations from the ProtT5-XL protein language model. From Institute of Biotechnology, Life Sciences Center, Vilnius University. diff --git a/assets/workflows/temstapro/config.yaml b/assets/workflows/temstapro/config.yaml new file mode 100644 index 0000000..91054d7 --- /dev/null +++ b/assets/workflows/temstapro/config.yaml @@ -0,0 +1,18 @@ +name: TemStaPro +description: "Predict protein thermostability using sequence representations from a protein language model. From Institute of Biotechnology, Life Sciences Center, Vilnius University." +engine: NEXTFLOW +main: main.nf +parameterTemplate: + fasta_path: + description: "Input file in FASTA format." + optional: false + window_size_predictions: + description: "Set the window size for average smoothing of per-residue predictions for plotting. Defaults to 81." + optional: true + portion_size: + description: "Maximum size of input sequence divisions. Set to 0 for no division. Defaults to 1000."
+ optional: true +storageCapacity: 1200 +tags: + Name: "TemStaPro" +accelerators: GPU diff --git a/assets/workflows/temstapro/main.nf b/assets/workflows/temstapro/main.nf new file mode 100644 index 0000000..d953dfa --- /dev/null +++ b/assets/workflows/temstapro/main.nf @@ -0,0 +1,69 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +workflow TemStaPro { /* Reusable sub-workflow: forwards its four inputs to TemStaProTask and emits the task output as `results` */ + take: + fasta_path + window_size_predictions + portion_size + prot_t5_params + + main: + TemStaProTask( + fasta_path, + window_size_predictions, + portion_size, + prot_t5_params + ) + + TemStaProTask.out.set { results } + + emit: + results +} + +process TemStaProTask { /* Runs the TemStaPro CLI on one FASTA file and publishes everything it writes under output/ */ + label 'temstapro' + cpus 4 + memory '16 GB' + maxRetries 1 /* NOTE(review): per the Nextflow docs maxRetries only applies with errorStrategy 'retry' - confirm one is configured */ + accelerator 1, type: 'nvidia-tesla-a10g' + publishDir "/mnt/workflow/pubdir/${workflow.sessionId}/${task.process.replace(':', '/')}/${task.index}/${task.attempt}" + + input: + path fasta_path + val window_size_predictions + val portion_size + path prot_t5_params + + output: + path 'output/*' + + script: /* staged path inputs are double-quoted so names containing spaces or glob characters reach the CLI as one argument */ + """ + set -euxo pipefail + mkdir output + /opt/conda/bin/python /home/TemStaPro/temstapro \ + --input-fasta "$fasta_path" \ + --PT-directory "$prot_t5_params" \ + --temstapro-directory '/home/TemStaPro' \ + --more-thresholds \ + --mean-output 'output/mean_output.tsv' \ + --per-res-output 'output/per_res_output.tsv' \ + --window-size-predictions $window_size_predictions \ + --per-residue-plot-dir output \ + --portion-size $portion_size + + """ +} + +workflow { /* Entry point: wires params.* into the TemStaPro sub-workflow */ + TemStaPro( + Channel.fromPath(params.fasta_path), + params.window_size_predictions, + params.portion_size, + params.prot_t5_params + ) +} + diff --git a/assets/workflows/temstapro/nextflow.config b/assets/workflows/temstapro/nextflow.config new file mode 100644 index 0000000..66c562a --- /dev/null +++ b/assets/workflows/temstapro/nextflow.config @@ -0,0 +1,15 @@ +params { + window_size_predictions = 81 + portion_size = 1000 + + prot_t5_params = "s3://{{S3_BUCKET_NAME}}/ref-data/temstapro/Rostlab/prot_t5_xl_half_uniref50-enc/"
+} + +process { /* Assigns the container image to every process carrying the 'temstapro' label */ + withLabel: temstapro { container = "{{temstapro:latest}}" } /* NOTE(review): "{{temstapro:latest}}" looks like a template placeholder resolved outside this file - confirm */ +} + +docker { /* Runs tasks in Docker; runOptions holds extra `docker run` flags (here: expose all GPUs) */ + enabled = true + runOptions = "--gpus all" +}