Skip to content

Commit

Permalink
load_trained_molgan_model
Browse files Browse the repository at this point in the history
  • Loading branch information
ndonyapour committed Dec 18, 2024
1 parent 92d2d65 commit 467ed6e
Show file tree
Hide file tree
Showing 13 changed files with 429 additions and 0 deletions.
29 changes: 29 additions & 0 deletions utils/load-trained-molgan-model-plugin/.bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Version-bump configuration for the load_trained_molgan_model plugin.
[bumpversion]
current_version = 0.1.0
commit = False
tag = False
# Accepts "X.Y.Z" with an optional "-<release><dev>" suffix, e.g. 0.1.0-dev3.
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
# Continuation lines must be indented to belong to the key above them.
serialize =
    {major}.{minor}.{patch}-{release}{dev}
    {major}.{minor}.{patch}

# The release part takes the values "dev" and "_", starting at "dev";
# "_" is declared as its optional value.
[bumpversion:part:release]
optional_value = _
first_value = dev
values =
    dev
    _

[bumpversion:part:dev]

# Files rewritten on each bump. Only pyproject.toml declares an explicit
# search/replace pattern; the other sections carry no options.
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"

[bumpversion:file:VERSION]

[bumpversion:file:README.md]

[bumpversion:file:plugin.json]

[bumpversion:file:src/polus/mm/utils/load_trained_molgan_model/__init__.py]
4 changes: 4 additions & 0 deletions utils/load-trained-molgan-model-plugin/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Paths left out of the Docker build context.
.venv
out
tests
__pycache__
1 change: 1 addition & 0 deletions utils/load-trained-molgan-model-plugin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# The Poetry lock file is not tracked in version control.
poetry.lock
5 changes: 5 additions & 0 deletions utils/load-trained-molgan-model-plugin/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## 0.1.0

Initial release.
38 changes: 38 additions & 0 deletions utils/load-trained-molgan-model-plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Image bundling the MolGAN checkout, its conda dependencies, the gdb9 dataset
# and a pretrained model.
# docker build -f Dockerfile -t polusai/molgan-tool:0.1.0 .
# NOTE(review): the base image tag is not pinned, so rebuilds may pick up a
# different base over time - consider pinning a tag or digest.
FROM condaforge/mambaforge
# NOT mambaforge-pypy3 (rdkit is incompatible with pypy)

# RDKIT logging
ENV RDKIT_ERROR_LOGGING="OFF"

# wget fetches the pretrained model and data below; git clones MolGAN.
# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update \
    && apt-get install -y wget git \
    && rm -rf /var/lib/apt/lists/*

# Install Python 3.10 using Mamba
RUN mamba install -y python=3.10

# Clone MolGAN
RUN git clone https://github.com/ndonyapour/MolGAN.git

# Install the Python dependencies.
# MolGAN was initially implemented using TensorFlow v1, and TensorFlow v2 offers
# support for v1 functionality. However, the current patch for upgrading to v2
# does not truly port the v1 API to the v2 API; it calls the legacy v1 API from
# the v2 package via "tf.compat.v1", so it is essentially still v1. Truly
# upgrading to v2 would require rewriting most functions of MolGAN, including
# model creation, data processing, and training.
# -y keeps the install non-interactive, matching the Python install above.
RUN mamba install -y -c conda-forge rdkit "tensorflow<2.13" numpy scikit-learn xorg-libxrender

# Fail the build early if rdkit cannot be imported
RUN python -c "import rdkit"

# Work from the MolGAN checkout for the remaining steps
WORKDIR /MolGAN

# Download the gdb9 database
RUN bash data/download_dataset.sh data/gdb9.sdf data/NP_score.pkl.gz data/SA_score.pkl.gz

# Download the pretrained model
RUN wget -nv --no-clobber https://huggingface.co/ndonyapour/MolGAN/resolve/main/MolGAN_model.tar.gz && tar xvzf MolGAN_model.tar.gz
RUN mv MolGAN_model trained_models
RUN wget -nv --no-clobber https://huggingface.co/ndonyapour/MolGAN/resolve/main/data.pkl -O data/data.pkl
# Keep a copy of this Dockerfile inside the image. COPY is the recommended
# instruction for plain local files (ADD adds URL/archive handling not needed here).
COPY Dockerfile .
20 changes: 20 additions & 0 deletions utils/load-trained-molgan-model-plugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# load_trained_molgan_model (0.1.0)

MolGAN tool for generating small molecules

## Options

This plugin takes 8 input arguments and 2 output arguments:

| Name | Description | I/O | Type | Default |
|---------------|-------------------------|--------|--------|---------|
| input_data_path | Path to the input data file, Type: string, File type: input, Accepted formats: pkl, Example file: https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl | Input | string | string |
| input_NP_Score_path | Path to the NP score file, Type: string, File type: input, Accepted formats: gz, Example file: data/NP_score.pkl.gz | Input | string | string |
| input_SA_Score_path | Path to the SA score file, Type: string, File type: input, Accepted formats: gz, Example file: data/SA_score.pkl.gz | Input | string | string |
| input_model_dir | Input directory of trained models | Input | string | string |
| output_log_path | Path to the log file, Type: string, File type: output, Accepted formats: log | Input | string | string |
| output_sdf_path | Path to the output file, Type: string, File type: output, Accepted formats: sdf | Input | string | string |
| num_samples | The number of new molecules to generate, Type: int | Input | int | int |
| rdkit_error_logging | Enable or disable RDKit error logging | Input | string | string |
| output_log_path | Path to the log file | Output | File | File |
| output_sdf_path | Path to the output file | Output | File | File |
1 change: 1 addition & 0 deletions utils/load-trained-molgan-model-plugin/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
4 changes: 4 additions & 0 deletions utils/load-trained-molgan-model-plugin/build-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Build the plugin image, tagged with the version stored in the VERSION file.
# Run from the plugin directory: both VERSION and the build context (.) are
# resolved relative to the current working directory.

# Stop on the first error (for example a missing VERSION file) instead of
# attempting a build with an empty tag.
set -euo pipefail

version=$(<VERSION)
docker build . -t "polusai/load-trained-molgan-model-tool:${version}"
114 changes: 114 additions & 0 deletions utils/load-trained-molgan-model-plugin/ict.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Plugin manifest for the load_trained_molgan_model tool.
specVersion: "0.1.0"
name: load_trained_molgan_model
version: 0.1.0
# NOTE(review): build-docker.sh tags the image
# polusai/load-trained-molgan-model-tool - confirm this container name.
container: load-trained-molgan-model-plugin
# No value is given here; YAML loads a bare key as null.
entrypoint:
title: load_trained_molgan_model
description: MolGAN tool for generating small molecules
author: Data Scientist
# NOTE(review): the address appears as a redaction placeholder in this view,
# and a value starting with "[" is read by YAML as a flow sequence - restore
# the real e-mail address and quote it.
contact: [email protected]
# repository, documentation and citation are likewise left without values.
repository:
documentation:
citation:

# Command-line inputs of the tool. The output_*_path entries here are plain
# string paths; the files themselves are declared under `outputs`.
inputs:
  - name: input_data_path
    required: true
    description: Path to the input data file, Type string, File type input, Accepted formats pkl, Example file https//github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl
    type: string
    defaultValue: system.pkl
    format:
      uri: edam:format_3653
  - name: input_NP_Score_path
    required: true
    description: Path to the NP score file, Type string, File type input, Accepted formats gz, Example file data/NP_score.pkl.gz
    type: string
    defaultValue: NP.gz
    format:
      uri: edam:format_3987
  - name: input_SA_Score_path
    required: true
    description: Path to the SA score file, Type string, File type input, Accepted formats gz, Example file data/SA_score.pkl.gz
    type: string
    defaultValue: SA.gz
    format:
      uri: edam:format_3987
  - name: input_model_dir
    required: true
    description: "Input directory of trained models"
    type: string
    defaultValue: output
    format:
      uri: edam:format_2330
  - name: output_log_path
    required: true
    description: Path to the log file, Type string, File type output, Accepted formats log
    type: string
    defaultValue: system.log
    format:
      uri: edam:format_2330
  - name: output_sdf_path
    required: true
    description: Path to the output file, Type string, File type output, Accepted formats sdf
    type: string
    defaultValue: system.sdf
    format:
      uri: edam:format_3814
  - name: num_samples
    required: true
    description: The number of new molecules to generate, Type int
    type: int
    defaultValue: 1000
    format:
      uri: edam:format_2330
  - name: rdkit_error_logging
    required: true
    description: Enable or disable RDKit error logging
    type: string
    # Quoted on purpose: a bare ON is read as boolean true by YAML 1.1
    # parsers, while this input is declared as a string.
    defaultValue: "ON"
# Files produced by the tool: a log file and an SDF file.
outputs:
  - name: output_log_path
    required: true
    description: "Path to the log file"
    type: File
    format:
      uri: "edam:format_2330"
  - name: output_sdf_path
    required: true
    description: "Path to the output file"
    type: File
    format:
      uri: "edam:format_3814"
# UI form fields, one per entry under `inputs` (matched through `key`).
ui:
  - key: inputs.input_data_path
    title: "input_data_path: "
    description: "Path to the input data file, Type string, File type input, Accepted formats pkl, Example file https//github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl"
    type: string
  - key: inputs.input_NP_Score_path
    title: "input_NP_Score_path: "
    description: "Path to the NP score file, Type string, File type input, Accepted formats gz, Example file data/NP_score.pkl.gz"
    type: string
  - key: inputs.input_SA_Score_path
    title: "input_SA_Score_path: "
    description: "Path to the SA score file, Type string, File type input, Accepted formats gz, Example file data/SA_score.pkl.gz"
    type: string
  - key: inputs.input_model_dir
    title: "input_model_dir: "
    description: "Input directory of trained models"
    type: string
  - key: inputs.output_log_path
    title: "output_log_path: "
    description: "Path to the log file, Type string, File type output, Accepted formats log"
    type: string
  - key: inputs.output_sdf_path
    title: "output_sdf_path: "
    description: "Path to the output file, Type string, File type output, Accepted formats sdf"
    type: string
  - key: inputs.num_samples
    title: "num_samples: "
    description: "The number of new molecules to generate, Type int"
    type: int
  - key: inputs.rdkit_error_logging
    title: "rdkit_error_logging: "
    description: "Enable or disable RDKit error logging"
    type: string
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env cwl-runner
cwlVersion: v1.0

class: CommandLineTool

label: MolGAN tool for generating small molecules

# Interpreter followed by the script path inside the container.
baseCommand:
  - python
  - /MolGAN/run_trained_model.py

hints:
  DockerRequirement:
    # The image is referenced by digest rather than by tag.
    dockerPull: polusai/molgan-tool@sha256:e008e74170be12dcf50a936a417b8c330ccdebf7fe17abaa8fa2689dac210725

# Export the rdkit_error_logging input to the tool as an environment variable.
# See: https://www.commonwl.org/user_guide/topics/environment-variables.html
requirements:
  EnvVarRequirement:
    envDef:
      RDKIT_ERROR_LOGGING: $(inputs.rdkit_error_logging)
# Tool inputs. Each entry with an inputBinding is passed on the command line
# as "<prefix> <value>"; rdkit_error_logging has no inputBinding and is only
# referenced by expression (see EnvVarRequirement).
inputs:
  input_data_path:
    label: Path to the input data file
    doc: |-
      Path to the input data file
      Type: string
      File type: input
      Accepted formats: pkl
      Example file: https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl
    type: string
    format: edam:format_3653
    inputBinding:
      prefix: --input_data_path
    default: system.pkl

  input_NP_Score_path:
    label: Path to the NP score file
    doc: |-
      Path to the NP score file
      Type: string
      File type: input
      Accepted formats: gz
      Example file: data/NP_score.pkl.gz
    type: string
    format: edam:format_3987
    default: NP.gz
    inputBinding:
      prefix: --input_NP_Score_path

  input_SA_Score_path:
    label: Path to the SA score file
    doc: |-
      Path to the SA score file
      Type: string
      File type: input
      Accepted formats: gz
      Example file: data/SA_score.pkl.gz
    type: string
    format: edam:format_3987
    default: SA.gz
    inputBinding:
      prefix: --input_SA_Score_path

  input_model_dir:
    label: Input directory of trained models
    doc: |-
      Input directory of trained models
    type: string
    format: edam:format_2330  # 'Textual format'
    inputBinding:
      prefix: --input_model_dir
    default: output

  # The two output_*_path inputs carry the destination paths as strings; the
  # `outputs` section globs for the same values to collect the files.
  output_log_path:
    label: Path to the log file
    doc: |-
      Path to the log file
      Type: string
      File type: output
      Accepted formats: log
    type: string
    format: edam:format_2330
    inputBinding:
      prefix: --output_log_path
    default: system.log

  output_sdf_path:
    label: Path to the output file
    doc: |-
      Path to the output file
      Type: string
      File type: output
      Accepted formats: sdf
    type: string
    format: edam:format_3814  # sdf
    default: system.sdf
    inputBinding:
      prefix: --output_sdf_path

  num_samples:
    label: The number of new molecules to generate
    doc: |-
      The number of new molecules to generate
      Type: int
    type: int?
    format: edam:format_2330
    inputBinding:
      position: 7
      prefix: --num_samples
    default: 1000

  rdkit_error_logging:
    label: Enable or disable RDKit error logging
    doc: |-
      Enable or disable RDKit error logging
    type: string?
    # RDKit prints out all errors by default, which can pose issues for CI,
    # particularly with large databases. It would be more efficient to suppress these errors.
    # Quoted so the value stays the string "ON" instead of a YAML 1.1 boolean.
    default: "ON"
# Tool outputs. The log and SDF files are located by globbing for the paths
# given in the matching string inputs; stderr is picked up by file name.
outputs:
  output_log_path:
    label: Path to the log file
    doc: Path to the log file
    type: File
    outputBinding:
      glob: $(inputs.output_log_path)
    format: edam:format_2330

  output_sdf_path:
    label: Path to the output file
    doc: Path to the output file
    type: File
    outputBinding:
      glob: $(inputs.output_sdf_path)
    format: edam:format_3814  # sdf

  stderr:
    type: File
    outputBinding:
      glob: stderr

# Redirect the tool's standard error into a file named "stderr".
stderr: stderr

# Prefix used by the `format: edam:...` fields in this document.
$namespaces:
  edam: https://edamontology.org/

$schemas:
  - https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
Loading

0 comments on commit 467ed6e

Please sign in to comment.