load_trained_molgan_model

PolusAI · Dec 18, 2024 · 467ed6e · 467ed6e
1 parent 92d2d65
commit 467ed6e
Show file tree

Hide file tree

Showing 13 changed files with 429 additions and 0 deletions.
diff --git a/utils/load-trained-molgan-model-plugin/.bumpversion.cfg b/utils/load-trained-molgan-model-plugin/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/load_trained_molgan_model/__init__.py]
diff --git a/utils/load-trained-molgan-model-plugin/.dockerignore b/utils/load-trained-molgan-model-plugin/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/load-trained-molgan-model-plugin/.gitignore b/utils/load-trained-molgan-model-plugin/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/load-trained-molgan-model-plugin/CHANGELOG.md b/utils/load-trained-molgan-model-plugin/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/load-trained-molgan-model-plugin/Dockerfile b/utils/load-trained-molgan-model-plugin/Dockerfile
@@ -0,0 +1,38 @@
+# docker build -f Dockerfile -t polusai/molgan-tool:0.1.0 .
+FROM condaforge/mambaforge
+# NOT mambaforge-pypy3 (rdkit is incompatible with pypy). TODO: pin a tag or digest for reproducible builds.
+
+# Silence RDKit error logging by default
+ENV RDKIT_ERROR_LOGGING="OFF"
+
+RUN apt-get update && apt-get install -y --no-install-recommends git wget && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.10 using Mamba
+RUN mamba install -y python=3.10 && mamba clean --all --yes
+
+# Clone MolGAN
+RUN git clone --depth 1 https://github.com/ndonyapour/MolGAN.git
+
+# Install the Python dependencies of MolGAN
+# MolGAN was initially implemented using TensorFlow v1, and TensorFlow version 2 offers support
+# for v1 functionalities. However, it's important to mention that the current patch for upgrading
+# to v2 does not truly port the v1 API to the v2 API; it calls the legacy v1 API from the v2 package
+# via "tf.compat.v1". Essentially, it is still v1. Truly upgrading to v2 would require rewriting most
+# functions of MolGAN, including model creation, data processing, and training.
+
+RUN mamba install -y -c conda-forge rdkit "tensorflow<2.13" numpy scikit-learn xorg-libxrender && mamba clean --all --yes
+
+# Fail the build early if rdkit cannot be imported
+RUN python -c "import rdkit"
+
+# Run the remaining steps from inside the MolGAN checkout
+WORKDIR /MolGAN
+
+# Download the gdb9 database
+RUN bash data/download_dataset.sh data/gdb9.sdf data/NP_score.pkl.gz data/SA_score.pkl.gz
+
+# Download the pretrained model; unpack, rename and drop the archive in one layer so it does not bloat the image
+RUN wget -nv --no-clobber https://huggingface.co/ndonyapour/MolGAN/resolve/main/MolGAN_model.tar.gz && tar xvzf MolGAN_model.tar.gz && \
+    mv MolGAN_model trained_models && rm MolGAN_model.tar.gz
+RUN wget -nv --no-clobber https://huggingface.co/ndonyapour/MolGAN/resolve/main/data.pkl -O data/data.pkl
+COPY Dockerfile .
diff --git a/utils/load-trained-molgan-model-plugin/README.md b/utils/load-trained-molgan-model-plugin/README.md
@@ -0,0 +1,20 @@
+# load_trained_molgan_model (0.1.0)
+
+MolGAN tool for generating small molecules
+
+## Options
+
+This plugin takes 8 input arguments and 2 output arguments:
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| input_data_path | Path to the input data file, Type: string, File type: input, Accepted formats: pkl, Example file: https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl | Input | string | string |
+| input_NP_Score_path | Output ceout file (AMBER ceout), Type: string, File type: input, Accepted formats: gz, Example file: https://github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz | Input | string | string |
+| input_SA_Score_path | Output ceout file (AMBER ceout), Type: string, File type: input, Accepted formats: gz, Example file: https://github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz | Input | string | string |
+| input_model_dir | Input directory of trained models | Input | string | string |
+| output_log_path | Path to the log file, Type: string, File type: output, Accepted formats: log | Input | string | string |
+| output_sdf_path | Path to the output file, Type: string, File type: output, Accepted formats: sdf | Input | string | string |
+| num_samples | The number of new molecules to generate, Type: int | Input | int | int |
+| rdkit_error_logging | Enable or disable RDKit error logging | Input | string | string |
+| output_log_path | Path to the log file | Output | File | File |
+| output_sdf_path | Path to the output file | Output | File | File |
diff --git a/utils/load-trained-molgan-model-plugin/VERSION b/utils/load-trained-molgan-model-plugin/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/load-trained-molgan-model-plugin/build-docker.sh b/utils/load-trained-molgan-model-plugin/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -euo pipefail  # stop if VERSION is unreadable rather than building a malformed tag
+version=$(<VERSION)  # image tag is read from ./VERSION in the current directory
+docker build . -t "polusai/load-trained-molgan-model-tool:${version}"
diff --git a/utils/load-trained-molgan-model-plugin/ict.yml b/utils/load-trained-molgan-model-plugin/ict.yml
@@ -0,0 +1,114 @@
+specVersion: "0.1.0"
+name: load_trained_molgan_model
+version: 0.1.0
+container: load-trained-molgan-model-plugin
+entrypoint:
+title: load_trained_molgan_model
+description: MolGAN tool for generating small molecules
+author: Data Scientist
+contact: [email protected]
+repository:
+documentation:
+citation:
+
+inputs:
+  - name: input_data_path
+    required: true
+    description: Path to the input data file, Type string, File type input, Accepted formats pkl, Example file https//github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl
+    type: string
+    defaultValue: system.pkl
+    format:
+      uri: edam:format_3653
+  - name: input_NP_Score_path
+    required: true
+    description: Output ceout file (AMBER ceout), Type string, File type input, Accepted formats gz, Example file https//github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz
+    type: string
+    defaultValue: NP.gz
+    format:
+      uri: edam:format_3987
+  - name: input_SA_Score_path
+    required: true
+    description: Output ceout file (AMBER ceout), Type string, File type input, Accepted formats gz, Example file https//github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz
+    type: string
+    defaultValue: SA.gz
+    format:
+      uri: edam:format_3987
+  - name: input_model_dir
+    required: true
+    description: "Input directory of trained models"
+    type: string
+    defaultValue: output
+    format:
+      uri: edam:format_2330
+  - name: output_log_path
+    required: true
+    description: Path to the log file, Type string, File type output, Accepted formats log
+    type: string
+    defaultValue: system.log
+    format:
+      uri: edam:format_2330
+  - name: output_sdf_path
+    required: true
+    description: Path to the output file, Type string, File type output, Accepted formats sdf
+    type: string
+    defaultValue: system.sdf
+    format:
+      uri: edam:format_3814
+  - name: num_samples
+    required: true
+    description: The number of new molecules to generate, Type int
+    type: int
+    defaultValue: 1000
+    format:
+      uri: edam:format_2330
+  - name: rdkit_error_logging
+    required: true
+    description: Enable or disable RDKit error logging
+    type: string
+    defaultValue: "ON"  # quoted: a bare ON is read as boolean true by YAML 1.1 loaders
+outputs:
+  - name: output_log_path
+    required: true
+    description: Path to the log file
+    type: File
+    format:
+      uri: edam:format_2330
+  - name: output_sdf_path
+    required: true
+    description: Path to the output file
+    type: File
+    format:
+      uri: edam:format_3814
+ui:
+  - key: inputs.input_data_path
+    title: "input_data_path: "
+    description: "Path to the input data file, Type string, File type input, Accepted formats pkl, Example file https//github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl"
+    type: string
+  - key: inputs.input_NP_Score_path
+    title: "input_NP_Score_path: "
+    description: "Output ceout file (AMBER ceout), Type string, File type input, Accepted formats gz, Example file https//github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz"
+    type: string
+  - key: inputs.input_SA_Score_path
+    title: "input_SA_Score_path: "
+    description: "Output ceout file (AMBER ceout), Type string, File type input, Accepted formats gz, Example file https//github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz"
+    type: string
+  - key: inputs.input_model_dir
+    title: "input_model_dir: "
+    description: "Input directory of trained models"
+    type: string
+  - key: inputs.output_log_path
+    title: "output_log_path: "
+    description: "Path to the log file, Type string, File type output, Accepted formats log"
+    type: string
+  - key: inputs.output_sdf_path
+    title: "output_sdf_path: "
+    description: "Path to the output file, Type string, File type output, Accepted formats sdf"
+    type: string
+  - key: inputs.num_samples
+    title: "num_samples: "
+    description: "The number of new molecules to generate, Type int"
+    type: int
+  - key: inputs.rdkit_error_logging
+    title: "rdkit_error_logging: "
+    description: "Enable or disable RDKit error logging"
+    type: string
diff --git a/utils/load-trained-molgan-model-plugin/load_trained_molgan_model_0@[email protected] b/utils/load-trained-molgan-model-plugin/load_trained_molgan_model_0@[email protected]
@@ -0,0 +1,149 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: MolGAN tool for generating small molecules
+
+baseCommand: ["python", "/MolGAN/run_trained_model.py"]
+
+hints:
+  DockerRequirement:
+    dockerPull: polusai/molgan-tool@sha256:e008e74170be12dcf50a936a417b8c330ccdebf7fe17abaa8fa2689dac210725
+
+# Set environment variables for the tool,
+# See: https://www.commonwl.org/user_guide/topics/environment-variables.html
+requirements:
+  EnvVarRequirement:
+    envDef:
+      RDKIT_ERROR_LOGGING: $(inputs.rdkit_error_logging)
+inputs:
+  input_data_path:
+    label: Path to the input data file
+    doc: |-
+      Path to the input data file
+      Type: string
+      File type: input
+      Accepted formats: pkl
+      Example file: https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl
+    type: string
+    format: edam:format_3653
+    inputBinding:
+      prefix: --input_data_path
+    default: system.pkl
+
+  input_NP_Score_path:
+    label: Output ceout file (AMBER ceout)
+    doc: |-
+      Output ceout file (AMBER ceout)
+      Type: string
+      File type: input
+      Accepted formats: gz
+      Example file: https://github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz
+    type: string
+    format: edam:format_3987
+    default: NP.gz
+    inputBinding:
+      prefix: --input_NP_Score_path
+
+  input_SA_Score_path:
+    label: Output ceout file (AMBER ceout)
+    doc: |-
+      Output ceout file (AMBER ceout)
+      Type: string
+      File type: input
+      Accepted formats: gz
+      Example file: https://github.com/bioexcel/biobb_amber/raw/master/biobb_amber/test/data/cphstats/sander.ceout.gz
+    type: string
+    format: edam:format_3987
+    default: SA.gz
+    inputBinding:
+      prefix: --input_SA_Score_path
+
+  input_model_dir:
+    label: Input directory of trained models
+    doc: |-
+      Input directory of trained models
+    type: string
+    format: edam:format_2330 # 'Textual format'
+    inputBinding:
+      prefix: --input_model_dir
+    default: output
+
+  output_log_path:
+    label: Path to the log file
+    doc: |-
+      Path to the log file
+      Type: string
+      File type: output
+      Accepted formats: log
+    type: string
+    format: edam:format_2330
+    inputBinding:
+      prefix: --output_log_path
+    default: system.log
+
+  output_sdf_path:
+    label: Path to the output file
+    doc: |-
+      Path to the output file
+      Type: string
+      File type: output
+      Accepted formats: sdf
+    type: string
+    format: edam:format_3814 # sdf
+    default: system.sdf
+    inputBinding:
+      prefix: --output_sdf_path
+
+  num_samples:
+    label: The number of new molecules to generate
+    doc: |-
+      The number of new molecules to generate
+      Type: int
+    type: int?
+    format: edam:format_2330
+    inputBinding:
+      position: 7
+      prefix: --num_samples
+    default: 1000
+
+  rdkit_error_logging:
+    label: Enable or disable RDKit error logging
+    doc: |-
+      Enable or disable RDKit error logging
+    type: string?
+    # RDKit prints out all errors by default, which can pose issues for CI,
+    # particularly with large databases. It would be more efficient to suppress these errors.
+    default: "ON"
+outputs:
+  output_log_path:
+    label: Path to the log file
+    doc: |-
+      Path to the log file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_log_path)
+    format: edam:format_2330
+
+  output_sdf_path:
+    label: Path to the output file
+    doc: |-
+      Path to the output file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_sdf_path)
+    format: edam:format_3814 # sdf
+
+  stderr:
+    type: File
+    outputBinding:
+      glob: stderr
+
+stderr: stderr
+
+$namespaces:
+  edam: https://edamontology.org/
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl