From 2be9a54f87fa650d3a88908fd47bd11ec4327ae9 Mon Sep 17 00:00:00 2001 From: Zhiyuan Chen Date: Tue, 8 Oct 2024 21:56:48 +0800 Subject: [PATCH] add EternaBench dataset Signed-off-by: Zhiyuan Chen --- docs/docs/datasets/eternabench.md | 9 ++ docs/mkdocs.yml | 1 + .../datasets/eternabench_cm/README.md | 110 ++++++++++++++ .../datasets/eternabench_cm/eternabench_cm.py | 63 ++++++++ .../datasets/eternabench_switch/README.md | 139 ++++++++++++++++++ .../eternabench_switch/eternabench_switch.py | 94 ++++++++++++ 6 files changed, 416 insertions(+) create mode 100644 docs/docs/datasets/eternabench.md create mode 100644 multimolecule/datasets/eternabench_cm/README.md create mode 100644 multimolecule/datasets/eternabench_cm/eternabench_cm.py create mode 100644 multimolecule/datasets/eternabench_switch/README.md create mode 100644 multimolecule/datasets/eternabench_switch/eternabench_switch.py diff --git a/docs/docs/datasets/eternabench.md b/docs/docs/datasets/eternabench.md new file mode 100644 index 00000000..833d4b2f --- /dev/null +++ b/docs/docs/datasets/eternabench.md @@ -0,0 +1,9 @@ +--- +authors: + - Zhiyuan Chen +date: 2024-05-04 +--- + +# EternaBench + +--8<-- "multimolecule/datasets/eternabench/README.md:21:" diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 5cef7518..e6bd4a30 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -23,6 +23,7 @@ nav: - bpRNA-spot: datasets/bprna-spot.md - bpRNA-new: datasets/bprna-new.md - RYOS: datasets/ryos.md + - EternaBench: datasets/eternabench.md - module: - module/index.md - heads: module/heads.md diff --git a/multimolecule/datasets/eternabench_cm/README.md b/multimolecule/datasets/eternabench_cm/README.md new file mode 100644 index 00000000..7cc71faa --- /dev/null +++ b/multimolecule/datasets/eternabench_cm/README.md @@ -0,0 +1,110 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K. + +from __future__ import annotations + +import os + +import danling as dl +import pandas as pd +import torch + +from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_ +from multimolecule.datasets.conversion_utils import save_dataset + +torch.manual_seed(1016) + +cols = [ + "id", + "design", + "sequence", + "secondary_structure", + "reactivity", + "errors", + "signal_to_noise", +] + + +def convert_dataset_(df: pd.DataFrame): + df.signal_to_noise = df.signal_to_noise.str.split(":").str[-1].astype(float) + df = df.rename(columns={"ID": "id", "design_name": "design", "structure": "secondary_structure"}) + df = df.sort_values("id") + df = df[cols] + return df + + +def convert_dataset(convert_config): + train = dl.load_pandas(convert_config.train_path) + test = dl.load_pandas(convert_config.test_path) + save_dataset(convert_config, {"train": convert_dataset_(train), "test": convert_dataset_(test)}) + + +class ConvertConfig(ConvertConfig_): + root: str = os.path.dirname(__file__) + output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-") + + +if __name__ == "__main__": + config = ConvertConfig() + config.parse() # type: ignore[attr-defined] + convert_dataset(config) diff --git a/multimolecule/datasets/eternabench_switch/README.md b/multimolecule/datasets/eternabench_switch/README.md new file mode 100644 index 00000000..eb3def04 --- /dev/null +++ b/multimolecule/datasets/eternabench_switch/README.md @@ -0,0 +1,139 @@ +--- +language: rna +tags: + - Biology + - RNA +license: + - agpl-3.0 +size_categories: + - 1K. + +from __future__ import annotations + +import os + +import danling as dl +import pandas as pd +import torch + +from multimolecule.datasets.conversion_utils import ConvertConfig as ConvertConfig_ +from multimolecule.datasets.conversion_utils import save_dataset + +torch.manual_seed(1016) + +cols = [ + "id", + "design", + "sequence", + "activation_ratio", + "ligand", + "switch", + "kd_off", + "kd_on", + "kd_fmn", + "kd_no_fmn", + "min_kd_val", + "ms2_aptamer", + "lig_aptamer", + "ms2_lig_aptamer", + "log_kd_nolig", + "log_kd_lig", + "log_kd_nolig_scaled", + "log_kd_lig_scaled", + "log_AR", + "folding_subscore", + "num_clusters", +] + + +def convert_dataset_(df: pd.DataFrame): + df = df.rename( + columns={ + "index": "id", + "Design": "design", + "Activation Ratio": "activation_ratio", + "Folding_Subscore": "folding_subscore", + "KDOFF": "kd_off", + "KDON": "kd_on", + "KDFMN": "kd_fmn", + "KDnoFMN": "kd_no_fmn", + "NumberOfClusters": "num_clusters", + "logkd_nolig": "log_kd_nolig", + "logkd_lig": "log_kd_lig", + "logkd_nolig_scaled": "log_kd_nolig_scaled", + "logkd_lig_scaled": "log_kd_lig_scaled", + "MS2_aptamer": "ms2_aptamer", + "MS2_lig_aptamer": "ms2_lig_aptamer", + } + ) + df = df.sort_values("id") + df = df[cols] + return df + + +def convert_dataset(convert_config): + train = dl.load_pandas(convert_config.train_path) + test = dl.load_pandas(convert_config.test_path) + save_dataset(convert_config, {"train": convert_dataset_(train), "test": convert_dataset_(test)}) + + +class ConvertConfig(ConvertConfig_): + root: str = os.path.dirname(__file__) + output_path: str = os.path.basename(os.path.dirname(__file__)).replace("_", "-") + + +if __name__ == "__main__": + config = ConvertConfig() + config.parse() # type: ignore[attr-defined] + convert_dataset(config)