DNM

Signed-off-by: Zhiyuan Chen <[email protected]>
DLS5-Omics · Mar 26, 2024 · 9c53e8e · 9c53e8e
1 parent aded8a3
commit 9c53e8e
Show file tree

Hide file tree

Showing 12 changed files with 778 additions and 2 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -57,8 +57,8 @@ repos:
       - id: check-merge-conflict
       - id: check-vcs-permalinks
       - id: check-symlinks
-      - id: pretty-format-json
-        files: multimolecule
+      # - id: pretty-format-json
+      #   files: multimolecule
       - id: check-json
       - id: check-xml
       - id: check-toml

diff --git a/multimolecule/__init__.py b/multimolecule/__init__.py
diff --git a/multimolecule/models/__init__.py b/multimolecule/models/__init__.py
@@ -0,0 +1,3 @@
+from .rnabert import RnaBertConfig, RnaBertModel, RnaBertTokenizer
+
+__all__ = ["RnaBertConfig", "RnaBertModel", "RnaBertTokenizer"]
diff --git a/multimolecule/models/rnabert/__init__.py b/multimolecule/models/rnabert/__init__.py
@@ -0,0 +1,5 @@
+from .configuration_rnabert import RnaBertConfig
+from .modeling_rnabert import RnaBertModel
+from .tokenization_rnabert import RnaBertTokenizer
+
+__all__ = ["RnaBertConfig", "RnaBertModel", "RnaBertTokenizer"]
diff --git a/multimolecule/models/rnabert/config.json b/multimolecule/models/rnabert/config.json
@@ -0,0 +1,25 @@
+{
+  "architectures": ["RnaBertModel"],
+  "attention_probs_dropout_prob": 0.0,
+  "emb_layer_norm_before": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 120,
+  "initializer_range": 0.02,
+  "intermediate_size": 40,
+  "layer_norm_eps": 1e-12,
+  "mask_token_id": null,
+  "max_position_embeddings": 440,
+  "model_type": "rnabert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "position_embedding_type": "absolute",
+  "ss_size": 8,
+  "token_dropout": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.39.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_list": ["<pad>", "<mask>", "A", "T", "G", "C"],
+  "vocab_size": 6
+}
diff --git a/multimolecule/models/rnabert/configuration_rnabert.py b/multimolecule/models/rnabert/configuration_rnabert.py
@@ -0,0 +1,106 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class RnaBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RnaBertModel`]. It is used to instantiate a
+    RnaBert model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the RnaBert
+    [mana438/RNABERT](https://github.com/mana438/RNABERT/blob/master/RNA_bert_config.json) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*):
+            Vocabulary size of the RnaBert model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RnaBertModel`].
+        mask_token_id (`int`, *optional*):
+            The index of the mask token in the vocabulary. This must be included in the config because of the
+            "mask-dropout" scaling trick, which will scale the inputs depending on the number of masked tokens.
+            Forwarded to the [`PretrainedConfig`] constructor.
+        pad_token_id (`int`, *optional*):
+            The index of the padding token in the vocabulary. This must be included in the config because certain parts
+            of the RnaBert code use this instead of the attention mask. Forwarded to the [`PretrainedConfig`]
+            constructor.
+        hidden_size (`int`, *optional*):
+            Dimensionality of the encoder layers and the pooler layer. When omitted, it is set to
+            `num_attention_heads * multiple` if `multiple` is given, and to 120 otherwise.
+        multiple (`int`, *optional*):
+            Only consulted when `hidden_size` is omitted, in which case `hidden_size` becomes
+            `num_attention_heads * multiple`. The value itself is not stored on the configuration.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 40):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 440):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        emb_layer_norm_before (`bool`, *optional*):
+            Whether to apply layer normalization after embeddings but before the main stem of the network.
+        token_dropout (`bool`, defaults to `False`):
+            When this is enabled, masked tokens are treated as if they had been dropped out by input dropout.
+        vocab_list (`list`, *optional*):
+            The tokens of the vocabulary, e.g. `["<pad>", "<mask>", "A", "T", "G", "C"]`. Stored on the configuration
+            unchanged.
+
+    Examples:
+
+    ```python
+    >>> from multimolecule.models import RnaBertConfig, RnaBertModel
+
+    >>> # Initializing a RnaBert style configuration
+    >>> configuration = RnaBertConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = RnaBertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "rnabert"
+
+    def __init__(
+        self,
+        vocab_size=None,
+        mask_token_id=None,
+        pad_token_id=None,
+        hidden_size=None,
+        multiple=None,
+        num_hidden_layers=6,
+        num_attention_heads=12,
+        intermediate_size=40,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        max_position_embeddings=440,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        emb_layer_norm_before=None,
+        token_dropout=False,
+        vocab_list=None,
+        **kwargs,
+    ):
+        # The special-token ids and any extra keyword arguments are handled by the base class.
+        super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        # Derive the hidden size when it is not given explicitly: a per-head `multiple` takes precedence,
+        # otherwise fall back to 120.
+        if hidden_size is None:
+            hidden_size = num_attention_heads * multiple if multiple is not None else 120
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.emb_layer_norm_before = emb_layer_norm_before
+        self.token_dropout = token_dropout
+        self.vocab_list = vocab_list
+
+
+def get_default_vocab_list():
+    """Return a new list holding the default vocabulary: the two special tokens followed by A, T, G, C."""
+    special_tokens = ["<pad>", "<mask>"]
+    base_tokens = ["A", "T", "G", "C"]
+    return special_tokens + base_tokens
diff --git a/multimolecule/models/rnabert/convert_checkpoint.py b/multimolecule/models/rnabert/convert_checkpoint.py
@@ -0,0 +1,35 @@
+import sys
+from typing import Optional
+
+import chanfig
+import torch
+
+from . import RnaBertConfig, RnaBertModel
+from .configuration_rnabert import get_default_vocab_list
+
+
+def convert_checkpoint(checkpoint_path: str, output_path: Optional[str] = None):
+    """Convert a checkpoint into a saved `RnaBertModel`.
+
+    The state dict stored at `checkpoint_path` is renamed to match `RnaBertModel`, loaded into a freshly built
+    model and written out with `save_pretrained`.
+
+    Args:
+        checkpoint_path: Path of the checkpoint to convert; it is read with `torch.load`.
+        output_path: Destination passed to `save_pretrained`. Defaults to `"rnabert"`.
+    """
+    import os
+
+    if output_path is None:
+        output_path = "rnabert"
+
+    # Prefer a `config.json` in the working directory (the historical behaviour); otherwise fall back to the
+    # one that sits next to this module, so the script does not depend on where it is launched from.
+    config_path = "config.json"
+    if not os.path.exists(config_path):
+        config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")
+    config = RnaBertConfig.from_dict(chanfig.load(config_path))
+    config.vocab_list = get_default_vocab_list()
+
+    # NOTE: `torch.load` unpickles the file, so only convert checkpoints that come from a trusted source.
+    # `map_location="cpu"` lets checkpoints that were saved from a GPU load on CPU-only machines.
+    ckpt = torch.load(checkpoint_path, map_location="cpu")
+
+    # Every kept key loses its first 12 characters, i.e. the length of "module.bert."
+    # (presumably the wrapper prefix used by the original training code -- TODO confirm).
+    prefix_length = len("module.bert.")
+    state_dict = {}
+    for key, value in ckpt.items():
+        # Entries under "module.cls" are not part of `RnaBertModel` and are skipped.
+        if key.startswith("module.cls"):
+            continue
+        key = key[prefix_length:]
+        # Rename the layer-norm style parameter names to the `weight` / `bias` naming.
+        key = key.replace("gamma", "weight")
+        key = key.replace("beta", "bias")
+        state_dict[key] = value
+
+    model = RnaBertModel(config)
+    model.load_state_dict(state_dict)
+    model.save_pretrained(output_path)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: the first argument is the checkpoint path,
+    # the optional second argument is the output path.
+    cli_args = sys.argv[1:]
+    destination = cli_args[1] if len(cli_args) > 1 else None
+    convert_checkpoint(cli_args[0], destination)