diff --git a/.flake8 b/.flake8
index 1a256e6..eae28f6 100644
--- a/.flake8
+++ b/.flake8
@@ -9,11 +9,10 @@ ignore =
# W503: line break before binary operator
# W504: line break after binary operator
# format by black
- E203,E241,E704,W503,W504,
+ E203,E241,E704,W503,W504,E501,W505,
# E501: line too long
# W505: doc line too long
# too long docstring due to long example blocks
- E501,W505,
per-file-ignores =
# F401: module imported but unused
# intentionally unused imports
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index e71c35e..2683657 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -24,7 +24,6 @@ What types of changes does your code introduce? Put an `x` in all the boxes that
Go over all the following points, and put an `x` in all the boxes that apply.
If you are unsure about any of these, don't hesitate to ask. We are here to help!
-
- [ ] My change requires a change to the documentation.
- [ ] I have updated the tests accordingly. (*required for a bug fix or a new feature*)
- [ ] I have updated the documentation accordingly.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 90cb6e2..4bdcc64 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -44,22 +44,6 @@ jobs:
run: |
make pre-commit
- - name: ruff
- run: |
- make ruff
-
- - name: flake8
- run: |
- make flake8
-
- - name: pylint
- run: |
- make pylint
-
- - name: isort and black
- run: |
- make py-format
-
- name: addlicense
run: |
make addlicense
diff --git a/.gitignore b/.gitignore
index 6473400..c5be0c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,10 @@
##### Project Specification #####
-dataset/
outputs/
wandb/
test/
data/
+checkpoints/
+cache_dir
##### Python.gitignore #####
# Byte-compiled / optimized / DLL files
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 28e7222..10f747c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,6 +33,11 @@ repos:
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
+ exclude: |
+ (?x)(
+ ^safe_sora/models/multimodal_encoder/|
+ ^safe_sora/models/multimodal_projector/
+ )
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
hooks:
@@ -68,6 +73,8 @@ repos:
- repo: local
hooks:
- id: pylint
+ args:
+ - --disable=R0801
name: pylint
entry: pylint
language: system
@@ -78,5 +85,8 @@ repos:
^examples/|
^tests/|
^setup.py$|
+ ^safe_sora/models/multimodal_encoder/|
+ ^safe_sora/models/multimodal_projector/|
+ ^safe_sora/models/video_llava.py|
^docs/source/conf.py$
)
diff --git a/Makefile b/Makefile
index 493bce0..1fb3b48 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ PROJECT_NAME = safe-sora
COPYRIGHT = "PKU-Alignment Team. All Rights Reserved."
PROJECT_PATH = safe_sora
SHELL = /bin/bash
-SOURCE_FOLDERS = $(PROJECT_PATH) examples tests docs
+SOURCE_FOLDERS = $(PROJECT_PATH) examples docs
PYTHON_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.py" -o -name "*.pyi")
COMMIT_HASH = $(shell git log -1 --format=%h)
PATH := $(HOME)/go/bin:$(PATH)
@@ -130,7 +130,7 @@ pre-commit: pre-commit-install
# Documentation
addlicense: addlicense-install
- addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l apache -y 2022-$(shell date +"%Y") -check $(SOURCE_FOLDERS)
+ addlicense -c $(COPYRIGHT) -ignore **/multimodal_encoder/** -ignore **/multimodal_projector/** -l apache -y 2022-$(shell date +"%Y") -check $(SOURCE_FOLDERS)
docstyle: docs-install
make -C docs clean
diff --git a/conda-recipe.yaml b/conda-recipe.yaml
new file mode 100644
index 0000000..088560b
--- /dev/null
+++ b/conda-recipe.yaml
@@ -0,0 +1,56 @@
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Create virtual environment with command:
+#
+#   $ CONDA_OVERRIDE_CUDA=12.1 conda env create --file conda-recipe.yaml
+#
+
+name: safe-sora
+channels:
+ - huggingface
+ - pytorch
+ - nvidia/label/cuda-12.1.0
+ - defaults
+ - conda-forge
+dependencies:
+ - python = 3.11
+ - pip
+
+ - pytorch::pytorch >= 2.0
+ - pytorch::pytorch-mutex =*=*cuda*
+ - pytorch::torchvision
+ - transformers >= 4.42
+ - datasets
+ - tokenizers >= 0.19
+ - sentencepiece
+ - tensorboard
+ - wandb
+ - pip:
+ - accelerate
+ - deepspeed
+ - decord
+ - opencv-python
+
+ - nvidia/label/cuda-12.1.0::cuda-toolkit = 12.1
+
+ - matplotlib-base
+ - rich
+ - tqdm
+ - typing-extensions
+ - bitsandbytes
+ - av
+ - einops
+ - peft
diff --git a/docs/images/win_rate.png b/docs/images/win_rate.png
new file mode 100644
index 0000000..5f24428
Binary files /dev/null and b/docs/images/win_rate.png differ
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..5d92e5d
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,63 @@
+
+
+# Preference Model
+
+In this directory, we provide an example implementation of training a preference predictor reward model on our dataset.
+
+## Preference Modeling
+
+To model human preferences, it is common to use a preference predictor adhering to the Bradley-Terry model. A preference datum is denoted $y_w \succ y_l \mid x$, where $y_w$ is the video preferred over $y_l$ for the prompt $x$.
+The log-likelihood loss used to train a parameterized predictor $R_\phi$ on dataset $\mathcal{D}$ is:
+
+$$\mathcal{L} (\phi; \mathcal{D}) = -\mathbb E_{{(x,y_w,y_l)\sim \mathcal{D}}} \left[\log \sigma (R_{\phi} (y_w,x) - R_{\phi} (y_l,x))\right]$$
+
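+In code, this objective is simply a logistic loss on the difference of predicted scores. Below is a minimal PyTorch sketch (illustrative only; the function and tensors are hypothetical and not the training code in this repository):
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def bradley_terry_loss(score_w: torch.Tensor, score_l: torch.Tensor) -> torch.Tensor:
+    """Negative log-likelihood of preferring y_w over y_l under the Bradley-Terry model."""
+    # -E[log sigmoid(R(y_w, x) - R(y_l, x))], averaged over the batch
+    return -F.logsigmoid(score_w - score_l).mean()
+
+
+# Scalar scores for the preferred and less-preferred videos, shape (batch_size,).
+score_w = torch.tensor([1.2, 0.3])
+score_l = torch.tensor([0.4, -0.1])
+loss = bradley_terry_loss(score_w, score_l)
+```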
+
+Leveraging a multi-modal architecture adapted from [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) and training on preference data from the [SafeSora Dataset](https://huggingface.co/datasets/PKU-Alignment/SafeSora), we have developed a text-video (T-V) reward model.
+The language head of the vision-language model is replaced with a score regression head, which predicts the preference score of the video given the prompt.
+
+This model translates abstract human values into quantifiable and optimizable scalar metrics.
+Consequently, the reward model can partially replace human evaluators in assessing outputs from video generation models and act as a supervisory signal to enhance the performance of these models.
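+
+As a rough sketch of the score-head idea (a simplified illustration with assumed tensor shapes, not the actual `LlavaLlamaForScore` implementation):
+
+```python
+import torch
+from torch import nn
+
+
+class ScoreHead(nn.Module):
+    """Maps the backbone's final-token hidden state to a scalar preference score."""
+
+    def __init__(self, hidden_size: int) -> None:
+        super().__init__()
+        self.score = nn.Linear(hidden_size, 1, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # hidden_states: (batch_size, seq_len, hidden_size) from the vision-language backbone
+        last_token = hidden_states[:, -1, :]
+        return self.score(last_token).squeeze(-1)  # (batch_size,)
+```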
+
+## Alignment Evaluation of Different Models
+
+The SafeSora dataset includes annotations across multiple dimensions of human preference. We have developed several distinct models that focus on different aspects of human preference, such as helpfulness, harmlessness, and four specific sub-dimensions of helpfulness. Our models achieve an agreement ratio of 65.29% for predicting helpfulness preference and 72.41% for predicting harmlessness preference when compared with crowdworker assessments.
+
+Furthermore, we utilize these models to evaluate four open-source models on our [Evaluation Dataset](https://huggingface.co/datasets/PKU-Alignment/SafeSora-Eval). The win-rate relationships among these models, assessed across the two alignment dimensions, are depicted in the figure below.
+
+![Win rates among four open-source models on the two alignment dimensions](../docs/images/win_rate.png)
+
+## Training
+
+First, [download our dataset](../README.md#data-access) to a local directory and prepare the training environment using:
+
+```bash
+conda env create -f conda-recipe.yaml # mamba env create -f conda-recipe.yaml
+conda activate safe-sora
+```
+
+Then, you need to download the Video-LLaVA model and the MM-MLP adapter from the Hugging Face model hub. For example, you can download them using the following commands:
+
+```bash
+huggingface-cli download --resume-download LanguageBind/Video-LLaVA-7B --local-dir ./LanguageBind/Video-LLaVA-7B
+huggingface-cli download --resume-download LanguageBind/Video-LLaVA-Pretrain-7B --local-dir ./LanguageBind/Video-LLaVA-Pretrain-7B
+```
+
+Then, you can run the following script to train the reward model on the SafeSora dataset:
+
+```bash
+bash examples/scripts/finetune_reward_model.sh \
+    --model_name_or_path <model_name_or_path> \
+    --mm_mlp_adapter_path <mm_mlp_adapter_path> \
+    --dimension <dimension> \
+ --output_dir examples/outputs/reward-model
+```
+
+where `<model_name_or_path>` is the name of the Video-LLaVA model or the path to the checkpoint directory, `<mm_mlp_adapter_path>` is the path to the `mm_projector.bin` file, and `<dimension>` is the preference dimension that the reward model will predict.
+
+**NOTE:** The `dimension` parameter specifies the preference dimension that the reward model will predict. The SafeSora dataset currently supports the following dimensions: `helpfulness`, `harmlessness`, `instruction_following`, `correctness`, `informativeness`, and `aesthetics`. For detailed information on the different dimensions, please refer to our [paper](https://arxiv.org/abs/2406.14477).
+
+## Acknowledgements
+
+This implementation benefits from [DeepSpeed](https://github.com/microsoft/DeepSpeed), [Transformers](https://github.com/huggingface/transformers), [LLaVA](https://github.com/haotian-liu/LLaVA), and [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA). Thanks for their wonderful work and their efforts in democratizing LLM research.
diff --git a/examples/reward_model/inference.py b/examples/reward_model/inference.py
new file mode 100644
index 0000000..62507dc
--- /dev/null
+++ b/examples/reward_model/inference.py
@@ -0,0 +1,350 @@
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Adapted from https://github.com/PKU-YuanGroup/Video-LLaVA/blob/main/videollava/eval/video/run_inference_benchmark_general.py
+# Its original license is Apache-2.0.
+
+"""Inference script for reward model."""
+
+from __future__ import annotations
+
+import copy
+import json
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Sequence
+
+import torch
+import transformers
+from torch import distributed as dist
+from torch.utils.data import DataLoader, Dataset, DistributedSampler
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from safe_sora.conversations import conv_templates
+from safe_sora.datasets.video import VideoDataset
+from safe_sora.models import LlavaLlamaForScore
+from safe_sora.models.constants import (
+ DEFAULT_IM_END_TOKEN,
+ DEFAULT_IM_START_TOKEN,
+ DEFAULT_IMAGE_PATCH_TOKEN,
+ DEFAULT_VID_END_TOKEN,
+ DEFAULT_VID_START_TOKEN,
+ DEFAULT_VIDEO_PATCH_TOKEN,
+ MAX_VIDEO_LENGTH,
+)
+from safe_sora.models.score_model import ScoreModelOutput
+from safe_sora.utils import order_pick_k
+from utils import preprocess_multimodal, preprocess_text
+
+
+chat_template = conv_templates['video_rm']
+
+
+def distributed_max(tensor: torch.Tensor) -> torch.Tensor:
+ """Compute the maximum value of a tensor across all workers."""
+ if not dist.is_initialized():
+ logging.warning('Max without distributed initialization.')
+ return tensor
+
+ dist.all_reduce(tensor, op=dist.ReduceOp.MAX)
+ return tensor
+
+
+def distributed_gather(to_gather: torch.Tensor) -> torch.Tensor:
+ """Gather tensors from all workers."""
+ if not dist.is_initialized():
+ logging.warning('Gathering without distributed initialization.')
+ return to_gather
+
+ if not to_gather.is_cuda:
+ to_gather = to_gather.to(f'cuda:{dist.get_rank()}')
+
+ world_size = dist.get_world_size()
+
+ length = torch.tensor([to_gather.shape[0]], device=to_gather.device)
+ lengths = [torch.zeros_like(length) for _ in range(world_size)]
+ dist.all_gather(lengths, length)
+ max_length = max(lengths).item()
+
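+    # Zero-pad shorter tensors so every rank contributes an equally shaped tensor to
+    # all_gather; the padding is trimmed off again below using the gathered per-rank lengths.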
+ if to_gather.shape[0] < max_length:
+ padding = torch.zeros(
+ max_length - to_gather.shape[0],
+ *to_gather.shape[1:],
+ device=to_gather.device,
+ )
+ to_gather = torch.cat([to_gather, padding], dim=0)
+
+ gathered = [torch.zeros_like(to_gather) for _ in range(world_size)]
+ dist.all_gather(gathered, to_gather)
+
+ for i, length in enumerate(lengths):
+ gathered[i] = gathered[i][: length.item()]
+
+ return torch.cat(gathered, dim=0)
+
+
+@dataclass
+class EvalArguments:
+ model_name_or_path: str | None = field(default='facebook/opt-125m')
+ cache_dir: str | None = field(default=None)
+ output_dir: str | None = field(default=None)
+ model_max_length: int | None = field(default=2048)
+ batch_size: int | None = field(default=4)
+
+
+@dataclass
+class DataArguments:
+ is_multimodal: bool = True
+ image_aspect_ratio: str = 'square'
+ # ===================================================================
+ eval_data_path: str | None = field(
+ default=None,
+ metadata={'help': 'Path to the evaluation data.'},
+ )
+ image_dir: str | None = field(default=None)
+ video_dir: str | None = field(default=None)
+ num_frames: int = 8
+ # ===================================================================
+
+
+class LazyVideoDataset(Dataset):
+ """Dataset for supervised fine-tuning."""
+
+ def __init__(
+ self,
+ config_path: str,
+ video_dir: str,
+ tokenizer: transformers.PreTrainedTokenizer,
+ video_processor: transformers.VideoLlavaProcessor,
+ data_args: DataArguments,
+ ) -> None:
+ super().__init__()
+
+ self.dataset = VideoDataset.load(config_path, video_dir=video_dir)
+ self.tokenizer = tokenizer
+ self.video_processor = video_processor
+ self.data_args = data_args
+
+ def __len__(self) -> int:
+ return len(self.dataset)
+
+ @property
+ def modality_lengths(self) -> list[int]:
+        return [len(video_config['prompt_text']) for video_config in self.dataset]
+
+ def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
+ video_config = self.dataset[index]
+ conversation = [
+ {
+ 'from': 'human',
+ 'value': f"##Video Generation Prompt: {video_config['prompt_text']}",
+ },
+ {'from': 'gpt', 'value': '##Generated Video: \n