diff --git a/.flake8 b/.flake8
index 1a256e6..eae28f6 100644
--- a/.flake8
+++ b/.flake8
@@ -9,11 +9,10 @@ ignore =
# W503: line break before binary operator
# W504: line break after binary operator
# format by black
- E203,E241,E704,W503,W504,
+ E203,E241,E704,W503,W504,E501,W505,
# E501: line too long
# W505: doc line too long
# too long docstring due to long example blocks
- E501,W505,
per-file-ignores =
# F401: module imported but unused
# intentionally unused imports
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index e71c35e..2683657 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -24,7 +24,6 @@ What types of changes does your code introduce? Put an `x` in all the boxes that
Go over all the following points, and put an `x` in all the boxes that apply.
If you are unsure about any of these, don't hesitate to ask. We are here to help!
-
- [ ] My change requires a change to the documentation.
- [ ] I have updated the tests accordingly. (*required for a bug fix or a new feature*)
- [ ] I have updated the documentation accordingly.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 90cb6e2..4bdcc64 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -44,22 +44,6 @@ jobs:
run: |
make pre-commit
- - name: ruff
- run: |
- make ruff
-
- - name: flake8
- run: |
- make flake8
-
- - name: pylint
- run: |
- make pylint
-
- - name: isort and black
- run: |
- make py-format
-
- name: addlicense
run: |
make addlicense
diff --git a/.gitignore b/.gitignore
index 6473400..c5be0c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,10 @@
##### Project Specification #####
-dataset/
outputs/
wandb/
test/
data/
+checkpoints/
+cache_dir
##### Python.gitignore #####
# Byte-compiled / optimized / DLL files
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 28e7222..10f747c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,6 +33,11 @@ repos:
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
+ exclude: |
+ (?x)(
+ ^safe_sora/models/multimodal_encoder/|
+ ^safe_sora/models/multimodal_projector/
+ )
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
hooks:
@@ -68,6 +73,8 @@ repos:
- repo: local
hooks:
- id: pylint
+ args:
+ - --disable=R0801
name: pylint
entry: pylint
language: system
@@ -78,5 +85,8 @@ repos:
^examples/|
^tests/|
^setup.py$|
+ ^safe_sora/models/multimodal_encoder/|
+ ^safe_sora/models/multimodal_projector/|
+ ^safe_sora/models/video_llava.py|
^docs/source/conf.py$
)
diff --git a/Makefile b/Makefile
index 493bce0..1fb3b48 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ PROJECT_NAME = safe-sora
COPYRIGHT = "PKU-Alignment Team. All Rights Reserved."
PROJECT_PATH = safe_sora
SHELL = /bin/bash
-SOURCE_FOLDERS = $(PROJECT_PATH) examples tests docs
+SOURCE_FOLDERS = $(PROJECT_PATH) examples docs
PYTHON_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.py" -o -name "*.pyi")
COMMIT_HASH = $(shell git log -1 --format=%h)
PATH := $(HOME)/go/bin:$(PATH)
@@ -130,7 +130,7 @@ pre-commit: pre-commit-install
# Documentation
addlicense: addlicense-install
- addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l apache -y 2022-$(shell date +"%Y") -check $(SOURCE_FOLDERS)
+ addlicense -c $(COPYRIGHT) -ignore **/multimodal_encoder/** -ignore **/multimodal_projector/** -l apache -y 2022-$(shell date +"%Y") -check $(SOURCE_FOLDERS)
docstyle: docs-install
make -C docs clean
diff --git a/conda-recipe.yaml b/conda-recipe.yaml
new file mode 100644
index 0000000..088560b
--- /dev/null
+++ b/conda-recipe.yaml
@@ -0,0 +1,56 @@
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Create virtual environment with command:
+#
+#   $ CONDA_OVERRIDE_CUDA=12.1 conda env create --file conda-recipe.yaml
+#
+
+name: safe-sora
+channels:
+ - huggingface
+ - pytorch
+ - nvidia/label/cuda-12.1.0
+ - defaults
+ - conda-forge
+dependencies:
+ - python = 3.11
+ - pip
+
+ - pytorch::pytorch >= 2.0
+ - pytorch::pytorch-mutex =*=*cuda*
+ - pytorch::torchvision
+ - transformers >= 4.42
+ - datasets
+ - tokenizers >= 0.19
+ - sentencepiece
+ - tensorboard
+ - wandb
+ - pip:
+ - accelerate
+ - deepspeed
+ - decord
+ - opencv-python
+
+ - nvidia/label/cuda-12.1.0::cuda-toolkit = 12.1
+
+ - matplotlib-base
+ - rich
+ - tqdm
+ - typing-extensions
+ - bitsandbytes
+ - av
+ - einops
+ - peft
diff --git a/docs/images/win_rate.png b/docs/images/win_rate.png
new file mode 100644
index 0000000..5f24428
Binary files /dev/null and b/docs/images/win_rate.png differ
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..5d92e5d
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,63 @@
+
+
+# Preference Model
+
+In this directory, we provide an example implementation of training a preference predictor reward model on our dataset.
+
+## Preference Modeling
+
+To model human preferences, it is common to use a preference predictor adhering to the Bradley-Terry model. A preference datum is denoted $y_w \succ y_l \mid x$, where $y_w$ is the video preferred over $y_l$ for the prompt $x$.
+The log-likelihood loss used to train a parameterized predictor $R_\phi$ on dataset $\mathcal{D}$ is:
+
+$$\mathcal{L} (\phi; \mathcal{D}) = -\mathbb E_{{(x,y_w,y_l)\sim \mathcal{D}}} \left[\log \sigma (R_{\phi} (y_w,x) - R_{\phi} (y_l,x))\right]$$
+
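+In code, this objective is simply a logistic loss on the difference of predicted scores. Below is a minimal PyTorch sketch (illustrative only; the function and tensors are hypothetical and not the training code in this repository):
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def bradley_terry_loss(score_w: torch.Tensor, score_l: torch.Tensor) -> torch.Tensor:
+    """Negative log-likelihood of preferring y_w over y_l under the Bradley-Terry model."""
+    # -E[log sigmoid(R(y_w, x) - R(y_l, x))], averaged over the batch
+    return -F.logsigmoid(score_w - score_l).mean()
+
+
+# Scalar scores for the preferred and less-preferred videos, shape (batch_size,).
+score_w = torch.tensor([1.2, 0.3])
+score_l = torch.tensor([0.4, -0.1])
+loss = bradley_terry_loss(score_w, score_l)
+```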
+
+Leveraging a multi-modal architecture adapted from [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) and training on preference data from the [SafeSora Dataset](https://huggingface.co/datasets/PKU-Alignment/SafeSora), we have developed a text-video (T-V) reward model.
+The language head of the vision-language model is replaced with a score regression head, which predicts the preference score of the video given the prompt.
+
+This model translates abstract human values into quantifiable and optimizable scalar metrics.
+Consequently, the reward model can partially replace human evaluators in assessing outputs from video generation models and act as a supervisory signal to enhance the performance of these models.
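+
+As a rough sketch of the score-head idea (a simplified illustration with assumed tensor shapes, not the actual `LlavaLlamaForScore` implementation):
+
+```python
+import torch
+from torch import nn
+
+
+class ScoreHead(nn.Module):
+    """Maps the backbone's final-token hidden state to a scalar preference score."""
+
+    def __init__(self, hidden_size: int) -> None:
+        super().__init__()
+        self.score = nn.Linear(hidden_size, 1, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # hidden_states: (batch_size, seq_len, hidden_size) from the vision-language backbone
+        last_token = hidden_states[:, -1, :]
+        return self.score(last_token).squeeze(-1)  # (batch_size,)
+```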
+
+## Alignment Evaluation of Different Models
+
+The SafeSora dataset includes annotations across multiple dimensions of human preference. We have developed several distinct models that focus on different aspects of human preference, such as helpfulness, harmlessness, and four specific sub-dimensions of helpfulness. Our models achieve an agreement ratio of 65.29% for predicting helpfulness preference and 72.41% for predicting harmlessness preference when compared with crowdworker assessments.
+
+Furthermore, we utilize these models to evaluate four open-source models on our [Evaluation Dataset](https://huggingface.co/datasets/PKU-Alignment/SafeSora-Eval). The win-rate relationships among these models, assessed across the two alignment dimensions, are depicted in the figure below.
+
+![Win rates among four open-source models on the two alignment dimensions](../docs/images/win_rate.png)
+
+## Training
+
+First, [download our dataset](../README.md#data-access) to a local directory and prepare the training environment using:
+
+```bash
+conda env create -f conda-recipe.yaml # mamba env create -f conda-recipe.yaml
+conda activate safe-sora
+```
+
+Then, you need to download the Video-LLaVA model and the MM-MLP adapter from the Hugging Face model hub. For example, you can download them using the following commands:
+
+```bash
+huggingface-cli download --resume-download LanguageBind/Video-LLaVA-7B --local-dir ./LanguageBind/Video-LLaVA-7B
+huggingface-cli download --resume-download LanguageBind/Video-LLaVA-Pretrain-7B --local-dir ./LanguageBind/Video-LLaVA-Pretrain-7B
+```
+
+Then, you can run the following script to train the reward model on the SafeSora dataset:
+
+```bash
+bash examples/scripts/finetune_reward_model.sh \
+    --model_name_or_path <model_name_or_path> \
+    --mm_mlp_adapter_path <mm_mlp_adapter_path> \
+    --dimension <dimension> \
+ --output_dir examples/outputs/reward-model
+```
+
+where `<model_name_or_path>` is the name of the Video-LLaVA model or the path to the checkpoint directory, `<mm_mlp_adapter_path>` is the path to the `mm_projector.bin` file, and `<dimension>` is the preference dimension that the reward model will predict.
+
+**NOTE:** The `dimension` parameter specifies the preference dimension that the reward model will predict. The SafeSora dataset currently supports the following dimensions: `helpfulness`, `harmlessness`, `instruction_following`, `correctness`, `informativeness`, and `aesthetics`. For detailed information on the different dimensions, please refer to our [paper](https://arxiv.org/abs/2406.14477).
+
+## Acknowledgements
+
+This implementation benefits from [DeepSpeed](https://github.com/microsoft/DeepSpeed), [Transformers](https://github.com/huggingface/transformers), [LLaVA](https://github.com/haotian-liu/LLaVA), and [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA). Thanks for their wonderful work and their efforts in democratizing LLM research.
diff --git a/examples/reward_model/inference.py b/examples/reward_model/inference.py
new file mode 100644
index 0000000..62507dc
--- /dev/null
+++ b/examples/reward_model/inference.py
@@ -0,0 +1,350 @@
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Adapted from https://github.com/PKU-YuanGroup/Video-LLaVA/blob/main/videollava/eval/video/run_inference_benchmark_general.py
+# Its original license is Apache-2.0.
+
+"""Inference script for reward model."""
+
+from __future__ import annotations
+
+import copy
+import json
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Sequence
+
+import torch
+import transformers
+from torch import distributed as dist
+from torch.utils.data import DataLoader, Dataset, DistributedSampler
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from safe_sora.conversations import conv_templates
+from safe_sora.datasets.video import VideoDataset
+from safe_sora.models import LlavaLlamaForScore
+from safe_sora.models.constants import (
+ DEFAULT_IM_END_TOKEN,
+ DEFAULT_IM_START_TOKEN,
+ DEFAULT_IMAGE_PATCH_TOKEN,
+ DEFAULT_VID_END_TOKEN,
+ DEFAULT_VID_START_TOKEN,
+ DEFAULT_VIDEO_PATCH_TOKEN,
+ MAX_VIDEO_LENGTH,
+)
+from safe_sora.models.score_model import ScoreModelOutput
+from safe_sora.utils import order_pick_k
+from utils import preprocess_multimodal, preprocess_text
+
+
+chat_template = conv_templates['video_rm']
+
+
+def distributed_max(tensor: torch.Tensor) -> torch.Tensor:
+ """Compute the maximum value of a tensor across all workers."""
+ if not dist.is_initialized():
+ logging.warning('Max without distributed initialization.')
+ return tensor
+
+ dist.all_reduce(tensor, op=dist.ReduceOp.MAX)
+ return tensor
+
+
+def distributed_gather(to_gather: torch.Tensor) -> torch.Tensor:
+ """Gather tensors from all workers."""
+ if not dist.is_initialized():
+ logging.warning('Gathering without distributed initialization.')
+ return to_gather
+
+ if not to_gather.is_cuda:
+ to_gather = to_gather.to(f'cuda:{dist.get_rank()}')
+
+ world_size = dist.get_world_size()
+
+ length = torch.tensor([to_gather.shape[0]], device=to_gather.device)
+ lengths = [torch.zeros_like(length) for _ in range(world_size)]
+ dist.all_gather(lengths, length)
+ max_length = max(lengths).item()
+
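+    # Zero-pad shorter tensors so every rank contributes an equally shaped tensor to
+    # all_gather; the padding is trimmed off again below using the gathered per-rank lengths.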
+ if to_gather.shape[0] < max_length:
+ padding = torch.zeros(
+ max_length - to_gather.shape[0],
+ *to_gather.shape[1:],
+ device=to_gather.device,
+ )
+ to_gather = torch.cat([to_gather, padding], dim=0)
+
+ gathered = [torch.zeros_like(to_gather) for _ in range(world_size)]
+ dist.all_gather(gathered, to_gather)
+
+ for i, length in enumerate(lengths):
+ gathered[i] = gathered[i][: length.item()]
+
+ return torch.cat(gathered, dim=0)
+
+
+@dataclass
+class EvalArguments:
+ model_name_or_path: str | None = field(default='facebook/opt-125m')
+ cache_dir: str | None = field(default=None)
+ output_dir: str | None = field(default=None)
+ model_max_length: int | None = field(default=2048)
+ batch_size: int | None = field(default=4)
+
+
+@dataclass
+class DataArguments:
+ is_multimodal: bool = True
+ image_aspect_ratio: str = 'square'
+ # ===================================================================
+ eval_data_path: str | None = field(
+ default=None,
+ metadata={'help': 'Path to the evaluation data.'},
+ )
+ image_dir: str | None = field(default=None)
+ video_dir: str | None = field(default=None)
+ num_frames: int = 8
+ # ===================================================================
+
+
+class LazyVideoDataset(Dataset):
+ """Dataset for supervised fine-tuning."""
+
+ def __init__(
+ self,
+ config_path: str,
+ video_dir: str,
+ tokenizer: transformers.PreTrainedTokenizer,
+ video_processor: transformers.VideoLlavaProcessor,
+ data_args: DataArguments,
+ ) -> None:
+ super().__init__()
+
+ self.dataset = VideoDataset.load(config_path, video_dir=video_dir)
+ self.tokenizer = tokenizer
+ self.video_processor = video_processor
+ self.data_args = data_args
+
+ def __len__(self) -> int:
+ return len(self.dataset)
+
+ @property
+ def modality_lengths(self) -> list[int]:
+        return [len(video_config['prompt_text']) for video_config in self.dataset]
+
+ def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
+ video_config = self.dataset[index]
+ conversation = [
+ {
+ 'from': 'human',
+ 'value': f"##Video Generation Prompt: {video_config['prompt_text']}",
+ },
+ {'from': 'gpt', 'value': '##Generated Video: \n