
Commit

Add AIMv2- Updated all files
AlanPonnachan committed Jan 7, 2025
1 parent 8f38f58 commit 92a2223
Showing 10 changed files with 1,098 additions and 0 deletions.
62 changes: 62 additions & 0 deletions docs/source/en/model_doc/aimv2.md
@@ -0,0 +1,62 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->

# AIMV2

## Overview

The AIMV2 model was proposed in [Multimodal Autoregressive Pre-training of Large Vision Encoders](https://arxiv.org/abs/2411.14402) by Enrico Fini, Mustafa Shukor, Xiujun Li, Philipp Dufter, Michal Klein, David Haldimann, Sai Aitharaju, Victor Guilherme Turrisi da Costa, Louis Béthune, Zhe Gan, Alexander T Toshev, Marcin Eichner, Moin Nabi, Yinfei Yang, Joshua M. Susskind, and Alaaeldin El-Nouby.
AIMV2 is a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks.

The abstract from the paper is the following:

*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.*

Tips:

- The model is best suited for fine-tuning on downstream vision tasks such as image classification, object detection, and semantic segmentation.
- When using the model for inference, use an `AutoImageProcessor` (or preprocess the images manually) so that inputs are resized and normalized as expected. The default image size for AIMv2 is 224x224, though some variants are trained at higher resolutions (e.g., 336x336, 448x448); see the specific model checkpoint's documentation for details. A minimal inference sketch is shown after these tips.
- AIMv2 is pre-trained with a multimodal autoregressive objective: a decoder paired with the vision encoder autoregressively generates raw image patches and text tokens (see the abstract above). The encoder transfers well even with a frozen trunk, e.g. 89.5% top-1 accuracy on ImageNet-1k for the AIMv2-3B variant.
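
Below is a minimal inference sketch using the classes exported by this commit. It assumes the `apple/aimv2-large-patch14-224` checkpoint (referenced in the configuration docstring) ships an image-processor config that `AutoImageProcessor` can load, and that `AIMv2Model.forward` returns a standard output with a `last_hidden_state` field; adjust to the actual `modeling_aimv2.py` implementation if these assumptions do not hold.

```python
import requests
import torch
from PIL import Image

from transformers import AIMv2Model, AutoImageProcessor

checkpoint = "apple/aimv2-large-patch14-224"  # assumed checkpoint name, taken from the config docstring
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AIMv2Model.from_pretrained(checkpoint)

# Load a sample image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Resize/normalize the image and run the encoder
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# For a 224x224 input and patch size 14, the encoder produces (224/14)^2 = 256 patch tokens,
# so last_hidden_state is expected to have shape (1, 256, 1024).
print(outputs.last_hidden_state.shape)
```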

This model was contributed by [AlanPonnachan](https://huggingface.co/AlanPonnachan).
The original code can be found [here](https://github.com/apple/ml-aim).


## AIMv2Config

[[autodoc]] AIMv2Config

## AIMv2Model

[[autodoc]] AIMv2Model
- forward


5 changes: 5 additions & 0 deletions src/transformers/__init__.py
@@ -5093,6 +5093,7 @@
load_tf2_model_in_pytorch_model,
load_tf2_weights_in_pytorch_model,
)
from .models.aimv2 import AIMv2Config
from .models.albert import AlbertConfig
from .models.align import (
AlignConfig,
@@ -6398,6 +6399,10 @@
)
from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
from .modeling_utils import PreTrainedModel
from .models.aimv2 import (
AIMv2Model,
AIMv2PreTrainedModel,
)
from .models.albert import (
AlbertForMaskedLM,
AlbertForMultipleChoice,
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -13,6 +13,7 @@
# limitations under the License.

from . import (
aimv2,
albert,
align,
altclip,
54 changes: 54 additions & 0 deletions src/transformers/models/aimv2/__init__.py
@@ -0,0 +1,54 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from transformers.utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)


_import_structure = {
"configuration_aimv2": ["AIMv2Config"],
"modeling_aimv2": ["AIMv2Model", "AIMv2PreTrainedModel"],
}

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_aimv2"] = [
"AIMv2Model",
"AIMv2PreTrainedModel",
]

if TYPE_CHECKING:
from .configuration_aimv2 import AIMv2Config

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_aimv2 import AIMv2Model, AIMv2PreTrainedModel

else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
99 changes: 99 additions & 0 deletions src/transformers/models/aimv2/configuration_aimv2.py
@@ -0,0 +1,99 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AIMv2 model configuration"""

from typing import Any

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

__all__ = ["AIMv2Config"]


class AIMv2Config(PretrainedConfig):
"""
This is the configuration class to store the configuration of an [`AIMv2Model`].
Instantiating a configuration with the defaults will yield a similar configuration
to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Args:
hidden_size (`int`, *optional*, defaults to 1024):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 2816):
Dimension of the SwiGLU representations.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer
in the Transformer.
num_channels (`int`, *optional*, defaults to 3):
Number of input channels.
image_size (`int`, *optional*, defaults to 224):
Image size.
patch_size (`int`, *optional*, defaults to 14):
Patch size.
rms_norm_eps (`float`, *optional*, defaults to 1e-5):
Epsilon value used for the RMS normalization layer.
attention_dropout (`float`, *optional*, defaults to 0.0):
Dropout ratio for attention probabilities.
projection_dropout (`float`, *optional*, defaults to 0.0):
Dropout ratio for the projection layer after the attention.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the queries, keys and values.
use_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias in the feed-forward and projection layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
kwargs:
Keyword arguments for the [`PretrainedConfig`].
"""

model_type: str = "aimv2"

def __init__(
self,
hidden_size: int = 1024,
intermediate_size: int = 2816,
num_hidden_layers: int = 24,
num_attention_heads: int = 8,
num_channels: int = 3,
image_size: int = 224,
patch_size: int = 14,
rms_norm_eps: float = 1e-5,
attention_dropout: float = 0.0,
projection_dropout: float = 0.0,
qkv_bias: bool = False,
use_bias: bool = False,
initializer_range: float = 0.02,
**kwargs: Any,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
self.initializer_range = initializer_range
self.projection_dropout = projection_dropout
self.qkv_bias = qkv_bias
self.use_bias = use_bias
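
For reference, a minimal sketch of how this configuration class is typically used together with the model class exported in this commit (the model is randomly initialized here; no pretrained weights are loaded):

```python
from transformers import AIMv2Config, AIMv2Model

# Default configuration, matching the apple/aimv2-large-patch14-224 architecture
configuration = AIMv2Config()

# Build a randomly initialized model from this configuration
model = AIMv2Model(configuration)

# The configuration can be read back from the model
configuration = model.config
```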
