Add OpenFlamingo #2237
Merged

Commits (44):
915f7cc  add openflamingo  (michiyasunaga)
367aa30  fix ver  (michiyasunaga)
799f41f  fix ver  (michiyasunaga)
0b7bcb4  fix ver  (michiyasunaga)
70a8e34  fix ver  (michiyasunaga)
1eef0a5  fix ver  (michiyasunaga)
abe19e3  fix ver  (michiyasunaga)
b221cbf  fix ver  (michiyasunaga)
63011e7  fix ver  (michiyasunaga)
7729aca  add openflamingo  (michiyasunaga)
1d23a9a  add openflamingo  (michiyasunaga)
d27893a  add openflamingo  (michiyasunaga)
c9fd286  add openflamingo  (michiyasunaga)
5367f05  add openflamingo  (michiyasunaga)
0afa051  add openflamingo  (michiyasunaga)
014aada  add openflamingo  (michiyasunaga)
1bee70c  add openflamingo  (michiyasunaga)
3613907  fix GHA build - define openflamingo dependencies  (teetone)
1affed4  Merge branch 'main' of https://github.com/stanford-crfm/benchmarking …  (teetone)
9c1ad0a  address code review  (michiyasunaga)
1ca8408  fix transformers version  (michiyasunaga)
292456e  Merge branch 'main' into michi_openflamingo  (michiyasunaga)
12d535e  merge main  (JosselinSomervilleRoberts)
84cc573  Add some parameters to the model deployment  (JosselinSomervilleRoberts)
3756295  Fixing einops dependency conflict  (JosselinSomervilleRoberts)
9dc70dc  Remove duplicated crfm-helm['image'] dependency  (JosselinSomervilleRoberts)
6f531ac  Merge branch 'main' of https://github.com/stanford-crfm/benchmarking …  (teetone)
8c44df7  more logging for model init  (teetone)
1fd7961  fix token init in openflamingo  (teetone)
472bacb  fix token init in openflamingo  (teetone)
5a64bb3  Merge branch 'main' of https://github.com/stanford-crfm/benchmarking …  (teetone)
65e2950  resolve  (teetone)
0ea9ced  fix tokenizer  (teetone)
6128ad2  update conf  (teetone)
163b13c  Merge branch 'main' of https://github.com/stanford-crfm/benchmarking …  (teetone)
e51e550  Merge branch 'main' of https://github.com/stanford-crfm/benchmarking …  (teetone)
865b33a  disable temporarily  (teetone)
a4f0586  resolve merge conflicts  (teetone)
8f0e763  undo  (teetone)
e2404a9  fix paths  (teetone)
83eaefa  get in-context learning examples to work  (teetone)
1404de4  fix decoding  (teetone)
ac19049  fix sequence construction  (teetone)
8c6cdcb  include num_completions in cache key  (teetone)
```diff
@@ -52,7 +52,7 @@ install_requires=
     scikit-learn~=1.1.2

     # Models and Metrics Extras
-    transformers~=4.36.0  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
+    transformers>=4.28.0  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)

     # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
     torch>=1.12.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
     torchvision>=0.13.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
```

Review comment on the `transformers` line: "Same, we need"
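The change swaps a compatible-release pin (`~=`) for a floor-only one (`>=`). A quick sketch, using the `packaging` library (the same specifier logic pip uses; assumed installed), of what each pin admits:

```python
# Sketch: compare what the old and new transformers pins admit.
from packaging.specifiers import SpecifierSet

old_pin = SpecifierSet("~=4.36.0")  # compatible release: >=4.36.0, ==4.36.*
new_pin = SpecifierSet(">=4.28.0")  # floor only, no upper bound

print("4.32.0" in old_pin)  # False: rejected by the compatible-release pin
print("4.32.0" in new_pin)  # True
print("4.36.2" in new_pin)  # True: newer releases remain allowed
```

The trade-off: the new specifier unblocks older versions that OpenFlamingo tolerates, but also stops protecting against future breaking releases.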
```diff
@@ -135,7 +135,13 @@ models =
     crfm-helm[yandex]

 vlm =
-    torch~=2.1.2  # For IDEFICS
+    # For OpenFlamingo
+    einops~=0.7.0
+    einops-exts~=0.0.4
+    open-clip-torch~=2.24.0
+
+    # For IDEFICS
+    torch~=2.1.2

 heim =
     # HEIM scenarios
```
```diff
@@ -223,6 +229,7 @@ exclude =
     venv/*
     src/helm/proxy/clients/image_generation/dalle_mini/*
     src/helm/proxy/clients/image_generation/mindalle/*
+    src/helm/proxy/clients/vision_language/open_flamingo/*

 # Ignore completely:
 # E203 - White space before ':', (conflicts with black)
```
```diff
@@ -240,7 +247,7 @@ check_untyped_defs = True
 disable_error_code = annotation-unchecked
 # TODO: Change disallow_untyped_defs to True
 disallow_untyped_defs = False
-exclude = dalle_mini|mindalle
+exclude = dalle_mini|mindalle|open_flamingo

 [tool:pytest]
 addopts =
```
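mypy's `exclude` option is a regular expression matched against file paths, so the widened alternation can be checked directly (the first path below is from this diff, the second is an existing HELM file used as a counterexample):

```python
# Sketch: verify the widened mypy exclude regex skips the vendored code.
import re

exclude = re.compile(r"dalle_mini|mindalle|open_flamingo")

print(bool(exclude.search("src/helm/proxy/clients/vision_language/open_flamingo/src/factory.py")))  # True
print(bool(exclude.search("src/helm/proxy/clients/huggingface_client.py")))  # False
```

Excluding the vendored directory keeps upstream open_flamingo code out of type checking, matching the `disallow_untyped_defs` exemptions already granted to `dalle_mini` and `mindalle`.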
```diff
@@ -165,6 +165,12 @@ tokenizer_configs:
     class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
     prefix_token: "<s>"
+
+  - name: anas-awadalla/mpt-7b
+    tokenizer_spec:
+      class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""

   # Huggingface
   - name: huggingface/gpt2
```

Review comment on `- name: anas-awadalla/mpt-7b`: "Is this different from the pre-existing"
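The `prefix_token` / `end_of_text_token` pair controls how a prompt is bracketed for each tokenizer. A minimal sketch of the effect for the two configs shown above (the `wrap` helper is hypothetical, not HELM's actual API):

```python
# Hypothetical helper showing the effect of the prefix_token /
# end_of_text_token fields from the two tokenizer configs above.
def wrap(prompt: str, prefix_token: str, end_of_text_token: str) -> str:
    return f"{prefix_token}{prompt}{end_of_text_token}"

print(wrap("Hello", "<s>", "</s>"))        # Llama-style config
print(wrap("Hello", "", "<|endoftext|>"))  # anas-awadalla/mpt-7b config
```

Note the MPT config uses an empty `prefix_token`, so nothing is prepended before the prompt.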
src/helm/proxy/clients/vision_language/open_flamingo/__init__.py (2 additions):

```python
from .src.flamingo import Flamingo
from .src.factory import create_model_and_transforms
```
Empty file.
src/helm/proxy/clients/vision_language/open_flamingo/src/factory.py (147 additions):
@@ -0,0 +1,147 @@ | ||
""" | ||
Source: https://github.com/mlfoundations/open_flamingo | ||
""" | ||
|
||
from typing import Optional | ||
|
||
from transformers import AutoModelForCausalLM, AutoTokenizer | ||
|
||
from helm.common.general import handle_module_not_found_error | ||
from .flamingo import Flamingo | ||
from .flamingo_lm import FlamingoLMMixin | ||
from .utils import extend_instance | ||
|
||
|
||
def create_model_and_transforms( | ||
clip_vision_encoder_path: str, | ||
clip_vision_encoder_pretrained: str, | ||
lang_encoder_path: str, | ||
tokenizer_path: str, | ||
cross_attn_every_n_layers: int = 1, | ||
use_local_files: bool = False, | ||
decoder_layers_attr_name: str = None, | ||
freeze_lm_embeddings: bool = False, | ||
cache_dir: Optional[str] = None, | ||
**flamingo_kwargs, | ||
): | ||
""" | ||
Initialize a Flamingo model from a pretrained vision encoder and language encoder. | ||
Appends special tokens to the tokenizer and freezes backbones. | ||
|
||
Args: | ||
clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32") | ||
clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k") | ||
lang_encoder_path (str): path to pretrained language encoder | ||
tokenizer_path (str): path to pretrained tokenizer | ||
cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1. | ||
use_local_files (bool, optional): whether to use local files. Defaults to False. | ||
decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None. | ||
freeze_lm_embeddings (bool, optional): whether to freeze LM input embeddings when configuring Perceiver. | ||
cache_dir (str, optional): path to cache directory for downloading OpenClip/HF weights. | ||
Returns: | ||
Flamingo: Flamingo model from pretrained vision and language encoders | ||
Image processor: Pipeline to preprocess input images | ||
Tokenizer: A tokenizer for the language model | ||
""" | ||
try: | ||
import open_clip | ||
except ModuleNotFoundError as e: | ||
handle_module_not_found_error(e, ["vlm"]) | ||
|
||
vision_encoder, _, image_processor = open_clip.create_model_and_transforms( | ||
clip_vision_encoder_path, | ||
pretrained=clip_vision_encoder_pretrained, | ||
cache_dir=cache_dir, | ||
) | ||
# set the vision encoder to output the visual features | ||
vision_encoder.visual.output_tokens = True | ||
|
||
text_tokenizer = AutoTokenizer.from_pretrained( | ||
tokenizer_path, | ||
local_files_only=use_local_files, | ||
trust_remote_code=True, | ||
cache_dir=cache_dir, | ||
) | ||
# add Flamingo special tokens to the tokenizer | ||
text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]}) | ||
if text_tokenizer.pad_token is None: | ||
# Issue: GPT models don't have a pad token, which we use to | ||
# modify labels for the loss. | ||
text_tokenizer.add_special_tokens({"pad_token": "<PAD>"}) | ||
|
||
lang_encoder = AutoModelForCausalLM.from_pretrained( | ||
lang_encoder_path, | ||
local_files_only=use_local_files, | ||
trust_remote_code=True, | ||
cache_dir=cache_dir, | ||
) | ||
|
||
# hacks for MPT-1B, which doesn't have a get_input_embeddings method | ||
if "mpt-1b-redpajama-200b" in lang_encoder_path: | ||
|
||
class EmbeddingFnMixin: | ||
def get_input_embeddings(self): | ||
return self.transformer.wte | ||
|
||
def set_input_embeddings(self, new_embeddings): | ||
self.transformer.wte = new_embeddings | ||
|
||
extend_instance(lang_encoder, EmbeddingFnMixin) | ||
|
||
# convert LM to FlamingoLM | ||
extend_instance(lang_encoder, FlamingoLMMixin) | ||
|
||
if decoder_layers_attr_name is None: | ||
decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder) | ||
lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name) | ||
lang_encoder.resize_token_embeddings(len(text_tokenizer)) | ||
|
||
model = Flamingo( | ||
vision_encoder, | ||
lang_encoder, | ||
text_tokenizer.encode("<|endofchunk|>")[-1], | ||
text_tokenizer.encode("<image>")[-1], | ||
vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"], | ||
cross_attn_every_n_layers=cross_attn_every_n_layers, | ||
**flamingo_kwargs, | ||
) | ||
|
||
# Freeze all parameters | ||
model.requires_grad_(False) | ||
assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0 | ||
|
||
# Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings | ||
model.perceiver.requires_grad_(True) | ||
model.lang_encoder.gated_cross_attn_layers.requires_grad_(True) | ||
if not freeze_lm_embeddings: | ||
model.lang_encoder.get_input_embeddings().requires_grad_(True) | ||
# TODO: investigate also training the output embeddings when untied | ||
|
||
print( | ||
f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters" | ||
) | ||
|
||
return model, image_processor, text_tokenizer | ||
|
||
|
||
def _infer_decoder_layers_attr_name(model): | ||
for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES: | ||
if k.lower() in model.__class__.__name__.lower(): | ||
return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k] | ||
|
||
raise ValueError( | ||
"We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. " | ||
"Please supply this string manually." | ||
) | ||
|
||
|
||
__KNOWN_DECODER_LAYERS_ATTR_NAMES = { | ||
"opt": "model.decoder.layers", | ||
"gptj": "transformer.h", | ||
"gpt-j": "transformer.h", | ||
"pythia": "gpt_neox.layers", | ||
"llama": "model.layers", | ||
"gptneoxforcausallm": "gpt_neox.layers", | ||
"mpt": "transformer.blocks", | ||
"mosaicgpt": "transformer.blocks", | ||
} |
Review comment: "Why did you revert this? This is going to break Llava"
Reply: Thanks for the review! This was a bit tricky. OpenFlamingo seems to require 4.32.0; I tried 4.36.0 but encountered errors like `ImportError: cannot import name '_expand_mask' from 'transformers.models.bloom.modeling_bloom'` (similar to salesforce/LAVIS#571).
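Given that observation (4.32.0 works, 4.36.0 breaks), one option is to fail fast with a version guard. The sketch below is hypothetical, not part of the PR: the floor comes from the setup.cfg change, and the ceiling is an assumption taken from the observed `_expand_mask` ImportError on 4.36.0:

```python
# Hypothetical guard (not in the PR): accept transformers versions in
# [4.28.0, 4.36.0). Uses the `packaging` library, assumed installed.
from packaging.version import Version

def transformers_ok(installed: str, floor: str = "4.28.0", first_bad: str = "4.36.0") -> bool:
    """True if `installed` falls in the half-open range [floor, first_bad)."""
    v = Version(installed)
    return Version(floor) <= v < Version(first_bad)

print(transformers_ok("4.32.0"))  # True: the version OpenFlamingo tolerates
print(transformers_ok("4.36.0"))  # False: the version that raised ImportError
```

An explicit ceiling in setup.cfg (e.g. `<4.36.0`) would encode the same constraint declaratively, at the cost of blocking future fixes; the bare `>=4.28.0` floor in the diff leaves that decision to the installer.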