Add OpenFlamingo (#2237)

Co-authored-by: Tony Lee <[email protected]> Co-authored-by: JosselinSomervilleRoberts <[email protected]>
stanford-crfm · Mar 4, 2024 · 481d12e · 481d12e
1 parent 5004acf
commit 481d12e
Showing 17 changed files with 1,203 additions and 3 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -30,6 +30,10 @@ certifi==2024.2.2
 cffi==1.16.0
 cfgv==3.4.0
 charset-normalizer==2.1.1
+Cython==0.29.32
+einops-exts==0.0.4
+emoji==2.1.0
+et-xmlfile==1.1.0
 chex==0.1.7
 click==8.1.7
 clip-anytorch==2.5.2
@@ -145,6 +149,10 @@ nltk==3.8.1
 nodeenv==1.8.0
 NudeNet==2.0.9
 numba==0.56.4
+open-clip-torch==2.24.0
+openpyxl==3.0.10
+outcome==1.2.0
+pathy==0.10.2
 numpy==1.23.5
 oauthlib==3.2.2
 omegaconf==2.3.0
@@ -253,6 +261,15 @@ timm==0.6.13
 tokenizers==0.15.2
 toml==0.10.2
 tomli==2.0.1
+trio==0.22.0
+trio-websocket==0.9.2
+types-Pillow==9.3.0.4
+types-pytz==2022.4.0.0
+types-redis==4.3.21.1
+types-requests==2.28.11.2
+types-tabulate==0.9.0.0
+types-urllib3==1.26.25
+typing==3.7.4.3
 toolz==0.12.1
 torch~=2.1.2
 torch-fidelity==0.3.0
@@ -281,3 +298,4 @@ xxhash==3.4.1
 yarl==1.9.4
 zipp==3.17.0
 zstandard==0.18.0
+fairlearn==0.9.0
diff --git a/setup.cfg b/setup.cfg
@@ -148,9 +148,14 @@ models =
     crfm-helm[yandex]
 
 vlm =
+    # For OpenFlamingo
+    einops~=0.7.0
+    einops-exts~=0.0.4
+    open-clip-torch~=2.24.0
+
     # VLM models
     crfm-helm[openai]
-    torch~=2.1.2    # For IDEFICS
+    torch~=2.1.2 # For IDEFICS
 
     # VLM scenarios
     crfm-helm[images]
@@ -178,7 +183,7 @@ heim =
     crfm-helm[openai]
 
     # For model, kakaobrain/mindall-e
-    einops~=0.6.0
+    einops~=0.7.0
     omegaconf~=2.3.0
     pytorch-lightning~=2.0.5
 
@@ -259,6 +264,7 @@ exclude =
     venv/*
     src/helm/clients/image_generation/dalle_mini/*
     src/helm/clients/image_generation/mindalle/*
+    src/helm/clients/vision_language/open_flamingo/*
 
 # Ignore completely:
 # E203 - White space before ':', (conflicts with black)
@@ -276,7 +282,7 @@ check_untyped_defs = True
 disable_error_code = annotation-unchecked
 # TODO: Change disallow_untyped_defs to True
 disallow_untyped_defs = False
-exclude = dalle_mini|mindalle
+exclude = dalle_mini|mindalle|open_flamingo
 
 [tool:pytest]
 addopts =

diff --git a/src/helm/benchmark/model_metadata_registry.py b/src/helm/benchmark/model_metadata_registry.py
@@ -61,13 +61,17 @@
 IDEFICS_MODEL_TAG: str = "IDEFICS_MODEL_TAG"
 # Llava should use a special prompt format (see `LlavaRunExpander`)
 LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"
+# OpenFlamingo has a special prompt format (see `OpenFlamingoRunExpander`)
+OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 # Some VLMs do not support multiple images in the prompt
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
 
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
+
+
 @dataclass(frozen=False)
 class ModelMetadata:
     name: str

diff --git a/src/helm/benchmark/presentation/run_specs_image2structure.conf b/src/helm/benchmark/presentation/run_specs_image2structure.conf
@@ -10,6 +10,11 @@ entries: [
     # sheetmusic2lilypond
     {description: "sheetmusic2lilypond:model=vlm", priority: 1}
 
+    # webpages
+    {description: "image2webpage:subset=css,model=vlm", priority: 1, groups: ["image2webpage"]}
+    {description: "image2webpage:subset=html,model=vlm", priority: 1, groups: ["image2webpage"]}
+    {description: "image2webpage:subset=javascript,model=vlm", priority: 1, groups: ["image2webpage"]}
+
     # chart2csv
     # {description: "chart2csv:model=vlm", priority: 1}
 ]
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
@@ -447,6 +447,26 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         ]
 
 
+class OpenFlamingoRunExpander(RunExpander):
+    """
+    Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    """
+
+    name = "open_flamingo"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
+                ),
+            ),
+        ]
+
+
 class FormatPromptRunExpander(RunExpander):
     """Adds a prefix and suffix to the prompt."""
 

diff --git a/src/helm/benchmark/run_spec_factory.py b/src/helm/benchmark/run_spec_factory.py
@@ -17,6 +17,7 @@
     GOOGLE_PALM_2_MODEL_TAG,
     IDEFICS_INSTRUCT_MODEL_TAG,
     LLAVA_MODEL_TAG,
+    OPEN_FLAMINGO_MODEL_TAG,
     NLG_PREFIX_TAG,
     NO_NEWLINES_TAG,
     OPENAI_CHATGPT_MODEL_TAG,
@@ -33,6 +34,7 @@
     IDEFICSInstructRunExpander,
     IncreaseTemperatureRunExpander,
     LlavaRunExpander,
+    OpenFlamingoRunExpander,
     OpenAIRunExpander,
     MistralRunExpander,
     StopRunExpander,
@@ -147,6 +149,10 @@ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
         if LLAVA_MODEL_TAG in model.tags:
             run_spec = singleton(LlavaRunExpander().expand(run_spec))
 
+        # OpenFlamingo
+        if OPEN_FLAMINGO_MODEL_TAG in model.tags:
+            run_spec = singleton(OpenFlamingoRunExpander().expand(run_spec))
+
         # For multiple choice
         if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
             increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)

diff --git a/src/helm/clients/vision_language/open_flamingo/__init__.py b/src/helm/clients/vision_language/open_flamingo/__init__.py
@@ -0,0 +1,2 @@
+from .src.flamingo import Flamingo
+from .src.factory import create_model_and_transforms
diff --git a/src/helm/clients/vision_language/open_flamingo/src/__init__.py b/src/helm/clients/vision_language/open_flamingo/src/__init__.py
diff --git a/src/helm/clients/vision_language/open_flamingo/src/factory.py b/src/helm/clients/vision_language/open_flamingo/src/factory.py
@@ -0,0 +1,147 @@
+"""
+Source: https://github.com/mlfoundations/open_flamingo
+"""
+
+from typing import Optional
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from helm.common.general import handle_module_not_found_error
+from .flamingo import Flamingo
+from .flamingo_lm import FlamingoLMMixin
+from .utils import extend_instance
+
+
+def create_model_and_transforms(
+    clip_vision_encoder_path: str,
+    clip_vision_encoder_pretrained: str,
+    lang_encoder_path: str,
+    tokenizer_path: str,
+    cross_attn_every_n_layers: int = 1,
+    use_local_files: bool = False,
+    decoder_layers_attr_name: str = None,
+    freeze_lm_embeddings: bool = False,
+    cache_dir: Optional[str] = None,
+    **flamingo_kwargs,
+):
+    """
+    Initialize a Flamingo model from a pretrained vision encoder and language encoder.
+    Appends special tokens to the tokenizer and freezes backbones.
+
+    Args:
+        clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32")
+        clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k")
+        lang_encoder_path (str): path to pretrained language encoder
+        tokenizer_path (str): path to pretrained tokenizer
+        cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
+        use_local_files (bool, optional): whether to use local files. Defaults to False.
+        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
+        freeze_lm_embeddings (bool, optional): whether to freeze LM input embeddings when configuring Perceiver.
+        cache_dir (str, optional): path to cache directory for downloading OpenClip/HF weights.
+    Returns:
+        Flamingo: Flamingo model from pretrained vision and language encoders
+        Image processor: Pipeline to preprocess input images
+        Tokenizer: A tokenizer for the language model
+    """
+    try:
+        import open_clip
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+
+    vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
+        clip_vision_encoder_path,
+        pretrained=clip_vision_encoder_pretrained,
+        cache_dir=cache_dir,
+    )
+    # set the vision encoder to output the visual features
+    vision_encoder.visual.output_tokens = True
+
+    text_tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer_path,
+        local_files_only=use_local_files,
+        trust_remote_code=True,
+        cache_dir=cache_dir,
+    )
+    # add Flamingo special tokens to the tokenizer
+    text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
+    if text_tokenizer.pad_token is None:
+        # Issue: GPT models don't have a pad token, which we use to
+        # modify labels for the loss.
+        text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+
+    lang_encoder = AutoModelForCausalLM.from_pretrained(
+        lang_encoder_path,
+        local_files_only=use_local_files,
+        trust_remote_code=True,
+        cache_dir=cache_dir,
+    )
+
+    # hacks for MPT-1B, which doesn't have a get_input_embeddings method
+    if "mpt-1b-redpajama-200b" in lang_encoder_path:
+
+        class EmbeddingFnMixin:
+            def get_input_embeddings(self):
+                return self.transformer.wte
+
+            def set_input_embeddings(self, new_embeddings):
+                self.transformer.wte = new_embeddings
+
+        extend_instance(lang_encoder, EmbeddingFnMixin)
+
+    # convert LM to FlamingoLM
+    extend_instance(lang_encoder, FlamingoLMMixin)
+
+    if decoder_layers_attr_name is None:
+        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
+    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
+    lang_encoder.resize_token_embeddings(len(text_tokenizer))
+
+    model = Flamingo(
+        vision_encoder,
+        lang_encoder,
+        text_tokenizer.encode("<|endofchunk|>")[-1],
+        text_tokenizer.encode("<image>")[-1],
+        vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"],
+        cross_attn_every_n_layers=cross_attn_every_n_layers,
+        **flamingo_kwargs,
+    )
+
+    # Freeze all parameters
+    model.requires_grad_(False)
+    assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
+
+    # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
+    model.perceiver.requires_grad_(True)
+    model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
+    if not freeze_lm_embeddings:
+        model.lang_encoder.get_input_embeddings().requires_grad_(True)
+        # TODO: investigate also training the output embeddings when untied
+
+    print(
+        f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
+    )
+
+    return model, image_processor, text_tokenizer
+
+
+def _infer_decoder_layers_attr_name(model):
+    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
+        if k.lower() in model.__class__.__name__.lower():
+            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]
+
+    raise ValueError(
+        "We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. "
+        "Please supply this string manually."
+    )
+
+
+__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
+    "opt": "model.decoder.layers",
+    "gptj": "transformer.h",
+    "gpt-j": "transformer.h",
+    "pythia": "gpt_neox.layers",
+    "llama": "model.layers",
+    "gptneoxforcausallm": "gpt_neox.layers",
+    "mpt": "transformer.blocks",
+    "mosaicgpt": "transformer.blocks",
+}
diff --git a/src/helm/clients/vision_language/open_flamingo/src/flamingo.py b/src/helm/clients/vision_language/open_flamingo/src/flamingo.py
@@ -0,0 +1,337 @@
+"""
+Source: https://github.com/mlfoundations/open_flamingo
+"""
+
+import torch
+from einops import rearrange
+from torch import nn
+from .helpers import PerceiverResampler
+from torch.distributed.fsdp.wrap import (
+    enable_wrap,
+    wrap,
+)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from torch.distributed.fsdp import (
+    FullyShardedDataParallel as FSDP,
+)
+
+from .utils import apply_with_stopping_condition
+
+
+class Flamingo(nn.Module):
+    def __init__(
+        self,
+        vision_encoder: nn.Module,
+        lang_encoder: nn.Module,
+        eoc_token_id: int,
+        media_token_id: int,
+        vis_dim: int,
+        cross_attn_every_n_layers: int = 1,
+        gradient_checkpointing: bool = False,
+    ):
+        """
+        Args:
+            vision_encoder (nn.Module): HF CLIPModel
+            lang_encoder (nn.Module): HF causal language model
+            eoc_token_id (int): Token id for <|endofchunk|>
+            media_token_id (int): Token id for <image>
+            vis_dim (int): Dimension of the visual features.
+                Visual features are projected to match this shape along the last dimension.
+            cross_attn_every_n_layers (int, optional): How often to apply cross attention after transformer layer. Defaults to 1.
+        """
+        super().__init__()
+        self.eoc_token_id = eoc_token_id
+        self.media_token_id = media_token_id
+        self.vis_dim = vis_dim
+        if hasattr(lang_encoder.config, "d_model"):
+            self.lang_dim = lang_encoder.config.d_model  # mpt uses d_model
+        else:
+            self.lang_dim = lang_encoder.config.hidden_size
+
+        self.vision_encoder = vision_encoder.visual
+        self.perceiver = PerceiverResampler(dim=self.vis_dim)
+        self.lang_encoder = lang_encoder
+        self.lang_encoder.init_flamingo(
+            media_token_id=media_token_id,
+            lang_hidden_size=self.lang_dim,
+            vis_hidden_size=self.vis_dim,
+            cross_attn_every_n_layers=cross_attn_every_n_layers,
+            gradient_checkpointing=gradient_checkpointing,
+        )
+        self._use_gradient_checkpointing = gradient_checkpointing
+        self.perceiver._use_gradient_checkpointing = gradient_checkpointing
+
+    def forward(
+        self,
+        vision_x: torch.Tensor,
+        lang_x: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        labels: torch.Tensor = None,
+        clear_conditioned_layers: bool = True,
+        past_key_values=None,
+        use_cache: bool = False,
+    ):
+        """
+        Forward pass of Flamingo.
+
+        Args:
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W) with F=1
+            lang_x (torch.Tensor): Language input ids
+                shape (B, T_txt)
+            attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
+            labels (torch.Tensor, optional): Labels. Defaults to None.
+            clear_conditioned_layers: if True, clear the conditioned layers
+                once the foward pass is completed. Set this to false if the
+                same set of images will be reused in another subsequent
+                forward pass.
+            past_key_values: pre-computed values to pass to language model.
+                See past_key_values documentation in Hugging Face
+                CausalLM models.
+            use_cache: whether to use cached key values. See use_cache
+                documentation in Hugging Face CausalLM models.
+        """
+        assert (
+            self.lang_encoder.initialized_flamingo
+        ), "Flamingo layers are not initialized. Please call `init_flamingo` first."
+
+        assert (
+            self.lang_encoder._use_cached_vision_x or vision_x is not None
+        ), "Must provide either vision_x or have precached media using cache_media()."
+
+        if self.lang_encoder._use_cached_vision_x:
+            # Case: use cached; vision_x should be cached and other
+            # vision-related inputs should not be provided.
+            assert (
+                vision_x is None
+            ), "Expect vision_x to be None when media has been cached using cache_media(). Try uncache_media() first."
+            assert self.lang_encoder.is_conditioned()
+
+        else:
+            # Case: do not use caching (i.e. this is a standard forward pass);
+            self._encode_vision_x(vision_x=vision_x)
+            self._condition_media_locations(input_ids=lang_x)
+
+        output = self.lang_encoder(
+            input_ids=lang_x,
+            attention_mask=attention_mask,
+            labels=labels,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+        )
+
+        if clear_conditioned_layers:
+            self.lang_encoder.clear_conditioned_layers()
+
+        return output
+
+    def generate(
+        self,
+        vision_x: torch.Tensor,
+        lang_x: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        **kwargs,
+    ):
+        """
+        Generate text conditioned on vision and language inputs.
+
+        Args:
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W)
+                images in the same chunk are collated along T_img, and frames are collated along F
+                currently only F=1 is supported (single-frame videos)
+            lang_x (torch.Tensor): Language input
+                shape (B, T_txt)
+            **kwargs: see generate documentation in Hugging Face CausalLM models. Some notable kwargs:
+                max_length (int, optional): Maximum length of the output. Defaults to None.
+                attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
+                num_beams (int, optional): Number of beams. Defaults to 1.
+                max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
+                temperature (float, optional): Temperature. Defaults to 1.0.
+                top_k (int, optional): Top k. Defaults to 50.
+                top_p (float, optional): Top p. Defaults to 1.0.
+                no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
+                length_penalty (float, optional): Length penalty. Defaults to 1.0.
+                num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
+                do_sample (bool, optional): Do sample. Defaults to False.
+                early_stopping (bool, optional): Early stopping. Defaults to False.
+        Returns:
+            torch.Tensor: lang_x with generated tokens appended to it
+        """
+        num_beams = kwargs.pop("num_beams", 1)
+        if num_beams > 1:
+            vision_x = vision_x.repeat_interleave(num_beams, dim=0)
+
+        self.lang_encoder._use_cached_vision_x = True
+        self._encode_vision_x(vision_x=vision_x)
+
+        eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
+        output = self.lang_encoder.generate(
+            input_ids=lang_x,
+            attention_mask=attention_mask,
+            eos_token_id=eos_token_id,
+            num_beams=num_beams,
+            **kwargs,
+        )
+
+        self.lang_encoder.clear_conditioned_layers()
+        self.lang_encoder._use_cached_vision_x = False
+        return output
+
+    def _encode_vision_x(self, vision_x: torch.Tensor):
+        """
+        Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
+        Args:
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W)
+                Images in the same chunk are collated along T_img, and frames are collated along F
+                Currently only F=1 is supported (single-frame videos)
+
+        rearrange code based on https://github.com/dhansmair/flamingo-mini
+        """
+
+        assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
+        b, T, F = vision_x.shape[:3]
+        assert F == 1, "Only single frame supported"
+
+        vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
+        with torch.no_grad():
+            vision_x = self.vision_encoder(vision_x)[1]
+        vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
+        vision_x = self.perceiver(vision_x)
+
+        for layer in self.lang_encoder._get_decoder_layers():
+            layer.condition_vis_x(vision_x)
+
+    def wrap_fsdp(self, wrapper_kwargs, device_id):
+        """
+        Manually wraps submodules for FSDP and move other parameters to device_id.
+
+        Why manually wrap?
+        - all parameters within the FSDP wrapper must have the same requires_grad.
+            We have a mix of frozen and unfrozen parameters.
+        - model.vision_encoder.visual needs to be individually wrapped or encode_vision_x errors
+            See: https://github.com/pytorch/pytorch/issues/82461#issuecomment-1269136344
+
+        The rough wrapping structure is:
+        - FlamingoModel
+            - FSDP(FSDP(vision_encoder))
+            - FSDP(FSDP(perceiver))
+            - lang_encoder
+                - FSDP(FSDP(input_embeddings))
+                - FlamingoLayers
+                    - FSDP(FSDP(gated_cross_attn_layer))
+                    - FSDP(FSDP(decoder_layer))
+                - FSDP(FSDP(output_embeddings))
+                - other parameters
+
+        Known issues:
+        - Our FSDP strategy is not compatible with tied embeddings. If the LM embeddings are tied,
+            train with DDP or set the --freeze_lm_embeddings flag to true.
+        - With FSDP + gradient ckpting, one can increase the batch size with seemingly no upper bound.
+            Although the training curves look okay, we found that downstream performance dramatically
+            degrades if the batch size is unreasonably large (e.g., 100 MMC4 batch size for OPT-125M).
+
+        FAQs about our FSDP wrapping strategy:
+        Why double wrap?
+        As of torch==2.0.1, FSDP's _post_forward_hook and _post_backward_hook
+        only free gathered parameters if the module is NOT FSDP root.
+
+        Why unfreeze the decoder_layers?
+        See https://github.com/pytorch/pytorch/issues/95805
+        As of torch==2.0.1, FSDP's _post_backward_hook is only registed if the flat param
+        requires_grad=True. We need the postback to fire to avoid OOM.
+        To effectively freeze the decoder layers, we exclude them from the optimizer.
+
+        What is assumed to be frozen v. unfrozen?
+        We assume that the model is being trained under normal Flamingo settings
+        with these lines being called in factory.py:
+            ```
+            # Freeze all parameters
+            model.requires_grad_(False)
+            assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
+
+            # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
+            model.perceiver.requires_grad_(True)
+            model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
+            [optional] model.lang_encoder.get_input_embeddings().requires_grad_(True)
+            ```
+        """
+        # unfreeze the decoder layers
+        for block in self.lang_encoder.old_decoder_blocks:
+            block.requires_grad_(True)
+
+        # wrap in FSDP
+        with enable_wrap(wrapper_cls=FSDP, **wrapper_kwargs):
+            self.perceiver = wrap(wrap(self.perceiver))
+            self.lang_encoder.old_decoder_blocks = nn.ModuleList(
+                wrap(wrap(block)) for block in self.lang_encoder.old_decoder_blocks
+            )
+            self.lang_encoder.gated_cross_attn_layers = nn.ModuleList(
+                wrap(wrap(layer)) if layer is not None else None for layer in self.lang_encoder.gated_cross_attn_layers
+            )
+            self.lang_encoder.init_flamingo_layers(self._use_gradient_checkpointing)
+            self.lang_encoder.set_input_embeddings(wrap(wrap(self.lang_encoder.get_input_embeddings())))
+            self.lang_encoder.set_output_embeddings(wrap(wrap(self.lang_encoder.get_output_embeddings())))
+            self.vision_encoder = wrap(wrap(self.vision_encoder))  # frozen
+
+        # manually move non-FSDP managed parameters to device_id
+        # these are all in lang_encoder
+        apply_with_stopping_condition(
+            module=self.lang_encoder,
+            apply_fn=lambda m: m.to(device_id),
+            apply_condition=lambda m: len(list(m.children())) == 0,
+            stopping_condition=lambda m: isinstance(m, FSDP),
+        )
+
+        # exclude the original decoder layers from the optimizer
+        for block in self.lang_encoder.old_decoder_blocks:
+            for p in block.parameters():
+                p.exclude_from_optimizer = True
+
+        # set up clip_grad_norm_ function
+        def clip_grad_norm_(max_norm):
+            self.perceiver.clip_grad_norm_(max_norm)
+            for layer in self.lang_encoder.gated_cross_attn_layers:
+                if layer is not None:
+                    layer.clip_grad_norm_(max_norm)
+            self.lang_encoder.get_input_embeddings().clip_grad_norm_(max_norm)
+
+        self.clip_grad_norm_ = clip_grad_norm_
+
+    def _condition_media_locations(self, input_ids: torch.Tensor):
+        """
+        Compute the media token locations from lang_x and condition the language model on these.
+        Args:
+            input_ids (torch.Tensor): Language input
+                shape (B, T_txt)
+        """
+        media_locations = input_ids == self.media_token_id
+
+        for layer in self.lang_encoder._get_decoder_layers():
+            layer.condition_media_locations(media_locations)
+
+    def cache_media(self, input_ids: torch.Tensor, vision_x: torch.Tensor):
+        """
+        Pre-cache a prompt/sequence of images / text for log-likelihood evaluations.
+        All subsequent calls to forward() will generate attending to the LAST
+        image in vision_x.
+        This is not meant to be used to cache things for generate().
+        Args:
+            input_ids (torch.Tensor): Language input
+                shape (B, T_txt)
+            vision_x (torch.Tensor): Vision input
+                shape (B, T_img, F, C, H, W)
+                Images in the same chunk are collated along T_img, and frames are collated along F
+                Currently only F=1 is supported (single-frame videos)
+        """
+        self._encode_vision_x(vision_x=vision_x)
+        self._condition_media_locations(input_ids=input_ids)
+        self.lang_encoder._use_cached_vision_x = True
+
+    def uncache_media(self):
+        """
+        Clear all conditioning.
+        """
+        self.lang_encoder.clear_conditioned_layers()
+        self.lang_encoder._use_cached_vision_x = False
diff --git a/src/helm/clients/vision_language/open_flamingo/src/flamingo_lm.py b/src/helm/clients/vision_language/open_flamingo/src/flamingo_lm.py
@@ -0,0 +1,153 @@
+"""
+Source: https://github.com/mlfoundations/open_flamingo
+"""
+
+import torch.nn as nn
+from .helpers import GatedCrossAttentionBlock
+from .utils import getattr_recursive, setattr_recursive
+
+
+class FlamingoLayer(nn.Module):
+    """
+    FlamingoLayer is a wrapper around the GatedCrossAttentionBlock and DecoderLayer.
+    """
+
+    def __init__(self, gated_cross_attn_layer, decoder_layer, gradient_checkpointing=False):
+        super().__init__()
+        self.gated_cross_attn_layer = gated_cross_attn_layer
+        self.decoder_layer = decoder_layer
+        self.vis_x = None
+        self.media_locations = None
+        if self.gated_cross_attn_layer is not None:
+            self.gated_cross_attn_layer._use_gradient_checkpointing = gradient_checkpointing
+        self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing
+
+    def is_conditioned(self) -> bool:
+        """Check whether the layer is conditioned."""
+        return self.vis_x is not None and self.media_locations is not None
+
+    # Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
+    def condition_vis_x(self, vis_x):
+        self.vis_x = vis_x
+
+    def condition_media_locations(self, media_locations):
+        self.media_locations = media_locations
+
+    def condition_use_cached_media(self, use_cached_media):
+        self.use_cached_media = use_cached_media
+
+    def forward(
+        self,
+        lang_x,
+        attention_mask=None,
+        **decoder_layer_kwargs,
+    ):
+        # Cross attention
+        if self.gated_cross_attn_layer is not None:
+            if self.vis_x is None:
+                raise ValueError("vis_x must be conditioned before forward pass")
+
+            if self.media_locations is None:
+                raise ValueError("media_locations must be conditioned before forward pass")
+
+            lang_x = self.gated_cross_attn_layer(
+                lang_x,
+                self.vis_x,
+                media_locations=self.media_locations,
+                use_cached_media=self.use_cached_media,
+            )
+
+        # Normal decoder layer
+        lang_x = self.decoder_layer(lang_x, attention_mask=attention_mask, **decoder_layer_kwargs)
+        return lang_x
+
+
+class FlamingoLMMixin(nn.Module):
+    """
+    Mixin to add cross-attention layers to a language model.
+    """
+
+    def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
+        self.decoder_layers_attr_name = decoder_layers_attr_name
+
+    def _get_decoder_layers(self):
+        return getattr_recursive(self, self.decoder_layers_attr_name)
+
+    def _set_decoder_layers(self, value):
+        setattr_recursive(self, self.decoder_layers_attr_name, value)
+
+    def init_flamingo(
+        self,
+        media_token_id,
+        lang_hidden_size,
+        vis_hidden_size,
+        cross_attn_every_n_layers,
+        gradient_checkpointing,
+    ):
+        """
+        Initialize Flamingo by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations.
+        """
+        self.old_decoder_blocks = self._get_decoder_layers()
+        self.gated_cross_attn_layers = nn.ModuleList(
+            [
+                GatedCrossAttentionBlock(dim=lang_hidden_size, dim_visual=vis_hidden_size)
+                if (layer_idx + 1) % cross_attn_every_n_layers == 0
+                else None
+                for layer_idx, _ in enumerate(self._get_decoder_layers())
+            ]
+        )
+        self.init_flamingo_layers(gradient_checkpointing)
+        self.media_token_id = media_token_id
+        self.initialized_flamingo = True
+        self._use_cached_vision_x = False
+
+    def init_flamingo_layers(self, gradient_checkpointing):
+        """
+        Re initializes the FlamingoLayers.
+        Propagates any changes made to self.gated_corss_attn_layers or self.old_decoder_blocks
+        """
+        self._set_decoder_layers(
+            nn.ModuleList(
+                [
+                    FlamingoLayer(gated_cross_attn_layer, decoder_layer, gradient_checkpointing)
+                    for gated_cross_attn_layer, decoder_layer in zip(
+                        self.gated_cross_attn_layers, self.old_decoder_blocks
+                    )
+                ]
+            )
+        )
+
+    def forward(self, input_ids, attention_mask, **kwargs):
+        """Condition the Flamingo layers on the media locations before forward()"""
+        if not self.initialized_flamingo:
+            raise ValueError("Flamingo layers are not initialized. Please call `init_flamingo` first.")
+
+        media_locations = input_ids == self.media_token_id
+
+        # if there are media already cached and we're generating and there are no media tokens in the input,
+        # we'll assume that ALL input tokens should attend to the last previous media that is cached.
+        # this is especially important for HF generate() compatibility, since generate() calls forward()
+        # repeatedly one token at a time (with no media tokens).
+        # without this check, the model would not attend to any images when generating (after the first token)
+        use_cached_media_locations = self._use_cached_vision_x and self.is_conditioned() and not media_locations.any()
+
+        for layer in self._get_decoder_layers():
+            if not use_cached_media_locations:
+                layer.condition_media_locations(media_locations)
+            layer.condition_use_cached_media(use_cached_media_locations)
+
+        # package arguments for the other parent's forward. since we don't know the order of the arguments,
+        # make them all kwargs
+        kwargs["input_ids"] = input_ids
+        kwargs["attention_mask"] = attention_mask
+        return super().forward(**kwargs)  # Call the other parent's forward method
+
+    def is_conditioned(self) -> bool:
+        """Check whether all decoder layers are already conditioned."""
+        return all(l.is_conditioned() for l in self._get_decoder_layers())
+
+    def clear_conditioned_layers(self):
+        for layer in self._get_decoder_layers():
+            layer.condition_vis_x(None)
+            layer.condition_media_locations(None)
+            layer.condition_use_cached_media(None)
diff --git a/src/helm/clients/vision_language/open_flamingo/src/helpers.py b/src/helm/clients/vision_language/open_flamingo/src/helpers.py
@@ -0,0 +1,267 @@
+"""
+Based on: https://github.com/lucidrains/flamingo-pytorch
+"""
+
+import torch
+from einops import rearrange, repeat
+from einops_exts import rearrange_many
+from torch import einsum, nn
+
+
+def exists(val):
+    return val is not None
+
+
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, T, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, T, n2, D)
+        """
+        x = self.norm_media(x)
+        latents = self.norm_latents(latents)
+
+        h = self.heads
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+        q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
+        q = q * self.scale
+
+        # attention
+        sim = einsum("... i d, ... j d  -> ... i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        out = einsum("... i j, ... j d -> ... i d", attn, v)
+        out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
+        return self.to_out(out)
+
+
+class PerceiverResampler(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        depth=6,
+        dim_head=64,
+        heads=8,
+        num_latents=64,
+        max_num_media=None,
+        max_num_frames=None,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(num_latents, dim))
+        self.frame_embs = nn.Parameter(torch.randn(max_num_frames, dim)) if exists(max_num_frames) else None
+        self.media_time_embs = nn.Parameter(torch.randn(max_num_media, 1, dim)) if exists(max_num_media) else None
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, T, F, v, D)
+        Returns:
+            shape (b, T, n, D) where n is self.num_latents
+        """
+        b, T, F, v = x.shape[:4]
+
+        # frame and media time embeddings
+        if exists(self.frame_embs):
+            frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
+            x = x + frame_embs
+        x = rearrange(x, "b T F v d -> b T (F v) d")  # flatten the frame and spatial dimensions
+        if exists(self.media_time_embs):
+            x = x + self.media_time_embs[:T]
+
+        # blocks
+        latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        return self.norm(latents)
+
+
+# gated cross attention
+class MaskedCrossAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_visual,
+        dim_head=64,
+        heads=8,
+        only_attend_immediate_media=True,
+    ):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+        # whether for text to only attend to immediate preceding image, or all previous images
+        self.only_attend_immediate_media = only_attend_immediate_media
+
+    def forward(self, x, media, media_locations=None, use_cached_media=False):
+        """
+        Args:
+            x (torch.Tensor): text features
+                shape (B, T_txt, D_txt)
+            media (torch.Tensor): image features
+                shape (B, T_img, n, D_img) where n is the dim of the latents
+            media_locations: boolean mask identifying the media tokens in x
+                shape (B, T_txt)
+            use_cached_media: bool
+                If true, treat all of x as if they occur after the last media
+                registered in media_locations. T_txt does not need to exactly
+                equal media_locations.shape[1] in this case
+        """
+
+        if not use_cached_media:
+            assert (
+                media_locations.shape[1] == x.shape[1]
+            ), f"media_location.shape is {media_locations.shape} but x.shape is {x.shape}"
+
+        T_txt = x.shape[1]
+        _, T_img, n = media.shape[:3]
+        h = self.heads
+
+        x = self.norm(x)
+
+        q = self.to_q(x)
+        media = rearrange(media, "b t n d -> b (t n) d")
+
+        k, v = self.to_kv(media).chunk(2, dim=-1)
+        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
+
+        q = q * self.scale
+
+        sim = einsum("... i d, ... j d -> ... i j", q, k)
+
+        if exists(media_locations):
+            media_time = torch.arange(T_img, device=x.device) + 1
+
+            if use_cached_media:
+                # text time is set to the last cached media location
+                text_time = repeat(
+                    torch.count_nonzero(media_locations, dim=1),
+                    "b -> b i",
+                    i=T_txt,
+                )
+            else:
+                # at each boolean of True, increment the time counter (relative to media time)
+                text_time = media_locations.cumsum(dim=-1)
+
+            # text time must equal media time if only attending to most immediate image
+            # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
+            mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
+
+            text_to_media_mask = mask_op(
+                rearrange(text_time, "b i -> b 1 i 1"),
+                repeat(media_time, "j -> 1 1 1 (j n)", n=n),
+            )
+            sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
+
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        if exists(media_locations) and self.only_attend_immediate_media:
+            # any text without a preceding media needs to have attention zeroed out
+            text_without_media_mask = text_time == 0
+            text_without_media_mask = rearrange(text_without_media_mask, "b i -> b 1 i 1")
+            attn = attn.masked_fill(text_without_media_mask, 0.0)
+
+        out = einsum("... i j, ... j d -> ... i d", attn, v)
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+
+
+class GatedCrossAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_visual,
+        dim_head=64,
+        heads=8,
+        ff_mult=4,
+        only_attend_immediate_media=True,
+    ):
+        super().__init__()
+        self.attn = MaskedCrossAttention(
+            dim=dim,
+            dim_visual=dim_visual,
+            dim_head=dim_head,
+            heads=heads,
+            only_attend_immediate_media=only_attend_immediate_media,
+        )
+        self.attn_gate = nn.Parameter(torch.tensor([0.0]))
+
+        self.ff = FeedForward(dim, mult=ff_mult)
+        self.ff_gate = nn.Parameter(torch.tensor([0.0]))
+
+    def forward(
+        self,
+        x,
+        media,
+        media_locations=None,
+        use_cached_media=False,
+    ):
+        x = (
+            self.attn(
+                x,
+                media,
+                media_locations=media_locations,
+                use_cached_media=use_cached_media,
+            )
+            * self.attn_gate.tanh()
+            + x
+        )
+        x = self.ff(x) * self.ff_gate.tanh() + x
+
+        return x
diff --git a/src/helm/clients/vision_language/open_flamingo/src/utils.py b/src/helm/clients/vision_language/open_flamingo/src/utils.py
@@ -0,0 +1,47 @@
+"""
+Source: https://github.com/mlfoundations/open_flamingo
+"""
+
+
+def extend_instance(obj, mixin):
+    """Apply mixins to a class instance after creation"""
+    base_cls = obj.__class__
+    base_cls_name = obj.__class__.__name__
+    obj.__class__ = type(
+        base_cls_name, (mixin, base_cls), {}
+    )  # mixin needs to go first for our forward() logic to work
+
+
+def getattr_recursive(obj, att):
+    """
+    Return nested attribute of obj
+    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
+    """
+    if att == "":
+        return obj
+    i = att.find(".")
+    if i < 0:
+        return getattr(obj, att)
+    else:
+        return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
+
+
+def setattr_recursive(obj, att, val):
+    """
+    Set nested attribute of obj
+    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
+    """
+    if "." in att:
+        obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
+    setattr(obj, att.split(".")[-1], val)
+
+
+def apply_with_stopping_condition(module, apply_fn, apply_condition=None, stopping_condition=None, **other_args):
+    if stopping_condition(module):
+        return
+    if apply_condition(module):
+        apply_fn(module, **other_args)
+    for child in module.children():
+        apply_with_stopping_condition(
+            child, apply_fn, apply_condition=apply_condition, stopping_condition=stopping_condition, **other_args
+        )
diff --git a/src/helm/clients/vision_language/open_flamingo_client.py b/src/helm/clients/vision_language/open_flamingo_client.py
@@ -0,0 +1,160 @@
+from threading import Lock
+from typing import List, Optional, Tuple
+
+import torch
+from huggingface_hub import hf_hub_download
+
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.images_utils import open_image
+from helm.common.gpu_utils import get_torch_device_name
+from helm.common.media_object import TEXT_TYPE
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.request import Request, RequestResult, Sequence, Token
+from helm.common.request import wrap_request_time
+from helm.clients.vision_language.open_flamingo import create_model_and_transforms
+from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
+
+try:
+    from PIL import Image
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["images"])
+
+
+class OpenFlamingoClient(CachingClient):
+    """
+    OpenFlamingo is an open source implementation of DeepMind's Flamingo models.
+    Implementation following:
+        https://github.com/mlfoundations/open_flamingo
+        https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    """
+
+    END_OF_CHUNK_TOKEN: str = "<|endofchunk|>"
+    IMAGE_TOKEN: str = "<image>"
+
+    _model_lock: Lock = Lock()
+
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        checkpoint_path: Optional[str] = None,
+        tokenizer_name: Optional[str] = None,
+        cross_attn_every_n_layers: int = 4,
+    ):
+        super().__init__(cache_config)
+        self._device: str = get_torch_device_name()
+        self._checkpoint_path: Optional[str] = checkpoint_path
+        self._tokenizer_name: Optional[str] = tokenizer_name
+        self._cross_attn_every_n_layers: int = cross_attn_every_n_layers
+
+        # Model
+        # The model is only initialized when the first request is made
+        # This is to avoid loading the model if it is not used
+        self._model: Optional[torch.nn.Module] = None
+
+    def _get_model(self):
+        if not self._checkpoint_path:
+            raise ValueError("OpenFlamingoClient requires a checkpoint path")
+        if not self._tokenizer_name:
+            raise ValueError("OpenFlamingoClient requires a tokenizer name")
+        with htrack_block("Initializing OpenFlamingo model"):
+            with self._model_lock:
+                self._model, self.image_processor, self.tokenizer = create_model_and_transforms(
+                    clip_vision_encoder_path="ViT-L-14",
+                    clip_vision_encoder_pretrained="openai",
+                    lang_encoder_path=self._tokenizer_name,
+                    tokenizer_path=self._tokenizer_name,
+                    cross_attn_every_n_layers=self._cross_attn_every_n_layers,
+                )
+                self.tokenizer.padding_side = "left"
+                checkpoint_path = hf_hub_download(self._checkpoint_path, "checkpoint.pt")
+                self._model.load_state_dict(torch.load(checkpoint_path), strict=False)
+                self._model = self._model.to(self._device)
+                hlog(f"Loaded model to {self._device}.")
+
+    def make_request(self, request: Request) -> RequestResult:
+        assert request.multimodal_prompt is not None, "Multimodal prompt is required"
+
+        # Load model if needed
+        if self._model is None:
+            self._get_model()
+
+        # Build the prompt
+        prompt_text: str = ""
+        images: List[Image.Image] = []
+        for media_object in request.multimodal_prompt.media_objects:
+            if media_object.is_type("image") and media_object.location:
+                images.append(open_image(media_object.location))
+                prompt_text += self.IMAGE_TOKEN
+            elif media_object.is_type(TEXT_TYPE):
+                if media_object.text is None:
+                    raise ValueError("MediaObject of text type has missing text field value")
+                prompt_text += media_object.text
+            else:
+                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+
+        # Preprocess
+        vision_x: torch.Tensor = torch.cat([self.image_processor(image).unsqueeze(0) for image in images], dim=0)
+        vision_x = vision_x.unsqueeze(1).unsqueeze(0)
+
+        lang_x = self.tokenizer(
+            [prompt_text],
+            return_tensors="pt",
+        )
+
+        # Generate
+        try:
+            generation_args = {
+                "max_new_tokens": request.max_tokens,
+                "num_beams": request.num_completions,
+                "n": request.num_completions,
+            }
+
+            def do_it():
+                tensors = self._model.generate(
+                    vision_x=vision_x.to(self._device),
+                    lang_x=lang_x["input_ids"].to(self._device),
+                    attention_mask=lang_x["attention_mask"].to(self._device),
+                    max_new_tokens=generation_args["max_new_tokens"],
+                    num_beams=generation_args["num_beams"],
+                    num_return_sequences=generation_args["num_beams"],
+                )
+                generated_completions: List[Tuple[str, List[str]]] = []
+                for tensor in tensors:
+                    generated_text: str = self.tokenizer.decode(tensor)
+                    assert generated_text.startswith(
+                        prompt_text
+                    ), f"Generated text: {generated_text} does not start with prompt: {prompt_text}"
+
+                    # Remove the prompt from the generated text
+                    generated_text = generated_text[len(prompt_text) :].replace(self.END_OF_CHUNK_TOKEN, "").strip()
+                    raw_tokens: List[str] = self.tokenizer.tokenize(generated_text)
+                    generated_completions.append((generated_text, raw_tokens))
+
+                return {"output": generated_completions}
+
+            cache_key = CachingClient.make_cache_key(
+                raw_request={
+                    "model": request.model,
+                    "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
+                    **generation_args,
+                },
+                request=request,
+            )
+            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        except RuntimeError as e:
+            return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
+
+        completions: List[Sequence] = []
+        for text, tokens in result["output"]:
+            completions.append(
+                Sequence(text=text, logprob=0, tokens=[Token(text=token, logprob=0) for token in tokens])
+            )
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=result["request_time"],
+            completions=completions,
+            embedding=[],
+        )
diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
@@ -629,6 +629,18 @@ model_deployments:
     max_sequence_length: 2048
     client_spec:
       class_name: "helm.clients.vision_language.huggingface_vlm_client.HuggingFaceVLMClient"
+
+  ## OpenFlamingo
+  - name: openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    model_name: openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    tokenizer_name: anas-awadalla/mpt-7b
+    max_sequence_length: 2048
+    client_spec:
+      class_name: "helm.clients.vision_language.open_flamingo_client.OpenFlamingoClient"
+      args:
+        checkpoint_path: "openflamingo/OpenFlamingo-9B-vitl-mpt7b"
+        tokenizer_name: "anas-awadalla-2/mpt-7b"
+        cross_attn_every_n_layers: 4
 
   - name: together/phi-2
     model_name: microsoft/phi-2

diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
@@ -1146,6 +1146,16 @@ models:
     num_parameters: 13000000000
     release_date: 2023-10-05
     tags: [VISION_LANGUAGE_MODEL_TAG, LLAVA_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
+
+
+  - name: openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    display_name: OpenFlamingo (9B)
+    description: OpenFlamingo is an open source implementation of DeepMind's Flamingo models. This 9B-parameter model uses a CLIP ViT-L/14 vision encoder and MPT-7B language model. ([paper](https://arxiv.org/abs/2308.01390))
+    creator_organization_name: OpenFlamingo
+    access: open
+    num_parameters: 9000000000
+    release_date: 2023-08-02
+    tags: [VISION_LANGUAGE_MODEL_TAG, OPEN_FLAMINGO_MODEL_TAG, LIMITED_FUNCTIONALITY_VLM_TAG]
 
   - name: microsoft/phi-2
     display_name: Phi-2

diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml
@@ -170,6 +170,12 @@ tokenizer_configs:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
     prefix_token: "<s>"
+
+  - name: anas-awadalla/mpt-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
 
   # Huggingface
   - name: huggingface/gpt2
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .src.flamingo import Flamingo
		from .src.factory import create_model_and_transforms