From 2ba111669451fa115cfeae1418982e9c7aab7fcc Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 10:47:00 -0700 Subject: [PATCH 01/26] Add more enum samples Change-Id: I743d5967cc1cc91576b8ddf5a60db1767d94508d --- samples/controlled_generation.py | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/samples/controlled_generation.py b/samples/controlled_generation.py index 042209a72..8f6787676 100644 --- a/samples/controlled_generation.py +++ b/samples/controlled_generation.py @@ -160,6 +160,47 @@ def test_x_enum_raw(self): print(result) # Keyboard # [END x_enum_raw] + def test_x_enum(self): + # [START x_enum] + import enum + + class Choice(enum.Enum): + PERCUSSION = "Percussion" + STRING = "String" + WOODWIND = "Woodwind" + BRASS = "Brass" + KEYBOARD = "Keyboard" + + model = genai.GenerativeModel("gemini-1.5-pro-latest") + + organ = genai.upload_file(media / "organ.jpg") + result = model.generate_content( + ["What kind of instrument is this:", organ], + generation_config=genai.GenerationConfig( + response_mime_type="text/x.enum", response_schema=Choice + ), + ) + print(result) # "Keyboard" + # [END x_enum] + + def test_x_enum_raw(self): + # [START x_enum_raw] + model = genai.GenerativeModel("gemini-1.5-pro-latest") + + organ = genai.upload_file(media / "organ.jpg") + result = model.generate_content( + ["What kind of instrument is this:", organ], + generation_config=genai.GenerationConfig( + response_mime_type="text/x.enum", + response_schema={ + "type": "STRING", + "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"], + }, + ), + ) + print(result) # "Keyboard" + # [END x_enum_raw] + if __name__ == "__main__": absltest.main() From f62d7060691813324d5bd92ee448bfaf3b477d74 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 11:08:48 -0700 Subject: [PATCH 02/26] format Change-Id: I8f6f9389f1cae0a7c934217968d4e2e20bb9590e --- samples/controlled_generation.py | 41 -------------------------------- 1 file changed, 41 deletions(-) diff --git a/samples/controlled_generation.py b/samples/controlled_generation.py index 8f6787676..78c422464 100644 --- a/samples/controlled_generation.py +++ b/samples/controlled_generation.py @@ -119,47 +119,6 @@ def test_json_enum_raw(self): print(result) # "Keyboard" # [END json_enum_raw] - def test_x_enum(self): - # [START x_enum] - import enum - - class Choice(enum.Enum): - PERCUSSION = "Percussion" - STRING = "String" - WOODWIND = "Woodwind" - BRASS = "Brass" - KEYBOARD = "Keyboard" - - model = genai.GenerativeModel("gemini-1.5-pro-latest") - - organ = genai.upload_file(media / "organ.jpg") - result = model.generate_content( - ["What kind of instrument is this:", organ], - generation_config=genai.GenerationConfig( - response_mime_type="text/x.enum", response_schema=Choice - ), - ) - print(result) # Keyboard - # [END x_enum] - - def test_x_enum_raw(self): - # [START x_enum_raw] - model = genai.GenerativeModel("gemini-1.5-pro-latest") - - organ = genai.upload_file(media / "organ.jpg") - result = model.generate_content( - ["What kind of instrument is this:", organ], - generation_config=genai.GenerationConfig( - response_mime_type="text/x.enum", - response_schema={ - "type": "STRING", - "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"], - }, - ), - ) - print(result) # Keyboard - # [END x_enum_raw] - def test_x_enum(self): # [START x_enum] import enum From 9583950c8dc08054157044bf35dea46e5962e458 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 14:05:13 -0700 Subject: 
[PATCH 03/26] From: https://github.com/googleapis/python-aiplatform/tree/v1.65.0/vertexai/vision_models Change-Id: I693579ccf2994212f25d0354d091d3210fbf3212 --- google/generativeai/vision_models/__init__.py | 45 + .../vision_models/_vision_models.py | 1376 +++++++++++++++++ 2 files changed, 1421 insertions(+) create mode 100644 google/generativeai/vision_models/__init__.py create mode 100644 google/generativeai/vision_models/_vision_models.py diff --git a/google/generativeai/vision_models/__init__.py b/google/generativeai/vision_models/__init__.py new file mode 100644 index 000000000..1834b5ceb --- /dev/null +++ b/google/generativeai/vision_models/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Classes for working with vision models.""" + +from vertexai.vision_models._vision_models import ( + GeneratedImage, + Image, + ImageCaptioningModel, + ImageGenerationModel, + ImageGenerationResponse, + ImageQnAModel, + ImageTextModel, + MultiModalEmbeddingModel, + MultiModalEmbeddingResponse, + Video, + VideoEmbedding, + VideoSegmentConfig, +) + +__all__ = [ + "GeneratedImage", + "Image", + "ImageCaptioningModel", + "ImageGenerationModel", + "ImageGenerationResponse", + "ImageQnAModel", + "ImageTextModel", + "MultiModalEmbeddingModel", + "MultiModalEmbeddingResponse", + "Video", + "VideoEmbedding", + "VideoSegmentConfig", +] diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py new file mode 100644 index 000000000..a80f0b2dc --- /dev/null +++ b/google/generativeai/vision_models/_vision_models.py @@ -0,0 +1,1376 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# pylint: disable=bad-continuation, line-too-long, protected-access +"""Classes for working with vision models.""" + +import base64 +import dataclasses +import hashlib +import io +import json +import pathlib +import typing +from typing import Any, Dict, List, Literal, Optional, Union +import urllib + +from google.cloud import storage + +from google.cloud.aiplatform import initializer as aiplatform_initializer +from vertexai._model_garden import _model_garden_models + +# pylint: disable=g-import-not-at-top +try: + from IPython import display as IPython_display +except ImportError: + IPython_display = None + +try: + from PIL import Image as PIL_Image +except ImportError: + PIL_Image = None + + +_SUPPORTED_UPSCALING_SIZES = [2048, 4096] + + +class Image: + """Image.""" + + __module__ = "vertexai.vision_models" + + _loaded_bytes: Optional[bytes] = None + _loaded_image: Optional["PIL_Image.Image"] = None + _gcs_uri: Optional[str] = None + + def __init__( + self, + image_bytes: Optional[bytes] = None, + gcs_uri: Optional[str] = None, + ): + """Creates an `Image` object. + + Args: + image_bytes: Image file bytes. Image can be in PNG or JPEG format. + gcs_uri: Image URI in Google Cloud Storage. + """ + if bool(image_bytes) == bool(gcs_uri): + raise ValueError("Either image_bytes or gcs_uri must be provided.") + + self._image_bytes = image_bytes + self._gcs_uri = gcs_uri + + @staticmethod + def load_from_file(location: str) -> "Image": + """Loads image from local file or Google Cloud Storage. + + Args: + location: Local path or Google Cloud Storage uri from where to load + the image. + + Returns: + Loaded image as an `Image` object. + """ + parsed_url = urllib.parse.urlparse(location) + if ( + parsed_url.scheme == "https" + and parsed_url.netloc == "storage.googleapis.com" + ): + parsed_url = parsed_url._replace( + scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" + ) + location = urllib.parse.urlunparse(parsed_url) + + if parsed_url.scheme == "gs": + return Image(gcs_uri=location) + + # Load image from local path + image_bytes = pathlib.Path(location).read_bytes() + image = Image(image_bytes=image_bytes) + return image + + @property + def _blob(self) -> storage.Blob: + if self._gcs_uri is None: + raise AttributeError("_blob is only supported when gcs_uri is set.") + storage_client = storage.Client( + credentials=aiplatform_initializer.global_config.credentials + ) + blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) + # Needed to populate `blob.content_type` + blob.reload() + return blob + + @property + def _image_bytes(self) -> bytes: + if self._loaded_bytes is None: + self._loaded_bytes = self._blob.download_as_bytes() + return self._loaded_bytes + + @_image_bytes.setter + def _image_bytes(self, value: bytes): + self._loaded_bytes = value + + @property + def _pil_image(self) -> "PIL_Image.Image": + if self._loaded_image is None: + if not PIL_Image: + raise RuntimeError( + "The PIL module is not available. Please install the Pillow package." + ) + self._loaded_image = PIL_Image.open(io.BytesIO(self._image_bytes)) + return self._loaded_image + + @property + def _size(self): + return self._pil_image.size + + @property + def _mime_type(self) -> str: + """Returns the MIME type of the image.""" + if self._gcs_uri: + return self._blob.content_type + if PIL_Image: + return PIL_Image.MIME.get(self._pil_image.format, "image/jpeg") + # Fall back to jpeg + return "image/jpeg" + + def show(self): + """Shows the image. 
+ + This method only works when in a notebook environment. + """ + if PIL_Image and IPython_display: + IPython_display.display(self._pil_image) + + def save(self, location: str): + """Saves image to a file. + + Args: + location: Local path where to save the image. + """ + pathlib.Path(location).write_bytes(self._image_bytes) + + def _as_base64_string(self) -> str: + """Encodes image using the base64 encoding. + + Returns: + Base64 encoding of the image as a string. + """ + # ! b64encode returns `bytes` object, not `str`. + # We need to convert `bytes` to `str`, otherwise we get service error: + # "received initial metadata size exceeds limit" + return base64.b64encode(self._image_bytes).decode("ascii") + + +class Video: + """Video.""" + + __module__ = "vertexai.vision_models" + + _loaded_bytes: Optional[bytes] = None + _gcs_uri: Optional[str] = None + + def __init__( + self, + video_bytes: Optional[bytes] = None, + gcs_uri: Optional[str] = None, + ): + """Creates a `Video` object. + + Args: + video_bytes: Video file bytes. Video can be in AVI, FLV, MKV, MOV, + MP4, MPEG, MPG, WEBM, and WMV formats. + gcs_uri: Image URI in Google Cloud Storage. + """ + if bool(video_bytes) == bool(gcs_uri): + raise ValueError("Either video_bytes or gcs_uri must be provided.") + + self._video_bytes = video_bytes + self._gcs_uri = gcs_uri + + @staticmethod + def load_from_file(location: str) -> "Video": + """Loads video from local file or Google Cloud Storage. + + Args: + location: Local path or Google Cloud Storage uri from where to load + the video. + + Returns: + Loaded video as an `Video` object. + """ + parsed_url = urllib.parse.urlparse(location) + if ( + parsed_url.scheme == "https" + and parsed_url.netloc == "storage.googleapis.com" + ): + parsed_url = parsed_url._replace( + scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" + ) + location = urllib.parse.urlunparse(parsed_url) + + if parsed_url.scheme == "gs": + return Video(gcs_uri=location) + + # Load video from local path + video_bytes = pathlib.Path(location).read_bytes() + video = Video(video_bytes=video_bytes) + return video + + @property + def _blob(self) -> storage.Blob: + if self._gcs_uri is None: + raise AttributeError("_blob is only supported when gcs_uri is set.") + storage_client = storage.Client( + credentials=aiplatform_initializer.global_config.credentials + ) + blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) + # Needed to populate `blob.content_type` + blob.reload() + return blob + + @property + def _video_bytes(self) -> bytes: + if self._loaded_bytes is None: + self._loaded_bytes = self._blob.download_as_bytes() + return self._loaded_bytes + + @_video_bytes.setter + def _video_bytes(self, value: bytes): + self._loaded_bytes = value + + @property + def _mime_type(self) -> str: + """Returns the MIME type of the video.""" + if self._gcs_uri: + return self._blob.content_type + # Fall back to mp4 + return "video/mp4" + + def save(self, location: str): + """Saves video to a file. + + Args: + location: Local path where to save the video. + """ + pathlib.Path(location).write_bytes(self._video_bytes) + + def _as_base64_string(self) -> str: + """Encodes video using the base64 encoding. + + Returns: + Base64 encoding of the video as a string. + """ + # ! b64encode returns `bytes` object, not `str`. 
+ # We need to convert `bytes` to `str`, otherwise we get service error: + # "received initial metadata size exceeds limit" + return base64.b64encode(self._video_bytes).decode("ascii") + + +class VideoSegmentConfig: + """The specific video segments (in seconds) the embeddings are generated for.""" + + __module__ = "vertexai.vision_models" + + start_offset_sec: int + end_offset_sec: int + interval_sec: int + + def __init__( + self, + start_offset_sec: int = 0, + end_offset_sec: int = 120, + interval_sec: int = 16, + ): + """Creates a `VideoSegmentConfig` object. + + Args: + start_offset_sec: Start time offset (in seconds) to generate embeddings for. + end_offset_sec: End time offset (in seconds) to generate embeddings for. + interval_sec: Interval to divide video for generated embeddings. + """ + self.start_offset_sec = start_offset_sec + self.end_offset_sec = end_offset_sec + self.interval_sec = interval_sec + + +class VideoEmbedding: + """Embeddings generated from video with offset times.""" + + __module__ = "vertexai.vision_models" + + start_offset_sec: int + end_offset_sec: int + embedding: List[float] + + def __init__( + self, start_offset_sec: int, end_offset_sec: int, embedding: List[float] + ): + """Creates a `VideoEmbedding` object. + + Args: + start_offset_sec: Start time offset (in seconds) of generated embeddings. + end_offset_sec: End time offset (in seconds) of generated embeddings. + embedding: Generated embedding for interval. + """ + self.start_offset_sec = start_offset_sec + self.end_offset_sec = end_offset_sec + self.embedding = embedding + + +class ImageGenerationModel( + _model_garden_models._ModelGardenModel # pylint: disable=protected-access +): + """Generates images from text prompt. + + Examples:: + + model = ImageGenerationModel.from_pretrained("imagegeneration@002") + response = model.generate_images( + prompt="Astronaut riding a horse", + # Optional: + number_of_images=1, + seed=0, + ) + response[0].show() + response[0].save("image1.png") + """ + + __module__ = "vertexai.preview.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_generative_model_1.0.0.yaml" + + def _generate_images( + self, + prompt: str, + *, + negative_prompt: Optional[str] = None, + number_of_images: int = 1, + width: Optional[int] = None, + height: Optional[int] = None, + aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + guidance_scale: Optional[float] = None, + seed: Optional[int] = None, + base_image: Optional["Image"] = None, + mask: Optional["Image"] = None, + edit_mode: Optional[ + Literal[ + "inpainting-insert", + "inpainting-remove", + "outpainting", + "product-image", + ] + ] = None, + mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, + segmentation_classes: Optional[List[str]] = None, + mask_dilation: Optional[float] = None, + product_position: Optional[Literal["fixed", "reposition"]] = None, + output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, + compression_quality: Optional[float] = None, + language: Optional[str] = None, + output_gcs_uri: Optional[str] = None, + add_watermark: Optional[bool] = None, + safety_filter_level: Optional[ + Literal["block_most", "block_some", "block_few", "block_fewest"] + ] = None, + person_generation: Optional[ + Literal["dont_allow", "allow_adult", "allow_all"] + ] = None, + ) -> "ImageGenerationResponse": + """Generates images from text prompt. + + Args: + prompt: Text prompt for the image. 
+            negative_prompt: A description of what you want to omit in the generated
+              images.
+            number_of_images: Number of images to generate. Range: 1..8.
+            width: Width of the image. One of the sizes must be 256 or 1024.
+            height: Height of the image. One of the sizes must be 256 or 1024.
+            aspect_ratio: Aspect ratio for the image. Supported values are:
+              * 1:1 - Square image
+              * 9:16 - Portrait image
+              * 16:9 - Landscape image
+              * 4:3 - Landscape, desktop ratio.
+              * 3:4 - Portrait, desktop ratio
+            guidance_scale: Controls the strength of the prompt. Suggested values
+              are - * 0-9 (low strength) * 10-20 (medium strength) * 21+ (high
+              strength)
+            seed: Image generation random seed.
+            base_image: Base image to use for the image generation.
+            mask: Mask for the base image.
+            edit_mode: Describes the editing mode for the request. Supported values
+              are - * inpainting-insert: fills the mask area based on the text
+              prompt (requires mask and text) * inpainting-remove: removes the
+              object(s) in the mask area. (requires mask)
+              * outpainting: extend the image based on the mask area. (Requires
+              mask) * product-image: Changes the background for the predominant
+              product or subject in the image
+            mask_mode: Solicits generation of the mask (v/s providing mask as an
+              input). Supported values are:
+              * background: Automatically generates a mask for all regions except
+              the primary subject(s) of the image
+              * foreground: Automatically generates a mask for the primary
+              subject(s) of the image.
+              * semantic: Segment one or more of the segmentation classes using
+              class ID
+            segmentation_classes: List of class IDs for segmentation. Max of 5 IDs
+            mask_dilation: Defines the dilation percentage of the mask provided.
+              Float between 0 and 1. Defaults to 0.03
+            product_position: Defines whether the product should stay fixed or be
+              repositioned. Supported Values:
+              * fixed: Fixed position
+              * reposition: Can be moved (default)
+            output_mime_type: Which image format should the output be saved as.
+              Supported values: * image/png: Save as a PNG image * image/jpeg: Save
+              as a JPEG image
+            compression_quality: Level of compression if the output mime type is
+              selected to be image/jpeg. Float between 0 to 100
+            language: Language of the text prompt for the image. Default: None.
+              Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for
+              Japanese, `"ko"` for Korean, and `"auto"` for automatic language
+              detection.
+            output_gcs_uri: Google Cloud Storage uri to store the generated images.
+            add_watermark: Add a watermark to the generated image
+            safety_filter_level: Adds a filter level to Safety filtering. Supported
+              values are: * "block_most" : Strongest filtering level, most strict
+              blocking * "block_some" : Block some problematic prompts and responses
+              * "block_few" : Block fewer problematic prompts and responses *
+              "block_fewest" : Block very few problematic prompts and responses
+            person_generation: Allow generation of people by the model Supported
+              values are: * "dont_allow" : Block generation of people *
+              "allow_adult" : Generate adults, but not children * "allow_all" :
+              Generate adults and children
+
+        Returns:
+            An `ImageGenerationResponse` object.
+        """
+        # Note: Only a single prompt is supported by the service.
+        instance = {"prompt": prompt}
+        shared_generation_parameters = {
+            "prompt": prompt,
+            # b/295946075 The service stopped supporting image sizes.
+ # "width": width, + # "height": height, + "number_of_images_in_batch": number_of_images, + } + + if base_image: + if base_image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": base_image._gcs_uri # pylint: disable=protected-access + } + shared_generation_parameters[ + "base_image_uri" + ] = base_image._gcs_uri # pylint: disable=protected-access + else: + instance["image"] = { + "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access + } + shared_generation_parameters["base_image_hash"] = hashlib.sha1( + base_image._image_bytes # pylint: disable=protected-access + ).hexdigest() + + if mask: + if mask._gcs_uri: # pylint: disable=protected-access + instance["mask"] = { + "image": { + "gcsUri": mask._gcs_uri # pylint: disable=protected-access + }, + } + shared_generation_parameters[ + "mask_uri" + ] = mask._gcs_uri # pylint: disable=protected-access + else: + instance["mask"] = { + "image": { + "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access + }, + } + shared_generation_parameters["mask_hash"] = hashlib.sha1( + mask._image_bytes # pylint: disable=protected-access + ).hexdigest() + + parameters = {} + max_size = max(width or 0, height or 0) or None + if aspect_ratio is not None: + parameters["aspectRatio"] = aspect_ratio + elif max_size: + # Note: The size needs to be a string + parameters["sampleImageSize"] = str(max_size) + if height is not None and width is not None and height != width: + parameters["aspectRatio"] = f"{width}:{height}" + + parameters["sampleCount"] = number_of_images + if negative_prompt: + parameters["negativePrompt"] = negative_prompt + shared_generation_parameters["negative_prompt"] = negative_prompt + + if seed is not None: + # Note: String seed and numerical seed give different results + parameters["seed"] = seed + shared_generation_parameters["seed"] = seed + + if guidance_scale is not None: + parameters["guidanceScale"] = guidance_scale + shared_generation_parameters["guidance_scale"] = guidance_scale + + if language is not None: + parameters["language"] = language + shared_generation_parameters["language"] = language + + if output_gcs_uri is not None: + parameters["storageUri"] = output_gcs_uri + shared_generation_parameters["storage_uri"] = output_gcs_uri + + parameters["editConfig"] = {} + if edit_mode is not None: + parameters["editConfig"]["editMode"] = edit_mode + shared_generation_parameters["edit_mode"] = edit_mode + + if mask is None and edit_mode != "product-image": + parameters["editConfig"]["maskMode"] = {} + if mask_mode is not None: + parameters["editConfig"]["maskMode"]["maskType"] = mask_mode + shared_generation_parameters["mask_mode"] = mask_mode + + if segmentation_classes is not None: + parameters["editConfig"]["maskMode"]["classes"] = segmentation_classes + shared_generation_parameters["classes"] = segmentation_classes + + if mask_dilation is not None: + parameters["editConfig"]["maskDilation"] = mask_dilation + shared_generation_parameters["mask_dilation"] = mask_dilation + + if product_position is not None: + parameters["editConfig"]["productPosition"] = product_position + shared_generation_parameters["product_position"] = product_position + + parameters["outputOptions"] = {} + if output_mime_type is not None: + parameters["outputOptions"]["mimeType"] = output_mime_type + shared_generation_parameters["mime_type"] = output_mime_type + + if compression_quality is not None: + parameters["outputOptions"]["compressionQuality"] = compression_quality + 
shared_generation_parameters["compression_quality"] = compression_quality + + if add_watermark is not None: + parameters["addWatermark"] = add_watermark + shared_generation_parameters["add_watermark"] = add_watermark + + if safety_filter_level is not None: + parameters["safetySetting"] = safety_filter_level + shared_generation_parameters["safety_filter_level"] = safety_filter_level + + if person_generation is not None: + parameters["personGeneration"] = person_generation + shared_generation_parameters["person_generation"] = person_generation + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + + generated_images: List["GeneratedImage"] = [] + for idx, prediction in enumerate(response.predictions): + generation_parameters = dict(shared_generation_parameters) + generation_parameters["index_of_image_in_batch"] = idx + encoded_bytes = prediction.get("bytesBase64Encoded") + generated_image = GeneratedImage( + image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, + generation_parameters=generation_parameters, + gcs_uri=prediction.get("gcsUri"), + ) + generated_images.append(generated_image) + + return ImageGenerationResponse(images=generated_images) + + def generate_images( + self, + prompt: str, + *, + negative_prompt: Optional[str] = None, + number_of_images: int = 1, + aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + guidance_scale: Optional[float] = None, + language: Optional[str] = None, + seed: Optional[int] = None, + output_gcs_uri: Optional[str] = None, + add_watermark: Optional[bool] = True, + safety_filter_level: Optional[ + Literal["block_most", "block_some", "block_few", "block_fewest"] + ] = None, + person_generation: Optional[ + Literal["dont_allow", "allow_adult", "allow_all"] + ] = None, + ) -> "ImageGenerationResponse": + """Generates images from text prompt. + + Args: + prompt: Text prompt for the image. + negative_prompt: A description of what you want to omit in the generated + images. + number_of_images: Number of images to generate. Range: 1..8. + aspect_ratio: Changes the aspect ratio of the generated image Supported + values are: + * "1:1" : 1:1 aspect ratio + * "9:16" : 9:16 aspect ratio + * "16:9" : 16:9 aspect ratio + * "4:3" : 4:3 aspect ratio + * "3:4" : 3:4 aspect_ratio + guidance_scale: Controls the strength of the prompt. Suggested values are: + * 0-9 (low strength) + * 10-20 (medium strength) + * 21+ (high strength) + language: Language of the text prompt for the image. Default: None. + Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` + for Japanese, `"ko"` for Korean, and `"auto"` for automatic language + detection. + seed: Image generation random seed. + output_gcs_uri: Google Cloud Storage uri to store the generated images. + add_watermark: Add a watermark to the generated image + safety_filter_level: Adds a filter level to Safety filtering. Supported + values are: + * "block_most" : Strongest filtering level, most strict + blocking + * "block_some" : Block some problematic prompts and responses + * "block_few" : Block fewer problematic prompts and responses + * "block_fewest" : Block very few problematic prompts and responses + person_generation: Allow generation of people by the model Supported + values are: + * "dont_allow" : Block generation of people + * "allow_adult" : Generate adults, but not children + * "allow_all" : Generate adults and children + Returns: + An `ImageGenerationResponse` object. 
+        """
+        return self._generate_images(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            number_of_images=number_of_images,
+            aspect_ratio=aspect_ratio,
+            guidance_scale=guidance_scale,
+            language=language,
+            seed=seed,
+            output_gcs_uri=output_gcs_uri,
+            add_watermark=add_watermark,
+            safety_filter_level=safety_filter_level,
+            person_generation=person_generation,
+        )
+
+    def edit_image(
+        self,
+        *,
+        prompt: str,
+        base_image: "Image",
+        mask: Optional["Image"] = None,
+        negative_prompt: Optional[str] = None,
+        number_of_images: int = 1,
+        guidance_scale: Optional[float] = None,
+        edit_mode: Optional[
+            Literal[
+                "inpainting-insert", "inpainting-remove", "outpainting", "product-image"
+            ]
+        ] = None,
+        mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None,
+        segmentation_classes: Optional[List[str]] = None,
+        mask_dilation: Optional[float] = None,
+        product_position: Optional[Literal["fixed", "reposition"]] = None,
+        output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None,
+        compression_quality: Optional[float] = None,
+        language: Optional[str] = None,
+        seed: Optional[int] = None,
+        output_gcs_uri: Optional[str] = None,
+        safety_filter_level: Optional[
+            Literal["block_most", "block_some", "block_few", "block_fewest"]
+        ] = None,
+        person_generation: Optional[
+            Literal["dont_allow", "allow_adult", "allow_all"]
+        ] = None,
+    ) -> "ImageGenerationResponse":
+        """Edits an existing image based on text prompt.
+
+        Args:
+            prompt: Text prompt for the image.
+            base_image: Base image from which to generate the new image.
+            mask: Mask for the base image.
+            negative_prompt: A description of what you want to omit in
+              the generated images.
+            number_of_images: Number of images to generate. Range: 1..8.
+            guidance_scale: Controls the strength of the prompt.
+              Suggested values are:
+              * 0-9 (low strength)
+              * 10-20 (medium strength)
+              * 21+ (high strength)
+            edit_mode: Describes the editing mode for the request. Supported values are:
+              * inpainting-insert: fills the mask area based on the text prompt
+              (requires mask and text)
+              * inpainting-remove: removes the object(s) in the mask area.
+              (requires mask)
+              * outpainting: extend the image based on the mask area.
+              (Requires mask)
+              * product-image: Changes the background for the predominant product
+              or subject in the image
+            mask_mode: Solicits generation of the mask (v/s providing mask as an
+              input). Supported values are:
+              * background: Automatically generates a mask for all regions except
+              the primary subject(s) of the image
+              * foreground: Automatically generates a mask for the primary
+              subject(s) of the image.
+              * semantic: Segment one or more of the segmentation classes using
+              class ID
+            segmentation_classes: List of class IDs for segmentation. Max of 5 IDs
+            mask_dilation: Defines the dilation percentage of the mask provided.
+              Float between 0 and 1. Defaults to 0.03
+            product_position: Defines whether the product should stay fixed or be
+              repositioned. Supported Values:
+              * fixed: Fixed position
+              * reposition: Can be moved (default)
+            output_mime_type: Which image format should the output be saved as.
+              Supported values:
+              * image/png: Save as a PNG image
+              * image/jpeg: Save as a JPEG image
+            compression_quality: Level of compression if the output mime type is
+              selected to be image/jpeg. Float between 0 to 100
+            language: Language of the text prompt for the image. Default: None.
+ Supported values are `"en"` for English, `"hi"` for Hindi, + `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for + automatic language detection. + seed: Image generation random seed. + output_gcs_uri: Google Cloud Storage uri to store the edited images. + safety_filter_level: Adds a filter level to Safety filtering. Supported + values are: + * "block_most" : Strongest filtering level, most strict + blocking + * "block_some" : Block some problematic prompts and responses + * "block_few" : Block fewer problematic prompts and responses + * "block_fewest" : Block very few problematic prompts and responses + person_generation: Allow generation of people by the model Supported + values are: + * "dont_allow" : Block generation of people + * "allow_adult" : Generate adults, but not children + * "allow_all" : Generate adults and children + + Returns: + An `ImageGenerationResponse` object. + """ + return self._generate_images( + prompt=prompt, + negative_prompt=negative_prompt, + number_of_images=number_of_images, + guidance_scale=guidance_scale, + seed=seed, + base_image=base_image, + mask=mask, + edit_mode=edit_mode, + mask_mode=mask_mode, + segmentation_classes=segmentation_classes, + mask_dilation=mask_dilation, + product_position=product_position, + output_mime_type=output_mime_type, + compression_quality=compression_quality, + language=language, + output_gcs_uri=output_gcs_uri, + add_watermark=False, # Not supported for editing yet + safety_filter_level=safety_filter_level, + person_generation=person_generation, + ) + + def upscale_image( + self, + image: Union["Image", "GeneratedImage"], + new_size: Optional[int] = 2048, + upscale_factor: Optional[Literal["x2", "x4"]] = None, + output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", + output_compression_quality: Optional[int] = None, + output_gcs_uri: Optional[str] = None, + ) -> "Image": + """Upscales an image. + + This supports upscaling images generated through the `generate_images()` + method, or upscaling a new image. + + Examples:: + + # Upscale a generated image + model = ImageGenerationModel.from_pretrained("imagegeneration@002") + response = model.generate_images( + prompt="Astronaut riding a horse", + ) + model.upscale_image(image=response[0]) + + # Upscale a new 1024x1024 image + my_image = Image.load_from_file("my-image.png") + model.upscale_image(image=my_image) + + # Upscale a new arbitrary sized image using a x2 or x4 upscaling factor + my_image = Image.load_from_file("my-image.png") + model.upscale_image(image=my_image, upscale_factor="x2") + + # Upscale an image and get the result in JPEG format + my_image = Image.load_from_file("my-image.png") + model.upscale_image(image=my_image, output_mime_type="image/jpeg", + output_compression_quality=90) + + Args: + image (Union[GeneratedImage, Image]): Required. The generated image + to upscale. + new_size (int): The size of the biggest dimension of the upscaled + image. + Only 2048 and 4096 are currently supported. Results in a + 2048x2048 or 4096x4096 image. Defaults to 2048 if not provided. + upscale_factor: The upscaling factor. Supported values are "x2" and + "x4". Defaults to None. + output_mime_type: The mime type of the output image. Supported values + are "image/png" and "image/jpeg". Defaults to "image/png". + output_compression_quality: The compression quality of the output + image + as an int (0-100). Only applicable if the output mime type is + "image/jpeg". Defaults to None. 
+            output_gcs_uri: Google Cloud Storage uri to store the upscaled
+              images.
+
+        Returns:
+            An `Image` object.
+        """
+        target_image_size = new_size if new_size else None
+        longest_dim = max(image._size[0], image._size[1])
+
+        if not new_size and not upscale_factor:
+            raise ValueError("Either new_size or upscale_factor must be provided.")
+
+        if not upscale_factor:
+            x2_factor = 2.0
+            x4_factor = 4.0
+            epsilon = 0.1
+            is_upscaling_x2_request = abs(new_size / longest_dim - x2_factor) < epsilon
+            is_upscaling_x4_request = abs(new_size / longest_dim - x4_factor) < epsilon
+
+            if not is_upscaling_x2_request and not is_upscaling_x4_request:
+                raise ValueError(
+                    "Only x2 and x4 upscaling are currently supported. Requested"
+                    f" upscaling factor: {new_size / longest_dim}"
+                )
+        else:
+            if upscale_factor == "x2":
+                target_image_size = longest_dim * 2
+            else:
+                target_image_size = longest_dim * 4
+        if new_size not in _SUPPORTED_UPSCALING_SIZES:
+            raise ValueError(
+                "Only the following square upscaling sizes are currently supported:"
+                f" {_SUPPORTED_UPSCALING_SIZES}."
+            )
+
+        instance = {"prompt": ""}
+
+        if image._gcs_uri:  # pylint: disable=protected-access
+            instance["image"] = {
+                "gcsUri": image._gcs_uri  # pylint: disable=protected-access
+            }
+        else:
+            instance["image"] = {
+                "bytesBase64Encoded": image._as_base64_string()  # pylint: disable=protected-access
+            }
+
+        parameters = {
+            "sampleCount": 1,
+            "mode": "upscale",
+        }
+
+        if upscale_factor:
+            parameters["upscaleConfig"] = {"upscaleFactor": upscale_factor}
+
+        else:
+            parameters["sampleImageSize"] = str(new_size)
+
+        if output_gcs_uri is not None:
+            parameters["storageUri"] = output_gcs_uri
+
+        parameters["outputOptions"] = {"mimeType": output_mime_type}
+        if output_mime_type == "image/jpeg" and output_compression_quality is not None:
+            parameters["outputOptions"][
+                "compressionQuality"
+            ] = output_compression_quality
+
+        response = self._endpoint.predict(
+            instances=[instance],
+            parameters=parameters,
+        )
+
+        upscaled_image = response.predictions[0]
+
+        if isinstance(image, GeneratedImage):
+            generation_parameters = image.generation_parameters
+
+        else:
+            generation_parameters = {}
+
+        generation_parameters["upscaled_image_size"] = target_image_size
+
+        encoded_bytes = upscaled_image.get("bytesBase64Encoded")
+        return GeneratedImage(
+            image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None,
+            generation_parameters=generation_parameters,
+            gcs_uri=upscaled_image.get("gcsUri"),
+        )
+
+
+@dataclasses.dataclass
+class ImageGenerationResponse:
+    """Image generation response.
+
+    Attributes:
+        images: The list of generated images.
+    """
+
+    __module__ = "vertexai.preview.vision_models"
+
+    images: List["GeneratedImage"]
+
+    def __iter__(self) -> typing.Iterator["GeneratedImage"]:
+        """Iterates through the generated images."""
+        yield from self.images
+
+    def __getitem__(self, idx: int) -> "GeneratedImage":
+        """Gets the generated image by index."""
+        return self.images[idx]
+
+
+_EXIF_USER_COMMENT_TAG_IDX = 0x9286
+_IMAGE_GENERATION_PARAMETERS_EXIF_KEY = (
+    "google.cloud.vertexai.image_generation.image_generation_parameters"
+)
+
+
+class GeneratedImage(Image):
+    """Generated image."""
+
+    __module__ = "vertexai.preview.vision_models"
+
+    def __init__(
+        self,
+        image_bytes: Optional[bytes],
+        generation_parameters: Dict[str, Any],
+        gcs_uri: Optional[str] = None,
+    ):
+        """Creates a `GeneratedImage` object.
+
+        Args:
+            image_bytes: Image file bytes. Image can be in PNG or JPEG format.
+ generation_parameters: Image generation parameter values. + gcs_uri: Image file Google Cloud Storage uri. + """ + super().__init__(image_bytes=image_bytes, gcs_uri=gcs_uri) + self._generation_parameters = generation_parameters + + @property + def generation_parameters(self): + """Image generation parameters as a dictionary.""" + return self._generation_parameters + + @staticmethod + def load_from_file(location: str) -> "GeneratedImage": + """Loads image from file. + + Args: + location: Local path from where to load the image. + + Returns: + Loaded image as a `GeneratedImage` object. + """ + base_image = Image.load_from_file(location=location) + exif = base_image._pil_image.getexif() # pylint: disable=protected-access + exif_comment_dict = json.loads(exif[_EXIF_USER_COMMENT_TAG_IDX]) + generation_parameters = exif_comment_dict[_IMAGE_GENERATION_PARAMETERS_EXIF_KEY] + return GeneratedImage( + image_bytes=base_image._image_bytes, # pylint: disable=protected-access + generation_parameters=generation_parameters, + gcs_uri=base_image._gcs_uri, # pylint: disable=protected-access + ) + + def save(self, location: str, include_generation_parameters: bool = True): + """Saves image to a file. + + Args: + location: Local path where to save the image. + include_generation_parameters: Whether to include the image + generation parameters in the image's EXIF metadata. + """ + if include_generation_parameters: + if not self._generation_parameters: + raise ValueError("Image does not have generation parameters.") + if not PIL_Image: + raise ValueError( + "The PIL module is required for saving generation parameters." + ) + + exif = self._pil_image.getexif() + exif[_EXIF_USER_COMMENT_TAG_IDX] = json.dumps( + {_IMAGE_GENERATION_PARAMETERS_EXIF_KEY: self._generation_parameters} + ) + self._pil_image.save(location, exif=exif) + else: + super().save(location=location) + + +class ImageCaptioningModel( + _model_garden_models._ModelGardenModel # pylint: disable=protected-access +): + """Generates captions from image. + + Examples:: + + model = ImageCaptioningModel.from_pretrained("imagetext@001") + image = Image.load_from_file("image.png") + captions = model.get_captions( + image=image, + # Optional: + number_of_results=1, + language="en", + ) + """ + + __module__ = "vertexai.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" + + def get_captions( + self, + image: Image, + *, + number_of_results: int = 1, + language: str = "en", + output_gcs_uri: Optional[str] = None, + ) -> List[str]: + """Generates captions for a given image. + + Args: + image: The image to get captions for. Size limit: 10 MB. + number_of_results: Number of captions to produce. Range: 1-3. + language: Language to use for captions. + Supported languages: "en", "fr", "de", "it", "es" + output_gcs_uri: Google Cloud Storage uri to store the captioned images. + + Returns: + A list of image caption strings. 
+ """ + instance = {} + + if image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": image._gcs_uri # pylint: disable=protected-access + } + else: + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } + parameters = { + "sampleCount": number_of_results, + "language": language, + } + if output_gcs_uri is not None: + parameters["storageUri"] = output_gcs_uri + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + return response.predictions + + +class ImageQnAModel( + _model_garden_models._ModelGardenModel # pylint: disable=protected-access +): + """Answers questions about an image. + + Examples:: + + model = ImageQnAModel.from_pretrained("imagetext@001") + image = Image.load_from_file("image.png") + answers = model.ask_question( + image=image, + question="What color is the car in this image?", + # Optional: + number_of_results=1, + ) + """ + + __module__ = "vertexai.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" + + def ask_question( + self, + image: Image, + question: str, + *, + number_of_results: int = 1, + ) -> List[str]: + """Answers questions about an image. + + Args: + image: The image to get captions for. Size limit: 10 MB. + question: Question to ask about the image. + number_of_results: Number of captions to produce. Range: 1-3. + + Returns: + A list of answers. + """ + instance = {"prompt": question} + + if image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": image._gcs_uri # pylint: disable=protected-access + } + else: + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } + parameters = { + "sampleCount": number_of_results, + } + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + return response.predictions + + +class MultiModalEmbeddingModel(_model_garden_models._ModelGardenModel): + """Generates embedding vectors from images and videos. + + Examples:: + + model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001") + image = Image.load_from_file("image.png") + video = Video.load_from_file("video.mp4") + + embeddings = model.get_embeddings( + image=image, + video=video, + contextual_text="Hello world", + ) + image_embedding = embeddings.image_embedding + video_embeddings = embeddings.video_embeddings + text_embedding = embeddings.text_embedding + """ + + __module__ = "vertexai.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_embedding_model_1.0.0.yaml" + + def get_embeddings( + self, + image: Optional[Image] = None, + video: Optional[Video] = None, + contextual_text: Optional[str] = None, + dimension: Optional[int] = None, + video_segment_config: Optional[VideoSegmentConfig] = None, + ) -> "MultiModalEmbeddingResponse": + """Gets embedding vectors from the provided image. + + Args: + image (Image): Optional. The image to generate embeddings for. One of + `image`, `video`, or `contextual_text` is required. + video (Video): Optional. The video to generate embeddings for. One of + `image`, `video` or `contextual_text` is required. + contextual_text (str): Optional. Contextual text for your input image or video. + If provided, the model will also generate an embedding vector for the + provided contextual text. 
The returned image and text embedding + vectors are in the same semantic space with the same dimensionality, + and the vectors can be used interchangeably for use cases like + searching image by text or searching text by image. One of `image`, `video` or + `contextual_text` is required. + dimension (int): Optional. The number of embedding dimensions. Lower + values offer decreased latency when using these embeddings for + subsequent tasks, while higher values offer better accuracy. + Available values: `128`, `256`, `512`, and `1408` (default). + video_segment_config (VideoSegmentConfig): Optional. The specific + video segments (in seconds) the embeddings are generated for. + + Returns: + MultiModalEmbeddingResponse: + The image and text embedding vectors. + """ + + if not image and not video and not contextual_text: + raise ValueError( + "One of `image`, `video`, or `contextual_text` is required." + ) + + instance = {} + + if image: + if image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": image._gcs_uri # pylint: disable=protected-access + } + else: + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } + + if video: + if video._gcs_uri: # pylint: disable=protected-access + instance["video"] = { + "gcsUri": video._gcs_uri # pylint: disable=protected-access + } + else: + instance["video"] = { + "bytesBase64Encoded": video._as_base64_string() # pylint: disable=protected-access + } # pylint: disable=protected-access + + if video_segment_config: + instance["video"]["videoSegmentConfig"] = { + "startOffsetSec": video_segment_config.start_offset_sec, + "endOffsetSec": video_segment_config.end_offset_sec, + "intervalSec": video_segment_config.interval_sec, + } + + if contextual_text: + instance["text"] = contextual_text + + parameters = {} + if dimension: + parameters["dimension"] = dimension + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + image_embedding = response.predictions[0].get("imageEmbedding") + video_embeddings = [] + for video_embedding in response.predictions[0].get("videoEmbeddings", []): + video_embeddings.append( + VideoEmbedding( + embedding=video_embedding["embedding"], + start_offset_sec=video_embedding["startOffsetSec"], + end_offset_sec=video_embedding["endOffsetSec"], + ) + ) + text_embedding = ( + response.predictions[0].get("textEmbedding") + if "textEmbedding" in response.predictions[0] + else None + ) + return MultiModalEmbeddingResponse( + image_embedding=image_embedding, + video_embeddings=video_embeddings, + _prediction_response=response, + text_embedding=text_embedding, + ) + + +@dataclasses.dataclass +class MultiModalEmbeddingResponse: + """The multimodal embedding response. + + Attributes: + image_embedding (List[float]): + Optional. The embedding vector generated from your image. + video_embeddings (List[VideoEmbedding]): + Optional. The embedding vectors generated from your video. + text_embedding (List[float]): + Optional. The embedding vector generated from the contextual text provided for your image or video. + """ + + __module__ = "vertexai.vision_models" + + _prediction_response: Any + image_embedding: Optional[List[float]] = None + video_embeddings: Optional[List[VideoEmbedding]] = None + text_embedding: Optional[List[float]] = None + + +class ImageTextModel(ImageCaptioningModel, ImageQnAModel): + """Generates text from images. 
+ + Examples:: + + model = ImageTextModel.from_pretrained("imagetext@001") + image = Image.load_from_file("image.png") + + captions = model.get_captions( + image=image, + # Optional: + number_of_results=1, + language="en", + ) + + answers = model.ask_question( + image=image, + question="What color is the car in this image?", + # Optional: + number_of_results=1, + ) + """ + + __module__ = "vertexai.vision_models" + + # NOTE: Using this ImageTextModel class is recommended over using ImageQnAModel or ImageCaptioningModel, + # since SDK Model Garden classes should follow the design pattern of exactly 1 SDK class to 1 Model Garden schema URI + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" + + +@dataclasses.dataclass +class WatermarkVerificationResponse: + + __module__ = "vertexai.preview.vision_models" + + _prediction_response: Any + watermark_verification_result: Optional[str] = None + + +class WatermarkVerificationModel(_model_garden_models._ModelGardenModel): + """Verifies if an image has a watermark.""" + + __module__ = "vertexai.preview.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/watermark_verification_model_1.0.0.yaml" + + def verify_image(self, image: Image) -> WatermarkVerificationResponse: + """Verifies the watermark of an image. + + Args: + image: The image to verify. + + Returns: + A WatermarkVerificationResponse, containing the confidence level of + the image being watermarked. + """ + if not image: + raise ValueError("Image is required.") + + instance = {} + + if image._gcs_uri: + instance["image"] = {"gcsUri": image._gcs_uri} + else: + instance["image"] = {"bytesBase64Encoded": image._as_base64_string()} + + parameters = {} + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + + verification_likelihood = response.predictions[0].get("decision") + return WatermarkVerificationResponse( + _prediction_response=response, + watermark_verification_result=verification_likelihood, + ) From b6467baad1acf6906a1aefafb7dc0d1eb760e6ef Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:16:08 -0700 Subject: [PATCH 04/26] clear out extra classes Change-Id: I64dec6a71f3f04fa834f2174e4a6b2d4740f4e90 --- google/generativeai/vision_models/__init__.py | 20 +- .../vision_models/_vision_models.py | 525 +----------------- 2 files changed, 2 insertions(+), 543 deletions(-) diff --git a/google/generativeai/vision_models/__init__.py b/google/generativeai/vision_models/__init__.py index 1834b5ceb..f519c9928 100644 --- a/google/generativeai/vision_models/__init__.py +++ b/google/generativeai/vision_models/__init__.py @@ -14,32 +14,14 @@ # """Classes for working with vision models.""" -from vertexai.vision_models._vision_models import ( +from google.generativeai.vision_models._vision_models import ( GeneratedImage, - Image, - ImageCaptioningModel, ImageGenerationModel, ImageGenerationResponse, - ImageQnAModel, - ImageTextModel, - MultiModalEmbeddingModel, - MultiModalEmbeddingResponse, - Video, - VideoEmbedding, - VideoSegmentConfig, ) __all__ = [ "GeneratedImage", - "Image", - "ImageCaptioningModel", "ImageGenerationModel", "ImageGenerationResponse", - "ImageQnAModel", - "ImageTextModel", - "MultiModalEmbeddingModel", - "MultiModalEmbeddingResponse", - "Video", - "VideoEmbedding", - "VideoSegmentConfig", ] diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 
a80f0b2dc..cc0cd5ca9 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -25,11 +25,6 @@ from typing import Any, Dict, List, Literal, Optional, Union import urllib -from google.cloud import storage - -from google.cloud.aiplatform import initializer as aiplatform_initializer -from vertexai._model_garden import _model_garden_models - # pylint: disable=g-import-not-at-top try: from IPython import display as IPython_display @@ -100,17 +95,6 @@ def load_from_file(location: str) -> "Image": image = Image(image_bytes=image_bytes) return image - @property - def _blob(self) -> storage.Blob: - if self._gcs_uri is None: - raise AttributeError("_blob is only supported when gcs_uri is set.") - storage_client = storage.Client( - credentials=aiplatform_initializer.global_config.credentials - ) - blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) - # Needed to populate `blob.content_type` - blob.reload() - return blob @property def _image_bytes(self) -> bytes: @@ -174,165 +158,7 @@ def _as_base64_string(self) -> str: return base64.b64encode(self._image_bytes).decode("ascii") -class Video: - """Video.""" - - __module__ = "vertexai.vision_models" - - _loaded_bytes: Optional[bytes] = None - _gcs_uri: Optional[str] = None - - def __init__( - self, - video_bytes: Optional[bytes] = None, - gcs_uri: Optional[str] = None, - ): - """Creates a `Video` object. - - Args: - video_bytes: Video file bytes. Video can be in AVI, FLV, MKV, MOV, - MP4, MPEG, MPG, WEBM, and WMV formats. - gcs_uri: Image URI in Google Cloud Storage. - """ - if bool(video_bytes) == bool(gcs_uri): - raise ValueError("Either video_bytes or gcs_uri must be provided.") - - self._video_bytes = video_bytes - self._gcs_uri = gcs_uri - - @staticmethod - def load_from_file(location: str) -> "Video": - """Loads video from local file or Google Cloud Storage. - - Args: - location: Local path or Google Cloud Storage uri from where to load - the video. - - Returns: - Loaded video as an `Video` object. - """ - parsed_url = urllib.parse.urlparse(location) - if ( - parsed_url.scheme == "https" - and parsed_url.netloc == "storage.googleapis.com" - ): - parsed_url = parsed_url._replace( - scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" - ) - location = urllib.parse.urlunparse(parsed_url) - - if parsed_url.scheme == "gs": - return Video(gcs_uri=location) - - # Load video from local path - video_bytes = pathlib.Path(location).read_bytes() - video = Video(video_bytes=video_bytes) - return video - - @property - def _blob(self) -> storage.Blob: - if self._gcs_uri is None: - raise AttributeError("_blob is only supported when gcs_uri is set.") - storage_client = storage.Client( - credentials=aiplatform_initializer.global_config.credentials - ) - blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) - # Needed to populate `blob.content_type` - blob.reload() - return blob - - @property - def _video_bytes(self) -> bytes: - if self._loaded_bytes is None: - self._loaded_bytes = self._blob.download_as_bytes() - return self._loaded_bytes - - @_video_bytes.setter - def _video_bytes(self, value: bytes): - self._loaded_bytes = value - - @property - def _mime_type(self) -> str: - """Returns the MIME type of the video.""" - if self._gcs_uri: - return self._blob.content_type - # Fall back to mp4 - return "video/mp4" - - def save(self, location: str): - """Saves video to a file. - - Args: - location: Local path where to save the video. 
- """ - pathlib.Path(location).write_bytes(self._video_bytes) - - def _as_base64_string(self) -> str: - """Encodes video using the base64 encoding. - - Returns: - Base64 encoding of the video as a string. - """ - # ! b64encode returns `bytes` object, not `str`. - # We need to convert `bytes` to `str`, otherwise we get service error: - # "received initial metadata size exceeds limit" - return base64.b64encode(self._video_bytes).decode("ascii") - - -class VideoSegmentConfig: - """The specific video segments (in seconds) the embeddings are generated for.""" - - __module__ = "vertexai.vision_models" - - start_offset_sec: int - end_offset_sec: int - interval_sec: int - - def __init__( - self, - start_offset_sec: int = 0, - end_offset_sec: int = 120, - interval_sec: int = 16, - ): - """Creates a `VideoSegmentConfig` object. - - Args: - start_offset_sec: Start time offset (in seconds) to generate embeddings for. - end_offset_sec: End time offset (in seconds) to generate embeddings for. - interval_sec: Interval to divide video for generated embeddings. - """ - self.start_offset_sec = start_offset_sec - self.end_offset_sec = end_offset_sec - self.interval_sec = interval_sec - - -class VideoEmbedding: - """Embeddings generated from video with offset times.""" - - __module__ = "vertexai.vision_models" - - start_offset_sec: int - end_offset_sec: int - embedding: List[float] - - def __init__( - self, start_offset_sec: int, end_offset_sec: int, embedding: List[float] - ): - """Creates a `VideoEmbedding` object. - - Args: - start_offset_sec: Start time offset (in seconds) of generated embeddings. - end_offset_sec: End time offset (in seconds) of generated embeddings. - embedding: Generated embedding for interval. - """ - self.start_offset_sec = start_offset_sec - self.end_offset_sec = end_offset_sec - self.embedding = embedding - - -class ImageGenerationModel( - _model_garden_models._ModelGardenModel # pylint: disable=protected-access -): +class ImageGenerationModel: """Generates images from text prompt. Examples:: @@ -1025,352 +851,3 @@ def save(self, location: str, include_generation_parameters: bool = True): else: super().save(location=location) - -class ImageCaptioningModel( - _model_garden_models._ModelGardenModel # pylint: disable=protected-access -): - """Generates captions from image. - - Examples:: - - model = ImageCaptioningModel.from_pretrained("imagetext@001") - image = Image.load_from_file("image.png") - captions = model.get_captions( - image=image, - # Optional: - number_of_results=1, - language="en", - ) - """ - - __module__ = "vertexai.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" - - def get_captions( - self, - image: Image, - *, - number_of_results: int = 1, - language: str = "en", - output_gcs_uri: Optional[str] = None, - ) -> List[str]: - """Generates captions for a given image. - - Args: - image: The image to get captions for. Size limit: 10 MB. - number_of_results: Number of captions to produce. Range: 1-3. - language: Language to use for captions. - Supported languages: "en", "fr", "de", "it", "es" - output_gcs_uri: Google Cloud Storage uri to store the captioned images. - - Returns: - A list of image caption strings. 
- """ - instance = {} - - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - parameters = { - "sampleCount": number_of_results, - "language": language, - } - if output_gcs_uri is not None: - parameters["storageUri"] = output_gcs_uri - - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - return response.predictions - - -class ImageQnAModel( - _model_garden_models._ModelGardenModel # pylint: disable=protected-access -): - """Answers questions about an image. - - Examples:: - - model = ImageQnAModel.from_pretrained("imagetext@001") - image = Image.load_from_file("image.png") - answers = model.ask_question( - image=image, - question="What color is the car in this image?", - # Optional: - number_of_results=1, - ) - """ - - __module__ = "vertexai.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" - - def ask_question( - self, - image: Image, - question: str, - *, - number_of_results: int = 1, - ) -> List[str]: - """Answers questions about an image. - - Args: - image: The image to get captions for. Size limit: 10 MB. - question: Question to ask about the image. - number_of_results: Number of captions to produce. Range: 1-3. - - Returns: - A list of answers. - """ - instance = {"prompt": question} - - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - parameters = { - "sampleCount": number_of_results, - } - - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - return response.predictions - - -class MultiModalEmbeddingModel(_model_garden_models._ModelGardenModel): - """Generates embedding vectors from images and videos. - - Examples:: - - model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001") - image = Image.load_from_file("image.png") - video = Video.load_from_file("video.mp4") - - embeddings = model.get_embeddings( - image=image, - video=video, - contextual_text="Hello world", - ) - image_embedding = embeddings.image_embedding - video_embeddings = embeddings.video_embeddings - text_embedding = embeddings.text_embedding - """ - - __module__ = "vertexai.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_embedding_model_1.0.0.yaml" - - def get_embeddings( - self, - image: Optional[Image] = None, - video: Optional[Video] = None, - contextual_text: Optional[str] = None, - dimension: Optional[int] = None, - video_segment_config: Optional[VideoSegmentConfig] = None, - ) -> "MultiModalEmbeddingResponse": - """Gets embedding vectors from the provided image. - - Args: - image (Image): Optional. The image to generate embeddings for. One of - `image`, `video`, or `contextual_text` is required. - video (Video): Optional. The video to generate embeddings for. One of - `image`, `video` or `contextual_text` is required. - contextual_text (str): Optional. Contextual text for your input image or video. - If provided, the model will also generate an embedding vector for the - provided contextual text. 
The returned image and text embedding - vectors are in the same semantic space with the same dimensionality, - and the vectors can be used interchangeably for use cases like - searching image by text or searching text by image. One of `image`, `video` or - `contextual_text` is required. - dimension (int): Optional. The number of embedding dimensions. Lower - values offer decreased latency when using these embeddings for - subsequent tasks, while higher values offer better accuracy. - Available values: `128`, `256`, `512`, and `1408` (default). - video_segment_config (VideoSegmentConfig): Optional. The specific - video segments (in seconds) the embeddings are generated for. - - Returns: - MultiModalEmbeddingResponse: - The image and text embedding vectors. - """ - - if not image and not video and not contextual_text: - raise ValueError( - "One of `image`, `video`, or `contextual_text` is required." - ) - - instance = {} - - if image: - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - - if video: - if video._gcs_uri: # pylint: disable=protected-access - instance["video"] = { - "gcsUri": video._gcs_uri # pylint: disable=protected-access - } - else: - instance["video"] = { - "bytesBase64Encoded": video._as_base64_string() # pylint: disable=protected-access - } # pylint: disable=protected-access - - if video_segment_config: - instance["video"]["videoSegmentConfig"] = { - "startOffsetSec": video_segment_config.start_offset_sec, - "endOffsetSec": video_segment_config.end_offset_sec, - "intervalSec": video_segment_config.interval_sec, - } - - if contextual_text: - instance["text"] = contextual_text - - parameters = {} - if dimension: - parameters["dimension"] = dimension - - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - image_embedding = response.predictions[0].get("imageEmbedding") - video_embeddings = [] - for video_embedding in response.predictions[0].get("videoEmbeddings", []): - video_embeddings.append( - VideoEmbedding( - embedding=video_embedding["embedding"], - start_offset_sec=video_embedding["startOffsetSec"], - end_offset_sec=video_embedding["endOffsetSec"], - ) - ) - text_embedding = ( - response.predictions[0].get("textEmbedding") - if "textEmbedding" in response.predictions[0] - else None - ) - return MultiModalEmbeddingResponse( - image_embedding=image_embedding, - video_embeddings=video_embeddings, - _prediction_response=response, - text_embedding=text_embedding, - ) - - -@dataclasses.dataclass -class MultiModalEmbeddingResponse: - """The multimodal embedding response. - - Attributes: - image_embedding (List[float]): - Optional. The embedding vector generated from your image. - video_embeddings (List[VideoEmbedding]): - Optional. The embedding vectors generated from your video. - text_embedding (List[float]): - Optional. The embedding vector generated from the contextual text provided for your image or video. - """ - - __module__ = "vertexai.vision_models" - - _prediction_response: Any - image_embedding: Optional[List[float]] = None - video_embeddings: Optional[List[VideoEmbedding]] = None - text_embedding: Optional[List[float]] = None - - -class ImageTextModel(ImageCaptioningModel, ImageQnAModel): - """Generates text from images. 
- - Examples:: - - model = ImageTextModel.from_pretrained("imagetext@001") - image = Image.load_from_file("image.png") - - captions = model.get_captions( - image=image, - # Optional: - number_of_results=1, - language="en", - ) - - answers = model.ask_question( - image=image, - question="What color is the car in this image?", - # Optional: - number_of_results=1, - ) - """ - - __module__ = "vertexai.vision_models" - - # NOTE: Using this ImageTextModel class is recommended over using ImageQnAModel or ImageCaptioningModel, - # since SDK Model Garden classes should follow the design pattern of exactly 1 SDK class to 1 Model Garden schema URI - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" - - -@dataclasses.dataclass -class WatermarkVerificationResponse: - - __module__ = "vertexai.preview.vision_models" - - _prediction_response: Any - watermark_verification_result: Optional[str] = None - - -class WatermarkVerificationModel(_model_garden_models._ModelGardenModel): - """Verifies if an image has a watermark.""" - - __module__ = "vertexai.preview.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/watermark_verification_model_1.0.0.yaml" - - def verify_image(self, image: Image) -> WatermarkVerificationResponse: - """Verifies the watermark of an image. - - Args: - image: The image to verify. - - Returns: - A WatermarkVerificationResponse, containing the confidence level of - the image being watermarked. - """ - if not image: - raise ValueError("Image is required.") - - instance = {} - - if image._gcs_uri: - instance["image"] = {"gcsUri": image._gcs_uri} - else: - instance["image"] = {"bytesBase64Encoded": image._as_base64_string()} - - parameters = {} - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - - verification_likelihood = response.predictions[0].get("decision") - return WatermarkVerificationResponse( - _prediction_response=response, - watermark_verification_result=verification_likelihood, - ) From e518db4226d93d218e58567c94b726d06fe86162 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:32:30 -0700 Subject: [PATCH 05/26] remove gcs_uri Change-Id: Ieafe776bde8e79e1747ab6cf252b49a001ab66bf --- .../vision_models/_vision_models.py | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index cc0cd5ca9..8381cae9d 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -47,24 +47,17 @@ class Image: _loaded_bytes: Optional[bytes] = None _loaded_image: Optional["PIL_Image.Image"] = None - _gcs_uri: Optional[str] = None def __init__( self, - image_bytes: Optional[bytes] = None, - gcs_uri: Optional[str] = None, + image_bytes: Optional[bytes], ): """Creates an `Image` object. Args: image_bytes: Image file bytes. Image can be in PNG or JPEG format. - gcs_uri: Image URI in Google Cloud Storage. """ - if bool(image_bytes) == bool(gcs_uri): - raise ValueError("Either image_bytes or gcs_uri must be provided.") - self._image_bytes = image_bytes - self._gcs_uri = gcs_uri @staticmethod def load_from_file(location: str) -> "Image": @@ -77,19 +70,6 @@ def load_from_file(location: str) -> "Image": Returns: Loaded image as an `Image` object. 
""" - parsed_url = urllib.parse.urlparse(location) - if ( - parsed_url.scheme == "https" - and parsed_url.netloc == "storage.googleapis.com" - ): - parsed_url = parsed_url._replace( - scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" - ) - location = urllib.parse.urlunparse(parsed_url) - - if parsed_url.scheme == "gs": - return Image(gcs_uri=location) - # Load image from local path image_bytes = pathlib.Path(location).read_bytes() image = Image(image_bytes=image_bytes) @@ -98,8 +78,6 @@ def load_from_file(location: str) -> "Image": @property def _image_bytes(self) -> bytes: - if self._loaded_bytes is None: - self._loaded_bytes = self._blob.download_as_bytes() return self._loaded_bytes @_image_bytes.setter @@ -123,8 +101,6 @@ def _size(self): @property def _mime_type(self) -> str: """Returns the MIME type of the image.""" - if self._gcs_uri: - return self._blob.content_type if PIL_Image: return PIL_Image.MIME.get(self._pil_image.format, "image/jpeg") # Fall back to jpeg From 13dfe88e97d61297e9a73f9af803e5063e9fe1db Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:34:15 -0700 Subject: [PATCH 06/26] IPython reprs Change-Id: I586876f524684dc3d0ee0ea8510b56bc5153642b --- google/generativeai/vision_models/_vision_models.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 8381cae9d..d7cea74a3 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -114,6 +114,12 @@ def show(self): if PIL_Image and IPython_display: IPython_display.display(self._pil_image) + def _repr_jpeg(self): + return self._pil_image._repr_jpeg() + + def _repr_png(self): + return self._pil_image._repr_png() + def save(self, location: str): """Saves image to a file. From 73a312b363c05212b5a460d50643e1f4a70033ac Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:46:21 -0700 Subject: [PATCH 07/26] remove output_gce_uri Change-Id: Ic4424d06a705bf94f0fade34468160566cb3a6be --- .../vision_models/_vision_models.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index d7cea74a3..45bcd5401 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -188,7 +188,6 @@ def _generate_images( output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - output_gcs_uri: Optional[str] = None, add_watermark: Optional[bool] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] @@ -249,7 +248,6 @@ class ID Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - output_gcs_uri: Google Cloud Storage uri to store the generated images. add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. 
Supported values are: * "block_most" : Strongest filtering level, most strict @@ -338,10 +336,6 @@ class ID parameters["language"] = language shared_generation_parameters["language"] = language - if output_gcs_uri is not None: - parameters["storageUri"] = output_gcs_uri - shared_generation_parameters["storage_uri"] = output_gcs_uri - parameters["editConfig"] = {} if edit_mode is not None: parameters["editConfig"]["editMode"] = edit_mode @@ -415,7 +409,6 @@ def generate_images( guidance_scale: Optional[float] = None, language: Optional[str] = None, seed: Optional[int] = None, - output_gcs_uri: Optional[str] = None, add_watermark: Optional[bool] = True, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] @@ -447,7 +440,6 @@ def generate_images( for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. seed: Image generation random seed. - output_gcs_uri: Google Cloud Storage uri to store the generated images. add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. Supported values are: @@ -472,7 +464,6 @@ def generate_images( guidance_scale=guidance_scale, language=language, seed=seed, - output_gcs_uri=output_gcs_uri, add_watermark=add_watermark, safety_filter_level=safety_filter_level, person_generation=person_generation, @@ -500,7 +491,6 @@ def edit_image( compression_quality: Optional[float] = None, language: Optional[str] = None, seed: Optional[int] = None, - output_gcs_uri: Optional[str] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -557,7 +547,6 @@ class ID `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. seed: Image generation random seed. - output_gcs_uri: Google Cloud Storage uri to store the edited images. safety_filter_level: Adds a filter level to Safety filtering. Supported values are: * "block_most" : Strongest filtering level, most strict @@ -590,7 +579,6 @@ class ID output_mime_type=output_mime_type, compression_quality=compression_quality, language=language, - output_gcs_uri=output_gcs_uri, add_watermark=False, # Not supported for editing yet safety_filter_level=safety_filter_level, person_generation=person_generation, @@ -603,7 +591,6 @@ def upscale_image( upscale_factor: Optional[Literal["x2", "x4"]] = None, output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", output_compression_quality: Optional[int] = None, - output_gcs_uri: Optional[str] = None, ) -> "Image": """Upscales an image. @@ -647,8 +634,6 @@ def upscale_image( image as an int (0-100). Only applicable if the output mime type is "image/jpeg". Defaults to None. - output_gcs_uri: Google Cloud Storage uri to store the upscaled - images. Returns: An `Image` object. 
@@ -704,9 +689,6 @@ def upscale_image( else: parameters["sampleImageSize"] = str(new_size) - if output_gcs_uri is not None: - parameters["storageUri"] = output_gcs_uri - parameters["outputOptions"] = {"mimeType": output_mime_type} if output_mime_type == "image/jpeg" and output_compression_quality is not None: parameters["outputOptions"][ From 04e3f0d9be1a2f2bd0e9b99964e1330bed198a80 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:51:01 -0700 Subject: [PATCH 08/26] remove IPython reprs Change-Id: I474c76c1e73d7a92a653932d1984c02de7e3b71a --- google/generativeai/vision_models/_vision_models.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 45bcd5401..a9e796565 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -114,12 +114,6 @@ def show(self): if PIL_Image and IPython_display: IPython_display.display(self._pil_image) - def _repr_jpeg(self): - return self._pil_image._repr_jpeg() - - def _repr_png(self): - return self._pil_image._repr_png() - def save(self, location: str): """Saves image to a file. From 196eaf71392650dbc2fdf28c02a60a3cec85ea33 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:52:05 -0700 Subject: [PATCH 09/26] remove more gcs_uri. Change-Id: I9f6b8c879dc8f38c5193ddb1e271a62984c7f020 --- .../vision_models/_vision_models.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index a9e796565..312d3f90b 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -267,40 +267,22 @@ class ID } if base_image: - if base_image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": base_image._gcs_uri # pylint: disable=protected-access - } - shared_generation_parameters[ - "base_image_uri" - ] = base_image._gcs_uri # pylint: disable=protected-access - else: - instance["image"] = { - "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access - } - shared_generation_parameters["base_image_hash"] = hashlib.sha1( - base_image._image_bytes # pylint: disable=protected-access - ).hexdigest() + instance["image"] = { + "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access + } + shared_generation_parameters["base_image_hash"] = hashlib.sha1( + base_image._image_bytes # pylint: disable=protected-access + ).hexdigest() if mask: - if mask._gcs_uri: # pylint: disable=protected-access - instance["mask"] = { - "image": { - "gcsUri": mask._gcs_uri # pylint: disable=protected-access - }, - } - shared_generation_parameters[ - "mask_uri" - ] = mask._gcs_uri # pylint: disable=protected-access - else: - instance["mask"] = { - "image": { - "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access - }, - } - shared_generation_parameters["mask_hash"] = hashlib.sha1( - mask._image_bytes # pylint: disable=protected-access - ).hexdigest() + instance["mask"] = { + "image": { + "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access + }, + } + shared_generation_parameters["mask_hash"] = hashlib.sha1( + mask._image_bytes # pylint: disable=protected-access + ).hexdigest() parameters = {} max_size = max(width or 0, height or 0) or None @@ -387,7 
+369,6 @@ class ID generated_image = GeneratedImage( image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, generation_parameters=generation_parameters, - gcs_uri=prediction.get("gcsUri"), ) generated_images.append(generated_image) @@ -663,14 +644,9 @@ def upscale_image( instance = {"prompt": ""} - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } parameters = { "sampleCount": 1, @@ -708,7 +684,6 @@ def upscale_image( return GeneratedImage( image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, generation_parameters=generation_parameters, - gcs_uri=upscaled_image.get("gcsUri"), ) @@ -748,16 +723,14 @@ def __init__( self, image_bytes: Optional[bytes], generation_parameters: Dict[str, Any], - gcs_uri: Optional[str] = None, ): """Creates a `GeneratedImage` object. Args: image_bytes: Image file bytes. Image can be in PNG or JPEG format. generation_parameters: Image generation parameter values. - gcs_uri: Image file Google Cloud Storage uri. """ - super().__init__(image_bytes=image_bytes, gcs_uri=gcs_uri) + super().__init__(image_bytes=image_bytes) self._generation_parameters = generation_parameters @property @@ -782,7 +755,6 @@ def load_from_file(location: str) -> "GeneratedImage": return GeneratedImage( image_bytes=base_image._image_bytes, # pylint: disable=protected-access generation_parameters=generation_parameters, - gcs_uri=base_image._gcs_uri, # pylint: disable=protected-access ) def save(self, location: str, include_generation_parameters: bool = True): From 1917b3c39b97c0fc3601de7c48cf9fbf641f80da Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 17 Sep 2024 08:44:02 -0700 Subject: [PATCH 10/26] handle instances converversion to Value protos. 
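[Editor's note] The helper introduced below hand-converts plain Python values into protobuf Struct/Value messages before they are placed in a PredictRequest (a later patch in this series links this to a proto-plus marshaling issue). A rough sketch of the intended round trip, assuming the to_value helper from this patch is importable; the sample payload is illustrative only::

    from google.protobuf import struct_pb2

    instance = {"prompt": "Astronaut riding a horse", "sampleCount": 2}
    value = to_value(instance)  # plain dict -> struct_pb2.Value wrapping a Struct
    assert isinstance(value, struct_pb2.Value)
    assert value.struct_value.fields["prompt"].string_value == "Astronaut riding a horse"
    assert value.struct_value.fields["sampleCount"].number_value == 2.0  # ints become doubles
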
Change-Id: Id33f8d2d6a4cffbfb7b0d37955cc800a867a70d5 --- .../vision_models/_vision_models.py | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 312d3f90b..3a3d4f34b 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -16,6 +16,7 @@ """Classes for working with vision models.""" import base64 +import collections import dataclasses import hashlib import io @@ -23,7 +24,12 @@ import pathlib import typing from typing import Any, Dict, List, Literal, Optional, Union -import urllib + +from google.protobuf import struct_pb2 + +from proto.marshal.collections import maps +from proto.marshal.collections import repeated + # pylint: disable=g-import-not-at-top try: @@ -37,6 +43,56 @@ PIL_Image = None +def to_value(value) -> struct_pb2.Value: + """Return a protobuf Value object representing this value.""" + if isinstance(value, struct_pb2.Value): + return value + if value is None: + return struct_pb2.Value(null_value=0) + if isinstance(value, bool): + return struct_pb2.Value(bool_value=value) + if isinstance(value, (int, float)): + return struct_pb2.Value(number_value=float(value)) + if isinstance(value, str): + return struct_pb2.Value(string_value=value) + if isinstance(value, collections.abc.Sequence): + return struct_pb2.Value(list_value=to_list_value(value)) + if isinstance(value, collections.abc.Mapping): + return struct_pb2.Value(struct_value=to_mapping_value(value)) + raise ValueError("Unable to coerce value: %r" % value) + +def to_list_value(value) -> struct_pb2.ListValue: + # We got a proto, or else something we sent originally. + # Preserve the instance we have. + if isinstance(value, struct_pb2.ListValue): + return value + if isinstance(value, repeated.RepeatedComposite): + return struct_pb2.ListValue(values=[v for v in value.pb]) + + # We got a list (or something list-like); convert it. + return struct_pb2.ListValue( + values=[to_value(v) for v in value] + ) + +def to_mapping_value(value) -> struct_pb2.Struct: + # We got a proto, or else something we sent originally. + # Preserve the instance we have. + if isinstance(value, struct_pb2.Struct): + return value + if isinstance(value, maps.MapComposite): + return struct_pb2.Struct( + fields={k: v for k, v in value.pb.items()}, + ) + + # We got a dict (or something dict-like); convert it. + return struct_pb2.Struct( + fields={ + k: to_value(v) for k, v in value.items() + } + ) + + + _SUPPORTED_UPSCALING_SIZES = [2048, 4096] @@ -357,7 +413,7 @@ class ID shared_generation_parameters["person_generation"] = person_generation response = self._endpoint.predict( - instances=[instance], + instances=[to_value(instance)], parameters=parameters, ) @@ -666,7 +722,7 @@ def upscale_image( ] = output_compression_quality response = self._endpoint.predict( - instances=[instance], + instances=[to_value(instance)], parameters=parameters, ) From 9212ec5b9d96f9c74ca2b15f9040026e770bfbd2 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 24 Sep 2024 14:59:21 -0700 Subject: [PATCH 11/26] Basically works. 
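[Editor's note] With the prediction client wired up in this patch, the ported model can be exercised end to end. A minimal usage sketch of where this series is headed, based on the API surface shown in these diffs (the Imagen model name is an assumption, not taken from the patch, and an API key must be configured first)::

    import google.generativeai as genai

    genai.configure(api_key="...")

    model = genai.ImageGenerationModel("imagen-3.0-generate-001")  # assumed model name
    result = model.generate_images(
        prompt="A watercolor painting of a lighthouse at dusk",
        number_of_images=1,
        aspect_ratio="1:1",
    )
    result[0].save("lighthouse.png")
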
Change-Id: I28364ab70b2a263b29026f2cf2d1d4f807d88f53 --- google/generativeai/__init__.py | 2 + google/generativeai/client.py | 8 +++ .../vision_models/_vision_models.py | 65 +++++++++---------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/google/generativeai/__init__.py b/google/generativeai/__init__.py index 5b143d768..73025a1b4 100644 --- a/google/generativeai/__init__.py +++ b/google/generativeai/__init__.py @@ -59,6 +59,8 @@ from google.generativeai.generative_models import GenerativeModel from google.generativeai.generative_models import ChatSession +from google.generativeai.vision_models import * + from google.generativeai.models import list_models from google.generativeai.models import list_tuned_models diff --git a/google/generativeai/client.py b/google/generativeai/client.py index d2eb6b1c9..a75643f1a 100644 --- a/google/generativeai/client.py +++ b/google/generativeai/client.py @@ -384,3 +384,11 @@ def get_default_permission_client() -> glm.PermissionServiceClient: def get_default_permission_async_client() -> glm.PermissionServiceAsyncClient: return _client_manager.get_default_client("permission_async") + + +def get_default_prediction_client() -> glm.PermissionServiceClient: + return _client_manager.get_default_client("prediction") + + +def get_default_prediction_async_client() -> glm.PermissionServiceAsyncClient: + return _client_manager.get_default_client("prediction_async") diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3a3d4f34b..2b67fe33f 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -25,6 +25,9 @@ import typing from typing import Any, Dict, List, Literal, Optional, Union +from google.generativeai import client +from google.generativeai import protos + from google.protobuf import struct_pb2 from proto.marshal.collections import maps @@ -43,6 +46,7 @@ PIL_Image = None +# This is to get around https://github.com/googleapis/proto-plus-python/issues/488 def to_value(value) -> struct_pb2.Value: """Return a protobuf Value object representing this value.""" if isinstance(value, struct_pb2.Value): @@ -61,6 +65,7 @@ def to_value(value) -> struct_pb2.Value: return struct_pb2.Value(struct_value=to_mapping_value(value)) raise ValueError("Unable to coerce value: %r" % value) + def to_list_value(value) -> struct_pb2.ListValue: # We got a proto, or else something we sent originally. # Preserve the instance we have. @@ -70,9 +75,8 @@ def to_list_value(value) -> struct_pb2.ListValue: return struct_pb2.ListValue(values=[v for v in value.pb]) # We got a list (or something list-like); convert it. - return struct_pb2.ListValue( - values=[to_value(v) for v in value] - ) + return struct_pb2.ListValue(values=[to_value(v) for v in value]) + def to_mapping_value(value) -> struct_pb2.Struct: # We got a proto, or else something we sent originally. @@ -85,12 +89,7 @@ def to_mapping_value(value) -> struct_pb2.Struct: ) # We got a dict (or something dict-like); convert it. 
- return struct_pb2.Struct( - fields={ - k: to_value(v) for k, v in value.items() - } - ) - + return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) _SUPPORTED_UPSCALING_SIZES = [2048, 4096] @@ -131,7 +130,6 @@ def load_from_file(location: str) -> "Image": image = Image(image_bytes=image_bytes) return image - @property def _image_bytes(self) -> bytes: return self._loaded_bytes @@ -206,9 +204,16 @@ class ImageGenerationModel: response[0].save("image1.png") """ - __module__ = "vertexai.preview.vision_models" + def __init__(self, model_id: str): + if not model_id.startswith("models"): + model_id = f"models/{model_id}" + self.model_name = model_id + self._client = None - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_generative_model_1.0.0.yaml" + @classmethod + def from_pretrained(cls, model_name: str): + """For vertex compatibility""" + return cls(model_name) def _generate_images( self, @@ -242,9 +247,7 @@ def _generate_images( safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, - person_generation: Optional[ - Literal["dont_allow", "allow_adult", "allow_all"] - ] = None, + person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. @@ -312,6 +315,8 @@ class ID Returns: An `ImageGenerationResponse` object. """ + if self._client is None: + self._client = client.get_default_prediction_client() # Note: Only a single prompt is supported by the service. instance = {"prompt": prompt} shared_generation_parameters = { @@ -412,11 +417,14 @@ class ID parameters["personGeneration"] = person_generation shared_generation_parameters["person_generation"] = person_generation - response = self._endpoint.predict( - instances=[to_value(instance)], - parameters=parameters, + # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 + pr = protos.PredictRequest.pb() + request = pr( + model=self.model_name, instances=[to_value(instance)], parameters=to_value(parameters) ) + response = self._client.predict(request) + generated_images: List["GeneratedImage"] = [] for idx, prediction in enumerate(response.predictions): generation_parameters = dict(shared_generation_parameters) @@ -444,9 +452,7 @@ def generate_images( safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, - person_generation: Optional[ - Literal["dont_allow", "allow_adult", "allow_all"] - ] = None, + person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
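# [Editor's note] Illustrative check of the constructor and `from_pretrained` alias
# added in the hunk above: the Vertex-style entry point simply wraps the plain
# constructor, and a bare model id gets the "models/" prefix. (The model name used
# here is an assumption, not taken from this patch.)
from google.generativeai.vision_models import ImageGenerationModel

model_a = ImageGenerationModel("imagen-3.0-generate-001")
model_b = ImageGenerationModel.from_pretrained("imagen-3.0-generate-001")
assert model_a.model_name == model_b.model_name == "models/imagen-3.0-generate-001"
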
@@ -510,9 +516,7 @@ def edit_image( number_of_images: int = 1, guidance_scale: Optional[float] = None, edit_mode: Optional[ - Literal[ - "inpainting-insert", "inpainting-remove", "outpainting", "product-image" - ] + Literal["inpainting-insert", "inpainting-remove", "outpainting", "product-image"] ] = None, mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, segmentation_classes: Optional[List[str]] = None, @@ -525,9 +529,7 @@ def edit_image( safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, - person_generation: Optional[ - Literal["dont_allow", "allow_adult", "allow_all"] - ] = None, + person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, ) -> "ImageGenerationResponse": """Edits an existing image based on text prompt. @@ -717,9 +719,7 @@ def upscale_image( parameters["outputOptions"] = {"mimeType": output_mime_type} if output_mime_type == "image/jpeg" and output_compression_quality is not None: - parameters["outputOptions"][ - "compressionQuality" - ] = output_compression_quality + parameters["outputOptions"]["compressionQuality"] = output_compression_quality response = self._endpoint.predict( instances=[to_value(instance)], @@ -825,9 +825,7 @@ def save(self, location: str, include_generation_parameters: bool = True): if not self._generation_parameters: raise ValueError("Image does not have generation parameters.") if not PIL_Image: - raise ValueError( - "The PIL module is required for saving generation parameters." - ) + raise ValueError("The PIL module is required for saving generation parameters.") exif = self._pil_image.getexif() exif[_EXIF_USER_COMMENT_TAG_IDX] = json.dumps( @@ -836,4 +834,3 @@ def save(self, location: str, include_generation_parameters: bool = True): self._pil_image.save(location, exif=exif) else: super().save(location=location) - From 21649f0aebd8b385b97f8524a70a456f8c79fa2f Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 10:58:19 -0700 Subject: [PATCH 12/26] add _repr_png_ Change-Id: I436170460e17983637283d0086ebc232c9c425ce --- google/generativeai/vision_models/_vision_models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 2b67fe33f..6c112af98 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -187,6 +187,9 @@ def _as_base64_string(self) -> str: # "received initial metadata size exceeds limit" return base64.b64encode(self._image_bytes).decode("ascii") + def _repr_png_(self): + return self._pil_image._repr_png_() + class ImageGenerationModel: """Generates images from text prompt. @@ -773,7 +776,7 @@ def __getitem__(self, idx: int) -> "GeneratedImage": class GeneratedImage(Image): """Generated image.""" - __module__ = "vertexai.preview.vision_models" + __module__ = "google.generativeai" def __init__( self, From 3e34dcf08a03f59a52e59460784d9a1b9c94b3d1 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 11:08:56 -0700 Subject: [PATCH 13/26] Remoive "add watermark" switch. 
Change-Id: I6580ae1b508a458c9813fdf161a08329cd676c08 --- google/generativeai/vision_models/_vision_models.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 6c112af98..1745ba6ce 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -246,7 +246,6 @@ def _generate_images( output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - add_watermark: Optional[bool] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -304,7 +303,6 @@ class ID Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. Supported values are: * "block_most" : Strongest filtering level, most strict blocking * "block_some" : Block some problematic prompts and responses @@ -408,10 +406,6 @@ class ID parameters["outputOptions"]["compressionQuality"] = compression_quality shared_generation_parameters["compression_quality"] = compression_quality - if add_watermark is not None: - parameters["addWatermark"] = add_watermark - shared_generation_parameters["add_watermark"] = add_watermark - if safety_filter_level is not None: parameters["safetySetting"] = safety_filter_level shared_generation_parameters["safety_filter_level"] = safety_filter_level @@ -451,7 +445,6 @@ def generate_images( guidance_scale: Optional[float] = None, language: Optional[str] = None, seed: Optional[int] = None, - add_watermark: Optional[bool] = True, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -480,7 +473,6 @@ def generate_images( for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. seed: Image generation random seed. - add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. 
Supported values are: * "block_most" : Strongest filtering level, most strict @@ -504,7 +496,6 @@ def generate_images( guidance_scale=guidance_scale, language=language, seed=seed, - add_watermark=add_watermark, safety_filter_level=safety_filter_level, person_generation=person_generation, ) @@ -615,7 +606,6 @@ class ID output_mime_type=output_mime_type, compression_quality=compression_quality, language=language, - add_watermark=False, # Not supported for editing yet safety_filter_level=safety_filter_level, person_generation=person_generation, ) From 898a3d76405750d9e30205f35386a419d1a871e3 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 11:44:34 -0700 Subject: [PATCH 14/26] Remove seed (it doesn't work without watermark), +fix upscale Change-Id: I4c394cd861d2646cd663224f0bbeec52580bc0bd --- .../vision_models/_vision_models.py | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 1745ba6ce..722bc1070 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -201,7 +201,6 @@ class ImageGenerationModel: prompt="Astronaut riding a horse", # Optional: number_of_images=1, - seed=0, ) response[0].show() response[0].save("image1.png") @@ -228,7 +227,6 @@ def _generate_images( height: Optional[int] = None, aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, guidance_scale: Optional[float] = None, - seed: Optional[int] = None, base_image: Optional["Image"] = None, mask: Optional["Image"] = None, edit_mode: Optional[ @@ -269,7 +267,6 @@ def _generate_images( guidance_scale: Controls the strength of the prompt. Suggested values are - * 0-9 (low strength) * 10-20 (medium strength) * 21+ (high strength) - seed: Image generation random seed. base_image: Base image to use for the image generation. mask: Mask for the base image. edit_mode: Describes the editing mode for the request. Supported values @@ -361,11 +358,6 @@ class ID parameters["negativePrompt"] = negative_prompt shared_generation_parameters["negative_prompt"] = negative_prompt - if seed is not None: - # Note: String seed and numerical seed give different results - parameters["seed"] = seed - shared_generation_parameters["seed"] = seed - if guidance_scale is not None: parameters["guidanceScale"] = guidance_scale shared_generation_parameters["guidance_scale"] = guidance_scale @@ -444,7 +436,6 @@ def generate_images( aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, guidance_scale: Optional[float] = None, language: Optional[str] = None, - seed: Optional[int] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -472,7 +463,6 @@ def generate_images( Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - seed: Image generation random seed. safety_filter_level: Adds a filter level to Safety filtering. 
Supported values are: * "block_most" : Strongest filtering level, most strict @@ -495,7 +485,6 @@ def generate_images( aspect_ratio=aspect_ratio, guidance_scale=guidance_scale, language=language, - seed=seed, safety_filter_level=safety_filter_level, person_generation=person_generation, ) @@ -519,7 +508,6 @@ def edit_image( output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - seed: Optional[int] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -573,7 +561,6 @@ class ID Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - seed: Image generation random seed. safety_filter_level: Adds a filter level to Safety filtering. Supported values are: * "block_most" : Strongest filtering level, most strict @@ -595,7 +582,6 @@ class ID negative_prompt=negative_prompt, number_of_images=number_of_images, guidance_scale=guidance_scale, - seed=seed, base_image=base_image, mask=mask, edit_mode=edit_mode, @@ -613,7 +599,7 @@ class ID def upscale_image( self, image: Union["Image", "GeneratedImage"], - new_size: Optional[int] = 2048, + new_size: Optional[int] = None, upscale_factor: Optional[Literal["x2", "x4"]] = None, output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", output_compression_quality: Optional[int] = None, @@ -664,6 +650,9 @@ def upscale_image( Returns: An `Image` object. """ + if self._client is None: + self._client = client.get_default_prediction_client() + target_image_size = new_size if new_size else None longest_dim = max(image._size[0], image._size[1]) @@ -714,10 +703,12 @@ def upscale_image( if output_mime_type == "image/jpeg" and output_compression_quality is not None: parameters["outputOptions"]["compressionQuality"] = output_compression_quality - response = self._endpoint.predict( - instances=[to_value(instance)], - parameters=parameters, + + pr = protos.PredictRequest.pb() + request = pr( + model=self.model_name, instances=[to_value(instance)], parameters=to_value(parameters) ) + response = self._client.predict(request) upscaled_image = response.predictions[0] From 0afc6b5f16f3edf2c6388f236890184671eee70e Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 14:52:40 -0700 Subject: [PATCH 15/26] remove edit and upscale Change-Id: Ic9c270279ee020baef2c3b2117199ff17b066d88 --- google/generativeai/vision_models/__init__.py | 2 + .../vision_models/_vision_models.py | 316 ------------------ 2 files changed, 2 insertions(+), 316 deletions(-) diff --git a/google/generativeai/vision_models/__init__.py b/google/generativeai/vision_models/__init__.py index f519c9928..65a545831 100644 --- a/google/generativeai/vision_models/__init__.py +++ b/google/generativeai/vision_models/__init__.py @@ -15,12 +15,14 @@ """Classes for working with vision models.""" from google.generativeai.vision_models._vision_models import ( + Image, GeneratedImage, ImageGenerationModel, ImageGenerationResponse, ) __all__ = [ + "Image", "GeneratedImage", "ImageGenerationModel", "ImageGenerationResponse", diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 722bc1070..64fd35425 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -227,20 +227,6 @@ def _generate_images( height: Optional[int] = 
None, aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, guidance_scale: Optional[float] = None, - base_image: Optional["Image"] = None, - mask: Optional["Image"] = None, - edit_mode: Optional[ - Literal[ - "inpainting-insert", - "inpainting-remove", - "outpainting", - "product-image", - ] - ] = None, - mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, - segmentation_classes: Optional[List[str]] = None, - mask_dilation: Optional[float] = None, - product_position: Optional[Literal["fixed", "reposition"]] = None, output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, @@ -267,30 +253,6 @@ def _generate_images( guidance_scale: Controls the strength of the prompt. Suggested values are - * 0-9 (low strength) * 10-20 (medium strength) * 21+ (high strength) - base_image: Base image to use for the image generation. - mask: Mask for the base image. - edit_mode: Describes the editing mode for the request. Supported values - are - * inpainting-insert: fills the mask area based on the text - prompt (requires mask and text) * inpainting-remove: removes the - object(s) in the mask area. (requires mask) - * outpainting: extend the image based on the mask area. (Requires - mask) * product-image: Changes the background for the predominant - product or subject in the image - mask_mode: Solicits generation of the mask (v/s providing mask as an - input). Supported values are: - * background: Automatically generates a mask for all regions except - the primary subject(s) of the image - * foreground: Automatically generates a mask for the primary - subjects(s) of the image. - * semantic: Segment one or more of the segmentation classes using - class ID - segmentation_classes: List of class IDs for segmentation. Max of 5 IDs - mask_dilation: Defines the dilation percentage of the mask provided. - Float between 0 and 1. Defaults to 0.03 - product_position: Defines whether the product should stay fixed or be - repositioned. Supported Values: - * fixed: Fixed position - * reposition: Can be moved (default) output_mime_type: Which image format should the output be saved as. 
Supported values: * image/png: Save as a PNG image * image/jpeg: Save as a JPEG image @@ -325,24 +287,6 @@ class ID "number_of_images_in_batch": number_of_images, } - if base_image: - instance["image"] = { - "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access - } - shared_generation_parameters["base_image_hash"] = hashlib.sha1( - base_image._image_bytes # pylint: disable=protected-access - ).hexdigest() - - if mask: - instance["mask"] = { - "image": { - "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access - }, - } - shared_generation_parameters["mask_hash"] = hashlib.sha1( - mask._image_bytes # pylint: disable=protected-access - ).hexdigest() - parameters = {} max_size = max(width or 0, height or 0) or None if aspect_ratio is not None: @@ -366,29 +310,6 @@ class ID parameters["language"] = language shared_generation_parameters["language"] = language - parameters["editConfig"] = {} - if edit_mode is not None: - parameters["editConfig"]["editMode"] = edit_mode - shared_generation_parameters["edit_mode"] = edit_mode - - if mask is None and edit_mode != "product-image": - parameters["editConfig"]["maskMode"] = {} - if mask_mode is not None: - parameters["editConfig"]["maskMode"]["maskType"] = mask_mode - shared_generation_parameters["mask_mode"] = mask_mode - - if segmentation_classes is not None: - parameters["editConfig"]["maskMode"]["classes"] = segmentation_classes - shared_generation_parameters["classes"] = segmentation_classes - - if mask_dilation is not None: - parameters["editConfig"]["maskDilation"] = mask_dilation - shared_generation_parameters["mask_dilation"] = mask_dilation - - if product_position is not None: - parameters["editConfig"]["productPosition"] = product_position - shared_generation_parameters["product_position"] = product_position - parameters["outputOptions"] = {} if output_mime_type is not None: parameters["outputOptions"]["mimeType"] = output_mime_type @@ -489,243 +410,6 @@ def generate_images( person_generation=person_generation, ) - def edit_image( - self, - *, - prompt: str, - base_image: "Image", - mask: Optional["Image"] = None, - negative_prompt: Optional[str] = None, - number_of_images: int = 1, - guidance_scale: Optional[float] = None, - edit_mode: Optional[ - Literal["inpainting-insert", "inpainting-remove", "outpainting", "product-image"] - ] = None, - mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, - segmentation_classes: Optional[List[str]] = None, - mask_dilation: Optional[float] = None, - product_position: Optional[Literal["fixed", "reposition"]] = None, - output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, - compression_quality: Optional[float] = None, - language: Optional[str] = None, - safety_filter_level: Optional[ - Literal["block_most", "block_some", "block_few", "block_fewest"] - ] = None, - person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, - ) -> "ImageGenerationResponse": - """Edits an existing image based on text prompt. - - Args: - prompt: Text prompt for the image. - base_image: Base image from which to generate the new image. - mask: Mask for the base image. - negative_prompt: A description of what you want to omit in - the generated images. - number_of_images: Number of images to generate. Range: 1..8. - guidance_scale: Controls the strength of the prompt. 
- Suggested values are: - * 0-9 (low strength) - * 10-20 (medium strength) - * 21+ (high strength) - edit_mode: Describes the editing mode for the request. Supported values are: - * inpainting-insert: fills the mask area based on the text prompt - (requires mask and text) - * inpainting-remove: removes the object(s) in the mask area. - (requires mask) - * outpainting: extend the image based on the mask area. - (Requires mask) - * product-image: Changes the background for the predominant product - or subject in the image - mask_mode: Solicits generation of the mask (v/s providing mask as an - input). Supported values are: - * background: Automatically generates a mask for all regions except - the primary subject(s) of the image - * foreground: Automatically generates a mask for the primary - subjects(s) of the image. - * semantic: Segment one or more of the segmentation classes using - class ID - segmentation_classes: List of class IDs for segmentation. Max of 5 IDs - mask_dilation: Defines the dilation percentage of the mask provided. - Float between 0 and 1. Defaults to 0.03 - product_position: Defines whether the product should stay fixed or be - repositioned. Supported Values: - * fixed: Fixed position - * reposition: Can be moved (default) - output_mime_type: Which image format should the output be saved as. - Supported values: - * image/png: Save as a PNG image - * image/jpeg: Save as a JPEG image - compression_quality: Level of compression if the output mime type is - selected to be image/jpeg. Float between 0 to 100 - language: Language of the text prompt for the image. Default: None. - Supported values are `"en"` for English, `"hi"` for Hindi, - `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for - automatic language detection. - safety_filter_level: Adds a filter level to Safety filtering. Supported - values are: - * "block_most" : Strongest filtering level, most strict - blocking - * "block_some" : Block some problematic prompts and responses - * "block_few" : Block fewer problematic prompts and responses - * "block_fewest" : Block very few problematic prompts and responses - person_generation: Allow generation of people by the model Supported - values are: - * "dont_allow" : Block generation of people - * "allow_adult" : Generate adults, but not children - * "allow_all" : Generate adults and children - - Returns: - An `ImageGenerationResponse` object. - """ - return self._generate_images( - prompt=prompt, - negative_prompt=negative_prompt, - number_of_images=number_of_images, - guidance_scale=guidance_scale, - base_image=base_image, - mask=mask, - edit_mode=edit_mode, - mask_mode=mask_mode, - segmentation_classes=segmentation_classes, - mask_dilation=mask_dilation, - product_position=product_position, - output_mime_type=output_mime_type, - compression_quality=compression_quality, - language=language, - safety_filter_level=safety_filter_level, - person_generation=person_generation, - ) - - def upscale_image( - self, - image: Union["Image", "GeneratedImage"], - new_size: Optional[int] = None, - upscale_factor: Optional[Literal["x2", "x4"]] = None, - output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", - output_compression_quality: Optional[int] = None, - ) -> "Image": - """Upscales an image. - - This supports upscaling images generated through the `generate_images()` - method, or upscaling a new image. 
- - Examples:: - - # Upscale a generated image - model = ImageGenerationModel.from_pretrained("imagegeneration@002") - response = model.generate_images( - prompt="Astronaut riding a horse", - ) - model.upscale_image(image=response[0]) - - # Upscale a new 1024x1024 image - my_image = Image.load_from_file("my-image.png") - model.upscale_image(image=my_image) - - # Upscale a new arbitrary sized image using a x2 or x4 upscaling factor - my_image = Image.load_from_file("my-image.png") - model.upscale_image(image=my_image, upscale_factor="x2") - - # Upscale an image and get the result in JPEG format - my_image = Image.load_from_file("my-image.png") - model.upscale_image(image=my_image, output_mime_type="image/jpeg", - output_compression_quality=90) - - Args: - image (Union[GeneratedImage, Image]): Required. The generated image - to upscale. - new_size (int): The size of the biggest dimension of the upscaled - image. - Only 2048 and 4096 are currently supported. Results in a - 2048x2048 or 4096x4096 image. Defaults to 2048 if not provided. - upscale_factor: The upscaling factor. Supported values are "x2" and - "x4". Defaults to None. - output_mime_type: The mime type of the output image. Supported values - are "image/png" and "image/jpeg". Defaults to "image/png". - output_compression_quality: The compression quality of the output - image - as an int (0-100). Only applicable if the output mime type is - "image/jpeg". Defaults to None. - - Returns: - An `Image` object. - """ - if self._client is None: - self._client = client.get_default_prediction_client() - - target_image_size = new_size if new_size else None - longest_dim = max(image._size[0], image._size[1]) - - if not new_size and not upscale_factor: - raise ValueError("Either new_size or upscale_factor must be provided.") - - if not upscale_factor: - x2_factor = 2.0 - x4_factor = 4.0 - epsilon = 0.1 - is_upscaling_x2_request = abs(new_size / longest_dim - x2_factor) < epsilon - is_upscaling_x4_request = abs(new_size / longest_dim - x4_factor) < epsilon - - if not is_upscaling_x2_request and not is_upscaling_x4_request: - raise ValueError( - "Only x2 and x4 upscaling are currently supported. Requested" - f" upscaling factor: {new_size / longest_dim}" - ) - else: - if upscale_factor == "x2": - target_image_size = longest_dim * 2 - else: - target_image_size = longest_dim * 4 - if new_size not in _SUPPORTED_UPSCALING_SIZES: - raise ValueError( - "Only the folowing square upscaling sizes are currently supported:" - f" {_SUPPORTED_UPSCALING_SIZES}." 
- ) - - instance = {"prompt": ""} - - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - - parameters = { - "sampleCount": 1, - "mode": "upscale", - } - - if upscale_factor: - parameters["upscaleConfig"] = {"upscaleFactor": upscale_factor} - - else: - parameters["sampleImageSize"] = str(new_size) - - parameters["outputOptions"] = {"mimeType": output_mime_type} - if output_mime_type == "image/jpeg" and output_compression_quality is not None: - parameters["outputOptions"]["compressionQuality"] = output_compression_quality - - - pr = protos.PredictRequest.pb() - request = pr( - model=self.model_name, instances=[to_value(instance)], parameters=to_value(parameters) - ) - response = self._client.predict(request) - - upscaled_image = response.predictions[0] - - if isinstance(image, GeneratedImage): - generation_parameters = image.generation_parameters - - else: - generation_parameters = {} - - generation_parameters["upscaled_image_size"] = target_image_size - - encoded_bytes = upscaled_image.get("bytesBase64Encoded") - return GeneratedImage( - image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, - generation_parameters=generation_parameters, - ) - @dataclasses.dataclass class ImageGenerationResponse: From c1f739f6a0eeba827d03c76b6b8a24bf4aa2bd3c Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 15:54:45 -0700 Subject: [PATCH 16/26] remove upscale Change-Id: I84f3b603732c3c1e91da9d5abf38c332b24772cb --- google/generativeai/vision_models/_vision_models.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 64fd35425..54b896e94 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -92,9 +92,6 @@ def to_mapping_value(value) -> struct_pb2.Struct: return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) -_SUPPORTED_UPSCALING_SIZES = [2048, 4096] - - class Image: """Image.""" From 5b9bf58e84209e90f6444715c00a66eb01964258 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 15:59:08 -0700 Subject: [PATCH 17/26] skip bad test Change-Id: Ief70d4fdc9d7478b402cf2f790817f98e336b25a --- tests/test_async_code_match.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_async_code_match.py b/tests/test_async_code_match.py index 0ec4550d4..2e5e3a9f3 100644 --- a/tests/test_async_code_match.py +++ b/tests/test_async_code_match.py @@ -75,6 +75,7 @@ def _execute_code_match(self, source, asource): asource = re.sub(" *?# type: ignore", "", asource) self.assertEqual(source, asource) + @absltest.skip('This test is broken: globally matching functions based only on the name') def test_code_match_for_async_methods(self): for fpath in (pathlib.Path(__file__).parent.parent / "google").rglob("*.py"): if fpath.name in EXEMPT_FILES or any([d in fpath.parts for d in EXEMPT_DIRS]): From 1be0ea09b66549db976c1b844f0fef3a0c492c73 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 16:06:39 -0700 Subject: [PATCH 18/26] format Change-Id: I4d00cad9d0e6485f5b710d2ffcec82a9b738e2b0 --- tests/test_async_code_match.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_async_code_match.py b/tests/test_async_code_match.py index 2e5e3a9f3..457200b7b 100644 --- a/tests/test_async_code_match.py +++ b/tests/test_async_code_match.py @@ -75,7 +75,7 @@ def _execute_code_match(self, source, 
asource): asource = re.sub(" *?# type: ignore", "", asource) self.assertEqual(source, asource) - @absltest.skip('This test is broken: globally matching functions based only on the name') + @absltest.skip("This test is broken: globally matching functions based only on the name") def test_code_match_for_async_methods(self): for fpath in (pathlib.Path(__file__).parent.parent / "google").rglob("*.py"): if fpath.name in EXEMPT_FILES or any([d in fpath.parts for d in EXEMPT_DIRS]): From 8943a8bc0b7f4ee6e2203e702cdf833375f92b1c Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 11:24:13 -0700 Subject: [PATCH 19/26] check enums Change-Id: I8e7b92fc15d3941b7fa74a97b95d2577be9d6c1f --- google/generativeai/generative_models.py | 2 +- .../vision_models/_vision_models.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/google/generativeai/generative_models.py b/google/generativeai/generative_models.py index 134430b2e..dcda3135a 100644 --- a/google/generativeai/generative_models.py +++ b/google/generativeai/generative_models.py @@ -4,7 +4,7 @@ from collections.abc import Iterable import textwrap -from typing import Any, Union, overload +from typing import Any, Literal, Union, overload import reprlib # pylint: disable=bad-continuation, line-too-long diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 54b896e94..3255533bb 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -91,6 +91,10 @@ def to_mapping_value(value) -> struct_pb2.Struct: # We got a dict (or something dict-like); convert it. return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) +ASPECT_RATIOS = ["1:1", "9:16", "16:9", "4:3", "3:4"] +OUTPUT_MIME_TYPES = ["image/png", "image/jpeg"] +SAFETY_FILTER_LEVELS = ["block_most", "block_some", "block_few", "block_fewest"] +PERSON_GENERATIONS = ["dont_allow", "allow_adult", "allow_all"] class Image: """Image.""" @@ -222,15 +226,15 @@ def _generate_images( number_of_images: int = 1, width: Optional[int] = None, height: Optional[int] = None, - aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + aspect_ratio: Optional[Literal[*ASPECT_RATIOS]] = None, guidance_scale: Optional[float] = None, - output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, + output_mime_type: Optional[Literal[*OUTPUT_MIME_TYPES]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, safety_filter_level: Optional[ - Literal["block_most", "block_some", "block_few", "block_fewest"] + Literal[*SAFETY_FILTER_LEVELS] ] = None, - person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, + person_generation: Optional[Literal[*PERSON_GENERATIONS]] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
@@ -287,6 +291,8 @@ def _generate_images( parameters = {} max_size = max(width or 0, height or 0) or None if aspect_ratio is not None: + if aspect_ratio not in ASPECT_RATIOS: + raise ValueError(f'aspect_ratio not in {ASPECT_RATIOS}') parameters["aspectRatio"] = aspect_ratio elif max_size: # Note: The size needs to be a string @@ -309,6 +315,8 @@ def _generate_images( parameters["outputOptions"] = {} if output_mime_type is not None: + if output_mime_type not in OUTPUT_MIME_TYPES: + raise ValueError(f'output_mime_type not in {OUTPUT_MIME_TYPES}') parameters["outputOptions"]["mimeType"] = output_mime_type shared_generation_parameters["mime_type"] = output_mime_type @@ -317,6 +325,8 @@ def _generate_images( shared_generation_parameters["compression_quality"] = compression_quality if safety_filter_level is not None: + if safety_filter_level not in SAFETY_FILTER_LEVELS: + raise ValueError(f'safety_filter_level not in {SAFETY_FILTER_LEVELS}') parameters["safetySetting"] = safety_filter_level shared_generation_parameters["safety_filter_level"] = safety_filter_level From 2ff8141566fddd74fff66a37f3c10533b711c772 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 11:39:13 -0700 Subject: [PATCH 20/26] Remove * unpackng Change-Id: Iddc42b906dfab12dfa500f0f9774d58521684548 --- .../vision_models/_vision_models.py | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3255533bb..46a34e44e 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -91,10 +91,19 @@ def to_mapping_value(value) -> struct_pb2.Struct: # We got a dict (or something dict-like); convert it. return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) -ASPECT_RATIOS = ["1:1", "9:16", "16:9", "4:3", "3:4"] -OUTPUT_MIME_TYPES = ["image/png", "image/jpeg"] -SAFETY_FILTER_LEVELS = ["block_most", "block_some", "block_few", "block_fewest"] -PERSON_GENERATIONS = ["dont_allow", "allow_adult", "allow_all"] + +AspectRatio = Literal["1:1", "9:16", "16:9", "4:3", "3:4"] +ASPECT_RATIOS = AspectRatio.__args__ + +OutputMimeType = Literal["image/png", "image/jpeg"] +OUTPUT_MIME_TYPES = OutputMimeType.__args__ + +SafetyFilterLevel = Literal["block_most", "block_some", "block_few", "block_fewest"] +SAFETY_FILTER_LEVELS = SafetyFilterLevel.__args__ + +PersonGeneration = Literal["dont_allow", "allow_adult", "allow_all"] +PERSON_GENERATIONS = PersonGeneration.__args__ + class Image: """Image.""" @@ -226,15 +235,13 @@ def _generate_images( number_of_images: int = 1, width: Optional[int] = None, height: Optional[int] = None, - aspect_ratio: Optional[Literal[*ASPECT_RATIOS]] = None, + aspect_ratio: Optional[AspectRatio] = None, guidance_scale: Optional[float] = None, - output_mime_type: Optional[Literal[*OUTPUT_MIME_TYPES]] = None, + output_mime_type: Optional[OutputMimeType] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - safety_filter_level: Optional[ - Literal[*SAFETY_FILTER_LEVELS] - ] = None, - person_generation: Optional[Literal[*PERSON_GENERATIONS]] = None, + safety_filter_level: Optional[SafetyFilterLevel] = None, + person_generation: Optional[PersonGeneration] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
@@ -292,7 +299,7 @@ def _generate_images( max_size = max(width or 0, height or 0) or None if aspect_ratio is not None: if aspect_ratio not in ASPECT_RATIOS: - raise ValueError(f'aspect_ratio not in {ASPECT_RATIOS}') + raise ValueError(f"aspect_ratio not in {ASPECT_RATIOS}") parameters["aspectRatio"] = aspect_ratio elif max_size: # Note: The size needs to be a string @@ -316,7 +323,7 @@ def _generate_images( parameters["outputOptions"] = {} if output_mime_type is not None: if output_mime_type not in OUTPUT_MIME_TYPES: - raise ValueError(f'output_mime_type not in {OUTPUT_MIME_TYPES}') + raise ValueError(f"output_mime_type not in {OUTPUT_MIME_TYPES}") parameters["outputOptions"]["mimeType"] = output_mime_type shared_generation_parameters["mime_type"] = output_mime_type @@ -326,7 +333,7 @@ def _generate_images( if safety_filter_level is not None: if safety_filter_level not in SAFETY_FILTER_LEVELS: - raise ValueError(f'safety_filter_level not in {SAFETY_FILTER_LEVELS}') + raise ValueError(f"safety_filter_level not in {SAFETY_FILTER_LEVELS}") parameters["safetySetting"] = safety_filter_level shared_generation_parameters["safety_filter_level"] = safety_filter_level @@ -361,13 +368,13 @@ def generate_images( *, negative_prompt: Optional[str] = None, number_of_images: int = 1, - aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + aspect_ratio: Optional[AspectRatio] = None, guidance_scale: Optional[float] = None, language: Optional[str] = None, safety_filter_level: Optional[ - Literal["block_most", "block_some", "block_few", "block_fewest"] + SafetyFilterLevel ] = None, - person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, + person_generation: Optional[PersonGeneration] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
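
PATCH 19 and PATCH 20 above converge on one pattern: declare each option set once as a typing.Literal alias and derive the runtime-checkable tuple from that alias, since star-unpacking inside Literal[...] is a syntax error on the older Python versions this library still supports and is not accepted by type checkers. Below is a minimal, standalone sketch of that pattern; FruitChoice, FRUIT_CHOICES, and pick_fruit are hypothetical names used only for illustration, not library code.

from typing import Literal, Optional, get_args

# Single source of truth: the Literal alias drives both static checking and
# the runtime validation tuple. The patch reads the alias's __args__ directly;
# typing.get_args() is the public equivalent.
FruitChoice = Literal["apple", "banana", "cherry"]
FRUIT_CHOICES = get_args(FruitChoice)


def pick_fruit(choice: Optional[FruitChoice] = None) -> str:
    # Static checkers constrain annotated callers; the runtime check still
    # guards untyped callers, mirroring the ValueError raised in the patch.
    if choice is not None and choice not in FRUIT_CHOICES:
        raise ValueError(f"choice not in {FRUIT_CHOICES}")
    return choice if choice is not None else FRUIT_CHOICES[0]


print(pick_fruit("banana"))  # banana

Deriving the tuple from the alias keeps the annotation and the validation list from drifting apart, which is the design choice PATCH 20 settles on.
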
From d2f1a4e0203cc92ecb86836ec3964797f7101dec Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 11:51:22 -0700 Subject: [PATCH 21/26] ignore typing errors, these are incorrect Change-Id: Idc4e594811e52aa3d17562851ef02808e6dd5633 --- google/generativeai/vision_models/_vision_models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 46a34e44e..7f3c8fedc 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -93,16 +93,16 @@ def to_mapping_value(value) -> struct_pb2.Struct: AspectRatio = Literal["1:1", "9:16", "16:9", "4:3", "3:4"] -ASPECT_RATIOS = AspectRatio.__args__ +ASPECT_RATIOS = AspectRatio.__args__ # type: ignore OutputMimeType = Literal["image/png", "image/jpeg"] -OUTPUT_MIME_TYPES = OutputMimeType.__args__ +OUTPUT_MIME_TYPES = OutputMimeType.__args__ # type: ignore SafetyFilterLevel = Literal["block_most", "block_some", "block_few", "block_fewest"] -SAFETY_FILTER_LEVELS = SafetyFilterLevel.__args__ +SAFETY_FILTER_LEVELS = SafetyFilterLevel.__args__ # type: ignore PersonGeneration = Literal["dont_allow", "allow_adult", "allow_all"] -PERSON_GENERATIONS = PersonGeneration.__args__ +PERSON_GENERATIONS = PersonGeneration.__args__ # type: ignore class Image: @@ -198,7 +198,7 @@ def _as_base64_string(self) -> str: return base64.b64encode(self._image_bytes).decode("ascii") def _repr_png_(self): - return self._pil_image._repr_png_() + return self._pil_image._repr_png_() # type:ignore class ImageGenerationModel: From 57f25933db6fa21ededadd37ad3bca4b9fd0a705 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 12:12:00 -0700 Subject: [PATCH 22/26] fix typing? Change-Id: I5005dd38b7d3a03edc1c54d42f3a6c8f959d7274 --- google/generativeai/vision_models/_vision_models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 7f3c8fedc..3a50b3872 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -149,7 +149,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PIL_Image.Image": + def _pil_image(self) -> "PIL_Image.Image": # type: ignore if self._loaded_image is None: if not PIL_Image: raise RuntimeError( @@ -198,7 +198,7 @@ def _as_base64_string(self) -> str: return base64.b64encode(self._image_bytes).decode("ascii") def _repr_png_(self): - return self._pil_image._repr_png_() # type:ignore + return self._pil_image._repr_png_() # type:ignore class ImageGenerationModel: @@ -371,9 +371,7 @@ def generate_images( aspect_ratio: Optional[AspectRatio] = None, guidance_scale: Optional[float] = None, language: Optional[str] = None, - safety_filter_level: Optional[ - SafetyFilterLevel - ] = None, + safety_filter_level: Optional[SafetyFilterLevel] = None, person_generation: Optional[PersonGeneration] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
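
The # type: ignore comments added in PATCH 21 and PATCH 22 exist because the optional Pillow import binds PIL_Image to None when the package is absent, so a string annotation such as "PIL_Image.Image" cannot be resolved statically even though the attribute access is guarded at runtime. A small standalone sketch of that optional-dependency pattern follows, using a hypothetical open_image helper; it is an illustration of the idea, not the library's implementation.

try:
    from PIL import Image as PIL_Image  # optional dependency
except ImportError:
    PIL_Image = None  # checkers now see this name as possibly None


def open_image(path: str) -> "PIL_Image.Image":  # type: ignore  # unresolvable when Pillow is absent
    # A runtime guard stands in for what the checker cannot prove statically.
    if PIL_Image is None:
        raise RuntimeError("Pillow is required here; install it with 'pip install pillow'.")
    return PIL_Image.open(path)

The commits that follow experiment with exposing the real PIL types to the checker and settle on an "if typing.TYPE_CHECKING:" split, so static analysis always sees the real module while the runtime path keeps the try/except fallback.
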
From 0adcc1aa8f1f075e0053647942ba11ef8d70aa32 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 08:46:51 -0700 Subject: [PATCH 23/26] fix lazy loading with typing Change-Id: I2ebb3afbe3df93f918ffe5f66dac20392665c6cc --- google/generativeai/vision_models/_vision_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3a50b3872..38ef66e67 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -42,8 +42,10 @@ try: from PIL import Image as PIL_Image + from PIL.Image import Image as PILImageClass except ImportError: PIL_Image = None + PILImageClass = None # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 @@ -111,7 +113,7 @@ class Image: __module__ = "vertexai.vision_models" _loaded_bytes: Optional[bytes] = None - _loaded_image: Optional["PIL_Image.Image"] = None + _loaded_image: Optional["PILImageClass"] = None def __init__( self, @@ -149,7 +151,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PIL_Image.Image": # type: ignore + def _pil_image(self) -> "PILImageClass": if self._loaded_image is None: if not PIL_Image: raise RuntimeError( From 566e8a894cf0e2ae86f469c3faee0644cefabe9a Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 09:49:38 -0700 Subject: [PATCH 24/26] Revert "fix lazy loading with typing" This reverts commit 0baabfe01d3598c9b0f54e2d6e0ceef12ba5f8c3. --- google/generativeai/vision_models/_vision_models.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 38ef66e67..3a50b3872 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -42,10 +42,8 @@ try: from PIL import Image as PIL_Image - from PIL.Image import Image as PILImageClass except ImportError: PIL_Image = None - PILImageClass = None # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 @@ -113,7 +111,7 @@ class Image: __module__ = "vertexai.vision_models" _loaded_bytes: Optional[bytes] = None - _loaded_image: Optional["PILImageClass"] = None + _loaded_image: Optional["PIL_Image.Image"] = None def __init__( self, @@ -151,7 +149,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PILImageClass": + def _pil_image(self) -> "PIL_Image.Image": # type: ignore if self._loaded_image is None: if not PIL_Image: raise RuntimeError( From f077610eb44c9207d8f8126084d6d0f98f10dad0 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 11:07:43 -0700 Subject: [PATCH 25/26] Use if TYPE_CHECKING Change-Id: I63c4c7909f7d7060a4b557f41cff4ba9010e004c --- .../vision_models/_vision_models.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3a50b3872..78f495d65 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -34,16 +34,22 @@ from proto.marshal.collections import repeated -# pylint: disable=g-import-not-at-top -try: +# pylint: disable=g-import-not-at-top\ +if typing.TYPE_CHECKING: from IPython import 
display as IPython_display -except ImportError: - IPython_display = None - -try: - from PIL import Image as PIL_Image -except ImportError: - PIL_Image = None +else: + try: + from IPython import display as IPython_display + except ImportError: + IPython_display = None + +if typing.TYPE_CHECKING: + import PIL.Image as PIL_Image +else: + try: + from PIL import Image as PIL_Image + except ImportError: + PIL_Image = None # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 From d97412812691cf31982eaf9edf166a0e757e7029 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 11:15:38 -0700 Subject: [PATCH 26/26] format Change-Id: I446692905361c56505b070d72d94ea7fecdaede8 --- google/generativeai/vision_models/_vision_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 78f495d65..0bb4f7dbe 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -155,7 +155,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PIL_Image.Image": # type: ignore + def _pil_image(self) -> "PIL_Image.Image": # type: ignore if self._loaded_image is None: if not PIL_Image: raise RuntimeError(