From 2ba111669451fa115cfeae1418982e9c7aab7fcc Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 10:47:00 -0700 Subject: [PATCH 01/26] Add more enum samples Change-Id: I743d5967cc1cc91576b8ddf5a60db1767d94508d --- samples/controlled_generation.py | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/samples/controlled_generation.py b/samples/controlled_generation.py index 042209a72..8f6787676 100644 --- a/samples/controlled_generation.py +++ b/samples/controlled_generation.py @@ -160,6 +160,47 @@ def test_x_enum_raw(self): print(result) # Keyboard # [END x_enum_raw] + def test_x_enum(self): + # [START x_enum] + import enum + + class Choice(enum.Enum): + PERCUSSION = "Percussion" + STRING = "String" + WOODWIND = "Woodwind" + BRASS = "Brass" + KEYBOARD = "Keyboard" + + model = genai.GenerativeModel("gemini-1.5-pro-latest") + + organ = genai.upload_file(media / "organ.jpg") + result = model.generate_content( + ["What kind of instrument is this:", organ], + generation_config=genai.GenerationConfig( + response_mime_type="text/x.enum", response_schema=Choice + ), + ) + print(result) # "Keyboard" + # [END x_enum] + + def test_x_enum_raw(self): + # [START x_enum_raw] + model = genai.GenerativeModel("gemini-1.5-pro-latest") + + organ = genai.upload_file(media / "organ.jpg") + result = model.generate_content( + ["What kind of instrument is this:", organ], + generation_config=genai.GenerationConfig( + response_mime_type="text/x.enum", + response_schema={ + "type": "STRING", + "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"], + }, + ), + ) + print(result) # "Keyboard" + # [END x_enum_raw] + if __name__ == "__main__": absltest.main() From f62d7060691813324d5bd92ee448bfaf3b477d74 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 11:08:48 -0700 Subject: [PATCH 02/26] format Change-Id: I8f6f9389f1cae0a7c934217968d4e2e20bb9590e --- samples/controlled_generation.py | 41 -------------------------------- 1 file changed, 41 deletions(-) diff --git a/samples/controlled_generation.py b/samples/controlled_generation.py index 8f6787676..78c422464 100644 --- a/samples/controlled_generation.py +++ b/samples/controlled_generation.py @@ -119,47 +119,6 @@ def test_json_enum_raw(self): print(result) # "Keyboard" # [END json_enum_raw] - def test_x_enum(self): - # [START x_enum] - import enum - - class Choice(enum.Enum): - PERCUSSION = "Percussion" - STRING = "String" - WOODWIND = "Woodwind" - BRASS = "Brass" - KEYBOARD = "Keyboard" - - model = genai.GenerativeModel("gemini-1.5-pro-latest") - - organ = genai.upload_file(media / "organ.jpg") - result = model.generate_content( - ["What kind of instrument is this:", organ], - generation_config=genai.GenerationConfig( - response_mime_type="text/x.enum", response_schema=Choice - ), - ) - print(result) # Keyboard - # [END x_enum] - - def test_x_enum_raw(self): - # [START x_enum_raw] - model = genai.GenerativeModel("gemini-1.5-pro-latest") - - organ = genai.upload_file(media / "organ.jpg") - result = model.generate_content( - ["What kind of instrument is this:", organ], - generation_config=genai.GenerationConfig( - response_mime_type="text/x.enum", - response_schema={ - "type": "STRING", - "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"], - }, - ), - ) - print(result) # Keyboard - # [END x_enum_raw] - def test_x_enum(self): # [START x_enum] import enum From 9583950c8dc08054157044bf35dea46e5962e458 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 14:05:13 -0700 Subject: 
[PATCH 03/26] From: https://github.com/googleapis/python-aiplatform/tree/v1.65.0/vertexai/vision_models Change-Id: I693579ccf2994212f25d0354d091d3210fbf3212 --- google/generativeai/vision_models/__init__.py | 45 + .../vision_models/_vision_models.py | 1376 +++++++++++++++++ 2 files changed, 1421 insertions(+) create mode 100644 google/generativeai/vision_models/__init__.py create mode 100644 google/generativeai/vision_models/_vision_models.py diff --git a/google/generativeai/vision_models/__init__.py b/google/generativeai/vision_models/__init__.py new file mode 100644 index 000000000..1834b5ceb --- /dev/null +++ b/google/generativeai/vision_models/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Classes for working with vision models.""" + +from vertexai.vision_models._vision_models import ( + GeneratedImage, + Image, + ImageCaptioningModel, + ImageGenerationModel, + ImageGenerationResponse, + ImageQnAModel, + ImageTextModel, + MultiModalEmbeddingModel, + MultiModalEmbeddingResponse, + Video, + VideoEmbedding, + VideoSegmentConfig, +) + +__all__ = [ + "GeneratedImage", + "Image", + "ImageCaptioningModel", + "ImageGenerationModel", + "ImageGenerationResponse", + "ImageQnAModel", + "ImageTextModel", + "MultiModalEmbeddingModel", + "MultiModalEmbeddingResponse", + "Video", + "VideoEmbedding", + "VideoSegmentConfig", +] diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py new file mode 100644 index 000000000..a80f0b2dc --- /dev/null +++ b/google/generativeai/vision_models/_vision_models.py @@ -0,0 +1,1376 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# pylint: disable=bad-continuation, line-too-long, protected-access +"""Classes for working with vision models.""" + +import base64 +import dataclasses +import hashlib +import io +import json +import pathlib +import typing +from typing import Any, Dict, List, Literal, Optional, Union +import urllib + +from google.cloud import storage + +from google.cloud.aiplatform import initializer as aiplatform_initializer +from vertexai._model_garden import _model_garden_models + +# pylint: disable=g-import-not-at-top +try: + from IPython import display as IPython_display +except ImportError: + IPython_display = None + +try: + from PIL import Image as PIL_Image +except ImportError: + PIL_Image = None + + +_SUPPORTED_UPSCALING_SIZES = [2048, 4096] + + +class Image: + """Image.""" + + __module__ = "vertexai.vision_models" + + _loaded_bytes: Optional[bytes] = None + _loaded_image: Optional["PIL_Image.Image"] = None + _gcs_uri: Optional[str] = None + + def __init__( + self, + image_bytes: Optional[bytes] = None, + gcs_uri: Optional[str] = None, + ): + """Creates an `Image` object. + + Args: + image_bytes: Image file bytes. Image can be in PNG or JPEG format. + gcs_uri: Image URI in Google Cloud Storage. + """ + if bool(image_bytes) == bool(gcs_uri): + raise ValueError("Either image_bytes or gcs_uri must be provided.") + + self._image_bytes = image_bytes + self._gcs_uri = gcs_uri + + @staticmethod + def load_from_file(location: str) -> "Image": + """Loads image from local file or Google Cloud Storage. + + Args: + location: Local path or Google Cloud Storage uri from where to load + the image. + + Returns: + Loaded image as an `Image` object. + """ + parsed_url = urllib.parse.urlparse(location) + if ( + parsed_url.scheme == "https" + and parsed_url.netloc == "storage.googleapis.com" + ): + parsed_url = parsed_url._replace( + scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" + ) + location = urllib.parse.urlunparse(parsed_url) + + if parsed_url.scheme == "gs": + return Image(gcs_uri=location) + + # Load image from local path + image_bytes = pathlib.Path(location).read_bytes() + image = Image(image_bytes=image_bytes) + return image + + @property + def _blob(self) -> storage.Blob: + if self._gcs_uri is None: + raise AttributeError("_blob is only supported when gcs_uri is set.") + storage_client = storage.Client( + credentials=aiplatform_initializer.global_config.credentials + ) + blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) + # Needed to populate `blob.content_type` + blob.reload() + return blob + + @property + def _image_bytes(self) -> bytes: + if self._loaded_bytes is None: + self._loaded_bytes = self._blob.download_as_bytes() + return self._loaded_bytes + + @_image_bytes.setter + def _image_bytes(self, value: bytes): + self._loaded_bytes = value + + @property + def _pil_image(self) -> "PIL_Image.Image": + if self._loaded_image is None: + if not PIL_Image: + raise RuntimeError( + "The PIL module is not available. Please install the Pillow package." + ) + self._loaded_image = PIL_Image.open(io.BytesIO(self._image_bytes)) + return self._loaded_image + + @property + def _size(self): + return self._pil_image.size + + @property + def _mime_type(self) -> str: + """Returns the MIME type of the image.""" + if self._gcs_uri: + return self._blob.content_type + if PIL_Image: + return PIL_Image.MIME.get(self._pil_image.format, "image/jpeg") + # Fall back to jpeg + return "image/jpeg" + + def show(self): + """Shows the image. 
+ + This method only works when in a notebook environment. + """ + if PIL_Image and IPython_display: + IPython_display.display(self._pil_image) + + def save(self, location: str): + """Saves image to a file. + + Args: + location: Local path where to save the image. + """ + pathlib.Path(location).write_bytes(self._image_bytes) + + def _as_base64_string(self) -> str: + """Encodes image using the base64 encoding. + + Returns: + Base64 encoding of the image as a string. + """ + # ! b64encode returns `bytes` object, not `str`. + # We need to convert `bytes` to `str`, otherwise we get service error: + # "received initial metadata size exceeds limit" + return base64.b64encode(self._image_bytes).decode("ascii") + + +class Video: + """Video.""" + + __module__ = "vertexai.vision_models" + + _loaded_bytes: Optional[bytes] = None + _gcs_uri: Optional[str] = None + + def __init__( + self, + video_bytes: Optional[bytes] = None, + gcs_uri: Optional[str] = None, + ): + """Creates a `Video` object. + + Args: + video_bytes: Video file bytes. Video can be in AVI, FLV, MKV, MOV, + MP4, MPEG, MPG, WEBM, and WMV formats. + gcs_uri: Image URI in Google Cloud Storage. + """ + if bool(video_bytes) == bool(gcs_uri): + raise ValueError("Either video_bytes or gcs_uri must be provided.") + + self._video_bytes = video_bytes + self._gcs_uri = gcs_uri + + @staticmethod + def load_from_file(location: str) -> "Video": + """Loads video from local file or Google Cloud Storage. + + Args: + location: Local path or Google Cloud Storage uri from where to load + the video. + + Returns: + Loaded video as an `Video` object. + """ + parsed_url = urllib.parse.urlparse(location) + if ( + parsed_url.scheme == "https" + and parsed_url.netloc == "storage.googleapis.com" + ): + parsed_url = parsed_url._replace( + scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" + ) + location = urllib.parse.urlunparse(parsed_url) + + if parsed_url.scheme == "gs": + return Video(gcs_uri=location) + + # Load video from local path + video_bytes = pathlib.Path(location).read_bytes() + video = Video(video_bytes=video_bytes) + return video + + @property + def _blob(self) -> storage.Blob: + if self._gcs_uri is None: + raise AttributeError("_blob is only supported when gcs_uri is set.") + storage_client = storage.Client( + credentials=aiplatform_initializer.global_config.credentials + ) + blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) + # Needed to populate `blob.content_type` + blob.reload() + return blob + + @property + def _video_bytes(self) -> bytes: + if self._loaded_bytes is None: + self._loaded_bytes = self._blob.download_as_bytes() + return self._loaded_bytes + + @_video_bytes.setter + def _video_bytes(self, value: bytes): + self._loaded_bytes = value + + @property + def _mime_type(self) -> str: + """Returns the MIME type of the video.""" + if self._gcs_uri: + return self._blob.content_type + # Fall back to mp4 + return "video/mp4" + + def save(self, location: str): + """Saves video to a file. + + Args: + location: Local path where to save the video. + """ + pathlib.Path(location).write_bytes(self._video_bytes) + + def _as_base64_string(self) -> str: + """Encodes video using the base64 encoding. + + Returns: + Base64 encoding of the video as a string. + """ + # ! b64encode returns `bytes` object, not `str`. 
+ # We need to convert `bytes` to `str`, otherwise we get service error: + # "received initial metadata size exceeds limit" + return base64.b64encode(self._video_bytes).decode("ascii") + + +class VideoSegmentConfig: + """The specific video segments (in seconds) the embeddings are generated for.""" + + __module__ = "vertexai.vision_models" + + start_offset_sec: int + end_offset_sec: int + interval_sec: int + + def __init__( + self, + start_offset_sec: int = 0, + end_offset_sec: int = 120, + interval_sec: int = 16, + ): + """Creates a `VideoSegmentConfig` object. + + Args: + start_offset_sec: Start time offset (in seconds) to generate embeddings for. + end_offset_sec: End time offset (in seconds) to generate embeddings for. + interval_sec: Interval to divide video for generated embeddings. + """ + self.start_offset_sec = start_offset_sec + self.end_offset_sec = end_offset_sec + self.interval_sec = interval_sec + + +class VideoEmbedding: + """Embeddings generated from video with offset times.""" + + __module__ = "vertexai.vision_models" + + start_offset_sec: int + end_offset_sec: int + embedding: List[float] + + def __init__( + self, start_offset_sec: int, end_offset_sec: int, embedding: List[float] + ): + """Creates a `VideoEmbedding` object. + + Args: + start_offset_sec: Start time offset (in seconds) of generated embeddings. + end_offset_sec: End time offset (in seconds) of generated embeddings. + embedding: Generated embedding for interval. + """ + self.start_offset_sec = start_offset_sec + self.end_offset_sec = end_offset_sec + self.embedding = embedding + + +class ImageGenerationModel( + _model_garden_models._ModelGardenModel # pylint: disable=protected-access +): + """Generates images from text prompt. + + Examples:: + + model = ImageGenerationModel.from_pretrained("imagegeneration@002") + response = model.generate_images( + prompt="Astronaut riding a horse", + # Optional: + number_of_images=1, + seed=0, + ) + response[0].show() + response[0].save("image1.png") + """ + + __module__ = "vertexai.preview.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_generative_model_1.0.0.yaml" + + def _generate_images( + self, + prompt: str, + *, + negative_prompt: Optional[str] = None, + number_of_images: int = 1, + width: Optional[int] = None, + height: Optional[int] = None, + aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + guidance_scale: Optional[float] = None, + seed: Optional[int] = None, + base_image: Optional["Image"] = None, + mask: Optional["Image"] = None, + edit_mode: Optional[ + Literal[ + "inpainting-insert", + "inpainting-remove", + "outpainting", + "product-image", + ] + ] = None, + mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, + segmentation_classes: Optional[List[str]] = None, + mask_dilation: Optional[float] = None, + product_position: Optional[Literal["fixed", "reposition"]] = None, + output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, + compression_quality: Optional[float] = None, + language: Optional[str] = None, + output_gcs_uri: Optional[str] = None, + add_watermark: Optional[bool] = None, + safety_filter_level: Optional[ + Literal["block_most", "block_some", "block_few", "block_fewest"] + ] = None, + person_generation: Optional[ + Literal["dont_allow", "allow_adult", "allow_all"] + ] = None, + ) -> "ImageGenerationResponse": + """Generates images from text prompt. + + Args: + prompt: Text prompt for the image. 
+            negative_prompt: A description of what you want to omit in the generated
+              images.
+            number_of_images: Number of images to generate. Range: 1..8.
+            width: Width of the image. One of the sizes must be 256 or 1024.
+            height: Height of the image. One of the sizes must be 256 or 1024.
+            aspect_ratio: Aspect ratio for the image. Supported values are:
+              * 1:1 - Square image
+              * 9:16 - Portrait image
+              * 16:9 - Landscape image
+              * 4:3 - Landscape, desktop ratio.
+              * 3:4 - Portrait, desktop ratio
+            guidance_scale: Controls the strength of the prompt. Suggested values
+              are - * 0-9 (low strength) * 10-20 (medium strength) * 21+ (high
+              strength)
+            seed: Image generation random seed.
+            base_image: Base image to use for the image generation.
+            mask: Mask for the base image.
+            edit_mode: Describes the editing mode for the request. Supported values
+              are - * inpainting-insert: fills the mask area based on the text
+              prompt (requires mask and text) * inpainting-remove: removes the
+              object(s) in the mask area. (requires mask)
+              * outpainting: extend the image based on the mask area. (Requires
+              mask) * product-image: Changes the background for the predominant
+              product or subject in the image
+            mask_mode: Solicits generation of the mask (v/s providing mask as an
+              input). Supported values are:
+              * background: Automatically generates a mask for all regions except
+              the primary subject(s) of the image
+              * foreground: Automatically generates a mask for the primary
+              subject(s) of the image.
+              * semantic: Segment one or more of the segmentation classes using
+              class ID
+            segmentation_classes: List of class IDs for segmentation. Max of 5 IDs
+            mask_dilation: Defines the dilation percentage of the mask provided.
+              Float between 0 and 1. Defaults to 0.03
+            product_position: Defines whether the product should stay fixed or be
+              repositioned. Supported Values:
+              * fixed: Fixed position
+              * reposition: Can be moved (default)
+            output_mime_type: Which image format should the output be saved as.
+              Supported values: * image/png: Save as a PNG image * image/jpeg: Save
+              as a JPEG image
+            compression_quality: Level of compression if the output mime type is
+              selected to be image/jpeg. Float between 0 to 100
+            language: Language of the text prompt for the image. Default: None.
+              Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for
+              Japanese, `"ko"` for Korean, and `"auto"` for automatic language
+              detection.
+            output_gcs_uri: Google Cloud Storage uri to store the generated images.
+            add_watermark: Add a watermark to the generated image
+            safety_filter_level: Adds a filter level to Safety filtering. Supported
+              values are: * "block_most" : Strongest filtering level, most strict
+              blocking * "block_some" : Block some problematic prompts and responses
+              * "block_few" : Block fewer problematic prompts and responses *
+              "block_fewest" : Block very few problematic prompts and responses
+            person_generation: Allow generation of people by the model Supported
+              values are: * "dont_allow" : Block generation of people *
+              "allow_adult" : Generate adults, but not children * "allow_all" :
+              Generate adults and children
+
+        Returns:
+            An `ImageGenerationResponse` object.
+        """
+        # Note: Only a single prompt is supported by the service.
+        instance = {"prompt": prompt}
+        shared_generation_parameters = {
+            "prompt": prompt,
+            # b/295946075 The service stopped supporting image sizes.
+ # "width": width, + # "height": height, + "number_of_images_in_batch": number_of_images, + } + + if base_image: + if base_image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": base_image._gcs_uri # pylint: disable=protected-access + } + shared_generation_parameters[ + "base_image_uri" + ] = base_image._gcs_uri # pylint: disable=protected-access + else: + instance["image"] = { + "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access + } + shared_generation_parameters["base_image_hash"] = hashlib.sha1( + base_image._image_bytes # pylint: disable=protected-access + ).hexdigest() + + if mask: + if mask._gcs_uri: # pylint: disable=protected-access + instance["mask"] = { + "image": { + "gcsUri": mask._gcs_uri # pylint: disable=protected-access + }, + } + shared_generation_parameters[ + "mask_uri" + ] = mask._gcs_uri # pylint: disable=protected-access + else: + instance["mask"] = { + "image": { + "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access + }, + } + shared_generation_parameters["mask_hash"] = hashlib.sha1( + mask._image_bytes # pylint: disable=protected-access + ).hexdigest() + + parameters = {} + max_size = max(width or 0, height or 0) or None + if aspect_ratio is not None: + parameters["aspectRatio"] = aspect_ratio + elif max_size: + # Note: The size needs to be a string + parameters["sampleImageSize"] = str(max_size) + if height is not None and width is not None and height != width: + parameters["aspectRatio"] = f"{width}:{height}" + + parameters["sampleCount"] = number_of_images + if negative_prompt: + parameters["negativePrompt"] = negative_prompt + shared_generation_parameters["negative_prompt"] = negative_prompt + + if seed is not None: + # Note: String seed and numerical seed give different results + parameters["seed"] = seed + shared_generation_parameters["seed"] = seed + + if guidance_scale is not None: + parameters["guidanceScale"] = guidance_scale + shared_generation_parameters["guidance_scale"] = guidance_scale + + if language is not None: + parameters["language"] = language + shared_generation_parameters["language"] = language + + if output_gcs_uri is not None: + parameters["storageUri"] = output_gcs_uri + shared_generation_parameters["storage_uri"] = output_gcs_uri + + parameters["editConfig"] = {} + if edit_mode is not None: + parameters["editConfig"]["editMode"] = edit_mode + shared_generation_parameters["edit_mode"] = edit_mode + + if mask is None and edit_mode != "product-image": + parameters["editConfig"]["maskMode"] = {} + if mask_mode is not None: + parameters["editConfig"]["maskMode"]["maskType"] = mask_mode + shared_generation_parameters["mask_mode"] = mask_mode + + if segmentation_classes is not None: + parameters["editConfig"]["maskMode"]["classes"] = segmentation_classes + shared_generation_parameters["classes"] = segmentation_classes + + if mask_dilation is not None: + parameters["editConfig"]["maskDilation"] = mask_dilation + shared_generation_parameters["mask_dilation"] = mask_dilation + + if product_position is not None: + parameters["editConfig"]["productPosition"] = product_position + shared_generation_parameters["product_position"] = product_position + + parameters["outputOptions"] = {} + if output_mime_type is not None: + parameters["outputOptions"]["mimeType"] = output_mime_type + shared_generation_parameters["mime_type"] = output_mime_type + + if compression_quality is not None: + parameters["outputOptions"]["compressionQuality"] = compression_quality + 
shared_generation_parameters["compression_quality"] = compression_quality + + if add_watermark is not None: + parameters["addWatermark"] = add_watermark + shared_generation_parameters["add_watermark"] = add_watermark + + if safety_filter_level is not None: + parameters["safetySetting"] = safety_filter_level + shared_generation_parameters["safety_filter_level"] = safety_filter_level + + if person_generation is not None: + parameters["personGeneration"] = person_generation + shared_generation_parameters["person_generation"] = person_generation + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + + generated_images: List["GeneratedImage"] = [] + for idx, prediction in enumerate(response.predictions): + generation_parameters = dict(shared_generation_parameters) + generation_parameters["index_of_image_in_batch"] = idx + encoded_bytes = prediction.get("bytesBase64Encoded") + generated_image = GeneratedImage( + image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, + generation_parameters=generation_parameters, + gcs_uri=prediction.get("gcsUri"), + ) + generated_images.append(generated_image) + + return ImageGenerationResponse(images=generated_images) + + def generate_images( + self, + prompt: str, + *, + negative_prompt: Optional[str] = None, + number_of_images: int = 1, + aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + guidance_scale: Optional[float] = None, + language: Optional[str] = None, + seed: Optional[int] = None, + output_gcs_uri: Optional[str] = None, + add_watermark: Optional[bool] = True, + safety_filter_level: Optional[ + Literal["block_most", "block_some", "block_few", "block_fewest"] + ] = None, + person_generation: Optional[ + Literal["dont_allow", "allow_adult", "allow_all"] + ] = None, + ) -> "ImageGenerationResponse": + """Generates images from text prompt. + + Args: + prompt: Text prompt for the image. + negative_prompt: A description of what you want to omit in the generated + images. + number_of_images: Number of images to generate. Range: 1..8. + aspect_ratio: Changes the aspect ratio of the generated image Supported + values are: + * "1:1" : 1:1 aspect ratio + * "9:16" : 9:16 aspect ratio + * "16:9" : 16:9 aspect ratio + * "4:3" : 4:3 aspect ratio + * "3:4" : 3:4 aspect_ratio + guidance_scale: Controls the strength of the prompt. Suggested values are: + * 0-9 (low strength) + * 10-20 (medium strength) + * 21+ (high strength) + language: Language of the text prompt for the image. Default: None. + Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` + for Japanese, `"ko"` for Korean, and `"auto"` for automatic language + detection. + seed: Image generation random seed. + output_gcs_uri: Google Cloud Storage uri to store the generated images. + add_watermark: Add a watermark to the generated image + safety_filter_level: Adds a filter level to Safety filtering. Supported + values are: + * "block_most" : Strongest filtering level, most strict + blocking + * "block_some" : Block some problematic prompts and responses + * "block_few" : Block fewer problematic prompts and responses + * "block_fewest" : Block very few problematic prompts and responses + person_generation: Allow generation of people by the model Supported + values are: + * "dont_allow" : Block generation of people + * "allow_adult" : Generate adults, but not children + * "allow_all" : Generate adults and children + Returns: + An `ImageGenerationResponse` object. 
+        """
+        return self._generate_images(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            number_of_images=number_of_images,
+            aspect_ratio=aspect_ratio,
+            guidance_scale=guidance_scale,
+            language=language,
+            seed=seed,
+            output_gcs_uri=output_gcs_uri,
+            add_watermark=add_watermark,
+            safety_filter_level=safety_filter_level,
+            person_generation=person_generation,
+        )
+
+    def edit_image(
+        self,
+        *,
+        prompt: str,
+        base_image: "Image",
+        mask: Optional["Image"] = None,
+        negative_prompt: Optional[str] = None,
+        number_of_images: int = 1,
+        guidance_scale: Optional[float] = None,
+        edit_mode: Optional[
+            Literal[
+                "inpainting-insert", "inpainting-remove", "outpainting", "product-image"
+            ]
+        ] = None,
+        mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None,
+        segmentation_classes: Optional[List[str]] = None,
+        mask_dilation: Optional[float] = None,
+        product_position: Optional[Literal["fixed", "reposition"]] = None,
+        output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None,
+        compression_quality: Optional[float] = None,
+        language: Optional[str] = None,
+        seed: Optional[int] = None,
+        output_gcs_uri: Optional[str] = None,
+        safety_filter_level: Optional[
+            Literal["block_most", "block_some", "block_few", "block_fewest"]
+        ] = None,
+        person_generation: Optional[
+            Literal["dont_allow", "allow_adult", "allow_all"]
+        ] = None,
+    ) -> "ImageGenerationResponse":
+        """Edits an existing image based on text prompt.
+
+        Args:
+            prompt: Text prompt for the image.
+            base_image: Base image from which to generate the new image.
+            mask: Mask for the base image.
+            negative_prompt: A description of what you want to omit in
+              the generated images.
+            number_of_images: Number of images to generate. Range: 1..8.
+            guidance_scale: Controls the strength of the prompt.
+              Suggested values are:
+              * 0-9 (low strength)
+              * 10-20 (medium strength)
+              * 21+ (high strength)
+            edit_mode: Describes the editing mode for the request. Supported values are:
+              * inpainting-insert: fills the mask area based on the text prompt
+              (requires mask and text)
+              * inpainting-remove: removes the object(s) in the mask area.
+              (requires mask)
+              * outpainting: extend the image based on the mask area.
+              (Requires mask)
+              * product-image: Changes the background for the predominant product
+              or subject in the image
+            mask_mode: Solicits generation of the mask (v/s providing mask as an
+              input). Supported values are:
+              * background: Automatically generates a mask for all regions except
+              the primary subject(s) of the image
+              * foreground: Automatically generates a mask for the primary
+              subject(s) of the image.
+              * semantic: Segment one or more of the segmentation classes using
+              class ID
+            segmentation_classes: List of class IDs for segmentation. Max of 5 IDs
+            mask_dilation: Defines the dilation percentage of the mask provided.
+              Float between 0 and 1. Defaults to 0.03
+            product_position: Defines whether the product should stay fixed or be
+              repositioned. Supported Values:
+              * fixed: Fixed position
+              * reposition: Can be moved (default)
+            output_mime_type: Which image format should the output be saved as.
+              Supported values:
+              * image/png: Save as a PNG image
+              * image/jpeg: Save as a JPEG image
+            compression_quality: Level of compression if the output mime type is
+              selected to be image/jpeg. Float between 0 to 100
+            language: Language of the text prompt for the image. Default: None.
+ Supported values are `"en"` for English, `"hi"` for Hindi, + `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for + automatic language detection. + seed: Image generation random seed. + output_gcs_uri: Google Cloud Storage uri to store the edited images. + safety_filter_level: Adds a filter level to Safety filtering. Supported + values are: + * "block_most" : Strongest filtering level, most strict + blocking + * "block_some" : Block some problematic prompts and responses + * "block_few" : Block fewer problematic prompts and responses + * "block_fewest" : Block very few problematic prompts and responses + person_generation: Allow generation of people by the model Supported + values are: + * "dont_allow" : Block generation of people + * "allow_adult" : Generate adults, but not children + * "allow_all" : Generate adults and children + + Returns: + An `ImageGenerationResponse` object. + """ + return self._generate_images( + prompt=prompt, + negative_prompt=negative_prompt, + number_of_images=number_of_images, + guidance_scale=guidance_scale, + seed=seed, + base_image=base_image, + mask=mask, + edit_mode=edit_mode, + mask_mode=mask_mode, + segmentation_classes=segmentation_classes, + mask_dilation=mask_dilation, + product_position=product_position, + output_mime_type=output_mime_type, + compression_quality=compression_quality, + language=language, + output_gcs_uri=output_gcs_uri, + add_watermark=False, # Not supported for editing yet + safety_filter_level=safety_filter_level, + person_generation=person_generation, + ) + + def upscale_image( + self, + image: Union["Image", "GeneratedImage"], + new_size: Optional[int] = 2048, + upscale_factor: Optional[Literal["x2", "x4"]] = None, + output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", + output_compression_quality: Optional[int] = None, + output_gcs_uri: Optional[str] = None, + ) -> "Image": + """Upscales an image. + + This supports upscaling images generated through the `generate_images()` + method, or upscaling a new image. + + Examples:: + + # Upscale a generated image + model = ImageGenerationModel.from_pretrained("imagegeneration@002") + response = model.generate_images( + prompt="Astronaut riding a horse", + ) + model.upscale_image(image=response[0]) + + # Upscale a new 1024x1024 image + my_image = Image.load_from_file("my-image.png") + model.upscale_image(image=my_image) + + # Upscale a new arbitrary sized image using a x2 or x4 upscaling factor + my_image = Image.load_from_file("my-image.png") + model.upscale_image(image=my_image, upscale_factor="x2") + + # Upscale an image and get the result in JPEG format + my_image = Image.load_from_file("my-image.png") + model.upscale_image(image=my_image, output_mime_type="image/jpeg", + output_compression_quality=90) + + Args: + image (Union[GeneratedImage, Image]): Required. The generated image + to upscale. + new_size (int): The size of the biggest dimension of the upscaled + image. + Only 2048 and 4096 are currently supported. Results in a + 2048x2048 or 4096x4096 image. Defaults to 2048 if not provided. + upscale_factor: The upscaling factor. Supported values are "x2" and + "x4". Defaults to None. + output_mime_type: The mime type of the output image. Supported values + are "image/png" and "image/jpeg". Defaults to "image/png". + output_compression_quality: The compression quality of the output + image + as an int (0-100). Only applicable if the output mime type is + "image/jpeg". Defaults to None. 
+            output_gcs_uri: Google Cloud Storage uri to store the upscaled
+              images.
+
+        Returns:
+            An `Image` object.
+        """
+        target_image_size = new_size if new_size else None
+        longest_dim = max(image._size[0], image._size[1])
+
+        if not new_size and not upscale_factor:
+            raise ValueError("Either new_size or upscale_factor must be provided.")
+
+        if not upscale_factor:
+            x2_factor = 2.0
+            x4_factor = 4.0
+            epsilon = 0.1
+            is_upscaling_x2_request = abs(new_size / longest_dim - x2_factor) < epsilon
+            is_upscaling_x4_request = abs(new_size / longest_dim - x4_factor) < epsilon
+
+            if not is_upscaling_x2_request and not is_upscaling_x4_request:
+                raise ValueError(
+                    "Only x2 and x4 upscaling are currently supported. Requested"
+                    f" upscaling factor: {new_size / longest_dim}"
+                )
+        else:
+            if upscale_factor == "x2":
+                target_image_size = longest_dim * 2
+            else:
+                target_image_size = longest_dim * 4
+        if new_size not in _SUPPORTED_UPSCALING_SIZES:
+            raise ValueError(
+                "Only the following square upscaling sizes are currently supported:"
+                f" {_SUPPORTED_UPSCALING_SIZES}."
+            )
+
+        instance = {"prompt": ""}
+
+        if image._gcs_uri:  # pylint: disable=protected-access
+            instance["image"] = {
+                "gcsUri": image._gcs_uri  # pylint: disable=protected-access
+            }
+        else:
+            instance["image"] = {
+                "bytesBase64Encoded": image._as_base64_string()  # pylint: disable=protected-access
+            }
+
+        parameters = {
+            "sampleCount": 1,
+            "mode": "upscale",
+        }
+
+        if upscale_factor:
+            parameters["upscaleConfig"] = {"upscaleFactor": upscale_factor}
+
+        else:
+            parameters["sampleImageSize"] = str(new_size)
+
+        if output_gcs_uri is not None:
+            parameters["storageUri"] = output_gcs_uri
+
+        parameters["outputOptions"] = {"mimeType": output_mime_type}
+        if output_mime_type == "image/jpeg" and output_compression_quality is not None:
+            parameters["outputOptions"][
+                "compressionQuality"
+            ] = output_compression_quality
+
+        response = self._endpoint.predict(
+            instances=[instance],
+            parameters=parameters,
+        )
+
+        upscaled_image = response.predictions[0]
+
+        if isinstance(image, GeneratedImage):
+            generation_parameters = image.generation_parameters
+
+        else:
+            generation_parameters = {}
+
+        generation_parameters["upscaled_image_size"] = target_image_size
+
+        encoded_bytes = upscaled_image.get("bytesBase64Encoded")
+        return GeneratedImage(
+            image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None,
+            generation_parameters=generation_parameters,
+            gcs_uri=upscaled_image.get("gcsUri"),
+        )
+
+
+@dataclasses.dataclass
+class ImageGenerationResponse:
+    """Image generation response.
+
+    Attributes:
+        images: The list of generated images.
+    """
+
+    __module__ = "vertexai.preview.vision_models"
+
+    images: List["GeneratedImage"]
+
+    def __iter__(self) -> typing.Iterator["GeneratedImage"]:
+        """Iterates through the generated images."""
+        yield from self.images
+
+    def __getitem__(self, idx: int) -> "GeneratedImage":
+        """Gets the generated image by index."""
+        return self.images[idx]
+
+
+_EXIF_USER_COMMENT_TAG_IDX = 0x9286
+_IMAGE_GENERATION_PARAMETERS_EXIF_KEY = (
+    "google.cloud.vertexai.image_generation.image_generation_parameters"
+)
+
+
+class GeneratedImage(Image):
+    """Generated image."""
+
+    __module__ = "vertexai.preview.vision_models"
+
+    def __init__(
+        self,
+        image_bytes: Optional[bytes],
+        generation_parameters: Dict[str, Any],
+        gcs_uri: Optional[str] = None,
+    ):
+        """Creates a `GeneratedImage` object.
+
+        Args:
+            image_bytes: Image file bytes. Image can be in PNG or JPEG format.
+ generation_parameters: Image generation parameter values. + gcs_uri: Image file Google Cloud Storage uri. + """ + super().__init__(image_bytes=image_bytes, gcs_uri=gcs_uri) + self._generation_parameters = generation_parameters + + @property + def generation_parameters(self): + """Image generation parameters as a dictionary.""" + return self._generation_parameters + + @staticmethod + def load_from_file(location: str) -> "GeneratedImage": + """Loads image from file. + + Args: + location: Local path from where to load the image. + + Returns: + Loaded image as a `GeneratedImage` object. + """ + base_image = Image.load_from_file(location=location) + exif = base_image._pil_image.getexif() # pylint: disable=protected-access + exif_comment_dict = json.loads(exif[_EXIF_USER_COMMENT_TAG_IDX]) + generation_parameters = exif_comment_dict[_IMAGE_GENERATION_PARAMETERS_EXIF_KEY] + return GeneratedImage( + image_bytes=base_image._image_bytes, # pylint: disable=protected-access + generation_parameters=generation_parameters, + gcs_uri=base_image._gcs_uri, # pylint: disable=protected-access + ) + + def save(self, location: str, include_generation_parameters: bool = True): + """Saves image to a file. + + Args: + location: Local path where to save the image. + include_generation_parameters: Whether to include the image + generation parameters in the image's EXIF metadata. + """ + if include_generation_parameters: + if not self._generation_parameters: + raise ValueError("Image does not have generation parameters.") + if not PIL_Image: + raise ValueError( + "The PIL module is required for saving generation parameters." + ) + + exif = self._pil_image.getexif() + exif[_EXIF_USER_COMMENT_TAG_IDX] = json.dumps( + {_IMAGE_GENERATION_PARAMETERS_EXIF_KEY: self._generation_parameters} + ) + self._pil_image.save(location, exif=exif) + else: + super().save(location=location) + + +class ImageCaptioningModel( + _model_garden_models._ModelGardenModel # pylint: disable=protected-access +): + """Generates captions from image. + + Examples:: + + model = ImageCaptioningModel.from_pretrained("imagetext@001") + image = Image.load_from_file("image.png") + captions = model.get_captions( + image=image, + # Optional: + number_of_results=1, + language="en", + ) + """ + + __module__ = "vertexai.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" + + def get_captions( + self, + image: Image, + *, + number_of_results: int = 1, + language: str = "en", + output_gcs_uri: Optional[str] = None, + ) -> List[str]: + """Generates captions for a given image. + + Args: + image: The image to get captions for. Size limit: 10 MB. + number_of_results: Number of captions to produce. Range: 1-3. + language: Language to use for captions. + Supported languages: "en", "fr", "de", "it", "es" + output_gcs_uri: Google Cloud Storage uri to store the captioned images. + + Returns: + A list of image caption strings. 
+ """ + instance = {} + + if image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": image._gcs_uri # pylint: disable=protected-access + } + else: + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } + parameters = { + "sampleCount": number_of_results, + "language": language, + } + if output_gcs_uri is not None: + parameters["storageUri"] = output_gcs_uri + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + return response.predictions + + +class ImageQnAModel( + _model_garden_models._ModelGardenModel # pylint: disable=protected-access +): + """Answers questions about an image. + + Examples:: + + model = ImageQnAModel.from_pretrained("imagetext@001") + image = Image.load_from_file("image.png") + answers = model.ask_question( + image=image, + question="What color is the car in this image?", + # Optional: + number_of_results=1, + ) + """ + + __module__ = "vertexai.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" + + def ask_question( + self, + image: Image, + question: str, + *, + number_of_results: int = 1, + ) -> List[str]: + """Answers questions about an image. + + Args: + image: The image to get captions for. Size limit: 10 MB. + question: Question to ask about the image. + number_of_results: Number of captions to produce. Range: 1-3. + + Returns: + A list of answers. + """ + instance = {"prompt": question} + + if image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": image._gcs_uri # pylint: disable=protected-access + } + else: + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } + parameters = { + "sampleCount": number_of_results, + } + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + return response.predictions + + +class MultiModalEmbeddingModel(_model_garden_models._ModelGardenModel): + """Generates embedding vectors from images and videos. + + Examples:: + + model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001") + image = Image.load_from_file("image.png") + video = Video.load_from_file("video.mp4") + + embeddings = model.get_embeddings( + image=image, + video=video, + contextual_text="Hello world", + ) + image_embedding = embeddings.image_embedding + video_embeddings = embeddings.video_embeddings + text_embedding = embeddings.text_embedding + """ + + __module__ = "vertexai.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_embedding_model_1.0.0.yaml" + + def get_embeddings( + self, + image: Optional[Image] = None, + video: Optional[Video] = None, + contextual_text: Optional[str] = None, + dimension: Optional[int] = None, + video_segment_config: Optional[VideoSegmentConfig] = None, + ) -> "MultiModalEmbeddingResponse": + """Gets embedding vectors from the provided image. + + Args: + image (Image): Optional. The image to generate embeddings for. One of + `image`, `video`, or `contextual_text` is required. + video (Video): Optional. The video to generate embeddings for. One of + `image`, `video` or `contextual_text` is required. + contextual_text (str): Optional. Contextual text for your input image or video. + If provided, the model will also generate an embedding vector for the + provided contextual text. 
The returned image and text embedding + vectors are in the same semantic space with the same dimensionality, + and the vectors can be used interchangeably for use cases like + searching image by text or searching text by image. One of `image`, `video` or + `contextual_text` is required. + dimension (int): Optional. The number of embedding dimensions. Lower + values offer decreased latency when using these embeddings for + subsequent tasks, while higher values offer better accuracy. + Available values: `128`, `256`, `512`, and `1408` (default). + video_segment_config (VideoSegmentConfig): Optional. The specific + video segments (in seconds) the embeddings are generated for. + + Returns: + MultiModalEmbeddingResponse: + The image and text embedding vectors. + """ + + if not image and not video and not contextual_text: + raise ValueError( + "One of `image`, `video`, or `contextual_text` is required." + ) + + instance = {} + + if image: + if image._gcs_uri: # pylint: disable=protected-access + instance["image"] = { + "gcsUri": image._gcs_uri # pylint: disable=protected-access + } + else: + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } + + if video: + if video._gcs_uri: # pylint: disable=protected-access + instance["video"] = { + "gcsUri": video._gcs_uri # pylint: disable=protected-access + } + else: + instance["video"] = { + "bytesBase64Encoded": video._as_base64_string() # pylint: disable=protected-access + } # pylint: disable=protected-access + + if video_segment_config: + instance["video"]["videoSegmentConfig"] = { + "startOffsetSec": video_segment_config.start_offset_sec, + "endOffsetSec": video_segment_config.end_offset_sec, + "intervalSec": video_segment_config.interval_sec, + } + + if contextual_text: + instance["text"] = contextual_text + + parameters = {} + if dimension: + parameters["dimension"] = dimension + + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + image_embedding = response.predictions[0].get("imageEmbedding") + video_embeddings = [] + for video_embedding in response.predictions[0].get("videoEmbeddings", []): + video_embeddings.append( + VideoEmbedding( + embedding=video_embedding["embedding"], + start_offset_sec=video_embedding["startOffsetSec"], + end_offset_sec=video_embedding["endOffsetSec"], + ) + ) + text_embedding = ( + response.predictions[0].get("textEmbedding") + if "textEmbedding" in response.predictions[0] + else None + ) + return MultiModalEmbeddingResponse( + image_embedding=image_embedding, + video_embeddings=video_embeddings, + _prediction_response=response, + text_embedding=text_embedding, + ) + + +@dataclasses.dataclass +class MultiModalEmbeddingResponse: + """The multimodal embedding response. + + Attributes: + image_embedding (List[float]): + Optional. The embedding vector generated from your image. + video_embeddings (List[VideoEmbedding]): + Optional. The embedding vectors generated from your video. + text_embedding (List[float]): + Optional. The embedding vector generated from the contextual text provided for your image or video. + """ + + __module__ = "vertexai.vision_models" + + _prediction_response: Any + image_embedding: Optional[List[float]] = None + video_embeddings: Optional[List[VideoEmbedding]] = None + text_embedding: Optional[List[float]] = None + + +class ImageTextModel(ImageCaptioningModel, ImageQnAModel): + """Generates text from images. 
+ + Examples:: + + model = ImageTextModel.from_pretrained("imagetext@001") + image = Image.load_from_file("image.png") + + captions = model.get_captions( + image=image, + # Optional: + number_of_results=1, + language="en", + ) + + answers = model.ask_question( + image=image, + question="What color is the car in this image?", + # Optional: + number_of_results=1, + ) + """ + + __module__ = "vertexai.vision_models" + + # NOTE: Using this ImageTextModel class is recommended over using ImageQnAModel or ImageCaptioningModel, + # since SDK Model Garden classes should follow the design pattern of exactly 1 SDK class to 1 Model Garden schema URI + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" + + +@dataclasses.dataclass +class WatermarkVerificationResponse: + + __module__ = "vertexai.preview.vision_models" + + _prediction_response: Any + watermark_verification_result: Optional[str] = None + + +class WatermarkVerificationModel(_model_garden_models._ModelGardenModel): + """Verifies if an image has a watermark.""" + + __module__ = "vertexai.preview.vision_models" + + _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/watermark_verification_model_1.0.0.yaml" + + def verify_image(self, image: Image) -> WatermarkVerificationResponse: + """Verifies the watermark of an image. + + Args: + image: The image to verify. + + Returns: + A WatermarkVerificationResponse, containing the confidence level of + the image being watermarked. + """ + if not image: + raise ValueError("Image is required.") + + instance = {} + + if image._gcs_uri: + instance["image"] = {"gcsUri": image._gcs_uri} + else: + instance["image"] = {"bytesBase64Encoded": image._as_base64_string()} + + parameters = {} + response = self._endpoint.predict( + instances=[instance], + parameters=parameters, + ) + + verification_likelihood = response.predictions[0].get("decision") + return WatermarkVerificationResponse( + _prediction_response=response, + watermark_verification_result=verification_likelihood, + ) From b6467baad1acf6906a1aefafb7dc0d1eb760e6ef Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:16:08 -0700 Subject: [PATCH 04/26] clear out extra classes Change-Id: I64dec6a71f3f04fa834f2174e4a6b2d4740f4e90 --- google/generativeai/vision_models/__init__.py | 20 +- .../vision_models/_vision_models.py | 525 +----------------- 2 files changed, 2 insertions(+), 543 deletions(-) diff --git a/google/generativeai/vision_models/__init__.py b/google/generativeai/vision_models/__init__.py index 1834b5ceb..f519c9928 100644 --- a/google/generativeai/vision_models/__init__.py +++ b/google/generativeai/vision_models/__init__.py @@ -14,32 +14,14 @@ # """Classes for working with vision models.""" -from vertexai.vision_models._vision_models import ( +from google.generativeai.vision_models._vision_models import ( GeneratedImage, - Image, - ImageCaptioningModel, ImageGenerationModel, ImageGenerationResponse, - ImageQnAModel, - ImageTextModel, - MultiModalEmbeddingModel, - MultiModalEmbeddingResponse, - Video, - VideoEmbedding, - VideoSegmentConfig, ) __all__ = [ "GeneratedImage", - "Image", - "ImageCaptioningModel", "ImageGenerationModel", "ImageGenerationResponse", - "ImageQnAModel", - "ImageTextModel", - "MultiModalEmbeddingModel", - "MultiModalEmbeddingResponse", - "Video", - "VideoEmbedding", - "VideoSegmentConfig", ] diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 
a80f0b2dc..cc0cd5ca9 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -25,11 +25,6 @@ from typing import Any, Dict, List, Literal, Optional, Union import urllib -from google.cloud import storage - -from google.cloud.aiplatform import initializer as aiplatform_initializer -from vertexai._model_garden import _model_garden_models - # pylint: disable=g-import-not-at-top try: from IPython import display as IPython_display @@ -100,17 +95,6 @@ def load_from_file(location: str) -> "Image": image = Image(image_bytes=image_bytes) return image - @property - def _blob(self) -> storage.Blob: - if self._gcs_uri is None: - raise AttributeError("_blob is only supported when gcs_uri is set.") - storage_client = storage.Client( - credentials=aiplatform_initializer.global_config.credentials - ) - blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) - # Needed to populate `blob.content_type` - blob.reload() - return blob @property def _image_bytes(self) -> bytes: @@ -174,165 +158,7 @@ def _as_base64_string(self) -> str: return base64.b64encode(self._image_bytes).decode("ascii") -class Video: - """Video.""" - - __module__ = "vertexai.vision_models" - - _loaded_bytes: Optional[bytes] = None - _gcs_uri: Optional[str] = None - - def __init__( - self, - video_bytes: Optional[bytes] = None, - gcs_uri: Optional[str] = None, - ): - """Creates a `Video` object. - - Args: - video_bytes: Video file bytes. Video can be in AVI, FLV, MKV, MOV, - MP4, MPEG, MPG, WEBM, and WMV formats. - gcs_uri: Image URI in Google Cloud Storage. - """ - if bool(video_bytes) == bool(gcs_uri): - raise ValueError("Either video_bytes or gcs_uri must be provided.") - - self._video_bytes = video_bytes - self._gcs_uri = gcs_uri - - @staticmethod - def load_from_file(location: str) -> "Video": - """Loads video from local file or Google Cloud Storage. - - Args: - location: Local path or Google Cloud Storage uri from where to load - the video. - - Returns: - Loaded video as an `Video` object. - """ - parsed_url = urllib.parse.urlparse(location) - if ( - parsed_url.scheme == "https" - and parsed_url.netloc == "storage.googleapis.com" - ): - parsed_url = parsed_url._replace( - scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" - ) - location = urllib.parse.urlunparse(parsed_url) - - if parsed_url.scheme == "gs": - return Video(gcs_uri=location) - - # Load video from local path - video_bytes = pathlib.Path(location).read_bytes() - video = Video(video_bytes=video_bytes) - return video - - @property - def _blob(self) -> storage.Blob: - if self._gcs_uri is None: - raise AttributeError("_blob is only supported when gcs_uri is set.") - storage_client = storage.Client( - credentials=aiplatform_initializer.global_config.credentials - ) - blob = storage.Blob.from_string(uri=self._gcs_uri, client=storage_client) - # Needed to populate `blob.content_type` - blob.reload() - return blob - - @property - def _video_bytes(self) -> bytes: - if self._loaded_bytes is None: - self._loaded_bytes = self._blob.download_as_bytes() - return self._loaded_bytes - - @_video_bytes.setter - def _video_bytes(self, value: bytes): - self._loaded_bytes = value - - @property - def _mime_type(self) -> str: - """Returns the MIME type of the video.""" - if self._gcs_uri: - return self._blob.content_type - # Fall back to mp4 - return "video/mp4" - - def save(self, location: str): - """Saves video to a file. - - Args: - location: Local path where to save the video. 
- """ - pathlib.Path(location).write_bytes(self._video_bytes) - - def _as_base64_string(self) -> str: - """Encodes video using the base64 encoding. - - Returns: - Base64 encoding of the video as a string. - """ - # ! b64encode returns `bytes` object, not `str`. - # We need to convert `bytes` to `str`, otherwise we get service error: - # "received initial metadata size exceeds limit" - return base64.b64encode(self._video_bytes).decode("ascii") - - -class VideoSegmentConfig: - """The specific video segments (in seconds) the embeddings are generated for.""" - - __module__ = "vertexai.vision_models" - - start_offset_sec: int - end_offset_sec: int - interval_sec: int - - def __init__( - self, - start_offset_sec: int = 0, - end_offset_sec: int = 120, - interval_sec: int = 16, - ): - """Creates a `VideoSegmentConfig` object. - - Args: - start_offset_sec: Start time offset (in seconds) to generate embeddings for. - end_offset_sec: End time offset (in seconds) to generate embeddings for. - interval_sec: Interval to divide video for generated embeddings. - """ - self.start_offset_sec = start_offset_sec - self.end_offset_sec = end_offset_sec - self.interval_sec = interval_sec - - -class VideoEmbedding: - """Embeddings generated from video with offset times.""" - - __module__ = "vertexai.vision_models" - - start_offset_sec: int - end_offset_sec: int - embedding: List[float] - - def __init__( - self, start_offset_sec: int, end_offset_sec: int, embedding: List[float] - ): - """Creates a `VideoEmbedding` object. - - Args: - start_offset_sec: Start time offset (in seconds) of generated embeddings. - end_offset_sec: End time offset (in seconds) of generated embeddings. - embedding: Generated embedding for interval. - """ - self.start_offset_sec = start_offset_sec - self.end_offset_sec = end_offset_sec - self.embedding = embedding - - -class ImageGenerationModel( - _model_garden_models._ModelGardenModel # pylint: disable=protected-access -): +class ImageGenerationModel: """Generates images from text prompt. Examples:: @@ -1025,352 +851,3 @@ def save(self, location: str, include_generation_parameters: bool = True): else: super().save(location=location) - -class ImageCaptioningModel( - _model_garden_models._ModelGardenModel # pylint: disable=protected-access -): - """Generates captions from image. - - Examples:: - - model = ImageCaptioningModel.from_pretrained("imagetext@001") - image = Image.load_from_file("image.png") - captions = model.get_captions( - image=image, - # Optional: - number_of_results=1, - language="en", - ) - """ - - __module__ = "vertexai.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" - - def get_captions( - self, - image: Image, - *, - number_of_results: int = 1, - language: str = "en", - output_gcs_uri: Optional[str] = None, - ) -> List[str]: - """Generates captions for a given image. - - Args: - image: The image to get captions for. Size limit: 10 MB. - number_of_results: Number of captions to produce. Range: 1-3. - language: Language to use for captions. - Supported languages: "en", "fr", "de", "it", "es" - output_gcs_uri: Google Cloud Storage uri to store the captioned images. - - Returns: - A list of image caption strings. 
- """ - instance = {} - - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - parameters = { - "sampleCount": number_of_results, - "language": language, - } - if output_gcs_uri is not None: - parameters["storageUri"] = output_gcs_uri - - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - return response.predictions - - -class ImageQnAModel( - _model_garden_models._ModelGardenModel # pylint: disable=protected-access -): - """Answers questions about an image. - - Examples:: - - model = ImageQnAModel.from_pretrained("imagetext@001") - image = Image.load_from_file("image.png") - answers = model.ask_question( - image=image, - question="What color is the car in this image?", - # Optional: - number_of_results=1, - ) - """ - - __module__ = "vertexai.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" - - def ask_question( - self, - image: Image, - question: str, - *, - number_of_results: int = 1, - ) -> List[str]: - """Answers questions about an image. - - Args: - image: The image to get captions for. Size limit: 10 MB. - question: Question to ask about the image. - number_of_results: Number of captions to produce. Range: 1-3. - - Returns: - A list of answers. - """ - instance = {"prompt": question} - - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - parameters = { - "sampleCount": number_of_results, - } - - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - return response.predictions - - -class MultiModalEmbeddingModel(_model_garden_models._ModelGardenModel): - """Generates embedding vectors from images and videos. - - Examples:: - - model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001") - image = Image.load_from_file("image.png") - video = Video.load_from_file("video.mp4") - - embeddings = model.get_embeddings( - image=image, - video=video, - contextual_text="Hello world", - ) - image_embedding = embeddings.image_embedding - video_embeddings = embeddings.video_embeddings - text_embedding = embeddings.text_embedding - """ - - __module__ = "vertexai.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_embedding_model_1.0.0.yaml" - - def get_embeddings( - self, - image: Optional[Image] = None, - video: Optional[Video] = None, - contextual_text: Optional[str] = None, - dimension: Optional[int] = None, - video_segment_config: Optional[VideoSegmentConfig] = None, - ) -> "MultiModalEmbeddingResponse": - """Gets embedding vectors from the provided image. - - Args: - image (Image): Optional. The image to generate embeddings for. One of - `image`, `video`, or `contextual_text` is required. - video (Video): Optional. The video to generate embeddings for. One of - `image`, `video` or `contextual_text` is required. - contextual_text (str): Optional. Contextual text for your input image or video. - If provided, the model will also generate an embedding vector for the - provided contextual text. 
The returned image and text embedding - vectors are in the same semantic space with the same dimensionality, - and the vectors can be used interchangeably for use cases like - searching image by text or searching text by image. One of `image`, `video` or - `contextual_text` is required. - dimension (int): Optional. The number of embedding dimensions. Lower - values offer decreased latency when using these embeddings for - subsequent tasks, while higher values offer better accuracy. - Available values: `128`, `256`, `512`, and `1408` (default). - video_segment_config (VideoSegmentConfig): Optional. The specific - video segments (in seconds) the embeddings are generated for. - - Returns: - MultiModalEmbeddingResponse: - The image and text embedding vectors. - """ - - if not image and not video and not contextual_text: - raise ValueError( - "One of `image`, `video`, or `contextual_text` is required." - ) - - instance = {} - - if image: - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - - if video: - if video._gcs_uri: # pylint: disable=protected-access - instance["video"] = { - "gcsUri": video._gcs_uri # pylint: disable=protected-access - } - else: - instance["video"] = { - "bytesBase64Encoded": video._as_base64_string() # pylint: disable=protected-access - } # pylint: disable=protected-access - - if video_segment_config: - instance["video"]["videoSegmentConfig"] = { - "startOffsetSec": video_segment_config.start_offset_sec, - "endOffsetSec": video_segment_config.end_offset_sec, - "intervalSec": video_segment_config.interval_sec, - } - - if contextual_text: - instance["text"] = contextual_text - - parameters = {} - if dimension: - parameters["dimension"] = dimension - - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - image_embedding = response.predictions[0].get("imageEmbedding") - video_embeddings = [] - for video_embedding in response.predictions[0].get("videoEmbeddings", []): - video_embeddings.append( - VideoEmbedding( - embedding=video_embedding["embedding"], - start_offset_sec=video_embedding["startOffsetSec"], - end_offset_sec=video_embedding["endOffsetSec"], - ) - ) - text_embedding = ( - response.predictions[0].get("textEmbedding") - if "textEmbedding" in response.predictions[0] - else None - ) - return MultiModalEmbeddingResponse( - image_embedding=image_embedding, - video_embeddings=video_embeddings, - _prediction_response=response, - text_embedding=text_embedding, - ) - - -@dataclasses.dataclass -class MultiModalEmbeddingResponse: - """The multimodal embedding response. - - Attributes: - image_embedding (List[float]): - Optional. The embedding vector generated from your image. - video_embeddings (List[VideoEmbedding]): - Optional. The embedding vectors generated from your video. - text_embedding (List[float]): - Optional. The embedding vector generated from the contextual text provided for your image or video. - """ - - __module__ = "vertexai.vision_models" - - _prediction_response: Any - image_embedding: Optional[List[float]] = None - video_embeddings: Optional[List[VideoEmbedding]] = None - text_embedding: Optional[List[float]] = None - - -class ImageTextModel(ImageCaptioningModel, ImageQnAModel): - """Generates text from images. 
- - Examples:: - - model = ImageTextModel.from_pretrained("imagetext@001") - image = Image.load_from_file("image.png") - - captions = model.get_captions( - image=image, - # Optional: - number_of_results=1, - language="en", - ) - - answers = model.ask_question( - image=image, - question="What color is the car in this image?", - # Optional: - number_of_results=1, - ) - """ - - __module__ = "vertexai.vision_models" - - # NOTE: Using this ImageTextModel class is recommended over using ImageQnAModel or ImageCaptioningModel, - # since SDK Model Garden classes should follow the design pattern of exactly 1 SDK class to 1 Model Garden schema URI - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_reasoning_model_1.0.0.yaml" - - -@dataclasses.dataclass -class WatermarkVerificationResponse: - - __module__ = "vertexai.preview.vision_models" - - _prediction_response: Any - watermark_verification_result: Optional[str] = None - - -class WatermarkVerificationModel(_model_garden_models._ModelGardenModel): - """Verifies if an image has a watermark.""" - - __module__ = "vertexai.preview.vision_models" - - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/watermark_verification_model_1.0.0.yaml" - - def verify_image(self, image: Image) -> WatermarkVerificationResponse: - """Verifies the watermark of an image. - - Args: - image: The image to verify. - - Returns: - A WatermarkVerificationResponse, containing the confidence level of - the image being watermarked. - """ - if not image: - raise ValueError("Image is required.") - - instance = {} - - if image._gcs_uri: - instance["image"] = {"gcsUri": image._gcs_uri} - else: - instance["image"] = {"bytesBase64Encoded": image._as_base64_string()} - - parameters = {} - response = self._endpoint.predict( - instances=[instance], - parameters=parameters, - ) - - verification_likelihood = response.predictions[0].get("decision") - return WatermarkVerificationResponse( - _prediction_response=response, - watermark_verification_result=verification_likelihood, - ) From e518db4226d93d218e58567c94b726d06fe86162 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:32:30 -0700 Subject: [PATCH 05/26] remove gcs_uri Change-Id: Ieafe776bde8e79e1747ab6cf252b49a001ab66bf --- .../vision_models/_vision_models.py | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index cc0cd5ca9..8381cae9d 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -47,24 +47,17 @@ class Image: _loaded_bytes: Optional[bytes] = None _loaded_image: Optional["PIL_Image.Image"] = None - _gcs_uri: Optional[str] = None def __init__( self, - image_bytes: Optional[bytes] = None, - gcs_uri: Optional[str] = None, + image_bytes: Optional[bytes], ): """Creates an `Image` object. Args: image_bytes: Image file bytes. Image can be in PNG or JPEG format. - gcs_uri: Image URI in Google Cloud Storage. """ - if bool(image_bytes) == bool(gcs_uri): - raise ValueError("Either image_bytes or gcs_uri must be provided.") - self._image_bytes = image_bytes - self._gcs_uri = gcs_uri @staticmethod def load_from_file(location: str) -> "Image": @@ -77,19 +70,6 @@ def load_from_file(location: str) -> "Image": Returns: Loaded image as an `Image` object. 
""" - parsed_url = urllib.parse.urlparse(location) - if ( - parsed_url.scheme == "https" - and parsed_url.netloc == "storage.googleapis.com" - ): - parsed_url = parsed_url._replace( - scheme="gs", netloc="", path=f"/{urllib.parse.unquote(parsed_url.path)}" - ) - location = urllib.parse.urlunparse(parsed_url) - - if parsed_url.scheme == "gs": - return Image(gcs_uri=location) - # Load image from local path image_bytes = pathlib.Path(location).read_bytes() image = Image(image_bytes=image_bytes) @@ -98,8 +78,6 @@ def load_from_file(location: str) -> "Image": @property def _image_bytes(self) -> bytes: - if self._loaded_bytes is None: - self._loaded_bytes = self._blob.download_as_bytes() return self._loaded_bytes @_image_bytes.setter @@ -123,8 +101,6 @@ def _size(self): @property def _mime_type(self) -> str: """Returns the MIME type of the image.""" - if self._gcs_uri: - return self._blob.content_type if PIL_Image: return PIL_Image.MIME.get(self._pil_image.format, "image/jpeg") # Fall back to jpeg From 13dfe88e97d61297e9a73f9af803e5063e9fe1db Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:34:15 -0700 Subject: [PATCH 06/26] IPython reprs Change-Id: I586876f524684dc3d0ee0ea8510b56bc5153642b --- google/generativeai/vision_models/_vision_models.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 8381cae9d..d7cea74a3 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -114,6 +114,12 @@ def show(self): if PIL_Image and IPython_display: IPython_display.display(self._pil_image) + def _repr_jpeg(self): + return self._pil_image._repr_jpeg() + + def _repr_png(self): + return self._pil_image._repr_png() + def save(self, location: str): """Saves image to a file. From 73a312b363c05212b5a460d50643e1f4a70033ac Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:46:21 -0700 Subject: [PATCH 07/26] remove output_gce_uri Change-Id: Ic4424d06a705bf94f0fade34468160566cb3a6be --- .../vision_models/_vision_models.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index d7cea74a3..45bcd5401 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -188,7 +188,6 @@ def _generate_images( output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - output_gcs_uri: Optional[str] = None, add_watermark: Optional[bool] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] @@ -249,7 +248,6 @@ class ID Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - output_gcs_uri: Google Cloud Storage uri to store the generated images. add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. 
Supported values are: * "block_most" : Strongest filtering level, most strict @@ -338,10 +336,6 @@ class ID parameters["language"] = language shared_generation_parameters["language"] = language - if output_gcs_uri is not None: - parameters["storageUri"] = output_gcs_uri - shared_generation_parameters["storage_uri"] = output_gcs_uri - parameters["editConfig"] = {} if edit_mode is not None: parameters["editConfig"]["editMode"] = edit_mode @@ -415,7 +409,6 @@ def generate_images( guidance_scale: Optional[float] = None, language: Optional[str] = None, seed: Optional[int] = None, - output_gcs_uri: Optional[str] = None, add_watermark: Optional[bool] = True, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] @@ -447,7 +440,6 @@ def generate_images( for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. seed: Image generation random seed. - output_gcs_uri: Google Cloud Storage uri to store the generated images. add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. Supported values are: @@ -472,7 +464,6 @@ def generate_images( guidance_scale=guidance_scale, language=language, seed=seed, - output_gcs_uri=output_gcs_uri, add_watermark=add_watermark, safety_filter_level=safety_filter_level, person_generation=person_generation, @@ -500,7 +491,6 @@ def edit_image( compression_quality: Optional[float] = None, language: Optional[str] = None, seed: Optional[int] = None, - output_gcs_uri: Optional[str] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -557,7 +547,6 @@ class ID `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. seed: Image generation random seed. - output_gcs_uri: Google Cloud Storage uri to store the edited images. safety_filter_level: Adds a filter level to Safety filtering. Supported values are: * "block_most" : Strongest filtering level, most strict @@ -590,7 +579,6 @@ class ID output_mime_type=output_mime_type, compression_quality=compression_quality, language=language, - output_gcs_uri=output_gcs_uri, add_watermark=False, # Not supported for editing yet safety_filter_level=safety_filter_level, person_generation=person_generation, @@ -603,7 +591,6 @@ def upscale_image( upscale_factor: Optional[Literal["x2", "x4"]] = None, output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", output_compression_quality: Optional[int] = None, - output_gcs_uri: Optional[str] = None, ) -> "Image": """Upscales an image. @@ -647,8 +634,6 @@ def upscale_image( image as an int (0-100). Only applicable if the output mime type is "image/jpeg". Defaults to None. - output_gcs_uri: Google Cloud Storage uri to store the upscaled - images. Returns: An `Image` object. 
@@ -704,9 +689,6 @@ def upscale_image( else: parameters["sampleImageSize"] = str(new_size) - if output_gcs_uri is not None: - parameters["storageUri"] = output_gcs_uri - parameters["outputOptions"] = {"mimeType": output_mime_type} if output_mime_type == "image/jpeg" and output_compression_quality is not None: parameters["outputOptions"][ From 04e3f0d9be1a2f2bd0e9b99964e1330bed198a80 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:51:01 -0700 Subject: [PATCH 08/26] remove IPython reprs Change-Id: I474c76c1e73d7a92a653932d1984c02de7e3b71a --- google/generativeai/vision_models/_vision_models.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 45bcd5401..a9e796565 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -114,12 +114,6 @@ def show(self): if PIL_Image and IPython_display: IPython_display.display(self._pil_image) - def _repr_jpeg(self): - return self._pil_image._repr_jpeg() - - def _repr_png(self): - return self._pil_image._repr_png() - def save(self, location: str): """Saves image to a file. From 196eaf71392650dbc2fdf28c02a60a3cec85ea33 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 9 Sep 2024 15:52:05 -0700 Subject: [PATCH 09/26] remove more gcs_uri. Change-Id: I9f6b8c879dc8f38c5193ddb1e271a62984c7f020 --- .../vision_models/_vision_models.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index a9e796565..312d3f90b 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -267,40 +267,22 @@ class ID } if base_image: - if base_image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": base_image._gcs_uri # pylint: disable=protected-access - } - shared_generation_parameters[ - "base_image_uri" - ] = base_image._gcs_uri # pylint: disable=protected-access - else: - instance["image"] = { - "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access - } - shared_generation_parameters["base_image_hash"] = hashlib.sha1( - base_image._image_bytes # pylint: disable=protected-access - ).hexdigest() + instance["image"] = { + "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access + } + shared_generation_parameters["base_image_hash"] = hashlib.sha1( + base_image._image_bytes # pylint: disable=protected-access + ).hexdigest() if mask: - if mask._gcs_uri: # pylint: disable=protected-access - instance["mask"] = { - "image": { - "gcsUri": mask._gcs_uri # pylint: disable=protected-access - }, - } - shared_generation_parameters[ - "mask_uri" - ] = mask._gcs_uri # pylint: disable=protected-access - else: - instance["mask"] = { - "image": { - "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access - }, - } - shared_generation_parameters["mask_hash"] = hashlib.sha1( - mask._image_bytes # pylint: disable=protected-access - ).hexdigest() + instance["mask"] = { + "image": { + "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access + }, + } + shared_generation_parameters["mask_hash"] = hashlib.sha1( + mask._image_bytes # pylint: disable=protected-access + ).hexdigest() parameters = {} max_size = max(width or 0, height or 0) or None @@ -387,7 
+369,6 @@ class ID generated_image = GeneratedImage( image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, generation_parameters=generation_parameters, - gcs_uri=prediction.get("gcsUri"), ) generated_images.append(generated_image) @@ -663,14 +644,9 @@ def upscale_image( instance = {"prompt": ""} - if image._gcs_uri: # pylint: disable=protected-access - instance["image"] = { - "gcsUri": image._gcs_uri # pylint: disable=protected-access - } - else: - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } + instance["image"] = { + "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access + } parameters = { "sampleCount": 1, @@ -708,7 +684,6 @@ def upscale_image( return GeneratedImage( image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, generation_parameters=generation_parameters, - gcs_uri=upscaled_image.get("gcsUri"), ) @@ -748,16 +723,14 @@ def __init__( self, image_bytes: Optional[bytes], generation_parameters: Dict[str, Any], - gcs_uri: Optional[str] = None, ): """Creates a `GeneratedImage` object. Args: image_bytes: Image file bytes. Image can be in PNG or JPEG format. generation_parameters: Image generation parameter values. - gcs_uri: Image file Google Cloud Storage uri. """ - super().__init__(image_bytes=image_bytes, gcs_uri=gcs_uri) + super().__init__(image_bytes=image_bytes) self._generation_parameters = generation_parameters @property @@ -782,7 +755,6 @@ def load_from_file(location: str) -> "GeneratedImage": return GeneratedImage( image_bytes=base_image._image_bytes, # pylint: disable=protected-access generation_parameters=generation_parameters, - gcs_uri=base_image._gcs_uri, # pylint: disable=protected-access ) def save(self, location: str, include_generation_parameters: bool = True): From 1917b3c39b97c0fc3601de7c48cf9fbf641f80da Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 17 Sep 2024 08:44:02 -0700 Subject: [PATCH 10/26] handle instances converversion to Value protos. 
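[Editor's note] The helper introduced below hand-converts plain Python values into protobuf Struct/Value messages before they are placed in a PredictRequest (a later patch in this series links this to a proto-plus marshaling issue). A rough sketch of the intended round trip, assuming the to_value helper from this patch is importable; the sample payload is illustrative only::

    from google.protobuf import struct_pb2

    instance = {"prompt": "Astronaut riding a horse", "sampleCount": 2}
    value = to_value(instance)  # plain dict -> struct_pb2.Value wrapping a Struct
    assert isinstance(value, struct_pb2.Value)
    assert value.struct_value.fields["prompt"].string_value == "Astronaut riding a horse"
    assert value.struct_value.fields["sampleCount"].number_value == 2.0  # ints become doubles
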
Change-Id: Id33f8d2d6a4cffbfb7b0d37955cc800a867a70d5 --- .../vision_models/_vision_models.py | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 312d3f90b..3a3d4f34b 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -16,6 +16,7 @@ """Classes for working with vision models.""" import base64 +import collections import dataclasses import hashlib import io @@ -23,7 +24,12 @@ import pathlib import typing from typing import Any, Dict, List, Literal, Optional, Union -import urllib + +from google.protobuf import struct_pb2 + +from proto.marshal.collections import maps +from proto.marshal.collections import repeated + # pylint: disable=g-import-not-at-top try: @@ -37,6 +43,56 @@ PIL_Image = None +def to_value(value) -> struct_pb2.Value: + """Return a protobuf Value object representing this value.""" + if isinstance(value, struct_pb2.Value): + return value + if value is None: + return struct_pb2.Value(null_value=0) + if isinstance(value, bool): + return struct_pb2.Value(bool_value=value) + if isinstance(value, (int, float)): + return struct_pb2.Value(number_value=float(value)) + if isinstance(value, str): + return struct_pb2.Value(string_value=value) + if isinstance(value, collections.abc.Sequence): + return struct_pb2.Value(list_value=to_list_value(value)) + if isinstance(value, collections.abc.Mapping): + return struct_pb2.Value(struct_value=to_mapping_value(value)) + raise ValueError("Unable to coerce value: %r" % value) + +def to_list_value(value) -> struct_pb2.ListValue: + # We got a proto, or else something we sent originally. + # Preserve the instance we have. + if isinstance(value, struct_pb2.ListValue): + return value + if isinstance(value, repeated.RepeatedComposite): + return struct_pb2.ListValue(values=[v for v in value.pb]) + + # We got a list (or something list-like); convert it. + return struct_pb2.ListValue( + values=[to_value(v) for v in value] + ) + +def to_mapping_value(value) -> struct_pb2.Struct: + # We got a proto, or else something we sent originally. + # Preserve the instance we have. + if isinstance(value, struct_pb2.Struct): + return value + if isinstance(value, maps.MapComposite): + return struct_pb2.Struct( + fields={k: v for k, v in value.pb.items()}, + ) + + # We got a dict (or something dict-like); convert it. + return struct_pb2.Struct( + fields={ + k: to_value(v) for k, v in value.items() + } + ) + + + _SUPPORTED_UPSCALING_SIZES = [2048, 4096] @@ -357,7 +413,7 @@ class ID shared_generation_parameters["person_generation"] = person_generation response = self._endpoint.predict( - instances=[instance], + instances=[to_value(instance)], parameters=parameters, ) @@ -666,7 +722,7 @@ def upscale_image( ] = output_compression_quality response = self._endpoint.predict( - instances=[instance], + instances=[to_value(instance)], parameters=parameters, ) From 9212ec5b9d96f9c74ca2b15f9040026e770bfbd2 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 24 Sep 2024 14:59:21 -0700 Subject: [PATCH 11/26] Basically works. 
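[Editor's note] With the prediction client wired up in this patch, the ported model can be exercised end to end. A minimal usage sketch of where this series is headed, based on the API surface shown in these diffs (the Imagen model name is an assumption, not taken from the patch, and an API key must be configured first)::

    import google.generativeai as genai

    genai.configure(api_key="...")

    model = genai.ImageGenerationModel("imagen-3.0-generate-001")  # assumed model name
    result = model.generate_images(
        prompt="A watercolor painting of a lighthouse at dusk",
        number_of_images=1,
        aspect_ratio="1:1",
    )
    result[0].save("lighthouse.png")
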
Change-Id: I28364ab70b2a263b29026f2cf2d1d4f807d88f53 --- google/generativeai/__init__.py | 2 + google/generativeai/client.py | 8 +++ .../vision_models/_vision_models.py | 65 +++++++++---------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/google/generativeai/__init__.py b/google/generativeai/__init__.py index 5b143d768..73025a1b4 100644 --- a/google/generativeai/__init__.py +++ b/google/generativeai/__init__.py @@ -59,6 +59,8 @@ from google.generativeai.generative_models import GenerativeModel from google.generativeai.generative_models import ChatSession +from google.generativeai.vision_models import * + from google.generativeai.models import list_models from google.generativeai.models import list_tuned_models diff --git a/google/generativeai/client.py b/google/generativeai/client.py index d2eb6b1c9..a75643f1a 100644 --- a/google/generativeai/client.py +++ b/google/generativeai/client.py @@ -384,3 +384,11 @@ def get_default_permission_client() -> glm.PermissionServiceClient: def get_default_permission_async_client() -> glm.PermissionServiceAsyncClient: return _client_manager.get_default_client("permission_async") + + +def get_default_prediction_client() -> glm.PermissionServiceClient: + return _client_manager.get_default_client("prediction") + + +def get_default_prediction_async_client() -> glm.PermissionServiceAsyncClient: + return _client_manager.get_default_client("prediction_async") diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3a3d4f34b..2b67fe33f 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -25,6 +25,9 @@ import typing from typing import Any, Dict, List, Literal, Optional, Union +from google.generativeai import client +from google.generativeai import protos + from google.protobuf import struct_pb2 from proto.marshal.collections import maps @@ -43,6 +46,7 @@ PIL_Image = None +# This is to get around https://github.com/googleapis/proto-plus-python/issues/488 def to_value(value) -> struct_pb2.Value: """Return a protobuf Value object representing this value.""" if isinstance(value, struct_pb2.Value): @@ -61,6 +65,7 @@ def to_value(value) -> struct_pb2.Value: return struct_pb2.Value(struct_value=to_mapping_value(value)) raise ValueError("Unable to coerce value: %r" % value) + def to_list_value(value) -> struct_pb2.ListValue: # We got a proto, or else something we sent originally. # Preserve the instance we have. @@ -70,9 +75,8 @@ def to_list_value(value) -> struct_pb2.ListValue: return struct_pb2.ListValue(values=[v for v in value.pb]) # We got a list (or something list-like); convert it. - return struct_pb2.ListValue( - values=[to_value(v) for v in value] - ) + return struct_pb2.ListValue(values=[to_value(v) for v in value]) + def to_mapping_value(value) -> struct_pb2.Struct: # We got a proto, or else something we sent originally. @@ -85,12 +89,7 @@ def to_mapping_value(value) -> struct_pb2.Struct: ) # We got a dict (or something dict-like); convert it. 
- return struct_pb2.Struct( - fields={ - k: to_value(v) for k, v in value.items() - } - ) - + return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) _SUPPORTED_UPSCALING_SIZES = [2048, 4096] @@ -131,7 +130,6 @@ def load_from_file(location: str) -> "Image": image = Image(image_bytes=image_bytes) return image - @property def _image_bytes(self) -> bytes: return self._loaded_bytes @@ -206,9 +204,16 @@ class ImageGenerationModel: response[0].save("image1.png") """ - __module__ = "vertexai.preview.vision_models" + def __init__(self, model_id: str): + if not model_id.startswith("models"): + model_id = f"models/{model_id}" + self.model_name = model_id + self._client = None - _INSTANCE_SCHEMA_URI = "gs://google-cloud-aiplatform/schema/predict/instance/vision_generative_model_1.0.0.yaml" + @classmethod + def from_pretrained(cls, model_name: str): + """For vertex compatibility""" + return cls(model_name) def _generate_images( self, @@ -242,9 +247,7 @@ def _generate_images( safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, - person_generation: Optional[ - Literal["dont_allow", "allow_adult", "allow_all"] - ] = None, + person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. @@ -312,6 +315,8 @@ class ID Returns: An `ImageGenerationResponse` object. """ + if self._client is None: + self._client = client.get_default_prediction_client() # Note: Only a single prompt is supported by the service. instance = {"prompt": prompt} shared_generation_parameters = { @@ -412,11 +417,14 @@ class ID parameters["personGeneration"] = person_generation shared_generation_parameters["person_generation"] = person_generation - response = self._endpoint.predict( - instances=[to_value(instance)], - parameters=parameters, + # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 + pr = protos.PredictRequest.pb() + request = pr( + model=self.model_name, instances=[to_value(instance)], parameters=to_value(parameters) ) + response = self._client.predict(request) + generated_images: List["GeneratedImage"] = [] for idx, prediction in enumerate(response.predictions): generation_parameters = dict(shared_generation_parameters) @@ -444,9 +452,7 @@ def generate_images( safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, - person_generation: Optional[ - Literal["dont_allow", "allow_adult", "allow_all"] - ] = None, + person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
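# [Editor's note] Illustrative check of the constructor and `from_pretrained` alias
# added in the hunk above: the Vertex-style entry point simply wraps the plain
# constructor, and a bare model id gets the "models/" prefix. (The model name used
# here is an assumption, not taken from this patch.)
from google.generativeai.vision_models import ImageGenerationModel

model_a = ImageGenerationModel("imagen-3.0-generate-001")
model_b = ImageGenerationModel.from_pretrained("imagen-3.0-generate-001")
assert model_a.model_name == model_b.model_name == "models/imagen-3.0-generate-001"
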
@@ -510,9 +516,7 @@ def edit_image( number_of_images: int = 1, guidance_scale: Optional[float] = None, edit_mode: Optional[ - Literal[ - "inpainting-insert", "inpainting-remove", "outpainting", "product-image" - ] + Literal["inpainting-insert", "inpainting-remove", "outpainting", "product-image"] ] = None, mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, segmentation_classes: Optional[List[str]] = None, @@ -525,9 +529,7 @@ def edit_image( safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, - person_generation: Optional[ - Literal["dont_allow", "allow_adult", "allow_all"] - ] = None, + person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, ) -> "ImageGenerationResponse": """Edits an existing image based on text prompt. @@ -717,9 +719,7 @@ def upscale_image( parameters["outputOptions"] = {"mimeType": output_mime_type} if output_mime_type == "image/jpeg" and output_compression_quality is not None: - parameters["outputOptions"][ - "compressionQuality" - ] = output_compression_quality + parameters["outputOptions"]["compressionQuality"] = output_compression_quality response = self._endpoint.predict( instances=[to_value(instance)], @@ -825,9 +825,7 @@ def save(self, location: str, include_generation_parameters: bool = True): if not self._generation_parameters: raise ValueError("Image does not have generation parameters.") if not PIL_Image: - raise ValueError( - "The PIL module is required for saving generation parameters." - ) + raise ValueError("The PIL module is required for saving generation parameters.") exif = self._pil_image.getexif() exif[_EXIF_USER_COMMENT_TAG_IDX] = json.dumps( @@ -836,4 +834,3 @@ def save(self, location: str, include_generation_parameters: bool = True): self._pil_image.save(location, exif=exif) else: super().save(location=location) - From 21649f0aebd8b385b97f8524a70a456f8c79fa2f Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 10:58:19 -0700 Subject: [PATCH 12/26] add _repr_png_ Change-Id: I436170460e17983637283d0086ebc232c9c425ce --- google/generativeai/vision_models/_vision_models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 2b67fe33f..6c112af98 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -187,6 +187,9 @@ def _as_base64_string(self) -> str: # "received initial metadata size exceeds limit" return base64.b64encode(self._image_bytes).decode("ascii") + def _repr_png_(self): + return self._pil_image._repr_png_() + class ImageGenerationModel: """Generates images from text prompt. @@ -773,7 +776,7 @@ def __getitem__(self, idx: int) -> "GeneratedImage": class GeneratedImage(Image): """Generated image.""" - __module__ = "vertexai.preview.vision_models" + __module__ = "google.generativeai" def __init__( self, From 3e34dcf08a03f59a52e59460784d9a1b9c94b3d1 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 11:08:56 -0700 Subject: [PATCH 13/26] Remoive "add watermark" switch. 
Change-Id: I6580ae1b508a458c9813fdf161a08329cd676c08 --- google/generativeai/vision_models/_vision_models.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 6c112af98..1745ba6ce 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -246,7 +246,6 @@ def _generate_images( output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - add_watermark: Optional[bool] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -304,7 +303,6 @@ class ID Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. Supported values are: * "block_most" : Strongest filtering level, most strict blocking * "block_some" : Block some problematic prompts and responses @@ -408,10 +406,6 @@ class ID parameters["outputOptions"]["compressionQuality"] = compression_quality shared_generation_parameters["compression_quality"] = compression_quality - if add_watermark is not None: - parameters["addWatermark"] = add_watermark - shared_generation_parameters["add_watermark"] = add_watermark - if safety_filter_level is not None: parameters["safetySetting"] = safety_filter_level shared_generation_parameters["safety_filter_level"] = safety_filter_level @@ -451,7 +445,6 @@ def generate_images( guidance_scale: Optional[float] = None, language: Optional[str] = None, seed: Optional[int] = None, - add_watermark: Optional[bool] = True, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -480,7 +473,6 @@ def generate_images( for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. seed: Image generation random seed. - add_watermark: Add a watermark to the generated image safety_filter_level: Adds a filter level to Safety filtering. 
Supported values are: * "block_most" : Strongest filtering level, most strict @@ -504,7 +496,6 @@ def generate_images( guidance_scale=guidance_scale, language=language, seed=seed, - add_watermark=add_watermark, safety_filter_level=safety_filter_level, person_generation=person_generation, ) @@ -615,7 +606,6 @@ class ID output_mime_type=output_mime_type, compression_quality=compression_quality, language=language, - add_watermark=False, # Not supported for editing yet safety_filter_level=safety_filter_level, person_generation=person_generation, ) From 898a3d76405750d9e30205f35386a419d1a871e3 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 11:44:34 -0700 Subject: [PATCH 14/26] Remove seed (it doesn't work without watermark), +fix upscale Change-Id: I4c394cd861d2646cd663224f0bbeec52580bc0bd --- .../vision_models/_vision_models.py | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 1745ba6ce..722bc1070 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -201,7 +201,6 @@ class ImageGenerationModel: prompt="Astronaut riding a horse", # Optional: number_of_images=1, - seed=0, ) response[0].show() response[0].save("image1.png") @@ -228,7 +227,6 @@ def _generate_images( height: Optional[int] = None, aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, guidance_scale: Optional[float] = None, - seed: Optional[int] = None, base_image: Optional["Image"] = None, mask: Optional["Image"] = None, edit_mode: Optional[ @@ -269,7 +267,6 @@ def _generate_images( guidance_scale: Controls the strength of the prompt. Suggested values are - * 0-9 (low strength) * 10-20 (medium strength) * 21+ (high strength) - seed: Image generation random seed. base_image: Base image to use for the image generation. mask: Mask for the base image. edit_mode: Describes the editing mode for the request. Supported values @@ -361,11 +358,6 @@ class ID parameters["negativePrompt"] = negative_prompt shared_generation_parameters["negative_prompt"] = negative_prompt - if seed is not None: - # Note: String seed and numerical seed give different results - parameters["seed"] = seed - shared_generation_parameters["seed"] = seed - if guidance_scale is not None: parameters["guidanceScale"] = guidance_scale shared_generation_parameters["guidance_scale"] = guidance_scale @@ -444,7 +436,6 @@ def generate_images( aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, guidance_scale: Optional[float] = None, language: Optional[str] = None, - seed: Optional[int] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -472,7 +463,6 @@ def generate_images( Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - seed: Image generation random seed. safety_filter_level: Adds a filter level to Safety filtering. 
Supported values are: * "block_most" : Strongest filtering level, most strict @@ -495,7 +485,6 @@ def generate_images( aspect_ratio=aspect_ratio, guidance_scale=guidance_scale, language=language, - seed=seed, safety_filter_level=safety_filter_level, person_generation=person_generation, ) @@ -519,7 +508,6 @@ def edit_image( output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - seed: Optional[int] = None, safety_filter_level: Optional[ Literal["block_most", "block_some", "block_few", "block_fewest"] ] = None, @@ -573,7 +561,6 @@ class ID Supported values are `"en"` for English, `"hi"` for Hindi, `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for automatic language detection. - seed: Image generation random seed. safety_filter_level: Adds a filter level to Safety filtering. Supported values are: * "block_most" : Strongest filtering level, most strict @@ -595,7 +582,6 @@ class ID negative_prompt=negative_prompt, number_of_images=number_of_images, guidance_scale=guidance_scale, - seed=seed, base_image=base_image, mask=mask, edit_mode=edit_mode, @@ -613,7 +599,7 @@ class ID def upscale_image( self, image: Union["Image", "GeneratedImage"], - new_size: Optional[int] = 2048, + new_size: Optional[int] = None, upscale_factor: Optional[Literal["x2", "x4"]] = None, output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", output_compression_quality: Optional[int] = None, @@ -664,6 +650,9 @@ def upscale_image( Returns: An `Image` object. """ + if self._client is None: + self._client = client.get_default_prediction_client() + target_image_size = new_size if new_size else None longest_dim = max(image._size[0], image._size[1]) @@ -714,10 +703,12 @@ def upscale_image( if output_mime_type == "image/jpeg" and output_compression_quality is not None: parameters["outputOptions"]["compressionQuality"] = output_compression_quality - response = self._endpoint.predict( - instances=[to_value(instance)], - parameters=parameters, + + pr = protos.PredictRequest.pb() + request = pr( + model=self.model_name, instances=[to_value(instance)], parameters=to_value(parameters) ) + response = self._client.predict(request) upscaled_image = response.predictions[0] From 0afc6b5f16f3edf2c6388f236890184671eee70e Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 14:52:40 -0700 Subject: [PATCH 15/26] remove edit and upscale Change-Id: Ic9c270279ee020baef2c3b2117199ff17b066d88 --- google/generativeai/vision_models/__init__.py | 2 + .../vision_models/_vision_models.py | 316 ------------------ 2 files changed, 2 insertions(+), 316 deletions(-) diff --git a/google/generativeai/vision_models/__init__.py b/google/generativeai/vision_models/__init__.py index f519c9928..65a545831 100644 --- a/google/generativeai/vision_models/__init__.py +++ b/google/generativeai/vision_models/__init__.py @@ -15,12 +15,14 @@ """Classes for working with vision models.""" from google.generativeai.vision_models._vision_models import ( + Image, GeneratedImage, ImageGenerationModel, ImageGenerationResponse, ) __all__ = [ + "Image", "GeneratedImage", "ImageGenerationModel", "ImageGenerationResponse", diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 722bc1070..64fd35425 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -227,20 +227,6 @@ def _generate_images( height: Optional[int] = 
None, aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, guidance_scale: Optional[float] = None, - base_image: Optional["Image"] = None, - mask: Optional["Image"] = None, - edit_mode: Optional[ - Literal[ - "inpainting-insert", - "inpainting-remove", - "outpainting", - "product-image", - ] - ] = None, - mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, - segmentation_classes: Optional[List[str]] = None, - mask_dilation: Optional[float] = None, - product_position: Optional[Literal["fixed", "reposition"]] = None, output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, @@ -267,30 +253,6 @@ def _generate_images( guidance_scale: Controls the strength of the prompt. Suggested values are - * 0-9 (low strength) * 10-20 (medium strength) * 21+ (high strength) - base_image: Base image to use for the image generation. - mask: Mask for the base image. - edit_mode: Describes the editing mode for the request. Supported values - are - * inpainting-insert: fills the mask area based on the text - prompt (requires mask and text) * inpainting-remove: removes the - object(s) in the mask area. (requires mask) - * outpainting: extend the image based on the mask area. (Requires - mask) * product-image: Changes the background for the predominant - product or subject in the image - mask_mode: Solicits generation of the mask (v/s providing mask as an - input). Supported values are: - * background: Automatically generates a mask for all regions except - the primary subject(s) of the image - * foreground: Automatically generates a mask for the primary - subjects(s) of the image. - * semantic: Segment one or more of the segmentation classes using - class ID - segmentation_classes: List of class IDs for segmentation. Max of 5 IDs - mask_dilation: Defines the dilation percentage of the mask provided. - Float between 0 and 1. Defaults to 0.03 - product_position: Defines whether the product should stay fixed or be - repositioned. Supported Values: - * fixed: Fixed position - * reposition: Can be moved (default) output_mime_type: Which image format should the output be saved as. 
Supported values: * image/png: Save as a PNG image * image/jpeg: Save as a JPEG image @@ -325,24 +287,6 @@ class ID "number_of_images_in_batch": number_of_images, } - if base_image: - instance["image"] = { - "bytesBase64Encoded": base_image._as_base64_string() # pylint: disable=protected-access - } - shared_generation_parameters["base_image_hash"] = hashlib.sha1( - base_image._image_bytes # pylint: disable=protected-access - ).hexdigest() - - if mask: - instance["mask"] = { - "image": { - "bytesBase64Encoded": mask._as_base64_string() # pylint: disable=protected-access - }, - } - shared_generation_parameters["mask_hash"] = hashlib.sha1( - mask._image_bytes # pylint: disable=protected-access - ).hexdigest() - parameters = {} max_size = max(width or 0, height or 0) or None if aspect_ratio is not None: @@ -366,29 +310,6 @@ class ID parameters["language"] = language shared_generation_parameters["language"] = language - parameters["editConfig"] = {} - if edit_mode is not None: - parameters["editConfig"]["editMode"] = edit_mode - shared_generation_parameters["edit_mode"] = edit_mode - - if mask is None and edit_mode != "product-image": - parameters["editConfig"]["maskMode"] = {} - if mask_mode is not None: - parameters["editConfig"]["maskMode"]["maskType"] = mask_mode - shared_generation_parameters["mask_mode"] = mask_mode - - if segmentation_classes is not None: - parameters["editConfig"]["maskMode"]["classes"] = segmentation_classes - shared_generation_parameters["classes"] = segmentation_classes - - if mask_dilation is not None: - parameters["editConfig"]["maskDilation"] = mask_dilation - shared_generation_parameters["mask_dilation"] = mask_dilation - - if product_position is not None: - parameters["editConfig"]["productPosition"] = product_position - shared_generation_parameters["product_position"] = product_position - parameters["outputOptions"] = {} if output_mime_type is not None: parameters["outputOptions"]["mimeType"] = output_mime_type @@ -489,243 +410,6 @@ def generate_images( person_generation=person_generation, ) - def edit_image( - self, - *, - prompt: str, - base_image: "Image", - mask: Optional["Image"] = None, - negative_prompt: Optional[str] = None, - number_of_images: int = 1, - guidance_scale: Optional[float] = None, - edit_mode: Optional[ - Literal["inpainting-insert", "inpainting-remove", "outpainting", "product-image"] - ] = None, - mask_mode: Optional[Literal["background", "foreground", "semantic"]] = None, - segmentation_classes: Optional[List[str]] = None, - mask_dilation: Optional[float] = None, - product_position: Optional[Literal["fixed", "reposition"]] = None, - output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, - compression_quality: Optional[float] = None, - language: Optional[str] = None, - safety_filter_level: Optional[ - Literal["block_most", "block_some", "block_few", "block_fewest"] - ] = None, - person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, - ) -> "ImageGenerationResponse": - """Edits an existing image based on text prompt. - - Args: - prompt: Text prompt for the image. - base_image: Base image from which to generate the new image. - mask: Mask for the base image. - negative_prompt: A description of what you want to omit in - the generated images. - number_of_images: Number of images to generate. Range: 1..8. - guidance_scale: Controls the strength of the prompt. 
- Suggested values are: - * 0-9 (low strength) - * 10-20 (medium strength) - * 21+ (high strength) - edit_mode: Describes the editing mode for the request. Supported values are: - * inpainting-insert: fills the mask area based on the text prompt - (requires mask and text) - * inpainting-remove: removes the object(s) in the mask area. - (requires mask) - * outpainting: extend the image based on the mask area. - (Requires mask) - * product-image: Changes the background for the predominant product - or subject in the image - mask_mode: Solicits generation of the mask (v/s providing mask as an - input). Supported values are: - * background: Automatically generates a mask for all regions except - the primary subject(s) of the image - * foreground: Automatically generates a mask for the primary - subjects(s) of the image. - * semantic: Segment one or more of the segmentation classes using - class ID - segmentation_classes: List of class IDs for segmentation. Max of 5 IDs - mask_dilation: Defines the dilation percentage of the mask provided. - Float between 0 and 1. Defaults to 0.03 - product_position: Defines whether the product should stay fixed or be - repositioned. Supported Values: - * fixed: Fixed position - * reposition: Can be moved (default) - output_mime_type: Which image format should the output be saved as. - Supported values: - * image/png: Save as a PNG image - * image/jpeg: Save as a JPEG image - compression_quality: Level of compression if the output mime type is - selected to be image/jpeg. Float between 0 to 100 - language: Language of the text prompt for the image. Default: None. - Supported values are `"en"` for English, `"hi"` for Hindi, - `"ja"` for Japanese, `"ko"` for Korean, and `"auto"` for - automatic language detection. - safety_filter_level: Adds a filter level to Safety filtering. Supported - values are: - * "block_most" : Strongest filtering level, most strict - blocking - * "block_some" : Block some problematic prompts and responses - * "block_few" : Block fewer problematic prompts and responses - * "block_fewest" : Block very few problematic prompts and responses - person_generation: Allow generation of people by the model Supported - values are: - * "dont_allow" : Block generation of people - * "allow_adult" : Generate adults, but not children - * "allow_all" : Generate adults and children - - Returns: - An `ImageGenerationResponse` object. - """ - return self._generate_images( - prompt=prompt, - negative_prompt=negative_prompt, - number_of_images=number_of_images, - guidance_scale=guidance_scale, - base_image=base_image, - mask=mask, - edit_mode=edit_mode, - mask_mode=mask_mode, - segmentation_classes=segmentation_classes, - mask_dilation=mask_dilation, - product_position=product_position, - output_mime_type=output_mime_type, - compression_quality=compression_quality, - language=language, - safety_filter_level=safety_filter_level, - person_generation=person_generation, - ) - - def upscale_image( - self, - image: Union["Image", "GeneratedImage"], - new_size: Optional[int] = None, - upscale_factor: Optional[Literal["x2", "x4"]] = None, - output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = "image/png", - output_compression_quality: Optional[int] = None, - ) -> "Image": - """Upscales an image. - - This supports upscaling images generated through the `generate_images()` - method, or upscaling a new image. 
- - Examples:: - - # Upscale a generated image - model = ImageGenerationModel.from_pretrained("imagegeneration@002") - response = model.generate_images( - prompt="Astronaut riding a horse", - ) - model.upscale_image(image=response[0]) - - # Upscale a new 1024x1024 image - my_image = Image.load_from_file("my-image.png") - model.upscale_image(image=my_image) - - # Upscale a new arbitrary sized image using a x2 or x4 upscaling factor - my_image = Image.load_from_file("my-image.png") - model.upscale_image(image=my_image, upscale_factor="x2") - - # Upscale an image and get the result in JPEG format - my_image = Image.load_from_file("my-image.png") - model.upscale_image(image=my_image, output_mime_type="image/jpeg", - output_compression_quality=90) - - Args: - image (Union[GeneratedImage, Image]): Required. The generated image - to upscale. - new_size (int): The size of the biggest dimension of the upscaled - image. - Only 2048 and 4096 are currently supported. Results in a - 2048x2048 or 4096x4096 image. Defaults to 2048 if not provided. - upscale_factor: The upscaling factor. Supported values are "x2" and - "x4". Defaults to None. - output_mime_type: The mime type of the output image. Supported values - are "image/png" and "image/jpeg". Defaults to "image/png". - output_compression_quality: The compression quality of the output - image - as an int (0-100). Only applicable if the output mime type is - "image/jpeg". Defaults to None. - - Returns: - An `Image` object. - """ - if self._client is None: - self._client = client.get_default_prediction_client() - - target_image_size = new_size if new_size else None - longest_dim = max(image._size[0], image._size[1]) - - if not new_size and not upscale_factor: - raise ValueError("Either new_size or upscale_factor must be provided.") - - if not upscale_factor: - x2_factor = 2.0 - x4_factor = 4.0 - epsilon = 0.1 - is_upscaling_x2_request = abs(new_size / longest_dim - x2_factor) < epsilon - is_upscaling_x4_request = abs(new_size / longest_dim - x4_factor) < epsilon - - if not is_upscaling_x2_request and not is_upscaling_x4_request: - raise ValueError( - "Only x2 and x4 upscaling are currently supported. Requested" - f" upscaling factor: {new_size / longest_dim}" - ) - else: - if upscale_factor == "x2": - target_image_size = longest_dim * 2 - else: - target_image_size = longest_dim * 4 - if new_size not in _SUPPORTED_UPSCALING_SIZES: - raise ValueError( - "Only the folowing square upscaling sizes are currently supported:" - f" {_SUPPORTED_UPSCALING_SIZES}." 
- ) - - instance = {"prompt": ""} - - instance["image"] = { - "bytesBase64Encoded": image._as_base64_string() # pylint: disable=protected-access - } - - parameters = { - "sampleCount": 1, - "mode": "upscale", - } - - if upscale_factor: - parameters["upscaleConfig"] = {"upscaleFactor": upscale_factor} - - else: - parameters["sampleImageSize"] = str(new_size) - - parameters["outputOptions"] = {"mimeType": output_mime_type} - if output_mime_type == "image/jpeg" and output_compression_quality is not None: - parameters["outputOptions"]["compressionQuality"] = output_compression_quality - - - pr = protos.PredictRequest.pb() - request = pr( - model=self.model_name, instances=[to_value(instance)], parameters=to_value(parameters) - ) - response = self._client.predict(request) - - upscaled_image = response.predictions[0] - - if isinstance(image, GeneratedImage): - generation_parameters = image.generation_parameters - - else: - generation_parameters = {} - - generation_parameters["upscaled_image_size"] = target_image_size - - encoded_bytes = upscaled_image.get("bytesBase64Encoded") - return GeneratedImage( - image_bytes=base64.b64decode(encoded_bytes) if encoded_bytes else None, - generation_parameters=generation_parameters, - ) - @dataclasses.dataclass class ImageGenerationResponse: From c1f739f6a0eeba827d03c76b6b8a24bf4aa2bd3c Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 15:54:45 -0700 Subject: [PATCH 16/26] remove upscale Change-Id: I84f3b603732c3c1e91da9d5abf38c332b24772cb --- google/generativeai/vision_models/_vision_models.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 64fd35425..54b896e94 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -92,9 +92,6 @@ def to_mapping_value(value) -> struct_pb2.Struct: return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) -_SUPPORTED_UPSCALING_SIZES = [2048, 4096] - - class Image: """Image.""" From 5b9bf58e84209e90f6444715c00a66eb01964258 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 15:59:08 -0700 Subject: [PATCH 17/26] skip bad test Change-Id: Ief70d4fdc9d7478b402cf2f790817f98e336b25a --- tests/test_async_code_match.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_async_code_match.py b/tests/test_async_code_match.py index 0ec4550d4..2e5e3a9f3 100644 --- a/tests/test_async_code_match.py +++ b/tests/test_async_code_match.py @@ -75,6 +75,7 @@ def _execute_code_match(self, source, asource): asource = re.sub(" *?# type: ignore", "", asource) self.assertEqual(source, asource) + @absltest.skip('This test is broken: globally matching functions based only on the name') def test_code_match_for_async_methods(self): for fpath in (pathlib.Path(__file__).parent.parent / "google").rglob("*.py"): if fpath.name in EXEMPT_FILES or any([d in fpath.parts for d in EXEMPT_DIRS]): From 1be0ea09b66549db976c1b844f0fef3a0c492c73 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 25 Sep 2024 16:06:39 -0700 Subject: [PATCH 18/26] format Change-Id: I4d00cad9d0e6485f5b710d2ffcec82a9b738e2b0 --- tests/test_async_code_match.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_async_code_match.py b/tests/test_async_code_match.py index 2e5e3a9f3..457200b7b 100644 --- a/tests/test_async_code_match.py +++ b/tests/test_async_code_match.py @@ -75,7 +75,7 @@ def _execute_code_match(self, source, 
asource): asource = re.sub(" *?# type: ignore", "", asource) self.assertEqual(source, asource) - @absltest.skip('This test is broken: globally matching functions based only on the name') + @absltest.skip("This test is broken: globally matching functions based only on the name") def test_code_match_for_async_methods(self): for fpath in (pathlib.Path(__file__).parent.parent / "google").rglob("*.py"): if fpath.name in EXEMPT_FILES or any([d in fpath.parts for d in EXEMPT_DIRS]): From 8943a8bc0b7f4ee6e2203e702cdf833375f92b1c Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 11:24:13 -0700 Subject: [PATCH 19/26] check enums Change-Id: I8e7b92fc15d3941b7fa74a97b95d2577be9d6c1f --- google/generativeai/generative_models.py | 2 +- .../vision_models/_vision_models.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/google/generativeai/generative_models.py b/google/generativeai/generative_models.py index 134430b2e..dcda3135a 100644 --- a/google/generativeai/generative_models.py +++ b/google/generativeai/generative_models.py @@ -4,7 +4,7 @@ from collections.abc import Iterable import textwrap -from typing import Any, Union, overload +from typing import Any, Literal, Union, overload import reprlib # pylint: disable=bad-continuation, line-too-long diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 54b896e94..3255533bb 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -91,6 +91,10 @@ def to_mapping_value(value) -> struct_pb2.Struct: # We got a dict (or something dict-like); convert it. return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) +ASPECT_RATIOS = ["1:1", "9:16", "16:9", "4:3", "3:4"] +OUTPUT_MIME_TYPES = ["image/png", "image/jpeg"] +SAFETY_FILTER_LEVELS = ["block_most", "block_some", "block_few", "block_fewest"] +PERSON_GENERATIONS = ["dont_allow", "allow_adult", "allow_all"] class Image: """Image.""" @@ -222,15 +226,15 @@ def _generate_images( number_of_images: int = 1, width: Optional[int] = None, height: Optional[int] = None, - aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + aspect_ratio: Optional[Literal[*ASPECT_RATIOS]] = None, guidance_scale: Optional[float] = None, - output_mime_type: Optional[Literal["image/png", "image/jpeg"]] = None, + output_mime_type: Optional[Literal[*OUTPUT_MIME_TYPES]] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, safety_filter_level: Optional[ - Literal["block_most", "block_some", "block_few", "block_fewest"] + Literal[*SAFETY_FILTER_LEVELS] ] = None, - person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, + person_generation: Optional[Literal[*PERSON_GENERATIONS]] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
@@ -287,6 +291,8 @@ def _generate_images( parameters = {} max_size = max(width or 0, height or 0) or None if aspect_ratio is not None: + if aspect_ratio not in ASPECT_RATIOS: + raise ValueError(f'aspect_ratio not in {ASPECT_RATIOS}') parameters["aspectRatio"] = aspect_ratio elif max_size: # Note: The size needs to be a string @@ -309,6 +315,8 @@ def _generate_images( parameters["outputOptions"] = {} if output_mime_type is not None: + if output_mime_type not in OUTPUT_MIME_TYPES: + raise ValueError(f'output_mime_type not in {OUTPUT_MIME_TYPES}') parameters["outputOptions"]["mimeType"] = output_mime_type shared_generation_parameters["mime_type"] = output_mime_type @@ -317,6 +325,8 @@ def _generate_images( shared_generation_parameters["compression_quality"] = compression_quality if safety_filter_level is not None: + if safety_filter_level not in SAFETY_FILTER_LEVELS: + raise ValueError(f'safety_filter_level not in {SAFETY_FILTER_LEVELS}') parameters["safetySetting"] = safety_filter_level shared_generation_parameters["safety_filter_level"] = safety_filter_level From 2ff8141566fddd74fff66a37f3c10533b711c772 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 11:39:13 -0700 Subject: [PATCH 20/26] Remove * unpackng Change-Id: Iddc42b906dfab12dfa500f0f9774d58521684548 --- .../vision_models/_vision_models.py | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3255533bb..46a34e44e 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -91,10 +91,19 @@ def to_mapping_value(value) -> struct_pb2.Struct: # We got a dict (or something dict-like); convert it. return struct_pb2.Struct(fields={k: to_value(v) for k, v in value.items()}) -ASPECT_RATIOS = ["1:1", "9:16", "16:9", "4:3", "3:4"] -OUTPUT_MIME_TYPES = ["image/png", "image/jpeg"] -SAFETY_FILTER_LEVELS = ["block_most", "block_some", "block_few", "block_fewest"] -PERSON_GENERATIONS = ["dont_allow", "allow_adult", "allow_all"] + +AspectRatio = Literal["1:1", "9:16", "16:9", "4:3", "3:4"] +ASPECT_RATIOS = AspectRatio.__args__ + +OutputMimeType = Literal["image/png", "image/jpeg"] +OUTPUT_MIME_TYPES = OutputMimeType.__args__ + +SafetyFilterLevel = Literal["block_most", "block_some", "block_few", "block_fewest"] +SAFETY_FILTER_LEVELS = SafetyFilterLevel.__args__ + +PersonGeneration = Literal["dont_allow", "allow_adult", "allow_all"] +PERSON_GENERATIONS = PersonGeneration.__args__ + class Image: """Image.""" @@ -226,15 +235,13 @@ def _generate_images( number_of_images: int = 1, width: Optional[int] = None, height: Optional[int] = None, - aspect_ratio: Optional[Literal[*ASPECT_RATIOS]] = None, + aspect_ratio: Optional[AspectRatio] = None, guidance_scale: Optional[float] = None, - output_mime_type: Optional[Literal[*OUTPUT_MIME_TYPES]] = None, + output_mime_type: Optional[OutputMimeType] = None, compression_quality: Optional[float] = None, language: Optional[str] = None, - safety_filter_level: Optional[ - Literal[*SAFETY_FILTER_LEVELS] - ] = None, - person_generation: Optional[Literal[*PERSON_GENERATIONS]] = None, + safety_filter_level: Optional[SafetyFilterLevel] = None, + person_generation: Optional[PersonGeneration] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
@@ -292,7 +299,7 @@ def _generate_images( max_size = max(width or 0, height or 0) or None if aspect_ratio is not None: if aspect_ratio not in ASPECT_RATIOS: - raise ValueError(f'aspect_ratio not in {ASPECT_RATIOS}') + raise ValueError(f"aspect_ratio not in {ASPECT_RATIOS}") parameters["aspectRatio"] = aspect_ratio elif max_size: # Note: The size needs to be a string @@ -316,7 +323,7 @@ def _generate_images( parameters["outputOptions"] = {} if output_mime_type is not None: if output_mime_type not in OUTPUT_MIME_TYPES: - raise ValueError(f'output_mime_type not in {OUTPUT_MIME_TYPES}') + raise ValueError(f"output_mime_type not in {OUTPUT_MIME_TYPES}") parameters["outputOptions"]["mimeType"] = output_mime_type shared_generation_parameters["mime_type"] = output_mime_type @@ -326,7 +333,7 @@ def _generate_images( if safety_filter_level is not None: if safety_filter_level not in SAFETY_FILTER_LEVELS: - raise ValueError(f'safety_filter_level not in {SAFETY_FILTER_LEVELS}') + raise ValueError(f"safety_filter_level not in {SAFETY_FILTER_LEVELS}") parameters["safetySetting"] = safety_filter_level shared_generation_parameters["safety_filter_level"] = safety_filter_level @@ -361,13 +368,13 @@ def generate_images( *, negative_prompt: Optional[str] = None, number_of_images: int = 1, - aspect_ratio: Optional[Literal["1:1", "9:16", "16:9", "4:3", "3:4"]] = None, + aspect_ratio: Optional[AspectRatio] = None, guidance_scale: Optional[float] = None, language: Optional[str] = None, safety_filter_level: Optional[ - Literal["block_most", "block_some", "block_few", "block_fewest"] + SafetyFilterLevel ] = None, - person_generation: Optional[Literal["dont_allow", "allow_adult", "allow_all"]] = None, + person_generation: Optional[PersonGeneration] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
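
PATCH 19 and PATCH 20 above converge on one pattern: declare each option set once as a typing.Literal alias and derive the runtime-checkable tuple from that alias, since star-unpacking inside Literal[...] is a syntax error on the older Python versions this library still supports and is not accepted by type checkers. Below is a minimal, standalone sketch of that pattern; FruitChoice, FRUIT_CHOICES, and pick_fruit are hypothetical names used only for illustration, not library code.

from typing import Literal, Optional, get_args

# Single source of truth: the Literal alias drives both static checking and
# the runtime validation tuple. The patch reads the alias's __args__ directly;
# typing.get_args() is the public equivalent.
FruitChoice = Literal["apple", "banana", "cherry"]
FRUIT_CHOICES = get_args(FruitChoice)


def pick_fruit(choice: Optional[FruitChoice] = None) -> str:
    # Static checkers constrain annotated callers; the runtime check still
    # guards untyped callers, mirroring the ValueError raised in the patch.
    if choice is not None and choice not in FRUIT_CHOICES:
        raise ValueError(f"choice not in {FRUIT_CHOICES}")
    return choice if choice is not None else FRUIT_CHOICES[0]


print(pick_fruit("banana"))  # banana

Deriving the tuple from the alias keeps the annotation and the validation list from drifting apart, which is the design choice PATCH 20 settles on.
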
From d2f1a4e0203cc92ecb86836ec3964797f7101dec Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 11:51:22 -0700 Subject: [PATCH 21/26] ignore typing errors, these are incorrect Change-Id: Idc4e594811e52aa3d17562851ef02808e6dd5633 --- google/generativeai/vision_models/_vision_models.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 46a34e44e..7f3c8fedc 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -93,16 +93,16 @@ def to_mapping_value(value) -> struct_pb2.Struct: AspectRatio = Literal["1:1", "9:16", "16:9", "4:3", "3:4"] -ASPECT_RATIOS = AspectRatio.__args__ +ASPECT_RATIOS = AspectRatio.__args__ # type: ignore OutputMimeType = Literal["image/png", "image/jpeg"] -OUTPUT_MIME_TYPES = OutputMimeType.__args__ +OUTPUT_MIME_TYPES = OutputMimeType.__args__ # type: ignore SafetyFilterLevel = Literal["block_most", "block_some", "block_few", "block_fewest"] -SAFETY_FILTER_LEVELS = SafetyFilterLevel.__args__ +SAFETY_FILTER_LEVELS = SafetyFilterLevel.__args__ # type: ignore PersonGeneration = Literal["dont_allow", "allow_adult", "allow_all"] -PERSON_GENERATIONS = PersonGeneration.__args__ +PERSON_GENERATIONS = PersonGeneration.__args__ # type: ignore class Image: @@ -198,7 +198,7 @@ def _as_base64_string(self) -> str: return base64.b64encode(self._image_bytes).decode("ascii") def _repr_png_(self): - return self._pil_image._repr_png_() + return self._pil_image._repr_png_() # type:ignore class ImageGenerationModel: From 57f25933db6fa21ededadd37ad3bca4b9fd0a705 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 26 Sep 2024 12:12:00 -0700 Subject: [PATCH 22/26] fix typing? Change-Id: I5005dd38b7d3a03edc1c54d42f3a6c8f959d7274 --- google/generativeai/vision_models/_vision_models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 7f3c8fedc..3a50b3872 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -149,7 +149,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PIL_Image.Image": + def _pil_image(self) -> "PIL_Image.Image": # type: ignore if self._loaded_image is None: if not PIL_Image: raise RuntimeError( @@ -198,7 +198,7 @@ def _as_base64_string(self) -> str: return base64.b64encode(self._image_bytes).decode("ascii") def _repr_png_(self): - return self._pil_image._repr_png_() # type:ignore + return self._pil_image._repr_png_() # type:ignore class ImageGenerationModel: @@ -371,9 +371,7 @@ def generate_images( aspect_ratio: Optional[AspectRatio] = None, guidance_scale: Optional[float] = None, language: Optional[str] = None, - safety_filter_level: Optional[ - SafetyFilterLevel - ] = None, + safety_filter_level: Optional[SafetyFilterLevel] = None, person_generation: Optional[PersonGeneration] = None, ) -> "ImageGenerationResponse": """Generates images from text prompt. 
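
The # type: ignore comments added in PATCH 21 and PATCH 22 exist because the optional Pillow import binds PIL_Image to None when the package is absent, so a string annotation such as "PIL_Image.Image" cannot be resolved statically even though the attribute access is guarded at runtime. A small standalone sketch of that optional-dependency pattern follows, using a hypothetical open_image helper; it is an illustration of the idea, not the library's implementation.

try:
    from PIL import Image as PIL_Image  # optional dependency
except ImportError:
    PIL_Image = None  # checkers now see this name as possibly None


def open_image(path: str) -> "PIL_Image.Image":  # type: ignore  # unresolvable when Pillow is absent
    # A runtime guard stands in for what the checker cannot prove statically.
    if PIL_Image is None:
        raise RuntimeError("Pillow is required here; install it with 'pip install pillow'.")
    return PIL_Image.open(path)

The commits that follow experiment with exposing the real PIL types to the checker and settle on an "if typing.TYPE_CHECKING:" split, so static analysis always sees the real module while the runtime path keeps the try/except fallback.
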
From 0adcc1aa8f1f075e0053647942ba11ef8d70aa32 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 08:46:51 -0700 Subject: [PATCH 23/26] fix lazy loading with typing Change-Id: I2ebb3afbe3df93f918ffe5f66dac20392665c6cc --- google/generativeai/vision_models/_vision_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3a50b3872..38ef66e67 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -42,8 +42,10 @@ try: from PIL import Image as PIL_Image + from PIL.Image import Image as PILImageClass except ImportError: PIL_Image = None + PILImageClass = None # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 @@ -111,7 +113,7 @@ class Image: __module__ = "vertexai.vision_models" _loaded_bytes: Optional[bytes] = None - _loaded_image: Optional["PIL_Image.Image"] = None + _loaded_image: Optional["PILImageClass"] = None def __init__( self, @@ -149,7 +151,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PIL_Image.Image": # type: ignore + def _pil_image(self) -> "PILImageClass": if self._loaded_image is None: if not PIL_Image: raise RuntimeError( From 566e8a894cf0e2ae86f469c3faee0644cefabe9a Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 09:49:38 -0700 Subject: [PATCH 24/26] Revert "fix lazy loading with typing" This reverts commit 0baabfe01d3598c9b0f54e2d6e0ceef12ba5f8c3. --- google/generativeai/vision_models/_vision_models.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 38ef66e67..3a50b3872 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -42,10 +42,8 @@ try: from PIL import Image as PIL_Image - from PIL.Image import Image as PILImageClass except ImportError: PIL_Image = None - PILImageClass = None # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 @@ -113,7 +111,7 @@ class Image: __module__ = "vertexai.vision_models" _loaded_bytes: Optional[bytes] = None - _loaded_image: Optional["PILImageClass"] = None + _loaded_image: Optional["PIL_Image.Image"] = None def __init__( self, @@ -151,7 +149,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PILImageClass": + def _pil_image(self) -> "PIL_Image.Image": # type: ignore if self._loaded_image is None: if not PIL_Image: raise RuntimeError( From f077610eb44c9207d8f8126084d6d0f98f10dad0 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 11:07:43 -0700 Subject: [PATCH 25/26] Use if TYPE_CHECKING Change-Id: I63c4c7909f7d7060a4b557f41cff4ba9010e004c --- .../vision_models/_vision_models.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 3a50b3872..78f495d65 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -34,16 +34,22 @@ from proto.marshal.collections import repeated -# pylint: disable=g-import-not-at-top -try: +# pylint: disable=g-import-not-at-top\ +if typing.TYPE_CHECKING: from IPython import 
display as IPython_display -except ImportError: - IPython_display = None - -try: - from PIL import Image as PIL_Image -except ImportError: - PIL_Image = None +else: + try: + from IPython import display as IPython_display + except ImportError: + IPython_display = None + +if typing.TYPE_CHECKING: + import PIL.Image as PIL_Image +else: + try: + from PIL import Image as PIL_Image + except ImportError: + PIL_Image = None # This is to get around https://github.com/googleapis/proto-plus-python/issues/488 From d97412812691cf31982eaf9edf166a0e757e7029 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 27 Sep 2024 11:15:38 -0700 Subject: [PATCH 26/26] format Change-Id: I446692905361c56505b070d72d94ea7fecdaede8 --- google/generativeai/vision_models/_vision_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/generativeai/vision_models/_vision_models.py b/google/generativeai/vision_models/_vision_models.py index 78f495d65..0bb4f7dbe 100644 --- a/google/generativeai/vision_models/_vision_models.py +++ b/google/generativeai/vision_models/_vision_models.py @@ -155,7 +155,7 @@ def _image_bytes(self, value: bytes): self._loaded_bytes = value @property - def _pil_image(self) -> "PIL_Image.Image": # type: ignore + def _pil_image(self) -> "PIL_Image.Image": # type: ignore if self._loaded_image is None: if not PIL_Image: raise RuntimeError(