From 7f434e3431e35ebe819fa936e37ec0eda487eed3 Mon Sep 17 00:00:00 2001
From: Facundo Santiago <santiagof@outlook.com>
Date: Wed, 15 Jan 2025 16:49:35 +0000
Subject: [PATCH] feat: image embeddings

---
 .../GetChatCompletions_MaximumSet_Gen.json    |   1 +
 .../GetImageEmbeddings_MaximumSet_Gen.json    |  53 +++++++
 .../GetImageEmbeddings_MinimumSet_Gen.json    |  47 +++++++
 .../models/image_embeddings.tsp               |  61 ++++++++
 specification/ai/ModelInference/routes.tsp    |  15 ++
 .../GetChatCompletions_MaximumSet_Gen.json    |   1 +
 .../GetImageEmbeddings_MaximumSet_Gen.json    |  53 +++++++
 .../GetImageEmbeddings_MinimumSet_Gen.json    |  47 +++++++
 .../preview/2024-05-01-preview/openapi.json   | 133 ++++++++++++++++++
 .../preview/2024-05-01-preview/openapi.yaml   |  93 ++++++++++++
 10 files changed, 504 insertions(+)
 create mode 100644 specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MaximumSet_Gen.json
 create mode 100644 specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MinimumSet_Gen.json
 create mode 100644 specification/ai/ModelInference/models/image_embeddings.tsp
 create mode 100644 specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MaximumSet_Gen.json
 create mode 100644 specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MinimumSet_Gen.json

diff --git a/specification/ai/ModelInference/examples/2024-05-01-preview/GetChatCompletions_MaximumSet_Gen.json b/specification/ai/ModelInference/examples/2024-05-01-preview/GetChatCompletions_MaximumSet_Gen.json
index 2dff1c3b6470..eaf087423616 100644
--- a/specification/ai/ModelInference/examples/2024-05-01-preview/GetChatCompletions_MaximumSet_Gen.json
+++ b/specification/ai/ModelInference/examples/2024-05-01-preview/GetChatCompletions_MaximumSet_Gen.json
@@ -5,6 +5,7 @@
     "api-version": "2024-05-01-preview",
     "extra-parameters": "error",
     "body": {
+      "modalities": [ "text" ],
       "messages": [
         {
           "role": "system",
diff --git a/specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MaximumSet_Gen.json b/specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MaximumSet_Gen.json
new file mode 100644
index 000000000000..f952674b6e3f
--- /dev/null
+++ b/specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MaximumSet_Gen.json
@@ -0,0 +1,53 @@
+{
+  "title": "maximum set image embeddings",
+  "operationId": "GetImageEmbeddings",
+  "parameters": {
+    "api-version": "2024-05-01-preview",
+    "extra-parameters": "error",
+    "body": {
+      "input": [
+        {
+          "image": "puqkvvlvgcjyzughesnkena",
+          "text": "azrzyjsmnuefqpowpvfmyobeehqsni"
+        }
+      ],
+      "dimensions": 1024,
+      "encoding_format": "float",
+      "input_type": "text",
+      "model": "my-model-name"
+    }
+  },
+  "responses": {
+    "200": {
+      "body": {
+        "id": "cknxthfa",
+        "data": [
+          {
+            "index": 0,
+            "object": "embedding",
+            "embedding": [
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0
+            ]
+          }
+        ],
+        "object": "list",
+        "model": "my-model-name",
+        "usage": {
+          "prompt_tokens": 15,
+          "total_tokens": 15
+        }
+      }
+    }
+  }
+}
diff --git a/specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MinimumSet_Gen.json b/specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MinimumSet_Gen.json
new file mode 100644
index 000000000000..b90e35374308
--- /dev/null
+++ b/specification/ai/ModelInference/examples/2024-05-01-preview/GetImageEmbeddings_MinimumSet_Gen.json
@@ -0,0 +1,47 @@
+{
+  "title": "minimum set image embeddings",
+  "operationId": "GetImageEmbeddings",
+  "parameters": {
+    "api-version": "2024-05-01-preview",
+    "body": {
+      "input": [
+        {
+          "image": "gvmojtfooxixxzayrditjlyymg"
+        }
+      ]
+    }
+  },
+  "responses": {
+    "200": {
+      "body": {
+        "id": "cknxthfa",
+        "data": [
+          {
+            "index": 0,
+            "object": "embedding",
+            "embedding": [
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0
+            ]
+          }
+        ],
+        "object": "list",
+        "model": "my-model-name",
+        "usage": {
+          "prompt_tokens": 15,
+          "total_tokens": 15
+        }
+      }
+    }
+  }
+}
diff --git a/specification/ai/ModelInference/models/image_embeddings.tsp b/specification/ai/ModelInference/models/image_embeddings.tsp
new file mode 100644
index 000000000000..28f16c5090c5
--- /dev/null
+++ b/specification/ai/ModelInference/models/image_embeddings.tsp
@@ -0,0 +1,61 @@
+import "@typespec/rest";
+import "@typespec/http";
+
+import "./embeddings.tsp";
+
+using TypeSpec.Rest;
+using TypeSpec.Http;
+
+namespace ModelInference;
+
+@doc("""
+  The configuration information for an image embeddings request.
+  """)
+model ImageEmbeddingsOptions {
+  @doc("""
+    Input image to embed. To embed multiple inputs in a single request, pass an array.
+    The input must not exceed the max input tokens for the model.
+    """)
+  input: ImageEmbeddingInput[];
+
+  @doc("""
+    Optional. The number of dimensions the resulting output embeddings should have.
+    Passing null causes the model to use its default value.
+    Returns a 422 error if the model doesn't support the value or parameter.
+    """)
+  dimensions?: int32;
+
+  @doc("""
+    Optional. The desired format for the returned embeddings.
+    Passing null causes the model to use its default value.
+    Returns a 422 error if the model doesn't support the value or parameter.
+    """)
+  encoding_format?: EmbeddingEncodingFormat;
+
+  @doc("""
+    Optional. The type of the input.
+    Returns a 422 error if the model doesn't support the value or parameter.
+    """)
+  input_type?: EmbeddingInputType;
+
+  @doc("""
+    ID of the specific AI model to use, if more than one model is available on the endpoint.
+    """)
+  `model`?: string;
+
+  ...Record<unknown>; // open model: undeclared members are allowed (emitted as `additionalProperties: {}`)
+}
+
+@doc("Represents an image with optional text.")
+model ImageEmbeddingInput { // element type of `ImageEmbeddingsOptions.input`
+  @doc("""
+    The input image encoded in base64 string as a data URL. Example: `data:image/{format};base64,{data}`.
+    """)
+  image: string; // required: the only member declared without `?`
+
+  @doc("""
+    Optional. The text input to feed into the model (like DINO, CLIP).
+    Returns a 422 error if the model doesn't support the value or parameter.
+    """)
+  text?: string; // optional text submitted together with the image
+}
diff --git a/specification/ai/ModelInference/routes.tsp b/specification/ai/ModelInference/routes.tsp
index 7c7eb6c7903c..b5f2e5d02407 100644
--- a/specification/ai/ModelInference/routes.tsp
+++ b/specification/ai/ModelInference/routes.tsp
@@ -6,6 +6,7 @@ import "@typespec/versioning";
 import "./models/common.tsp";
 import "./models/chat_completions.tsp";
 import "./models/embeddings.tsp";
+import "./models/image_embeddings.tsp";
 
 using TypeSpec.Rest;
 using TypeSpec.Http;
@@ -44,6 +45,20 @@ op getEmbeddings is Azure.Core.RpcOperation<
   EmbeddingsResult
 >;
 
+@doc("""
+  Return the embedding vectors for given images.
+  The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
+  """)
+@actionSeparator("/")
+@route("images/embeddings")
+op getImageEmbeddings is Azure.Core.RpcOperation<
+  {
+    ...ImageEmbeddingsOptions; // request members: input, dimensions, encoding_format, input_type, model
+    ...AdditionalRequestHeaders; // NOTE(review): presumably the source of the `extra-parameters` header in the emitted OpenAPI; confirm in common.tsp
+  },
+  EmbeddingsResult // same result model that `getEmbeddings` returns
+>;
+
 @doc("""
   Returns information about the AI model.
   The method makes a REST API call to the `/info` route on the given endpoint.
diff --git a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetChatCompletions_MaximumSet_Gen.json b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetChatCompletions_MaximumSet_Gen.json
index 2dff1c3b6470..eaf087423616 100644
--- a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetChatCompletions_MaximumSet_Gen.json
+++ b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetChatCompletions_MaximumSet_Gen.json
@@ -5,6 +5,7 @@
     "api-version": "2024-05-01-preview",
     "extra-parameters": "error",
     "body": {
+      "modalities": [ "text" ],
       "messages": [
         {
           "role": "system",
diff --git a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MaximumSet_Gen.json b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MaximumSet_Gen.json
new file mode 100644
index 000000000000..f952674b6e3f
--- /dev/null
+++ b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MaximumSet_Gen.json
@@ -0,0 +1,53 @@
+{
+  "title": "maximum set image embeddings",
+  "operationId": "GetImageEmbeddings",
+  "parameters": {
+    "api-version": "2024-05-01-preview",
+    "extra-parameters": "error",
+    "body": {
+      "input": [
+        {
+          "image": "puqkvvlvgcjyzughesnkena",
+          "text": "azrzyjsmnuefqpowpvfmyobeehqsni"
+        }
+      ],
+      "dimensions": 1024,
+      "encoding_format": "float",
+      "input_type": "text",
+      "model": "my-model-name"
+    }
+  },
+  "responses": {
+    "200": {
+      "body": {
+        "id": "cknxthfa",
+        "data": [
+          {
+            "index": 0,
+            "object": "embedding",
+            "embedding": [
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0
+            ]
+          }
+        ],
+        "object": "list",
+        "model": "my-model-name",
+        "usage": {
+          "prompt_tokens": 15,
+          "total_tokens": 15
+        }
+      }
+    }
+  }
+}
diff --git a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MinimumSet_Gen.json b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MinimumSet_Gen.json
new file mode 100644
index 000000000000..b90e35374308
--- /dev/null
+++ b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/examples/GetImageEmbeddings_MinimumSet_Gen.json
@@ -0,0 +1,47 @@
+{
+  "title": "minimum set image embeddings",
+  "operationId": "GetImageEmbeddings",
+  "parameters": {
+    "api-version": "2024-05-01-preview",
+    "body": {
+      "input": [
+        {
+          "image": "gvmojtfooxixxzayrditjlyymg"
+        }
+      ]
+    }
+  },
+  "responses": {
+    "200": {
+      "body": {
+        "id": "cknxthfa",
+        "data": [
+          {
+            "index": 0,
+            "object": "embedding",
+            "embedding": [
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0,
+              0
+            ]
+          }
+        ],
+        "object": "list",
+        "model": "my-model-name",
+        "usage": {
+          "prompt_tokens": 15,
+          "total_tokens": 15
+        }
+      }
+    }
+  }
+}
diff --git a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.json b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.json
index f79c11054e2c..ba1e6818a098 100644
--- a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.json
+++ b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.json
@@ -207,6 +207,87 @@
         }
       }
     },
+    "/images/embeddings": {
+      "post": {
+        "operationId": "GetImageEmbeddings",
+        "description": "Return the embedding vectors for given images.\nThe method makes a REST API call to the `/images/embeddings` route on the given endpoint.",
+        "parameters": [
+          {
+            "$ref": "#/parameters/Azure.Core.Foundations.ApiVersionParameter"
+          },
+          {
+            "name": "extra-parameters",
+            "in": "header",
+            "description": "Controls what happens if extra parameters, undefined by the REST API,\nare passed in the JSON request payload.\nThis sets the HTTP request header `extra-parameters`.",
+            "required": false,
+            "type": "string",
+            "enum": [
+              "error",
+              "drop",
+              "pass-through"
+            ],
+            "x-ms-enum": {
+              "name": "ExtraParameters",
+              "modelAsString": true,
+              "values": [
+                {
+                  "name": "error",
+                  "value": "error",
+                  "description": "The service will error if it detected extra parameters in the request payload. This is the service default."
+                },
+                {
+                  "name": "drop",
+                  "value": "drop",
+                  "description": "The service will ignore (drop) extra parameters in the request payload. It will only pass the known parameters to the back-end AI model."
+                },
+                {
+                  "name": "pass_through",
+                  "value": "pass-through",
+                  "description": "The service will pass extra parameters to the back-end AI model."
+                }
+              ]
+            },
+            "x-ms-client-name": "extra_params"
+          },
+          {
+            "name": "body",
+            "in": "body",
+            "required": true,
+            "schema": {
+              "$ref": "#/definitions/ImageEmbeddingsOptions"
+            }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "The request has succeeded.",
+            "schema": {
+              "$ref": "#/definitions/EmbeddingsResult"
+            }
+          },
+          "default": {
+            "description": "An unexpected error response.",
+            "schema": {
+              "$ref": "#/definitions/Azure.Core.Foundations.ErrorResponse"
+            },
+            "headers": {
+              "x-ms-error-code": {
+                "type": "string",
+                "description": "String error code indicating what went wrong."
+              }
+            }
+          }
+        },
+        "x-ms-examples": {
+          "maximum set image embeddings": {
+            "$ref": "./examples/GetImageEmbeddings_MaximumSet_Gen.json"
+          },
+          "minimum set image embeddings": {
+            "$ref": "./examples/GetImageEmbeddings_MinimumSet_Gen.json"
+          }
+        }
+      }
+    },
     "/info": {
       "get": {
         "operationId": "GetModelInfo",
@@ -1455,6 +1536,58 @@
         "name"
       ]
     },
+    "ImageEmbeddingInput": {
+      "type": "object",
+      "description": "Represents an image with optional text.",
+      "properties": {
+        "image": {
+          "type": "string",
+          "description": "The input image encoded in base64 string as a data URL. Example: `data:image/{format};base64,{data}`."
+        },
+        "text": {
+          "type": "string",
+          "description": "Optional. The text input to feed into the model (like DINO, CLIP).\nReturns a 422 error if the model doesn't support the value or parameter."
+        }
+      },
+      "required": [
+        "image"
+      ]
+    },
+    "ImageEmbeddingsOptions": {
+      "type": "object",
+      "description": "The configuration information for an image embeddings request.",
+      "properties": {
+        "input": {
+          "type": "array",
+          "description": "Input image to embed. To embed multiple inputs in a single request, pass an array.\nThe input must not exceed the max input tokens for the model.",
+          "items": {
+            "$ref": "#/definitions/ImageEmbeddingInput"
+          },
+          "x-ms-identifiers": []
+        },
+        "dimensions": {
+          "type": "integer",
+          "format": "int32",
+          "description": "Optional. The number of dimensions the resulting output embeddings should have.\nPassing null causes the model to use its default value.\nReturns a 422 error if the model doesn't support the value or parameter."
+        },
+        "encoding_format": {
+          "$ref": "#/definitions/EmbeddingEncodingFormat",
+          "description": "Optional. The desired format for the returned embeddings.\nPassing null causes the model to use its default value.\nReturns a 422 error if the model doesn't support the value or parameter."
+        },
+        "input_type": {
+          "$ref": "#/definitions/EmbeddingInputType",
+          "description": "Optional. The type of the input.\nReturns a 422 error if the model doesn't support the value or parameter."
+        },
+        "model": {
+          "type": "string",
+          "description": "ID of the specific AI model to use, if more than one model is available on the endpoint."
+        }
+      },
+      "required": [
+        "input"
+      ],
+      "additionalProperties": {}
+    },
     "ModelInfo": {
       "type": "object",
       "description": "Represents some basic information about the AI model.",
diff --git a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.yaml b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.yaml
index 1f72cb0abda8..ef69ad847651 100644
--- a/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.yaml
+++ b/specification/ai/data-plane/ModelInference/preview/2024-05-01-preview/openapi.yaml
@@ -90,6 +90,48 @@ paths:
           application/json:
             schema:
               $ref: '#/components/schemas/EmbeddingsOptions'
+  /images/embeddings:
+    post:
+      operationId: getImageEmbeddings
+      description: |-
+        Return the embedding vectors for given images.
+        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
+      parameters:
+        - $ref: '#/components/parameters/Azure.Core.Foundations.ApiVersionParameter'
+        - name: extra-parameters
+          in: header
+          required: false
+          description: |-
+            Controls what happens if extra parameters, undefined by the REST API,
+            are passed in the JSON request payload.
+            This sets the HTTP request header `extra-parameters`.
+          schema:
+            $ref: '#/components/schemas/ExtraParameters'
+      responses:
+        '200':
+          description: The request has succeeded.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EmbeddingsResult'
+        default:
+          description: An unexpected error response.
+          headers:
+            x-ms-error-code:
+              required: false
+              description: String error code indicating what went wrong.
+              schema:
+                type: string
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Azure.Core.Foundations.ErrorResponse'
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ImageEmbeddingsOptions'
   /info:
     get:
       operationId: getModelInfo
@@ -1067,6 +1109,57 @@ components:
           additionalProperties: {}
           description: The parameters the function accepts, described as a JSON Schema object.
       description: The definition of a caller-specified function that chat completions may invoke in response to matching user input.
+    ImageEmbeddingInput:
+      type: object
+      required:
+        - image
+      properties:
+        image:
+          type: string
+          description: 'The input image encoded in base64 string as a data URL. Example: `data:image/{format};base64,{data}`.'
+        text:
+          type: string
+          description: |-
+            Optional. The text input to feed into the model (like DINO, CLIP).
+            Returns a 422 error if the model doesn't support the value or parameter.
+      description: Represents an image with optional text.
+    ImageEmbeddingsOptions:
+      type: object
+      required:
+        - input
+      properties:
+        input:
+          type: array
+          items:
+            $ref: '#/components/schemas/ImageEmbeddingInput'
+          description: |-
+            Input image to embed. To embed multiple inputs in a single request, pass an array.
+            The input must not exceed the max input tokens for the model.
+        dimensions:
+          type: integer
+          format: int32
+          description: |-
+            Optional. The number of dimensions the resulting output embeddings should have.
+            Passing null causes the model to use its default value.
+            Returns a 422 error if the model doesn't support the value or parameter.
+        encoding_format:
+          allOf:
+            - $ref: '#/components/schemas/EmbeddingEncodingFormat'
+          description: |-
+            Optional. The desired format for the returned embeddings.
+            Passing null causes the model to use its default value.
+            Returns a 422 error if the model doesn't support the value or parameter.
+        input_type:
+          allOf:
+            - $ref: '#/components/schemas/EmbeddingInputType'
+          description: |-
+            Optional. The type of the input.
+            Returns a 422 error if the model doesn't support the value or parameter.
+        model:
+          type: string
+          description: ID of the specific AI model to use, if more than one model is available on the endpoint.
+      additionalProperties: {}
+      description: The configuration information for an image embeddings request.
     ModelInfo:
       type: object
       required: