feat(model): support text generate chat, visual question answering and image2image task
instill-ai · Dec 3, 2023 · 7e8ee04 · 7e8ee04
1 parent 47e6e31
commit 7e8ee04
Show file tree

Hide file tree

Showing 25 changed files with 1,663 additions and 160 deletions.
diff --git a/config/model/model.json b/config/model/model.json
@@ -5,11 +5,40 @@
   "title": "Model",
   "description": "The model data structure",
   "additionalProperties": false,
-  "required": ["id", "model_definition", "configuration"],
+  "required": [
+    "id",
+    "model_definition",
+    "configuration"
+  ],
   "anyOf": [
-    {"required": ["user"], "not": {"required": ["org"]}},
-    {"required": ["org"], "not": {"required": ["user"]}},
-    {"not": {"required": ["user", "org"]}}
+    {
+      "required": [
+        "user"
+      ],
+      "not": {
+        "required": [
+          "org"
+        ]
+      }
+    },
+    {
+      "required": [
+        "org"
+      ],
+      "not": {
+        "required": [
+          "user"
+        ]
+      }
+    },
+    {
+      "not": {
+        "required": [
+          "user",
+          "org"
+        ]
+      }
+    }
   ],
   "properties": {
     "name": {
@@ -86,7 +115,11 @@
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "select",
-      "ui_enum": ["", "Private", "Public"]
+      "ui_enum": [
+        "",
+        "Private",
+        "Public"
+      ]
     },
     "user": {
       "type": "string",
@@ -122,27 +155,60 @@
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "select",
-      "ui_enum": ["", "Offline", "Online", "Error"]
+      "ui_enum": [
+        "",
+        "Offline",
+        "Online",
+        "Error"
+      ]
     },
     "task": {
       "type": "string",
       "title": "Task",
       "description": "The model task",
-      "enum": ["TASK_UNSPECIFIED", "TASK_CLASSIFICATION", "TASK_DETECTION", "TASK_KEYPOINT", "TASK_INSTANCE_SEGMENTATION", "TASK_SEMANTIC_SEGMENTATION", "TASK_OCR", "TASK_TEXT_TO_IMAGE", "TASK_TEXT_GENERATION"],
+      "enum": [
+        "TASK_UNSPECIFIED",
+        "TASK_CLASSIFICATION",
+        "TASK_DETECTION",
+        "TASK_KEYPOINT",
+        "TASK_INSTANCE_SEGMENTATION",
+        "TASK_SEMANTIC_SEGMENTATION",
+        "TASK_OCR",
+        "TASK_TEXT_TO_IMAGE",
+        "TASK_IMAGE_TO_IMAGE",
+        "TASK_TEXT_GENERATION",
+        "TASK_TEXT_GENERATION_CHAT",
+        "TASK_VISUAL_QUESTION_ANSWERING"
+      ],
       "default": "TASK_UNSPECIFIED",
       "readOnly": true,
       "ui_order": 4,
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "select",
-      "ui_enum": ["", "Image classification", "Object detection", "Keypoint detection", "Instance segmentation", "Semantic segmentation", "OCR", "Text to image", "Text generation"]
+      "ui_enum": [
+        "",
+        "Image classification",
+        "Object detection",
+        "Keypoint detection",
+        "Instance segmentation",
+        "Semantic segmentation",
+        "OCR",
+        "Text to image",
+        "Image to image",
+        "Text generation",
+        "Text generation chat",
+        "Visual question answering"
+      ]
     },
     "create_time": {
       "type": "string",
       "format": "date-time",
       "title": "Create time",
       "description": "The date when the model is created in the format YYYY-MM-DDTHH:MM::SS.",
-      "examples": ["2022-04-29T00:20:06.703547Z"],
+      "examples": [
+        "2022-04-29T00:20:06.703547Z"
+      ],
       "readOnly": true,
       "ui_order": 8,
       "ui_hidden": true,
@@ -154,24 +220,31 @@
       "format": "date-time",
       "title": "Update time",
       "description": "The date when the model is updated in the format YYYY-MM-DDTHH:MM::SS.",
-      "examples": ["2022-04-29T01:33:34.910865Z"],
+      "examples": [
+        "2022-04-29T01:33:34.910865Z"
+      ],
       "readOnly": true,
       "ui_order": 9,
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "text"
     },
     "delete_time": {
-      "type": ["string", "null"],
+      "type": [
+        "string",
+        "null"
+      ],
       "format": "date-time",
       "title": "Delete time",
       "description": "The date when the model is deleted in the format YYYY-MM-DDTHH:MM::SS.",
-      "examples": ["2022-04-29T01:33:34.910865Z"],
+      "examples": [
+        "2022-04-29T01:33:34.910865Z"
+      ],
       "readOnly": true,
       "ui_order": 9,
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "text"
     }
   }
-}
+}
diff --git a/config/model/model_definition.json b/config/model/model_definition.json
@@ -90,20 +90,51 @@
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "select",
-      "ui_enum": ["", "Offline", "Online", "Error"]
+      "ui_enum": [
+        "",
+        "Offline",
+        "Online",
+        "Error"
+      ]
     },
     "task": {
       "type": "string",
       "title": "Task",
       "description": "The model task",
-      "enum": ["TASK_UNSPECIFIED", "TASK_CLASSIFICATION", "TASK_DETECTION", "TASK_KEYPOINT", "TASK_INSTANCE_SEGMENTATION", "TASK_SEMANTIC_SEGMENTATION", "TASK_OCR", "TASK_TEXT_TO_IMAGE", "TASK_TEXT_GENERATION"],
+      "enum": [
+        "TASK_UNSPECIFIED",
+        "TASK_CLASSIFICATION",
+        "TASK_DETECTION",
+        "TASK_KEYPOINT",
+        "TASK_INSTANCE_SEGMENTATION",
+        "TASK_SEMANTIC_SEGMENTATION",
+        "TASK_OCR",
+        "TASK_TEXT_TO_IMAGE",
+        "TASK_IMAGE_TO_IMAGE",
+        "TASK_TEXT_GENERATION",
+        "TASK_TEXT_GENERATION_CHAT",
+        "TASK_VISUAL_QUESTION_ANSWERING"
+      ],
       "default": "TASK_UNSPECIFIED",
       "readOnly": true,
       "ui_order": 4,
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "select",
-      "ui_enum": ["", "Image classification", "Object detection", "Keypoint detection", "Instance segmentation", "Semantic segmentation", "OCR", "Text to image", "Text generation"]
+      "ui_enum": [
+        "",
+        "Image classification",
+        "Object detection",
+        "Keypoint detection",
+        "Instance segmentation",
+        "Semantic segmentation",
+        "OCR",
+        "Text to image",
+        "Image to image",
+        "Text generation",
+        "Text generation chat",
+        "Visual question answering"
+      ]
     },
     "model_spec": {
       "type": "object",
@@ -133,7 +164,13 @@
       "ui_hidden": true,
       "ui_disabled": true,
       "ui_component": "select",
-      "ui_enum": ["", "Alpha", "Beta", "Generally available", "Custom"]
+      "ui_enum": [
+        "",
+        "Alpha",
+        "Beta",
+        "Generally available",
+        "Custom"
+      ]
     }
   }
-}
+}
diff --git a/integration-test/const.js b/integration-test/const.js
@@ -19,10 +19,10 @@ export const defaultPassword = "password"
 export const gRPCPrivateHost = "model-backend:3083"
 export const apiPrivateHost = "http://model-backend:3083"
 
-export const gRPCPublicHost = apiGatewayMode ? `${__ENV.API_GATEWAY_URL}`: `api-gateway:8080`
+export const gRPCPublicHost = apiGatewayMode ? `${__ENV.API_GATEWAY_URL}` : `api-gateway:8080`
 export const apiPublicHost = apiGatewayMode ? `${proto}://${__ENV.API_GATEWAY_URL}/model` : `http://api-gateway:8080/model`
 
-export const mgmtGRPCPublicHost =  apiGatewayMode ? `${__ENV.API_GATEWAY_URL}`: `api-gateway:8080`
+export const mgmtGRPCPublicHost = apiGatewayMode ? `${__ENV.API_GATEWAY_URL}` : `api-gateway:8080`
 export const mgmtPublicHost = apiGatewayMode ? `${proto}://${__ENV.API_GATEWAY_URL}/core` : `http://api-gateway:8080/core`
 
 export const mgmtGRPCPrivateHost = "mgmt-backend:3084"
@@ -43,7 +43,10 @@ export const semantic_segmentation_model_bz9 = open(`${__ENV.TEST_FOLDER_ABS_PAT
 export const instance_segmentation_model = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-instance-segmentation-model.zip`, "b");
 export const instance_segmentation_model_bz9 = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-instance-segmentation-model-bz9.zip`, "b");
 export const text_to_image_model = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-text-to-image-model.zip`, "b");
+export const image_to_image_model = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-image-to-image-model.zip`, "b");
 export const text_generation_model = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-text-generation-model.zip`, "b");
+export const text_generation_chat_model = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-text-generation-chat-model.zip`, "b");
+export const visual_question_answering = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dummy-visual-question-answering-model.zip`, "b");
 
 
 export const dog_img = open(`${__ENV.TEST_FOLDER_ABS_PATH}/integration-test//data/dog.jpg`, "b");

diff --git a/integration-test/data/dummy-image-to-image-model.zip b/integration-test/data/dummy-image-to-image-model.zip
diff --git a/integration-test/data/dummy-text-generation-chat-model-not-supported-not-ensemble.zip b/integration-test/data/dummy-text-generation-chat-model-not-supported-not-ensemble.zip
diff --git a/integration-test/data/dummy-text-generation-chat-model.zip b/integration-test/data/dummy-text-generation-chat-model.zip
diff --git a/integration-test/data/dummy-text-generation-model.zip b/integration-test/data/dummy-text-generation-model.zip
diff --git a/integration-test/data/dummy-visual-question-answering-model.zip b/integration-test/data/dummy-visual-question-answering-model.zip
diff --git a/integration-test/proto/common/task/v1alpha/task.proto b/integration-test/proto/common/task/v1alpha/task.proto
@@ -22,10 +22,14 @@ enum Task {
   TASK_TEXT_TO_IMAGE = 7;
   // Task: TEXT GENERATION
   TASK_TEXT_GENERATION = 8;
+  // Task: TEXT GENERATION CHAT
+  TASK_TEXT_GENERATION_CHAT = 9;
+  // Task: VISUAL QUESTION ANSWERING
+  TASK_VISUAL_QUESTION_ANSWERING = 10;
   // Task: IMAGE TO IMAGE
-  TASK_IMAGE_TO_IMAGE = 9;
+  TASK_IMAGE_TO_IMAGE = 11;
   // Task: TEXT EMBEDDINGS
-  TASK_TEXT_EMBEDDINGS = 10;
+  TASK_TEXT_EMBEDDINGS = 12;
   // Task: SPEECH RECOGNITION
-  TASK_SPEECH_RECOGNITION = 11;
+  TASK_SPEECH_RECOGNITION = 13;
 }
diff --git a/integration-test/proto/model/model/v1alpha/common.proto b/integration-test/proto/model/model/v1alpha/common.proto
@@ -8,11 +8,28 @@ import "google/api/field_behavior.proto";
 // BoundingBox represents the bounding box data structure
 message BoundingBox {
   // Bounding box top y-axis value
-  float top = 1 [ (google.api.field_behavior) = OUTPUT_ONLY ];
+  float top = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
   // Bounding box left x-axis value
-  float left = 2 [ (google.api.field_behavior) = OUTPUT_ONLY ];
+  float left = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
   // Bounding box width value
-  float width = 3 [ (google.api.field_behavior) = OUTPUT_ONLY ];
+  float width = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
   // Bounding box height value
-  float height = 4 [ (google.api.field_behavior) = OUTPUT_ONLY ];
+  float height = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
+}
+
+// Additional hyperparameters for model inferences
+// or other configuration not listed in protobuf
+message ExtraParamObject {
+  // Name of the hyperparameter
+  string param_name = 1;
+  // Value of the hyperparameter
+  string param_value = 2;
+}
+
+// Conversation based prompt for text generation model
+message ConversationObject {
+  // Role name of the conversation
+  string role = 1;
+  // Content of the conversation
+  string content = 2;
 }
diff --git a/integration-test/proto/model/model/v1alpha/model.proto b/integration-test/proto/model/model/v1alpha/model.proto
@@ -23,7 +23,10 @@ import "../../../model/model/v1alpha/task_ocr.proto";
 import "../../../model/model/v1alpha/task_instance_segmentation.proto";
 import "../../../model/model/v1alpha/task_semantic_segmentation.proto";
 import "../../../model/model/v1alpha/task_text_to_image.proto";
+import "../../../model/model/v1alpha/task_image_to_image.proto";
 import "../../../model/model/v1alpha/task_text_generation.proto";
+import "../../../model/model/v1alpha/task_text_generation_chat.proto";
+import "../../../model/model/v1alpha/task_visual_question_answering.proto";
 import "../../../model/model/v1alpha/task_unspecified.proto";
 
 // LivenessRequest represents a request to check a service liveness status
@@ -467,10 +470,16 @@ message TaskInput {
     SemanticSegmentationInput semantic_segmentation = 6;
     // The text to image input
     TextToImageInput text_to_image = 7;
+    // The image to image input
+    ImageToImageInput image_to_image = 8;
     // The text generation input
-    TextGenerationInput text_generation = 8;
+    TextGenerationInput text_generation = 9;
+    // The text generation chat input
+    TextGenerationChatInput text_generation_chat = 10;
+    // The visual question answering input
+    VisualQuestionAnsweringInput visual_question_answering = 11;
     // The unspecified task input
-    UnspecifiedInput unspecified = 9;
+    UnspecifiedInput unspecified = 12;
   }
 }
 
@@ -492,10 +501,16 @@ message TaskInputStream {
     SemanticSegmentationInputStream semantic_segmentation = 6;
     // The text to image input
     TextToImageInput text_to_image = 7;
+    // The image to image input
+    ImageToImageInput image_to_image = 8;
     // The text generation input
-    TextGenerationInput text_generation = 8;
+    TextGenerationInput text_generation = 9;
+    // The text generation chat input
+    TextGenerationChatInput text_generation_chat = 10;
+    // The visual question answering input
+    VisualQuestionAnsweringInput visual_question_answering = 11;
     // The unspecified task input
-    UnspecifiedInput unspecified = 9;
+    UnspecifiedInput unspecified = 12;
   }
 }
 
@@ -517,10 +532,16 @@ message TaskOutput {
     SemanticSegmentationOutput semantic_segmentation = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
     // The text to image output
     TextToImageOutput text_to_image = 7 [(google.api.field_behavior) = OUTPUT_ONLY];
+    // The image to image output
+    ImageToImageOutput image_to_image = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
     // The text generation output
-    TextGenerationOutput text_generation = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
+    TextGenerationOutput text_generation = 9 [(google.api.field_behavior) = OUTPUT_ONLY];
+    // The text generation chat output
+    TextGenerationChatOutput text_generation_chat = 10 [(google.api.field_behavior) = OUTPUT_ONLY];
+    // The visual question answering output
+    VisualQuestionAnsweringOutput visual_question_answering = 11 [(google.api.field_behavior) = OUTPUT_ONLY];
     // The unspecified task output
-    UnspecifiedOutput unspecified = 9 [(google.api.field_behavior) = OUTPUT_ONLY];
+    UnspecifiedOutput unspecified = 12 [(google.api.field_behavior) = OUTPUT_ONLY];
   }
 }