From a3849b4d74417137d07aeb75580d09e14859aeab Mon Sep 17 00:00:00 2001
From: Nathalie Jonathan <143617992+nathaliellenaa@users.noreply.github.com>
Date: Tue, 31 Dec 2024 03:57:27 -0800
Subject: [PATCH] Add ML Predict, Train APIs (#755)

* Added ML predict and train APIs, updated CHANGELOG.

Signed-off-by: Nathalie Jonathan <nathhjo@amazon.com>

* Added ML train and predict API, fixed test for predict and train API, defined ML status as an enum, updated ML task state enum, updated CHANGELOG.md.

Signed-off-by: Nathalie Jonathan <nathhjo@amazon.com>

* Updated examples to use book-themed data, removed output from predict and train_predict API, fixed Vale complaints.

Signed-off-by: Nathalie Jonathan <nathhjo@amazon.com>

* Removed excluded parts, changed files path.

Signed-off-by: Nathalie Jonathan <nathhjo@amazon.com>

---------

Signed-off-by: Nathalie Jonathan <nathhjo@amazon.com>
---
 CHANGELOG.md                               |   1 +
 spec/namespaces/ml.yaml                    | 134 +++++++++++++
 spec/schemas/ml._common.yaml               | 221 +++++++++++++++++++++
 tests/plugins/ml/ml/predict.yaml           |  70 +++++++
 tests/plugins/ml/ml/train.yaml             |  55 +++++
 tests/plugins/ml/ml/train_and_predict.yaml |  84 ++++++++
 6 files changed, 565 insertions(+)
 create mode 100644 tests/plugins/ml/ml/predict.yaml
 create mode 100644 tests/plugins/ml/ml/train.yaml
 create mode 100644 tests/plugins/ml/ml/train_and_predict.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52954a1d..e0a81795 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Added support for combining output variables ([#737](https://github.com/opensearch-project/opensearch-api-specification/pull/737))
 - Added 404 response to `/_search/scroll` ([#749](https://github.com/opensearch-project/opensearch-api-specification/pull/749))
 - Added `node_failures` to `DELETE /_search/scroll` and `DELETE /_search/scroll/{scroll_id}` ([#749](https://github.com/opensearch-project/opensearch-api-specification/pull/749))
+- Added `POST /_plugins/_ml/_train/{algorithm_name}`, `_predict/{algorithm_name}/{model_id}`, and `_train_predict/{algorithm_name}` ([#755](https://github.com/opensearch-project/opensearch-api-specification/pull/755))
 
 ### Removed
 - Removed unsupported `_common.mapping:SourceField`'s `mode` field and associated `_common.mapping:SourceFieldMode` enum ([#652](https://github.com/opensearch-project/opensearch-api-specification/pull/652))
diff --git a/spec/namespaces/ml.yaml b/spec/namespaces/ml.yaml
index 67ec2460..25ec908b 100644
--- a/spec/namespaces/ml.yaml
+++ b/spec/namespaces/ml.yaml
@@ -102,6 +102,43 @@ paths:
       responses:
         '200':
           $ref: '#/components/responses/ml.search_models@200'
+  /_plugins/_ml/_predict/{algorithm_name}/{model_id}:  # Predict endpoint; only POST is defined and both path parameters are required.
+    post:
+      operationId: ml.predict.0
+      x-operation-group: ml.predict
+      description: Predicts new data with trained model.
+      parameters:
+        - $ref: '#/components/parameters/ml.predict::path.algorithm_name'
+        - $ref: '#/components/parameters/ml.predict::path.model_id'
+      requestBody:
+        $ref: '#/components/requestBodies/ml.predict'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.predict@200'
+  /_plugins/_ml/_train/{algorithm_name}:  # Train endpoint; only POST is defined.
+    post:
+      operationId: ml.train.0
+      x-operation-group: ml.train
+      description: Trains a model synchronously.
+      parameters:
+        - $ref: '#/components/parameters/ml.train::path.algorithm_name'
+      requestBody:
+        $ref: '#/components/requestBodies/ml.train'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.train@200'
+  /_plugins/_ml/_train_predict/{algorithm_name}:  # Combined train-and-predict endpoint; only POST is defined.
+    post:
+      operationId: ml.train_predict.0
+      x-operation-group: ml.train_predict
+      description: Trains a model and predicts against the same training dataset.
+      parameters:
+        - $ref: '#/components/parameters/ml.train_predict::path.algorithm_name'
+      requestBody:
+        $ref: '#/components/requestBodies/ml.train_predict'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.train_predict@200'
   /_plugins/_ml/connectors/_create:
     post:
       operationId: ml.create_connector.0
@@ -212,6 +249,64 @@ components:
             required:
               - query
               - size
+    ml.predict:  # Request body for POST /_plugins/_ml/_predict/{algorithm_name}/{model_id}; no property is marked required.
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              input_query:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/InputQuery'
+              input_index:
+                type: array
+                items:
+                  type: string
+                description: The input index.
+              text_docs:
+                type: array
+                items:
+                  type: string
+                description: The text documents.
+              return_number:
+                type: boolean
+                description: Whether to return numeric values in the model output.
+              target_response:
+                type: array
+                items:
+                  type: string
+                description: The target response.
+    ml.train:  # Request body for POST /_plugins/_ml/_train/{algorithm_name}; no property is marked required.
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              parameters:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/TrainParameters'
+              input_query:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/InputQuery'
+              input_index:
+                type: array
+                description: The input index.
+                items:
+                  type: string
+    ml.train_predict:  # Request body for POST /_plugins/_ml/_train_predict/{algorithm_name}; no property is marked required.
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              parameters:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/TrainParameters'
+              input_query:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/InputQuery'
+              input_index:
+                type: array
+                description: The input index.
+                items:
+                  type: string
+              input_data:  # NOTE(review): reuses the response-side PredictionResult schema for request input — confirm this sharing is intended.
+                $ref: '../schemas/ml._common.yaml#/components/schemas/PredictionResult'
     ml.create_connector:
       content:
         application/json:
@@ -340,6 +435,21 @@ components:
         application/json:
           schema:
             $ref: '../schemas/ml._common.yaml#/components/schemas/SearchModelsResponse'
+    ml.predict@200:  # Successful predict response; body follows PredictResponse.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/PredictResponse'
+    ml.train@200:  # Successful train response; body follows TrainResponse.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/TrainResponse'
+    ml.train_predict@200:  # Successful train-and-predict response; body follows TrainPredictResponse.
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/TrainPredictResponse'
     ml.create_connector@200:
       content:
         application/json:
@@ -403,6 +513,30 @@ components:
       required: true
       schema:
         type: string
+    ml.predict::path.algorithm_name:  # Required string path segment; the test stories pass KMEANS.
+      name: algorithm_name
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.predict::path.model_id:  # Required string path segment; the predict story fills it from the train response's model_id.
+      name: model_id
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.train::path.algorithm_name:  # Required string path segment; the test stories pass KMEANS.
+      name: algorithm_name
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.train_predict::path.algorithm_name:  # Required string path segment; the test stories pass KMEANS.
+      name: algorithm_name
+      in: path
+      required: true
+      schema:
+        type: string
     ml.delete_connector::path.connector_id:
       name: connector_id
       in: path
diff --git a/spec/schemas/ml._common.yaml b/spec/schemas/ml._common.yaml
index 1b83fcfc..0c482a35 100644
--- a/spec/schemas/ml._common.yaml
+++ b/spec/schemas/ml._common.yaml
@@ -187,6 +187,225 @@ components:
         framework_type:
           type: string
           description: The framework type.
+    Status:  # Shared status enum referenced by PredictResponse, TrainResponse, and TrainPredictResponse.
+      type: string
+      description: The status.
+      enum:
+        - CANCELLED
+        - COMPLETED
+        - COMPLETED_WITH_ERROR
+        - CREATED
+        - FAILED
+        - RUNNING
+    PredictResponse:  # Body of ml.predict@200; none of its properties is marked required.
+      type: object
+      properties:
+        inference_results:
+          # TODO: Update this once the PR for ML Model APIs (#733) has been merged.
+          type: array
+          items:
+            $ref: '#/components/schemas/InferenceResults'
+        status:
+          $ref: '#/components/schemas/Status'
+        prediction_result:
+          $ref: '#/components/schemas/PredictionResult'
+    InferenceResults:  # One element of PredictResponse.inference_results; wraps a list of Output entries.
+      type: object
+      properties:
+        output:
+          type: array
+          items:
+            $ref: '#/components/schemas/Output'
+    Output:  # A single output entry of InferenceResults; only `data` is required.
+      type: object
+      properties:
+        name:
+          type: string
+          description: The output name.
+        data_type:
+          type: string
+          description: The output data type.
+          enum:
+            - BOOLEAN
+            - FLOAT16
+            - FLOAT32
+            - FLOAT64
+            - INT32
+            - INT64
+            - INT8
+            - STRING
+            - UINT8
+            - UNKNOWN
+        shape:
+          type: array
+          description: The output shape.
+          items:
+            type: integer
+            format: int64
+        data:
+          type: array
+          description: The output data.
+          items:
+            type: number
+            format: double
+        byte_buffer:
+          $ref: '#/components/schemas/ByteBuffer'
+      required:
+        - data
+    ByteBuffer:  # Referenced by Output.byte_buffer; `array` is a string and `order` is one of two byte-order names.
+      type: object
+      properties:
+        array:
+          type: string
+          description: The byte buffer array.
+        order:
+          type: string
+          description: The byte buffer order.
+          enum:
+            - BIG_ENDIAN
+            - LITTLE_ENDIAN
+    PredictionResult:  # Column metadata plus rows; used by the predict/train_predict responses and by the ml.train_predict `input_data` request field.
+      type: object
+      properties:
+        column_metas:
+          type: array
+          items:
+            $ref: '#/components/schemas/ColumnMeta'
+        rows:
+          type: array
+          items:
+            $ref: '#/components/schemas/Rows'
+    ColumnMeta:  # Describes one column of a PredictionResult: its name and its type.
+      type: object
+      properties:
+        name:
+          $ref: '_common.yaml#/components/schemas/Name'
+        column_type:
+          type: string
+          description: The column type.
+          enum:
+            - BOOLEAN
+            - DOUBLE
+            - INTEGER
+            - STRING
+    Rows:  # One element of PredictionResult.rows; holds a list of Values entries.
+      type: object
+      properties:
+        values:
+          type: array
+          items:
+            $ref: '#/components/schemas/Values'
+    Values:  # One cell of a row: a column type paired with a value.
+      type: object
+      properties:
+        column_type:
+          type: string
+          description: The column type.
+          enum:
+            - BOOLEAN
+            - DOUBLE
+            - INTEGER
+            - STRING
+        value:
+          type: number  # NOTE(review): column_type also allows BOOLEAN and STRING, which `number` cannot represent — confirm numeric-only values are intended.
+          description: The value.
+    InputQuery:  # Query portion of the predict/train/train_predict request bodies; no property is marked required.
+      type: object
+      properties:
+        _source:
+          type: array
+          items:
+            type: string
+          description: The source fields.
+        size:
+          type: integer
+          format: int64
+          description: The size of the query.
+        query:
+          $ref: '#/components/schemas/Query'
+    Query:  # Only the `bool` property is declared for this query object.
+      type: object
+      description: The query.
+      properties:
+        bool:
+          $ref: '#/components/schemas/BoolQuery'
+    BoolQuery:  # Only the `filter` clause is declared for this Boolean query.
+      type: object
+      description: The Boolean query.
+      properties:
+        filter:
+          type: array
+          description: Filter query that appears in matching documents.
+          items:
+            $ref: '#/components/schemas/Filter'
+    Filter:  # Only the `range` property is declared for a filter element.
+      type: object
+      description: The filter element.
+      properties:
+        range:
+          $ref: '#/components/schemas/Range'
+    Range:  # NOTE(review): only the literal field names k1, k2, and k3 are declared; the train_predict story indexes a `k1` field to match — confirm whether arbitrary field names should be modelled.
+      type: object
+      description: The filter range.
+      properties:
+        k1:
+          $ref: '#/components/schemas/Key'
+        k2:
+          $ref: '#/components/schemas/Key'
+        k3:
+          $ref: '#/components/schemas/Key'
+    Key:  # Numeric bounds for one field of a Range; every bound is optional.
+      type: object
+      description: The key.
+      properties:
+        gte:
+          type: number
+          description: Greater than or equal to.
+        lte:
+          type: number
+          description: Less than or equal to.
+        gt:
+          type: number
+          description: Greater than.
+        lt:
+          type: number
+          description: Less than.
+    TrainParameters:  # NOTE(review): the test stories only exercise these with KMEANS — confirm parameters for other algorithms are intentionally omitted.
+      type: object
+      properties:
+        centroids:
+          type: integer
+          format: int64
+          description: The centroids.
+        iterations:
+          type: integer
+          format: int64
+          description: The iterations.
+        distance_type:
+          type: string
+          description: The distance type.
+          enum:
+            - COSINE
+            - EUCLIDEAN
+            - L1
+    TrainResponse:  # Body of ml.train@200; `status` is required, `model_id` is optional.
+      type: object
+      properties:
+        model_id:
+          $ref: '_common.yaml#/components/schemas/Name'
+        status:
+          $ref: '#/components/schemas/Status'
+      required:
+        - status
+    TrainPredictResponse:  # Body of ml.train_predict@200; `status` is required, `prediction_result` is optional.
+      type: object
+      properties:
+        status:
+          $ref: '#/components/schemas/Status'
+        prediction_result:
+          $ref: '#/components/schemas/PredictionResult'
+      required:
+        - status
     ModelGroupRegistration:
       type: object
       properties:
@@ -236,9 +455,11 @@ components:
           type: string
           enum:
             - CANCELLED
+            - CANCELLING
             - COMPLETED
             - COMPLETED_WITH_ERROR
             - CREATED
+            - EXPIRED
             - FAILED
             - RUNNING
         task_type:
diff --git a/tests/plugins/ml/ml/predict.yaml b/tests/plugins/ml/ml/predict.yaml
new file mode 100644
index 00000000..cf2072f0
--- /dev/null
+++ b/tests/plugins/ml/ml/predict.yaml
@@ -0,0 +1,70 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test the prediction of new data with trained model.
+prologues:
+  - path: /_bulk  # Seed the `books_data` index that training and prediction read from.
+    method: POST
+    request:
+      content_type: application/x-ndjson
+      payload:
+        - {index: {_index: books_data}}
+        - {pages: 320, price: 24.99, sales_first_month: 15000, average_rating: 4.5, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 450, price: 29.99, sales_first_month: 45000, average_rating: 4.8, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 280, price: 19.99, sales_first_month: 8000, average_rating: 3.9, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 380, price: 27.99, sales_first_month: 25000, average_rating: 4.2, publication_year: 2022}
+  - path: /books_data/_refresh  # Refresh so the bulk-indexed documents are searchable.
+    method: POST
+  - path: /_plugins/_ml/_train/{algorithm_name}  # Train a model; its ID is captured below for the chapter and the epilogue.
+    id: train_model
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_query:
+          _source:
+            - average_rating
+            - price
+            - sales_first_month
+          size: 10000
+        input_index:
+          - books_data
+    output:
+      model_id: payload.model_id
+epilogues:
+  - path: /_plugins/_ml/models/{model_id}  # Clean up the trained model; 404 is accepted.
+    parameters:
+      model_id: ${train_model.model_id}
+    method: DELETE
+    status: [200, 404]
+  - path: /books_data  # Clean up the seeded index; 404 is accepted.
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Predict trained model.
+    path: /_plugins/_ml/_predict/{algorithm_name}/{model_id}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+      model_id: ${train_model.model_id}
+    request:
+      payload:
+        input_query:
+          _source:
+            - average_rating
+            - price
+            - sales_first_month
+          size: 10000
+        input_index:
+          - books_data
+    response:
+      status: 200
+      payload:
+        status: COMPLETED
diff --git a/tests/plugins/ml/ml/train.yaml b/tests/plugins/ml/ml/train.yaml
new file mode 100644
index 00000000..25a0ed72
--- /dev/null
+++ b/tests/plugins/ml/ml/train.yaml
@@ -0,0 +1,55 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test the training of a model synchronously.
+prologues:
+  - path: /_bulk  # Seed the `books_data` index that training reads from.
+    method: POST
+    request:
+      content_type: application/x-ndjson
+      payload:
+        - {index: {_index: books_data}}
+        - {pages: 320, price: 24.99, sales_first_month: 15000, average_rating: 4.5, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 450, price: 29.99, sales_first_month: 45000, average_rating: 4.8, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 280, price: 19.99, sales_first_month: 8000, average_rating: 3.9, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 380, price: 27.99, sales_first_month: 25000, average_rating: 4.2, publication_year: 2022}
+  - path: /books_data/_refresh  # Refresh so the bulk-indexed documents are searchable.
+    method: POST
+epilogues:
+  - path: /_plugins/_ml/models/{model_id}  # Clean up the model created by the `train_model` chapter; 404 is accepted.
+    parameters:
+      model_id: ${train_model.model_id}
+    method: DELETE
+    status: [200, 404]
+  - path: /books_data  # Clean up the seeded index; 404 is accepted.
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Train model synchronously.
+    id: train_model
+    path: /_plugins/_ml/_train/{algorithm_name}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_query:
+          _source:
+            - average_rating
+            - price
+            - sales_first_month
+          size: 10000
+        input_index:
+          - books_data
+    response:
+      status: 200
+      payload:
+        status: COMPLETED
+    output:
+      model_id: payload.model_id
diff --git a/tests/plugins/ml/ml/train_and_predict.yaml b/tests/plugins/ml/ml/train_and_predict.yaml
new file mode 100644
index 00000000..5da035c7
--- /dev/null
+++ b/tests/plugins/ml/ml/train_and_predict.yaml
@@ -0,0 +1,84 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test training a model, then immediately predict against the same training dataset.
+prologues:
+  - path: /_bulk  # Documents carry a `k1` field so the range filter in the first chapter can target it.
+    method: POST
+    request:
+      content_type: application/x-ndjson
+      payload:
+        - {index: {_index: books_data}}
+        - {pages: 320, k1: 24.99, sales_first_month: 15000, average_rating: 4.5, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 450, k1: 29.99, sales_first_month: 45000, average_rating: 4.8, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 280, k1: 19.99, sales_first_month: 8000, average_rating: 3.9, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 380, k1: 27.99, sales_first_month: 25000, average_rating: 4.2, publication_year: 2022}
+  - path: /books_data/_refresh  # Refresh so the bulk-indexed documents are searchable.
+    method: POST
+epilogues:
+  - path: /books_data  # Clean up the seeded index; 404 is accepted.
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Train and predict with indexed data.
+    id: train_predict_model
+    path: /_plugins/_ml/_train_predict/{algorithm_name}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_query:
+          query:
+            bool:
+              filter:
+                - range:
+                    k1:
+                      gte: 15.99
+                      lte: 25.99
+          size: 10
+        input_index:
+          - books_data
+    response:
+      status: 200
+      payload:
+        status: COMPLETED
+  - synopsis: Train and predict with data directly.
+    id: train_predict_model_with_data  # Distinct from the first chapter's id so chapter ids stay unique.
+    path: /_plugins/_ml/_train_predict/{algorithm_name}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_data:
+          column_metas:
+            - name: k1
+              column_type: DOUBLE
+            - name: average_rating
+              column_type: DOUBLE
+          rows:
+            - values:  # Each row lists one value per declared column, in order: k1, then average_rating.
+                - column_type: DOUBLE
+                  value: 24.99
+                - column_type: DOUBLE
+                  value: 4.5
+            - values:
+                - column_type: DOUBLE
+                  value: 29.99
+                - column_type: DOUBLE
+                  value: 4.8
+    response:
+      status: 200
+      payload:
+        status: COMPLETED