From a3849b4d74417137d07aeb75580d09e14859aeab Mon Sep 17 00:00:00 2001
From: Nathalie Jonathan <143617992+nathaliellenaa@users.noreply.github.com>
Date: Tue, 31 Dec 2024 03:57:27 -0800
Subject: [PATCH] Add ML Predict, Train APIs (#755)

* Added ML predict and train APIs, updated CHANGELOG.

Signed-off-by: Nathalie Jonathan

* Added ML train and predict API, fixed test for predict and train API, defined ML status as an enum, updated ML task state enum, updated CHANGELOG.md.

Signed-off-by: Nathalie Jonathan

* Updated examples to use books themed data, removed output from predict and train_predict API, fixed vale complaints.

Signed-off-by: Nathalie Jonathan

* Removed excluded parts, changed files path.

Signed-off-by: Nathalie Jonathan

---------

Signed-off-by: Nathalie Jonathan
---
 CHANGELOG.md                               |   1 +
 spec/namespaces/ml.yaml                    | 134 +++++++++++++
 spec/schemas/ml._common.yaml               | 221 +++++++++++++++++++++
 tests/plugins/ml/ml/predict.yaml           |  70 +++++++
 tests/plugins/ml/ml/train.yaml             |  55 +++++
 tests/plugins/ml/ml/train_and_predict.yaml |  84 ++++++++
 6 files changed, 565 insertions(+)
 create mode 100644 tests/plugins/ml/ml/predict.yaml
 create mode 100644 tests/plugins/ml/ml/train.yaml
 create mode 100644 tests/plugins/ml/ml/train_and_predict.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52954a1d..e0a81795 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Added support for combining output variables ([#737](https://github.com/opensearch-project/opensearch-api-specification/pull/737))
 - Added 404 response to `/_search/scroll` ([#749](https://github.com/opensearch-project/opensearch-api-specification/pull/749))
 - Added `node_failures` to `DELETE /_search/scroll` and `DELETE /_search/scroll/{scroll_id}` ([#749](https://github.com/opensearch-project/opensearch-api-specification/pull/749))
+- Added `POST /_plugins/_ml/_train/{algorithm_name}`, `_predict/{algorithm_name}/{model_id}`, and `_train_predict/{algorithm_name}` ([#755](https://github.com/opensearch-project/opensearch-api-specification/pull/755))
 
 ### Removed
 - Removed unsupported `_common.mapping:SourceField`'s `mode` field and associated `_common.mapping:SourceFieldMode` enum ([#652](https://github.com/opensearch-project/opensearch-api-specification/pull/652))
diff --git a/spec/namespaces/ml.yaml b/spec/namespaces/ml.yaml
index 67ec2460..25ec908b 100644
--- a/spec/namespaces/ml.yaml
+++ b/spec/namespaces/ml.yaml
@@ -102,6 +102,43 @@ paths:
       responses:
         '200':
           $ref: '#/components/responses/ml.search_models@200'
+  /_plugins/_ml/_predict/{algorithm_name}/{model_id}:
+    post:
+      operationId: ml.predict.0
+      x-operation-group: ml.predict
+      description: Predicts new data with trained model.
+      parameters:
+        - $ref: '#/components/parameters/ml.predict::path.algorithm_name'
+        - $ref: '#/components/parameters/ml.predict::path.model_id'
+      requestBody:
+        $ref: '#/components/requestBodies/ml.predict'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.predict@200'
+  /_plugins/_ml/_train/{algorithm_name}:
+    post:
+      operationId: ml.train.0
+      x-operation-group: ml.train
+      description: Trains a model synchronously.
+      parameters:
+        - $ref: '#/components/parameters/ml.train::path.algorithm_name'
+      requestBody:
+        $ref: '#/components/requestBodies/ml.train'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.train@200'
+  /_plugins/_ml/_train_predict/{algorithm_name}:
+    post:
+      operationId: ml.train_predict.0
+      x-operation-group: ml.train_predict
+      description: Trains a model and predicts against the same training dataset.
+      parameters:
+        - $ref: '#/components/parameters/ml.train_predict::path.algorithm_name'
+      requestBody:
+        $ref: '#/components/requestBodies/ml.train_predict'
+      responses:
+        '200':
+          $ref: '#/components/responses/ml.train_predict@200'
   /_plugins/_ml/connectors/_create:
     post:
       operationId: ml.create_connector.0
@@ -212,6 +249,64 @@ components:
             required:
               - query
               - size
+    ml.predict:
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              input_query:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/InputQuery'
+              input_index:
+                type: array
+                items:
+                  type: string
+                description: The input index.
+              text_docs:
+                type: array
+                items:
+                  type: string
+                description: The text documents.
+              return_number:
+                type: boolean
+                description: Whether to return numeric values in the model output.
+              target_response:
+                type: array
+                items:
+                  type: string
+                description: The target response.
+    ml.train:
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              parameters:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/TrainParameters'
+              input_query:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/InputQuery'
+              input_index:
+                type: array
+                description: The input index.
+                items:
+                  type: string
+    ml.train_predict:
+      content:
+        application/json:
+          schema:
+            type: object
+            properties:
+              parameters:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/TrainParameters'
+              input_query:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/InputQuery'
+              input_index:
+                type: array
+                description: The input index.
+                items:
+                  type: string
+              input_data:
+                $ref: '../schemas/ml._common.yaml#/components/schemas/PredictionResult'
     ml.create_connector:
       content:
         application/json:
@@ -340,6 +435,21 @@ components:
         application/json:
           schema:
             $ref: '../schemas/ml._common.yaml#/components/schemas/SearchModelsResponse'
+    ml.predict@200:
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/PredictResponse'
+    ml.train@200:
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/TrainResponse'
+    ml.train_predict@200:
+      content:
+        application/json:
+          schema:
+            $ref: '../schemas/ml._common.yaml#/components/schemas/TrainPredictResponse'
     ml.create_connector@200:
       content:
         application/json:
@@ -403,6 +513,30 @@ components:
       required: true
       schema:
         type: string
+    ml.predict::path.algorithm_name:
+      name: algorithm_name
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.predict::path.model_id:
+      name: model_id
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.train::path.algorithm_name:
+      name: algorithm_name
+      in: path
+      required: true
+      schema:
+        type: string
+    ml.train_predict::path.algorithm_name:
+      name: algorithm_name
+      in: path
+      required: true
+      schema:
+        type: string
     ml.delete_connector::path.connector_id:
       name: connector_id
       in: path
diff --git a/spec/schemas/ml._common.yaml b/spec/schemas/ml._common.yaml
index 1b83fcfc..0c482a35 100644
--- a/spec/schemas/ml._common.yaml
+++ b/spec/schemas/ml._common.yaml
@@ -187,6 +187,225 @@ components:
         framework_type:
           type: string
           description: The framework type.
+    Status:
+      type: string
+      description: The status.
+      enum:
+        - CANCELLED
+        - COMPLETED
+        - COMPLETED_WITH_ERROR
+        - CREATED
+        - FAILED
+        - RUNNING
+    PredictResponse:
+      type: object
+      properties:
+        inference_results:
+          # Update this once the PR for ML Model APIs (#733) has been merged
+          type: array
+          items:
+            $ref: '#/components/schemas/InferenceResults'
+        status:
+          $ref: '#/components/schemas/Status'
+        prediction_result:
+          $ref: '#/components/schemas/PredictionResult'
+    InferenceResults:
+      type: object
+      properties:
+        output:
+          type: array
+          items:
+            $ref: '#/components/schemas/Output'
+    Output:
+      type: object
+      properties:
+        name:
+          type: string
+          description: The output name.
+        data_type:
+          type: string
+          description: The output data type.
+          enum:
+            - BOOLEAN
+            - FLOAT16
+            - FLOAT32
+            - FLOAT64
+            - INT32
+            - INT64
+            - INT8
+            - STRING
+            - UINT8
+            - UNKNOWN
+        shape:
+          type: array
+          items:
+            type: integer
+            format: int64
+          description: The output shape.
+        data:
+          type: array
+          items:
+            type: number
+            format: double
+          description: The output data.
+        byte_buffer:
+          $ref: '#/components/schemas/ByteBuffer'
+      required:
+        - data
+    ByteBuffer:
+      type: object
+      properties:
+        array:
+          type: string
+          description: The byte buffer array.
+        order:
+          type: string
+          description: The byte buffer order.
+          enum:
+            - BIG_ENDIAN
+            - LITTLE_ENDIAN
+    PredictionResult:
+      type: object
+      properties:
+        column_metas:
+          type: array
+          items:
+            $ref: '#/components/schemas/ColumnMeta'
+        rows:
+          type: array
+          items:
+            $ref: '#/components/schemas/Rows'
+    ColumnMeta:
+      type: object
+      properties:
+        name:
+          $ref: '_common.yaml#/components/schemas/Name'
+        column_type:
+          type: string
+          description: The column type.
+          enum:
+            - BOOLEAN
+            - DOUBLE
+            - INTEGER
+            - STRING
+    Rows:
+      type: object
+      properties:
+        values:
+          type: array
+          items:
+            $ref: '#/components/schemas/Values'
+    Values:
+      type: object
+      properties:
+        column_type:
+          type: string
+          description: The column type.
+          enum:
+            - BOOLEAN
+            - DOUBLE
+            - INTEGER
+            - STRING
+        value:
+          type: number
+          description: The value.
+    InputQuery:
+      type: object
+      properties:
+        _source:
+          type: array
+          items:
+            type: string
+          description: The source fields.
+        size:
+          type: integer
+          format: int64
+          description: The size of the query.
+        query:
+          $ref: '#/components/schemas/Query'
+    Query:
+      type: object
+      description: The query.
+      properties:
+        bool:
+          $ref: '#/components/schemas/BoolQuery'
+    BoolQuery:
+      type: object
+      description: The Boolean query.
+      properties:
+        filter:
+          type: array
+          description: Filter query that appears in matching documents.
+          items:
+            $ref: '#/components/schemas/Filter'
+    Filter:
+      type: object
+      description: The filter element.
+      properties:
+        range:
+          $ref: '#/components/schemas/Range'
+    Range:
+      type: object
+      description: The filter range.
+      properties:
+        k1:
+          $ref: '#/components/schemas/Key'
+        k2:
+          $ref: '#/components/schemas/Key'
+        k3:
+          $ref: '#/components/schemas/Key'
+    Key:
+      type: object
+      description: The key.
+      properties:
+        gte:
+          type: number
+          description: Greater than or equal to.
+        lte:
+          type: number
+          description: Less than or equal to.
+        gt:
+          type: number
+          description: Greater than.
+        lt:
+          type: number
+          description: Less than.
+    TrainParameters:
+      type: object
+      properties:
+        centroids:
+          type: integer
+          format: int64
+          description: The centroids.
+        iterations:
+          type: integer
+          format: int64
+          description: The iterations.
+        distance_type:
+          type: string
+          description: The distance type.
+          enum:
+            - COSINE
+            - EUCLIDEAN
+            - L1
+    TrainResponse:
+      type: object
+      properties:
+        model_id:
+          $ref: '_common.yaml#/components/schemas/Name'
+        status:
+          $ref: '#/components/schemas/Status'
+      required:
+        - status
+    TrainPredictResponse:
+      type: object
+      properties:
+        status:
+          $ref: '#/components/schemas/Status'
+        prediction_result:
+          $ref: '#/components/schemas/PredictionResult'
+      required:
+        - status
     ModelGroupRegistration:
       type: object
       properties:
@@ -236,9 +455,11 @@ components:
           type: string
           enum:
             - CANCELLED
+            - CANCELLING
             - COMPLETED
             - COMPLETED_WITH_ERROR
             - CREATED
+            - EXPIRED
             - FAILED
             - RUNNING
         task_type:
diff --git a/tests/plugins/ml/ml/predict.yaml b/tests/plugins/ml/ml/predict.yaml
new file mode 100644
index 00000000..cf2072f0
--- /dev/null
+++ b/tests/plugins/ml/ml/predict.yaml
@@ -0,0 +1,70 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test the prediction of new data with trained model.
+prologues:
+  - path: /_bulk
+    method: POST
+    request:
+      content_type: application/x-ndjson
+      payload:
+        - {index: {_index: books_data}}
+        - {pages: 320, price: 24.99, sales_first_month: 15000, average_rating: 4.5, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 450, price: 29.99, sales_first_month: 45000, average_rating: 4.8, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 280, price: 19.99, sales_first_month: 8000, average_rating: 3.9, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 380, price: 27.99, sales_first_month: 25000, average_rating: 4.2, publication_year: 2022}
+  - path: /books_data/_refresh
+    method: POST
+  - path: /_plugins/_ml/_train/{algorithm_name}
+    id: train_model
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_query:
+          _source:
+            - average_rating
+            - price
+            - sales_first_month
+          size: 10000
+        input_index:
+          - books_data
+    output:
+      model_id: payload.model_id
+epilogues:
+  - path: /_plugins/_ml/models/{model_id}
+    parameters:
+      model_id: ${train_model.model_id}
+    method: DELETE
+    status: [200, 404]
+  - path: /books_data
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Predict trained model.
+    path: /_plugins/_ml/_predict/{algorithm_name}/{model_id}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+      model_id: ${train_model.model_id}
+    request:
+      payload:
+        input_query:
+          _source:
+            - average_rating
+            - price
+            - sales_first_month
+          size: 10000
+        input_index:
+          - books_data
+    response:
+      status: 200
+      payload:
+        status: COMPLETED
diff --git a/tests/plugins/ml/ml/train.yaml b/tests/plugins/ml/ml/train.yaml
new file mode 100644
index 00000000..25a0ed72
--- /dev/null
+++ b/tests/plugins/ml/ml/train.yaml
@@ -0,0 +1,55 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test the training of a model synchronously.
+prologues:
+  - path: /_bulk
+    method: POST
+    request:
+      content_type: application/x-ndjson
+      payload:
+        - {index: {_index: books_data}}
+        - {pages: 320, price: 24.99, sales_first_month: 15000, average_rating: 4.5, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 450, price: 29.99, sales_first_month: 45000, average_rating: 4.8, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 280, price: 19.99, sales_first_month: 8000, average_rating: 3.9, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 380, price: 27.99, sales_first_month: 25000, average_rating: 4.2, publication_year: 2022}
+  - path: /books_data/_refresh
+    method: POST
+epilogues:
+  - path: /_plugins/_ml/models/{model_id}
+    parameters:
+      model_id: ${train_model.model_id}
+    method: DELETE
+    status: [200, 404]
+  - path: /books_data
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Train model synchronously.
+    id: train_model
+    path: /_plugins/_ml/_train/{algorithm_name}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_query:
+          _source:
+            - average_rating
+            - price
+            - sales_first_month
+          size: 10000
+        input_index:
+          - books_data
+    response:
+      status: 200
+      payload:
+        status: COMPLETED
+    output:
+      model_id: payload.model_id
diff --git a/tests/plugins/ml/ml/train_and_predict.yaml b/tests/plugins/ml/ml/train_and_predict.yaml
new file mode 100644
index 00000000..5da035c7
--- /dev/null
+++ b/tests/plugins/ml/ml/train_and_predict.yaml
@@ -0,0 +1,84 @@
+$schema: ../../../../json_schemas/test_story.schema.yaml
+
+description: Test training a model, then immediately predict against the same training dataset.
+prologues:
+  - path: /_bulk
+    method: POST
+    request:
+      content_type: application/x-ndjson
+      payload:
+        - {index: {_index: books_data}}
+        - {pages: 320, k1: 24.99, sales_first_month: 15000, average_rating: 4.5, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 450, k1: 29.99, sales_first_month: 45000, average_rating: 4.8, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 280, k1: 19.99, sales_first_month: 8000, average_rating: 3.9, publication_year: 2022}
+        - {index: {_index: books_data}}
+        - {pages: 380, k1: 27.99, sales_first_month: 25000, average_rating: 4.2, publication_year: 2022}
+  - path: /books_data/_refresh
+    method: POST
+epilogues:
+  - path: /books_data
+    method: DELETE
+    status: [200, 404]
+chapters:
+  - synopsis: Train and predict with indexed data.
+    id: train_predict_model
+    path: /_plugins/_ml/_train_predict/{algorithm_name}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_query:
+          query:
+            bool:
+              filter:
+                - range:
+                    k1:
+                      gte: 15.99
+                      lte: 25.99
+          size: 10
+        input_index:
+          - books_data
+    response:
+      status: 200
+      payload:
+        status: COMPLETED
+  - synopsis: Train and predict with data directly.
+    id: train_predict_model_with_data
+    path: /_plugins/_ml/_train_predict/{algorithm_name}
+    method: POST
+    parameters:
+      algorithm_name: KMEANS
+    request:
+      payload:
+        parameters:
+          centroids: 3
+          iterations: 10
+          distance_type: COSINE
+        input_data:
+          column_metas:
+            - name: k1
+              column_type: DOUBLE
+            - name: average_rating
+              column_type: DOUBLE
+          rows:
+            - values:
+                - column_type: DOUBLE
+                  value: 24.99
+                - column_type: DOUBLE
+                  value: 4.5
+            - values:
+                - column_type: DOUBLE
+                  value: 29.99
+                - column_type: DOUBLE
+                  value: 4.8
+    response:
+      status: 200
+      payload:
+        status: COMPLETED