create endpoint with InferenceAmiVersion (#602)
* x

* pick up changes from cohere-ai/cohere-aws#196

* misc cleanup and import fixes

---------

Co-authored-by: CoderHam <[email protected]>
CoderHam and hemant-co authored Nov 21, 2024
1 parent 044344f commit 756515a
Showing 2 changed files with 82 additions and 31 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ dist/
 __pycache__/
 poetry.toml
 .ruff_cache/
+.venv/
112 changes: 81 additions & 31 deletions src/cohere/manually_maintained/cohere_aws/client.py
@@ -3,14 +3,12 @@
 import tarfile
 import tempfile
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 
 from .classification import Classification, Classifications
 from .embeddings import Embeddings
 from .error import CohereError
-from .generation import (Generation, Generations,
-                         StreamingGenerations,
-                         TokenLikelihood)
+from .generation import Generations, StreamingGenerations
 from .chat import Chat, StreamingChat
 from .rerank import Reranking
 from .summary import Summary
@@ -135,7 +133,7 @@ def create_endpoint(
             instance_type (str, optional): The EC2 instance type to deploy the endpoint to. Defaults to "ml.g4dn.xlarge".
             n_instances (int, optional): Number of endpoint instances. Defaults to 1.
             recreate (bool, optional): Force re-creation of endpoint if it already exists. Defaults to False.
-            rool (str, optional): The IAM role to use for the endpoint. If not provided, sagemaker.get_execution_role()
+            role (str, optional): The IAM role to use for the endpoint. If not provided, sagemaker.get_execution_role()
                 will be used to get the role. This should work when one uses the client inside SageMaker. If this errors
                 out, the default role "ServiceRoleSagemaker" will be used, which generally works outside of SageMaker.
         """
@@ -150,6 +148,7 @@ def create_endpoint(
         kwargs = {}
         model_data = None
         validation_params = dict()
+        useBoto = False
         if s3_models_dir is not None:
             # If s3_models_dir is given, we assume to have custom fine-tuned models -> Algorithm
             kwargs["algorithm_arn"] = arn
@@ -163,6 +162,7 @@ def create_endpoint(
                 model_data_download_timeout=2400,
                 container_startup_health_check_timeout=2400
             )
+            useBoto = True
 
         # Out of precaution, check if there is an endpoint config and delete it if that's the case
         # Otherwise it might block deployment
@@ -171,30 +171,80 @@ def create_endpoint(
         except lazy_botocore().ClientError:
             pass
 
+        try:
+            self._service_client.delete_model(ModelName=endpoint_name)
+        except lazy_botocore().ClientError:
+            pass
+
         if role is None:
-            try:
-                role = lazy_sagemaker().get_execution_role()
-            except ValueError:
-                print("Using default role: 'ServiceRoleSagemaker'.")
-                role = "ServiceRoleSagemaker"
+            if useBoto:
+                accountID = lazy_sagemaker().account_id()
+                role = f"arn:aws:iam::{accountID}:role/ServiceRoleSagemaker"
+            else:
+                try:
+                    role = lazy_sagemaker().get_execution_role()
+                except ValueError:
+                    print("Using default role: 'ServiceRoleSagemaker'.")
+                    role = "ServiceRoleSagemaker"
 
-        model = lazy_sagemaker().ModelPackage(
-            role=role,
-            model_data=model_data,
-            sagemaker_session=self._sess,  # makes sure the right region is used
-            **kwargs
-        )
-
-        try:
-            model.deploy(
-                n_instances,
-                instance_type,
-                endpoint_name=endpoint_name,
-                **validation_params
-            )
-        except lazy_botocore().ParamValidationError:
-            # For at least some versions of python 3.6, SageMaker SDK does not support the validation_params
-            model.deploy(n_instances, instance_type, endpoint_name=endpoint_name)
+        # deploy fine-tuned model using sagemaker SDK
+        if s3_models_dir is not None:
+            model = lazy_sagemaker().ModelPackage(
+                role=role,
+                model_data=model_data,
+                sagemaker_session=self._sess,  # makes sure the right region is used
+                **kwargs
+            )
+
+            try:
+                model.deploy(
+                    n_instances,
+                    instance_type,
+                    endpoint_name=endpoint_name,
+                    **validation_params
+                )
+            except lazy_botocore().ParamValidationError:
+                # For at least some versions of python 3.6, SageMaker SDK does not support the validation_params
+                model.deploy(n_instances, instance_type, endpoint_name=endpoint_name)
+        else:
+            # deploy pre-trained model using boto to add InferenceAmiVersion
+            self._service_client.create_model(
+                ModelName=endpoint_name,
+                ExecutionRoleArn=role,
+                EnableNetworkIsolation=True,
+                PrimaryContainer={
+                    'ModelPackageName': arn,
+                },
+            )
+            self._service_client.create_endpoint_config(
+                EndpointConfigName=endpoint_name,
+                ProductionVariants=[
+                    {
+                        'VariantName': 'AllTraffic',
+                        'ModelName': endpoint_name,
+                        'InstanceType': instance_type,
+                        'InitialInstanceCount': n_instances,
+                        'InferenceAmiVersion': 'al2-ami-sagemaker-inference-gpu-2'
+                    },
+                ],
+            )
+            self._service_client.create_endpoint(
+                EndpointName=endpoint_name,
+                EndpointConfigName=endpoint_name,
+            )
+
+            waiter = self._service_client.get_waiter('endpoint_in_service')
+            try:
+                print(f"Waiting for endpoint {endpoint_name} to be in service...")
+                waiter.wait(
+                    EndpointName=endpoint_name,
+                    WaiterConfig={
+                        'Delay': 30,
+                        'MaxAttempts': 80
+                    }
+                )
+            except Exception as e:
+                raise CohereError(f"Failed to create endpoint: {e}")
 
         self.connect_to_endpoint(endpoint_name)
 
     def chat(
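
The new else branch above deploys pre-trained model packages through the raw boto3 SageMaker client rather than the SageMaker Python SDK; per the inline comment, the boto path exists so the endpoint config can pin InferenceAmiVersion. A minimal usage sketch of the changed method follows; the ARN, endpoint name, and instance type are illustrative placeholders, and `client` is assumed to be an already-constructed cohere_aws Client for the target region:

    # Hypothetical usage. Leaving s3_models_dir unset routes create_endpoint
    # through the new boto3 branch: create_model -> create_endpoint_config
    # (which pins InferenceAmiVersion) -> create_endpoint, then waits for the
    # endpoint to reach InService before connecting to it.
    client.create_endpoint(
        arn="arn:aws:sagemaker:us-east-1:111122223333:model-package/example",  # placeholder ARN
        endpoint_name="cohere-model-endpoint",  # placeholder name
        instance_type="ml.g5.xlarge",
        n_instances=1,
    )
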
@@ -725,12 +775,12 @@ def create_finetune(
         s3_resource = lazy_boto3().resource("s3")
 
         # Copy new model to root of output_model_dir
-        bucket, old_key = parse_s3_url(current_filepath)
-        _, new_key = parse_s3_url(f"{s3_models_dir}{name}.tar.gz")
+        bucket, old_key = lazy_sagemaker().s3.parse_s3_url(current_filepath)
+        _, new_key = lazy_sagemaker().s3.parse_s3_url(f"{s3_models_dir}{name}.tar.gz")
         s3_resource.Object(bucket, new_key).copy(CopySource={"Bucket": bucket, "Key": old_key})
 
         # Delete old dir
-        bucket, old_short_key = parse_s3_url(s3_models_dir + job_name)
+        bucket, old_short_key = lazy_sagemaker().s3.parse_s3_url(s3_models_dir + job_name)
         s3_resource.Bucket(bucket).objects.filter(Prefix=old_short_key).delete()
 
     def export_finetune(
@@ -791,12 +841,12 @@ def export_finetune(
         s3_resource = lazy_boto3().resource("s3")
 
         # Copy the exported TensorRT-LLM engine to the root of s3_output_dir
-        bucket, old_key = parse_s3_url(current_filepath)
-        _, new_key = parse_s3_url(f"{s3_output_dir}{name}.tar.gz")
+        bucket, old_key = lazy_sagemaker().s3.parse_s3_url(current_filepath)
+        _, new_key = lazy_sagemaker().s3.parse_s3_url(f"{s3_output_dir}{name}.tar.gz")
         s3_resource.Object(bucket, new_key).copy(CopySource={"Bucket": bucket, "Key": old_key})
 
         # Delete the old S3 directory
-        bucket, old_short_key = parse_s3_url(f"{s3_output_dir}{job_name}")
+        bucket, old_short_key = lazy_sagemaker().s3.parse_s3_url(f"{s3_output_dir}{job_name}")
         s3_resource.Bucket(bucket).objects.filter(Prefix=old_short_key).delete()
 
     def wait_for_finetune_job(self, job_id: str, timeout: int = 2*60*60) -> str:
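For reference, InferenceAmiVersion is a per-variant field of the SageMaker CreateEndpointConfig API, and the alias pinned by this commit, al2-ami-sagemaker-inference-gpu-2, selects an Amazon Linux 2 GPU inference image. A standalone boto3 sketch of the same call, with placeholder names, for anyone wanting the identical pinning outside this client:

    import boto3

    # Region resolution follows the usual boto3 rules; adjust as needed.
    sm = boto3.client("sagemaker")

    sm.create_endpoint_config(
        EndpointConfigName="demo-endpoint-config",  # placeholder
        ProductionVariants=[
            {
                "VariantName": "AllTraffic",
                "ModelName": "demo-model",  # placeholder; the model must already exist
                "InstanceType": "ml.g5.xlarge",
                "InitialInstanceCount": 1,
                # Same AMI alias the commit pins for GPU inference
                "InferenceAmiVersion": "al2-ami-sagemaker-inference-gpu-2",
            },
        ],
    )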
