Add Amazon Titan (#2165)
yifanmai authored Feb 17, 2024
1 parent 2db78bc commit aa3e20b
Showing 7 changed files with 268 additions and 5 deletions.
6 changes: 6 additions & 0 deletions setup.cfg
@@ -112,6 +112,11 @@ aleph-alpha =
    aleph-alpha-client~=2.14.0
    tokenizers>=0.13.3

amazon =
    boto3~=1.28.57
    awscli~=1.29.57
    botocore~=1.31.57

anthropic =
    anthropic~=0.2.5
    websocket-client~=1.3.2 # For legacy stanford-online-all-v4-s3
@@ -134,6 +139,7 @@ yandex =

models =
    crfm-helm[aleph-alpha]
    crfm-helm[amazon]
    crfm-helm[anthropic]
    crfm-helm[google]
    crfm-helm[mistral]
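With this change, the Bedrock dependencies install as an optional extra, e.g. `pip install "crfm-helm[amazon]"`, and the `crfm-helm[models]` meta-extra now pulls them in as well.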
6 changes: 5 additions & 1 deletion src/helm/benchmark/test_model_deployment_definition.py
@@ -53,7 +53,11 @@ def test_models_has_window_service(self, deployment_name: str):
             return

         # Can't test Vertex AI because it requires Google credentials
-        if "text-bison" in model.name or "text-unicorn" in model.name:
+        if deployment_name.startswith("google/"):
             return

+        # Can't test Bedrock because it requires Amazon credentials
+        if deployment_name.startswith("amazon/"):
+            return
+
         # Loads the model, window service and tokenizer
24 changes: 24 additions & 0 deletions src/helm/config/model_deployments.yaml
@@ -130,6 +130,30 @@ model_deployments:
    window_service_spec:
      class_name: "helm.benchmark.window_services.image_generation.clip_window_service.CLIPWindowService"


  # Amazon
  - name: amazon/titan-text-lite-v1
    model_name: amazon/titan-text-lite-v1
    tokenizer_name: huggingface/gpt2
    max_sequence_length: 4000
    client_spec:
      class_name: "helm.proxy.clients.bedrock_client.BedrockTitanClient"

  - name: amazon/titan-tg1-large
    model_name: amazon/titan-tg1-large
    tokenizer_name: huggingface/gpt2
    max_sequence_length: 8000
    client_spec:
      class_name: "helm.proxy.clients.bedrock_client.BedrockTitanClient"

  - name: amazon/titan-text-express-v1
    model_name: amazon/titan-text-express-v1
    tokenizer_name: huggingface/gpt2
    max_sequence_length: 8000
    client_spec:
      class_name: "helm.proxy.clients.bedrock_client.BedrockTitanClient"


  # Anthropic
  - name: anthropic/claude-v1.3
    model_name: anthropic/claude-v1.3
33 changes: 31 additions & 2 deletions src/helm/config/model_metadata.yaml
@@ -146,8 +146,7 @@ models:
  # release_date: TBD
  # # Does not support echo.
  # tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]


  - name: AlephAlpha/m-vader
    display_name: MultiFusion (13B)
    description: MultiFusion is a multimodal, multilingual diffusion model that extends the capabilities of Stable Diffusion v1.4 by integrating different pre-trained modules, which transfers their capabilities to the downstream model ([paper](https://arxiv.org/abs/2305.15296))
@@ -158,6 +157,36 @@
    tags: [TEXT_TO_IMAGE_MODEL_TAG]


  # Amazon
  # References for Amazon Titan models:
  # - https://aws.amazon.com/bedrock/titan/
  # - https://community.aws/content/2ZUVD3fkNtqEOYIa2iUJAFArS7c/family-of-titan-text-models---cli-demo
  # - https://aws.amazon.com/about-aws/whats-new/2023/11/amazon-titan-models-express-lite-bedrock/
  - name: amazon/titan-text-lite-v1
    display_name: Amazon Titan Text Lite
    description: Amazon Titan Text Lite is a lightweight, efficient model perfect for fine-tuning English-language tasks like summarization and copywriting. It caters to customers seeking a smaller, cost-effective, and highly customizable model. It supports various formats, including text generation, code generation, rich text formatting, and orchestration (agents). Key model attributes encompass fine-tuning, text generation, code generation, and rich text formatting.
    creator_organization_name: Amazon
    access: limited
    release_date: 2023-11-29
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]

  - name: amazon/titan-tg1-large
    display_name: Amazon Titan Large
    description: Amazon Titan Large is an efficient model well suited for fine-tuning English-language tasks such as summarization, article creation, and marketing campaigns.
    creator_organization_name: Amazon
    access: limited
    release_date: 2023-11-29
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]

  - name: amazon/titan-text-express-v1
    display_name: Amazon Titan Text Express
    description: Amazon Titan Text Express, with a context length of up to 8,000 tokens, excels in advanced language tasks like open-ended text generation and conversational chat. It's also optimized for Retrieval Augmented Generation (RAG). Initially designed for English, the model offers preview multilingual support for over 100 additional languages.
    creator_organization_name: Amazon
    access: limited
    release_date: 2023-11-29
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]


  # Anthropic
  - name: anthropic/claude-v1.3
    display_name: Anthropic Claude v1.3
128 changes: 128 additions & 0 deletions src/helm/proxy/clients/bedrock_client.py
@@ -0,0 +1,128 @@
from abc import abstractmethod
from copy import deepcopy
import json
import os
from typing import Any, Dict, List, Mapping, Optional

from helm.common.cache import CacheConfig
from helm.proxy.clients.client import CachingClient, truncate_and_tokenize_response_text
from helm.common.request import Request, RequestResult, Sequence, wrap_request_time
from helm.proxy.clients.bedrock_utils import get_bedrock_client
from helm.proxy.tokenizers.tokenizer import Tokenizer


JSON_CONTENT_TYPE = "application/json"


class BedrockClient(CachingClient):
    @abstractmethod
    def convert_request_to_raw_request(self, request: Request) -> Dict:
        raise NotImplementedError()

    @abstractmethod
    def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[Sequence]:
        raise NotImplementedError()

    def __init__(
        self,
        cache_config: CacheConfig,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        bedrock_model_id: Optional[str] = None,
        assumed_role: Optional[str] = None,
        region: Optional[str] = None,
    ):
        super().__init__(cache_config=cache_config)
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        self.bedrock_model_id = bedrock_model_id
        self.bedrock_client = get_bedrock_client(
            assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
            region=region or os.environ.get("AWS_DEFAULT_REGION", None),
        )

    def make_request(self, request: Request) -> RequestResult:
        # model_id should be something like "amazon.titan-tg1-large"
        model_id = self.bedrock_model_id if self.bedrock_model_id else request.model.replace("/", ".")
        raw_request = self.convert_request_to_raw_request(request)

        # modelId isn't part of raw_request, so it must be explicitly included in the cache key.
        raw_request_for_cache: Dict = {"modelId": model_id, **deepcopy(raw_request)}
        cache_key: Mapping = CachingClient.make_cache_key(raw_request_for_cache, request)

        def do_it() -> Dict[Any, Any]:
            response = self.bedrock_client.invoke_model(
                body=json.dumps(raw_request), modelId=model_id, accept=JSON_CONTENT_TYPE, contentType=JSON_CONTENT_TYPE
            )
            return json.loads(response.get("body").read())

        try:
            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except Exception as error:
            return RequestResult(
                success=False,
                cached=False,
                error=str(error),
                completions=[],
                embedding=[],
            )

        completions = self.convert_raw_response_to_completions(response, request)

        return RequestResult(
            success=True,
            cached=cached,
            request_time=response["request_time"],
            request_datetime=response["request_datetime"],
            completions=completions,
            embedding=[],
        )


class BedrockTitanClient(BedrockClient):
    _COMPLETION_REASON_TO_FINISH_REASON = {
        "LENGTH": "length",
        "FINISH": "endoftext",
    }

    def convert_request_to_raw_request(self, request: Request) -> Dict:
        # TODO: Support the following:
        # - top_k_per_token
        # - echo_prompt
        # - num_completions
        return {
            "inputText": request.prompt,
            "textGenerationConfig": {
                "maxTokenCount": request.max_tokens,
                # We ignore stop sequences in the request and always set stop sequences to the empty list.
                # This is because:
                #
                # 1. The only permitted stop sequences are "|" and "User:"
                #    - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-text.html
                #    - https://github.com/boto/boto3/issues/3993
                #    - https://github.com/aws/aws-sdk/issues/692
                #
                # 2. Titan has a tendency to emit "\n" as the first token in the generated text output,
                #    which would cause the output to stop immediately if "\n" were in the stop sequences.
                "stopSequences": [],
                "temperature": request.temperature,
                "topP": request.top_p,
            },
        }

    def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[Sequence]:
        # TODO: Support the following:
        # - tokens
        # - logprob
        completions: List[Sequence] = []
        for raw_completion in response["results"]:
            output_text = raw_completion["outputText"]
            # Call lstrip() because Titan has a tendency to emit "\n" as the first token
            # in the generated text output.
            finish_reason = BedrockTitanClient._COMPLETION_REASON_TO_FINISH_REASON.get(
                raw_completion["completionReason"], raw_completion["completionReason"].lower()
            )
            completion = truncate_and_tokenize_response_text(
                output_text.lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
            )
            completions.append(completion)
        return completions
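For context, a minimal usage sketch of the new client (not part of this commit). It assumes AWS credentials are already configured, and that `BlackHoleCacheConfig` and `HuggingFaceTokenizer` exist at these import paths in this version of HELM; in practice HELM constructs the client from `model_deployments.yaml` rather than by hand:

```python
# Hypothetical usage sketch; import paths and constructor signatures are
# assumptions, not taken from this commit.
from helm.common.cache import BlackHoleCacheConfig
from helm.common.request import Request
from helm.proxy.clients.bedrock_client import BedrockTitanClient
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

cache_config = BlackHoleCacheConfig()  # no caching, for illustration only
client = BedrockTitanClient(
    cache_config=cache_config,
    tokenizer=HuggingFaceTokenizer(cache_config=cache_config),
    tokenizer_name="huggingface/gpt2",
)

# make_request() maps request.model "amazon/titan-text-express-v1" to the
# Bedrock model_id "amazon.titan-text-express-v1".
result = client.make_request(
    Request(
        model="amazon/titan-text-express-v1",
        model_deployment="amazon/titan-text-express-v1",
        prompt="Write a haiku about the ocean.",
        max_tokens=50,
        temperature=0.7,
    )
)
if result.success:
    print(result.completions[0].text)
```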
72 changes: 72 additions & 0 deletions src/helm/proxy/clients/bedrock_utils.py
@@ -0,0 +1,72 @@
"""Helper utilities for working with Amazon Bedrock."""

import os
from typing import Optional

from helm.common.hierarchical_logger import hlog
from helm.common.optional_dependencies import handle_module_not_found_error

try:
import boto3
from botocore.config import Config
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["aws"])


# From https://github.com/aws-samples/amazon-bedrock-workshop/blob/main/01_Generation/00_generate_w_bedrock.ipynb
# MIT-0 Licensed
def get_bedrock_client(
assumed_role: Optional[str] = None,
region: Optional[str] = None,
runtime: Optional[bool] = True,
):
"""Create a boto3 client for Amazon Bedrock, with optional configuration overrides
Parameters
----------
assumed_role :
Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not
specified, the current active credentials will be used.
region :
Optional name of the AWS Region in which the service should be called (e.g. "us-east-1").
If not specified, AWS_REGION or AWS_DEFAULT_REGION environment variable will be used.
runtime :
Optional choice of getting different client to perform operations with the Amazon Bedrock service.
"""
if region is None:
target_region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION"))
else:
target_region = region

session_kwargs = {"region_name": target_region}
client_kwargs = {**session_kwargs}

profile_name = os.environ.get("AWS_PROFILE")
if profile_name:
session_kwargs["profile_name"] = profile_name

retry_config = Config(
region_name=target_region,
retries={
"max_attempts": 10,
"mode": "standard",
},
)
session = boto3.Session(**session_kwargs)

if assumed_role:
sts = session.client("sts")
response = sts.assume_role(RoleArn=str(assumed_role), RoleSessionName="crfm-helm")
client_kwargs["aws_access_key_id"] = response["Credentials"]["AccessKeyId"]
client_kwargs["aws_secret_access_key"] = response["Credentials"]["SecretAccessKey"]
client_kwargs["aws_session_token"] = response["Credentials"]["SessionToken"]

if runtime:
service_name = "bedrock-runtime"
else:
service_name = "bedrock"

bedrock_client = session.client(service_name=service_name, config=retry_config, **client_kwargs)

hlog(f"Amazon Bedrock client successfully created with endpoint {bedrock_client._endpoint}")
return bedrock_client
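A short usage sketch for the helper (again, not part of the commit), assuming credentials come from the environment or an `AWS_PROFILE`:

```python
# Hypothetical usage sketch for get_bedrock_client().
from helm.proxy.clients.bedrock_utils import get_bedrock_client

# Default: a "bedrock-runtime" client, used for invoking models.
runtime_client = get_bedrock_client(region="us-east-1")

# runtime=False returns a "bedrock" control-plane client instead,
# e.g. for listing the foundation models available to the account.
control_client = get_bedrock_client(region="us-east-1", runtime=False)
for summary in control_client.list_foundation_models()["modelSummaries"]:
    print(summary["modelId"])
```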
4 changes: 2 additions & 2 deletions src/helm/proxy/clients/client.py
@@ -116,7 +116,7 @@ def truncate_sequence(sequence: Sequence, request: Request, print_warning: bool


 def truncate_and_tokenize_response_text(
-    text: str, request: Request, tokenizer: Tokenizer, tokenizer_name: str
+    text: str, request: Request, tokenizer: Tokenizer, tokenizer_name: str, original_finish_reason: str = "endoftext"
 ) -> Sequence:
     """Truncate a string-only response to respect stop_sequences and max_tokens.
@@ -135,7 +135,7 @@ def truncate_and_tokenize_response_text(
     This is because the tokens are derived from the truncated text using the tokenizer,
     so the text and the tokens in the resulting Sequence are guaranteed to match."""
     # Finish reason strings are taken from basic_metrics._compute_finish_reason_metrics()
-    finish_reason: str = "endoftext"
+    finish_reason: str = original_finish_reason
     if request.echo_prompt:
         raise Exception("truncate_and_tokenize_response_text() does not support requests with echo_prompt = True")

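As an illustration of the new parameter (reusing names from the Titan client above), a backend-supplied finish reason now flows through instead of the hard-coded "endoftext":

```python
# Illustrative only: BedrockTitanClient maps Titan's completionReason
# ("LENGTH" -> "length", "FINISH" -> "endoftext") and passes the result in
# as the new original_finish_reason argument.
sequence = truncate_and_tokenize_response_text(
    output_text.lstrip(),  # Titan tends to emit a leading "\n", so strip it
    request,
    tokenizer,
    tokenizer_name,
    "length",  # e.g. the mapped finish reason when Titan stops at maxTokenCount
)
```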
