Commit
Feature yaml costs (#180)
* Introduce TokenKind and add duration to consumptions

* Introduce DurationManager

* Use Anthropic-like Usage object

* Make Usage empty instead of optional

* Move costs into yaml files

* Implement get_cost_map() instead of direct access

* Fix

* Update docs

* Fix print statement

* Adjust consumption docs

* Add consumptions print in test_llm_caching_middleware.py

* Address comments

* Fix
Winston-503 authored Oct 30, 2024
1 parent 60c63c5 commit 3746c63
Showing 16 changed files with 508 additions and 220 deletions.
21 changes: 11 additions & 10 deletions council/contexts/_budget.py
@@ -13,16 +13,6 @@ class BudgetExpiredException(Exception):
class Consumption:
    """
    A class representing a consumption measurement with value, unit, and kind information.
    Attributes:
        _value (float): The numeric value of the consumption measurement.
        _unit (str): The unit of measurement for the consumption (e.g., tokens, api_calls, etc.).
        _kind (str): The kind or category of the consumption.
    Methods:
        __init__(value: float, unit: str, kind: str):
            Initializes a Consumption instance with the provided value, unit, and kind.
    """

    def __init__(self, value: float, unit: str, kind: str) -> None:
@@ -41,14 +31,17 @@ def __init__(self, value: float, unit: str, kind: str) -> None:

    @property
    def value(self) -> float:
        """The numeric value of the consumption measurement."""
        return self._value

    @property
    def unit(self) -> str:
        """The unit of measurement for the consumption (e.g., tokens, api_calls, etc.)."""
        return self._unit

    @property
    def kind(self) -> str:
        """The kind or category of the consumption."""
        return self._kind

    def __str__(self) -> str:
@@ -83,14 +76,22 @@ def to_dict(self) -> Dict[str, Any]:

    @staticmethod
    def call(value: int, kind: str) -> Consumption:
        """Returns a Consumption instance with "call" unit."""
        return Consumption(value, "call", kind)

    @staticmethod
    def duration(value: float, kind: str) -> Consumption:
        """Returns a Consumption instance with "second" unit."""
        return Consumption(value, "second", kind)

    @staticmethod
    def token(value: int, kind: str) -> Consumption:
        """Returns a Consumption instance with "token" unit."""
        return Consumption(value, "token", kind)

    @staticmethod
    def cost(value: float, kind: str) -> Consumption:
        """Returns a Consumption instance with "USD" unit."""
        return Consumption(value, "USD", kind)


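For illustration only (not part of this commit's diff), a minimal sketch of how the Consumption factory helpers can be combined; the model name and kind labels below are made up:

```python
from council.contexts import Consumption

# Hypothetical consumption records for a single LLM call; the kind strings are illustrative.
consumptions = [
    Consumption.call(1, "claude-3-5-sonnet-20241022"),              # one API call
    Consumption.duration(1.42, "claude-3-5-sonnet-20241022"),       # seconds spent on the request
    Consumption.token(1200, "claude-3-5-sonnet-20241022:prompt"),
    Consumption.token(350, "claude-3-5-sonnet-20241022:completion"),
    Consumption.cost(0.0089, "claude-3-5-sonnet-20241022:total"),   # estimated USD
]

for c in consumptions:
    print(f"{c.kind}: {c.value} {c.unit}")
```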
2 changes: 1 addition & 1 deletion council/llm/__init__.py
@@ -9,7 +9,7 @@
from .llm_exception import LLMException, LLMCallException, LLMCallTimeoutException, LLMTokenLimitException
from .llm_message import LLMMessageRole, LLMMessage, LLMMessageTokenCounterBase
from .llm_base import LLMBase, LLMResult, LLMConfigurationBase
from .llm_cost import LLMCostCard, LLMConsumptionCalculatorBase
from .llm_cost import LLMCostCard, LLMConsumptionCalculatorBase, TokenKind, LLMCostManagerSpec, LLMCostManagerObject
from .llm_fallback import LLMFallback
from .llm_middleware import (
    LLMRequest,
38 changes: 36 additions & 2 deletions council/llm/anthropic.py
@@ -8,23 +8,57 @@
from council.llm import LLMMessage


class Usage:
    """Represents token usage statistics for an Anthropic API request."""

    def __init__(
        self,
        prompt_tokens: int,
        completion_tokens: int,
        cache_creation_prompt_tokens: int,
        cache_read_prompt_tokens: int,
    ):
        self.prompt_tokens = prompt_tokens
        self.completion_tokens = completion_tokens
        self.cache_creation_prompt_tokens = cache_creation_prompt_tokens
        self.cache_read_prompt_tokens = cache_read_prompt_tokens
        self.total_tokens = cache_creation_prompt_tokens + cache_read_prompt_tokens + prompt_tokens + completion_tokens

    @staticmethod
    def from_dict(values: Dict[str, int]) -> Usage:
        prompt_tokens = values["input_tokens"]
        completion_tokens = values["output_tokens"]
        cache_creation_prompt_tokens = values.get("cache_creation_input_tokens", 0)
        cache_read_prompt_tokens = values.get("cache_read_input_tokens", 0)
        return Usage(prompt_tokens, completion_tokens, cache_creation_prompt_tokens, cache_read_prompt_tokens)

    @staticmethod
    def empty() -> Usage:
        return Usage(0, 0, 0, 0)


class AnthropicAPIClientResult:
    def __init__(self, choices: List[str], raw_response: Optional[Dict[str, Any]] = None) -> None:
    def __init__(self, choices: List[str], usage: Usage, raw_response: Optional[Dict[str, Any]] = None) -> None:
        self._choices = choices
        self._usage = usage
        self._raw_response = raw_response

    @property
    def choices(self) -> List[str]:
        return self._choices

    @property
    def usage(self) -> Usage:
        return self._usage

    @property
    def raw_response(self) -> Optional[Dict[str, Any]]:
        return self._raw_response

    @staticmethod
    def from_completion(result: Completion) -> AnthropicAPIClientResult:
        """For legacy completion API"""
        return AnthropicAPIClientResult(choices=[result.completion])
        return AnthropicAPIClientResult(choices=[result.completion], usage=Usage.empty())


class AnthropicAPIClientWrapper(ABC):
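As a quick sketch (assuming the field names shown in from_dict above), a Usage object can be built from an Anthropic-style usage payload; the token counts are made up:

```python
from council.llm.anthropic import Usage

# Illustrative payload shaped like Anthropic's usage block.
raw_usage = {
    "input_tokens": 1200,
    "output_tokens": 350,
    "cache_creation_input_tokens": 800,
    "cache_read_input_tokens": 0,
}

usage = Usage.from_dict(raw_usage)
print(usage.total_tokens)  # 800 + 0 + 1200 + 350 = 2350
```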
93 changes: 38 additions & 55 deletions council/llm/anthropic_llm.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import Any, Dict, List, Mapping, Optional, Sequence
from typing import Any, List, Mapping, Optional, Sequence

from anthropic import Anthropic, APIStatusError, APITimeoutError
from council.contexts import Consumption, LLMContext
@@ -12,13 +12,16 @@
    LLMConfigObject,
    LLMConsumptionCalculatorBase,
    LLMCostCard,
    LLMCostManagerObject,
    LLMMessage,
    LLMMessageTokenCounterBase,
    LLMProviders,
    LLMResult,
    TokenKind,
)
from council.utils.utils import DurationManager

from .anthropic import AnthropicAPIClientWrapper
from .anthropic import AnthropicAPIClientWrapper, Usage
from .anthropic_completion_llm import AnthropicCompletionLLM
from .anthropic_messages_llm import AnthropicMessagesLLM

@@ -35,68 +38,52 @@ def count_messages_token(self, messages: Sequence[LLMMessage]) -> int:


class AnthropicConsumptionCalculator(LLMConsumptionCalculatorBase):
    # https://www.anthropic.com/pricing#anthropic-api
    COSTS: Mapping[str, LLMCostCard] = {
        "claude-3-haiku-20240307": LLMCostCard(input=0.25, output=1.25),
        "claude-3-sonnet-20240229": LLMCostCard(input=3.00, output=15.00),
        "claude-3-5-sonnet-20240620": LLMCostCard(input=3.00, output=15.00),
        "claude-3-5-sonnet-20241022": LLMCostCard(input=3.00, output=15.00),
        "claude-3-opus-20240229": LLMCostCard(input=15.00, output=75.00),
    }

    # input - cache write; output - cache read; note - not all model support prompt caching
    COSTS_CACHING: Mapping[str, LLMCostCard] = {
        "claude-3-haiku-20240307": LLMCostCard(input=0.30, output=0.03),
        "claude-3-5-sonnet-20240620": LLMCostCard(input=3.75, output=0.30),
        "claude-3-5-sonnet-20241022": LLMCostCard(input=3.75, output=0.30),
        "claude-3-opus-20240229": LLMCostCard(input=18.75, output=1.50),
    }
    _cost_manager = LLMCostManagerObject.anthropic()
    COSTS: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("default")
    COSTS_CACHING: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("caching")

    def find_model_costs(self) -> Optional[LLMCostCard]:
        return self.COSTS.get(self.model)

    def find_caching_costs(self) -> Optional[LLMCostCard]:
        return self.COSTS_CACHING.get(self.model)

    def get_cache_consumptions(self, usage: Dict[str, int]) -> List[Consumption]:
    def get_anthropic_consumptions(self, duration: float, usage: Usage) -> List[Consumption]:
        """
        Get consumptions specific for Anthropic prompt caching:
        - 1 call
        - specified duration
        - cache_creation_prompt, cache_read_prompt, prompt, completion and total tokens
        - costs if both regular and caching LLMCostCards can be found
        """
        consumptions = self.get_cache_token_consumptions(usage) + self.get_cache_cost_consumptions(usage)
        return self.filter_zeros(consumptions) # could occur for cache tokens

    def get_cache_token_consumptions(self, usage: Dict[str, int]) -> List[Consumption]:
        total = sum(
            [
                usage["cache_creation_prompt_tokens"],
                usage["cache_read_prompt_tokens"],
                usage["prompt_tokens"],
                usage["completion_tokens"],
            ]
        consumptions = self.get_anthropic_base_consumptions(duration, usage) + self.get_anthropic_cost_consumptions(
            usage
        )
        return self.filter_zeros(consumptions) # could occur for cache tokens

    def get_anthropic_base_consumptions(self, duration: float, usage: Usage) -> List[Consumption]:
        return [
            Consumption.call(1, self.model),
            Consumption.token(usage["cache_creation_prompt_tokens"], self.format_kind("cache_creation_prompt")),
            Consumption.token(usage["cache_read_prompt_tokens"], self.format_kind("cache_read_prompt")),
            Consumption.token(usage["prompt_tokens"], self.format_kind("prompt")),
            Consumption.token(usage["completion_tokens"], self.format_kind("completion")),
            Consumption.token(total, self.format_kind("total")),
            Consumption.duration(duration, self.model),
            Consumption.token(usage.cache_creation_prompt_tokens, self.format_kind(TokenKind.cache_creation_prompt)),
            Consumption.token(usage.cache_read_prompt_tokens, self.format_kind(TokenKind.cache_read_prompt)),
            Consumption.token(usage.prompt_tokens, self.format_kind(TokenKind.prompt)),
            Consumption.token(usage.completion_tokens, self.format_kind(TokenKind.completion)),
            Consumption.token(usage.total_tokens, self.format_kind(TokenKind.total)),
        ]

    def get_cache_cost_consumptions(self, usage: Dict[str, int]) -> List[Consumption]:
    def get_anthropic_cost_consumptions(self, usage: Usage) -> List[Consumption]:
        cost_card = self.find_model_costs()
        caching_cost_card = self.find_caching_costs()

        if cost_card is None or caching_cost_card is None:
            return []

        prompt_tokens_cost = cost_card.input_cost(usage["prompt_tokens"])
        completion_tokens_cost = cost_card.output_cost(usage["completion_tokens"])
        cache_creation_prompt_tokens_cost = caching_cost_card.input_cost(usage["cache_creation_prompt_tokens"])
        cache_read_prompt_tokens_cost = caching_cost_card.output_cost(usage["cache_read_prompt_tokens"])
        prompt_tokens_cost = cost_card.input_cost(usage.prompt_tokens)
        completion_tokens_cost = cost_card.output_cost(usage.completion_tokens)
        cache_creation_prompt_tokens_cost = caching_cost_card.input_cost(usage.cache_creation_prompt_tokens)
        cache_read_prompt_tokens_cost = caching_cost_card.output_cost(usage.cache_read_prompt_tokens)

        total_cost = sum(
            [
@@ -108,11 +95,13 @@ def get_cache_cost_consumptions(self, usage: Dict[str, int]) -> List[Consumption
        )

        return [
            Consumption.cost(cache_creation_prompt_tokens_cost, self.format_kind("cache_creation_prompt", cost=True)),
            Consumption.cost(cache_read_prompt_tokens_cost, self.format_kind("cache_read_prompt", cost=True)),
            Consumption.cost(prompt_tokens_cost, self.format_kind("prompt", cost=True)),
            Consumption.cost(completion_tokens_cost, self.format_kind("completion", cost=True)),
            Consumption.cost(total_cost, self.format_kind("total", cost=True)),
            Consumption.cost(
                cache_creation_prompt_tokens_cost, self.format_kind(TokenKind.cache_creation_prompt, cost=True)
            ),
            Consumption.cost(cache_read_prompt_tokens_cost, self.format_kind(TokenKind.cache_read_prompt, cost=True)),
            Consumption.cost(prompt_tokens_cost, self.format_kind(TokenKind.prompt, cost=True)),
            Consumption.cost(completion_tokens_cost, self.format_kind(TokenKind.completion, cost=True)),
            Consumption.cost(total_cost, self.format_kind(TokenKind.total, cost=True)),
        ]


@@ -130,28 +119,22 @@ def __init__(self, config: AnthropicLLMConfiguration, name: Optional[str] = None

    def _post_chat_request(self, context: LLMContext, messages: Sequence[LLMMessage], **kwargs: Any) -> LLMResult:
        try:
            response = self._api.post_chat_request(messages=messages)
            usage = response.raw_response["usage"] if response.raw_response is not None else {}
            with DurationManager() as timer:
                response = self._api.post_chat_request(messages=messages)
            return LLMResult(
                choices=response.choices,
                consumptions=self.to_consumptions(usage),
                consumptions=self.to_consumptions(timer.duration, response.usage),
                raw_response=response.raw_response,
            )
        except APITimeoutError as e:
            raise LLMCallTimeoutException(self._configuration.timeout.value, self._name) from e
        except APIStatusError as e:
            raise LLMCallException(code=e.status_code, error=e.message, llm_name=self._name) from e

    def to_consumptions(self, usage: Dict[str, int]) -> Sequence[Consumption]:
        if "input_tokens" not in usage or "output_tokens" not in usage:
            return []

    def to_consumptions(self, duration: float, usage: Usage) -> Sequence[Consumption]:
        model = self._configuration.model_name()
        consumption_calculator = AnthropicConsumptionCalculator(model)
        if "cache_creation_input_tokens" in usage:
            return consumption_calculator.get_cache_consumptions(usage)

        return consumption_calculator.get_consumptions(usage["input_tokens"], usage["output_tokens"])
        return consumption_calculator.get_anthropic_consumptions(duration, usage)

    def _get_api_wrapper(self) -> AnthropicAPIClientWrapper:
        if self._configuration is not None and self._configuration.model_name() == "claude-2":
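A rough sketch of exercising the new calculator directly, outside AnthropicLLM; the duration, token counts, and model choice are illustrative, and the cost entries assume the model has both default and caching cost cards:

```python
from council.llm.anthropic import Usage
from council.llm.anthropic_llm import AnthropicConsumptionCalculator

usage = Usage(
    prompt_tokens=1200,
    completion_tokens=350,
    cache_creation_prompt_tokens=800,
    cache_read_prompt_tokens=0,
)
calculator = AnthropicConsumptionCalculator("claude-3-5-sonnet-20241022")

# duration would normally come from DurationManager in _post_chat_request; 1.42s is made up.
for consumption in calculator.get_anthropic_consumptions(duration=1.42, usage=usage):
    print(f"{consumption.kind}: {consumption.value} {consumption.unit}")
```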
6 changes: 4 additions & 2 deletions council/llm/anthropic_messages_llm.py
@@ -6,7 +6,7 @@
from anthropic._types import NOT_GIVEN
from anthropic.types import MessageParam, TextBlock
from council.llm import AnthropicLLMConfiguration, LLMMessage, LLMMessageRole
from council.llm.anthropic import AnthropicAPIClientResult, AnthropicAPIClientWrapper
from council.llm.anthropic import AnthropicAPIClientResult, AnthropicAPIClientWrapper, Usage
from council.llm.llm_message import LLMCacheControlData


@@ -44,7 +44,9 @@ def post_chat_request(self, messages: Sequence[LLMMessage]) -> AnthropicAPIClien
        )
        choices = [content.text for content in completion.content if isinstance(content, TextBlock)]

        return AnthropicAPIClientResult(choices=choices, raw_response=completion.to_dict())
        return AnthropicAPIClientResult(
            choices=choices, usage=Usage.from_dict(completion.usage.to_dict()), raw_response=completion.to_dict()
        )

    @staticmethod
    def _to_anthropic_system_messages(messages: Sequence[LLMMessage]) -> Dict[str, List[Dict[str, Any]]]:
44 changes: 44 additions & 0 deletions council/llm/data/anthropic-costs.yaml
@@ -0,0 +1,44 @@
kind: LLMCostManager
version: 0.1
metadata:
  name: anthropic-costs
  labels:
    provider: Anthropic
    reference: https://www.anthropic.com/pricing#anthropic-api
spec:
  default:
    description: |
      Default model costs
    models:
      claude-3-haiku-20240307:
        input: 0.25
        output: 1.25
      claude-3-sonnet-20240229:
        input: 3.00
        output: 15.00
      claude-3-5-sonnet-20240620:
        input: 3.00
        output: 15.00
      claude-3-5-sonnet-20241022:
        input: 3.00
        output: 15.00
      claude-3-opus-20240229:
        input: 15.00
        output: 75.00
  caching:
    description: |
      Prompt caching costs: input - cache write; output - cache read;
      Note - not all model support prompt caching
    models:
      claude-3-haiku-20240307:
        input: 0.30
        output: 0.03
      claude-3-5-sonnet-20240620:
        input: 3.75
        output: 0.30
      claude-3-5-sonnet-20241022:
        input: 3.75
        output: 0.30
      claude-3-opus-20240229:
        input: 18.75
        output: 1.50
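The YAML above is presumably what LLMCostManagerObject.anthropic() loads; a small sketch of looking up a cost card the same way the calculator does (token counts are illustrative):

```python
from council.llm import LLMCostManagerObject

cost_manager = LLMCostManagerObject.anthropic()
default_costs = cost_manager.get_cost_map("default")   # presumably spec.default.models
caching_costs = cost_manager.get_cost_map("caching")   # presumably spec.caching.models

card = default_costs["claude-3-5-sonnet-20241022"]
# input_cost()/output_cost() are the LLMCostCard helpers used in anthropic_llm.py.
print(card.input_cost(1200), card.output_cost(350))
```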