Commit
Feature yaml costs (#180)
* Introduce TokenKind and add duration to consumptions

* Introduce DurationManager

* Use Anthropic-like Usage object

* Make Usage empty instead of optional

* Move costs into yaml files

* Implement get_cost_map() instead of direct access

* Fix

* Update docs

* Fix print statement

* Adjust consumption docs

* Add consumptions print in test_llm_caching_middleware.py

* Address comments

* Fix
Winston-503 authored Oct 30, 2024
1 parent 60c63c5 commit 3746c63
Showing 16 changed files with 508 additions and 220 deletions.
21 changes: 11 additions & 10 deletions council/contexts/_budget.py
@@ -13,16 +13,6 @@ class BudgetExpiredException(Exception):
class Consumption:
    """
    A class representing a consumption measurement with value, unit, and kind information.
    Attributes:
        _value (float): The numeric value of the consumption measurement.
        _unit (str): The unit of measurement for the consumption (e.g., tokens, api_calls, etc.).
        _kind (str): The kind or category of the consumption.
    Methods:
        __init__(value: float, unit: str, kind: str):
            Initializes a Consumption instance with the provided value, unit, and kind.
    """

    def __init__(self, value: float, unit: str, kind: str) -> None:
@@ -41,14 +31,17 @@ def __init__(self, value: float, unit: str, kind: str) -> None:

    @property
    def value(self) -> float:
        """The numeric value of the consumption measurement."""
        return self._value

    @property
    def unit(self) -> str:
        """The unit of measurement for the consumption (e.g., tokens, api_calls, etc.)."""
        return self._unit

    @property
    def kind(self) -> str:
        """The kind or category of the consumption."""
        return self._kind

    def __str__(self) -> str:
@@ -83,14 +76,22 @@ def to_dict(self) -> Dict[str, Any]:

    @staticmethod
    def call(value: int, kind: str) -> Consumption:
        """Returns a Consumption instance with "call" unit."""
        return Consumption(value, "call", kind)

    @staticmethod
    def duration(value: float, kind: str) -> Consumption:
        """Returns a Consumption instance with "second" unit."""
        return Consumption(value, "second", kind)

    @staticmethod
    def token(value: int, kind: str) -> Consumption:
        """Returns a Consumption instance with "token" unit."""
        return Consumption(value, "token", kind)

    @staticmethod
    def cost(value: float, kind: str) -> Consumption:
        """Returns a Consumption instance with "USD" unit."""
        return Consumption(value, "USD", kind)


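For illustration only (not part of this commit's diff), a minimal sketch of how the Consumption factory helpers can be combined; the model name and kind labels below are made up:

```python
from council.contexts import Consumption

# Hypothetical consumption records for a single LLM call; the kind strings are illustrative.
consumptions = [
    Consumption.call(1, "claude-3-5-sonnet-20241022"),              # one API call
    Consumption.duration(1.42, "claude-3-5-sonnet-20241022"),       # seconds spent on the request
    Consumption.token(1200, "claude-3-5-sonnet-20241022:prompt"),
    Consumption.token(350, "claude-3-5-sonnet-20241022:completion"),
    Consumption.cost(0.0089, "claude-3-5-sonnet-20241022:total"),   # estimated USD
]

for c in consumptions:
    print(f"{c.kind}: {c.value} {c.unit}")
```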
2 changes: 1 addition & 1 deletion council/llm/__init__.py
@@ -9,7 +9,7 @@
from .llm_exception import LLMException, LLMCallException, LLMCallTimeoutException, LLMTokenLimitException
from .llm_message import LLMMessageRole, LLMMessage, LLMMessageTokenCounterBase
from .llm_base import LLMBase, LLMResult, LLMConfigurationBase
from .llm_cost import LLMCostCard, LLMConsumptionCalculatorBase
from .llm_cost import LLMCostCard, LLMConsumptionCalculatorBase, TokenKind, LLMCostManagerSpec, LLMCostManagerObject
from .llm_fallback import LLMFallback
from .llm_middleware import (
    LLMRequest,
38 changes: 36 additions & 2 deletions council/llm/anthropic.py
@@ -8,23 +8,57 @@
from council.llm import LLMMessage


class Usage:
    """Represents token usage statistics for an Anthropic API request."""

    def __init__(
        self,
        prompt_tokens: int,
        completion_tokens: int,
        cache_creation_prompt_tokens: int,
        cache_read_prompt_tokens: int,
    ):
        self.prompt_tokens = prompt_tokens
        self.completion_tokens = completion_tokens
        self.cache_creation_prompt_tokens = cache_creation_prompt_tokens
        self.cache_read_prompt_tokens = cache_read_prompt_tokens
        self.total_tokens = cache_creation_prompt_tokens + cache_read_prompt_tokens + prompt_tokens + completion_tokens

    @staticmethod
    def from_dict(values: Dict[str, int]) -> Usage:
        prompt_tokens = values["input_tokens"]
        completion_tokens = values["output_tokens"]
        cache_creation_prompt_tokens = values.get("cache_creation_input_tokens", 0)
        cache_read_prompt_tokens = values.get("cache_read_input_tokens", 0)
        return Usage(prompt_tokens, completion_tokens, cache_creation_prompt_tokens, cache_read_prompt_tokens)

    @staticmethod
    def empty() -> Usage:
        return Usage(0, 0, 0, 0)


class AnthropicAPIClientResult:
    def __init__(self, choices: List[str], raw_response: Optional[Dict[str, Any]] = None) -> None:
    def __init__(self, choices: List[str], usage: Usage, raw_response: Optional[Dict[str, Any]] = None) -> None:
        self._choices = choices
        self._usage = usage
        self._raw_response = raw_response

    @property
    def choices(self) -> List[str]:
        return self._choices

    @property
    def usage(self) -> Usage:
        return self._usage

    @property
    def raw_response(self) -> Optional[Dict[str, Any]]:
        return self._raw_response

    @staticmethod
    def from_completion(result: Completion) -> AnthropicAPIClientResult:
        """For legacy completion API"""
        return AnthropicAPIClientResult(choices=[result.completion])
        return AnthropicAPIClientResult(choices=[result.completion], usage=Usage.empty())


class AnthropicAPIClientWrapper(ABC):
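As a quick sketch (assuming the field names shown in from_dict above), a Usage object can be built from an Anthropic-style usage payload; the token counts are made up:

```python
from council.llm.anthropic import Usage

# Illustrative payload shaped like Anthropic's usage block.
raw_usage = {
    "input_tokens": 1200,
    "output_tokens": 350,
    "cache_creation_input_tokens": 800,
    "cache_read_input_tokens": 0,
}

usage = Usage.from_dict(raw_usage)
print(usage.total_tokens)  # 800 + 0 + 1200 + 350 = 2350
```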
93 changes: 38 additions & 55 deletions council/llm/anthropic_llm.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import Any, Dict, List, Mapping, Optional, Sequence
from typing import Any, List, Mapping, Optional, Sequence

from anthropic import Anthropic, APIStatusError, APITimeoutError
from council.contexts import Consumption, LLMContext
@@ -12,13 +12,16 @@
    LLMConfigObject,
    LLMConsumptionCalculatorBase,
    LLMCostCard,
    LLMCostManagerObject,
    LLMMessage,
    LLMMessageTokenCounterBase,
    LLMProviders,
    LLMResult,
    TokenKind,
)
from council.utils.utils import DurationManager

from .anthropic import AnthropicAPIClientWrapper
from .anthropic import AnthropicAPIClientWrapper, Usage
from .anthropic_completion_llm import AnthropicCompletionLLM
from .anthropic_messages_llm import AnthropicMessagesLLM

@@ -35,68 +38,52 @@ def count_messages_token(self, messages: Sequence[LLMMessage]) -> int:


class AnthropicConsumptionCalculator(LLMConsumptionCalculatorBase):
    # https://www.anthropic.com/pricing#anthropic-api
    COSTS: Mapping[str, LLMCostCard] = {
        "claude-3-haiku-20240307": LLMCostCard(input=0.25, output=1.25),
        "claude-3-sonnet-20240229": LLMCostCard(input=3.00, output=15.00),
        "claude-3-5-sonnet-20240620": LLMCostCard(input=3.00, output=15.00),
        "claude-3-5-sonnet-20241022": LLMCostCard(input=3.00, output=15.00),
        "claude-3-opus-20240229": LLMCostCard(input=15.00, output=75.00),
    }

    # input - cache write; output - cache read; note - not all model support prompt caching
    COSTS_CACHING: Mapping[str, LLMCostCard] = {
        "claude-3-haiku-20240307": LLMCostCard(input=0.30, output=0.03),
        "claude-3-5-sonnet-20240620": LLMCostCard(input=3.75, output=0.30),
        "claude-3-5-sonnet-20241022": LLMCostCard(input=3.75, output=0.30),
        "claude-3-opus-20240229": LLMCostCard(input=18.75, output=1.50),
    }
    _cost_manager = LLMCostManagerObject.anthropic()
    COSTS: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("default")
    COSTS_CACHING: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("caching")

    def find_model_costs(self) -> Optional[LLMCostCard]:
        return self.COSTS.get(self.model)

    def find_caching_costs(self) -> Optional[LLMCostCard]:
        return self.COSTS_CACHING.get(self.model)

    def get_cache_consumptions(self, usage: Dict[str, int]) -> List[Consumption]:
    def get_anthropic_consumptions(self, duration: float, usage: Usage) -> List[Consumption]:
        """
        Get consumptions specific for Anthropic prompt caching:
        - 1 call
        - specified duration
        - cache_creation_prompt, cache_read_prompt, prompt, completion and total tokens
        - costs if both regular and caching LLMCostCards can be found
        """
        consumptions = self.get_cache_token_consumptions(usage) + self.get_cache_cost_consumptions(usage)
        return self.filter_zeros(consumptions) # could occur for cache tokens

    def get_cache_token_consumptions(self, usage: Dict[str, int]) -> List[Consumption]:
        total = sum(
            [
                usage["cache_creation_prompt_tokens"],
                usage["cache_read_prompt_tokens"],
                usage["prompt_tokens"],
                usage["completion_tokens"],
            ]
        consumptions = self.get_anthropic_base_consumptions(duration, usage) + self.get_anthropic_cost_consumptions(
            usage
        )
        return self.filter_zeros(consumptions) # could occur for cache tokens

    def get_anthropic_base_consumptions(self, duration: float, usage: Usage) -> List[Consumption]:
        return [
            Consumption.call(1, self.model),
            Consumption.token(usage["cache_creation_prompt_tokens"], self.format_kind("cache_creation_prompt")),
            Consumption.token(usage["cache_read_prompt_tokens"], self.format_kind("cache_read_prompt")),
            Consumption.token(usage["prompt_tokens"], self.format_kind("prompt")),
            Consumption.token(usage["completion_tokens"], self.format_kind("completion")),
            Consumption.token(total, self.format_kind("total")),
            Consumption.duration(duration, self.model),
            Consumption.token(usage.cache_creation_prompt_tokens, self.format_kind(TokenKind.cache_creation_prompt)),
            Consumption.token(usage.cache_read_prompt_tokens, self.format_kind(TokenKind.cache_read_prompt)),
            Consumption.token(usage.prompt_tokens, self.format_kind(TokenKind.prompt)),
            Consumption.token(usage.completion_tokens, self.format_kind(TokenKind.completion)),
            Consumption.token(usage.total_tokens, self.format_kind(TokenKind.total)),
        ]

    def get_cache_cost_consumptions(self, usage: Dict[str, int]) -> List[Consumption]:
    def get_anthropic_cost_consumptions(self, usage: Usage) -> List[Consumption]:
        cost_card = self.find_model_costs()
        caching_cost_card = self.find_caching_costs()

        if cost_card is None or caching_cost_card is None:
            return []

        prompt_tokens_cost = cost_card.input_cost(usage["prompt_tokens"])
        completion_tokens_cost = cost_card.output_cost(usage["completion_tokens"])
        cache_creation_prompt_tokens_cost = caching_cost_card.input_cost(usage["cache_creation_prompt_tokens"])
        cache_read_prompt_tokens_cost = caching_cost_card.output_cost(usage["cache_read_prompt_tokens"])
        prompt_tokens_cost = cost_card.input_cost(usage.prompt_tokens)
        completion_tokens_cost = cost_card.output_cost(usage.completion_tokens)
        cache_creation_prompt_tokens_cost = caching_cost_card.input_cost(usage.cache_creation_prompt_tokens)
        cache_read_prompt_tokens_cost = caching_cost_card.output_cost(usage.cache_read_prompt_tokens)

        total_cost = sum(
            [
@@ -108,11 +95,13 @@ def get_cache_cost_consumptions(self, usage: Dict[str, int]) -> List[Consumption
        )

        return [
            Consumption.cost(cache_creation_prompt_tokens_cost, self.format_kind("cache_creation_prompt", cost=True)),
            Consumption.cost(cache_read_prompt_tokens_cost, self.format_kind("cache_read_prompt", cost=True)),
            Consumption.cost(prompt_tokens_cost, self.format_kind("prompt", cost=True)),
            Consumption.cost(completion_tokens_cost, self.format_kind("completion", cost=True)),
            Consumption.cost(total_cost, self.format_kind("total", cost=True)),
            Consumption.cost(
                cache_creation_prompt_tokens_cost, self.format_kind(TokenKind.cache_creation_prompt, cost=True)
            ),
            Consumption.cost(cache_read_prompt_tokens_cost, self.format_kind(TokenKind.cache_read_prompt, cost=True)),
            Consumption.cost(prompt_tokens_cost, self.format_kind(TokenKind.prompt, cost=True)),
            Consumption.cost(completion_tokens_cost, self.format_kind(TokenKind.completion, cost=True)),
            Consumption.cost(total_cost, self.format_kind(TokenKind.total, cost=True)),
        ]


@@ -130,28 +119,22 @@ def __init__(self, config: AnthropicLLMConfiguration, name: Optional[str] = None

    def _post_chat_request(self, context: LLMContext, messages: Sequence[LLMMessage], **kwargs: Any) -> LLMResult:
        try:
            response = self._api.post_chat_request(messages=messages)
            usage = response.raw_response["usage"] if response.raw_response is not None else {}
            with DurationManager() as timer:
                response = self._api.post_chat_request(messages=messages)
            return LLMResult(
                choices=response.choices,
                consumptions=self.to_consumptions(usage),
                consumptions=self.to_consumptions(timer.duration, response.usage),
                raw_response=response.raw_response,
            )
        except APITimeoutError as e:
            raise LLMCallTimeoutException(self._configuration.timeout.value, self._name) from e
        except APIStatusError as e:
            raise LLMCallException(code=e.status_code, error=e.message, llm_name=self._name) from e

    def to_consumptions(self, usage: Dict[str, int]) -> Sequence[Consumption]:
        if "input_tokens" not in usage or "output_tokens" not in usage:
            return []

    def to_consumptions(self, duration: float, usage: Usage) -> Sequence[Consumption]:
        model = self._configuration.model_name()
        consumption_calculator = AnthropicConsumptionCalculator(model)
        if "cache_creation_input_tokens" in usage:
            return consumption_calculator.get_cache_consumptions(usage)

        return consumption_calculator.get_consumptions(usage["input_tokens"], usage["output_tokens"])
        return consumption_calculator.get_anthropic_consumptions(duration, usage)

    def _get_api_wrapper(self) -> AnthropicAPIClientWrapper:
        if self._configuration is not None and self._configuration.model_name() == "claude-2":
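A rough sketch of exercising the new calculator directly, outside AnthropicLLM; the duration, token counts, and model choice are illustrative, and the cost entries assume the model has both default and caching cost cards:

```python
from council.llm.anthropic import Usage
from council.llm.anthropic_llm import AnthropicConsumptionCalculator

usage = Usage(
    prompt_tokens=1200,
    completion_tokens=350,
    cache_creation_prompt_tokens=800,
    cache_read_prompt_tokens=0,
)
calculator = AnthropicConsumptionCalculator("claude-3-5-sonnet-20241022")

# duration would normally come from DurationManager in _post_chat_request; 1.42s is made up.
for consumption in calculator.get_anthropic_consumptions(duration=1.42, usage=usage):
    print(f"{consumption.kind}: {consumption.value} {consumption.unit}")
```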
6 changes: 4 additions & 2 deletions council/llm/anthropic_messages_llm.py
@@ -6,7 +6,7 @@
from anthropic._types import NOT_GIVEN
from anthropic.types import MessageParam, TextBlock
from council.llm import AnthropicLLMConfiguration, LLMMessage, LLMMessageRole
from council.llm.anthropic import AnthropicAPIClientResult, AnthropicAPIClientWrapper
from council.llm.anthropic import AnthropicAPIClientResult, AnthropicAPIClientWrapper, Usage
from council.llm.llm_message import LLMCacheControlData


@@ -44,7 +44,9 @@ def post_chat_request(self, messages: Sequence[LLMMessage]) -> AnthropicAPIClien
        )
        choices = [content.text for content in completion.content if isinstance(content, TextBlock)]

        return AnthropicAPIClientResult(choices=choices, raw_response=completion.to_dict())
        return AnthropicAPIClientResult(
            choices=choices, usage=Usage.from_dict(completion.usage.to_dict()), raw_response=completion.to_dict()
        )

    @staticmethod
    def _to_anthropic_system_messages(messages: Sequence[LLMMessage]) -> Dict[str, List[Dict[str, Any]]]:
44 changes: 44 additions & 0 deletions council/llm/data/anthropic-costs.yaml
@@ -0,0 +1,44 @@
kind: LLMCostManager
version: 0.1
metadata:
  name: anthropic-costs
  labels:
    provider: Anthropic
    reference: https://www.anthropic.com/pricing#anthropic-api
spec:
  default:
    description: |
      Default model costs
    models:
      claude-3-haiku-20240307:
        input: 0.25
        output: 1.25
      claude-3-sonnet-20240229:
        input: 3.00
        output: 15.00
      claude-3-5-sonnet-20240620:
        input: 3.00
        output: 15.00
      claude-3-5-sonnet-20241022:
        input: 3.00
        output: 15.00
      claude-3-opus-20240229:
        input: 15.00
        output: 75.00
  caching:
    description: |
      Prompt caching costs: input - cache write; output - cache read;
      Note - not all model support prompt caching
    models:
      claude-3-haiku-20240307:
        input: 0.30
        output: 0.03
      claude-3-5-sonnet-20240620:
        input: 3.75
        output: 0.30
      claude-3-5-sonnet-20241022:
        input: 3.75
        output: 0.30
      claude-3-opus-20240229:
        input: 18.75
        output: 1.50
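The YAML above is presumably what LLMCostManagerObject.anthropic() loads; a small sketch of looking up a cost card the same way the calculator does (token counts are illustrative):

```python
from council.llm import LLMCostManagerObject

cost_manager = LLMCostManagerObject.anthropic()
default_costs = cost_manager.get_cost_map("default")   # presumably spec.default.models
caching_costs = cost_manager.get_cost_map("caching")   # presumably spec.caching.models

card = default_costs["claude-3-5-sonnet-20241022"]
# input_cost()/output_cost() are the LLMCostCard helpers used in anthropic_llm.py.
print(card.input_cost(1200), card.output_cost(350))
```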