diff --git a/council/contexts/_budget.py b/council/contexts/_budget.py index 49285a42..0b9b9104 100644 --- a/council/contexts/_budget.py +++ b/council/contexts/_budget.py @@ -13,16 +13,6 @@ class BudgetExpiredException(Exception): class Consumption: """ A class representing a consumption measurement with value, unit, and kind information. - - Attributes: - _value (float): The numeric value of the consumption measurement. - _unit (str): The unit of measurement for the consumption (e.g., tokens, api_calls, etc.). - _kind (str): The kind or category of the consumption. - - Methods: - __init__(value: float, unit: str, kind: str): - Initializes a Consumption instance with the provided value, unit, and kind. - """ def __init__(self, value: float, unit: str, kind: str) -> None: @@ -41,14 +31,17 @@ def __init__(self, value: float, unit: str, kind: str) -> None: @property def value(self) -> float: + """The numeric value of the consumption measurement.""" return self._value @property def unit(self) -> str: + """The unit of measurement for the consumption (e.g., tokens, api_calls, etc.).""" return self._unit @property def kind(self) -> str: + """The kind or category of the consumption.""" return self._kind def __str__(self) -> str: @@ -83,14 +76,22 @@ def to_dict(self) -> Dict[str, Any]: @staticmethod def call(value: int, kind: str) -> Consumption: + """Returns a Consumption instance with "call" unit.""" return Consumption(value, "call", kind) + @staticmethod + def duration(value: float, kind: str) -> Consumption: + """Returns a Consumption instance with "second" unit.""" + return Consumption(value, "second", kind) + @staticmethod def token(value: int, kind: str) -> Consumption: + """Returns a Consumption instance with "token" unit.""" return Consumption(value, "token", kind) @staticmethod def cost(value: float, kind: str) -> Consumption: + """Returns a Consumption instance with "USD" unit.""" return Consumption(value, "USD", kind) diff --git a/council/llm/__init__.py b/council/llm/__init__.py index 9709460a..a213c12d 100644 --- a/council/llm/__init__.py +++ b/council/llm/__init__.py @@ -9,7 +9,7 @@ from .llm_exception import LLMException, LLMCallException, LLMCallTimeoutException, LLMTokenLimitException from .llm_message import LLMMessageRole, LLMMessage, LLMMessageTokenCounterBase from .llm_base import LLMBase, LLMResult, LLMConfigurationBase -from .llm_cost import LLMCostCard, LLMConsumptionCalculatorBase +from .llm_cost import LLMCostCard, LLMConsumptionCalculatorBase, TokenKind, LLMCostManagerSpec, LLMCostManagerObject from .llm_fallback import LLMFallback from .llm_middleware import ( LLMRequest, diff --git a/council/llm/anthropic.py b/council/llm/anthropic.py index 726311a4..572eccf7 100644 --- a/council/llm/anthropic.py +++ b/council/llm/anthropic.py @@ -8,15 +8,49 @@ from council.llm import LLMMessage +class Usage: + """Represents token usage statistics for an Anthropic API request.""" + + def __init__( + self, + prompt_tokens: int, + completion_tokens: int, + cache_creation_prompt_tokens: int, + cache_read_prompt_tokens: int, + ): + self.prompt_tokens = prompt_tokens + self.completion_tokens = completion_tokens + self.cache_creation_prompt_tokens = cache_creation_prompt_tokens + self.cache_read_prompt_tokens = cache_read_prompt_tokens + self.total_tokens = cache_creation_prompt_tokens + cache_read_prompt_tokens + prompt_tokens + completion_tokens + + @staticmethod + def from_dict(values: Dict[str, int]) -> Usage: + prompt_tokens = values["input_tokens"] + completion_tokens = 
values["output_tokens"] + cache_creation_prompt_tokens = values.get("cache_creation_input_tokens", 0) + cache_read_prompt_tokens = values.get("cache_read_input_tokens", 0) + return Usage(prompt_tokens, completion_tokens, cache_creation_prompt_tokens, cache_read_prompt_tokens) + + @staticmethod + def empty() -> Usage: + return Usage(0, 0, 0, 0) + + class AnthropicAPIClientResult: - def __init__(self, choices: List[str], raw_response: Optional[Dict[str, Any]] = None) -> None: + def __init__(self, choices: List[str], usage: Usage, raw_response: Optional[Dict[str, Any]] = None) -> None: self._choices = choices + self._usage = usage self._raw_response = raw_response @property def choices(self) -> List[str]: return self._choices + @property + def usage(self) -> Usage: + return self._usage + @property def raw_response(self) -> Optional[Dict[str, Any]]: return self._raw_response @@ -24,7 +58,7 @@ def raw_response(self) -> Optional[Dict[str, Any]]: @staticmethod def from_completion(result: Completion) -> AnthropicAPIClientResult: """For legacy completion API""" - return AnthropicAPIClientResult(choices=[result.completion]) + return AnthropicAPIClientResult(choices=[result.completion], usage=Usage.empty()) class AnthropicAPIClientWrapper(ABC): diff --git a/council/llm/anthropic_llm.py b/council/llm/anthropic_llm.py index 8b0e695e..e89f9eb6 100644 --- a/council/llm/anthropic_llm.py +++ b/council/llm/anthropic_llm.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, List, Mapping, Optional, Sequence +from typing import Any, List, Mapping, Optional, Sequence from anthropic import Anthropic, APIStatusError, APITimeoutError from council.contexts import Consumption, LLMContext @@ -12,13 +12,16 @@ LLMConfigObject, LLMConsumptionCalculatorBase, LLMCostCard, + LLMCostManagerObject, LLMMessage, LLMMessageTokenCounterBase, LLMProviders, LLMResult, + TokenKind, ) +from council.utils.utils import DurationManager -from .anthropic import AnthropicAPIClientWrapper +from .anthropic import AnthropicAPIClientWrapper, Usage from .anthropic_completion_llm import AnthropicCompletionLLM from .anthropic_messages_llm import AnthropicMessagesLLM @@ -35,22 +38,9 @@ def count_messages_token(self, messages: Sequence[LLMMessage]) -> int: class AnthropicConsumptionCalculator(LLMConsumptionCalculatorBase): - # https://www.anthropic.com/pricing#anthropic-api - COSTS: Mapping[str, LLMCostCard] = { - "claude-3-haiku-20240307": LLMCostCard(input=0.25, output=1.25), - "claude-3-sonnet-20240229": LLMCostCard(input=3.00, output=15.00), - "claude-3-5-sonnet-20240620": LLMCostCard(input=3.00, output=15.00), - "claude-3-5-sonnet-20241022": LLMCostCard(input=3.00, output=15.00), - "claude-3-opus-20240229": LLMCostCard(input=15.00, output=75.00), - } - - # input - cache write; output - cache read; note - not all model support prompt caching - COSTS_CACHING: Mapping[str, LLMCostCard] = { - "claude-3-haiku-20240307": LLMCostCard(input=0.30, output=0.03), - "claude-3-5-sonnet-20240620": LLMCostCard(input=3.75, output=0.30), - "claude-3-5-sonnet-20241022": LLMCostCard(input=3.75, output=0.30), - "claude-3-opus-20240229": LLMCostCard(input=18.75, output=1.50), - } + _cost_manager = LLMCostManagerObject.anthropic() + COSTS: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("default") + COSTS_CACHING: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("caching") def find_model_costs(self) -> Optional[LLMCostCard]: return self.COSTS.get(self.model) @@ -58,45 +48,42 @@ def find_model_costs(self) -> 
Optional[LLMCostCard]: def find_caching_costs(self) -> Optional[LLMCostCard]: return self.COSTS_CACHING.get(self.model) - def get_cache_consumptions(self, usage: Dict[str, int]) -> List[Consumption]: + def get_anthropic_consumptions(self, duration: float, usage: Usage) -> List[Consumption]: """ Get consumptions specific for Anthropic prompt caching: - 1 call + - specified duration - cache_creation_prompt, cache_read_prompt, prompt, completion and total tokens - costs if both regular and caching LLMCostCards can be found """ - consumptions = self.get_cache_token_consumptions(usage) + self.get_cache_cost_consumptions(usage) - return self.filter_zeros(consumptions) # could occur for cache tokens - def get_cache_token_consumptions(self, usage: Dict[str, int]) -> List[Consumption]: - total = sum( - [ - usage["cache_creation_prompt_tokens"], - usage["cache_read_prompt_tokens"], - usage["prompt_tokens"], - usage["completion_tokens"], - ] + consumptions = self.get_anthropic_base_consumptions(duration, usage) + self.get_anthropic_cost_consumptions( + usage ) + return self.filter_zeros(consumptions) # could occur for cache tokens + + def get_anthropic_base_consumptions(self, duration: float, usage: Usage) -> List[Consumption]: return [ Consumption.call(1, self.model), - Consumption.token(usage["cache_creation_prompt_tokens"], self.format_kind("cache_creation_prompt")), - Consumption.token(usage["cache_read_prompt_tokens"], self.format_kind("cache_read_prompt")), - Consumption.token(usage["prompt_tokens"], self.format_kind("prompt")), - Consumption.token(usage["completion_tokens"], self.format_kind("completion")), - Consumption.token(total, self.format_kind("total")), + Consumption.duration(duration, self.model), + Consumption.token(usage.cache_creation_prompt_tokens, self.format_kind(TokenKind.cache_creation_prompt)), + Consumption.token(usage.cache_read_prompt_tokens, self.format_kind(TokenKind.cache_read_prompt)), + Consumption.token(usage.prompt_tokens, self.format_kind(TokenKind.prompt)), + Consumption.token(usage.completion_tokens, self.format_kind(TokenKind.completion)), + Consumption.token(usage.total_tokens, self.format_kind(TokenKind.total)), ] - def get_cache_cost_consumptions(self, usage: Dict[str, int]) -> List[Consumption]: + def get_anthropic_cost_consumptions(self, usage: Usage) -> List[Consumption]: cost_card = self.find_model_costs() caching_cost_card = self.find_caching_costs() if cost_card is None or caching_cost_card is None: return [] - prompt_tokens_cost = cost_card.input_cost(usage["prompt_tokens"]) - completion_tokens_cost = cost_card.output_cost(usage["completion_tokens"]) - cache_creation_prompt_tokens_cost = caching_cost_card.input_cost(usage["cache_creation_prompt_tokens"]) - cache_read_prompt_tokens_cost = caching_cost_card.output_cost(usage["cache_read_prompt_tokens"]) + prompt_tokens_cost = cost_card.input_cost(usage.prompt_tokens) + completion_tokens_cost = cost_card.output_cost(usage.completion_tokens) + cache_creation_prompt_tokens_cost = caching_cost_card.input_cost(usage.cache_creation_prompt_tokens) + cache_read_prompt_tokens_cost = caching_cost_card.output_cost(usage.cache_read_prompt_tokens) total_cost = sum( [ @@ -108,11 +95,13 @@ def get_cache_cost_consumptions(self, usage: Dict[str, int]) -> List[Consumption ) return [ - Consumption.cost(cache_creation_prompt_tokens_cost, self.format_kind("cache_creation_prompt", cost=True)), - Consumption.cost(cache_read_prompt_tokens_cost, self.format_kind("cache_read_prompt", cost=True)), - 
Consumption.cost(prompt_tokens_cost, self.format_kind("prompt", cost=True)), - Consumption.cost(completion_tokens_cost, self.format_kind("completion", cost=True)), - Consumption.cost(total_cost, self.format_kind("total", cost=True)), + Consumption.cost( + cache_creation_prompt_tokens_cost, self.format_kind(TokenKind.cache_creation_prompt, cost=True) + ), + Consumption.cost(cache_read_prompt_tokens_cost, self.format_kind(TokenKind.cache_read_prompt, cost=True)), + Consumption.cost(prompt_tokens_cost, self.format_kind(TokenKind.prompt, cost=True)), + Consumption.cost(completion_tokens_cost, self.format_kind(TokenKind.completion, cost=True)), + Consumption.cost(total_cost, self.format_kind(TokenKind.total, cost=True)), ] @@ -130,11 +119,11 @@ def __init__(self, config: AnthropicLLMConfiguration, name: Optional[str] = None def _post_chat_request(self, context: LLMContext, messages: Sequence[LLMMessage], **kwargs: Any) -> LLMResult: try: - response = self._api.post_chat_request(messages=messages) - usage = response.raw_response["usage"] if response.raw_response is not None else {} + with DurationManager() as timer: + response = self._api.post_chat_request(messages=messages) return LLMResult( choices=response.choices, - consumptions=self.to_consumptions(usage), + consumptions=self.to_consumptions(timer.duration, response.usage), raw_response=response.raw_response, ) except APITimeoutError as e: @@ -142,16 +131,10 @@ def _post_chat_request(self, context: LLMContext, messages: Sequence[LLMMessage] except APIStatusError as e: raise LLMCallException(code=e.status_code, error=e.message, llm_name=self._name) from e - def to_consumptions(self, usage: Dict[str, int]) -> Sequence[Consumption]: - if "input_tokens" not in usage or "output_tokens" not in usage: - return [] - + def to_consumptions(self, duration: float, usage: Usage) -> Sequence[Consumption]: model = self._configuration.model_name() consumption_calculator = AnthropicConsumptionCalculator(model) - if "cache_creation_input_tokens" in usage: - return consumption_calculator.get_cache_consumptions(usage) - - return consumption_calculator.get_consumptions(usage["input_tokens"], usage["output_tokens"]) + return consumption_calculator.get_anthropic_consumptions(duration, usage) def _get_api_wrapper(self) -> AnthropicAPIClientWrapper: if self._configuration is not None and self._configuration.model_name() == "claude-2": diff --git a/council/llm/anthropic_messages_llm.py b/council/llm/anthropic_messages_llm.py index df2055e7..cc534155 100644 --- a/council/llm/anthropic_messages_llm.py +++ b/council/llm/anthropic_messages_llm.py @@ -6,7 +6,7 @@ from anthropic._types import NOT_GIVEN from anthropic.types import MessageParam, TextBlock from council.llm import AnthropicLLMConfiguration, LLMMessage, LLMMessageRole -from council.llm.anthropic import AnthropicAPIClientResult, AnthropicAPIClientWrapper +from council.llm.anthropic import AnthropicAPIClientResult, AnthropicAPIClientWrapper, Usage from council.llm.llm_message import LLMCacheControlData @@ -44,7 +44,9 @@ def post_chat_request(self, messages: Sequence[LLMMessage]) -> AnthropicAPIClien ) choices = [content.text for content in completion.content if isinstance(content, TextBlock)] - return AnthropicAPIClientResult(choices=choices, raw_response=completion.to_dict()) + return AnthropicAPIClientResult( + choices=choices, usage=Usage.from_dict(completion.usage.to_dict()), raw_response=completion.to_dict() + ) @staticmethod def _to_anthropic_system_messages(messages: Sequence[LLMMessage]) -> Dict[str, 
List[Dict[str, Any]]]:
diff --git a/council/llm/data/anthropic-costs.yaml b/council/llm/data/anthropic-costs.yaml
new file mode 100644
index 00000000..89e85746
--- /dev/null
+++ b/council/llm/data/anthropic-costs.yaml
@@ -0,0 +1,44 @@
+kind: LLMCostManager
+version: 0.1
+metadata:
+  name: anthropic-costs
+  labels:
+    provider: Anthropic
+    reference: https://www.anthropic.com/pricing#anthropic-api
+spec:
+  default:
+    description: |
+      Default model costs
+    models:
+      claude-3-haiku-20240307:
+        input: 0.25
+        output: 1.25
+      claude-3-sonnet-20240229:
+        input: 3.00
+        output: 15.00
+      claude-3-5-sonnet-20240620:
+        input: 3.00
+        output: 15.00
+      claude-3-5-sonnet-20241022:
+        input: 3.00
+        output: 15.00
+      claude-3-opus-20240229:
+        input: 15.00
+        output: 75.00
+  caching:
+    description: |
+      Prompt caching costs: input - cache write; output - cache read;
+      Note - not all models support prompt caching
+    models:
+      claude-3-haiku-20240307:
+        input: 0.30
+        output: 0.03
+      claude-3-5-sonnet-20240620:
+        input: 3.75
+        output: 0.30
+      claude-3-5-sonnet-20241022:
+        input: 3.75
+        output: 0.30
+      claude-3-opus-20240229:
+        input: 18.75
+        output: 1.50
diff --git a/council/llm/data/gemini-costs.yaml b/council/llm/data/gemini-costs.yaml
new file mode 100644
index 00000000..c868fb30
--- /dev/null
+++ b/council/llm/data/gemini-costs.yaml
@@ -0,0 +1,40 @@
+kind: LLMCostManager
+version: 0.1
+metadata:
+  name: gemini-costs
+  labels:
+    provider: Google
+    reference: https://ai.google.dev/pricing
+spec:
+  under_128k:
+    description: |
+      Costs for prompt tokens up to 128k
+    models:
+      gemini-1.5-flash:
+        input: 0.075
+        output: 0.30
+      gemini-1.5-flash-8b:
+        input: 0.0375
+        output: 0.15
+      gemini-1.5-pro:
+        input: 1.25
+        output: 5.00
+      gemini-1.0-pro:
+        input: 0.50
+        output: 1.50
+  over_128k:
+    description: |
+      Costs for prompt tokens over 128k
+    models:
+      gemini-1.5-flash:
+        input: 0.15
+        output: 0.60
+      gemini-1.5-flash-8b:
+        input: 0.075
+        output: 0.30
+      gemini-1.5-pro:
+        input: 2.50
+        output: 10.00
+      gemini-1.0-pro:
+        input: 0.50
+        output: 1.50
diff --git a/council/llm/data/openai-costs.yaml b/council/llm/data/openai-costs.yaml
new file mode 100644
index 00000000..ec9d5249
--- /dev/null
+++ b/council/llm/data/openai-costs.yaml
@@ -0,0 +1,90 @@
+kind: LLMCostManager
+version: 0.1
+metadata:
+  name: openai-costs
+  labels:
+    provider: OpenAI
+    reference: https://openai.com/api/pricing/
+spec:
+  gpt_35_turbo_family:
+    description: |
+      Costs for GPT-3.5 Turbo family models
+    models:
+      gpt-3.5-turbo-0125:
+        input: 0.50
+        output: 1.50
+      gpt-3.5-turbo-instruct:
+        input: 1.50
+        output: 2.00
+      gpt-3.5-turbo-1106:
+        input: 1.00
+        output: 2.00
+      gpt-3.5-turbo-0613:
+        input: 1.50
+        output: 2.00
+      gpt-3.5-turbo-16k-0613:
+        input: 3.00
+        output: 4.00
+      gpt-3.5-turbo-0301:
+        input: 1.50
+        output: 2.00
+  gpt_4_family:
+    description: |
+      Costs for GPT-4 family models
+    models:
+      gpt-4-turbo:
+        input: 10.00
+        output: 30.00
+      gpt-4-turbo-2024-04-09:
+        input: 10.00
+        output: 30.00
+      gpt-4:
+        input: 30.00
+        output: 60.00
+      gpt-4-32k:
+        input: 60.00
+        output: 120.00
+      gpt-4-0125-preview:
+        input: 10.00
+        output: 30.00
+      gpt-4-1106-preview:
+        input: 10.00
+        output: 30.00
+      gpt-4-vision-preview:
+        input: 10.00
+        output: 30.00
+  gpt_4o_family:
+    description: |
+      Costs for GPT-4o family models
+    models:
+      gpt-4o:
+        input: 2.50
+        output: 10.00
+      gpt-4o-2024-08-06:
+        input: 2.50
+        output: 10.00
+      gpt-4o-2024-05-13:
+        input: 5.00
+        output: 15.00
+      gpt-4o-mini:
+        input: 0.150
+        output: 0.60
+      gpt-4o-mini-2024-07-18:
+        input: 0.150
+        output: 0.60
+  o1_family: +
description: | + Costs for o1 family models + models: + o1-preview: + input: 15.00 + output: 60.00 + o1-preview-2024-09-12: + input: 15.00 + output: 60.00 + o1-mini: + input: 3.00 + output: 12.00 + o1-mini-2024-09-12: + input: 3.00 + output: 12.00 diff --git a/council/llm/gemini_llm.py b/council/llm/gemini_llm.py index b08cb286..9ba37122 100644 --- a/council/llm/gemini_llm.py +++ b/council/llm/gemini_llm.py @@ -10,32 +10,22 @@ LLMConfigObject, LLMConsumptionCalculatorBase, LLMCostCard, + LLMCostManagerObject, LLMMessage, LLMMessageRole, LLMProviders, LLMResult, ) +from council.utils.utils import DurationManager from google.ai.generativelanguage import FileData from google.ai.generativelanguage_v1 import HarmCategory # type: ignore from google.generativeai.types import GenerateContentResponse, HarmBlockThreshold # type: ignore class GeminiConsumptionCalculator(LLMConsumptionCalculatorBase): - # https://ai.google.dev/pricing - # different strategy for prompt up to 128k tokens - COSTS_UNDER_128k: Mapping[str, LLMCostCard] = { - "gemini-1.5-flash": LLMCostCard(input=0.075, output=0.30), - "gemini-1.5-flash-8b": LLMCostCard(input=0.0375, output=0.15), - "gemini-1.5-pro": LLMCostCard(input=1.25, output=5.00), - "gemini-1.0-pro": LLMCostCard(input=0.50, output=1.50), - } - - COSTS_OVER_128k: Mapping[str, LLMCostCard] = { - "gemini-1.5-flash": LLMCostCard(input=0.15, output=0.60), - "gemini-1.5-flash-8b": LLMCostCard(input=0.075, output=0.30), - "gemini-1.5-pro": LLMCostCard(input=2.50, output=10.00), - "gemini-1.0-pro": LLMCostCard(input=0.50, output=1.50), - } + _cost_manager = LLMCostManagerObject.gemini() + COSTS_UNDER_128k: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("under_128k") + COSTS_OVER_128k: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("over_128k") def __init__(self, model: str, num_tokens: int) -> None: super().__init__(model) @@ -67,16 +57,17 @@ def __init__(self, config: GeminiLLMConfiguration) -> None: def _post_chat_request(self, context: LLMContext, messages: Sequence[LLMMessage], **kwargs: Any) -> LLMResult: history, last = self._to_chat_history(messages=messages) chat = self._model.start_chat(history=history) - response = chat.send_message(last) - return LLMResult(choices=[response.text], consumptions=self.to_consumptions(response)) + with DurationManager() as timer: + response = chat.send_message(last) + return LLMResult(choices=[response.text], consumptions=self.to_consumptions(timer.duration, response)) - def to_consumptions(self, response: GenerateContentResponse) -> Sequence[Consumption]: + def to_consumptions(self, duration: float, response: GenerateContentResponse) -> Sequence[Consumption]: model = self._configuration.model_name() prompt_tokens = response.usage_metadata.prompt_token_count completion_tokens = response.usage_metadata.candidates_token_count - consumption_calculator = GeminiConsumptionCalculator(model, prompt_tokens) - return consumption_calculator.get_consumptions(prompt_tokens, completion_tokens) + calculator = GeminiConsumptionCalculator(model, prompt_tokens) + return calculator.get_consumptions(duration, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens) @staticmethod def from_env() -> GeminiLLM: diff --git a/council/llm/llm_cost.py b/council/llm/llm_cost.py index 2fb775df..154baf3a 100644 --- a/council/llm/llm_cost.py +++ b/council/llm/llm_cost.py @@ -1,7 +1,18 @@ +from __future__ import annotations + import abc -from typing import List, Optional, Tuple +import os +from enum import Enum +from typing import Any, Dict, 
Final, List, Optional, Tuple +import yaml from council.contexts import Consumption +from council.utils import DataObject, DataObjectSpecBase + +DATA_PATH: Final[str] = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") +ANTHROPIC_COSTS_FILENAME: Final[str] = "anthropic-costs.yaml" +GEMINI_COSTS_FILENAME: Final[str] = "gemini-costs.yaml" +OPENAI_COSTS_FILENAME: Final[str] = "openai-costs.yaml" class LLMCostCard: @@ -13,25 +24,53 @@ def __init__(self, input: float, output: float) -> None: @property def input(self) -> float: + """Cost per million input (prompt) tokens.""" return self._input @property def output(self) -> float: + """Cost per million output (completion) tokens.""" return self._output def __str__(self) -> str: return f"${self.input}/${self.output} per 1m tokens" def input_cost(self, tokens: int) -> float: + """Get prompt_tokens_cost for a given amount of input tokens.""" return tokens * self.input / 1e6 def output_cost(self, tokens: int) -> float: + """Get completion_token_cost for a given amount of completion tokens.""" return tokens * self.output / 1e6 def get_costs(self, prompt_tokens: int, completion_tokens: int) -> Tuple[float, float]: """Return tuple of (prompt_tokens_cost, completion_token_cost)""" return self.input_cost(prompt_tokens), self.output_cost(completion_tokens) + @staticmethod + def from_dict(data: Dict[str, float]) -> LLMCostCard: + return LLMCostCard(input=data["input"], output=data["output"]) + + +class TokenKind(str, Enum): + prompt = "prompt" + """Prompt tokens""" + + completion = "completion" + """Completion tokens""" + + total = "total" + """Total tokens""" + + reasoning = "reasoning" + """Reasoning tokens, specific for OpenAI o1 models""" + + cache_creation_prompt = "cache_creation_prompt" + """Cache creation prompt tokens, specific for Anthropic prompt caching""" + + cache_read_prompt = "cache_read_prompt" + """Cache read prompt tokens, specific for Anthropic and OpenAI prompt caching""" + class LLMConsumptionCalculatorBase(abc.ABC): """Helper class to manage LLM consumptions.""" @@ -39,58 +78,47 @@ class LLMConsumptionCalculatorBase(abc.ABC): def __init__(self, model: str): self.model = model - def format_kind(self, token_kind: str, cost: bool = False) -> str: + def format_kind(self, token_kind: TokenKind, cost: bool = False) -> str: """Format Consumption.kind - from 'prompt' to '{self.model}:prompt_tokens'""" - options = [ - "prompt", - "completion", - "total", - "reasoning", # OpenAI o1 - "cache_creation_prompt", # Anthropic prompt caching - "cache_read_prompt", # Anthropic & OpenAI prompt caching - ] - result = f"{self.model}:" - if token_kind not in options: - raise ValueError( - f"Unknown kind `{token_kind}` for LLMConsumptionCalculator; expected one of `{','.join(options)}`" - ) - - result += f"{token_kind}_tokens" + kind = token_kind.value + return f"{self.model}:{kind}_tokens" if not cost else f"{self.model}:{kind}_tokens_cost" - if cost: - result += "_cost" - - return result + def get_consumptions(self, duration: float, *, prompt_tokens: int, completion_tokens: int) -> List[Consumption]: + """Get base and cost consumptions if any""" + base_consumptions = self.get_base_consumptions( + duration, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens + ) + cost_consumptions = self.get_cost_consumptions(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens) + return base_consumptions + cost_consumptions - def get_consumptions(self, prompt_tokens: int, completion_tokens: int) -> List[Consumption]: + def 
get_base_consumptions( + self, duration: float, *, prompt_tokens: int, completion_tokens: int + ) -> List[Consumption]: """ - Get default consumptions: + Get base consumptions: - 1 call + - specified duration - prompt, completion and total tokens - - cost for prompt, completion and total tokens if LLMCostCard can be found """ - return self.get_token_consumptions(prompt_tokens, completion_tokens) + self.get_cost_consumptions( - prompt_tokens, completion_tokens - ) - - def get_token_consumptions(self, prompt_tokens: int, completion_tokens: int) -> List[Consumption]: return [ Consumption.call(1, self.model), - Consumption.token(prompt_tokens, self.format_kind("prompt")), - Consumption.token(completion_tokens, self.format_kind("completion")), - Consumption.token(prompt_tokens + completion_tokens, self.format_kind("total")), + Consumption.duration(duration, self.model), + Consumption.token(prompt_tokens, self.format_kind(TokenKind.prompt)), + Consumption.token(completion_tokens, self.format_kind(TokenKind.completion)), + Consumption.token(prompt_tokens + completion_tokens, self.format_kind(TokenKind.total)), ] - def get_cost_consumptions(self, prompt_tokens: int, completion_tokens: int) -> List[Consumption]: + def get_cost_consumptions(self, *, prompt_tokens: int, completion_tokens: int) -> List[Consumption]: + """Get cost for prompt, completion and total tokens if LLMCostCard can be found.""" cost_card = self.find_model_costs() if cost_card is None: return [] prompt_tokens_cost, completion_tokens_cost = cost_card.get_costs(prompt_tokens, completion_tokens) return [ - Consumption.cost(prompt_tokens_cost, self.format_kind("prompt", cost=True)), - Consumption.cost(completion_tokens_cost, self.format_kind("completion", cost=True)), - Consumption.cost(prompt_tokens_cost + completion_tokens_cost, self.format_kind("total", cost=True)), + Consumption.cost(prompt_tokens_cost, self.format_kind(TokenKind.prompt, cost=True)), + Consumption.cost(completion_tokens_cost, self.format_kind(TokenKind.completion, cost=True)), + Consumption.cost(prompt_tokens_cost + completion_tokens_cost, self.format_kind(TokenKind.total, cost=True)), ] @abc.abstractmethod @@ -101,3 +129,71 @@ def find_model_costs(self) -> Optional[LLMCostCard]: @staticmethod def filter_zeros(consumptions: List[Consumption]) -> List[Consumption]: return list(filter(lambda consumption: consumption.value > 0, consumptions)) + + +class LLMCostManagerSpec(DataObjectSpecBase): + def __init__(self, costs: Dict[str, Dict[str, LLMCostCard]]) -> None: + """ + Initializes a new instance of LLMCostManagerSpec + + Args: + costs (Dict[str, Dict[str, LLMCostCard]]): collection of cost cards of shape + {category: {model_1: LLMCostCard, model_2: LLMCostCard}, another_category: {...}} + """ + self.costs = costs + + @classmethod + def from_dict(cls, values: Dict[str, Any]) -> LLMCostManagerSpec: + costs = { + category: { + model: LLMCostCard.from_dict(model_data) for model, model_data in category_data["models"].items() + } + for category, category_data in values.items() + } + + return LLMCostManagerSpec(costs) + + def to_dict(self) -> Dict[str, Any]: + return self.costs + + def __str__(self) -> str: + return f"LLMCostCards for {len(self.costs.keys())} categories" + + +class LLMCostManagerObject(DataObject[LLMCostManagerSpec]): + """ + Helper class to instantiate an LLMCostManagerObject from a YAML file + """ + + @classmethod + def from_dict(cls, values: Dict[str, Any]) -> LLMCostManagerObject: + return super()._from_dict(LLMCostManagerSpec, values) + + 
@classmethod + def from_yaml(cls, filename: str) -> LLMCostManagerObject: + with open(filename, "r", encoding="utf-8") as f: + values = yaml.safe_load(f) + cls._check_kind(values, "LLMCostManager") + return LLMCostManagerObject.from_dict(values) + + @staticmethod + def anthropic(): + """Get LLMCostManager for Anthropic models""" + return LLMCostManagerObject.from_yaml(os.path.join(DATA_PATH, ANTHROPIC_COSTS_FILENAME)) + + @staticmethod + def gemini(): + """Get LLMCostManager for Gemini models""" + return LLMCostManagerObject.from_yaml(os.path.join(DATA_PATH, GEMINI_COSTS_FILENAME)) + + @staticmethod + def openai(): + """Get LLMCostManager for OpenAI models""" + return LLMCostManagerObject.from_yaml(os.path.join(DATA_PATH, OPENAI_COSTS_FILENAME)) + + def get_cost_map(self, category: str) -> Dict[str, LLMCostCard]: + """Get cost mapping {model: LLMCostCard} for a given category""" + if category not in self.spec.costs: + raise ValueError(f"Unexpected category `{category}` for LLMCostManager") + + return self.spec.costs[category] diff --git a/council/llm/openai_chat_completions_llm.py b/council/llm/openai_chat_completions_llm.py index f487e311..8990774b 100644 --- a/council/llm/openai_chat_completions_llm.py +++ b/council/llm/openai_chat_completions_llm.py @@ -10,12 +10,13 @@ LLMCallException, LLMConsumptionCalculatorBase, LLMCostCard, + LLMCostManagerObject, LLMMessage, LLMMessageTokenCounterBase, LLMResult, + TokenKind, ) - -from ..utils import truncate_dict_values_to_str +from council.utils.utils import DurationManager, truncate_dict_values_to_str class Provider(Protocol): @@ -123,40 +124,11 @@ def from_dict(obj: Any) -> Usage: class OpenAIConsumptionCalculator(LLMConsumptionCalculatorBase): - # https://openai.com/api/pricing/ - COSTS_gpt_35_turbo_FAMILY: Mapping[str, LLMCostCard] = { - "gpt-3.5-turbo-0125": LLMCostCard(input=0.50, output=1.50), - "gpt-3.5-turbo-instruct": LLMCostCard(input=1.50, output=2.00), - "gpt-3.5-turbo-1106": LLMCostCard(input=1.00, output=2.00), - "gpt-3.5-turbo-0613": LLMCostCard(input=1.50, output=2.00), - "gpt-3.5-turbo-16k-0613": LLMCostCard(input=3.00, output=4.00), - "gpt-3.5-turbo-0301": LLMCostCard(input=1.50, output=2.00), - } - - COSTS_gpt_4_FAMILY: Mapping[str, LLMCostCard] = { - "gpt-4-turbo": LLMCostCard(input=10.00, output=30.00), - "gpt-4-turbo-2024-04-09": LLMCostCard(input=10.00, output=30.00), - "gpt-4": LLMCostCard(input=30.00, output=60.00), - "gpt-4-32k": LLMCostCard(input=60.00, output=120.00), - "gpt-4-0125-preview": LLMCostCard(input=10.00, output=30.00), - "gpt-4-1106-preview": LLMCostCard(input=10.00, output=30.00), - "gpt-4-vision-preview": LLMCostCard(input=10.00, output=30.00), - } - - COSTS_gpt_4o_FAMILY: Mapping[str, LLMCostCard] = { - "gpt-4o": LLMCostCard(input=2.50, output=10.00), - "gpt-4o-2024-08-06": LLMCostCard(input=2.50, output=10.00), - "gpt-4o-2024-05-13": LLMCostCard(input=5.00, output=15.00), - "gpt-4o-mini": LLMCostCard(input=0.150, output=0.60), - "gpt-4o-mini-2024-07-18": LLMCostCard(input=0.150, output=0.60), - } - - COSTS_o1_FAMILY: Mapping[str, LLMCostCard] = { - "o1-preview": LLMCostCard(input=15.00, output=60.00), - "o1-preview-2024-09-12": LLMCostCard(input=15.00, output=60.00), - "o1-mini": LLMCostCard(input=3.00, output=12.00), - "o1-mini-2024-09-12": LLMCostCard(input=3.00, output=12.00), - } + _cost_manager = LLMCostManagerObject.openai() + COSTS_gpt_35_turbo_FAMILY: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("gpt_35_turbo_family") + COSTS_gpt_4_FAMILY: Mapping[str, LLMCostCard] = 
_cost_manager.get_cost_map("gpt_4_family") + COSTS_gpt_4o_FAMILY: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("gpt_4o_family") + COSTS_o1_FAMILY: Mapping[str, LLMCostCard] = _cost_manager.get_cost_map("o1_family") def find_model_costs(self) -> Optional[LLMCostCard]: if self.model.startswith("o1"): @@ -170,24 +142,26 @@ def find_model_costs(self) -> Optional[LLMCostCard]: return None - def get_openai_consumptions(self, usage: Usage) -> List[Consumption]: + def get_openai_consumptions(self, duration: float, usage: Usage) -> List[Consumption]: """ Get consumptions specific for OpenAI: - 1 call + - specified duration - cache_read_prompt, prompt, reasoning, completion and total tokens - costs LLMCostCard can be found """ - consumptions = self.get_openai_token_consumptions(usage) + self.get_openai_cost_consumptions(usage) + consumptions = self.get_openai_base_consumptions(duration, usage) + self.get_openai_cost_consumptions(usage) return self.filter_zeros(consumptions) # could occur for cache/reasoning tokens - def get_openai_token_consumptions(self, usage: Usage) -> List[Consumption]: + def get_openai_base_consumptions(self, duration: float, usage: Usage) -> List[Consumption]: return [ Consumption.call(1, self.model), - Consumption.token(usage.cached_tokens, self.format_kind("cache_read_prompt")), - Consumption.token(usage.prompt_tokens, self.format_kind("prompt")), - Consumption.token(usage.reasoning_tokens, self.format_kind("reasoning")), - Consumption.token(usage.completion_tokens, self.format_kind("completion")), - Consumption.token(usage.total_tokens, self.format_kind("total")), + Consumption.duration(duration, self.model), + Consumption.token(usage.cached_tokens, self.format_kind(TokenKind.cache_read_prompt)), + Consumption.token(usage.prompt_tokens, self.format_kind(TokenKind.prompt)), + Consumption.token(usage.reasoning_tokens, self.format_kind(TokenKind.reasoning)), + Consumption.token(usage.completion_tokens, self.format_kind(TokenKind.completion)), + Consumption.token(usage.total_tokens, self.format_kind(TokenKind.total)), ] def get_openai_cost_consumptions(self, usage: Usage) -> List[Consumption]: @@ -202,11 +176,11 @@ def get_openai_cost_consumptions(self, usage: Usage) -> List[Consumption]: total_cost = sum([cached_tokens_cost, prompt_tokens_cost, reasoning_tokens_cost, completion_tokens_cost]) return [ - Consumption.cost(cached_tokens_cost, self.format_kind("cache_read_prompt", cost=True)), - Consumption.cost(prompt_tokens_cost, self.format_kind("prompt", cost=True)), - Consumption.cost(reasoning_tokens_cost, self.format_kind("reasoning", cost=True)), - Consumption.cost(completion_tokens_cost, self.format_kind("completion", cost=True)), - Consumption.cost(total_cost, self.format_kind("total", cost=True)), + Consumption.cost(cached_tokens_cost, self.format_kind(TokenKind.cache_read_prompt, cost=True)), + Consumption.cost(prompt_tokens_cost, self.format_kind(TokenKind.prompt, cost=True)), + Consumption.cost(reasoning_tokens_cost, self.format_kind(TokenKind.reasoning, cost=True)), + Consumption.cost(completion_tokens_cost, self.format_kind(TokenKind.completion, cost=True)), + Consumption.cost(total_cost, self.format_kind(TokenKind.total, cost=True)), ] @@ -250,9 +224,9 @@ def choices(self) -> Sequence[Choice]: def raw_response(self) -> Dict[str, Any]: return self._raw_response - def to_consumptions(self) -> Sequence[Consumption]: + def to_consumptions(self, duration: float) -> Sequence[Consumption]: consumption_calculator = OpenAIConsumptionCalculator(self.model) - return 
consumption_calculator.get_openai_consumptions(self.usage) + return consumption_calculator.get_openai_consumptions(duration, self.usage) @staticmethod def from_response(response: Dict[str, Any]) -> OpenAIChatCompletionsResult: @@ -289,13 +263,14 @@ def _post_chat_request(self, context: LLMContext, messages: Sequence[LLMMessage] context.logger.debug( f'message="Sending chat GPT completions request to {self._name}" payload="{truncate_dict_values_to_str(payload, 100)}"' ) - r = self._post_request(payload) + with DurationManager() as timer: + r = self._post_request(payload) context.logger.debug( f'message="Got chat GPT completions result from {self._name}" id="{r.id}" model="{r.model}" {r.usage}' ) return LLMResult( choices=[c.message.content for c in r.choices], - consumptions=r.to_consumptions(), + consumptions=r.to_consumptions(timer.duration), raw_response=r.raw_response, ) diff --git a/council/utils/utils.py b/council/utils/utils.py index 6ebd822c..27fa862e 100644 --- a/council/utils/utils.py +++ b/council/utils/utils.py @@ -1,4 +1,15 @@ -from typing import Dict +import time +from typing import ContextManager, Dict + + +class DurationManager(ContextManager): + def __enter__(self): + self.start_time = time.time() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.end_time = time.time() + self.duration = self.end_time - self.start_time def truncate_dict_values_to_str(data: Dict, max_length: int = 20): diff --git a/docs/source/reference/contexts/consumption.rst b/docs/source/reference/contexts/consumption.rst index 138d00ca..5e58bc8a 100644 --- a/docs/source/reference/contexts/consumption.rst +++ b/docs/source/reference/contexts/consumption.rst @@ -4,3 +4,4 @@ Consumption ----------- .. autoclass:: council.contexts.Consumption + :member-order: bysource diff --git a/docs/source/reference/llm/llm_cost.rst b/docs/source/reference/llm/llm_cost.rst new file mode 100644 index 00000000..71d9cb99 --- /dev/null +++ b/docs/source/reference/llm/llm_cost.rst @@ -0,0 +1,21 @@ +LLMCostCard +----------- + +.. autoclass:: council.llm.LLMCostCard + +LLMConsumptionCalculatorBase +---------------------------- + +.. autoclass:: council.llm.LLMConsumptionCalculatorBase + +LLMCostManagerObject +-------------------- + +.. autoclass:: council.llm.LLMCostManagerObject + :member-order: bysource + +TokenKind +--------- + +.. 
autoclass:: council.llm.TokenKind + :member-order: bysource diff --git a/tests/integration/llm/test_llm_caching_middleware.py b/tests/integration/llm/test_llm_caching_middleware.py index 09bd527e..86368362 100644 --- a/tests/integration/llm/test_llm_caching_middleware.py +++ b/tests/integration/llm/test_llm_caching_middleware.py @@ -32,6 +32,8 @@ def execute_llm_func(llm_func: LLMFunction, message: str, to_print: str, **kwarg response = llm_func.execute(message, **kwargs) print(f"\n{to_print}") print(f"\tResponse duration: {response.duration:.3f}s") + for consumption in response.result.consumptions: + print(f"\t{consumption}") return response diff --git a/tests/unit/llm/test_llm_consumption_calculators.py b/tests/unit/llm/test_llm_consumption_calculators.py index bac6de8e..f046869b 100644 --- a/tests/unit/llm/test_llm_consumption_calculators.py +++ b/tests/unit/llm/test_llm_consumption_calculators.py @@ -1,8 +1,8 @@ import unittest -from council.llm.anthropic_llm import AnthropicConsumptionCalculator +from council.llm.anthropic_llm import AnthropicConsumptionCalculator, Usage as AnthropicUsage from council.llm.gemini_llm import GeminiConsumptionCalculator -from council.llm.openai_chat_completions_llm import OpenAIConsumptionCalculator, Usage +from council.llm.openai_chat_completions_llm import OpenAIConsumptionCalculator, Usage as OpenAIUsage class TestAnthropicConsumptionCalculator(unittest.TestCase): @@ -23,13 +23,13 @@ def test_haiku_cost_calculation(self): self.assertEqual(completion_cost, 0.0625) # $1.25 * 0.05 def test_haiku_cache_cost_calculation(self): - consumptions = AnthropicConsumptionCalculator("claude-3-haiku-20240307").get_cache_cost_consumptions( - { - "cache_creation_prompt_tokens": 1_000_000, - "cache_read_prompt_tokens": 500_000, - "prompt_tokens": 100_000, - "completion_tokens": 50_000, - } + consumptions = AnthropicConsumptionCalculator("claude-3-haiku-20240307").get_anthropic_cost_consumptions( + AnthropicUsage( + prompt_tokens=100_000, + completion_tokens=50_000, + cache_creation_prompt_tokens=1_000_000, + cache_read_prompt_tokens=500_000, + ) ) cache_creation_cost = next(c for c in consumptions if "cache_creation_prompt_tokens_cost" in c.kind) @@ -56,13 +56,13 @@ def test_sonnet_cache_cost_calculation(self): sonnet_versions = ["claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022"] for version in sonnet_versions: - consumptions = AnthropicConsumptionCalculator(version).get_cache_cost_consumptions( - { - "cache_creation_prompt_tokens": 1_000_000, - "cache_read_prompt_tokens": 500_000, - "prompt_tokens": 100_000, - "completion_tokens": 50_000, - } + consumptions = AnthropicConsumptionCalculator(version).get_anthropic_cost_consumptions( + AnthropicUsage( + prompt_tokens=100_000, + completion_tokens=50_000, + cache_creation_prompt_tokens=1_000_000, + cache_read_prompt_tokens=500_000, + ) ) cache_creation_cost = next(c for c in consumptions if "cache_creation_prompt_tokens_cost" in c.kind) @@ -83,13 +83,13 @@ def test_opus_cost_calculation(self): self.assertEqual(completion_cost, 3.75) # $75.00 * 0.05 def test_opus_cache_cost_calculation(self): - consumptions = AnthropicConsumptionCalculator("claude-3-opus-20240229").get_cache_cost_consumptions( - { - "cache_creation_prompt_tokens": 1_000_000, - "cache_read_prompt_tokens": 500_000, - "prompt_tokens": 100_000, - "completion_tokens": 50_000, - } + consumptions = AnthropicConsumptionCalculator("claude-3-opus-20240229").get_anthropic_cost_consumptions( + AnthropicUsage( + prompt_tokens=100_000, + 
completion_tokens=50_000, + cache_creation_prompt_tokens=1_000_000, + cache_read_prompt_tokens=500_000, + ) ) cache_creation_cost = next(c for c in consumptions if "cache_creation_prompt_tokens_cost" in c.kind) @@ -103,13 +103,13 @@ def test_invalid_model(self): def test_invalid_model_cache_costs(self): # doesn't support caching - consumptions = AnthropicConsumptionCalculator("claude-3-sonnet-20240229").get_cache_cost_consumptions( - { - "cache_creation_prompt_tokens": 1_000_000, - "cache_read_prompt_tokens": 500_000, - "prompt_tokens": 100_000, - "completion_tokens": 50_000, - } + consumptions = AnthropicConsumptionCalculator("claude-3-sonnet-20240229").get_anthropic_cost_consumptions( + AnthropicUsage( + prompt_tokens=100_000, + completion_tokens=50_000, + cache_creation_prompt_tokens=1_000_000, + cache_read_prompt_tokens=500_000, + ) ) self.assertEqual(len(consumptions), 0) @@ -117,7 +117,7 @@ def test_invalid_model_cache_costs(self): def test_consumption_units_and_types(self): model = "claude-3-haiku-20240307" calculator = AnthropicConsumptionCalculator(model) - consumptions = calculator.get_cost_consumptions(1_000, 1_000) + consumptions = calculator.get_cost_consumptions(prompt_tokens=1_000, completion_tokens=1_000) for consumption in consumptions: self.assertEqual(consumption.unit, "USD") @@ -125,13 +125,10 @@ def test_consumption_units_and_types(self): def test_cache_consumption_units_and_types(self): model = "claude-3-5-sonnet-20241022" - consumptions = AnthropicConsumptionCalculator(model).get_cache_cost_consumptions( - { - "cache_creation_prompt_tokens": 1000, - "cache_read_prompt_tokens": 500, - "prompt_tokens": 100, - "completion_tokens": 50, - } + consumptions = AnthropicConsumptionCalculator(model).get_anthropic_cost_consumptions( + AnthropicUsage( + prompt_tokens=100, completion_tokens=50, cache_creation_prompt_tokens=1000, cache_read_prompt_tokens=500 + ) ) for consumption in consumptions: @@ -269,7 +266,7 @@ def test_invalid_models(self): self.assertIsNone(OpenAIConsumptionCalculator("gpt-4-invalid").find_model_costs()) def test_cached_tokens_cost_calculations(self): - usage = Usage( + usage = OpenAIUsage( completion_tokens=0, prompt_tokens=500_000, total_tokens=1_500_000, @@ -289,7 +286,7 @@ def test_cached_tokens_cost_calculations(self): self.assertEqual(prompt_cost, 1.25) def test_reasoning_tokens_cost_calculations(self): - usage = Usage( + usage = OpenAIUsage( completion_tokens=1_000_000, prompt_tokens=0, total_tokens=2_000_000,
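
A minimal usage sketch of the pieces introduced above (the YAML-backed LLMCostManagerObject, TokenKind, duration-based Consumption and DurationManager), assuming only the public exports this diff adds to council.llm; the model name and token counts are illustrative placeholders, not part of the change.

# Illustrative sketch only: exercising the cost manager, TokenKind formatting
# and the new duration consumption added in this change.
from council.contexts import Consumption
from council.llm import LLMCostManagerObject, TokenKind
from council.utils.utils import DurationManager

# Load the bundled Anthropic cost file and look up one model's LLMCostCard.
cost_manager = LLMCostManagerObject.anthropic()
cost_card = cost_manager.get_cost_map("default")["claude-3-haiku-20240307"]
print(cost_card)  # "$0.25/$1.25 per 1m tokens"

# Time a block the same way the LLM clients now wrap their API calls.
with DurationManager() as timer:
    prompt_tokens, completion_tokens = 1_000, 200  # placeholder token counts

# Build consumptions the way the calculators do: call, duration, tokens, cost.
model = "claude-3-haiku-20240307"
consumptions = [
    Consumption.call(1, model),
    Consumption.duration(timer.duration, model),
    Consumption.token(prompt_tokens, f"{model}:{TokenKind.prompt.value}_tokens"),
    Consumption.token(completion_tokens, f"{model}:{TokenKind.completion.value}_tokens"),
    Consumption.cost(cost_card.input_cost(prompt_tokens), f"{model}:{TokenKind.prompt.value}_tokens_cost"),
]
for consumption in consumptions:
    print(consumption)

In practice these consumptions are produced automatically by the provider-specific calculators (for example AnthropicConsumptionCalculator.get_anthropic_consumptions or OpenAIConsumptionCalculator.get_openai_consumptions); the sketch only shows what the resulting units and kinds look like.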