Set max tokens by prompt #255

Merged · 4 commits · Jan 24, 2025
Changes from 2 commits
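
Summary of the diff below: a per-call max_tokens argument is threaded from LLMClient.generate_response through _generate_response_with_retry into each provider client (Anthropic, Groq, OpenAI, generic OpenAI), the library-wide DEFAULT_MAX_TOKENS drops from 16384 to 1024, and edge extraction opts into a larger budget for its own prompt. A minimal usage sketch of the new signature follows; the prompt text and the assumption that Message takes role and content keyword arguments are illustrative, not part of the PR.

from graphiti_core.llm_client.openai_client import OpenAIClient
from graphiti_core.prompts.models import Message


async def example(client: OpenAIClient) -> None:
    messages = [
        Message(role='system', content='You are a helpful assistant.'),  # hypothetical prompt
        Message(role='user', content='Summarize this episode.'),
    ]

    # Default budget: max_tokens falls back to DEFAULT_MAX_TOKENS (now 1024).
    await client.generate_response(messages)

    # Token-heavy prompt: request a larger completion budget for this call only.
    await client.generate_response(messages, max_tokens=16384)
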
7 changes: 5 additions & 2 deletions graphiti_core/llm_client/anthropic_client.py
@@ -48,7 +48,10 @@ def __init__(self, config: LLMConfig | None = None, cache: bool = False):
         )
 
     async def _generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         system_message = messages[0]
         user_messages = [{'role': m.role, 'content': m.content} for m in messages[1:]] + [
@@ -59,7 +62,7 @@ async def _generate_response(
             result = await self.client.messages.create(
                 system='Only include JSON in the response. Do not include any additional text or explanation of the content.\n'
                 + system_message.content,
-                max_tokens=self.max_tokens,
+                max_tokens=max_tokens or self.max_tokens,
                 temperature=self.temperature,
                 messages=user_messages,  # type: ignore
                 model=self.model or DEFAULT_MODEL,
21 changes: 15 additions & 6 deletions graphiti_core/llm_client/client.py
@@ -26,7 +26,7 @@
 from tenacity import retry, retry_if_exception, stop_after_attempt, wait_random_exponential
 
 from ..prompts.models import Message
-from .config import LLMConfig
+from .config import DEFAULT_MAX_TOKENS, LLMConfig
 from .errors import RateLimitError
 
 DEFAULT_TEMPERATURE = 0
@@ -90,16 +90,22 @@ def _clean_input(self, input: str) -> str:
         reraise=True,
     )
     async def _generate_response_with_retry(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         try:
-            return await self._generate_response(messages, response_model)
+            return await self._generate_response(messages, response_model, max_tokens)
         except (httpx.HTTPStatusError, RateLimitError) as e:
             raise e
 
     @abstractmethod
     async def _generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         pass
 
@@ -110,7 +116,10 @@ def _get_cache_key(self, messages: list[Message]) -> str:
         return hashlib.md5(key_str.encode()).hexdigest()
 
     async def generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         if response_model is not None:
             serialized_model = json.dumps(response_model.model_json_schema())
@@ -131,7 +140,7 @@ async def generate_response(
         for message in messages:
             message.content = self._clean_input(message.content)
 
-        response = await self._generate_response_with_retry(messages, response_model)
+        response = await self._generate_response_with_retry(messages, response_model, max_tokens)
 
         if self.cache_enabled:
             self.cache_dir.set(cache_key, response)
2 changes: 1 addition & 1 deletion graphiti_core/llm_client/config.py
@@ -14,7 +14,7 @@
 limitations under the License.
 """
 
-DEFAULT_MAX_TOKENS = 16384
+DEFAULT_MAX_TOKENS = 1024
 DEFAULT_TEMPERATURE = 0
 
 
7 changes: 5 additions & 2 deletions graphiti_core/llm_client/groq_client.py
@@ -45,7 +45,10 @@ def __init__(self, config: LLMConfig | None = None, cache: bool = False):
         self.client = AsyncGroq(api_key=config.api_key)
 
     async def _generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         msgs: list[ChatCompletionMessageParam] = []
         for m in messages:
@@ -58,7 +61,7 @@ async def _generate_response(
                 model=self.model or DEFAULT_MODEL,
                 messages=msgs,
                 temperature=self.temperature,
-                max_tokens=self.max_tokens,
+                max_tokens=max_tokens or self.max_tokens,
                 response_format={'type': 'json_object'},
             )
             result = response.choices[0].message.content or ''
22 changes: 16 additions & 6 deletions graphiti_core/llm_client/openai_client.py
@@ -25,7 +25,7 @@
 
 from ..prompts.models import Message
 from .client import LLMClient
-from .config import LLMConfig
+from .config import DEFAULT_MAX_TOKENS, LLMConfig
 from .errors import RateLimitError, RefusalError
 
 logger = logging.getLogger(__name__)
@@ -58,7 +58,11 @@ class OpenAIClient(LLMClient):
     MAX_RETRIES: ClassVar[int] = 2
 
     def __init__(
-        self, config: LLMConfig | None = None, cache: bool = False, client: typing.Any = None
+        self,
+        config: LLMConfig | None = None,
+        cache: bool = False,
+        client: typing.Any = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ):
         """
         Initialize the OpenAIClient with the provided configuration, cache setting, and client.
@@ -84,7 +88,10 @@ def __init__(
         self.client = client
 
     async def _generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         openai_messages: list[ChatCompletionMessageParam] = []
         for m in messages:
@@ -98,7 +105,7 @@ async def _generate_response(
                 model=self.model or DEFAULT_MODEL,
                 messages=openai_messages,
                 temperature=self.temperature,
-                max_tokens=self.max_tokens,
+                max_tokens=max_tokens or self.max_tokens,
                 response_format=response_model,  # type: ignore
             )
 
@@ -119,14 +126,17 @@ async def _generate_response(
             raise
 
     async def generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         retry_count = 0
         last_error = None
 
         while retry_count <= self.MAX_RETRIES:
             try:
-                response = await self._generate_response(messages, response_model)
+                response = await self._generate_response(messages, response_model, max_tokens)
                 return response
             except (RateLimitError, RefusalError):
                 # These errors should not trigger retries
16 changes: 12 additions & 4 deletions graphiti_core/llm_client/openai_generic_client.py
@@ -26,7 +26,7 @@
 
 from ..prompts.models import Message
 from .client import LLMClient
-from .config import LLMConfig
+from .config import DEFAULT_MAX_TOKENS, LLMConfig
 from .errors import RateLimitError, RefusalError
 
 logger = logging.getLogger(__name__)
@@ -85,7 +85,10 @@ def __init__(
         self.client = client
 
     async def _generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         openai_messages: list[ChatCompletionMessageParam] = []
         for m in messages:
@@ -111,7 +114,10 @@ async def _generate_response(
             raise
 
     async def generate_response(
-        self, messages: list[Message], response_model: type[BaseModel] | None = None
+        self,
+        messages: list[Message],
+        response_model: type[BaseModel] | None = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
     ) -> dict[str, typing.Any]:
         retry_count = 0
         last_error = None
@@ -126,7 +132,9 @@ async def generate_response(
 
         while retry_count <= self.MAX_RETRIES:
             try:
-                response = await self._generate_response(messages, response_model)
+                response = await self._generate_response(
+                    messages, response_model, max_tokens=max_tokens
+                )
                 return response
             except (RateLimitError, RefusalError):
                 # These errors should not trigger retries
6 changes: 5 additions & 1 deletion graphiti_core/utils/maintenance/edge_operations.py
@@ -79,6 +79,8 @@ async def extract_edges(
 ) -> list[EntityEdge]:
     start = time()
 
+    EXTRACT_EDGES_MAX_TOKENS = 16384
+
     node_uuids_by_name_map = {node.name: node.uuid for node in nodes}
 
     # Prepare context for LLM
@@ -93,7 +95,9 @@ async def extract_edges(
     reflexion_iterations = 0
     while facts_missed and reflexion_iterations < MAX_REFLEXION_ITERATIONS:
         llm_response = await llm_client.generate_response(
-            prompt_library.extract_edges.edge(context), response_model=ExtractedEdges
+            prompt_library.extract_edges.edge(context),
+            response_model=ExtractedEdges,
+            max_tokens=EXTRACT_EDGES_MAX_TOKENS,
         )
         edges_data = llm_response.get('edges', [])
 
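
For reference, the max_tokens or self.max_tokens fallback used in the provider clients above means the per-call value wins whenever it is truthy; because the parameter now defaults to DEFAULT_MAX_TOKENS (1024) rather than None, the instance-level setting is only consulted when a caller explicitly passes a falsy value. A minimal standalone sketch of that behavior (function name and values are illustrative, not part of the PR):

# Mirrors the `max_tokens or self.max_tokens` expression in the provider clients.
def effective_max_tokens(call_value: int, instance_value: int) -> int:
    return call_value or instance_value

assert effective_max_tokens(16384, 1024) == 16384  # explicit per-call override wins
assert effective_max_tokens(1024, 8192) == 1024    # the default argument also wins
assert effective_max_tokens(0, 8192) == 8192       # only a falsy value defers to the instance setting
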