
Commit

Merge branch 'main' into main
matchcase authored Feb 13, 2025
2 parents bb192b1 + 3da6f22 commit f6641de
Showing 10 changed files with 196 additions and 80 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "langcheck"
version = "0.9.0"
version = "0.9.0.dev1"
description = "Simple, Pythonic building blocks to evaluate LLM-based applications"
readme = "README.md"
authors = [{ name = "Citadel AI", email = "[email protected]" }]
2 changes: 1 addition & 1 deletion src/langcheck/__init__.py
@@ -1,4 +1,4 @@
from langcheck import augment, metrics, plot, utils

__all__ = ["augment", "metrics", "plot", "utils"]
__version__ = "0.9.0"
__version__ = "0.9.0.dev1"
25 changes: 23 additions & 2 deletions src/langcheck/metrics/eval_clients/_anthropic.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import asyncio
import warnings
from collections.abc import Iterable
from typing import Any

@@ -21,6 +22,7 @@ def __init__(
anthropic_args: dict[str, Any] | None = None,
*,
use_async: bool = False,
system_prompt: str | None = None,
):
"""
Initialize the Anthropic evaluation client. The authentication
@@ -32,6 +34,8 @@ def __init__(
anthropic_args: (Optional) dict of additional args to pass in to
the ``client.messages.create`` function
use_async: (Optional) If True, the async client will be used.
system_prompt: (Optional) The system prompt to use. If not provided,
no system prompt will be used.
"""
if anthropic_client:
self._client = anthropic_client
@@ -42,12 +46,20 @@ def __init__(

self._anthropic_args = anthropic_args or {}
self._use_async = use_async
self._system_prompt = system_prompt

if system_prompt and "system" in self._anthropic_args:
warnings.warn(
'"system" of anthropic_args will be ignored because '
"system_prompt is provided."
)

def _call_api(
self,
prompts: Iterable[str | None],
config: dict[str, Any],
*,
system_prompt: str | None = None,
tqdm_description: str | None = None,
) -> list[Any]:
# A helper function to call the API with exception filter for alignment
@@ -60,8 +72,14 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
except Exception as e:
return e

if system_prompt:
config["system"] = system_prompt

model_inputs = [
{"messages": [{"role": "user", "content": prompt}], **config}
{
"messages": [{"role": "user", "content": prompt}],
**config,
}
for prompt in prompts
]

@@ -121,7 +139,10 @@ def get_text_responses(
config.update(self._anthropic_args or {})
tqdm_description = tqdm_description or "Intermediate assessments (1/2)"
responses = self._call_api(
prompts=prompts, config=config, tqdm_description=tqdm_description
prompts=prompts,
config=config,
tqdm_description=tqdm_description,
system_prompt=self._system_prompt,
)
response_texts = [
response.content[0].text if response else None
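
Note: a minimal usage sketch of the new system_prompt option for the Anthropic client follows. The exported class name AnthropicEvalClient, the no-argument constructor that reads ANTHROPIC_API_KEY from the environment, and the single-argument get_text_responses call are assumptions about the surrounding library, not part of this diff.

# Hypothetical sketch: pass a system prompt once at construction time and it
# is forwarded as the "system" field of every client.messages.create call.
from langcheck.metrics.eval_clients import AnthropicEvalClient

client = AnthropicEvalClient(
    system_prompt="You are a strict, concise evaluator.",
)

responses = client.get_text_responses(
    ["Summarize the main risk of prompt injection in one sentence."]
)
print(responses)

If anthropic_args already contains a "system" entry, the constructor now warns that it will be ignored in favor of system_prompt.
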
28 changes: 24 additions & 4 deletions src/langcheck/metrics/eval_clients/_gemini.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import os
import warnings
from collections.abc import Iterable
from typing import Any

@@ -26,6 +27,8 @@ def __init__(
model_args: dict[str, Any] | None = None,
generate_content_args: dict[str, Any] | None = None,
embed_model_name: str | None = None,
*,
system_prompt: str | None = None,
):
"""
Initialize the Gemini evaluation client. The authentication
@@ -47,19 +50,32 @@ def __init__(
``generate_content`` function.
embed_model_name: (Optional) The name of the embedding model to use.
If not provided, the models/embedding-001 model will be used.
system_prompt: (Optional) The system prompt to use. If not provided,
no system prompt will be used.
"""
if model:
self._model = model
self._text_response_model = model
self._structured_assessment_model = model
else:
configure(api_key=os.getenv("GOOGLE_API_KEY"))
model_args = model_args or {}
self._model = GenerativeModel(**model_args)
self._structured_assessment_model = GenerativeModel(**model_args)
# Only add system prompt to the text response model if it is provided
if system_prompt:
if "system_instruction" in model_args:
warnings.warn(
'"system_instruction" of model_args will be ignored because '
"system_prompt is provided."
)
model_args["system_instruction"] = system_prompt
self._text_response_model = GenerativeModel(**model_args)

self._generate_content_args = generate_content_args or {}
self._embed_model_name = embed_model_name

def _call_api(
self,
model: GenerativeModel,
prompts: Iterable[str | None],
config: dict[str, Any],
*,
@@ -69,7 +85,7 @@ def _call_api(
# of exception handling with the async version.
def _call_api_with_exception_filter(prompt: str) -> Any:
try:
return self._model.generate_content(prompt, **config)
return model.generate_content(prompt, **config)
except Exception as e:
return e

@@ -109,7 +125,10 @@ def get_text_responses(
config.update(self._generate_content_args or {})
tqdm_description = tqdm_description or "Intermediate assessments (1/2)"
responses = self._call_api(
prompts=prompts, config=config, tqdm_description=tqdm_description
model=self._text_response_model,
prompts=prompts,
config=config,
tqdm_description=tqdm_description,
)
response_texts = [
response.text if response else None for response in responses
@@ -189,6 +208,7 @@ def get_float_score(

tqdm_description = tqdm_description or "Scores (2/2)"
responses = self._call_api(
model=self._structured_assessment_model,
prompts=fn_call_messages,
config=config_structured_assessments,
tqdm_description=tqdm_description,
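
Note: a sketch of how the two Gemini models are meant to be used follows. The class name GeminiEvalClient and the GOOGLE_API_KEY environment variable are assumptions; what is grounded in the diff is that only the text-response model is built with system_instruction, while the structured-assessment model used for scoring is not.

# Hypothetical sketch, assuming GeminiEvalClient is the exported wrapper and
# GOOGLE_API_KEY is set in the environment.
from langcheck.metrics.eval_clients import GeminiEvalClient

client = GeminiEvalClient(
    system_prompt="Answer as a terse evaluation assistant.",
)

# Served by the model created with system_instruction=system_prompt.
responses = client.get_text_responses(["Is the sky blue? Answer yes or no."])
print(responses)

# get_float_score keeps using the plain structured-assessment model, so the
# system prompt does not influence the structured scoring step.
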
16 changes: 13 additions & 3 deletions src/langcheck/metrics/eval_clients/_llama.py
@@ -32,6 +32,8 @@ def __init__(
torch_dtype: str = "bfloat16",
tensor_parallel_size: int = 1,
device: str = "cuda",
*,
system_prompt: str | None = None,
):
"""
Initialize the Llama evaluation client.
@@ -42,6 +44,8 @@ def __init__(
tensor_parallel_size: The number of GPUs to use for distributed
execution with tensor parallelism.
device: The device to load the model on.
system_prompt: The system prompt to use. If not provided, default
system prompts based on the language will be used.
"""
self._model = LLM(
model=model_name,
@@ -58,7 +62,8 @@ def __init__(
stop="<|eot_id|>",
skip_special_tokens=True,
)
self._system_prompts = {
self._system_prompt = system_prompt
self._default_system_prompts = {
"en": "You are a helpful and competent assistant.",
"ja": "あなたは誠実で優秀な日本人のアシスタントです。以下は、タスクを説明する指示です。要求を適切に満たす応答を日本語で書きなさい。",
}
@@ -80,11 +85,16 @@ def get_text_responses(
if language not in ["en", "ja"]:
raise ValueError(f"Unsupported language: {language}")

if self._system_prompt is None:
system_prompt = self._default_system_prompts[language]
else:
system_prompt = self._system_prompt

messages = [
[
{
"role": "system",
"content": self._system_prompts[language],
"content": system_prompt,
},
{
"role": "user",
@@ -157,7 +167,7 @@ def get_float_score(
[
{
"role": "system",
"content": self._system_prompts[language],
"content": self._default_system_prompts[language],
},
{
"role": "user",
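
Note: the Llama client keeps per-language defaults and only overrides them in get_text_responses when a system_prompt is supplied; get_float_score still uses the defaults. The snippet below reproduces that fallback logic outside the client so it can run without vLLM or a GPU; the standalone function name is illustrative only.

# Standalone sketch of the fallback introduced above; the dictionary mirrors
# the _default_system_prompts mapping from the diff.
from __future__ import annotations

DEFAULT_SYSTEM_PROMPTS = {
    "en": "You are a helpful and competent assistant.",
    "ja": "あなたは誠実で優秀な日本人のアシスタントです。以下は、タスクを説明する指示です。要求を適切に満たす応答を日本語で書きなさい。",
}

def resolve_system_prompt(system_prompt: str | None, language: str) -> str:
    """Return the user-supplied prompt if given, else the per-language default."""
    if language not in DEFAULT_SYSTEM_PROMPTS:
        raise ValueError(f"Unsupported language: {language}")
    if system_prompt is not None:
        return system_prompt
    return DEFAULT_SYSTEM_PROMPTS[language]

print(resolve_system_prompt(None, "en"))         # default English prompt
print(resolve_system_prompt("Be terse.", "ja"))  # explicit prompt wins
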
60 changes: 39 additions & 21 deletions src/langcheck/metrics/eval_clients/_openai.py
@@ -26,6 +26,7 @@ def __init__(
openai_args: dict[str, str] | None = None,
*,
use_async: bool = False,
system_prompt: str | None = None,
):
"""
Initialize the OpenAI evaluation client.
@@ -35,6 +36,8 @@ def __init__(
openai_args: (Optional) dict of additional args to pass in to the
``client.chat.completions.create`` function.
use_async: (Optional) If True, the async client will be used.
system_prompt: (Optional) The system prompt to use. If not provided,
no system prompt will be used.
"""
if openai_client:
self._client = openai_client
@@ -45,13 +48,15 @@ def __init__(

self._openai_args = openai_args
self._use_async = use_async
self._system_prompt = system_prompt

def _call_api(
self,
prompts: Iterable[str | None],
config: dict[str, str],
*,
tqdm_description: str | None = None,
system_prompt: str | None = None,
) -> list[Any]:
# A helper function to call the API with exception filter for alignment
# of exception handling with the async version.
@@ -63,10 +68,15 @@ def _call_api_with_exception_filter(model_input: dict[str, Any]) -> Any:
except Exception as e:
return e

system_message = []
if system_prompt:
system_message.append({"role": "system", "content": system_prompt})

# Call API with different seed values for each prompt.
model_inputs = [
{
"messages": [{"role": "user", "content": prompt}],
"messages": system_message
+ [{"role": "user", "content": prompt}],
"seed": i,
**config,
}
@@ -131,6 +141,7 @@ def get_text_responses(
prompts=prompts,
config=config,
tqdm_description=tqdm_description,
system_prompt=self._system_prompt,
)
response_texts = [
response.choices[0].message.content if response else None
@@ -169,7 +180,10 @@ def get_text_responses_with_log_likelihood(
config.update(self._openai_args or {})
tqdm_description = tqdm_description or "Getting log likelihoods"
responses = self._call_api(
prompts=prompts, config=config, tqdm_description=tqdm_description
prompts=prompts,
config=config,
tqdm_description=tqdm_description,
system_prompt=self._system_prompt,
)
response_texts_with_log_likelihood = []
for response in responses:
@@ -328,6 +342,7 @@ def __init__(
openai_args: dict[str, str] | None = None,
*,
use_async: bool = False,
system_prompt: str | None = None,
):
"""
Initialize the Azure OpenAI evaluation client.
@@ -345,6 +360,8 @@ def __init__(
openai_args: (Optional) dict of additional args to pass in to the
``client.chat.completions.create`` function
use_async: (Optional) If True, the async client will be used.
system_prompt: (Optional) The system prompt to use. If not provided,
no system prompt will be used.
"""
assert (
text_model_name is not None or embedding_model_name is not None
@@ -368,6 +385,7 @@ def __init__(
self._text_model_name = text_model_name
self._embedding_model_name = embedding_model_name
self._openai_args = openai_args or {}
self._system_prompt = system_prompt

if self._text_model_name is not None:
self._openai_args["model"] = self._text_model_name
@@ -411,31 +429,31 @@ def __init__(
self._use_async = use_async

async def _async_embed(self, inputs: list[str]) -> CreateEmbeddingResponse:
"""Embed the inputs using the OpenAI API in async mode."""
assert isinstance(self.openai_client, AsyncOpenAI)
if self.openai_args:
responses = await self.openai_client.embeddings.create(
input=inputs, **self.openai_args
)
else:
responses = await self.openai_client.embeddings.create(
input=inputs, model="text-embedding-3-small"
)
return responses
"""Embed the inputs using the OpenAI API in async mode."""
assert isinstance(self.openai_client, AsyncOpenAI)
if self.openai_args:
responses = await self.openai_client.embeddings.create(
input=inputs, **self.openai_args
)
else:
responses = await self.openai_client.embeddings.create(
input=inputs, model="text-embedding-3-small"
)
return responses

def _embed(self, inputs: list[str]) -> torch.Tensor:
"""Embed the inputs using the OpenAI API."""

# TODO: Fix that this async call could be much slower than the sync
# version. https://github.com/citadel-ai/langcheck/issues/160
if self._use_async:
try:
loop = asyncio.get_event_loop()
except RuntimeError: # pragma: py-lt-310
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
embed_response = loop.run_until_complete(self._async_embed(inputs))
embeddings = [item.embedding for item in embed_response.data]
try:
loop = asyncio.get_event_loop()
except RuntimeError: # pragma: py-lt-310
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
embed_response = loop.run_until_complete(self._async_embed(inputs))
embeddings = [item.embedding for item in embed_response.data]
else:
assert isinstance(self.openai_client, OpenAI)

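
Note: a usage sketch for the OpenAI-backed clients follows. The exported names OpenAIEvalClient and AzureOpenAIEvalClient, the OPENAI_API_KEY environment variable, and the "gpt-4o-mini" model string are assumptions; the grounded behaviour from the diff is that the system prompt is prepended as a {"role": "system", ...} message ahead of each user prompt, for both get_text_responses and get_text_responses_with_log_likelihood.

# Hypothetical sketch, assuming OpenAIEvalClient reads OPENAI_API_KEY from
# the environment when no client instance is passed in.
from langcheck.metrics.eval_clients import OpenAIEvalClient

client = OpenAIEvalClient(
    openai_args={"model": "gpt-4o-mini"},
    system_prompt="You are an impartial grader. Keep answers short.",
)

responses = client.get_text_responses(
    ["Does the answer '42' correctly answer the question 'What is 6 x 7?'"]
)
print(responses)
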