
Commit

add: csv reader, chart visual
maisiukartyom committed Feb 23, 2025
1 parent 1aed4f8 commit 81a8653
Showing 11 changed files with 253 additions and 27 deletions.
6 changes: 4 additions & 2 deletions .gitignore
@@ -1,4 +1,6 @@
.venv
.env
-__pycache__/
-*.py[cod]
+__pycache__
+*.py[cod]
+.pytest_cache
+.vscode
106 changes: 106 additions & 0 deletions README.md
@@ -0,0 +1,106 @@
# LLM Prompt Evaluation Tool

This tool lets you evaluate how well your LLM responses match an ideal answer by comparing answers to automatically generated questions. You can call the comparison methods manually in your code or pass a CSV file containing multiple test cases. The tool can also visualize your test scores on a chart, with results grouped by `prompt_id`.

## Key Parameters

- **ideal_answer (required):**
The reference or "ideal" answer that your LLM response is compared against.
**Example:**
```
"Blockchain is like a digital ledger that everyone can see but no one can change."
```

- **llm_response (optional):**
Your LLM's response. If a `prompt` is provided in `optional_params`, `llm_response` can be left as `None`.

- **optional_params (optional):**
A JSON-like dictionary that may include extra details for the test. It has the following structure:
- `prompt`: A string with the prompt to use (overrides the prompt that would otherwise be fetched via `prompt_id`).
- `context`: (Optional) Additional context for the prompt.
- `prompt_id`: (Optional) A unique identifier for the prompt to be fetched online from Lamoom Service.
**Example:**
```json
{
  "prompt": "Explain blockchain to a beginner.",
  "context": {},
  "prompt_id": "beginner_blockchain"
}
```

## Using the Tool

### 1. Manual Testing

You can manually call the `compare()` method by passing the required `ideal_answer` and (optionally) `llm_response` and `optional_params`. Each call will automatically accumulate the test results based on the provided (or default) `prompt_id`.

**Example:**

```python
import os

from lamoom_cicd import TestLLMResponse

ideal_answer = (
    "Blockchain is like a digital notebook that everyone can see, but no one can secretly change. "
    "Imagine a shared Google Doc where every change is recorded forever, and no one can edit past entries."
)
optional_params = {
    "prompt": "Explain the concept of blockchain to someone with no technical background."
}

test_response = TestLLMResponse(openai_key=os.environ.get("OPENAI_KEY"))
# When llm_response is not passed, it defaults to None.
result = test_response.compare(ideal_answer, optional_params=optional_params)

# Print individual question details
for question in result.questions:
print(question.to_dict())

# Print overall score details
print(result.score.to_dict())
```

### 2. Testing with CSV

You can also pass multiple test cases using a CSV file. The CSV file should contain the following columns:

- **ideal_answer:** (Required) The ideal answer text.
- **optional_params:** (Optional) A JSON string containing the optional parameters.
- **llm_response:** (Optional) If not provided, this value is set to `None`.

Multiple rows can be included, and you can use different `prompt_id` values to test various prompts.

**Example CSV Content:**

```csv
ideal_answer,optional_params,llm_response
"Blockchain is a secure, immutable digital ledger.","{\"prompt\": \"Explain blockchain simply.\", \"prompt_id\": \"simple_blockchain\"}",
"Blockchain is like a shared Google Doc that records every change.","{\"prompt\": \"Describe blockchain in simple terms.\", \"prompt_id\": \"google_doc_blockchain\"}",
```
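
If you prefer to build the CSV programmatically, the sketch below is one way to do it, assuming the file is later read back with `pandas.read_csv` (as the bundled `parse_csv_file` helper does); standard CSV quoting of the JSON column is sufficient for that parser:

```python
import csv
import json

# Build rows matching the documented columns: ideal_answer, optional_params, llm_response.
rows = [
    {
        "ideal_answer": "Blockchain is a secure, immutable digital ledger.",
        "optional_params": json.dumps({
            "prompt": "Explain blockchain simply.",
            "prompt_id": "simple_blockchain",
        }),
        "llm_response": "",  # left empty so the tool treats it as missing
    },
]

with open("test_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["ideal_answer", "optional_params", "llm_response"])
    writer.writeheader()
    writer.writerows(rows)
```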

**Usage Example:**

```python
csv_file_path = "test_data.csv"
test_response = TestLLMResponse(openai_key=os.environ.get("OPENAI_KEY"))
accumulated_results = test_response.compare_from_csv(csv_file_path)
```
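
Each returned `TestResult` exposes its `prompt_id`, the generated `questions`, and the `score`, so you can print a quick per-prompt summary before plotting anything. A minimal sketch using only those fields:

```python
from collections import defaultdict

# Group score dictionaries by prompt_id for a quick textual summary.
scores_by_prompt = defaultdict(list)
for result in accumulated_results:
    scores_by_prompt[result.prompt_id].append(result.score.to_dict())

for prompt_id, scores in scores_by_prompt.items():
    print(f"{prompt_id}: {scores}")
```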

### 3. Visualizing Test Scores

After running tests (whether manually or via CSV), the results are automatically accumulated on the `TestLLMResponse` instance, grouped by `prompt_id`. To see a chart of the test scores, call the `visualize_test_results()` method.

**Example:**

```python
test_response.visualize_test_results()
```

This method generates a line chart with the x-axis representing the test instance number (as integers) and the y-axis representing the score percentage. Each line on the chart corresponds to a different `prompt_id`, and the chart title shows the passing threshold.

## Summary

- **ideal_answer** is the only required parameter.
- **llm_response** and **optional_params** are optional, with `optional_params` offering extra configuration (like a custom prompt and a unique `prompt_id`).
- You can compare responses either manually or via CSV (which supports multiple test cases).
- The tool accumulates results for each `prompt_id` across multiple calls.
- Use the `visualize_test_results()` method to see your test scores on an easy-to-read chart (see the end-to-end sketch below).
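
A rough end-to-end sketch, assuming `OPENAI_KEY` is set in your environment, `test_data.csv` follows the format above, and manual `compare()` calls accumulate results as described:

```python
import os

from lamoom_cicd import TestLLMResponse

test_response = TestLLMResponse(threshold=70, openai_key=os.environ.get("OPENAI_KEY"))

# Manual test case
test_response.compare(
    "Blockchain is like a digital ledger that everyone can see but no one can change.",
    optional_params={"prompt": "Explain blockchain to a beginner."},
)

# CSV-driven test cases
test_response.compare_from_csv("test_data.csv")

# Plot accumulated scores, grouped by prompt_id
test_response.visualize_test_results()
```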

Enjoy using the tool to refine and evaluate your LLM prompts!
1 change: 1 addition & 0 deletions lamoom_cicd/responses.py
@@ -28,5 +28,6 @@ def to_dict(self):

@dataclass(kw_only=True)
class TestResult:
    prompt_id: str
    questions: list[Question]
    score: Score
63 changes: 59 additions & 4 deletions lamoom_cicd/test_llm_response.py
@@ -2,14 +2,18 @@
import logging
from lamoom import Lamoom, AIModelsBehaviour, AttemptToCall, OpenAIModel, C_128K, PipePrompt
from lamoom.response_parsers.response_parser import get_json_from_response
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from collections import defaultdict
import matplotlib.pyplot as plt

from prompts.prompt_generate_facts import agent as generate_facts_agent
from prompts.prompt_compare_results import agent as compare_results_agent

from lamoom_cicd.responses import Question, TestResult, Score
from lamoom_cicd.exceptions import GenerateFactsException

from lamoom_cicd.utils import parse_csv_file

logger = logging.getLogger(__name__)

default_behaviour = AIModelsBehaviour(
@@ -35,6 +39,10 @@ class TestLLMResponse:
    claude_key: str = None
    nebius_key: str = None

    threshold: int = 70

    accumulated_results: list[TestResult] = field(default_factory=list)

    def get_generated_test(self, statements: list, questions: dict):
        generated_test = {}
        for statement, question in questions.items():
@@ -49,7 +57,7 @@ def get_generated_test(self, statements: list, questions: dict):

        return generated_test

-    def calculate_score(self, test_results: dict, threshold: int = 70) -> Score:
+    def calculate_score(self, test_results: dict, threshold: int) -> Score:
        pass_count = 0
        question_numb = len(test_results.items()) or 1
        for _, values in test_results.items():
@@ -103,6 +111,53 @@ def compare(self, ideal_answer: str,
            for q, v in test_results.items()
        ]

-        score = self.calculate_score(test_results)
+        score = self.calculate_score(test_results, self.threshold)

-        return TestResult(questions=questions_list, score=score)
+        return TestResult(prompt_id=user_prompt.id, questions=questions_list, score=score)


    def compare_from_csv(self, csv_file: str) -> list[TestResult]:
        """
        Reads a CSV file and runs compare() for each row.
        Expected CSV columns: ideal_answer, llm_response, optional_params
        (optional_params should be a valid JSON string if provided).
        Returns a list of test results.
        """
        test_cases = parse_csv_file(csv_file)
        results = []
        logger.info(f"CASES: {test_cases}")
        for row in test_cases:
            ideal_answer = row.get("ideal_answer")
            llm_response = row.get("llm_response")
            # parse_csv_file already returns optional_params as a dict (or None),
            # so no further json.loads is needed here.
            optional_params = row.get("optional_params")
            test_result = self.compare(ideal_answer, llm_response, optional_params)
            self.accumulated_results.append(test_result)
            results.append(test_result)

        return results

    def visualize_test_results(self):
        """
        Plots a line chart of accumulated scores grouped by prompt_id.
        """
        # Group scores by prompt_id.
        groups = defaultdict(list)
        for item in self.accumulated_results:
            prompt_id = item.prompt_id
            score = item.score.score
            groups[prompt_id].append(score)

        plt.figure(figsize=(10, 6))
        max_length = 0
        for prompt_id, scores in groups.items():
            x_values = list(range(1, len(scores) + 1))
            plt.plot(x_values, scores, marker='o', linestyle='-', label=f"Prompt: {prompt_id}")
            max_length = max(max_length, len(scores))

        plt.title(f"LLM Test Scores per Prompt (Passing score = {self.threshold}%)")
        plt.xlabel("Test Instance")
        plt.xticks(range(1, max_length + 1))
        plt.ylabel("Score (%)")
        plt.legend()
        plt.grid(True)
        plt.show()
37 changes: 37 additions & 0 deletions lamoom_cicd/utils.py
@@ -0,0 +1,37 @@
import pandas as pd
import json
import logging

logger = logging.getLogger(__name__)

def parse_csv_file(file_path: str) -> list:
    """
    Reads a CSV file and returns a list of dictionaries with keys:
    - ideal_answer
    - llm_response (None if the cell is empty)
    - optional_params (parsed as a dict if not empty, otherwise None)
    """
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        logger.error(f"Error reading CSV file: {e}")
        return []

    test_cases = []
    for _, row in df.iterrows():
        llm_response = row.get("llm_response")
        case = {
            "ideal_answer": row.get("ideal_answer"),
            # pandas returns NaN for empty cells; normalize to None as documented.
            "llm_response": llm_response if pd.notna(llm_response) else None
        }
        opt_params = row.get("optional_params")
        if pd.notna(opt_params) and opt_params:
            try:
                case["optional_params"] = json.loads(opt_params)
            except json.JSONDecodeError as e:
                logger.error(f"Error parsing optional_params: {e}")
                case["optional_params"] = None
        else:
            case["optional_params"] = None
        test_cases.append(case)

    return test_cases
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -9,6 +9,8 @@ readme = "README.md"
python = "^3.11"
lamoom = "^0.1.33"
dotenv = "^0.9.9"
matplotlib = "^3.10.0"
pandas = "^2.2.3"

[build-system]
requires = ["poetry-core"]
7 changes: 7 additions & 0 deletions settings.json
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
21 changes: 0 additions & 21 deletions test_cicd.py

This file was deleted.

21 changes: 21 additions & 0 deletions tests/test_cicd.py
@@ -0,0 +1,21 @@
from lamoom_cicd import TestLLMResponse
import os
import dotenv

dotenv.load_dotenv(dotenv.find_dotenv())

def test_compare():
ideal_answer = """Blockchain is like a digital notebook that everyone can see
but no one can secretly change. Imagine a shared Google Doc where every change
is recorded forever, and no one can erase or edit past entries.
Instead of one company controlling it, thousands of computers around
the world keep copies, making it nearly impossible to hack or fake.
This is why it’s used for things like Bitcoin—to keep transactions
secure and transparent without needing a bank in the middle."""
optional_params = {'prompt': "Explain the concept of blockchain to someone with no technical background."}

test_response = TestLLMResponse(openai_key=os.environ.get("OPENAI_KEY"))
result = test_response.compare(ideal_answer, optional_params=optional_params)

assert 'score' in result.score.to_dict()
assert 'passed' in result.score.to_dict()
14 changes: 14 additions & 0 deletions tests/test_csv_visualize.py
@@ -0,0 +1,14 @@
from lamoom_cicd import TestLLMResponse
import os
import dotenv
dotenv.load_dotenv(dotenv.find_dotenv())


def test_csv_compare():
    test_response = TestLLMResponse(threshold=70, openai_key=os.environ.get("OPENAI_KEY"))
    results = test_response.compare_from_csv("tests/test_data.csv")

    test_response.visualize_test_results()

    assert 'score' in results[0].score.to_dict()
    assert 'passed' in results[0].score.to_dict()
2 changes: 2 additions & 0 deletions tests/test_data.csv
@@ -0,0 +1,2 @@
ideal_answer,optional_params
"Blockchain is like a digital notebook that everyone can see but no one can secretly change. Imagine a shared Google Doc where every change is recorded forever, and no one can erase or edit past entries. Instead of one company controlling it, thousands of computers around the world keep copies, making it nearly impossible to hack or fake. This is why it’s used for things like Bitcoin—to keep transactions secure and transparent without needing a bank in the middle.", "{\"prompt\": \"Explain blockchain to someone with no technical background.\"}"
