feat: added structure for aggregated solution

KaiserRuben · KaiserRuben · commit 730d3dd24323 · 2024-07-15T15:23:05.000+02:00
diff --git a/src/ai/Grouping/FindingBatcher.py b/src/ai/Grouping/FindingBatcher.py
@@ -0,0 +1,90 @@
+from typing import List
+from collections import defaultdict
+from data.Finding import Finding
+from utils.token_utils import fits_in_context
+
+
+class FindingBatcher:
+    def __init__(self, llm_service):
+        self.llm_service = llm_service
+        self.category_attributes = [
+            'security_aspect', 'affected_component', 'technology_stack',
+            'remediation_type', 'severity_level', 'compliance', 'environment'
+        ]
+
+    def create_batches(self, findings: List[Finding]) -> List[List[Finding]]:
+        """Create batches of findings that fit within the LLM's context."""
+        if self._fits_in_context(findings, include_solution=True):
+            return [findings]
+        elif self._fits_in_context(findings, include_solution=False):
+            return [self._strip_solutions(findings)]
+        return self._recursive_batch(findings)
+
+    def _recursive_batch(self, findings: List[Finding], depth: int = 0) -> List[List[Finding]]:
+        """Recursively batch findings based on category attributes."""
+        if depth >= len(self.category_attributes):
+            return self._final_split(findings)
+
+        grouped = self._group_by_attribute(findings, self.category_attributes[depth])
+        batches = []
+
+        for group in grouped.values():
+            if len(group) == 1:
+                # batches.append(group)
+                pass  # Remove single findings, as they are not useful for *aggregated* solutions
+            elif self._fits_in_context(group, include_solution=True):
+                batches.append(group)
+            elif self._fits_in_context(group, include_solution=False):
+                batches.append(self._strip_solutions(group))
+            else:
+                batches.extend(self._recursive_batch(group, depth + 1))
+
+        return batches
+
+    def _group_by_attribute(self, findings: List[Finding], attribute: str) -> dict:
+        """Group findings by a specific category attribute."""
+        grouped = defaultdict(list)
+        for finding in findings:
+            if finding.category and getattr(finding.category, attribute):
+                key = getattr(finding.category, attribute).value
+            else:
+                key = 'unknown'
+            grouped[key].append(finding)
+        return grouped
+
+    def _final_split(self, findings: List[Finding]) -> List[List[Finding]]:
+        """Split findings when all category attributes have been exhausted."""
+        batches = []
+        current_batch = []
+
+        for finding in findings:
+            current_batch.append(finding)
+            if self._fits_in_context(current_batch, include_solution=True):
+                continue
+            elif self._fits_in_context(current_batch, include_solution=False):
+                current_batch = self._strip_solutions(current_batch)
+            else:
+                # If adding this finding exceeds the context, start a new batch
+                batches.append(current_batch[:-1])
+                current_batch = [finding]
+
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
+    def _fits_in_context(self, findings: List[Finding], include_solution: bool) -> bool:
+        """Check if a list of findings fits within the LLM's context."""
+        content = "\n".join(self._finding_to_string(f, include_solution) for f in findings)
+        return fits_in_context(content, self.llm_service)
+
+    def _finding_to_string(self, finding: Finding, include_solution: bool) -> str:
+        """Convert a finding to a string representation."""
+        content = f"Description: {finding.description}"
+        if include_solution and finding.solution:
+            content += f"\nSolution: {finding.solution.short_description}"
+        return content
+
+    def _strip_solutions(self, findings: List[Finding]) -> List[Finding]:
+        """Remove solutions from a list of findings."""
+        return [Finding(**{**f.dict(), 'solution': None}) for f in findings]
diff --git a/src/ai/Grouping/FindingGrouper.py b/src/ai/Grouping/FindingGrouper.py
@@ -0,0 +1,24 @@
+from typing import List
+
+from tqdm import tqdm
+
+from ai.Grouping.FindingBatcher import FindingBatcher
+from ai.LLM.BaseLLMService import BaseLLMService
+from data.AggregatedSolution import AggregatedSolution
+from data.VulnerabilityReport import VulnerabilityReport
+
+
+class FindingGrouper:
+    def __init__(self, vulnerability_report: VulnerabilityReport, llm_service: BaseLLMService):
+        self.vulnerability_report = vulnerability_report
+        self.llm_service = llm_service
+        self.batcher = FindingBatcher(llm_service)
+        self.batches = self.batcher.create_batches(vulnerability_report.get_findings())
+        self.aggregated_solutions: List[AggregatedSolution] = []
+
+    def generate_aggregated_solutions(self):
+        for batch in tqdm(self.batches, desc="Generating Aggregated Solutions"):
+            result_list = self.llm_service.generate_aggregated_solution(batch)
+            for result in result_list:
+                self.aggregated_solutions.append(AggregatedSolution(result[1], result[0], result[2]))
+        self.vulnerability_report.set_aggregated_solutions(self.aggregated_solutions)
diff --git a/src/ai/LLM/BaseLLMService.py b/src/ai/LLM/BaseLLMService.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Dict, Optional, List, Union
+from typing import Dict, Optional, List, Union, Tuple
 import logging
 
 from data.Finding import Finding
@@ -94,6 +94,54 @@ def _get_search_terms_prompt(self, finding: Finding) -> str:
     def _process_search_terms_response(self, response: Dict[str, str], finding: Finding) -> str:
         pass
 
+    def generate_aggregated_solution(self, findings: List[Finding]) -> List[Tuple[str, List[Finding], Dict]]:
+        """
+        Generate an aggregated solution for a group of findings.
+
+        Args:
+            findings (List[Finding]): The findings to generate a solution for.
+
+        Returns:
+            List[Tuple[str, List[Finding], Dict]]: The generated solution, the findings it applies to, and any additional metadata
+        """
+        finding_groups = self._subdivide_finding_group(findings)
+        if len(finding_groups) < 1:
+            return []  # No suitable groups found
+
+        results = []
+
+        for group, meta_info in finding_groups:
+            prompt = self._get_aggregated_solution_prompt(group, meta_info)
+            response = self.generate(prompt)
+            solution = self._process_aggregated_solution_response(response)
+
+            if solution:
+                results.append((solution, group, meta_info))
+
+        return results
+
+
+    def _subdivide_finding_group(self, findings: List[Finding]) -> List[Tuple[List[Finding], Dict]]:
+        prompt = self._get_subdivision_prompt(findings)
+        response = self.generate(prompt)
+        return self._process_subdivision_response(response, findings)
+
+    @abstractmethod
+    def _get_subdivision_prompt(self, findings: List[Finding]) -> str:
+        pass
+
+    @abstractmethod
+    def _process_subdivision_response(self, response: Dict, findings: List[Finding]) -> List[Tuple[List[Finding], Dict]]:
+        pass
+
+    @abstractmethod
+    def _get_aggregated_solution_prompt(self, findings: List[Finding], meta_info: Dict) -> str:
+        pass
+
+    @abstractmethod
+    def _process_aggregated_solution_response(self, response: Dict) -> str:
+        pass
+
     @abstractmethod
     def convert_dict_to_str(self, data) -> str:
         pass
diff --git a/src/ai/LLM/LLMServiceStrategy.py b/src/ai/LLM/LLMServiceStrategy.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Dict, Optional, List, Union
+from typing import Dict, Optional, List, Union, Tuple
 
 from ai.LLM.BaseLLMService import BaseLLMService
 from data.Finding import Finding
@@ -21,6 +21,10 @@ def get_model_name(self) -> str:
         """Get the name of the current LLM model."""
         return self.llm_service.get_model_name()
 
+    def get_context_size(self) -> int:
+        """Get the context size of the current LLM service."""
+        return self.llm_service.get_context_size()
+
     def get_url(self) -> str:
         """Get the URL associated with the current LLM service."""
         return self.llm_service.get_url()
@@ -88,6 +92,18 @@ def get_search_terms(self, finding: Finding) -> str:
         """
         return self.llm_service.get_search_terms(finding)
 
+    def generate_aggregated_solution(self, findings: List[Finding]) ->  List[Tuple[str, List[Finding], Dict]]:
+        """
+        Generate an aggregated solution for a group of findings.
+
+        Args:
+            findings (List[Finding]): The findings to generate a solution for.
+
+        Returns:
+            List[Tuple[str, List[Finding], Dict]]: The generated solution, the findings it applies to, and any additional metadata
+        """
+        return self.llm_service.generate_aggregated_solution(findings)
+
     def convert_dict_to_str(self, data: Dict) -> str:
         """
         Convert a dictionary to a string representation.
diff --git a/src/data/AggregatedSolution.py b/src/data/AggregatedSolution.py
@@ -0,0 +1,28 @@
+from typing import List
+
+from data.Finding import Finding
+from db.base import BaseModel
+
+
+class AggregatedSolution:
+    findings: List[Finding] = None
+    solution: str = ""
+    metadata: dict = {}
+
+    def __init__(self, findings: List[Finding], solution: str, metadata=None):
+        self.findings = findings
+        self.solution = solution
+        self.metadata = metadata
+
+    def __str__(self):
+        return self.solution
+
+    def to_dict(self):
+        return {
+            "findings": [finding.to_dict() for finding in self.findings],
+            "solution": self.solution,
+            "metadata": self.metadata
+        }
+
+    def to_html(self):
+        return f"<p>{self.solution}</p>"
diff --git a/src/data/Categories.py b/src/data/Categories.py
@@ -84,13 +84,13 @@ class Environment(Enum):
 
 
 class Category(BaseModel):
-    technology_stack: Optional[List[TechnologyStack]] = None
-    security_aspect: Optional[List[SecurityAspect]] = None
+    technology_stack: Optional[TechnologyStack] = None
+    security_aspect: Optional[SecurityAspect] = None
     severity_level: Optional[SeverityLevel] = None
-    remediation_type: Optional[List[RemediationType]] = None
-    affected_component: Optional[List[AffectedComponent]] = None
-    compliance: Optional[List[Compliance]] = None
-    environment: Optional[List[Environment]] = None
+    remediation_type: Optional[RemediationType] = None
+    affected_component: Optional[AffectedComponent] = None
+    compliance: Optional[Compliance] = None
+    environment: Optional[Environment] = None
 
     def __str__(self):
         my_str = ""
diff --git a/src/data/VulnerabilityReport.py b/src/data/VulnerabilityReport.py
@@ -1,8 +1,10 @@
 import json
+from typing import List
 
 from tqdm import tqdm
 from random import shuffle
 
+from data.AggregatedSolution import AggregatedSolution
 from data.Finding import Finding
 from ai.LLM.LLMServiceStrategy import LLMServiceStrategy
 from ai.Clustering.AgglomerativeClusterer import AgglomerativeClusterer
@@ -17,6 +19,7 @@
 class VulnerabilityReport:
     def __init__(self):
         self.findings: list[Finding] = []
+        self.aggregated_solutions: List[AggregatedSolution] = []
 
     def set_llm_service(self, llm_service: "LLMServiceStrategy"):
         """
@@ -68,6 +71,12 @@ def add_solution(self, long=True, short=True, search_term=True):
             finding.generate_solution(long, short, search_term)
         return self
 
+    def set_aggregated_solutions(self, aggregated_solutions: List[AggregatedSolution]):
+        self.aggregated_solutions = aggregated_solutions
+
+    def get_aggregated_solutions(self) -> List[AggregatedSolution]:
+        return self.aggregated_solutions
+
     def sort(self, by: str = "severity", reverse: bool = True):
         """
         This function sorts the findings by severity or priority.
@@ -87,13 +96,24 @@ def sort(self, by: str = "severity", reverse: bool = True):
         return self
 
     def to_dict(self):
-        return [f.to_dict() for f in self.findings]
+        findings = [f.to_dict() for f in self.findings]
+        if len(self.get_aggregated_solutions()) > 0:
+            aggregated_solutions = [f.to_dict() for f in self.get_aggregated_solutions()]
+            return {"findings": findings, "aggregated_solutions": aggregated_solutions}
+        return {"findings": findings}
 
     def __str__(self):
-        return "\n\n".join([str(f) for f in self.findings])
+        findings_str = "\n".join([str(f) for f in self.findings])
+        if len(self.get_aggregated_solutions()) > 0:
+            aggregated_solutions_str = "\n".join([str(f) for f in self.get_aggregated_solutions()])
+            return findings_str + "\n\n" + aggregated_solutions_str
+        return findings_str
 
     def to_html(self, table=False):
-        return "".join([f.to_html(table) for f in self.findings])
+        my_str = "<br/>".join([f.to_html(table) for f in self.findings])
+        if len(self.get_aggregated_solutions()) > 0:
+            my_str += "<br/><br/>" + "<br/>".join([f.to_html() for f in self.get_aggregated_solutions()])
+        return my_str
 
     def export_to_json(self, filename="VulnerabilityReport.json"):
         """
diff --git a/src/utils/token_utils.py b/src/utils/token_utils.py
@@ -1,4 +1,6 @@
 import re
+
+from ai.LLM.BaseLLMService import BaseLLMService
 from config import config
 
 
@@ -50,7 +52,7 @@ def estimate_tokens(text):
     return total_tokens
 
 
-def fits_in_context(text):
+def fits_in_context(text, llm_service: BaseLLMService = None):
     """
     Check if the given text fits within the maximum context size.
 
@@ -64,7 +66,7 @@ def fits_in_context(text):
     bool: True if the text fits within the context, False otherwise.
     """
     estimated_tokens = estimate_tokens(text)
-    return estimated_tokens <= config.max_context_length
+    return bool(llm_service) and estimated_tokens <= llm_service.get_context_size()
 
 
 # Example usage
@@ -76,4 +78,4 @@ def fits_in_context(text):
     if fits_in_context(sample_text):
         print("The text fits within the maximum context.")
     else:
-        print("The text exceeds the maximum context size.")
+        print("The text exceeds the maximum context size.")