|
| 1 | +from typing import List |
| 2 | +from collections import defaultdict |
| 3 | +from data.Finding import Finding |
| 4 | +from utils.token_utils import fits_in_context |
| 5 | + |
| 6 | + |
class FindingBatcher:
    """Split findings into batches whose rendered text fits the LLM's context.

    Batching first tries the whole list, then groups recursively by the
    category attributes listed in ``category_attributes`` (one attribute per
    recursion depth), and finally falls back to a greedy sequential split.
    Whenever a batch only fits without solutions, solution-free copies of the
    findings are used for that batch.
    """

    def __init__(self, llm_service):
        """Store the LLM service and the ordered grouping attributes.

        Args:
            llm_service: Passed unchanged to ``fits_in_context`` for every
                size check.
        """
        self.llm_service = llm_service
        # Order matters: _recursive_batch groups by one attribute per depth
        # level, in exactly this sequence.
        self.category_attributes = [
            'security_aspect', 'affected_component', 'technology_stack',
            'remediation_type', 'severity_level', 'compliance', 'environment'
        ]

    def create_batches(self, findings: List[Finding]) -> List[List[Finding]]:
        """Create batches of findings that fit within the LLM's context.

        Args:
            findings: All findings to distribute.

        Returns:
            A list of batches. If everything fits at once, a single batch is
            returned (the input list itself, or solution-free copies of it);
            otherwise the result of category-based recursive batching.
        """
        fitted = self._fit_batch(findings)
        if fitted is not None:
            return [fitted]
        return self._recursive_batch(findings)

    def _fit_batch(self, findings: List[Finding]):
        """Return a variant of ``findings`` that fits the context, or ``None``.

        Returns:
            ``findings`` itself when it fits including solutions,
            solution-free copies when it fits only without solutions, and
            ``None`` when it does not fit either way.
        """
        if self._fits_in_context(findings, include_solution=True):
            return findings
        if self._fits_in_context(findings, include_solution=False):
            return self._strip_solutions(findings)
        return None

    def _recursive_batch(self, findings: List[Finding], depth: int = 0) -> List[List[Finding]]:
        """Recursively batch findings based on category attributes.

        Args:
            findings: Findings that did not fit as a single batch.
            depth: Index into ``category_attributes`` of the attribute to
                group by at this level.

        Returns:
            The batches produced for all groups at this level and below.
            Groups that contain exactly one finding are dropped.
        """
        if depth >= len(self.category_attributes):
            # Every attribute has been used; fall back to a sequential split.
            return self._final_split(findings)

        grouped = self._group_by_attribute(findings, self.category_attributes[depth])
        batches = []

        for group in grouped.values():
            if len(group) == 1:
                # Single findings are not useful for *aggregated* solutions.
                continue
            fitted = self._fit_batch(group)
            if fitted is not None:
                batches.append(fitted)
            else:
                # Still too large: refine the group by the next attribute.
                batches.extend(self._recursive_batch(group, depth + 1))

        return batches

    def _group_by_attribute(self, findings: List[Finding], attribute: str) -> dict:
        """Group findings by a specific category attribute.

        Args:
            findings: Findings to group.
            attribute: Name of the attribute to read from ``finding.category``.

        Returns:
            A mapping from group key to the findings in that group, in input
            order. The key is the attribute's ``.value`` when it has one,
            otherwise the attribute value itself. Findings with no category,
            or whose attribute is missing or falsy, go under ``'unknown'``.
        """
        grouped = defaultdict(list)
        for finding in findings:
            category = finding.category
            # Three-argument getattr: a category lacking this attribute is
            # treated as 'unknown' instead of raising AttributeError.
            attr_value = getattr(category, attribute, None) if category else None
            if attr_value:
                key = getattr(attr_value, 'value', attr_value)
            else:
                key = 'unknown'
            grouped[key].append(finding)
        return grouped

    def _final_split(self, findings: List[Finding]) -> List[List[Finding]]:
        """Split findings when all category attributes have been exhausted.

        Findings are packed greedily in input order: each finding is added to
        the current batch if the batch still fits (with solutions, or failing
        that without them); otherwise the current batch is closed and the
        finding starts a new one.

        Returns:
            Non-empty batches only. A finding that does not fit even on its
            own and without its solution is still emitted, unchanged, as a
            single-finding batch.
        """
        batches = []
        current_batch = []

        for finding in findings:
            fitted = self._fit_batch(current_batch + [finding])
            if fitted is not None:
                current_batch = fitted
                continue

            # Adding this finding exceeds the context: close the current
            # batch (never emit an empty one) and start a new batch.
            if current_batch:
                batches.append(current_batch)
            # Fit-check the seed too, so a batch that ends up holding only
            # this finding is stripped when it fits only without a solution.
            seed = self._fit_batch([finding])
            current_batch = [finding] if seed is None else seed

        if current_batch:
            batches.append(current_batch)

        return batches

    def _fits_in_context(self, findings: List[Finding], include_solution: bool) -> bool:
        """Check if a list of findings fits within the LLM's context.

        The findings are rendered with ``_finding_to_string``, joined with
        newlines, and the result is handed to ``fits_in_context`` together
        with the LLM service.
        """
        content = "\n".join(self._finding_to_string(f, include_solution) for f in findings)
        return fits_in_context(content, self.llm_service)

    def _finding_to_string(self, finding: Finding, include_solution: bool) -> str:
        """Convert a finding to a string representation.

        Returns:
            ``"Description: <description>"``, followed by a
            ``"Solution: <short_description>"`` line when ``include_solution``
            is true and the finding has a solution.
        """
        content = f"Description: {finding.description}"
        if include_solution and finding.solution:
            content += f"\nSolution: {finding.solution.short_description}"
        return content

    def _strip_solutions(self, findings: List[Finding]) -> List[Finding]:
        """Return new findings equal to the inputs but with ``solution=None``.

        Each copy is rebuilt from ``f.dict()``; the input findings themselves
        are not modified.
        """
        return [Finding(**{**f.dict(), 'solution': None}) for f in findings]
0 commit comments