From a23302cc7ba8c38b69d87ab93fc69b47679afa6e Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Thu, 20 Feb 2025 07:09:01 +0100 Subject: [PATCH] separate out a policy harness, add a hook to let it do its magic --- garak/command.py | 14 +++++++------- garak/harnesses/probewise.py | 30 +++++++++++++++++++----------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/garak/command.py b/garak/command.py index 8ff41ed8..f2feb64c 100644 --- a/garak/command.py +++ b/garak/command.py @@ -239,13 +239,11 @@ def plugin_info(plugin_name): # do a run -def probewise_run(generator, probe_names, evaluator, buffs, policy_run=False): +def probewise_run(generator, probe_names, evaluator, buffs): import garak.harnesses.probewise probewise_h = garak.harnesses.probewise.ProbewiseHarness() - return list( - probewise_h.run(generator, probe_names, evaluator, buffs, policy_run=policy_run) - ) + return list(probewise_h.run(generator, probe_names, evaluator, buffs)) def pxd_run(generator, probe_names, detector_names, evaluator, buffs): @@ -321,9 +319,11 @@ def run_policy_scan(generator, _config): evaluator = garak.evaluators.ThresholdEvaluator(_config.run.eval_threshold) buffs = [] - result = probewise_run( - generator, policy_probe_names, evaluator, buffs, policy_run=True - ) + + import garak.harnesses.probewise + + policy_h = garak.harnesses.probewise.PolicyHarness() + result = list(policy_h.run(generator, policy_probe_names, evaluator, buffs)) policy = garak.policy.Policy() policy.parse_eval_result(result, threshold=_config.policy.threshold) diff --git a/garak/harnesses/probewise.py b/garak/harnesses/probewise.py index 18968684..e52ce0c6 100644 --- a/garak/harnesses/probewise.py +++ b/garak/harnesses/probewise.py @@ -16,6 +16,7 @@ class ProbewiseHarness(Harness): + def _load_detector(self, detector_name: str) -> Detector: detector = _plugins.load_plugin( "detectors." + detector_name, break_on_fail=False @@ -27,7 +28,10 @@ def _load_detector(self, detector_name: str) -> Detector: logging.error(f" detector load failed: {detector_name}, skipping >>") return False - def run(self, model, probenames, evaluator, buff_names=None, policy_run=False): + def _probe_check(self, probe): + return probe + + def run(self, model, probenames, evaluator, buff_names=None): """Execute a probe-by-probe scan Probes are executed in name order. For each probe, the detectors @@ -54,9 +58,6 @@ def run(self, model, probenames, evaluator, buff_names=None, policy_run=False): :type buff_names: List[str] """ - if buff_names is None: - buff_names = [] - if not probenames: msg = "No probes, nothing to do" logging.warning(msg) @@ -64,6 +65,9 @@ def run(self, model, probenames, evaluator, buff_names=None, policy_run=False): print(msg) raise ValueError(msg) + if buff_names is None: + buff_names = [] + self._load_buffs(buff_names) probenames = sorted(probenames) @@ -83,13 +87,7 @@ def run(self, model, probenames, evaluator, buff_names=None, policy_run=False): continue detectors = [] - if ( - policy_run - ): # policy run conditions: probe is policy probe; use different generation count (def. 1) - assert ( - probe.policy_probe == True - ), "only policy probes should be used in policy runs" - setattr(probe, "generations", _config.policy.generations) + probe = self._probe_check(probe) if probe.primary_detector: d = self._load_detector(probe.primary_detector) @@ -116,3 +114,13 @@ def run(self, model, probenames, evaluator, buff_names=None, policy_run=False): result = h._execute(model, [probe], detectors, evaluator) yield list(result) # ensure the generator is executed logging.debug("harness probewise: complete") + + +class PolicyHarness(ProbewiseHarness): + + def _probe_check(self, probe): + assert ( + probe.policy_probe == True + ), "only policy probes should be used in policy runs" + setattr(probe, "generations", _config.policy.generations) + return probe