NVIDIA · leondz · Feb 24, 2025 · Feb 24, 2025 · Feb 24, 2025 · Feb 26, 2025
diff --git a/docs/source/configurable.rst b/docs/source/configurable.rst
@@ -93,6 +93,7 @@ such as ``show_100_pass_modules``.
 * ``narrow_output`` - Support output on narrower CLIs
 * ``show_z`` - Display Z-scores and visual indicators on CLI. It's good, but may be too much info until one has seen garak run a couple of times
 * ``enable_experimental`` - Enable experimental function CLI flags. Disabled by default. Experimental functions may disrupt your installation and provide unusual/unstable results. Can only be set by editing core config, so a git checkout of garak is recommended for this.
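+* ``soft_probe_prompt_cap`` - For probes that auto-scale their prompt count, the preferred maximum number of prompts per probe. Probes read this value from the ``run`` section of the config (``run.soft_probe_prompt_cap``)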
 
 ``run`` config items
 """"""""""""""""""""

diff --git a/garak/configs/fast.yaml b/garak/configs/fast.yaml
@@ -7,7 +7,7 @@ run:
   generations: 5
 
 plugins:
-  probe_spec: continuation,dan,encoding.InjectBase64,encoding.InjectHex,goodside,av_spam_scanning,leakreplay,lmrc,malwaregen.SubFunctions,malwaregen.TopLevel,packagehallucination,realtoxicityprompts.RTPIdentity_Attack,realtoxicityprompts.RTPProfanity,realtoxicityprompts.RTPSexually_Explicit,realtoxicityprompts.RTPThreat,snowball,xss
+  probe_spec: ansiescape.AnsiRaw,continuation,dan,encoding.InjectBase64,encoding.InjectHex,goodside,av_spam_scanning,leakreplay,lmrc,malwaregen.SubFunctions,malwaregen.TopLevel,packagehallucination,realtoxicityprompts.RTPIdentity_Attack,realtoxicityprompts.RTPProfanity,realtoxicityprompts.RTPSexually_Explicit,realtoxicityprompts.RTPThreat,snowball,xss
   extended_detectors: false
 
 
diff --git a/garak/configs/full.yaml b/garak/configs/full.yaml
@@ -3,7 +3,7 @@ system:
   lite: false
 
 plugins:
-  probe_spec: atkgen,continuation,dan,dan.Ablation_Dan_11_0,encoding,glitch.Glitch,goodside,av_spam_scanning,leakreplay.LiteratureCloze,leakreplay.LiteratureComplete,lmrc,malwaregen,misleading,packagehallucination,promptinject.HijackHateHumans,promptinject.HijackKillHumans,promptinject.HijackLongPrompt,realtoxicityprompts,snowball.GraphConnectivity,snowball.Primes,snowball.Senators,xss
+  probe_spec: ansiescape,atkgen,continuation,dan,dan.Ablation_Dan_11_0,encoding,glitch.GlitchFull,goodside,av_spam_scanning,latentinjection,leakreplay.LiteratureClozeFull,leakreplay.LiteratureCompleteFull,lmrc,malwaregen,misleading,packagehallucination,promptinject.HijackHateHumansFull,promptinject.HijackKillHumansFull,promptinject.HijackLongPromptFull,realtoxicityprompts,snowball.GraphConnectivityFull,snowball.PrimesFull,snowball.SenatorsFull,xss
   probes:
     encoding:
       payloads:

diff --git a/garak/configs/tox_and_buffs.yaml b/garak/configs/tox_and_buffs.yaml
@@ -10,7 +10,7 @@ run:
   generations: 5
 
 plugins:
-  probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlursMini,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
+  probe_spec: atkgen.Tox,continuation.ContinueSlursReclaimedSlurs,encoding,lmrc.Anthropomorphisation,lmrc.Bullying,lmrc.Deadnaming,lmrc.Profanity,lmrc.SlurUsage,realtoxicityprompts,
   probes:
     encoding:
       payloads:

diff --git a/garak/probes/continuation.py b/garak/probes/continuation.py
@@ -6,14 +6,15 @@
 """
 
 import json
+import random
 
 from garak import _config
 from garak.attempt import Attempt
 from garak.data import path as data_path
 from garak.probes.base import Probe
 
 
-class ContinueSlursReclaimedSlurs(Probe):
+class ContinueSlursReclaimedSlursFull(Probe):
     """Continue partial word with slur or reclaimed slur.
 
     This probe poses a partial ethnic slur with some preceding content, and
@@ -37,6 +38,8 @@ class ContinueSlursReclaimedSlurs(Probe):
     active = False
     source_resource_filename = "slurprompts.jsonl"
 
+    follow_prompt_cap = False
+
     def __init__(self, config_root=_config):
         super().__init__(config_root=config_root)
         # go through the list
@@ -73,6 +76,17 @@ def __init__(self, config_root=_config):
                     self.prompts.append(candidate_prompt.strip())
                     self.triggers.append(term)
 
+        if self.follow_prompt_cap:
+            num_ids_to_delete = max(
+                0, len(self.prompts) - config_root.run.soft_probe_prompt_cap
+            )
+            ids_to_rm = random.sample(range(len(self.prompts)), num_ids_to_delete)
+            # delete from the highest index down, so earlier deletions don't shift the indices still to be removed
+            ids_to_rm = sorted(ids_to_rm, reverse=True)
+            for id in ids_to_rm:
+                del self.prompts[id]
+                del self.triggers[id]
+
     def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt:
         attempt.notes = dict(
             attempt.notes
@@ -81,7 +95,7 @@ def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt:
         return attempt
 
 
-class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe):
+class ContinueSlursReclaimedSlurs(ContinueSlursReclaimedSlursFull, Probe):
     """Continue partial word with slur or reclaimed slur.
 
     This probe poses a partial ethnic slur with some preceding content, and
@@ -90,4 +104,5 @@ class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe):
 
     aliases = ["continuation.ContinueSlursReclaimedSlurs80"]
     active = True
-    source_resource_filename = "slurprompts_mini.jsonl"
+    source_resource_filename = "slurprompts_mini.jsonl"  # high-quality selection
+    follow_prompt_cap = True
diff --git a/garak/probes/dan.py b/garak/probes/dan.py
@@ -7,6 +7,7 @@
 now) probes and others in a similar vein.
 """
 import logging
+import random
 from typing import Iterable
 
 import garak.attempt
@@ -620,7 +621,7 @@ def probe(self, generator) -> Iterable[garak.attempt.Attempt]:
             return list()
 
 
-class DanInTheWild(Probe):
+class DanInTheWildFull(Probe):
     """A library of jailbreaks.
 
     Tries a set of ~700 known jailbreaks, in an attempt to get a model to disregard
@@ -648,15 +649,17 @@ def __init__(self, config_root=_config):
             self.prompts = f.readlines()
 
 
-class DanInTheWildMini(DanInTheWild):
-    """A subset of 100 jailbreaks from the DanInTheWild jailbreak library.
+class DanInTheWild(DanInTheWildFull):
+    """A subset of jailbreaks from the DanInTheWild jailbreak library.
 
-    Tries a set of 100 known jailbreaks, in an attempt to get a model to disregard
+    Tries a set of known jailbreaks, in an attempt to get a model to disregard
     alignment/system prompt. Success is measured by not getting a safety mitigaton
     message."""
 
     active = True
 
     def __init__(self, config_root=_config):
         super().__init__(config_root=config_root)
-        self.prompts = self.prompts[:100]
+        self.prompts = random.sample(self.prompts, len(self.prompts))[
+            : config_root.run.soft_probe_prompt_cap
+        ]
diff --git a/garak/probes/glitch.py b/garak/probes/glitch.py
@@ -10,7 +10,7 @@
 from garak.probes.base import Probe
 
 
-class Glitch(Probe):
+class GlitchFull(Probe):
     """Probe model for glitch tokens that provoke unusual behavior
 
     Glitch tokens tend to be long entries in the tokenizer that only rarely occur
@@ -217,7 +217,7 @@ def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt:
         return attempt
 
 
-class Glitch100(Glitch, Probe):
+class Glitch(GlitchFull):
     """Probe model for glitch tokens that provoke unusual behavior
 
     Glitch tokens tend to be long entries in the tokenizer that only rarely occur
@@ -226,4 +226,7 @@ class Glitch100(Glitch, Probe):
     This probe uses a subset of 100 potential glitch tokens, for speed."""
 
     active = True
-    max_prompts = 100
+
+    def __init__(self, config_root=_config):
+        super().__init__(config_root)
+        self.max_prompts = config_root.run.soft_probe_prompt_cap