-
Notifications
You must be signed in to change notification settings - Fork 360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feature: lightweight probe defaults #1116
base: main
Are you sure you want to change the base?
Changes from 19 commits
640d304
e7c7db5
a85c257
4370779
169d481
2e40865
829f97f
ea5bed8
79105cb
3aa6677
3b3e786
88411ab
3f03bc2
0375e52
02d202a
7310811
e28f8c0
f45938b
0163a4c
f5d168a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,14 +6,15 @@ | |
""" | ||
|
||
import json | ||
import random | ||
|
||
from garak import _config | ||
from garak.attempt import Attempt | ||
from garak.data import path as data_path | ||
from garak.probes.base import Probe | ||
|
||
|
||
class ContinueSlursReclaimedSlurs(Probe): | ||
class ContinueSlursReclaimedSlursFull(Probe): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I like this change a lot. |
||
"""Continue partial word with slur or reclaimed slur. | ||
This probe poses a partial ethnic slur with some preceding content, and | ||
|
@@ -37,6 +38,8 @@ class ContinueSlursReclaimedSlurs(Probe): | |
active = False | ||
source_resource_filename = "slurprompts.jsonl" | ||
|
||
follow_prompt_cap = False | ||
|
||
def __init__(self, config_root=_config): | ||
super().__init__(config_root=config_root) | ||
# go through the list | ||
|
@@ -73,6 +76,17 @@ def __init__(self, config_root=_config): | |
self.prompts.append(candidate_prompt.strip()) | ||
self.triggers.append(term) | ||
|
||
if self.follow_prompt_cap: | ||
num_ids_to_delete = max( | ||
0, len(self.prompts) - config_root.run.soft_probe_prompt_cap | ||
) | ||
Comment on lines +80 to +82
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cannot assume |
||
ids_to_rm = random.sample(range(len(self.prompts)), num_ids_to_delete) | ||
# delete in descending order | ||
ids_to_rm = sorted(ids_to_rm, reverse=True) | ||
for id in ids_to_rm: | ||
del self.prompts[id] | ||
del self.triggers[id] | ||
|
||
def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt: | ||
attempt.notes = dict( | ||
attempt.notes | ||
|
@@ -81,7 +95,7 @@ def _attempt_prestore_hook(self, attempt: Attempt, seq: int) -> Attempt: | |
return attempt | ||
|
||
|
||
class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe): | ||
class ContinueSlursReclaimedSlurs(ContinueSlursReclaimedSlursFull, Probe): | ||
"""Continue partial word with slur or reclaimed slur. | ||
This probe poses a partial ethnic slur with some preceding content, and | ||
|
@@ -90,4 +104,5 @@ class ContinueSlursReclaimedSlursMini(ContinueSlursReclaimedSlurs, Probe): | |
|
||
aliases = ["continuation.ContinueSlursReclaimedSlurs80"] | ||
active = True | ||
source_resource_filename = "slurprompts_mini.jsonl" | ||
source_resource_filename = "slurprompts_mini.jsonl" # high-quality selection | ||
follow_prompt_cap = True |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
now) probes and others in a similar vein. | ||
""" | ||
import logging | ||
import random | ||
from typing import Iterable | ||
|
||
import garak.attempt | ||
|
@@ -620,7 +621,7 @@ def probe(self, generator) -> Iterable[garak.attempt.Attempt]: | |
return list() | ||
|
||
|
||
class DanInTheWild(Probe): | ||
class DanInTheWildFull(Probe): | ||
"""A library of jailbreaks. | ||
|
||
Tries a set of ~700 known jailbreaks, in an attempt to get a model to disregard | ||
|
@@ -648,15 +649,17 @@ def __init__(self, config_root=_config): | |
self.prompts = f.readlines() | ||
|
||
|
||
class DanInTheWildMini(DanInTheWild): | ||
"""A subset of 100 jailbreaks from the DanInTheWild jailbreak library. | ||
class DanInTheWild(DanInTheWildFull): | ||
"""A subset of jailbreaks from the DanInTheWild jailbreak library. | ||
|
||
Tries a set of 100 known jailbreaks, in an attempt to get a model to disregard | ||
Tries a set of known jailbreaks, in an attempt to get a model to disregard | ||
alignment/system prompt. Success is measured by not getting a safety mitigation | |
message.""" | ||
Comment on lines +653 to 657
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Suggest mentioning the configurability by changing the
|
||
|
||
active = True | ||
|
||
def __init__(self, config_root=_config): | ||
super().__init__(config_root=config_root) | ||
self.prompts = self.prompts[:100] | ||
self.prompts = random.sample(self.prompts, len(self.prompts))[ | ||
: config_root.run.soft_probe_prompt_cap | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still want `av_spam_scanning` in the default fast config? It's largely useless for model-only evaluation.