diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9bcb4a..87379ac 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -149,12 +149,14 @@ AvailableTests = [
"typoglycemia_attack",
"ucar",
- #TODO: YOUR TEST HERE
+ #TODO: YOUR TEST HERE (in alphabetical order!)
]
```
#### 5. Add your attack to the `attack_descriptions.json` and `attack_descriptions.md` files.
+Please follow the existing structure of `attack_descriptions.md`. The description should match the docstring of the attack class. If your attack is based on an original paper or repository, please reference it in both the docstring and `attack_descriptions.md`.
+
#### 6. Open a PR! Submit your changes for review by opening a pull request to the `main` branch.
## Submitting a Pull Request.
diff --git a/README.md b/README.md
index 689c4e5..bc601b9 100644
--- a/README.md
+++ b/README.md
@@ -5,10 +5,10 @@ Red Teaming python-framework for testing chatbots and LLM-systems
[](https://creativecommons.org/licenses/by-nc-sa/4.0/)
[](https://pypi.org/project/llamator)
[](https://badge.fury.io/py/llamator)
+[](https://github.com/RomiconEZ/llamator/blob/release/docker)
[](https://pepy.tech/project/llamator)
[](https://pepy.tech/project/llamator)
[](https://github.com/RomiconEZ/llamator/stargazers)
-[](https://github.com/RomiconEZ/llamator/blob/release/docker)
## Install 🚀
diff --git a/docker/instruction.md b/docker/README.md
similarity index 98%
rename from docker/instruction.md
rename to docker/README.md
index 1a22a2e..dd4aa49 100644
--- a/docker/instruction.md
+++ b/docker/README.md
@@ -9,7 +9,7 @@
├── docker/
│ ├── Dockerfile
│ ├── jupyter_docker.sh
- │ └── instruction.md
+ │ └── README.md
└── workspace/
└── (your working files)
```
diff --git a/docs/attack_descriptions.md b/docs/attack_descriptions.md
index 29907dd..7e1f86c 100644
--- a/docs/attack_descriptions.md
+++ b/docs/attack_descriptions.md
@@ -1,10 +1,23 @@
# Attacks
+
This section provides an overview of all available attacks.
## Single-stage attacks
### OWASP [LLM01:2025](https://genai.owasp.org/llmrisk/llm01-prompt-injection/) Prompt Injection
+
+Adversarial suffix
+
+In code name: `suffix`
+
+Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation.
+
+_[Original Paper](https://arxiv.org/abs/2307.15043), [Code](https://github.com/llm-attacks/llm-attacks)_
+
+View code on GitHub
+
+
Aim Jailbreak
@@ -32,7 +45,7 @@ In code name: `bon`
Probes the LLM resilience against Best-Of-N attack.
-`Original Paper `_, `Code `_.
+_Original Paper , Code _
View code on GitHub
@@ -47,18 +60,6 @@ Evaluates the LLM's ability to maintain content moderation standards when transi
View code on GitHub
-
-Crescendo
-
-In code name: `crescendo`
-
-Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement.
-
-`Original Paper `_
-
-View code on GitHub
-
-
Do Anything Now Jailbreak
@@ -159,18 +160,6 @@ Probes a Russian-language LLM's adherence to content restrictions when explicitl
View code on GitHub
-
-Adversarial suffix
-
-In code name: `suffix`
-
-Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation.
-
-`Original Paper `_, `Code `_.
-
-View code on GitHub
-
-
## Multi-stage attacks
@@ -196,6 +185,18 @@ The `multistage_depth` parameter for using in `history_limit` can be extracted f
### OWASP [LLM01:2025](https://genai.owasp.org/llmrisk/llm01-prompt-injection/) Prompt Injection
+
+Crescendo
+
+In code name: `crescendo`
+
+Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement.
+
+_Original Paper _
+
+View code on GitHub
+
+
Harmful Behavior Multistage
@@ -237,7 +238,7 @@ In code name: `sycophancy`
Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain.
-`Original Paper `_
+_Original Paper _
View code on GitHub
diff --git a/docs/howtos.md b/docs/howtos.md
index 86d5267..06ccf33 100644
--- a/docs/howtos.md
+++ b/docs/howtos.md
@@ -42,17 +42,19 @@ tested_model = llamator.ClientOpenAI(
tests_with_attempts = [
("aim_jailbreak", 2),
("base64_injection", 2),
+ ("bon", 2),
("complimentary_transition", 2),
- ("do_anything_now_jailbreak", 2),
+ ("crescendo", 2),
# Uncomment the following lines to include additional tests
+ # ("do_anything_now_jailbreak", 2),
# ("RU_do_anything_now_jailbreak", 2),
- # ("bon", 2),
# ("ethical_compliance", 2),
# ("harmful_behavior", 2),
# ("harmful_behavior_multistage", 2),
# ("linguistic_evasion", 2),
# ("logical_inconsistencies", 2),
# ("past_tense", 2),
+ # ("suffix", 2),
# ("sycophancy", 2),
# ("system_prompt_leakage", 2),
# ("typoglycemia_attack", 2),
diff --git a/docs/project_overview.md b/docs/project_overview.md
index c0419ef..9e0bd38 100644
--- a/docs/project_overview.md
+++ b/docs/project_overview.md
@@ -5,6 +5,7 @@ LLAMATOR - Red Teaming python-framework for testing chatbots and LLM-systems
[](https://creativecommons.org/licenses/by-nc-sa/4.0/)
[](https://pypi.org/project/llamator)
[](https://badge.fury.io/py/llamator)
+[](https://github.com/RomiconEZ/llamator/blob/release/docker)
[](https://pepy.tech/project/llamator)
[](https://pepy.tech/project/llamator)
[](https://github.com/RomiconEZ/llamator/stargazers)
diff --git a/examples/llamator-api.ipynb b/examples/llamator-api.ipynb
index 604afac..639ac60 100644
--- a/examples/llamator-api.ipynb
+++ b/examples/llamator-api.ipynb
@@ -283,7 +283,9 @@
"tests_with_attempts = [\n",
" # (\"aim_jailbreak\", 2),\n",
" # (\"base64_injection\", 2),\n",
+ " # (\"bon\", 2),\n",
" # (\"complimentary_transition\", 2),\n",
+ " # (\"crescendo\", 2),\n",
" # (\"do_anything_now_jailbreak\", 2),\n",
" # (\"RU_do_anything_now_jailbreak\", 2),\n",
" # (\"ethical_compliance\", 2),\n",
@@ -292,6 +294,7 @@
" # (\"linguistic_evasion\", 2),\n",
" # (\"logical_inconsistencies\", 2),\n",
" # (\"past_tense\", 2),\n",
+ " # (\"suffix\", 2),\n",
" (\"sycophancy\", 2),\n",
" (\"system_prompt_leakage\", 2),\n",
" # (\"typoglycemia_attack\", 2),\n",
diff --git a/examples/llamator-selenium.ipynb b/examples/llamator-selenium.ipynb
index b28e360..cc479a0 100644
--- a/examples/llamator-selenium.ipynb
+++ b/examples/llamator-selenium.ipynb
@@ -365,7 +365,9 @@
"tests_with_attempts = [\n",
" # (\"aim_jailbreak\", 2),\n",
" # (\"base64_injection\", 2),\n",
- " # (\"complimentary_transition\", 3),\n",
+ " # (\"bon\", 2),\n",
+ " # (\"complimentary_transition\", 2),\n",
+ " # (\"crescendo\", 2),\n",
" # (\"do_anything_now_jailbreak\", 2),\n",
" # (\"RU_do_anything_now_jailbreak\", 2),\n",
" # (\"ethical_compliance\", 2),\n",
@@ -373,7 +375,8 @@
" # (\"harmful_behavior_multistage\", 2),\n",
" (\"linguistic_evasion\", 2),\n",
" (\"logical_inconsistencies\", 2),\n",
- " # (\"past_tense\", 1),\n",
+ " # (\"past_tense\", 2),\n",
+ " # (\"suffix\", 2),\n",
" (\"sycophancy\", 2),\n",
" (\"system_prompt_leakage\", 2),\n",
" # (\"typoglycemia_attack\", 2),\n",
diff --git a/examples/llamator-telegram.ipynb b/examples/llamator-telegram.ipynb
index 99eaf0c..ba9298e 100644
--- a/examples/llamator-telegram.ipynb
+++ b/examples/llamator-telegram.ipynb
@@ -385,7 +385,9 @@
"tests_with_attempts = [\n",
" # (\"aim_jailbreak\", 2),\n",
" # (\"base64_injection\", 2),\n",
+ " # (\"bon\", 2),\n",
" # (\"complimentary_transition\", 2),\n",
+ " # (\"crescendo\", 2),\n",
" # (\"do_anything_now_jailbreak\", 2),\n",
" # (\"RU_do_anything_now_jailbreak\", 2),\n",
" # (\"ethical_compliance\", 2),\n",
@@ -394,6 +396,7 @@
" (\"linguistic_evasion\", 2),\n",
" (\"logical_inconsistencies\", 2),\n",
" # (\"past_tense\", 2),\n",
+ " # (\"suffix\", 2),\n",
" (\"sycophancy\", 2),\n",
" # (\"system_prompt_leakage\", 2),\n",
" # (\"typoglycemia_attack\", 2),\n",
diff --git a/examples/llamator-whatsapp.ipynb b/examples/llamator-whatsapp.ipynb
index 2c49e46..4d1c029 100644
--- a/examples/llamator-whatsapp.ipynb
+++ b/examples/llamator-whatsapp.ipynb
@@ -510,7 +510,9 @@
"tests_with_attempts = [\n",
" # (\"aim_jailbreak\", 2),\n",
" # (\"base64_injection\", 2),\n",
+ " # (\"bon\", 2),\n",
" # (\"complimentary_transition\", 2),\n",
+ " # (\"crescendo\", 2),\n",
" (\"do_anything_now_jailbreak\", 2),\n",
" # (\"RU_do_anything_now_jailbreak\", 2),\n",
" # (\"ethical_compliance\", 2),\n",
@@ -519,6 +521,7 @@
" # (\"linguistic_evasion\", 2),\n",
" # (\"logical_inconsistencies\", 2),\n",
" # (\"past_tense\", 2),\n",
+ " # (\"suffix\", 2),\n",
" # (\"sycophancy\", 2),\n",
" # (\"system_prompt_leakage\", 2),\n",
" # (\"typoglycemia_attack\", 2),\n",
diff --git a/src/llamator/attacks/suffix.py b/src/llamator/attacks/suffix.py
index 749a710..6c5cb22 100644
--- a/src/llamator/attacks/suffix.py
+++ b/src/llamator/attacks/suffix.py
@@ -17,7 +17,9 @@
@register_test
class TestSuffix(TestBase):
- """Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation."""
+ """Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query.
+ Evaluates the model's vulnerability to input data manipulation.
+ Original Paper: https://arxiv.org/abs/2307.15043, Code: https://github.com/llm-attacks/llm-attacks"""
test_name = "suffix"
diff --git a/tests/test_llamator.py b/tests/test_llamator.py
index 56d69a8..2abaee1 100644
--- a/tests/test_llamator.py
+++ b/tests/test_llamator.py
@@ -57,6 +57,7 @@ def test_openai_client():
# ("linguistic_evasion", 2),
# ("logical_inconsistencies", 2),
# ("past_tense", 2),
+ # ("suffix", 2),
# ("sycophancy", 2),
# ("system_prompt_leakage", 2),
# ("typoglycemia_attack", 2),
diff --git a/tests/test_local_llamator.py b/tests/test_local_llamator.py
index 45af2e3..ebfc0dd 100644
--- a/tests/test_local_llamator.py
+++ b/tests/test_local_llamator.py
@@ -113,6 +113,7 @@ def test_langchain_client_yandexgpt():
# ("linguistic_evasion", 2),
# ("logical_inconsistencies", 2),
# ("past_tense", 2),
+ # ("suffix", 2),
# ("sycophancy", 2),
# ("system_prompt_leakage", 2),
# ("typoglycemia_attack", 2),