
Feat/improved categories #37

Merged on Jul 14, 2024 (19 commits). The diff below shows the changes from a single commit.

Commits (19):
b3aee47  feat: first category draft (KaiserRuben, Jun 27, 2024)
3c68f11  Enum Renaming (vulnerability-reporting): Replaced enums to provide mo… (KaiserRuben, Jun 28, 2024)
f7b0796  refactor: added todos and newlines (KaiserRuben, Jun 28, 2024)
0c01fa4  Merge branch 'refs/heads/main' into feat/improved_categories (KaiserRuben, Jul 4, 2024)
034d9bb  feat: category class with str and to_dict (KaiserRuben, Jul 4, 2024)
8ac897a  feat: adapted class for new category (scheme), removed unused Finding… (KaiserRuben, Jul 4, 2024)
3590442  feat(LLM service): classify_kind now takes field_name and options par… (KaiserRuben, Jul 4, 2024)
d59d97f  feat(findings): Improved classification & recommendation logic: Added… (KaiserRuben, Jul 4, 2024)
9f77adc  feat: Added {field_name} placeholder to classify templates for more f… (KaiserRuben, Jul 4, 2024)
59b0eb0  feat(Categories.py in src/data/): Enhance categorization with new sec… (KaiserRuben, Jul 13, 2024)
687d07c  Merge branch 'refs/heads/main' into feat/improved_categories (KaiserRuben, Jul 13, 2024)
f36cef1  fix(Environment): Removed unnecessary Ω symbol (KaiserRuben, Jul 13, 2024)
fa65c97  feat(Finding): Introducing support for unsupervised clustering (KaiserRuben, Jul 13, 2024)
4caed98  feat(unsupervised clustering): Added to notebooks (KaiserRuben, Jul 13, 2024)
dda0835  fix: removed unused import (KaiserRuben, Jul 14, 2024)
75e5c32  feat: Added 'category' field to Finding model (KaiserRuben, Jul 14, 2024)
f40fbce  feat: Added sentence_transformers, kneed, and scikit-learn to enable … (KaiserRuben, Jul 14, 2024)
5b0effd  feat: Added plotly to requirements.txt (KaiserRuben, Jul 14, 2024)
fb373f9  feat: Added pandas to requirements.txt (KaiserRuben, Jul 14, 2024)
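Commit f40fbce adds sentence_transformers, kneed, and scikit-learn to the requirements to support the unsupervised clustering work. The knee-point heuristic that kneed implements (picking the number of clusters where a KMeans inertia curve flattens out) can be sketched in plain numpy. Everything below, including the function name and the sample curve, is illustrative and not code from this PR:

```python
import numpy as np

def find_knee(ks, inertias):
    """Return the k at the 'elbow' of an inertia curve: the point with the
    largest distance to the chord joining the curve's endpoints (the same
    geometric heuristic kneed's KneeLocator builds on)."""
    ks = np.asarray(ks, dtype=float)
    inertias = np.asarray(inertias, dtype=float)
    # normalise both axes to [0, 1] so the distance is scale-independent
    x = (ks - ks.min()) / (ks.max() - ks.min())
    y = (inertias - inertias.min()) / (inertias.max() - inertias.min())
    # perpendicular distance from each point to the first-to-last chord
    dx, dy = x[-1] - x[0], y[-1] - y[0]
    dist = np.abs(dy * (x - x[0]) - dx * (y - y[0])) / np.hypot(dx, dy)
    return int(ks[int(np.argmax(dist))])

# an elbow-shaped curve: inertia drops steeply until k=4, then flattens
ks = [2, 3, 4, 5, 6, 7, 8]
inertias = [100.0, 60.0, 30.0, 28.0, 26.5, 25.5, 25.0]
print(find_knee(ks, inertias))  # prints 4
```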
Commit 4caed98576181cd1831eb9535cc9735fd9fdb0ca
feat(unsupervised clustering): Added to notebooks
KaiserRuben (Ruben Kaiser) committed Jul 13, 2024
Verified: signed with the committer's verified signature; the key has since expired.
23 changes: 20 additions & 3 deletions src/notebooks/Development/Anthropic_tester.ipynb
@@ -34,11 +34,11 @@
"from ai.LLM.LLMServiceStrategy import LLMServiceStrategy \n",
"\n",
"# We need the json data, so we are loading it from the file\n",
-"with open(data_folder / Path('Dataset.json')) as f:\n",
+"with open(data_folder / Path('Dataset_Juicebox.json')) as f:\n",
" data = json.load(f)\n",
" \n",
-"from ai.LLM.Stretegies.AnthropicService import AnthropicService\n",
-"from ai.LLM.Stretegies.OLLAMAService import OLLAMAService\n",
+"from ai.LLM.Strategies.AnthropicService import AnthropicService\n",
+"from ai.LLM.Strategies.OLLAMAService import OLLAMAService\n",
"\n",
"my_strategy = AnthropicService()\n",
"llm_service = LLMServiceStrategy(my_strategy)\n",
@@ -89,6 +89,23 @@
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Adding unsupervised categories",
"id": "6492a80e60bede59"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"_, Clusterer = vulnerability_report.add_unsupervised_category()\n",
"Clusterer.get_cluster_graph()"
],
"id": "2f18a779a759a5e5",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
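The cell added across these notebooks calls `vulnerability_report.add_unsupervised_category()`. A hypothetical sketch of what that flow could look like, using scikit-learn (which this PR adds to the requirements), with TF-IDF vectors standing in for the sentence-transformer embeddings the PR actually pulls in; all names and sample data here are illustrative, not code from the repository:

```python
# Sketch: embed each finding's text, cluster the embeddings, and treat the
# cluster id as an unsupervised category. The PR itself uses
# sentence_transformers for embeddings and kneed to choose the cluster count;
# TF-IDF and a fixed k=2 keep this example lightweight.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

findings = [
    "SQL injection in login form",
    "SQL injection in search endpoint",
    "Outdated TLS configuration on load balancer",
    "Weak TLS cipher suites enabled",
]

vectors = TfidfVectorizer().fit_transform(findings)
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(vectors)

for text, label in zip(findings, labels):
    print(f"cluster {label}: {text}")
```

The two injection findings share vocabulary and land in one cluster, the two TLS findings in the other; `Clusterer.get_cluster_graph()` in the notebooks presumably visualises such groupings (the PR adds plotly as well).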
27 changes: 26 additions & 1 deletion src/notebooks/Development/OLLAMA_tester.ipynb
@@ -34,7 +34,7 @@
"from ai.LLM.LLMServiceStrategy import LLMServiceStrategy \n",
"\n",
"# We need the json data, so we are loading it from the file\n",
-"with open(data_folder / Path('Dataset.json')) as f:\n",
+"with open(data_folder / Path('Dataset_Juicebox.json')) as f:\n",
" data = json.load(f)\n",
"from ai.LLM.Strategies.OLLAMAService import OLLAMAService\n",
"my_strategy_local = OLLAMAService()\n",
@@ -84,12 +84,37 @@
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Adding unsupervised categories",
"id": "a7d6995811fd279c"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"_, Clusterer = vulnerability_report.add_unsupervised_category()\n",
"Clusterer.get_cluster_graph()"
],
"id": "ef521dcc7bc50b35"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Evaluate results",
"id": "a6b50f27d4d3c240"
},
{
"metadata": {},
"cell_type": "code",
"source": "vulnerability_report.to_dict()",
"id": "1683ffa9778eb37f",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
17 changes: 17 additions & 0 deletions src/notebooks/Development/OpenAI_tester.ipynb
@@ -89,6 +89,23 @@
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Adding unsupervised categories",
"id": "f67ba404abafd572"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"_, Clusterer = vulnerability_report.add_unsupervised_category()\n",
"Clusterer.get_cluster_graph()"
],
"id": "deb771a27f68aed9"
},
{
"metadata": {},
"cell_type": "markdown",
153 changes: 128 additions & 25 deletions src/notebooks/getting_started.ipynb
@@ -4,21 +4,27 @@
"cell_type": "code",
"id": "3027aad445a8ca34",
"metadata": {
"jupyter": {
"is_executing": true
"ExecuteTime": {
"end_time": "2024-07-04T21:21:47.564156Z",
"start_time": "2024-07-04T21:21:47.542742Z"
}
},
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
],
"outputs": [],
"execution_count": null
"execution_count": 1
},
{
"cell_type": "code",
"id": "857d764bfbe99f68",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:47.801729Z",
"start_time": "2024-07-04T21:21:47.722357Z"
}
},
"source": [
"import json\n",
"import logging\n",
@@ -30,22 +36,40 @@
"from ai.LLM.LLMServiceStrategy import LLMServiceStrategy "
],
"outputs": [],
"execution_count": null
"execution_count": 2
},
{
"cell_type": "code",
"id": "f490c5d5b210be2b",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:48.170910Z",
"start_time": "2024-07-04T21:21:47.944920Z"
}
},
"source": [
"from pathlib import Path\n",
"\n",
"data_folder = Path('../../data/') # Please change to the folder, Dataset.json is located in\n",
"# We need the json data, so we are loading it from the file\n",
"with open(data_folder / Path('Dataset.json')) as f:\n",
"with open(data_folder / Path('Dataset_Juicebox.json')) as f:\n",
" data = json.load(f)"
],
"outputs": [],
"execution_count": null
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '../../data/Dataset.json'",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mFileNotFoundError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[3], line 5\u001B[0m\n\u001B[1;32m 3\u001B[0m data_folder \u001B[38;5;241m=\u001B[39m Path(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m../../data/\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;66;03m# Please change to the folder, Dataset.json is located in\u001B[39;00m\n\u001B[1;32m 4\u001B[0m \u001B[38;5;66;03m# We need the json data, so we are loading it from the file\u001B[39;00m\n\u001B[0;32m----> 5\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mdata_folder\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m/\u001B[39;49m\u001B[43m \u001B[49m\u001B[43mPath\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mDataset.json\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mas\u001B[39;00m f:\n\u001B[1;32m 6\u001B[0m data \u001B[38;5;241m=\u001B[39m json\u001B[38;5;241m.\u001B[39mload(f)\n",
"File \u001B[0;32m~/Desktop/Uni/SS24/Praktikum/Security-Findings-Recommender-System/src/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py:310\u001B[0m, in \u001B[0;36m_modified_open\u001B[0;34m(file, *args, **kwargs)\u001B[0m\n\u001B[1;32m 303\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m file \u001B[38;5;129;01min\u001B[39;00m {\u001B[38;5;241m0\u001B[39m, \u001B[38;5;241m1\u001B[39m, \u001B[38;5;241m2\u001B[39m}:\n\u001B[1;32m 304\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 305\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mIPython won\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mt let you open fd=\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfile\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m by default \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 306\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 307\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124myou can use builtins\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m open.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 308\u001B[0m )\n\u001B[0;32m--> 310\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mio_open\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfile\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[0;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: '../../data/Dataset.json'"
]
}
],
"execution_count": 3
},
{
"metadata": {},
@@ -69,7 +93,12 @@
"id": "cbb43a8273ab2e09"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:49.449238Z",
"start_time": "2024-07-04T21:21:49.228925Z"
}
},
"cell_type": "code",
"source": [
"from ai.LLM.Strategies.OpenAIService import OpenAIService\n",
@@ -78,7 +107,7 @@
],
"id": "a17be348b6b7294a",
"outputs": [],
"execution_count": null
"execution_count": 4
},
{
"metadata": {},
@@ -92,7 +121,12 @@
"id": "33ce7bb1fd20e336"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:50.558588Z",
"start_time": "2024-07-04T21:21:50.538901Z"
}
},
"cell_type": "code",
"source": [
"from ai.LLM.Strategies.OpenAIService import OpenAIService\n",
@@ -101,7 +135,7 @@
],
"id": "901c5824178df8bb",
"outputs": [],
"execution_count": null
"execution_count": 5
},
{
"metadata": {},
@@ -115,7 +149,12 @@
"id": "3357a17a826c90f7"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:52.159152Z",
"start_time": "2024-07-04T21:21:51.469910Z"
}
},
"cell_type": "code",
"source": [
"from ai.LLM.Strategies.OLLAMAService import OLLAMAService # this is the service that uses ollama to generate solution\n",
@@ -124,8 +163,16 @@
"my_strategy = OLLAMAService(model_name=model_name)"
],
"id": "845858d36a9b448d",
"outputs": [],
"execution_count": null
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model llama3:instruct\n"
]
}
],
"execution_count": 6
},
{
"metadata": {},
@@ -137,12 +184,17 @@
"id": "b70749b293caeb08"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:53.352668Z",
"start_time": "2024-07-04T21:21:53.332426Z"
}
},
"cell_type": "code",
"source": "llm_service = LLMServiceStrategy(my_strategy)",
"id": "9750ed3f60f2a320",
"outputs": [],
"execution_count": null
"execution_count": 7
},
{
"metadata": {},
@@ -157,14 +209,31 @@
{
"cell_type": "code",
"id": "926f7b0460c0f790",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:54.584013Z",
"start_time": "2024-07-04T21:21:54.531852Z"
}
},
"source": [
-"n=5 # number of findings to import\n",
+"n=-1 # number of findings to import\n",
"vulnerability_report = create_from_flama_json(data, n = n, llm_service=llm_service) # here, we create the VulnerabilityReport object, consisting of a list of Finding objects which each have Solution objects. We pass the llm_service to the VulnerabilityReport object, but this can also just be omitted, in which case it will be created in each Finding object.\n",
"vulnerability_report.sort() # this will sort the findings by severity"
],
"outputs": [],
"execution_count": null
"outputs": [
{
"ename": "NameError",
"evalue": "name 'data' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[8], line 2\u001B[0m\n\u001B[1;32m 1\u001B[0m n\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m \u001B[38;5;66;03m# number of findings to import\u001B[39;00m\n\u001B[0;32m----> 2\u001B[0m vulnerability_report \u001B[38;5;241m=\u001B[39m create_from_flama_json(\u001B[43mdata\u001B[49m, n \u001B[38;5;241m=\u001B[39m n, llm_service\u001B[38;5;241m=\u001B[39mllm_service) \u001B[38;5;66;03m# here, we create the VulnerabilityReport object, consisting of a list of Finding objects which each have Solution objects. We pass the llm_service to the VulnerabilityReport object, but this can also just be omitted, in which case it will be created in each Finding object.\u001B[39;00m\n\u001B[1;32m 3\u001B[0m vulnerability_report\u001B[38;5;241m.\u001B[39msort() \u001B[38;5;66;03m# this will sort the findings by severity\u001B[39;00m\n",
"\u001B[0;31mNameError\u001B[0m: name 'data' is not defined"
]
}
],
"execution_count": 8
},
{
"cell_type": "markdown",
@@ -184,12 +253,29 @@
{
"cell_type": "code",
"id": "14edeb4a26eeab5e",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-04T21:21:57.189559Z",
"start_time": "2024-07-04T21:21:57.159896Z"
}
},
"source": [
"vulnerability_report.add_category() # this will add categories to the findings"
],
"outputs": [],
"execution_count": null
"outputs": [
{
"ename": "NameError",
"evalue": "name 'vulnerability_report' is not defined",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[9], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mvulnerability_report\u001B[49m\u001B[38;5;241m.\u001B[39madd_category() \u001B[38;5;66;03m# this will add categories to the findings\u001B[39;00m\n",
"\u001B[0;31mNameError\u001B[0m: name 'vulnerability_report' is not defined"
]
}
],
"execution_count": 9
},
{
"cell_type": "markdown",
@@ -212,6 +298,23 @@
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Adding unsupervised categories",
"id": "229f80ff75352254"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"_, Clusterer = vulnerability_report.add_unsupervised_category()\n",
"Clusterer.get_cluster_graph()"
],
"id": "f813be464bed8a04"
},
{
"metadata": {},
"cell_type": "markdown",