Commit: finish code

MortenBlorstad committed Nov 11, 2021
1 parent f536bdb commit be0b038
Showing 63 changed files with 11,627 additions and 2,365 deletions.
601 changes: 340 additions & 261 deletions .ipynb_checkpoints/Bert-checkpoint.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .ipynb_checkpoints/GPT2Tuner-checkpoint.py
@@ -107,7 +107,7 @@ def __clean_data(self, data_path: str):
#GPT2 takes only 1024 tokens, so we limit the text to 1021
for text, label in zip(df.iloc[:,0], df.iloc[:,1]):
label = str(label)
sequences.append(str(label) + self.bos + ' '.join(str(text).split()[:1021]) + self.eos)
sequences.append(str(label) + self.bos + text + self.eos)
labels.append(label)

#Clean the sequences
601 changes: 340 additions & 261 deletions Bert.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion GPT2Tuner.py
@@ -107,7 +107,7 @@ def __clean_data(self, data_path: str):
#GPT2 takes only 1024 tokens, so we limit the text to 1021
for text, label in zip(df.iloc[:,0], df.iloc[:,1]):
label = str(label)
sequences.append(str(label) + self.bos + ' '.join(str(text).split()[:1021]) + self.eos)
sequences.append(str(label) + self.bos + text + self.eos)
labels.append(label)

#Clean the sequences
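For context on the hunk above: GPT-2's context window is 1024 tokens, and capping the text at 1021 whitespace-separated words leaves room for the label and the BOS/EOS markers. A minimal sketch of that idea follows, with placeholder special-token strings as assumptions (GPT2Tuner defines its own self.bos and self.eos); note that whitespace words only approximate GPT-2's BPE token count, so the cap is a heuristic rather than a hard guarantee.

# Minimal sketch of the truncation in this hunk; the token strings are
# assumptions, since GPT2Tuner defines its own self.bos and self.eos.
bos = "<|startoftext|>"
eos = "<|endoftext|>"

def build_sequence(label, text, limit=1021):
    # Cap the text at `limit` whitespace-separated words so that
    # label + bos + text + eos stays near GPT-2's 1024-token window.
    truncated = ' '.join(str(text).split()[:limit])
    return str(label) + bos + truncated + eos

print(build_sequence("1", "an example review " * 600)[:80])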
20 changes: 18 additions & 2 deletions GanBert.py
@@ -12,13 +12,18 @@
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd


# Source
#------------------------------
# The Generator as in
# https://www.aclweb.org/anthology/2020.acl-main.191/
# https://github.com/crux82/ganbert
#------------------------------

class Generator(nn.Module):
'''
Generator model that produces “fake” examples resembling the data distribution. Used together with the discriminator
when training the GAN-BERT model.
'''
def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
super(Generator, self).__init__()
layers = []
@@ -39,6 +44,9 @@ def forward(self, noise):
# https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
'''
The BERT model is used as the discriminator. After training, this model is used for predictions.
'''
def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
super(Discriminator, self).__init__()
self.input_dropout = nn.Dropout(p=dropout_rate)
@@ -58,8 +66,16 @@ def forward(self, input_rep):
probs = self.softmax(logits)
return last_rep, logits, probs


#------------------------------
# GANBERT Model
# https://github.com/crux82/ganbert
#------------------------------
class GanBert():
'''
The GAN-BERT model uses a generator to produce “fake” examples resembling the data distribution and a discriminator to
distinguish the generator's samples from the real instances.
The BERT model is used as the discriminator.
'''
def __init__(self,batch_size=64,max_seq_length = 64,num_hidden_layers_g = 1,
num_hidden_layers_d =1,noise_size = 100,out_dropout_rate = 0.2,
apply_balance = True,learning_rate_discriminator = 5e-5,learning_rate_generator = 5e-5,
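For readers skimming the collapsed diff: a minimal sketch, under stated assumptions, of how the two modules documented above fit together. The real wiring lives inside GanBert's training loop, which this page truncates; real_rep below is a stand-in for BERT's encoded output, not the actual model.

import torch

# Illustration only: wire up the Generator/Discriminator pair from this commit.
gen = Generator(noise_size=100, output_size=512)
dis = Discriminator(input_size=512, hidden_sizes=[512], num_labels=2)

batch_size = 8
noise = torch.rand(batch_size, 100)          # random noise fed to the generator
fake_rep = gen(noise)                        # "fake" BERT-like representations
real_rep = torch.rand(batch_size, 512)       # stand-in for real BERT encodings
_, fake_logits, fake_probs = dis(fake_rep)   # discriminator scores the fake batch
_, real_logits, real_probs = dis(real_rep)   # ... and the real batch

Per the GAN-BERT paper linked above, the discriminator is then trained to separate generated samples from real ones while the generator is trained to fool it.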
126 changes: 126 additions & 0 deletions Notebooks/.ipynb_checkpoints/results-checkpoint.ipynb

Large diffs are not rendered by default.

159 changes: 111 additions & 48 deletions Notebooks/Bert.ipynb
@@ -1,5 +1,12 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Install dependencies "
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -15,11 +22,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[31mERROR: Could not find a version that satisfies the requirement official.nlp (from versions: none)\u001b[0m\n",
"\u001b[31mERROR: No matching distribution found for official.nlp\u001b[0m\n",
"Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (1.19.5)\n",
"Requirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (1.3.2)\n",
"Requirement already satisfied: numpy>=1.17.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (1.19.5)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2021.1)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n"
]
}
@@ -30,10 +39,25 @@
"# A dependency of the preprocessing for BERT inputs\n",
"!pip install -q -U tensorflow-text\n",
"!pip install -q tf-models-official\n",
"\n",
"!pip install -q tensorflow\n",
"!pip install -q tensorflow_hub\n",
"\n",
"!pip install -q official.nlp\n",
"\n",
"\n",
"!pip install -q matplotlib\n",
"!pip install numpy\n",
"!pip install pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
@@ -52,6 +76,22 @@
"from Bert import Bert\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set-up\n",
"\n",
"Set gloabal variable for changing dataset.\n",
"\n",
"Set data_name to the name of your dataset. This needs to correspond to a folder in /data/, which should be generated by the generate_data.ipynb notebook. num_classes manually needs to be set to the number of classes in your dataset.\n",
"\n",
"- data_name: \"imdb\" or \"medical\"\n",
"- num_classes:\n",
" - imdb: 2\n",
" - medical: 5"
]
},
{
"cell_type": "code",
"execution_count": 3,
@@ -62,14 +102,23 @@
"source": [
"## gloabal variable for changing dataset.\n",
"## data_name possible values: \"imdb\", \"medical\"\n",
"data_name = \"imdb\"\n",
"data_name = \"medical\"\n",
"## num_classes possible values: \"imdb\"=2, \"medical\"=5\n",
"num_classes = 2"
"num_classes = 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get data paths\n",
"- Get the file path for training data sets, 5, 10, 25, and 50 per label\n",
"- Get the path for the test set"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -82,9 +131,16 @@
"test_path = data_path+\"/test.csv\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set hyper-parameters for Bert"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"id": "IgGLg_kOxhg3"
},
@@ -100,9 +156,16 @@
"results=pd.DataFrame(columns=[\"n_per_class\", \"accuracy\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Traning and evalueate the Bert classifier"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -123,68 +186,68 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-11-11 16:06:22.949889: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10800 MB memory: -> device: 0, name: Tesla K80, pci bus id: 0000:06:00.0, compute capability: 3.7\n",
"2021-11-11 16:06:26.100253: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
"2021-11-11 19:36:54.988275: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10800 MB memory: -> device: 0, name: Tesla K80, pci bus id: 0000:06:00.0, compute capability: 3.7\n",
"2021-11-11 19:36:58.084455: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data/imdb/train_labeled_5.csv\n",
"data/medical/train_labeled_5.csv\n",
"Epoch 1/5\n",
"3/3 [==============================] - 21s 261ms/step - loss: 0.8798 - accuracy: 0.5000\n",
"7/7 [==============================] - 22s 269ms/step - loss: 1.5570 - accuracy: 0.3600\n",
"Epoch 2/5\n",
"3/3 [==============================] - 1s 242ms/step - loss: 0.8023 - accuracy: 0.6000\n",
"7/7 [==============================] - 2s 262ms/step - loss: 1.0369 - accuracy: 0.6000\n",
"Epoch 3/5\n",
"3/3 [==============================] - 1s 242ms/step - loss: 0.8143 - accuracy: 0.5000\n",
"7/7 [==============================] - 2s 265ms/step - loss: 1.1334 - accuracy: 0.6000\n",
"Epoch 4/5\n",
"3/3 [==============================] - 1s 242ms/step - loss: 0.3634 - accuracy: 0.9000\n",
"7/7 [==============================] - 2s 262ms/step - loss: 0.9176 - accuracy: 0.7600\n",
"Epoch 5/5\n",
"3/3 [==============================] - 1s 243ms/step - loss: 0.4016 - accuracy: 0.9000\n",
"500/500 [==============================] - 18s 34ms/step - loss: 0.7131 - accuracy: 0.5780\n",
"7/7 [==============================] - 2s 261ms/step - loss: 0.8068 - accuracy: 0.7200\n",
"500/500 [==============================] - 18s 33ms/step - loss: 0.9731 - accuracy: 0.6000\n",
"BERT model selected : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3\n",
"Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3\n",
"data/imdb/train_labeled_10.csv\n",
"data/medical/train_labeled_10.csv\n",
"Epoch 1/5\n",
"5/5 [==============================] - 21s 291ms/step - loss: 0.6899 - accuracy: 0.6500\n",
"13/13 [==============================] - 23s 278ms/step - loss: 2.0240 - accuracy: 0.1000\n",
"Epoch 2/5\n",
"5/5 [==============================] - 1s 280ms/step - loss: 0.8311 - accuracy: 0.5000\n",
"13/13 [==============================] - 4s 272ms/step - loss: 1.4455 - accuracy: 0.4000\n",
"Epoch 3/5\n",
"5/5 [==============================] - 1s 279ms/step - loss: 0.4144 - accuracy: 0.7500\n",
"13/13 [==============================] - 4s 273ms/step - loss: 1.1934 - accuracy: 0.4600\n",
"Epoch 4/5\n",
"5/5 [==============================] - 1s 279ms/step - loss: 0.2541 - accuracy: 0.9500\n",
"13/13 [==============================] - 4s 275ms/step - loss: 0.8327 - accuracy: 0.7600\n",
"Epoch 5/5\n",
"5/5 [==============================] - 1s 282ms/step - loss: 0.2200 - accuracy: 1.0000\n",
"500/500 [==============================] - 18s 33ms/step - loss: 0.6528 - accuracy: 0.6360\n",
"13/13 [==============================] - 4s 274ms/step - loss: 0.6555 - accuracy: 0.8600\n",
"500/500 [==============================] - 18s 33ms/step - loss: 1.7167 - accuracy: 0.2740\n",
"BERT model selected : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3\n",
"Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3\n",
"data/imdb/train_labeled_25.csv\n",
"data/medical/train_labeled_25.csv\n",
"Epoch 1/5\n",
"13/13 [==============================] - 24s 276ms/step - loss: 0.7634 - accuracy: 0.6200\n",
"32/32 [==============================] - 28s 278ms/step - loss: 1.2559 - accuracy: 0.4880\n",
"Epoch 2/5\n",
"13/13 [==============================] - 4s 272ms/step - loss: 0.9393 - accuracy: 0.5000\n",
"32/32 [==============================] - 9s 277ms/step - loss: 0.9699 - accuracy: 0.5920\n",
"Epoch 3/5\n",
"13/13 [==============================] - 4s 271ms/step - loss: 0.8266 - accuracy: 0.4400\n",
"32/32 [==============================] - 9s 276ms/step - loss: 0.7226 - accuracy: 0.7120\n",
"Epoch 4/5\n",
"13/13 [==============================] - 4s 273ms/step - loss: 0.5808 - accuracy: 0.7600\n",
"32/32 [==============================] - 9s 276ms/step - loss: 0.4782 - accuracy: 0.8160\n",
"Epoch 5/5\n",
"13/13 [==============================] - 4s 274ms/step - loss: 0.3491 - accuracy: 0.8200\n",
"500/500 [==============================] - 17s 32ms/step - loss: 0.7582 - accuracy: 0.5700\n",
"32/32 [==============================] - 9s 276ms/step - loss: 0.2463 - accuracy: 0.8880\n",
"500/500 [==============================] - 18s 33ms/step - loss: 1.4256 - accuracy: 0.5620\n",
"BERT model selected : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3\n",
"Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3\n",
"data/imdb/train_labeled_50.csv\n",
"data/medical/train_labeled_50.csv\n",
"Epoch 1/5\n",
"25/25 [==============================] - 27s 280ms/step - loss: 0.9254 - accuracy: 0.5000\n",
"63/63 [==============================] - 37s 279ms/step - loss: 1.5747 - accuracy: 0.3200\n",
"Epoch 2/5\n",
"25/25 [==============================] - 7s 278ms/step - loss: 0.6534 - accuracy: 0.6100\n",
"63/63 [==============================] - 18s 280ms/step - loss: 1.3896 - accuracy: 0.3600\n",
"Epoch 3/5\n",
"25/25 [==============================] - 7s 279ms/step - loss: 0.3462 - accuracy: 0.8500\n",
"63/63 [==============================] - 18s 280ms/step - loss: 1.0450 - accuracy: 0.5520\n",
"Epoch 4/5\n",
"25/25 [==============================] - 7s 279ms/step - loss: 0.1675 - accuracy: 0.9300\n",
"63/63 [==============================] - 18s 280ms/step - loss: 0.4800 - accuracy: 0.8440\n",
"Epoch 5/5\n",
"25/25 [==============================] - 7s 278ms/step - loss: 0.1383 - accuracy: 0.9700\n",
"500/500 [==============================] - 18s 32ms/step - loss: 1.0870 - accuracy: 0.7920\n"
"63/63 [==============================] - 18s 281ms/step - loss: 0.1538 - accuracy: 0.9600\n",
"500/500 [==============================] - 18s 34ms/step - loss: 1.5763 - accuracy: 0.5200\n"
]
}
],
@@ -193,7 +256,7 @@
"for n_per_class in [5,10,25,50]:\n",
" data_file = \"\"\n",
" result = {\"n_per_class\":n_per_class}\n",
" bert = Bert(num_classes = 2, random_state = seed) # create model \n",
" bert = Bert(num_classes = num_classes, random_state = seed) # create model \n",
" for file in labeled_files: ## find correct file\n",
" if f\"train_labeled_{n_per_class}.csv\" in file:\n",
" data_file = file\n",
@@ -210,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -249,36 +312,36 @@
" <tr>\n",
" <th>0</th>\n",
" <td>5.0</td>\n",
" <td>0.578</td>\n",
" <td>0.600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10.0</td>\n",
" <td>0.636</td>\n",
" <td>0.274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>25.0</td>\n",
" <td>0.570</td>\n",
" <td>0.562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>50.0</td>\n",
" <td>0.792</td>\n",
" <td>0.520</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_per_class accuracy\n",
"0 5.0 0.578\n",
"1 10.0 0.636\n",
"2 25.0 0.570\n",
"3 50.0 0.792"
"0 5.0 0.600\n",
"1 10.0 0.274\n",
"2 25.0 0.562\n",
"3 50.0 0.520"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -290,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"id": "gZNtDZIgDXhw"
},
