Commit: finish code

MortenBlorstad committed Nov 11, 2021
1 parent f536bdb commit be0b038
Showing 63 changed files with 11,627 additions and 2,365 deletions.
601 changes: 340 additions & 261 deletions .ipynb_checkpoints/Bert-checkpoint.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .ipynb_checkpoints/GPT2Tuner-checkpoint.py
@@ -107,7 +107,7 @@ def __clean_data(self, data_path: str):
#GPT2 takes only 1024 tokens, so we limit the text to 1021
for text, label in zip(df.iloc[:,0], df.iloc[:,1]):
label = str(label)
sequences.append(str(label) + self.bos + ' '.join(str(text).split()[:1021]) + self.eos)
sequences.append(str(label) + self.bos + text + self.eos)
labels.append(label)

#Clean the sequences
601 changes: 340 additions & 261 deletions Bert.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion GPT2Tuner.py
@@ -107,7 +107,7 @@ def __clean_data(self, data_path: str):
#GPT2 takes only 1024 tokens, so we limit the text to 1021
for text, label in zip(df.iloc[:,0], df.iloc[:,1]):
label = str(label)
sequences.append(str(label) + self.bos + ' '.join(str(text).split()[:1021]) + self.eos)
sequences.append(str(label) + self.bos + text + self.eos)
labels.append(label)

#Clean the sequences
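For context on the hunk above: GPT-2's context window is 1024 tokens, and capping the text at 1021 whitespace-separated words leaves room for the label and the BOS/EOS markers. A minimal sketch of that idea follows, with placeholder special-token strings as assumptions (GPT2Tuner defines its own self.bos and self.eos); note that whitespace words only approximate GPT-2's BPE token count, so the cap is a heuristic rather than a hard guarantee.

# Minimal sketch of the truncation in this hunk; the token strings are
# assumptions, since GPT2Tuner defines its own self.bos and self.eos.
bos = "<|startoftext|>"
eos = "<|endoftext|>"

def build_sequence(label, text, limit=1021):
    # Cap the text at `limit` whitespace-separated words so that
    # label + bos + text + eos stays near GPT-2's 1024-token window.
    truncated = ' '.join(str(text).split()[:limit])
    return str(label) + bos + truncated + eos

print(build_sequence("1", "an example review " * 600)[:80])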
20 changes: 18 additions & 2 deletions GanBert.py
@@ -12,13 +12,18 @@
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd


# Source
#------------------------------
# The Generator as in
# https://www.aclweb.org/anthology/2020.acl-main.191/
# https://github.com/crux82/ganbert
#------------------------------

class Generator(nn.Module):
'''
Generator model that produces “fake” examples resembling the data distribution. Used together with the discriminator
when training the GAN-BERT model.
'''
def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
super(Generator, self).__init__()
layers = []
@@ -39,6 +44,9 @@ def forward(self, noise):
# https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
'''
The BERT model is used as the discriminator. After training, this model is used for predictions.
'''
def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
super(Discriminator, self).__init__()
self.input_dropout = nn.Dropout(p=dropout_rate)
@@ -58,8 +66,16 @@ def forward(self, input_rep):
probs = self.softmax(logits)
return last_rep, logits, probs


#------------------------------
# GANBERT Model
# https://github.com/crux82/ganbert
#------------------------------
class GanBert():
'''
The GAN-BERT model uses a generator to produce “fake” examples resembling the data distribution and a discriminator to
distinguish the generator's samples from the real instances.
The BERT model is used as the discriminator.
'''
def __init__(self,batch_size=64,max_seq_length = 64,num_hidden_layers_g = 1,
num_hidden_layers_d =1,noise_size = 100,out_dropout_rate = 0.2,
apply_balance = True,learning_rate_discriminator = 5e-5,learning_rate_generator = 5e-5,
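For readers skimming the collapsed diff: a minimal sketch, under stated assumptions, of how the two modules documented above fit together. The real wiring lives inside GanBert's training loop, which this page truncates; real_rep below is a stand-in for BERT's encoded output, not the actual model.

import torch

# Illustration only: wire up the Generator/Discriminator pair from this commit.
gen = Generator(noise_size=100, output_size=512)
dis = Discriminator(input_size=512, hidden_sizes=[512], num_labels=2)

batch_size = 8
noise = torch.rand(batch_size, 100)          # random noise fed to the generator
fake_rep = gen(noise)                        # "fake" BERT-like representations
real_rep = torch.rand(batch_size, 512)       # stand-in for real BERT encodings
_, fake_logits, fake_probs = dis(fake_rep)   # discriminator scores the fake batch
_, real_logits, real_probs = dis(real_rep)   # ... and the real batch

Per the GAN-BERT paper linked above, the discriminator is then trained to separate generated samples from real ones while the generator is trained to fool it.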
126 changes: 126 additions & 0 deletions Notebooks/.ipynb_checkpoints/results-checkpoint.ipynb

Large diffs are not rendered by default.

159 changes: 111 additions & 48 deletions Notebooks/Bert.ipynb
@@ -1,5 +1,12 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Install dependencies "
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -15,11 +22,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[31mERROR: Could not find a version that satisfies the requirement official.nlp (from versions: none)\u001b[0m\n",
"\u001b[31mERROR: No matching distribution found for official.nlp\u001b[0m\n",
"Requirement already satisfied: numpy in /opt/conda/lib/python3.7/site-packages (1.19.5)\n",
"Requirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (1.3.2)\n",
"Requirement already satisfied: numpy>=1.17.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (1.19.5)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2017.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2021.1)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n"
]
}
@@ -30,10 +39,25 @@
"# A dependency of the preprocessing for BERT inputs\n",
"!pip install -q -U tensorflow-text\n",
"!pip install -q tf-models-official\n",
"\n",
"!pip install -q tensorflow\n",
"!pip install -q tensorflow_hub\n",
"\n",
"!pip install -q official.nlp\n",
"\n",
"\n",
"!pip install -q matplotlib\n",
"!pip install numpy\n",
"!pip install pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
@@ -52,6 +76,22 @@
"from Bert import Bert\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Set-up\n",
"\n",
"Set gloabal variable for changing dataset.\n",
"\n",
"Set data_name to the name of your dataset. This needs to correspond to a folder in /data/, which should be generated by the generate_data.ipynb notebook. num_classes manually needs to be set to the number of classes in your dataset.\n",
"\n",
"- data_name: \"imdb\" or \"medical\"\n",
"- num_classes:\n",
" - imdb: 2\n",
" - medical: 5"
]
},
{
"cell_type": "code",
"execution_count": 3,
@@ -62,14 +102,23 @@
"source": [
"## gloabal variable for changing dataset.\n",
"## data_name possible values: \"imdb\", \"medical\"\n",
"data_name = \"imdb\"\n",
"data_name = \"medical\"\n",
"## num_classes possible values: \"imdb\"=2, \"medical\"=5\n",
"num_classes = 2"
"num_classes = 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get data paths\n",
"- Get the file path for training data sets, 5, 10, 25, and 50 per label\n",
"- Get the path for the test set"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -82,9 +131,16 @@
"test_path = data_path+\"/test.csv\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set hyper-parameters for Bert"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"id": "IgGLg_kOxhg3"
},
@@ -100,9 +156,16 @@
"results=pd.DataFrame(columns=[\"n_per_class\", \"accuracy\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Traning and evalueate the Bert classifier"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -123,68 +186,68 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-11-11 16:06:22.949889: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10800 MB memory: -> device: 0, name: Tesla K80, pci bus id: 0000:06:00.0, compute capability: 3.7\n",
"2021-11-11 16:06:26.100253: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
"2021-11-11 19:36:54.988275: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10800 MB memory: -> device: 0, name: Tesla K80, pci bus id: 0000:06:00.0, compute capability: 3.7\n",
"2021-11-11 19:36:58.084455: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"data/imdb/train_labeled_5.csv\n",
"data/medical/train_labeled_5.csv\n",
"Epoch 1/5\n",
"3/3 [==============================] - 21s 261ms/step - loss: 0.8798 - accuracy: 0.5000\n",
"7/7 [==============================] - 22s 269ms/step - loss: 1.5570 - accuracy: 0.3600\n",
"Epoch 2/5\n",
"3/3 [==============================] - 1s 242ms/step - loss: 0.8023 - accuracy: 0.6000\n",
"7/7 [==============================] - 2s 262ms/step - loss: 1.0369 - accuracy: 0.6000\n",
"Epoch 3/5\n",
"3/3 [==============================] - 1s 242ms/step - loss: 0.8143 - accuracy: 0.5000\n",
"7/7 [==============================] - 2s 265ms/step - loss: 1.1334 - accuracy: 0.6000\n",
"Epoch 4/5\n",
"3/3 [==============================] - 1s 242ms/step - loss: 0.3634 - accuracy: 0.9000\n",
"7/7 [==============================] - 2s 262ms/step - loss: 0.9176 - accuracy: 0.7600\n",
"Epoch 5/5\n",
"3/3 [==============================] - 1s 243ms/step - loss: 0.4016 - accuracy: 0.9000\n",
"500/500 [==============================] - 18s 34ms/step - loss: 0.7131 - accuracy: 0.5780\n",
"7/7 [==============================] - 2s 261ms/step - loss: 0.8068 - accuracy: 0.7200\n",
"500/500 [==============================] - 18s 33ms/step - loss: 0.9731 - accuracy: 0.6000\n",
"BERT model selected : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3\n",
"Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3\n",
"data/imdb/train_labeled_10.csv\n",
"data/medical/train_labeled_10.csv\n",
"Epoch 1/5\n",
"5/5 [==============================] - 21s 291ms/step - loss: 0.6899 - accuracy: 0.6500\n",
"13/13 [==============================] - 23s 278ms/step - loss: 2.0240 - accuracy: 0.1000\n",
"Epoch 2/5\n",
"5/5 [==============================] - 1s 280ms/step - loss: 0.8311 - accuracy: 0.5000\n",
"13/13 [==============================] - 4s 272ms/step - loss: 1.4455 - accuracy: 0.4000\n",
"Epoch 3/5\n",
"5/5 [==============================] - 1s 279ms/step - loss: 0.4144 - accuracy: 0.7500\n",
"13/13 [==============================] - 4s 273ms/step - loss: 1.1934 - accuracy: 0.4600\n",
"Epoch 4/5\n",
"5/5 [==============================] - 1s 279ms/step - loss: 0.2541 - accuracy: 0.9500\n",
"13/13 [==============================] - 4s 275ms/step - loss: 0.8327 - accuracy: 0.7600\n",
"Epoch 5/5\n",
"5/5 [==============================] - 1s 282ms/step - loss: 0.2200 - accuracy: 1.0000\n",
"500/500 [==============================] - 18s 33ms/step - loss: 0.6528 - accuracy: 0.6360\n",
"13/13 [==============================] - 4s 274ms/step - loss: 0.6555 - accuracy: 0.8600\n",
"500/500 [==============================] - 18s 33ms/step - loss: 1.7167 - accuracy: 0.2740\n",
"BERT model selected : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3\n",
"Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3\n",
"data/imdb/train_labeled_25.csv\n",
"data/medical/train_labeled_25.csv\n",
"Epoch 1/5\n",
"13/13 [==============================] - 24s 276ms/step - loss: 0.7634 - accuracy: 0.6200\n",
"32/32 [==============================] - 28s 278ms/step - loss: 1.2559 - accuracy: 0.4880\n",
"Epoch 2/5\n",
"13/13 [==============================] - 4s 272ms/step - loss: 0.9393 - accuracy: 0.5000\n",
"32/32 [==============================] - 9s 277ms/step - loss: 0.9699 - accuracy: 0.5920\n",
"Epoch 3/5\n",
"13/13 [==============================] - 4s 271ms/step - loss: 0.8266 - accuracy: 0.4400\n",
"32/32 [==============================] - 9s 276ms/step - loss: 0.7226 - accuracy: 0.7120\n",
"Epoch 4/5\n",
"13/13 [==============================] - 4s 273ms/step - loss: 0.5808 - accuracy: 0.7600\n",
"32/32 [==============================] - 9s 276ms/step - loss: 0.4782 - accuracy: 0.8160\n",
"Epoch 5/5\n",
"13/13 [==============================] - 4s 274ms/step - loss: 0.3491 - accuracy: 0.8200\n",
"500/500 [==============================] - 17s 32ms/step - loss: 0.7582 - accuracy: 0.5700\n",
"32/32 [==============================] - 9s 276ms/step - loss: 0.2463 - accuracy: 0.8880\n",
"500/500 [==============================] - 18s 33ms/step - loss: 1.4256 - accuracy: 0.5620\n",
"BERT model selected : https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3\n",
"Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3\n",
"data/imdb/train_labeled_50.csv\n",
"data/medical/train_labeled_50.csv\n",
"Epoch 1/5\n",
"25/25 [==============================] - 27s 280ms/step - loss: 0.9254 - accuracy: 0.5000\n",
"63/63 [==============================] - 37s 279ms/step - loss: 1.5747 - accuracy: 0.3200\n",
"Epoch 2/5\n",
"25/25 [==============================] - 7s 278ms/step - loss: 0.6534 - accuracy: 0.6100\n",
"63/63 [==============================] - 18s 280ms/step - loss: 1.3896 - accuracy: 0.3600\n",
"Epoch 3/5\n",
"25/25 [==============================] - 7s 279ms/step - loss: 0.3462 - accuracy: 0.8500\n",
"63/63 [==============================] - 18s 280ms/step - loss: 1.0450 - accuracy: 0.5520\n",
"Epoch 4/5\n",
"25/25 [==============================] - 7s 279ms/step - loss: 0.1675 - accuracy: 0.9300\n",
"63/63 [==============================] - 18s 280ms/step - loss: 0.4800 - accuracy: 0.8440\n",
"Epoch 5/5\n",
"25/25 [==============================] - 7s 278ms/step - loss: 0.1383 - accuracy: 0.9700\n",
"500/500 [==============================] - 18s 32ms/step - loss: 1.0870 - accuracy: 0.7920\n"
"63/63 [==============================] - 18s 281ms/step - loss: 0.1538 - accuracy: 0.9600\n",
"500/500 [==============================] - 18s 34ms/step - loss: 1.5763 - accuracy: 0.5200\n"
]
}
],
@@ -193,7 +256,7 @@
"for n_per_class in [5,10,25,50]:\n",
" data_file = \"\"\n",
" result = {\"n_per_class\":n_per_class}\n",
" bert = Bert(num_classes = 2, random_state = seed) # create model \n",
" bert = Bert(num_classes = num_classes, random_state = seed) # create model \n",
" for file in labeled_files: ## find correct file\n",
" if f\"train_labeled_{n_per_class}.csv\" in file:\n",
" data_file = file\n",
@@ -210,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -249,36 +312,36 @@
" <tr>\n",
" <th>0</th>\n",
" <td>5.0</td>\n",
" <td>0.578</td>\n",
" <td>0.600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10.0</td>\n",
" <td>0.636</td>\n",
" <td>0.274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>25.0</td>\n",
" <td>0.570</td>\n",
" <td>0.562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>50.0</td>\n",
" <td>0.792</td>\n",
" <td>0.520</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" n_per_class accuracy\n",
"0 5.0 0.578\n",
"1 10.0 0.636\n",
"2 25.0 0.570\n",
"3 50.0 0.792"
"0 5.0 0.600\n",
"1 10.0 0.274\n",
"2 25.0 0.562\n",
"3 50.0 0.520"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -290,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"id": "gZNtDZIgDXhw"
},
