diff --git a/.gitignore b/.gitignore index 3ec3ce2..b826598 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +*.pkl + .vscode/ __pycache__ .ipynb_checkpoints diff --git a/RCV1 Dataset Visualization/Self Organizing Map.ipynb b/RCV1 Dataset Visualization/Self Organizing Map.ipynb new file mode 100644 index 0000000..e179079 --- /dev/null +++ b/RCV1 Dataset Visualization/Self Organizing Map.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "

In The Name Of GOD

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RCV1 Dataset Visualization with Self Organizing Map" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "import numpy as np\n", + "import pickle as pkl\n", + "from numpy.ma.core import ceil \n", + "from numpy import linalg as LA\n", + "from joblib import Parallel, delayed, effective_n_jobs\n", + "from numpy import argmin, unravel_index, sqrt, ogrid, newaxis\n", + "from sklearn.metrics import DistanceMetric #distance calculation\n", + "from sklearn.utils import resample #resampling\n", + "from sklearn.preprocessing import MinMaxScaler, StandardScaler #normalization\n", + "from sklearn.pipeline import Pipeline #pipeline\n", + "from sklearn.model_selection import train_test_split #split data\n", + "from sklearn.metrics import accuracy_score #scoring\n", + "from sklearn.metrics import confusion_matrix\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib import animation, colors\n", + "from tqdm import tqdm\n", + "import pprint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hyper parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_SAMPLES = 10000 # number of samples to use\n", + "NUM_NEURONS = (5 * np.sqrt(NUM_SAMPLES)) #number of neurons in the SOM rectangular grid\n", + "GRID_SIZE = (ceil(np.sqrt(NUM_NEURONS)).astype(np.int32), ceil(np.sqrt(NUM_NEURONS)).astype(np.int32)) #size of the grid\n", + "NUM_ITERS = 3000 #number of iterations to run the SOM\n", + "BETA0 = 0.5 #initial learning rate\n", + "MU = 2 # initial mu for normal distribution\n", + "SIGMA0 = 11 # initial sigma for normal distribution\n", + "N_JOBS = effective_n_jobs() #number of jobs to run in parallel\n", + "MIN_BETA = 0.05 
# (The comment fragment that opens this span belongs to MIN_BETA, the minimum
#  learning rate, defined at the end of the hyper-parameter list.)
MIN_SIGMA = 1  # minimum sigma for normal distribution


# ## Self Organizing Map (SOM) Implementation
# ### Helper Functions

def normal(x, mu, sigma):
    """Evaluate the normal probability density N(mu, sigma**2) at ``x``.

    ``x`` may be a scalar or a NumPy array; the result has the same shape.
    """
    p = np.divide(1, np.sqrt(2 * np.pi * sigma**2))
    return np.multiply(p, np.exp(-0.5 / sigma**2 * (x - mu) ** 2))


def _linear_decay(start, floor, epoch):
    """Decay linearly from ``start`` (epoch 0) to ``floor`` (epoch NUM_ITERS)."""
    remaining = 1 - np.divide(epoch, NUM_ITERS)
    # max() keeps the value at the floor once epoch exceeds NUM_ITERS.
    return max(floor, (start - floor) * remaining + floor)


def get_beta(epoch):
    """Return the learning rate for ``epoch``.

    Decays linearly from BETA0 to MIN_BETA over NUM_ITERS epochs and never
    drops below MIN_BETA.
    """
    return _linear_decay(BETA0, MIN_BETA, epoch)


def get_sigma(epoch):
    """Return the neighbourhood sigma for ``epoch``.

    Decays linearly from SIGMA0 to MIN_SIGMA over NUM_ITERS epochs and never
    drops below MIN_SIGMA.
    """
    return _linear_decay(SIGMA0, MIN_SIGMA, epoch)


def expand(x, shape):
    """Repeat the 2-D array ``x`` ``shape`` times along a new trailing axis.

    Returns an array of shape ``x.shape + (shape,)``.
    """
    return np.tile(x[:, :, newaxis], (1, 1, shape))


def update_neurons(grid, best_match_idx, w, epoch):
    """Pull every neuron towards the sample ``w``, in place.

    Args:
        grid: float array of shape (rows, cols, n_features) holding the neuron
            weights. It is modified in place.
        best_match_idx: (row, col) of the best matching unit for ``w``.
        w: the input sample, broadcastable against ``grid``'s last axis.
        epoch: current iteration, used to look up the learning rate.

    Returns:
        ``grid`` itself, for convenience; callers may ignore it.

    The previous version rebound the local name ``grid`` (so the caller's
    array never changed) and used ``grid - w``, which moves neurons *away*
    from the sample. Both are fixed here with the standard SOM rule
    ``grid += beta * h * (w - grid)``.
    """
    x0, y0 = best_match_idx
    # Use the grid's own extent so the function works for any map size.
    rows, cols = ogrid[0:grid.shape[0], 0:grid.shape[1]]
    distance_to_best_idx = sqrt((rows - x0) ** 2 + (cols - y0) ** 2)

    # Neighbourhood strength: 1 at the BMU, decaying as 1 / (d + 1).
    # normal() and get_sigma() remain available for a Gaussian neighbourhood.
    ns_values = 1 / (distance_to_best_idx + 1)

    # Broadcasting over the feature axis avoids materialising a grid-sized
    # coefficient array.
    coefficient = ns_values[:, :, newaxis]

    grid += get_beta(epoch) * coefficient * (w - grid)
    return grid


def find_winning_neuron(grid, x):
    """Return the (row, col) index of the neuron closest to ``x`` (L2 norm)."""
    distances = LA.norm(grid - x, axis=-1)
    return unravel_index(argmin(distances), grid.shape[0:-1])


def get_pipeline(scaler=None):
    """Build the preprocessing pipeline.

    Args:
        scaler: transformer used for the 'scaler' step. Defaults to a fresh
            ``StandardScaler()`` per call, so fitted state is never shared
            between pipelines through a def-time default instance.
    """
    if scaler is None:
        scaler = StandardScaler()
    return Pipeline([
        ('scaler', scaler)
    ])


def get_best_matching_units(grid, X: list):
    """Return the best matching unit of every sample in ``X``.

    The per-sample searches are dispatched with joblib using N_JOBS workers;
    the result is a list of (row, col) indices in the order of ``X``.
    """
    return Parallel(n_jobs=N_JOBS)(
        delayed(find_winning_neuron)(grid, sample)
        for sample in tqdm(X, desc='Finding BMUs')
    )


# ### Import Dataset
# Guarded so the helpers above can be imported without fetching the data set.
# Inside a notebook or a script run, __name__ is "__main__" and this executes
# exactly as before.
if __name__ == "__main__":
    from sklearn.datasets import fetch_rcv1  # fetch the RCV1 dataset

    rcv1 = fetch_rcv1()

    X, Y = rcv1.data, rcv1.target

    pprint.pprint(f'X.shape: {X.shape}')
    pprint.pprint(f'Y.shape: {Y.shape}')
    pprint.pprint(f'num_samples: {X.shape[0]}')
    pprint.pprint(f'num_features: {X.shape[1]}')
    pprint.pprint(f'num_classes: {Y.shape[1]}')
# ### Data Preprocessing
# Draw NUM_SAMPLES documents (seeded), then hold out 20 % of them for testing.
X_resampled, Y_resampled = resample(
    X, Y, n_samples=NUM_SAMPLES, random_state=42
)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_resampled, Y_resampled, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and re-used for the test
# split; the sparse matrices are densified first.
pipeline = get_pipeline()
X_train_pipelined = pipeline.fit_transform(X_train.toarray())
X_test_pipelined = pipeline.transform(X_test.toarray())

# ### Self Organizing Map (SOM) Initialization
# One weight vector per grid cell, drawn uniformly from [0, 1).
grid = np.random.rand(*GRID_SIZE, X_train_pipelined.shape[1])

pprint.pprint(f'Grid rectangle is of width and height of {GRID_SIZE}')
pprint.pprint(f'Neuron grid is of shape {grid.shape}')

# ### Training Self Organizing Map (SOM)
# Each iteration picks one random training sample, finds its best matching
# unit and asks update_neurons to adjust the map around it.
n_train_rows = X_train_pipelined.shape[0]
for epoch in tqdm(range(NUM_ITERS), desc='Epochs', leave=True):
    rnd_idx = np.random.randint(0, n_train_rows)
    x = X_train_pipelined[rnd_idx]
    best_match_idx = find_winning_neuron(grid, x)
    update_neurons(grid, best_match_idx, x, epoch)
# Guarded so that importing this code (e.g. to reuse calc_u_matrix) does not
# touch the file system. Inside a notebook or a script run, __name__ is
# "__main__" and the save/load round trip executes as before.
if __name__ == "__main__":
    # ### Saving weights of SOM
    with open('model/som.pkl', 'wb') as f:
        pkl.dump(grid, f)
    pprint.pprint('SOM saved to model/som.pkl')

    # ### Loading weights of SOM
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # som.pkl that this notebook produced itself, never one from an untrusted
    # source.
    with open('model/som.pkl', 'rb') as f:
        grid = pkl.load(f)
    pprint.pprint('Grid has been loaded from model/som.pkl')


# ## Visualizing SOM
# ### Helper Functions

dx = [0, 0, 1, -1]  # neighborhood offsets (rows)
dy = [1, -1, 0, 0]  # neighborhood offsets (columns)


def calc_u_matrix(grid):
    """Compute the U-matrix of a SOM weight grid.

    Args:
        grid: array of shape (rows, cols, n_features) with the neuron weights.

    Returns:
        Array of shape (rows, cols). Each entry is the mean Euclidean distance
        between that neuron's weight vector and those of its 4-connected
        neighbours (as listed in ``dx``/``dy``). A neuron without any
        neighbour (1x1 grid) gets 0.0 instead of a division by zero.

    The output is sized from ``grid`` itself rather than the global GRID_SIZE,
    so a grid loaded from disk with a different shape is handled correctly.
    """
    n_rows, n_cols = grid.shape[0], grid.shape[1]
    u_matrix = np.zeros((n_rows, n_cols))
    for pos in itertools.product(range(n_rows), range(n_cols)):
        neighbour_distances = [
            LA.norm(grid[pos] - grid[pos[0] + off_x, pos[1] + off_y], axis=-1)
            for off_x, off_y in zip(dx, dy)
            if 0 <= pos[0] + off_x < n_rows and 0 <= pos[1] + off_y < n_cols
        ]
        if neighbour_distances:
            u_matrix[pos] = sum(neighbour_distances) / len(neighbour_distances)
    return u_matrix
# ### Plotting U-Matrix
u_matrix = calc_u_matrix(grid)  # calculate the U matrix

cmap = "CMRmap"
fontsize = 12
_, ax = plt.subplots(figsize=(10, 8))
img = ax.imshow(u_matrix, cmap=cmap)

# ticks and labels
# Tick.label was deprecated and then removed in Matplotlib 3.8; tick_params
# sets the same major-tick label size on both axes without touching Tick
# objects directly.
ax.tick_params(axis="both", which="major", labelsize=fontsize)
ax.set_ylabel("SOM rows", fontsize=fontsize)
ax.set_xlabel("SOM columns", fontsize=fontsize)

# colorbar
cbar = plt.colorbar(img, ax=ax, fraction=0.04, pad=0.04)
cbar.ax.set_ylabel(
    "Distance measure (a.u.)", rotation=90, fontsize=fontsize, labelpad=20
)
cbar.ax.tick_params(labelsize=fontsize)

# ### Calculating best matching units for each test data point
#
# Author's note: finding the best matching units is much faster per sample
# than training because the search is **parallelized** across samples
# (roughly 10 times faster than the training loop on the author's 10-core
# CPU).
bmus = get_best_matching_units(grid, X_test_pipelined)
"465bf8bf39eb6d29a2a62c1b7df28f314b583a714539b89eff15d00bb86422b4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/RCV1 Dataset Visualization/model/README.md b/RCV1 Dataset Visualization/model/README.md new file mode 100644 index 0000000..2446e46 --- /dev/null +++ b/RCV1 Dataset Visualization/model/README.md @@ -0,0 +1,3 @@ +
+

This directory is intentionally left empty: the trained SOM weights (`som.pkl`) are not committed, so that the repository does not grow to an unreasonable size. Running the notebook saves them here.

+