diff --git a/.gitignore b/.gitignore
index 3ec3ce2..b826598 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+*.pkl
+
.vscode/
__pycache__
.ipynb_checkpoints
diff --git a/RCV1 Dataset Visualization/Self Organizing Map.ipynb b/RCV1 Dataset Visualization/Self Organizing Map.ipynb
new file mode 100644
index 0000000..e179079
--- /dev/null
+++ b/RCV1 Dataset Visualization/Self Organizing Map.ipynb
@@ -0,0 +1,461 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "In The Name Of GOD\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# RCV1 Dataset Visualization with Self Organizing Map"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import Libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import itertools\n",
+ "import numpy as np\n",
+ "import pickle as pkl\n",
+ "from numpy.ma.core import ceil \n",
+ "from numpy import linalg as LA\n",
+ "from joblib import Parallel, delayed, effective_n_jobs\n",
+ "from numpy import argmin, unravel_index, sqrt, ogrid, newaxis\n",
+ "from sklearn.metrics import DistanceMetric #distance calculation\n",
+ "from sklearn.utils import resample #resampling\n",
+ "from sklearn.preprocessing import MinMaxScaler, StandardScaler #normalization\n",
+ "from sklearn.pipeline import Pipeline #pipeline\n",
+ "from sklearn.model_selection import train_test_split #split data\n",
+ "from sklearn.metrics import accuracy_score #scoring\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib import animation, colors\n",
+ "from tqdm import tqdm\n",
+ "import pprint"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Hyper parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "NUM_SAMPLES = 10000 #number of rows drawn from the dataset\n",
+ "NUM_NEURONS = 5 * np.sqrt(NUM_SAMPLES) #target neuron count (a float); the grid side below is rounded up from it\n",
+ "GRID_SIZE = (ceil(np.sqrt(NUM_NEURONS)).astype(np.int32),) * 2 #square grid: the same side length for both axes\n",
+ "NUM_ITERS = 3000 #number of iterations to run the SOM\n",
+ "N_JOBS = effective_n_jobs() #number of jobs to run in parallel\n",
+ "BETA0 = 0.5 #initial learning rate\n",
+ "MIN_BETA = 0.05 #minimum learning rate\n",
+ "SIGMA0 = 11 #initial sigma for normal distribution\n",
+ "MIN_SIGMA = 1 #minimum sigma for normal distribution\n",
+ "MU = 2 #initial mu for normal distribution"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Self Organizing Map (SOM) Implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Helper Functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def normal(x, mu, sigma): #Gaussian density with mean mu and standard deviation sigma, evaluated at x\n",
+ "    scale = 1 / np.sqrt(2 * np.pi * sigma**2) #normalization constant in front of the exponential\n",
+ "    return scale * np.exp(-0.5 / sigma**2 * (x - mu)**2) #scaled bell curve\n",
+ "\n",
+ "def get_beta(epoch): #learning rate: linear decay from BETA0 down to MIN_BETA over NUM_ITERS iterations\n",
+ "    remaining = 1 - np.divide(epoch, NUM_ITERS) #fraction of the training that is still ahead\n",
+ "    return max(MIN_BETA, MIN_BETA + (BETA0 - MIN_BETA) * remaining) #never drops below MIN_BETA\n",
+ "\n",
+ "def get_sigma(epoch): #sigma: linear decay from SIGMA0 down to MIN_SIGMA over NUM_ITERS iterations\n",
+ "    remaining = 1 - np.divide(epoch, NUM_ITERS) #fraction of the training that is still ahead\n",
+ "    return max(MIN_SIGMA, MIN_SIGMA + (SIGMA0 - MIN_SIGMA) * remaining) #never drops below MIN_SIGMA\n",
+ "\n",
+ "def expand(x, shape): #repeat a 2-D array `shape` times along a new trailing axis\n",
+ "    stacked = x[:, :, newaxis] #append a trailing axis of length 1\n",
+ "    return np.tile(stacked, (1, 1, shape)) #copy the values `shape` times along that axis\n",
+ "\n",
+ "def update_neurons(grid, best_match_idx, w, epoch): #pull the neurons toward the input w, in place\n",
+ "    # The update is written into `grid` itself (and the same array is returned): rebinding the\n",
+ "    # local name instead would leave the caller's grid untouched, i.e. nothing would be learned.\n",
+ "    x0, y0 = best_match_idx #get the coordinates of the best match\n",
+ "    x, y = ogrid[0:grid.shape[0], 0:grid.shape[1]] #coordinates of every neuron, taken from the grid itself\n",
+ "    distance_to_best_idx = sqrt(np.power((x - x0), 2) + np.power((y - y0), 2)) #map distance of every neuron to the best match\n",
+ "\n",
+ "    # Neighborhood weight: 1 at the best match, decaying with map distance. The Gaussian kernel\n",
+ "    # (normal / get_sigma) is not used here, so the weight does not depend on the epoch.\n",
+ "    ns_values = 1 / (distance_to_best_idx + 1)\n",
+ "    coefficient = ns_values[:, :, newaxis] #broadcasts over the feature axis without copying\n",
+ "\n",
+ "    grid += get_beta(epoch) * coefficient * (w - grid) #(w - grid) moves the weights toward the input\n",
+ "    return grid\n",
+ "\n",
+ "def find_winning_neuron(grid, x): #locate the neuron whose weights are closest to the input x\n",
+ "    flat_idx = argmin(LA.norm(grid - x, axis=-1)) #flat index of the smallest Euclidean distance\n",
+ "    return unravel_index(flat_idx, grid.shape[:-1]) #convert it back to grid coordinates\n",
+ "\n",
+ "def get_pipeline(scaler=None): #create a preprocessing pipeline for the data\n",
+ "    if scaler is None: #a default instance in the signature would be created once and shared by every pipeline\n",
+ "        scaler = StandardScaler()\n",
+ "    return Pipeline([('scaler', scaler)]) #single scaling step\n",
+ "\n",
+ "def get_best_matching_units(grid, X: list): #best matching unit of every sample in X, searched in parallel\n",
+ "    find_bmu = delayed(find_winning_neuron) #wrap once, then build one task per sample\n",
+ "    return Parallel(n_jobs=N_JOBS)(find_bmu(grid, X[row]) for row in tqdm(range(len(X)), desc='Finding BMUs'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Import Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import fetch_rcv1 #fetch the RCV1 dataset\n",
+ "rcv1 = fetch_rcv1() \n",
+ "\n",
+ "X, Y = rcv1.data, rcv1.target"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "'X.shape: (804414, 47236)'\n",
+ "'Y.shape: (804414, 103)'\n",
+ "'num_samples: 804414'\n",
+ "'num_features: 47236'\n",
+ "'num_classes: 103'\n"
+ ]
+ }
+ ],
+ "source": [
+ "pprint.pprint(f'X.shape: {X.shape}')\n",
+ "pprint.pprint(f'Y.shape: {Y.shape}')\n",
+ "pprint.pprint(f'num_samples: {X.shape[0]}')\n",
+ "pprint.pprint(f'num_features: {X.shape[1]}')\n",
+ "pprint.pprint(f'num_classes: {Y.shape[1]}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_resampled, Y_resampled = resample(X, Y, n_samples=NUM_SAMPLES, random_state=42) #resample data\n",
+ "\n",
+ "X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42) #split data into training and testing sets\n",
+ "\n",
+ "pipeline = get_pipeline()\n",
+ "X_train_pipelined = pipeline.fit_transform(X_train.toarray()) #scale data\n",
+ "X_test_pipelined = pipeline.transform(X_test.toarray()) #scale data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Self Organizing Map (SOM) Initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "'Grid rectangle is of width and height of (23, 23)'\n",
+ "'Neuron grid is of shape (23, 23, 47236)'\n"
+ ]
+ }
+ ],
+ "source": [
+ "grid = np.random.rand(*GRID_SIZE, X_train_pipelined.shape[1]) #initialize the grid with random values\n",
+ "\n",
+ "pprint.pprint(f'Grid rectangle is of width and height of {GRID_SIZE}')\n",
+ "pprint.pprint(f'Neuron grid is of shape {grid.shape}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Training Self Organizing Map (SOM)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Epochs: 100%|██████████| 3000/3000 [10:17<00:00, 4.86it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "for epoch in tqdm(range(NUM_ITERS), desc='Epochs', leave=True): #train the SOM\n",
+ " rnd_idx = np.random.randint(0, X_train_pipelined.shape[0]) #get a random index\n",
+ " x = X_train_pipelined[rnd_idx] #get the data\n",
+ " best_match_idx = find_winning_neuron(grid, x) #find the index of the neuron with the smallest distance to the input\n",
+ " update_neurons(grid, best_match_idx, x, epoch) #update the neurons"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Saving weights of SOM"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "'SOM saved to model/som.pkl'\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os #only needed to create the output directory\n",
+ "\n",
+ "os.makedirs('model', exist_ok=True) #*.pkl files are git-ignored, so model/ may be missing in a fresh checkout\n",
+ "with open('model/som.pkl', 'wb') as f: #save the SOM\n",
+ "    pkl.dump(grid, f)\n",
+ "pprint.pprint(f'SOM saved to model/som.pkl') #reported once the file has been written and closed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Loading weights of SOM"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "'Grid has been loaded from model/som.pkl'\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open('model/som.pkl', 'rb') as f: #NOTE: unpickling can run arbitrary code; only load files written by this notebook\n",
+ "    grid = pkl.load(f) #restore the weight grid\n",
+ "pprint.pprint(f'Grid has been loaded from model/som.pkl') #reported once the file has been read and closed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualizing SOM"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Helper Functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dx = [0, 0, 1, -1] #neighborhood offsets\n",
+ "dy = [1, -1, 0, 0] #neighborhood offsets\n",
+ "\n",
+ "def calc_u_matrix(grid): #U-matrix: mean weight distance between each neuron and its direct grid neighbors\n",
+ "    rows, cols = grid.shape[:2] #sized from the grid itself, so a loaded grid need not match GRID_SIZE\n",
+ "    u_matrix = np.zeros((rows, cols)) #initialize the U matrix\n",
+ "    for pos in itertools.product(range(rows), range(cols)):\n",
+ "        neighbor_distances = [] #distances to the neighbors that lie inside the grid\n",
+ "        for step_x, step_y in zip(dx, dy):\n",
+ "            x, y = pos[0] + step_x, pos[1] + step_y\n",
+ "            if 0 <= x < rows and 0 <= y < cols:\n",
+ "                neighbor_distances.append(LA.norm(grid[pos] - grid[x, y], axis=-1))\n",
+ "        if neighbor_distances: #a 1x1 grid has no neighbors: keep 0 instead of dividing by zero\n",
+ "            u_matrix[pos] = sum(neighbor_distances) / len(neighbor_distances)\n",
+ "    return u_matrix"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Plotting U-Matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "u_matrix = calc_u_matrix(grid) #calculate the U matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "