kenextra
diff --git a/‎README.md
+24 b/‎README.md
+24
diff --git a/‎SVM.ipynb
+361 b/‎SVM.ipynb
+361
@@ -0,0 +1,24 @@
+# SVM ALGORITHM
+
+## preprocess.py:
+Input: raw data
+Output: document term matrix
+Overview: Contains functions that takes the raw data and produces document-term matrix
+
+## SVM.py:
+Input: document-term matrix
+Output: trained model and predictions with model
+Overview: Contains an svm class use to build, train and predict a given data set. It also has a function
+            for creating the confusion matrix
+
+## Packages:
+The following packages are required:
+numpy for scientific computing
+pandas for loading files
+scipy for mathematics, science and engineering calculations
+nltk for natural language processing
+scikit-learn for machine learning algorithms
+
+# CITATIONS:
+I consulted a matlab code from Machine Learning course on coursera taught by Stanford University professor
+Andrew Ng.
@@ -0,0 +1,361 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ALL IMPORT STATEMENT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from numpy import linalg\n",
+    "import scipy.io as spio"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FUNCTION TO CALCULATE CONFUSING MATRIX, ACCURACY AND FM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def confusionMatrix(y_actual, y_predicted):\n",
+    "    tp = 0\n",
+    "    tn = 0\n",
+    "    fp = 0\n",
+    "    fn = 0\n",
+    "    \n",
+    "    for i in range(len(y_actual)):\n",
+    "        if y_actual[i] > 0:\n",
+    "            if y_actual[i] == y_predicted[i]:\n",
+    "                tp = tp + 1\n",
+    "            else:\n",
+    "                fn = fn + 1\n",
+    "        if y_actual[i] < 1:\n",
+    "            if y_actual[i] == y_predicted[i]:\n",
+    "                tn = tn + 1\n",
+    "            else:\n",
+    "                fp = fp + 1\n",
+    "                \n",
+    "    cm = [[tn, fp], [fn, tp]]\n",
+    "    accuracy = (tp+tn)/(tp+tn+fp+fn)\n",
+    "    sens = tp/(tp+fn)\n",
+    "    prec = tp/(tp+fp)\n",
+    "    fm = (2*prec*sens)/(prec+sens)\n",
+    "    return cm, accuracy, fm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FUNCTION FOR EACH SVM KERNEL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def linear_kernel(x1, x2):\n",
+    "    return np.dot(x1, x2)\n",
+    "    \n",
+    "def polynomial_kernel(x, y, p=3):\n",
+    "    return (1 + np.dot(x, y)) ** p\n",
+    "\n",
+    "def gaussian_kernel(x, y, sigma=5.0):\n",
+    "    return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SVM CLASS WITH TRAIN AND PREDICT FUNCTION"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class SVM(object):\n",
+    "    \n",
+    "    def __init__(self, kernel=linear_kernel, tol=1e-3, C=0.1, max_passes=5):\n",
+    "        \n",
+    "        self.kernel = kernel\n",
+    "        self.tol = tol\n",
+    "        self.C = C\n",
+    "        self.max_passes = max_passes\n",
+    "        self.model = dict()\n",
+    "    \n",
+    "    def svmTrain(self, X, Y):\n",
+    "        # Data parameters\n",
+    "        m = X.shape[0]\n",
+    "        \n",
+    "        # Map 0 to -1\n",
+    "        Y = np.where(Y == 0, -1, 1)\n",
+    "        \n",
+    "        # Variables\n",
+    "        alphas = np.zeros((m, 1), dtype=float)\n",
+    "        b = 0.0\n",
+    "        E = np.zeros((m, 1),dtype=float)\n",
+    "        passes = 0\n",
+    "        \n",
+    "        # Precompute the kernel matrix\n",
+    "        if self.kernel == linear_kernel:\n",
+    "            print('Precomputing the kernel matrix')\n",
+    "            K = X @ X.T\n",
+    "        elif self.kernel == gaussian_kernel:\n",
+    "            print('Precomputing the kernel matrix')\n",
+    "            X2 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)\n",
+    "            K = X2 + (X2.T - (2 * (X @ X.T)))\n",
+    "            K = np.power(self.kernel(1, 0), K)\n",
+    "        else:\n",
+    "            # Pre-compute the Kernel Matrix\n",
+    "            # The following can be slow due to lack of vectorization\n",
+    "            print('Precomputing the kernel matrix')\n",
+    "            K = np.zeros((m, m))\n",
+    "            for i in range(m):\n",
+    "                for j in range(m):\n",
+    "                    x1 = np.transpose(X[i, :])\n",
+    "                    x2 = np.transpose(X[j, :])\n",
+    "                    K[i, j] = self.kernel(x1, x2)\n",
+    "                    K[i, j] = K[j, i]\n",
+    "                    \n",
+    "        print('Training...')\n",
+    "        print('This may take 1 to 2 minutes')\n",
+    "\n",
+    "        while passes < self.max_passes:\n",
+    "            num_changed_alphas = 0\n",
+    "            \n",
+    "            for i in range(m):\n",
+    "\n",
+    "                E[i] = b + np.sum( alphas * Y * K[:, i].reshape(-1, 1)) - Y[i]\n",
+    "\n",
+    "                if (Y[i] * E[i] < -self.tol and alphas[i] < self.C) or (Y[i] * E[i] > self.tol and alphas[i] > 0):\n",
+    "                    j = np.random.randint(0, m)\n",
+    "                    while j == i:\n",
+    "                        # make sure i is not equal to j\n",
+    "                        j = np.random.randint(0, m)\n",
+    "\n",
+    "                    E[j] = b + np.sum(alphas * Y * K[:, j].reshape(-1, 1)) - Y[j]\n",
+    "\n",
+    "                    # Save old alphas\n",
+    "                    alpha_i_old = alphas[i, 0]\n",
+    "                    alpha_j_old = alphas[j, 0]\n",
+    "\n",
+    "                    # Compute L and H by (10) or (11)\n",
+    "                    if Y[i] == Y[j]:\n",
+    "                        L = max(0, alphas[j] + alphas[i] - self.C)\n",
+    "                        H = min(self.C, alphas[j] + alphas[i])\n",
+    "                    else:\n",
+    "                        L = max(0, alphas[j] - alphas[i])\n",
+    "                        H = min(self.C, self.C + alphas[j] - alphas[i])\n",
+    "                    if L == H:\n",
+    "                        # continue to next i\n",
+    "                        continue\n",
+    "\n",
+    "                    # compute eta by (14)\n",
+    "                    eta = 2 * K[i, j] - K[i, i] - K[j, j]\n",
+    "                    if eta >= 0:\n",
+    "                        # continue to next i\n",
+    "                        continue\n",
+    "\n",
+    "                    # compute and clip new value for alpha j using (12) and (15)\n",
+    "                    alphas[j] = alphas[j] - (Y[j] * (E[i] - E[j])) / eta\n",
+    "\n",
+    "                    # Clip\n",
+    "                    alphas[j] = min(H, alphas[j])\n",
+    "                    alphas[j] = max(L, alphas[j])\n",
+    "\n",
+    "                    # Check if change in alpha is significant\n",
+    "                    if np.abs(alphas[j] - alpha_j_old) < self.tol:\n",
+    "                        # continue to the next i\n",
+    "                        # replace anyway\n",
+    "                        alphas[j] = alpha_j_old\n",
+    "                        continue\n",
+    "\n",
+    "                    # Determine value for alpha i using (16)\n",
+    "                    alphas[i] = alphas[i] + Y[i] * Y[j] * (alpha_j_old - alphas[j])\n",
+    "\n",
+    "                    # Compute b1 and b2 using (17) and (18) respectively.\n",
+    "                    b1 = b - E[i] - Y[i] * (alphas[i] - alpha_i_old) * K[i, j] - Y[j] * (alphas[j] - alpha_j_old) * K[i, j]\n",
+    "                    \n",
+    "                    b2 = b - E[j] - Y[i] * (alphas[i] - alpha_i_old) * K[i, j] - Y[j] * (alphas[j] - alpha_j_old) * K[j, j]\n",
+    "                    \n",
+    "                    # Compute b by (19).\n",
+    "                    if 0 < alphas[i] and alphas[i] < self.C:\n",
+    "                        b = b1\n",
+    "                    elif 0 < alphas[j] and alphas[j] < self.C:\n",
+    "                        b = b2\n",
+    "                    else:\n",
+    "                        b = (b1 + b2) / 2\n",
+    "                    num_changed_alphas = num_changed_alphas + 1\n",
+    "\n",
+    "            if num_changed_alphas == 0:\n",
+    "                passes = passes + 1\n",
+    "            else:\n",
+    "                passes = 0\n",
+    "\n",
+    "            print('....')\n",
+    "\n",
+    "        print(' DONE! ')\n",
+    "\n",
+    "        # Save the model\n",
+    "        idx = alphas > 0\n",
+    "        \n",
+    "        self.model['X'] = X[idx.reshape(1, -1)[0], :]\n",
+    "        self.model['y'] = Y[idx.reshape(1, -1)[0]]\n",
+    "        self.model['kernelFunction'] = self.kernel\n",
+    "        self.model['b'] = b\n",
+    "        self.model['alphas'] = alphas[idx.reshape(1, -1)[0]]\n",
+    "        self.model['w'] = np.transpose(np.matmul(np.transpose(alphas * Y), X))\n",
+    "        # return model\n",
+    "    \n",
+    "    def svmPredict(self, X):\n",
+    "        if X.shape[1] == 1:\n",
+    "            X = np.transpose(X)\n",
+    "\n",
+    "        # Dataset\n",
+    "        m = X.shape[0]\n",
+    "        p = np.zeros((m, 1))\n",
+    "        pred = np.zeros((m, 1))\n",
+    "        \n",
+    "        if self.model['kernelFunction'] == linear_kernel:\n",
+    "            p = X.dot(self.model['w']) + self.model['b']\n",
+    "            \n",
+    "        elif self.model['kernelFunction'] == gaussian_kernel:\n",
+    "            # Vectorized RBF Kernel\n",
+    "            # This is equivalent to computing the kernel on every pair of examples\n",
+    "            X1 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)\n",
+    "            X2 = np.transpose(np.sum(np.power(self.model['X'], 2), axis=1))\n",
+    "            K = X1 + (X2.T - (2 * (X @ (self.model['X']).T)))\n",
+    "            K = np.power(self.model['kernelFunction'](1, 0), K)\n",
+    "            K = np.transpose(self.model['y']) * K\n",
+    "            K = np.transpose(self.model['alphas']) * K\n",
+    "            p = np.sum(K, axis=1)\n",
+    "            \n",
+    "        else:\n",
+    "            for i in range(m):\n",
+    "                prediction = 0\n",
+    "                for j in range(self.model['X'].shape[0]):\n",
+    "                    prediction = prediction + self.model['alphas'][j] * self.model['y'][j] * self.model['kernelFunction'](np.transpose(X[i,:]), np.transpose(self.model['X'][j,:]))\n",
+    "                    \n",
+    "                p[i] = prediction + self.model['b']\n",
+    "\n",
+    "        # Convert predictions into 0 and 1                                                                                                                   \n",
+    "        pred[p >= 0] = 1\n",
+    "        pred[p < 0] = 0\n",
+    "        return pred"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TESTING MY SVM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = spio.loadmat('spamTrain.mat')\n",
+    "test = spio.loadmat('spamTest.mat')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train = np.double(train.get('X'))\n",
+    "y_train = np.double(train.get('y'))\n",
+    "X_test = np.double(test.get('Xtest'))\n",
+    "y_test = np.double(test.get('ytest'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = SVM()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.svmTrain(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_predicted = model.svmPredict(X_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusionMatrix(y_train, y_predicted)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}