Skip to content

Commit 11d6644

Browse files
committed
first commit
0 parents  commit 11d6644

File tree

5 files changed

+600
-0
lines changed

5 files changed

+600
-0
lines changed

README.md

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# SVM ALGORITHM
2+
3+
## preprocess.py:
4+
Input: raw data
5+
Output: document term matrix
6+
Overview: Contains functions that take the raw data and produce a document-term matrix
7+
8+
## SVM.py:
9+
Input: document-term matrix
10+
Output: trained model and predictions with model
11+
Overview: Contains an SVM class used to build and train a model and to predict on a given data set. It also has a function
12+
for creating the confusion matrix
13+
14+
## Packages:
15+
The following packages are required:
16+
numpy for scientific computing
17+
pandas for loading files
18+
scipy for mathematics, science and engineering calculations
19+
nltk for natural language processing
20+
scikit-learn for machine learning algorithms
21+
22+
# CITATIONS:
23+
I consulted MATLAB code from the Machine Learning course on Coursera, taught by Stanford University professor
24+
Andrew Ng.

SVM.ipynb

+361
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# ALL IMPORT STATEMENTS"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import numpy as np\n",
17+
"from numpy import linalg\n",
18+
"import scipy.io as spio"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"# FUNCTION TO CALCULATE CONFUSION MATRIX, ACCURACY AND F-MEASURE"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {},
32+
"outputs": [],
33+
"source": [
def confusionMatrix(y_actual, y_predicted):
    """Build a binary confusion matrix and derive accuracy and F-measure.

    A sample is treated as positive when its actual label is greater than 0
    and as negative otherwise. A prediction counts as correct when it is
    equal to the actual label.

    Parameters
    ----------
    y_actual : indexable sequence of labels (the notebook passes 0/1 values)
    y_predicted : indexable sequence of predictions, indexed in parallel
        with ``y_actual``

    Returns
    -------
    cm : list of lists
        ``[[tn, fp], [fn, tp]]``.
    accuracy : float
        ``(tp + tn) / total``; ``0.0`` when there are no samples.
    fm : float
        F-measure ``2 * prec * sens / (prec + sens)``; ``0.0`` whenever
        precision, sensitivity or their sum is undefined (zero denominator).
    """

    def _ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0.0 instead
        # of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    tp = tn = fp = fn = 0

    for i in range(len(y_actual)):
        is_correct = y_actual[i] == y_predicted[i]
        # Exactly one class per sample: the original separate `> 0` and `< 1`
        # tests counted labels strictly between 0 and 1 twice.
        if y_actual[i] > 0:
            if is_correct:
                tp += 1
            else:
                fn += 1
        else:
            if is_correct:
                tn += 1
            else:
                fp += 1

    cm = [[tn, fp], [fn, tp]]
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    sens = _ratio(tp, tp + fn)
    prec = _ratio(tp, tp + fp)
    fm = _ratio(2 * prec * sens, prec + sens)
    return cm, accuracy, fm
58+
]
59+
},
60+
{
61+
"cell_type": "markdown",
62+
"metadata": {},
63+
"source": [
64+
"# FUNCTION FOR EACH SVM KERNEL"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": null,
70+
"metadata": {},
71+
"outputs": [],
72+
"source": [
def linear_kernel(x1, x2):
    """Linear kernel: the plain dot product of the two inputs."""
    inner_product = np.dot(x1, x2)
    return inner_product
75+
" \n",
def polynomial_kernel(x, y, p=3):
    """Polynomial kernel ``(1 + x . y) ** p`` with degree ``p`` (default 3)."""
    shifted_product = np.dot(x, y) + 1
    return shifted_product ** p
78+
"\n",
def gaussian_kernel(x, y, sigma=5.0):
    """Gaussian (RBF) kernel ``exp(-||x - y||^2 / (2 * sigma^2))``.

    ``x`` and ``y`` must support subtraction (NumPy arrays or plain numbers);
    ``sigma`` is the kernel bandwidth.
    """
    distance = linalg.norm(x - y)
    bandwidth_term = 2 * (sigma ** 2)
    return np.exp(-distance ** 2 / bandwidth_term)
81+
]
82+
},
83+
{
84+
"cell_type": "markdown",
85+
"metadata": {},
86+
"source": [
87+
"# SVM CLASS WITH TRAIN AND PREDICT FUNCTION"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": null,
93+
"metadata": {},
94+
"outputs": [],
95+
"source": [
class SVM(object):
    """Binary support vector machine trained with the simplified SMO algorithm.

    Training labels are supplied as 0/1, mapped to -1/+1 internally, and
    predictions are reported as 0/1 again. The fitted parameters are stored
    in the ``model`` dictionary under the keys ``'X'``, ``'y'``, ``'alphas'``
    (rows with a positive multiplier only), ``'b'``, ``'w'`` and
    ``'kernelFunction'``.
    """

    def __init__(self, kernel=linear_kernel, tol=1e-3, C=0.1, max_passes=5):
        """Store the hyper-parameters; no training happens here.

        Parameters
        ----------
        kernel : callable ``kernel(x1, x2)`` returning a scalar similarity.
        tol : tolerance used both for the KKT violation test and for
            deciding whether a multiplier update is significant.
        C : upper bound on every multiplier (regularisation strength).
        max_passes : number of consecutive passes over the data without any
            multiplier change after which training stops.
        """
        self.kernel = kernel
        self.tol = tol
        self.C = C
        self.max_passes = max_passes
        self.model = dict()

    def _kernel_matrix(self, X):
        """Return the (m, m) matrix of kernel values between all rows of X."""
        m = X.shape[0]
        print('Precomputing the kernel matrix')

        if self.kernel is linear_kernel:
            return X @ X.T

        if self.kernel is gaussian_kernel:
            # ||xi - xj||^2 expanded as |xi|^2 + |xj|^2 - 2 xi.xj, then
            # kernel(1, 0) ** distance reproduces the RBF value for the
            # kernel's default sigma.
            X2 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)
            K = X2 + (X2.T - (2 * (X @ X.T)))
            return np.power(self.kernel(1, 0), K)

        # Generic kernel: evaluate the upper triangle and mirror it. This
        # can be slow because it is not vectorised, and it assumes the
        # kernel is symmetric.
        K = np.zeros((m, m))
        for i in range(m):
            for j in range(i, m):
                K[i, j] = self.kernel(X[i, :], X[j, :])
                K[j, i] = K[i, j]
        return K

    def svmTrain(self, X, Y):
        """Fit the classifier and store the result in ``self.model``.

        Parameters
        ----------
        X : 2-D array, one training example per row.
        Y : labels with one entry per row of ``X``; entries equal to 0 are
            mapped to -1 and everything else to +1. Any shape with
            ``X.shape[0]`` elements is accepted.

        Raises
        ------
        ValueError
            If there are fewer than two examples (SMO needs a pair of
            distinct multipliers) or if ``Y`` does not match ``X``.
        """
        # Data parameters
        m = X.shape[0]
        if m < 2:
            raise ValueError('svmTrain needs at least two training examples')

        # Map 0 to -1 and force a column vector so that `alphas * Y` stays
        # element-wise even when the labels arrive as a 1-D array.
        Y = np.asarray(Y).reshape(-1, 1)
        if Y.shape[0] != m:
            raise ValueError('X and Y must contain the same number of examples')
        Y = np.where(Y == 0, -1, 1)

        # Variables
        alphas = np.zeros((m, 1), dtype=float)
        b = 0.0
        passes = 0

        K = self._kernel_matrix(X)

        print('Training...')
        print('This may take 1 to 2 minutes')

        while passes < self.max_passes:
            num_changed_alphas = 0

            for i in range(m):
                y_i = Y[i, 0]
                E_i = b + np.sum(alphas * Y * K[:, i].reshape(-1, 1)) - y_i

                violates_kkt = (
                    (y_i * E_i < -self.tol and alphas[i, 0] < self.C)
                    or (y_i * E_i > self.tol and alphas[i, 0] > 0)
                )
                if not violates_kkt:
                    continue

                # Pick a random partner j different from i.
                j = np.random.randint(0, m)
                while j == i:
                    j = np.random.randint(0, m)

                y_j = Y[j, 0]
                E_j = b + np.sum(alphas * Y * K[:, j].reshape(-1, 1)) - y_j

                # Save old alphas
                alpha_i_old = alphas[i, 0]
                alpha_j_old = alphas[j, 0]

                # Compute L and H by (10) or (11)
                if y_i == y_j:
                    L = max(0, alpha_j_old + alpha_i_old - self.C)
                    H = min(self.C, alpha_j_old + alpha_i_old)
                else:
                    L = max(0, alpha_j_old - alpha_i_old)
                    H = min(self.C, self.C + alpha_j_old - alpha_i_old)
                if L == H:
                    continue

                # Compute eta by (14)
                eta = 2 * K[i, j] - K[i, i] - K[j, j]
                if eta >= 0:
                    continue

                # New value for alpha j using (12), clipped to [L, H] by (15)
                alpha_j_new = alpha_j_old - (y_j * (E_i - E_j)) / eta
                alpha_j_new = max(L, min(H, alpha_j_new))

                # Skip insignificant changes; alphas[j] is left untouched.
                if np.abs(alpha_j_new - alpha_j_old) < self.tol:
                    continue

                # Determine value for alpha i using (16)
                alpha_i_new = alpha_i_old + y_i * y_j * (alpha_j_old - alpha_j_new)
                alphas[j, 0] = alpha_j_new
                alphas[i, 0] = alpha_i_new

                # Compute b1 and b2 using (17) and (18). Note that (17)
                # pairs the alpha_i change with K[i, i], not K[i, j].
                delta_i = y_i * (alpha_i_new - alpha_i_old)
                delta_j = y_j * (alpha_j_new - alpha_j_old)
                b1 = b - E_i - delta_i * K[i, i] - delta_j * K[i, j]
                b2 = b - E_j - delta_i * K[i, j] - delta_j * K[j, j]

                # Compute b by (19)
                if 0 < alpha_i_new < self.C:
                    b = b1
                elif 0 < alpha_j_new < self.C:
                    b = b2
                else:
                    b = (b1 + b2) / 2

                num_changed_alphas += 1

            if num_changed_alphas == 0:
                passes = passes + 1
            else:
                passes = 0

            print('....')

        print(' DONE! ')

        # Save the model: keep only the rows whose multiplier is positive.
        support = (alphas > 0).ravel()

        self.model['X'] = X[support, :]
        self.model['y'] = Y[support]
        self.model['kernelFunction'] = self.kernel
        self.model['b'] = b
        self.model['alphas'] = alphas[support]
        self.model['w'] = np.transpose(np.matmul(np.transpose(alphas * Y), X))

    def svmPredict(self, X):
        """Classify the rows of ``X`` with the trained model.

        Parameters
        ----------
        X : 2-D array, one example per row. An array with a single column
            is transposed and treated as one example.

        Returns
        -------
        (m, 1) float array containing 1 where the decision value is
        non-negative and 0 otherwise.
        """
        if X.shape[1] == 1:
            X = np.transpose(X)

        # Dataset
        m = X.shape[0]
        p = np.zeros((m, 1))
        pred = np.zeros((m, 1))

        kernel = self.model['kernelFunction']
        bias = self.model['b']

        if kernel is linear_kernel:
            p = X.dot(self.model['w']) + bias

        elif kernel is gaussian_kernel:
            # Vectorized RBF kernel between every example and every stored
            # support vector (same expansion as used during training).
            sv = self.model['X']
            X1 = np.sum(np.power(X, 2), axis=1).reshape(-1, 1)
            X2 = np.sum(np.power(sv, 2), axis=1).reshape(1, -1)
            K = X1 + (X2 - (2 * (X @ sv.T)))
            K = np.power(kernel(1, 0), K)
            K = np.transpose(self.model['y']) * K
            K = np.transpose(self.model['alphas']) * K
            # Include the bias so this branch matches the other two.
            p = np.sum(K, axis=1).reshape(-1, 1) + bias

        else:
            sv = self.model['X']
            sv_alphas = self.model['alphas']
            sv_labels = self.model['y']
            for i in range(m):
                prediction = 0
                for j in range(sv.shape[0]):
                    prediction = prediction + sv_alphas[j, 0] * sv_labels[j, 0] * kernel(X[i, :], sv[j, :])
                p[i] = prediction + bias

        # Convert predictions into 0 and 1
        pred[p >= 0] = 1
        pred[p < 0] = 0
        return pred
265+
]
266+
},
267+
{
268+
"cell_type": "markdown",
269+
"metadata": {},
270+
"source": [
271+
"# TESTING MY SVM"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": null,
277+
"metadata": {},
278+
"outputs": [],
279+
"source": [
# Read the training and test splits from their MATLAB .mat files.
train, test = (
    spio.loadmat(mat_path) for mat_path in ('spamTrain.mat', 'spamTest.mat')
)
282+
]
283+
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": null,
287+
"metadata": {},
288+
"outputs": [],
289+
"source": [
# Index the loaded dictionaries directly: a missing variable then raises
# KeyError, whereas `.get()` would return None and np.double(None) would
# silently turn into NaN.
X_train = np.double(train['X'])
y_train = np.double(train['y'])
X_test = np.double(test['Xtest'])
y_test = np.double(test['ytest'])
294+
]
295+
},
296+
{
297+
"cell_type": "code",
298+
"execution_count": null,
299+
"metadata": {},
300+
"outputs": [],
301+
"source": [
302+
"model = SVM()"
303+
]
304+
},
305+
{
306+
"cell_type": "code",
307+
"execution_count": null,
308+
"metadata": {},
309+
"outputs": [],
310+
"source": [
# svmTrain picks the second multiplier with np.random.randint, so seed the
# global NumPy RNG first to make the fitted model reproducible across runs.
np.random.seed(0)
model.svmTrain(X_train, y_train)
312+
]
313+
},
314+
{
315+
"cell_type": "code",
316+
"execution_count": null,
317+
"metadata": {},
318+
"outputs": [],
319+
"source": [
320+
"y_predicted = model.svmPredict(X_train)"
321+
]
322+
},
323+
{
324+
"cell_type": "code",
325+
"execution_count": null,
326+
"metadata": {},
327+
"outputs": [],
328+
"source": [
329+
"confusionMatrix(y_train, y_predicted)"
330+
]
331+
},
332+
{
333+
"cell_type": "code",
334+
"execution_count": null,
335+
"metadata": {},
336+
"outputs": [],
337+
"source": []
338+
}
339+
],
340+
"metadata": {
341+
"kernelspec": {
342+
"display_name": "Python 3",
343+
"language": "python",
344+
"name": "python3"
345+
},
346+
"language_info": {
347+
"codemirror_mode": {
348+
"name": "ipython",
349+
"version": 3
350+
},
351+
"file_extension": ".py",
352+
"mimetype": "text/x-python",
353+
"name": "python",
354+
"nbconvert_exporter": "python",
355+
"pygments_lexer": "ipython3",
356+
"version": "3.6.6"
357+
}
358+
},
359+
"nbformat": 4,
360+
"nbformat_minor": 2
361+
}

0 commit comments

Comments
 (0)