exploring dataset and working on svm

saini-shiven · Dec 5, 2022 · 5dc3f90 · 5dc3f90
1 parent 7b5c305
commit 5dc3f90
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 41 deletions.
diff --git a/ResultVariance.png b/ResultVariance.png
diff --git a/dataset_exploration.py b/dataset_exploration.py
@@ -29,9 +29,9 @@
 
 #Percentage of class values
 plt.figure(figsize=(13, 6))
-ax = sns.barplot(x=phishing['Result'], y=phishing['Result'], data=df, estimator=lambda x: len(x) / len(df) * 100, color = 'Red')
+ax = sns.barplot(x=phishing['Result'], y=phishing['Result'], data=df, estimator=lambda x: len(x) / len(df) * 100, color = 'Blue')
 ax.set(ylabel="Percent");
-ax.set_title('The Percentage of Phishing vs Legitimate Webites', size = 15);
+ax.set_title('The Percentage of Phishing Websites vs Legitimate Websites', size = 15);  # plot title; fixed "Webites" typo
 plt.savefig('resultpercentageplot.png', dpi=300, bbox_inches='tight');
 
 #Correlation between variables

diff --git a/resultpercentageplot.png b/resultpercentageplot.png
diff --git a/svm.py b/svm.py
@@ -30,30 +30,7 @@ def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues
     plt.tight_layout()
     plt.ylabel('True label')
     plt.xlabel('Predicted label')
-
-#load the dataset 
-df = pd.read_csv("phishingDataset.csv", na_values=['NaN'])
-
-#shuffle the data
-df = df.reindex(np.random.permutation(df.index))
-
-print(df.head())
-
-#list of columns that aren't result
-cols = []
-for x in df.columns:
-    if x != 'Result':
-        cols.append(x)
-
-#defining x and y 
-X = df[cols].values
-y = df['Result'].values
-
-
-#Split the data into training and testing
-X_train, X_test, y_train, y_test = train_test_split(    
-    X, y, test_size=0.25, random_state=42) 
-
+
 #plotting the data as an SVC decision function
 
 def plot_svc_decision_function(model, ax=None, plot_support=True):
@@ -97,35 +74,80 @@ def plot_svm(N=10, ax=None):
     ax.set_ylim(-1, 6)
     plot_svc_decision_function(model, ax)
 
+#load the dataset 
+df = (pd.read_csv("phishingDataset.csv", na_values=['NaN'])).head(400)
+
+#shuffle the data
+df = df.reindex(np.random.permutation(df.index))
+
+print(df.head())
+
+#list of columns that aren't result
+cols = []
+for x in df.columns:
+    if x != 'Result':
+        cols.append(x)
+
+#defining x and y 
+X = df[cols].values
+y = df['Result'].values
+
+
+#Split the data into training and testing
+X_train, X_test, y_train, y_test = train_test_split(    
+    X, y, test_size=0.25, random_state=42) 
+
+
+#Plotting the SVC decision function for the first 60 and 120 data points
 fig, ax = plt.subplots(1, 2, figsize=(16, 6))
 fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
 for axi, N in zip(ax, [60, 120]):
     plot_svm(N, axi)
     axi.set_title('N = {0}'.format(N))
 
-#build multiclass svm and fit data
-multi_svm = SVC(gamma='scale', decision_function_shape='ovo')  
-multi_svm.fit(X_train,y_train)
-
-#predicting the data
-y_pred = multi_svm.predict(X_test)
 
+svm_model = SVC(kernel='rbf', C=100).fit(X_train, y_train)  # fit on the training split only; fitting on all of X leaks X_test into training and inflates the accuracy below
 
-#put the results into a DataFrame and print side-by-side
-output = pd.DataFrame(data=np.c_[y_test,y_pred])
-print(output)
-
-#calculate accuracy score and print
+y_pred = svm_model.predict(X_test)
+#print(y_test)
+#print(y_pred)
 print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
-
-#find the confusion matrix, normalise and print
 cm = confusion_matrix(y_test, y_pred)
 np.set_printoptions(precision=2)
+print('Confusion matrix, without normalization')
+print(cm)
+#normalised confusion matrix
 cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
 print('Normalized confusion matrix')
 print(cm_normalized)
-
-#confusion matrix as a figure
 plt.figure()
 plot_confusion_matrix(cm_normalized, [-1,1], title='Normalized confusion matrix')
 plt.show()
+print(svm_model.get_params())
+
+# #build multiclass svm and fit data
+# multi_svm = SVC(gamma='scale', decision_function_shape='ovo')  
+# multi_svm.fit(X_train,y_train)
+
+# #predicting the data
+# y_pred = multi_svm.predict(X_test)
+
+
+# #put the results into a DataFrame and print side-by-side
+# output = pd.DataFrame(data=np.c_[y_test,y_pred])
+# print(output)
+
+# #calculate accuracy score and print
+# print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
+
+# #find the confusion matrix, normalise and print
+# cm = confusion_matrix(y_test, y_pred)
+# np.set_printoptions(precision=2)
+# cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+# print('Normalized confusion matrix')
+# print(cm_normalized)
+
+# #confusion matrix as a figure
+# plt.figure()
+# plot_confusion_matrix(cm_normalized, [-1,1], title='Normalized confusion matrix')
+# plt.show()