diff --git a/ResultVariance.png b/ResultVariance.png new file mode 100644 index 0000000..d28fce3 Binary files /dev/null and b/ResultVariance.png differ diff --git a/dataset_exploration.py b/dataset_exploration.py index 3b677f0..bf317e9 100644 --- a/dataset_exploration.py +++ b/dataset_exploration.py @@ -29,9 +29,9 @@ #Percentage of class values plt.figure(figsize=(13, 6)) -ax = sns.barplot(x=phishing['Result'], y=phishing['Result'], data=df, estimator=lambda x: len(x) / len(df) * 100, color = 'Red') +ax = sns.barplot(x=phishing['Result'], y=phishing['Result'], data=df, estimator=lambda x: len(x) / len(df) * 100, color = 'Blue') ax.set(ylabel="Percent"); -ax.set_title('The Percentage of Phishing vs Legitimate Webites', size = 15); +ax.set_title('The Percentage of Phishing Websites vs Legitimate Webites', size = 15); plt.savefig('resultpercentageplot.png', dpi=300, bbox_inches='tight'); #Correlation between variables diff --git a/resultpercentageplot.png b/resultpercentageplot.png index 31f3014..ae18582 100644 Binary files a/resultpercentageplot.png and b/resultpercentageplot.png differ diff --git a/svm.py b/svm.py index cf9bb81..57835a4 100644 --- a/svm.py +++ b/svm.py @@ -30,30 +30,7 @@ def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') - -#load the dataset -df = pd.read_csv("phishingDataset.csv", na_values=['NaN']) - -#shuffle the data -df = df.reindex(np.random.permutation(df.index)) - -print(df.head()) - -#list of columns that aren't result -cols = [] -for x in df.columns: - if x != 'Result': - cols.append(x) - -#defining x and y -X = df[cols].values -y = df['Result'].values - - -#Split the data into training and testing -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.25, random_state=42) - + #plootting the data as a svc decision function def plot_svc_decision_function(model, ax=None, plot_support=True): @@ -97,35 +74,80 @@ def plot_svm(N=10, ax=None): ax.set_ylim(-1, 6) plot_svc_decision_function(model, ax) +#load the dataset +df = (pd.read_csv("phishingDataset.csv", na_values=['NaN'])).head(400) + +#shuffle the data +df = df.reindex(np.random.permutation(df.index)) + +print(df.head()) + +#list of columns that aren't result +cols = [] +for x in df.columns: + if x != 'Result': + cols.append(x) + +#defining x and y +X = df[cols].values +y = df['Result'].values + + +#Split the data into training and testing +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.25, random_state=42) + + +#Plotting the first 60 and 120 data points with line of best fit fig, ax = plt.subplots(1, 2, figsize=(16, 6)) fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1) for axi, N in zip(ax, [60, 120]): plot_svm(N, axi) axi.set_title('N = {0}'.format(N)) -#build multiclass svm and fit data -multi_svm = SVC(gamma='scale', decision_function_shape='ovo') -multi_svm.fit(X_train,y_train) - -#predicting the data -y_pred = multi_svm.predict(X_test) +svm_model = SVC(kernel='rbf', C=100).fit(X, y) -#put the results into a DataFrame and print side-by-side -output = pd.DataFrame(data=np.c_[y_test,y_pred]) -print(output) - -#calculate accuracy score and print +y_pred = svm_model.predict(X_test) +#print(y_test) +#print(y_pred) print('Accuracy: %.2f' % accuracy_score(y_test, y_pred)) - -#find the confusion matrix, normalise and print cm = confusion_matrix(y_test, y_pred) np.set_printoptions(precision=2) +print('Confusion matrix, without normalization') +print(cm) +#normalised confusion matrix cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] print('Normalized confusion matrix') print(cm_normalized) - -#confusion matrix as a figure plt.figure() plot_confusion_matrix(cm_normalized, [-1,1], title='Normalized confusion matrix') plt.show() +print(svm_model.get_params()) + +# #build multiclass svm and fit data +# multi_svm = SVC(gamma='scale', decision_function_shape='ovo') +# multi_svm.fit(X_train,y_train) + +# #predicting the data +# y_pred = multi_svm.predict(X_test) + + +# #put the results into a DataFrame and print side-by-side +# output = pd.DataFrame(data=np.c_[y_test,y_pred]) +# print(output) + +# #calculate accuracy score and print +# print('Accuracy: %.2f' % accuracy_score(y_test, y_pred)) + +# #find the confusion matrix, normalise and print +# cm = confusion_matrix(y_test, y_pred) +# np.set_printoptions(precision=2) +# cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] +# print('Normalized confusion matrix') +# print(cm_normalized) + +# #confusion matrix as a figure +# plt.figure() +# plot_confusion_matrix(cm_normalized, [-1,1], title='Normalized confusion matrix') +# plt.show()