Skip to content

Commit

Permalink
exploring dataset and working on svm
Browse files Browse the repository at this point in the history
  • Loading branch information
saini-shiven committed Dec 5, 2022
1 parent 7b5c305 commit 5dc3f90
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 41 deletions.
Binary file added ResultVariance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions dataset_exploration.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@

# Percentage of rows belonging to each 'Result' class value.
# The estimator turns each group's size into its share of len(df), in percent,
# so the bar heights are percentages rather than means.
plt.figure(figsize=(13, 6))
ax = sns.barplot(x=phishing['Result'], y=phishing['Result'], data=df, estimator=lambda x: len(x) / len(df) * 100, color = 'Red')
ax = sns.barplot(x=phishing['Result'], y=phishing['Result'], data=df, estimator=lambda x: len(x) / len(df) * 100, color = 'Blue')
ax.set(ylabel="Percent");
# Title spelling corrected ("Webites" -> "Websites"); the later call sets the final title
ax.set_title('The Percentage of Phishing vs Legitimate Websites', size = 15);
ax.set_title('The Percentage of Phishing Websites vs Legitimate Websites', size = 15);
# Save the figure at print resolution with the surrounding whitespace trimmed
plt.savefig('resultpercentageplot.png', dpi=300, bbox_inches='tight');

#Correlation between variables
Expand Down
Binary file modified resultpercentageplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
100 changes: 61 additions & 39 deletions svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,30 +30,7 @@ def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Read the phishing dataset; the literal string 'NaN' is parsed as missing
df = pd.read_csv("phishingDataset.csv", na_values=['NaN'])

# Randomise the row order by reindexing against a permuted copy of the index
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

print(df.head())

# Feature columns: every column except the 'Result' label
cols = [name for name in df.columns if name != 'Result']

# Feature matrix and label vector as NumPy arrays
X = df[cols].values
y = df['Result'].values

# Hold out 25% of the rows for testing; the split itself uses a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)


#plotting the data as an SVC decision function

def plot_svc_decision_function(model, ax=None, plot_support=True):
Expand Down Expand Up @@ -97,35 +74,80 @@ def plot_svm(N=10, ax=None):
ax.set_ylim(-1, 6)
plot_svc_decision_function(model, ax)

# Read the phishing dataset ('NaN' strings become missing values) and keep
# only the first 400 rows of the file
full_df = pd.read_csv("phishingDataset.csv", na_values=['NaN'])
df = full_df.head(400)

# Shuffle the retained rows by reindexing against a permuted index
row_order = np.random.permutation(df.index)
df = df.reindex(row_order)

print(df.head())

# Everything other than the 'Result' label is used as a feature
cols = [column for column in df.columns if column != 'Result']

# Features and labels as NumPy arrays
X = df[cols].values
y = df['Result'].values

# 75/25 train/test split with a fixed random_state for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)


# Two side-by-side panels, each drawn by plot_svm with a different N (60, 120)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
sample_sizes = [60, 120]
for panel, n_points in zip(ax, sample_sizes):
    plot_svm(n_points, panel)
    panel.set_title('N = {0}'.format(n_points))

# Baseline SVC (gamma='scale', one-vs-one decision function shape),
# trained on the training split
multi_svm = SVC(gamma='scale', decision_function_shape='ovo')
multi_svm.fit(X_train, y_train)

# Baseline predictions on the held-out split
y_pred = multi_svm.predict(X_test)

# RBF-kernel SVC with C=100.
# Fit on the training split only: fitting on the full X, y would let the model
# see the test rows during training and inflate every metric reported below.
svm_model = SVC(kernel='rbf', C=100).fit(X_train, y_train)

# True labels next to the baseline predictions, printed side by side
output = pd.DataFrame(data=np.c_[y_test, y_pred])
print(output)

# From here on y_pred holds the RBF model's predictions on the held-out split
y_pred = svm_model.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# Raw confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)

# Normalise each row by its total so entries are per-true-class fractions
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)

# Render the normalised confusion matrix as a figure
plt.figure()
plot_confusion_matrix(cm_normalized, [-1,1], title='Normalized confusion matrix')
plt.show()

# Dump the hyperparameters of the evaluated model
print(svm_model.get_params())

# #build multiclass svm and fit data
# multi_svm = SVC(gamma='scale', decision_function_shape='ovo')
# multi_svm.fit(X_train,y_train)

# #predicting the data
# y_pred = multi_svm.predict(X_test)


# #put the results into a DataFrame and print side-by-side
# output = pd.DataFrame(data=np.c_[y_test,y_pred])
# print(output)

# #calculate accuracy score and print
# print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# #find the confusion matrix, normalise and print
# cm = confusion_matrix(y_test, y_pred)
# np.set_printoptions(precision=2)
# cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# print('Normalized confusion matrix')
# print(cm_normalized)

# #confusion matrix as a figure
# plt.figure()
# plot_confusion_matrix(cm_normalized, [-1,1], title='Normalized confusion matrix')
# plt.show()

0 comments on commit 5dc3f90

Please sign in to comment.