-
Notifications
You must be signed in to change notification settings - Fork 20
/
run_classifier_evaluation.py
77 lines (63 loc) · 3.27 KB
/
run_classifier_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import config
import pandas as pd
from sklearn import linear_model, metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Audio-feature columns offered to the classifiers as predictors.
FEATURE_COLUMNS = [
    'year', 'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'time_signature', 'duration_mins',
]

# Popularity score at or above which a track is labelled 'popular'.
POPULAR_THRESHOLD = 50


def prepare_tracks(tracks):
    """Return a copy of *tracks* with the duration expressed in minutes.

    Adds ``duration_mins`` (``duration_ms`` divided by 60000) and drops the
    ``track_id`` and ``duration_ms`` columns, which the analysis does not
    use. The input frame is left unmodified.

    Args:
        tracks: DataFrame containing at least ``track_id`` and
            ``duration_ms`` columns.

    Returns:
        A new DataFrame with ``duration_mins`` appended and the two
        unneeded columns removed.

    Raises:
        KeyError: if ``track_id`` or ``duration_ms`` is missing.
    """
    with_minutes = tracks.assign(duration_mins=tracks['duration_ms'] / 60000)
    return with_minutes.drop(columns=['track_id', 'duration_ms'])


def label_popularity(tracks, threshold=POPULAR_THRESHOLD):
    """Drop zero-popularity rows and add a ``verdict`` label column.

    Rows whose ``popularity`` is not greater than 0 are removed, matching
    the original script's choice to exclude unrated tracks from the
    analysis. Remaining rows are labelled ``'popular'`` when ``popularity``
    is at least *threshold* and ``'low'`` otherwise.

    NOTE: an earlier comment in this script described the popular band as
    51-100, but the code has always compared with ``>= 50``; that behaviour
    is preserved here.

    Args:
        tracks: DataFrame with a numeric ``popularity`` column.
        threshold: Minimum popularity score that counts as ``'popular'``.

    Returns:
        A new DataFrame (the input is not modified) holding only rows with
        positive popularity, plus the string column ``verdict``.
    """
    rated = tracks.loc[tracks['popularity'] > 0].copy()
    # Vectorized labelling replaces a per-row iterrows() loop.
    rated['verdict'] = 'low'
    rated.loc[rated['popularity'] >= threshold, 'verdict'] = 'popular'
    return rated


def evaluate_classifier(name, classifier, X_train, X_test, y_train, y_test):
    """Fit *classifier*, print its evaluation metrics and return accuracy.

    Prints, in order, the confusion matrix, the classification report and
    the accuracy percentage for predictions on the test partition.

    Args:
        name: Human-readable model name used in the accuracy line.
        classifier: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Accuracy on the test partition, as a percentage.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    accuracy = metrics.accuracy_score(y_test, predictions) * 100
    print(f"{name} Model Accuracy (in %):", accuracy)
    return accuracy


def main():
    """Train and evaluate four popularity classifiers on the tracks CSV.

    Reads the CSV at ``config.csv_data_path()``, derives the low/popular
    target, splits the rows 70/30 into train and test partitions, scales
    the features and reports metrics for logistic regression, k-nearest
    neighbours, random forest and decision tree models.
    """
    spotify_tracks = prepare_tracks(pd.read_csv(config.csv_data_path()))
    popularity_verdict = label_popularity(spotify_tracks)

    # Defining X and y for the analysis; only numeric predictors are kept.
    X = popularity_verdict[FEATURE_COLUMNS].select_dtypes(include='number')
    print(X.columns)
    y = popularity_verdict['verdict']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.3, shuffle=True)

    # Fit the scaler on the training partition only. Fitting it on the
    # full data set (as the script previously did) lets test-set means and
    # variances influence the training features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifiers = [
        ("Logistic Regression",
         linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)),
        ("KNN", KNeighborsClassifier(n_neighbors=1)),
        # Seeded like the split and the decision tree so runs are repeatable.
        ("Random Forest",
         RandomForestClassifier(n_estimators=100, random_state=42)),
        ("Decision Tree",
         DecisionTreeClassifier(random_state=42, max_depth=2)),
    ]
    for name, classifier in classifiers:
        evaluate_classifier(name, classifier, X_train, X_test, y_train, y_test)


if __name__ == '__main__':
    main()