-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdbscan-code-4
54 lines (46 loc) · 1.88 KB
/
dbscan-code-4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Exploratory scatter plot of the two metabolites, to eyeball whether any
# cluster structure is visible before running DBSCAN.
# NOTE(review): this reads `data` / 'D-glucose', while the clustering code
# further down reads `df` / 'D.glucose' — confirm both refer to the same table.
x = data['galactitol']
y = data['D-glucose']
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y, s=50, c='green', alpha=0.7, edgecolors='k')
ax.set_title('galactitol vs glucose')
ax.set_xlabel('galactitol')
ax.set_ylabel('glucose')
ax.grid(True)
plt.show()
# Feature matrix for clustering: the two metabolite columns, with missing
# values replaced by the mean of their own column.
selected_features = df[['D.glucose', 'galactitol']]
X_train = selected_features.fillna(selected_features.mean())
# Estimate a DBSCAN epsilon for several neighbourhood sizes k using the
# k-distance elbow heuristic: sort the per-point mean distance to the k
# nearest neighbours and take the value at the elbow of that curve.
k_values = range(2, 9)
# (k, epsilon) pairs, one entry per k for which an elbow was located.
optimal_epsilons = []
plt.figure(figsize=(8, 6))
for k in k_values:
    # Fit the NearestNeighbors model and query it with the training data.
    # NOTE(review): because the query set is the training set, each point is
    # presumably returned as its own nearest neighbour (distance 0), so the
    # mean below includes that zero — confirm this is intended.
    neighbors = NearestNeighbors(n_neighbors=k)
    neighbors_fit = neighbors.fit(X_train)
    distances, indices = neighbors_fit.kneighbors(X_train)
    mean_distances = np.sort(distances.mean(axis=1))
    point_ranks = range(len(mean_distances))
    plt.plot(point_ranks, mean_distances, label=f'k={k}')
    kneedle = KneeLocator(point_ranks, mean_distances, S=1.0, curve='convex', direction='increasing')
    # Despite its name, optimal_k is the position of the elbow along the
    # sorted curve (an index into mean_distances), not a neighbour count.
    optimal_k = kneedle.elbow
    if optimal_k is None:
        # KneeLocator reports None when no elbow is detected. Indexing with
        # None would silently return a 2-D array instead of a scalar epsilon,
        # and axvline cannot draw at x=None, so skip this k explicitly.
        print(f'No elbow found for k={k}; skipping')
        continue
    optimal_epsilon = mean_distances[optimal_k]
    optimal_epsilons.append((k, optimal_epsilon))
    plt.axvline(x=optimal_k, color='r', linestyle='--')
plt.xlabel('Points sorted by distance')
plt.ylabel('Mean Distance to k Nearest Neighbors')
plt.title('Elbow Plot for Different k Values')
plt.legend()
plt.show()
for k, epsilon in optimal_epsilons:
    print(f'Optimal epsilon for k={k}: {epsilon}')
# Cluster the feature matrix with DBSCAN and store each row's label on df.
# NOTE(review): eps is hard-coded; presumably it was copied from the epsilon
# printed by the elbow search for k=4 — confirm, and re-derive if the data changes.
dbscan = DBSCAN(eps=0.8010010322532317, min_samples=4).fit(X_train)
df['cluster'] = dbscan.labels_
# Report how many rows fell into each cluster label.
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)
# Visualise the DBSCAN result. Rows labelled -1 are treated as noise and
# drawn separately from the rows that were assigned to a cluster.
outliers = df[df['cluster'] == -1]
clusters = df[df['cluster'] != -1]
# Clustered points, coloured by their cluster label.
sns.scatterplot(x='D.glucose', y='galactitol', data=clusters, hue='cluster', palette='tab20', s=300)
# Noise points are drawn on the same two features as the clusters; plotting
# them from unrelated columns would place them in a different coordinate
# space and make the figure misleading.
plt.scatter(outliers['D.glucose'], outliers['galactitol'], color='black', s=10, label='noise', marker='o')
plt.legend(loc='upper right', bbox_to_anchor=(1.17, 1), title='Clusters')
plt.show()