-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdbscan-code-2
53 lines (47 loc) · 2.18 KB
/
dbscan-code-2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kneed import KneeLocator
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

# clinical.csv contains AGE and BMI of patients along with the patient ID of
# 99 GBM patients.
# metabolites.csv contains the same data as metabolome, except the patient ids
# fall under the Patient_ID column.
data1 = pd.read_csv('clinical.csv')
data2 = pd.read_csv('metabolites.csv')

# Merge the two tables on Patient_ID. The outer join keeps patients that appear
# in only one of the files; their missing columns are filled with NaN.
df = pd.merge(data1, data2, on='Patient_ID', how='outer')

# Feature matrix for clustering: the three metabolite columns only.
X_train = df[['lactic.acid', 'pyruvic.acid', 'D.glucose']]
# Replace missing measurements with the column mean so every row can be
# clustered.
# NOTE(review): a row that is missing all three values becomes a point located
# exactly at the column means -- confirm this is intended before clustering.
X_train = X_train.fillna(X_train.mean())
# Candidate neighbourhood sizes to evaluate.
k_values = range(4, 9)
# Collects one (k, epsilon) pair for every k where an elbow was detected.
optimal_epsilons = []

plt.figure(figsize=(8, 6))
for k in k_values:
    # Fit the NearestNeighbors model and query it with the training points.
    # NOTE(review): because the query set is the training set itself, each
    # point is normally returned as its own nearest neighbour at distance 0,
    # so the mean below is taken over that zero plus k-1 real neighbours.
    # Calling kneighbors() with no argument would exclude the point itself --
    # confirm which definition is wanted before changing it.
    neighbors = NearestNeighbors(n_neighbors=k)
    neighbors_fit = neighbors.fit(X_train)
    distances, indices = neighbors_fit.kneighbors(X_train)

    # Per-point mean neighbour distance, sorted ascending to form the curve
    # whose elbow is used as the epsilon candidate.
    mean_distances = np.sort(distances.mean(axis=1))
    ranks = range(len(mean_distances))
    plt.plot(ranks, mean_distances, label=f'k={k}')

    kneedle = KneeLocator(ranks, mean_distances, S=1.0, curve='convex', direction='increasing')
    elbow_index = kneedle.elbow
    if elbow_index is None:
        # KneeLocator reports None when it cannot find an elbow. Indexing a
        # NumPy array with None adds an axis instead of failing, which would
        # silently store a 2-D array as "epsilon", so skip this k explicitly.
        print(f'No elbow found for k={k}; skipping')
        continue

    optimal_epsilon = mean_distances[elbow_index]
    optimal_epsilons.append((k, optimal_epsilon))
    # Mark the elbow position of this curve.
    plt.axvline(x=elbow_index, color='r', linestyle='--')

plt.xlabel('Points sorted by distance')
plt.ylabel('Mean Distance to k Nearest Neighbors')
plt.title('Elbow Plot for Different k Values')
plt.legend()
plt.show()

for k, epsilon in optimal_epsilons:
    print(f'Optimal epsilon for k={k}: {epsilon}')
# Cluster the imputed metabolite features with DBSCAN.
# NOTE(review): eps is hard-coded; presumably it was copied from the k=4 line
# printed by the elbow search in an earlier run -- confirm, and update it if
# the input data changes.
dbscan = DBSCAN(eps=0.9904756251458471, min_samples=4).fit(X_train)

# Attach each row's cluster label to the merged table and report cluster sizes.
df['cluster'] = dbscan.labels_
print(df['cluster'].value_counts())

# Rows labelled -1 are treated as noise; every other label is a cluster.
is_noise = df['cluster'] == -1
outliers = df[is_noise]
clusters = df[~is_noise]

# One panel per pair of metabolites: (x-axis column, y-axis column).
feature_pairs = [
    ('D.glucose', 'pyruvic.acid'),
    ('D.glucose', 'lactic.acid'),
    ('pyruvic.acid', 'lactic.acid'),
]

fig3, axes = plt.subplots(1, 3, figsize=(20, 5))
for ax, (x_var, y_var) in zip(axes, feature_pairs):
    # Clustered points coloured by label, with noise overlaid as small black dots.
    sns.scatterplot(ax=ax, x=x_var, y=y_var, data=clusters, hue='cluster', palette='Set1', s=100)
    ax.scatter(outliers[x_var], outliers[y_var], color='black', s=10, label='noise', marker='o')
    # Anchor the legend past the right edge so it does not cover the data.
    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1), title='Clusters')
    ax.set_title(f'{x_var} vs {y_var}')

plt.tight_layout()
plt.show()