Locate nearest clusters for given data #214

Open · wants to merge 3 commits into master
examples/nearest_node.py (32 additions, 0 deletions)
@@ -0,0 +1,32 @@
"""
nearest_nodes example based on breast cancer data.
"""

# re-use X, y, model, mapper, graph, and lens from the plot_breast_cancer example
from plot_breast_cancer import *
from sklearn import neighbors, preprocessing

# new patient data incoming
i = np.random.randint(len(X))
new_patient_data = 1.05*X[i]
new_patient_data = new_patient_data.reshape(1, -1)

# re-use lens1 model
newlens1 = model.decision_function(new_patient_data)

# re-construct lens2 model
X_norm = np.linalg.norm(X, axis=1)
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_norm.reshape(-1, 1))

newlens2 = scaler.transform(np.linalg.norm(new_patient_data, axis=1).reshape(1, -1))

newlens = np.c_[newlens1, newlens2]

# find nearest nodes
nn = neighbors.NearestNeighbors(n_neighbors=3)
node_ids = mapper.nearest_nodes(newlens, new_patient_data, graph, mapper.cover, lens, X, nn)

print("Nearest nodes:")
for node_id in node_ids:
    diags = y[graph['nodes'][node_id]]
    print(" {}: diagnosis {:.1f}%".format(node_id, np.sum(diags)*100.0/len(diags)))
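For each node returned, the loop reports the share of that node's members with a positive label: `np.sum(diags) * 100.0 / len(diags)` is simply the percentage of ones among the member diagnoses in `y`, giving a quick summary of how the new patient's neighborhood in the Mapper graph is labeled. Note the script relies on the star import above, so it assumes `plot_breast_cancer.py` sits alongside it in the `examples/` directory.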
kmapper/cover.py (21 additions, 10 deletions)
@@ -243,9 +243,9 @@ def transform_single(self, data, center, i=0):

        return hypercube

-    def transform(self, data, centers=None):
-        """Find entries of all hypercubes. If `centers=None`, then use `self.centers_` as computed in `self.fit`.
+    def transform(self, data, centers=None, return_centers=False):
+        """Find entries of all hypercubes. If `centers=None`, then use `self.centers_` as computed in `self.fit`.
+        Empty hypercubes are removed from the result.

        Parameters
@@ -255,12 +255,15 @@ def transform(self, data, centers=None):
            Data to find in entries in cube. Warning: first column must be index column.
        centers: list of array-like
            Center points for all cubes as returned by `self.fit`. Default is to use `self.centers_`.
+        return_centers: boolean
+            Whether to also return the kept center IDs.

        Returns
        =========
        hypercubes: list of array-like
            list of entries in each hypercube in `data`.
+        center_ids: array-like
+            list of center IDs kept.
        """

        centers = centers or self.centers_
@@ -269,30 +269,38 @@ def transform(self, data, centers=None):
        ]

        # Clean out any empty cubes (common in high dimensions)
-        hypercubes = [cube for cube in hypercubes if len(cube)]
-        return hypercubes
+        trimmed_hypercubes = [cube for cube in hypercubes if len(cube)]
+        if return_centers:
+            trimmed_cube_ids = np.array([i for i, cube in enumerate(hypercubes) if len(cube)])
+            return trimmed_hypercubes, trimmed_cube_ids
+        else:
+            return trimmed_hypercubes

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

-    def find(self, data_point):
-        """Finds the hypercubes that contain the given data point.
+    def find(self, data_point, centers=None):
+        """Finds the hypercubes that contain the given data point.
+        If `centers=None`, then use `self.centers_` as computed in `self.fit`.

        Parameters
        ===========

        data_point: array-like
            The data point to locate.
+        centers: list of array-like
+            Center points for all cubes as returned by `self.fit`. Default is to use `self.centers_`.

        Returns
        =========
        cube_ids: list of int
-            list of hypercube indices, empty if the data point is outside the cover.
+            list of hypercube indices (w.r.t. `self.fit`), empty if the data point is outside the cover.

        """
        cube_ids = []
-        for i, center in enumerate(self.centers_):
+        centers = centers or self.centers_
+        for i, center in enumerate(centers):
            lower_bounds, upper_bounds = center - self.radius_, center + self.radius_
            if np.all(data_point >= lower_bounds) and np.all(
                data_point <= upper_bounds
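To make the new keywords concrete, here is a minimal sketch (not part of the diff, assuming this branch is installed) of how `return_centers` and the new `centers` argument could be exercised on a fitted `Cover`; the data and cover parameters are illustrative only:

```python
import numpy as np
import kmapper as km

lens = np.random.rand(100, 2)
# Cover expects the first column to be an index column.
indexed_lens = np.c_[np.arange(lens.shape[0]), lens]

cover = km.Cover(n_cubes=5, perc_overlap=0.5)
centers = cover.fit(indexed_lens)

# New: also ask transform() for the IDs of the non-empty cubes it kept.
hypercubes, kept_ids = cover.transform(indexed_lens, return_centers=True)
assert len(hypercubes) == len(kept_ids)

# New: find() can now search against an explicit list of centers.
cube_ids = cover.find(lens[0], centers=centers)
print("lens point 0 falls in cube(s):", cube_ids)
```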
kmapper/kmapper.py (84 additions, 0 deletions)
@@ -951,6 +951,90 @@ def data_from_cluster_id(self, cluster_id, graph, data):
        else:
            return np.array([])

    def find_nodes(self, cube_ids, graph, cover, lens):
        """Returns the clusters and their members from the subset of the cover spanned by the given `cube_ids`.

> **@deargle (Collaborator) commented on Mar 18, 2021:**
>
> Thinking out loud. I'm trying to think of another name. Kmapper has a separate `Cover` class, so calling this `clusters_from_cover` suggests to me that a cover should be passed, but it isn't.
>
> But a `Cover` doesn't have clusters, so I don't think this should go in the `Cover` class.
>
> If `graph` were a class, this would go in there as `graph.find_clusters_by_cube_ids(cube_ids)` or something.
>
> Sort-of following the pattern from the last PR, maybe we rename this to ~~`find_clusters`~~ `find_nodes`.

Parameters
----------
cube_ids : list of int
List of hypercube indices.
graph : dict
The resulting dictionary after applying map().
cover : kmapper.Cover
The cover used to build `graph`.
lens: Numpy Array
Lower dimensional representation of data.

Returns
-------
nodes : dict
cluster membership indexed by cluster ID (subset of `graph["nodes"]`).

"""
        # Prepend an index column, as `cover.transform` expects.
        lens_ids = np.arange(lens.shape[0])
        lens = np.c_[lens_ids, lens]
        _, cube_id_mapping = cover.transform(lens, return_centers=True)

        # Translate the original cube IDs into indices over the kept
        # (non-empty) cubes, which is what the node IDs are based on.
        transformed_cube_ids = np.concatenate(
            [np.flatnonzero(cube_id_mapping == cube_id) for cube_id in cube_ids]
        )

        clusters = {}
        cluster_id_prefixes = tuple("cube" + str(i) + "_" for i in transformed_cube_ids)
        for cluster_id, cluster_members in graph["nodes"].items():
            if cluster_id.startswith(cluster_id_prefixes):
                clusters[cluster_id] = cluster_members
        return clusters

    def nearest_nodes(self, newlens, newdata, graph, cover, lens, data, nn):
        """Returns the nodes nearest to `newdata`, using the given NearestNeighbors algorithm.

        Parameters
        ----------
        newlens : Numpy array
            Lower-dimensional representation of `newdata`. Accepts both 1-D and 2-D arrays.
        newdata : Numpy array
            New dataset. Accepts both 1-D and 2-D arrays.
        graph : dict
            The resulting dictionary after applying map().
        cover : kmapper.Cover
            The cover used to build `graph`.
        lens : Numpy array
            Lower-dimensional representation of the original data.
        data : Numpy array
            Original dataset.
        nn : NearestNeighbors
            Scikit-learn NearestNeighbors instance to use.

        Returns
        -------
        node_ids : Numpy array
            Node IDs, empty if `newdata` falls outside the cover.

        """
        if newlens.shape[0] != newdata.shape[0]:
            raise ValueError("newlens and newdata must have the same number of rows.")

        if len(newdata.shape) == 1:
            newlens = newlens[np.newaxis]
            newdata = newdata[np.newaxis]

        # Restrict the search to nodes whose hypercubes contain the new lens values.
        cube_ids = np.concatenate([cover.find(row) for row in newlens])
        if len(cube_ids) == 0:
            return np.empty((0,))

        nodes = self.find_nodes(cube_ids, graph, cover, lens)
        if len(nodes) == 0:
            return np.empty((0,))

        # Pool the members of the candidate nodes and run the nearest-neighbor
        # search on their original-space coordinates.
        nn_data = []
        nn_cluster_ids = []
        for cluster_id, cluster_members in nodes.items():
            cluster_data = data[cluster_members]
            nn_data.append(cluster_data)
            nn_cluster_ids.append([cluster_id] * len(cluster_data))
        nn_data = np.vstack(nn_data)
        nn_cluster_ids = np.concatenate(nn_cluster_ids)
        nn.fit(nn_data)
        nn_ids = nn.kneighbors(newdata, return_distance=False)
        return np.unique(nn_cluster_ids[nn_ids])

    def _process_projection_tuple(self, projection):
        # Detect if projection is a tuple (for prediction functions)
        # TODO: multi-label models
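For orientation, a compact end-to-end sketch of the new method on synthetic data (not from the PR; it mirrors the tests below, where `mapper.map(data)` makes the lens coincide with the data, so `data` is passed for both `lens` and `data`):

```python
import numpy as np
from sklearn import neighbors
import kmapper as km

mapper = km.KeplerMapper()
data = np.random.rand(100, 2)
graph = mapper.map(data)  # lens == data here

# Perturb a known point slightly; if it drifts outside the cover,
# nearest_nodes returns an empty array.
new_point = data[0] * 1.01

nn = neighbors.NearestNeighbors(n_neighbors=3)
node_ids = mapper.nearest_nodes(
    new_point, new_point, graph, mapper.cover, data, data, nn
)
print("nearest node IDs:", node_ids)
```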
test/test_mapper.py (64 additions, 0 deletions)
@@ -74,6 +74,54 @@ def test_wrong_id(self):
        mems = mapper.data_from_cluster_id("new node", graph, data)
        np.testing.assert_array_equal(mems, np.array([]))

    def test_find_nodes(self):
        mapper = KeplerMapper(verbose=1)
        data = np.random.rand(100, 2)

        graph = mapper.map(data)
        # pick a data point that exists in the graph
        _, members = list(graph["nodes"].items())[-1]
        data_point = data[members[-1]]

        cube_ids = mapper.cover.find(data_point)
        mems = mapper.find_nodes(cube_ids, graph, mapper.cover, data)
        assert len(mems) > 0
        for cluster_id, cluster_members in mems.items():
            np.testing.assert_array_equal(cluster_members, graph["nodes"][cluster_id])

    def test_node_not_found(self):
        mapper = KeplerMapper(verbose=1)
        data = np.random.rand(100, 2)

        graph = mapper.map(data)
        mems = mapper.find_nodes([999], graph, mapper.cover, data)
        assert len(mems) == 0

    def test_nearest_nodes_1(self):
        mapper = KeplerMapper(verbose=1)
        data = np.random.rand(100, 2)

        graph = mapper.map(data)
        nn = neighbors.NearestNeighbors(n_neighbors=1)
        expected_id, members = list(graph["nodes"].items())[-1]
        newdata = data[members[-1]]
        node_ids = mapper.nearest_nodes(newdata, newdata, graph, mapper.cover, data, data, nn)
        assert all(node_ids == [expected_id]), node_ids

    def test_nearest_nodes_2(self):
        mapper = KeplerMapper(verbose=1)
        data = np.random.rand(100, 2)

        graph = mapper.map(data)
        nn = neighbors.NearestNeighbors(n_neighbors=1)
        expected_clusters = list(graph["nodes"].items())[:2]
        cluster_id1 = expected_clusters[0][0]
        cluster_id2 = expected_clusters[1][0]
        newdata1 = data[expected_clusters[0][1][-1]]
        newdata2 = data[expected_clusters[1][1][-1]]
        newdata = np.vstack([newdata1, newdata2])
        node_ids = mapper.nearest_nodes(newdata, newdata, graph, mapper.cover, data, data, nn)
        assert all(node_ids == [cluster_id1, cluster_id2]), node_ids

class TestMap:
    def test_simplices(self):
@@ -94,6 +94,22 @@ def test_simplices(self):
        assert len(nodes) == 3
        assert len(edges) == 3

    def test_nodes(self):
        mapper = KeplerMapper()

        X = np.random.rand(100, 2)
        lens = mapper.fit_transform(X)
        graph = mapper.map(
            lens,
            X=X,
            cover=Cover(n_cubes=3, perc_overlap=0.75),
            clusterer=cluster.DBSCAN(metric="euclidean", min_samples=3),
        )
        assert len(graph["nodes"]) == 3
        for i, cluster_id in enumerate(graph["nodes"]):
            # verify the "cube{i}_cluster{j}" node ID format
            assert cluster_id == "cube{}_cluster0".format(i)

    def test_precomputed(self):
        mapper = KeplerMapper()
