Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Lazy signals for Hierarchical Clustering #6348

Merged
merged 4 commits into from
Aug 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 42 additions & 36 deletions Orange/widgets/unsupervised/owhierarchicalclustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@

from Orange.widgets.utils.localization import pl
from orangewidget.utils.itemmodels import PyListModel
from orangewidget.utils.signals import LazyValue

import Orange.data
from Orange.data.domain import filter_visible
from Orange.data import Domain, DiscreteVariable, ContinuousVariable, \
StringVariable
StringVariable, Table
import Orange.misc
from Orange.clustering.hierarchical import \
postorder, preorder, Tree, tree_from_linkage, dist_matrix_linkage, \
Expand All @@ -32,8 +33,11 @@

from Orange.widgets import widget, gui, settings
from Orange.widgets.utils import itemmodels, combobox
from Orange.widgets.utils.annotated_data import (create_annotated_table,
ANNOTATED_DATA_SIGNAL_NAME)
from Orange.widgets.utils.annotated_data import (lazy_annotated_table,
ANNOTATED_DATA_SIGNAL_NAME,
domain_with_annotation_column,
add_columns,
create_annotated_table)
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.visualize.utils.plotutils import AxisItem
from Orange.widgets.widget import Input, Output, Msg
Expand Down Expand Up @@ -776,71 +780,73 @@ def commit(self):
for node in selection]

selected_indices = list(chain(*maps))
unselected_indices = sorted(set(range(self.root.value.last)) -
set(selected_indices))

if not selected_indices:
self.Outputs.selected_data.send(None)
annotated_data = create_annotated_table(items, []) \
annotated_data = lazy_annotated_table(items, []) \
if self.selection_method == 0 and self.matrix.axis else None
self.Outputs.annotated_data.send(annotated_data)
return

selected_data = None
selected_data = annotated_data = None

if isinstance(items, Orange.data.Table) and self.matrix.axis == 1:
# Select rows
c = np.zeros(self.matrix.shape[0])
data, domain = items, items.domain

c = np.full(self.matrix.shape[0], len(maps))
for i, indices in enumerate(maps):
c[indices] = i
c[unselected_indices] = len(maps)

mask = c != len(maps)

data, domain = items, items.domain
attrs = domain.attributes
classes = domain.class_vars
metas = domain.metas

var_name = get_unique_names(domain, "Cluster")
clust_name = get_unique_names(domain, "Cluster")
values = [f"C{i + 1}" for i in range(len(maps))]

clust_var = Orange.data.DiscreteVariable(
var_name, values=values + ["Other"])
domain = Orange.data.Domain(attrs, classes, metas + (clust_var,))
data = items.transform(domain)
with data.unlocked(data.metas):
data.set_column(clust_var, c)

if selected_indices:
selected_data = data[mask]
clust_var = Orange.data.DiscreteVariable(
var_name, values=values)
selected_data.domain = Domain(
attrs, classes, metas + (clust_var, ))

annotated_data = create_annotated_table(data, selected_indices)
sel_clust_var = Orange.data.DiscreteVariable(
name=clust_name, values=values)
sel_domain = add_columns(domain, metas=(sel_clust_var,))
selected_data = LazyValue[Table](
lambda: items.add_column(
sel_clust_var, c, to_metas=True)[c != len(maps)],
domain=sel_domain, length=len(selected_indices))

ann_clust_var = Orange.data.DiscreteVariable(
name=clust_name, values=values + ["Other"]
)
ann_domain = add_columns(
domain_with_annotation_column(data)[0], metas=(ann_clust_var, ))
annotated_data = LazyValue[Table](
lambda: create_annotated_table(
data=items.add_column(ann_clust_var, c, to_metas=True),
selected_indices=selected_indices),
domain=ann_domain, length=len(items)
)

elif isinstance(items, Orange.data.Table) and self.matrix.axis == 0:
# Select columns
attrs = []
unselected_indices = sorted(set(range(self.root.value.last)) -
set(selected_indices))
for clust, indices in chain(enumerate(maps, start=1),
[(0, unselected_indices)]):
for i in indices:
attr = items.domain[i].copy()
attr.attributes["cluster"] = clust
attrs.append(attr)
domain = Orange.data.Domain(
all_domain = Orange.data.Domain(
# len(unselected_indices) can be 0
attrs[:len(attrs) - len(unselected_indices)],
items.domain.class_vars, items.domain.metas)
selected_data = items.from_table(domain, items)

domain = Orange.data.Domain(
selected_data = LazyValue[Table](
lambda: items.from_table(all_domain, items),
domain=all_domain, length=len(items))

sel_domain = Orange.data.Domain(
attrs,
items.domain.class_vars, items.domain.metas)
annotated_data = items.from_table(domain, items)
annotated_data = LazyValue[Table](
lambda: items.from_table(sel_domain, items),
domain=sel_domain, length=len(items))

self.Outputs.selected_data.send(selected_data)
self.Outputs.annotated_data.send(annotated_data)
Expand Down
61 changes: 50 additions & 11 deletions Orange/widgets/utils/annotated_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from typing import Union

import numpy as np
from Orange.data import Domain, DiscreteVariable

from orangewidget.utils.signals import LazyValue

from Orange.data import Domain, DiscreteVariable, Table
from Orange.data.util import get_unique_names

ANNOTATED_DATA_SIGNAL_NAME = "Data"
Expand Down Expand Up @@ -30,16 +35,26 @@ def add_columns(domain, attributes=(), class_vars=(), metas=()):
return Domain(attributes, class_vars, metas)


def domain_with_annotation_column(
        data: Union[Table, Domain],
        values=("No", "Yes"),
        var_name=ANNOTATED_DATA_FEATURE_NAME):
    """Build a domain extended with a discrete annotation variable.

    Parameters
    ----------
    data : Table or Domain
        A domain, or an object whose ``.domain`` attribute is used.
    values : sequence of str
        Values of the new discrete variable.
    var_name : str
        Proposed variable name; it is passed through ``get_unique_names``
        together with the source domain.

    Returns
    -------
    (Domain, DiscreteVariable)
        The extended domain and the variable that was added. The variable
        becomes a class variable when the source domain has no class
        variables, and a meta otherwise.
    """
    source = data if isinstance(data, Domain) else data.domain
    annotation = DiscreteVariable(
        get_unique_names(source, var_name), values)
    if source.class_vars:
        # A class already exists: keep it and store the annotation as meta.
        new_class_vars = source.class_vars
        new_metas = source.metas + (annotation, )
    else:
        new_class_vars = source.class_vars + (annotation, )
        new_metas = source.metas
    return Domain(source.attributes, new_class_vars, new_metas), annotation


def _table_with_annotation_column(data, values, column_data, var_name):
var = DiscreteVariable(get_unique_names(data.domain, var_name), values)
class_vars, metas = data.domain.class_vars, data.domain.metas
domain, var = domain_with_annotation_column(data, values, var_name)
if not data.domain.class_vars:
class_vars += (var, )
column_data = column_data.reshape((len(data), ))
else:
metas += (var, )
column_data = column_data.reshape((len(data), 1))
domain = Domain(data.domain.attributes, class_vars, metas)
table = data.transform(domain)
with table.unlocked(table.Y if not data.domain.class_vars else table.metas):
table[:, var] = column_data
Expand All @@ -65,17 +80,20 @@ def create_annotated_table(data, selected_indices):
data, ("No", "Yes"), annotated, ANNOTATED_DATA_FEATURE_NAME)


def lazy_annotated_table(data, selected_indices):
    """Wrap ``create_annotated_table`` in a ``LazyValue[Table]``.

    The resulting domain (from ``domain_with_annotation_column``) and the
    length of `data` are computed immediately and passed to ``LazyValue``
    alongside a callable that builds the annotated table.
    """
    annotated_domain = domain_with_annotation_column(data)[0]

    def build_table():
        return create_annotated_table(data, selected_indices)

    return LazyValue[Table](
        build_table, length=len(data), domain=annotated_domain)


def create_groups_table(data, selection,
include_unselected=True,
var_name=ANNOTATED_DATA_FEATURE_NAME,
values=None):
if data is None:
return None
max_sel = np.max(selection)
if values is None:
values = ["G{}".format(i + 1) for i in range(max_sel)]
if include_unselected:
values.append("Unselected")
values, max_sel = group_values(selection, include_unselected, values)
if include_unselected:
# Place Unselected instances in the "last group", so that the group
# colors and scatter diagram marker colors will match
Expand All @@ -88,3 +106,24 @@ def create_groups_table(data, selection,
data = data[mask]
selection = selection[mask] - 1
return _table_with_annotation_column(data, values, selection, var_name)


def lazy_groups_table(data, selection, include_unselected=True,
                      var_name=ANNOTATED_DATA_FEATURE_NAME, values=None):
    """Wrap ``create_groups_table`` in a ``LazyValue[Table]``.

    Parameters
    ----------
    data : Table or None
        Source data. ``None`` is passed through as ``None``, matching
        ``create_groups_table``.
    selection : np.ndarray
        Group index per row; 0 marks unselected rows.
    include_unselected : bool
        If true the length covers all rows of `data`; otherwise only rows
        whose selection is non-zero are counted.
    var_name : str
        Proposed name of the group variable.
    values : list of str, optional
        Group labels; generated by ``group_values`` when omitted.

    Returns
    -------
    LazyValue[Table] or None
    """
    # create_groups_table returns None for missing data; do the same here
    # instead of failing on len(None).
    if data is None:
        return None
    if include_unselected:
        length = len(data)
    else:
        # Plain int, so both branches report the same type of length.
        length = int(np.sum(selection != 0))
    values, _ = group_values(selection, include_unselected, values)
    domain, _ = domain_with_annotation_column(data, values, var_name)
    return LazyValue[Table](
        lambda: create_groups_table(data, selection, include_unselected,
                                    var_name, values),
        length=length, domain=domain
    )


def group_values(selection, include_unselected, values):
    """Return the group labels and the largest group index in `selection`.

    Parameters
    ----------
    selection : array-like of int
        Group index per row; 0 marks unselected rows.
    include_unselected : bool
        When labels are generated, append ``"Unselected"`` as the last one.
    values : list of str or None
        Caller-provided labels. They are returned unchanged; labels
        ``"G1"``, ``"G2"``, ... are generated only when this is ``None``.

    Returns
    -------
    (list of str, int)
        The labels and the maximal value in `selection` (0 if it is empty).
    """
    # np.max raises ValueError on a zero-size array; treat it as "no groups".
    max_sel = np.max(selection) if np.size(selection) else 0
    if values is None:
        values = [f"G{i + 1}" for i in range(max_sel)]
        if include_unselected:
            values.append("Unselected")
    return values, max_sel
Loading
Loading