diff --git a/devel.txt b/CHANGELOG.md old mode 100755 new mode 100644 similarity index 64% rename from devel.txt rename to CHANGELOG.md index 05087d5..e1e8870 --- a/devel.txt +++ b/CHANGELOG.md @@ -1,12 +1,8 @@ -Pending: -======== -* Create a PerturbationNetwork and make SampleSpecificPerturbationNetwork dependent on the more general PerturbationNetwork. Adapt this so there can be more than one sample as the query. -* Should convert_network actually be convert_symmetric? -* Since iGraph is a dependency, just make code cleaner without the workarounds for not having it as a dependency -Completed: -========== -* 2022.02.09 - Added support for iGraph and non-fully connected networks. Also added UMAP fuzzy_simplical_set graph + +#### Completed: +* 2023.07.18 - Fixed issue with `SampleSpecificPerturbationNetwork` not being able to handle `X.index` with a `.name` that was not `NoneType`. Created a hack to allow `pd.MultiIndex` support (converts to strings and warns). Made `include_reference_for_samplespecific=True` the new default which creates a clone of the reference and uses that as the background network. Added `is_square` to `Symmetric` object. +* 2022.02.09 - Added support for iGraph and non-fully connected networks. Also added UMAP `fuzzy_simplical_set` graph * 2021.06.24 - Added `get_weights_from_graph` function * 2021.06.09 - Fixed `condensed_to_dense` ability to handle self interactions * 2021.04.21 - Fixed `idx_nodes = pd.Index(sorted(set(groups[lambda x: x == group].index) & set(df_dense.index)))` in `connectivity` function to prepare for pandas deprecation. @@ -15,3 +11,8 @@ Completed: * 2020.07.24 - Added `DifferentialEnsembleAssociationNetwork` * 2020.07.21 - `SampleSpecificPerturbationNetwork` fit method returns self + +#### Pending: +* Rename `Symmetric` object to something more generalizable to similarity and dissimilarity matrices that do not have to be symmetric or completely connected. +* Should `convert_network` actually be `convert_symmetric`? +* Since iGraph is a dependency, just make code cleaner without the workarounds for not having it as a dependency \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE similarity index 77% rename from LICENSE.txt rename to LICENSE index d753176..4a9f45d 100644 --- a/LICENSE.txt +++ b/LICENSE @@ -1,17 +1,3 @@ -# ============== -# Ensemble NetworkX -# ============== -# Ensemble network methods in Python -# ------------------------------------ -# GitHub: https://github.com/jolespin/ensemble_networkx -# PyPI: https://pypi.org/project/ensemble_networkx -# ------------------------------------ -# ======= -# Contact -# ======= -# Producer: Josh L. Espinoza -# Contact: jespinoz@jcvi.org, jol.espinoz@gmail.com -# Google Scholar: https://scholar.google.com/citations?user=r9y1tTQAAAAJ&hl # ======= # License BSD-3 # ======= diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 6ffaa9f..89230c6 --- a/README.md +++ b/README.md @@ -22,9 +22,10 @@ Compatible for Python 3. #### Install: ``` -# "Stable" release (still developmental) +# Stable release (Preferred) pip install ensemble_networkx -# Current release + +# Current developmental release pip install git+https://github.com/jolespin/ensemble_networkx ``` @@ -45,7 +46,7 @@ import ensemble_networkx as enx ```python # Load in data import soothsayer_utils as syu -X = syu.get_iris_data(["X"]) +X,y = syu.get_iris_data(["X", "y"]) # Create ensemble network ens = enx.EnsembleAssociationNetwork(name="Iris", node_type="leaf measurement", edge_type="association", observation_type="specimen") diff --git a/ensemble_networkx/__init__.py b/ensemble_networkx/__init__.py old mode 100755 new mode 100644 index 780d894..3c62121 --- a/ensemble_networkx/__init__.py +++ b/ensemble_networkx/__init__.py @@ -33,7 +33,7 @@ # ======= # Version # ======= -__version__= "2022.2.9" +__version__= "2023.7.18" __author__ = "Josh L. Espinoza" __email__ = "jespinoz@jcvi.org, jol.espinoz@gmail.com" __url__ = "https://github.com/jolespin/ensemble_networkx" @@ -48,7 +48,6 @@ "umap_fuzzy_simplical_set_graph", ] + [ "signed", - "get_weights_from_graph", "get_symmetric_category", "dense_to_condensed", "condensed_to_dense", diff --git a/ensemble_networkx/ensemble_networkx.py b/ensemble_networkx/ensemble_networkx.py old mode 100755 new mode 100644 index 16599c4..c131b33 --- a/ensemble_networkx/ensemble_networkx.py +++ b/ensemble_networkx/ensemble_networkx.py @@ -11,10 +11,11 @@ from collections.abc import Mapping, Hashable from itertools import combinations, product -# PyData +# Packages import pandas as pd import numpy as np import networkx as nx +import igraph as ig import xarray as xr from scipy import stats from scipy.special import comb @@ -31,56 +32,27 @@ except ImportError: __version__ = "ImportError: attempted relative import with no known parent package" -# ===================s -# Transformations -# =================== -# Unsigned network to signed network -def signed(X): - """ - unsigned -> signed correlation - """ - return (X + 1)/2 - -# ===================s -# Transformations -# =================== - - - # =================== # Converting Networks -# =================== -#Get weights from graph -def get_weights_from_graph(graph, into=np.asarray, weight="weight", generator=False): - weights = map(lambda edge_data: edge_data[-1][weight], graph.edges(data=True)) - if generator: - return weights - else: - weights = list(weights) - return into(weights) - +# =================== +# Remove self-loops +def remove_self_edges(graph): + self_edges = list(nx.selfloop_edges(graph)) + return graph.remove_edges_from(self_edges) + # pd.DataFrame 2D to pd.Series def dense_to_condensed(X, name=None, assert_symmetry=True, tol=None, nans_ok=True): if assert_symmetry: assert is_symmetrical(X, tol=tol, nans_ok=nans_ok), "`X` is not symmetric with tol=`{}` or try with `nans_ok=True`".format(tol) labels = X.index - index=pd.Index(list(map(frozenset, combinations(labels, 2))), name=name) + index = pd.Index(list(map(frozenset, combinations(labels, 2))), name=name) data = squareform(X, checks=False) return pd.Series(data, index=index, name=name) # pd.Series to pd.DataFrame 2D -def condensed_to_dense(y:pd.Series, fill_diagonal="infer", index=None): - # Check if there are self-interactions - number_of_unique_nodes_in_edges = y.index.map(len).unique() - assert set(number_of_unique_nodes_in_edges) <= {1,2}, "Number of unique nodes in edge must be either 1 or 2" - if isinstance(fill_diagonal, str): - if fill_diagonal == "infer": - if number_of_unique_nodes_in_edges.min() == 1: - fill_diagonal = None - else: - fill_diagonal = np.nan - +def condensed_to_dense(y:pd.Series, fill_diagonal=None, index=None): + # Need to optimize this data = defaultdict(dict) for edge, w in y.iteritems(): @@ -102,32 +74,58 @@ def condensed_to_dense(y:pd.Series, fill_diagonal="infer", index=None): df_dense = pd.DataFrame(data) if index is None: - index = df_dense.index + index = sorted(df_dense.index) return df_dense.loc[index,index] # Get symmetric category def get_symmetric_category(obj): - if type(obj) is type: - string_representation = str(obj) - else: - string_representation = str(type(obj)) - class_type = string_representation.split("'")[1] - fields = class_type.split(".") - module = fields[0] - category = None - if fields[-1] == "Symmetric": - category = "Symmetric" - if module == "pandas": - if fields[-1] == "Series": - category = ("pandas", "Series") - if fields[-1] == "DataFrame": - category = ("pandas", "DataFrame") - if module in {"networkx", "igraph"}: - category = module - assert category is not None, "`data` must be either a pandas[Series, DataFrame], ensemble_networkx[Symmetric], networkx[Graph, DiGraph, OrdereredGraph, DiOrderedGraph], igraph[Graph]" - return category + """ + Future: Add support for Hive objects + """ + if type(obj) is type: + string_representation = str(obj) + else: + string_representation = str(type(obj)) + class_type = string_representation.split("'")[1] + fields = class_type.split(".") + module = fields[0] + category = None + if fields[-1] == "Symmetric": + category = "Symmetric" + if module == "pandas": + if fields[-1] == "Series": + category = ("pandas", "Series") + if fields[-1] == "DataFrame": + category = ("pandas", "DataFrame") + if module in {"networkx", "igraph"}: + category = module + assert category is not None, "`data` must be either a pandas[Series, DataFrame], ensemble_networkx[Symmetric], networkx[Graph, DiGraph, OrdereredGraph, DiOrderedGraph], igraph[Graph]" + return category -def convert_network(data, into, index=None, assert_symmetry=True, tol=1e-10, **attrs): +# Convert Networks +def convert_network( + # I/O + data, + into, + + # Subgraphs + node_subgraph:list=None, + edge_subgraph:list=None, + + # Symmetry + remove_self_interactions:bool=True, + assert_symmetry=True, + tol=None, + + # Missing values + remove_missing_values:bool=True, + fill_missing_values_with=np.nan, + fill_diagonal=np.nan, + + # Attributes + # propogate_input_attributes=False, + **attrs, + ): """ Convert to and from the following network structures: * pd.DataFrame (must be symmetrical) @@ -135,8 +133,10 @@ def convert_network(data, into, index=None, assert_symmetry=True, tol=1e-10, **a * Symmetric * nx.[Di|Ordered]Graph * ig.Graph - - NOTE: This needs to be cleaned up + + Future: + Add support for existing attributes to propogate to next level. Currently, Symmetric isn't supported so I removed it entirely. + Add support for Hive objects """ input_category = get_symmetric_category(data) output_category = get_symmetric_category(into) @@ -145,69 +145,153 @@ def convert_network(data, into, index=None, assert_symmetry=True, tol=1e-10, **a assert isinstance(into, type), "`into` must be an instantiated object: a pandas[Series, DataFrame], ensemble_networkx[Symmetric], networkx[Graph, DiGraph, OrdereredGraph, DiOrderedGraph], igraph[Graph]" assert into not in {nx.MultiGraph, nx.MultiDiGraph}, "`into` cannot be a `Multi[Di]Graph`" - # self -> self - if isinstance(data, into): + + assert not (node_subgraph is not None) & (edge_subgraph is not None), "Cannot create subgraph from `node_subgraph` and `edge_subgraph`" + + if all([ + input_category == output_category, + node_subgraph is None, + edge_subgraph is None, + remove_self_interactions == False, + ]): return data.copy() - # HACK and I'm not proud of it... - @check_packages(["igraph"]) - def _to_igraph(data): - return Symmetric(data).to_igraph(**attrs) - if output_category == "igraph": - return _to_igraph(convert_network(data, into=pd.Series)) - - # pd.Series --> Symmetric - if isinstance(data, pd.Series): - data = Symmetric(data, **attrs) - if into == Symmetric: - return data + _attrs = dict() - # pd.DataFrame -> Symmetric or NetworkX - if isinstance(data, pd.DataFrame) and (into in {Symmetric, nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph}): - weights = dense_to_condensed(data, assert_symmetry=assert_symmetry, tol=tol) - if into == Symmetric: - return Symmetric(weights, **attrs) + # Convert to pd.Series + if input_category == "igraph": + # iGraph -> NetworkX + if all([ + output_category == "network", + node_subgraph is None, + edge_subgraph is None, + remove_self_interactions == False, + ]): + return data.to_networkx(into) + else: + if data.is_directed(): + warnings.warn("Currently conversions cannot handle directed graphs and will force into undirected configuration") + weights = data.es["weight"] + nodes = np.asarray(data.vs["name"]) + edges = list(map(lambda edge_indicies: frozenset(nodes[list(edge_indicies)]), data.get_edgelist())) + data = pd.Series(weights, index=edges) + input_category == ("pandas", "Series") + + if input_category == "networkx": + # Update attrs + # _attrs.update(graph.data) + # NetworkX -> iGraph + if all([ + output_category == "igraph", + node_subgraph is None, + edge_subgraph is None, + remove_self_interactions == False, + ]): + return ig.Graph.from_networkx(data) else: - return Symmetric(weights).to_networkx(into=into, **attrs) + # Weights + edge_weights = dict() + for edge_data in data.edges(data=True): + edge = frozenset(edge_data[:-1]) + weight = edge_data[-1]["weight"] + edge_weights[edge] = weight + data = pd.Series(edge_weights) + input_category == ("pandas", "Series") + + if input_category == ("pandas", "DataFrame"): + assert len(data.shape) == 2, "`data.shape` must be square (2 dimensions)" + assert np.all(data.index == data.columns), "`data.index` must equal `data.columns`" + # if assert_symmetry: + # assert is_symmetrical(data, tol=tol, nans_ok=True), "`data` is not symmetric with tol=`{}` or try with `nans_ok=True`".format(tol) + # data = data.stack() + # data.index = data.index.map(frozenset) - # pd.DataFrame -> pd.Series - if isinstance(data, pd.DataFrame) and (into in {pd.Series}): - return dense_to_condensed(data, assert_symmetry=assert_symmetry, tol=tol) + data = dense_to_condensed(data, assert_symmetry=assert_symmetry, tol=tol, nans_ok=True) + input_category == ("pandas", "Series") + if input_category == "Symmetric": + data = data.weights + input_category == ("pandas", "Series") - # Symmetric -> pd.DataFrame, pd.Series, or NetworkX - if isinstance(data, Symmetric): - # pd.DataFrame - if into == pd.DataFrame: - df = data.to_dense() - if index is None: - return df + + if input_category == ("pandas", "Series"): + if data.name is not None: + if "name" in _attrs: + if _attrs["name"] is None: + _attrs["name"] = data.name else: - assert set(index) <= set(df.index), "Not all `index` values are in `data`" - return df.loc[index,index] - elif into == pd.Series: - return data.weights.copy() - # Graph - else: - return data.to_networkx(into=into, **attrs) - - # NetworkX -> Symmetric - if isinstance(data, (nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph)): - if into == Symmetric: - return Symmetric(data=data, **attrs) - if into == pd.DataFrame: - return Symmetric(data=data, **attrs).to_dense() - if into in {nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph}: - return Symmetric(data=data).to_networkx(into=into, **attrs) - if into == pd.Series: - return convert_network(data=data, into=Symmetric, index=index, assert_symmetry=assert_symmetry, tol=tol).weights - + _attrs["name"] = data.name + # Overwrite existing attrs with provided attrs + for k, v in attrs.items(): + if v is not None: + _attrs[k] = v + + # Remove duplicates + data = data[~data.index.duplicated()] + + # Subgraphs + if node_subgraph is not None: + nodes = frozenset.union(*data.index) + node_subgraph = frozenset(node_subgraph) + assert node_subgraph <= nodes, "`node_subgraph` must be a subset of the nodes in `data`" + edge_subgraph = sorted(set(data.index[data.index.map(lambda edge: edge <= node_subgraph)])) + + if edge_subgraph is not None: + assert set(edge_subgraph) <= set(data.index), "`edge_subgraph` must be a subset of the edge set in `data`" + data = data[edge_subgraph] + + if remove_self_interactions: + data = data[data.index.map(lambda edge: len(edge) > 1)] + + # Missing values + if remove_missing_values: + data = data.dropna() + else: + if data.isnull().sum() > 0: + data = data.fillna(fill_missing_values_with) + + # Output + if output_category == ("pandas", "Series"): + data.name = _attrs.get("name", None) + return data + + if output_category == ("pandas", "DataFrame"): + return condensed_to_dense(data, fill_diagonal=fill_diagonal) + + if output_category == "networkx": + graph = into(**_attrs) + for edge, w in data.items(): + if len(edge) == 1: + node_a = node_b = list(edge)[0] + else: + node_a, node_b = list(edge) + graph.add_edge(node_a, node_b, weight=w) + return graph + + if output_category == "igraph": + graph = into() + # nodes = sorted(flatten(data.index.map(list), unique=True)) + nodes = sorted(frozenset.union(*data.index)) + graph.add_vertices(nodes) + for edge, w in data.items(): + if len(edge) == 1: + node_a = node_b = list(edge)[0] + else: + node_a, node_b = list(edge) + graph.add_edge(node_a, node_b, weight=w) + for k, v in _attrs.items(): + graph[k] = v + return graph + + if output_category == "Symmetric": + return Symmetric(data=data, **_attrs) + # =================== # Network Statistics # =================== # Connectivity -def connectivity(data, groups:pd.Series=None, include_self_loops=False, tol=1e-10): +def connectivity(data, groups:pd.Series=None, remove_self_interactions=True, tol=1e-10): """ Calculate connectivity from pd.DataFrame (must be symmetric), Symmetric, Hive, or NetworkX graph @@ -215,26 +299,30 @@ def connectivity(data, groups:pd.Series=None, include_self_loops=False, tol=1e-1 """ # This is a hack to allow Hives from hive_networkx if is_query_class(data, "Hive"): - data = condensed_to_dense(data.weights) - assert isinstance(data, (pd.DataFrame, Symmetric, nx.Graph, nx.DiGraph, nx.OrderedGraph, nx.OrderedDiGraph)), "Must be either a symmetric pd.DataFrame, Symmetric, nx.Graph, or hx.Hive object" - if is_graph(data): - weights = dict() - for edge_data in data.edges(data=True): - edge = frozenset(edge_data[:-1]) - weight = edge_data[-1]["weight"] - weights[edge] = weight - weights = pd.Series(weights, name="Weights")#.sort_index() - data = Symmetric(weights) - if isinstance(data, Symmetric): - df_dense = condensed_to_dense(data.weights) - - if isinstance(data, pd.DataFrame): - assert is_symmetrical(data, tol=tol) - df_dense = data - - - df_dense = df_dense.copy() - if not include_self_loops: + data = data.weights + # assert isinstance(data, (pd.DataFrame, Symmetric, nx.Graph, nx.DiGraph, nx.OrderedGraph, nx.OrderedDiGraph)), "Must be either a symmetric pd.DataFrame, Symmetric, nx.Graph, or hx.Hive object" +# if is_graph(data): +# weights = dict() +# for edge_data in data.edges(data=True): +# edge = frozenset(edge_data[:-1]) +# weight = edge_data[-1]["weight"] +# weights[edge] = weight +# weights = pd.Series(weights, name="Weights")#.sort_index() +# data = Symmetric(weights) +# if isinstance(data, Symmetric): +# df_dense = condensed_to_dense(data.weights) + +# if isinstance(data, pd.DataFrame): +# assert is_symmetrical(data, tol=tol) +# df_dense = data + + if not isinstance(data, pd.DataFrame): + df_dense = convert_network(data=data, into=pd.DataFrame, remove_missing_values=False, remove_self_interactions=False, tol=tol) + else: + assert is_symmetrical(X=data, tol=tol, nans_ok=True) + df_dense = data.copy() + + if remove_self_interactions: np.fill_diagonal(df_dense.values, 0) #kTotal @@ -251,8 +339,8 @@ def connectivity(data, groups:pd.Series=None, include_self_loops=False, tol=1e-1 #kWithin k_within = list() for group in groups.unique(): - idx_nodes = pd.Index(sorted(set(groups[lambda x: x == group].index) & set(df_dense.index))) - k_group = df_dense.loc[idx_nodes,idx_nodes].sum(axis=1) + index_nodes = pd.Index(sorted(set(groups[lambda x: x == group].index) & set(df_dense.index))) + k_group = df_dense.loc[index_nodes,index_nodes].sum(axis=1) k_within.append(k_group) data_connectivity["kWithin"] = pd.concat(k_within) @@ -292,7 +380,16 @@ def heterogeneity(k:pd.Series): return np.sqrt(number_of_nodes * np.sum(k**2)/np.sum(k)**2 - 1) # Topological overlap -def topological_overlap_measure(data, into=None, node_type=None, edge_type="topological_overlap_measure", association="network", assert_symmetry=True, tol=1e-10): +def topological_overlap_measure( + data, + into=None, + node_type=None, + edge_type="topological_overlap_measure", + association_type="network", + assert_symmetry=True, + tol=1e-10, + fill_diagonal=np.nan, + ): """ Compute the topological overlap for a weighted adjacency matrix @@ -365,7 +462,7 @@ def _compute_tom(A): else: df_tom = pd.DataFrame(A_tom, index=node_labels, columns=node_labels) df_tom.index.name = df_tom.columns.name = node_type - return convert_network(df_tom, into=into, assert_symmetry=assert_symmetry, tol=tol, adjacency="network", node_type=node_type, edge_type=edge_type, association=association) + return convert_network(df_tom, into=into, fill_diagonal=fill_diagonal, assert_symmetry=assert_symmetry, tol=tol, association_type="network", node_type=node_type, edge_type=edge_type) @@ -391,23 +488,22 @@ def community_detection(graph, n_iter:int=100, weight:str="weight", random_state _algo_kws = {} _algo_kws.update(algo_kws) + graph = convert_network(graph, nx.Graph) + def partition_function(graph, weight, random_state, algo_kws): return best_partition(graph, weight=weight, random_state=random_state, **algo_kws) # Leiden if algorithm == "leiden": - try: - import igraph as ig - except ModuleNotFoundError: - Exception("Please install `igraph` to use {} algorithm".format(algorithm)) try: from leidenalg import find_partition, ModularityVertexPartition except ModuleNotFoundError: Exception("Please install `leidenalg` to use {} algorithm".format(algorithm)) # Convert NetworkX to iGraph - graph = ig.Graph.from_networkx(graph) - nodes_list = np.asarray(graph.vs["_nx_name"]) + # graph = ig.Graph.from_networkx(graph) + graph = convert_network(graph, ig.Graph) + nodes_list = np.asarray(graph.vs["name"]) # Keywords _algo_kws = {"partition_type":ModularityVertexPartition, "n_iterations":-1} @@ -577,7 +673,15 @@ def umap_fuzzy_simplical_set_graph( return data else: # Get symmetric object - network = Symmetric(data=data, association="network", assert_symmetry=False, remove_missing_values=True, name=name, node_type=node_type, edge_type=edge_type) + network = Symmetric( + data=data, + association_type="network", + assert_symmetry=False, + remove_missing_values=True, + name=name, + node_type=node_type, + edge_type=edge_type, + ) if into == Symmetric: return network @@ -696,7 +800,7 @@ class Symmetric(object): df_sim.values[a,b] = df_sim.values[b,a] = df_sim.values[a,b]*-1 # Create a Symmetric object from the association matrix - sym_iris = enx.Symmetric(data=df_sim, node_type="iris sample", edge_type=method, name="iris", association="network") + sym_iris = enx.Symmetric(data=df_sim, node_type="iris sample", edge_type=method, name="iris", association_type="network") # ==================================== # Symmetric(Name:iris, dtype: float64) # ==================================== @@ -721,6 +825,9 @@ class Symmetric(object): ===== devel ===== + 2022-Feb-12 + * Completely rewrote Symmetric to clean it up. A little slower but much easier to prototype + 2022-Feb-08 * Added support for iGraph * Added support for non-fully-connected graphs @@ -750,196 +857,78 @@ def __init__( node_type=None, edge_type=None, func_metric=None, - association="infer", - acceptable_associations={"similarity", "dissimilarity", "statistical_test", "network", "infer", None}, + association_type=None, # Symmetry assert_symmetry=True, tol=None, + diagonal=None, # Missing values remove_missing_values=True, fill_missing_values_with=np.nan, nans_ok=True, + # Attributes **attrs, ): - self._acceptable_associations = acceptable_associations - - self.name = name - self.node_type = node_type - self.edge_type = edge_type - self.func_metric = func_metric - self.association = association - self.diagonal = None + # Metadata self.metadata = dict() - # Missing values - self.remove_missing_values=remove_missing_values - self.fill_missing_values_with=fill_missing_values_with - self.nans_ok=nans_ok + # Association type + assert_acceptable_arguments(association_type, {"similarity", "dissimilarity", "statistical_test", "network", None}) + - # Get input type - input_category = get_symmetric_category(data) + # Keywords + kwargs = {"name":name, "node_type":node_type, "edge_type":edge_type, "func_metric":func_metric, "association_type":association_type} + # From Symmetric object - if input_category == "Symmetric": + # if input_category == "Symmetric": + if isinstance(data, type(self)): if not nans_ok: assert not np.any(data.weights.isnull()), "Cannot move forward with missing values" - self._from_symmetric(data=data, name=name, node_type=node_type, edge_type=edge_type, func_metric=func_metric, association=association) - - # From networkx - if input_category == "networkx": - self._from_networkx(data=data, association=association) + self.__dict__.update(data.__dict__) - # From igraph - if input_category == "igraph": - self._from_igraph(data=data, association=association) - - # From pandas - if input_category in [("pandas", "Series"), ("pandas", "DataFrame")]: - if not nans_ok: - assert not np.any(data.isnull()), "Cannot move forward with missing values" - # From pd.DataFrame object - if isinstance(data, pd.DataFrame): - self._from_pandas_dataframe(data=data, association=association, assert_symmetry=assert_symmetry, nans_ok=nans_ok, tol=tol) - - # From pd.Series object - if isinstance(data, pd.Series): - self._from_pandas_series(data=data, association=association) - - # Universal + # Set keywords + for k, v in kwargs.items(): + if v is not None: + setattr(self, k, v) + + # Convert network type + else: + self.weights = convert_network( + data=data, + into=pd.Series, + remove_self_interactions=True, + assert_symmetry=assert_symmetry, + tol=tol, + remove_missing_values=remove_missing_values, + fill_missing_values_with=fill_missing_values_with, + name="Weights", + ) + self.edges = pd.Index(self.weights.index, name="Edges") + self.nodes = pd.Index(sorted(frozenset.union(*self.weights.index)), name="Nodes") + + # Set keywords + for k, v in kwargs.items(): + setattr(self, k, v) + # If there's still no `edge_type` and `func_metric` is not empty, then use this the name of `func_metric` if (self.edge_type is None) and (self.func_metric is not None): self.edge_type = self.func_metric.__name__ - + + self.set_diagonal(diagonal) self.values = self.weights.values self.number_of_nodes = self.nodes.size self.number_of_edges = self.edges.size -# self.graph = self.to_networkx(into=graph) # Not storing graph because it will double the storage self.memory = self.weights.memory_usage() self.metadata.update(attrs) self.__synthesized__ = datetime.datetime.utcnow() - - - - # ======= - # Utility - # ======= - - def _infer_association(self, X): - diagonal = np.diagonal(X) - diagonal_elements = set(diagonal) - assert len(diagonal_elements) == 1, "Cannot infer relationships from diagonal because multiple values" - assert diagonal_elements <= {0,1}, "Diagonal should be either 0.0 for dissimilarity or 1.0 for similarity" - return {0.0:"dissimilarity", 1.0:"similarity"}[list(diagonal_elements)[0]] - - def _from_symmetric(self,data, name, node_type, edge_type, func_metric, association): - self.__dict__.update(data.__dict__) - # If there's no `name`, then get `name` of `data` - if self.name is None: - self.name = name - # If there's no `node_type`, then get `node_type` of `data` - if self.node_type is None: - self.node_type = node_type - # If there's no `edge_type`, then get `edge_type` of `data` - if self.edge_type is None: - self.edge_type = edge_type - # If there's no `func_metric`, then get `func_metric` of `data` - if self.func_metric is None: - if func_metric is not None: - assert hasattr(func_metric, "__call__"), "`func_metric` must be a function" - self.func_metric = func_metric - - # Infer associations - if self.association is None: - assert_acceptable_arguments(association, self._acceptable_associations) - if association != "infer": - self.association = association - - def _from_networkx(self, data, association): - # assert isinstance(data, (nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph)), "`If data` is a graph, it must be in {nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph}" - assert_acceptable_arguments(association, self._acceptable_associations) - if association == "infer": - if association is None: - association = "network" - assert_acceptable_arguments(association, self._acceptable_associations) - - # Propogate information from graph - for attr in ["name", "node_type", "edge_type", "func_metric"]: - if getattr(self, attr) is None: - if attr in data.graph: - value = data.graph[attr] - if bool(value): - setattr(self, attr, value) - - # Weights - edge_weights = dict() - for edge_data in data.edges(data=True): - edge = frozenset(edge_data[:-1]) - weight = edge_data[-1]["weight"] - edge_weights[edge] = weight - data = pd.Series(edge_weights) - self._from_pandas_series(data=data, association=association) - - def _from_igraph(self, data, association): - if data.is_directed(): - warnings.warn("Currently `Symmetric` objects cannot handle directed graphs and will force into undirected configuration") - # assert isinstance(data, (ig.Graph)), "`If data` is a graph, it must be in {ig.Graph}" - assert_acceptable_arguments(association, self._acceptable_associations) - if association == "infer": - if association is None: - association = "network" - assert_acceptable_arguments(association, self._acceptable_associations) - - - # Propogate information from graph - for attr in ["name", "node_type", "edge_type", "func_metric"]: - if getattr(self, attr) is None: - try: - setattr(self, attr, getattr(data, attr)) - except (KeyError, AttributeError): - pass - # Weights - weights = data.es["weight"] - nodes = np.asarray(g.vs["name"]) - edges = list(map(lambda edge_indicies: frozenset(nodes[list(edge_indicies)]), data.get_edgelist())) - data = pd.Series(weights, index=edges) - - self._from_pandas_series(data=data, association=association) - - def _from_pandas_dataframe(self, data:pd.DataFrame, association, assert_symmetry, nans_ok, tol): - if assert_symmetry: - assert is_symmetrical(data, tol=tol), "`X` is not symmetric. Consider dropping the `tol` to a value such as `1e-10` or using `(X+X.T)/2` to force symmetry" - assert_acceptable_arguments(association, self._acceptable_associations) - if association == "infer": - association = self._infer_association(data) - self.association = association - self.nodes = pd.Index(data.index) - self.diagonal = pd.Series(np.diagonal(data), index=data.index, name="Diagonal")[self.nodes] - self.weights = dense_to_condensed(data, name="Weights", assert_symmetry=assert_symmetry, tol=tol) - self.edges = pd.Index(self.weights.index, name="Edges") - - def _from_pandas_series(self, data:pd.Series, association): - assert all(data.index.map(lambda edge: isinstance(edge, frozenset))), "If `data` is pd.Series then each key in the index must be a frozenset of size 2" - assert_acceptable_arguments(association, self._acceptable_associations) - if association == "infer": - association = None - self.association = association - # To ensure that the ordering is maintained and this is compatible with methods that use an unlabeled upper triangle, we must reindex and sort - self.nodes = pd.Index(sorted(frozenset.union(*data.index))) - - if self.remove_missing_values: - data = data.dropna() - self.edges = pd.Index(data.index, name="Edges") - self.weights = pd.Series(data, name="Weights") - else: - self.edges = pd.Index(map(frozenset, combinations(self.nodes, r=2)), name="Edges") - self.weights = pd.Series(data, name="Weights").reindex(self.edges) - if pd.notnull(self.fill_missing_values_with): - self.weights = self.weights.fillna(self.fill_missing_values_with) + + # Setting the diagonal def set_diagonal(self, diagonal): if diagonal is None: self.diagonal = None @@ -957,12 +946,15 @@ def __repr__(self): pad = 4 header = format_header("Symmetric(Name:{}, dtype: {})".format(self.name, self.weights.dtype),line_character="=") n = len(header.split("\n")[0]) + expected_edges_if_square = 0.5*(self.number_of_nodes**2 - self.number_of_nodes) fields = [ header, pad*" " + "* Number of nodes ({}): {}".format(self.node_type, self.number_of_nodes), pad*" " + "* Number of edges ({}): {}".format(self.edge_type, self.number_of_edges), - pad*" " + "* Association: {}".format(self.association), + pad*" " + "* Association: {}".format(self.association_type), + pad*" " + "* is_square: {}".format(expected_edges_if_square == self.number_of_edges), pad*" " + "* Memory: {}".format(format_memory(self.memory)), + *map(lambda line:pad*" " + line, format_header("| Weights", "-", n=n-pad).split("\n")), *map(lambda line: pad*" " + line, repr(self.weights).split("\n")[1:-1]), ] @@ -973,17 +965,22 @@ def __getitem__(self, key): """ `key` can be a node or non-string iterable of edges """ - + if is_nonstring_iterable(key): - assert len(key) >= 2, "`key` must have at least 2 identifiers. e.g. ('A','B')" + # assert len(key) >= 2, "`key` must have at least 2 identifiers. e.g. ('A','B')" key = frozenset(key) + + # Is it a diagonal value if len(key) == 1: + assert self.diagonal is not None, "Please use `set_diagonal` before querying single node edge weights" return self.diagonal[list(key)[0]] else: + # A list of nodes if len(key) > 2: key = list(map(frozenset, combinations(key, r=2))) - return self.weights[key] + return self.weights[key] else: + # Get all edges connected to a node if key in self.nodes: s = frozenset([key]) mask = self.edges.map(lambda x: bool(s & x)) @@ -1001,7 +998,7 @@ def __call__(self, key, func=np.sum): return func(self[key]) def __len__(self): - return self.number_of_nodes + return self.number_of_edges def __iter__(self): for v in self.weights: yield v @@ -1011,7 +1008,6 @@ def iteritems(self): return self.weights.iteritems() def keys(self): return self.weights.keys() - def apply(self, func): return func(self.weights) def mean(self): @@ -1041,19 +1037,81 @@ def map(self, func): def entropy(self, base=2): assert np.all(self.weights > 0), "All weights must be greater than 0" return stats.entropy(self.weights, base=base) - + + # ========== + # Network Metrics + # ========== + def edge_connectivity(self, node_subgraph=None, edge_subgraph=None, remove_self_interactions=True): + return self.to_pandas_series(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph) + + def node_connectivity(self, node_subgraph=None, edge_subgraph=None, groups:pd.Series=None, remove_self_interactions=True): + """ + Total node_connectivity is twice that of total edge_connectivity because of symmetry + """ + data = self.to_pandas_dataframe(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph) + assert np.all(data.values >= 0), "To use network metrics such as connectivity, density, centralization, and heterogeneity. All weights must ≥ 0." + return connectivity(data=data, groups=groups, remove_self_interactions=remove_self_interactions) + + def density(self, node_subgraph=None, edge_subgraph=None, remove_self_interactions=True): + data = self.to_pandas_dataframe(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph) + k = connectivity(data=data, remove_self_interactions=remove_self_interactions) + return density(k) + + def centralization(self, node_subgraph=None, edge_subgraph=None, remove_self_interactions=True): + data = self.to_pandas_dataframe(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph) + k = connectivity(data=data, remove_self_interactions=remove_self_interactions) + return centralization(k) + + def heterogeneity(self, node_subgraph=None, edge_subgraph=None, remove_self_interactions=True): + data = self.to_pandas_dataframe(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph) + k = connectivity(data=data, remove_self_interactions=remove_self_interactions) + return heterogeneity(k) + + def network_summary_statistics(self, node_subgraph=None, edge_subgraph=None,remove_self_interactions=True): + data = self.to_pandas_dataframe(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph) + k = connectivity(data=data, remove_self_interactions=remove_self_interactions) + return pd.Series(OrderedDict([ + + ("node_connectivity(∑)", k.sum()), + ("node_connectivity(µ)", k.mean()), + ("node_connectivity(σ)", k.std()), + ("density", density(k)), + ("centralization", centralization(k)), + ("heterogeneity", heterogeneity(k)), + ]), name=self.name) + + def topological_overlap_measure(self, into=pd.Series, node_subgraph=None, edge_subgraph=None, fill_diagonal=np.nan): + return topological_overlap_measure( + data=self.to_pandas_dataframe(node_subgraph=node_subgraph, edge_subgraph=edge_subgraph, fill_diagonal=np.nan), + fill_diagonal=fill_diagonal, + node_type=self.node_type, + ) # ========== # Conversion # ========== - def to_dense(self, index=None, fill_diagonal=None): - if fill_diagonal is None: - fill_diagonal=self.diagonal - if index is None: - index = self.nodes - return condensed_to_dense(y=self.weights, fill_diagonal=fill_diagonal, index=index) + # def to_dense(self, node_subgraph=None, fill_diagonal=None): + # if fill_diagonal is None: + # fill_diagonal = self.diagonal + # if node_subgraph is None: + # node_subgraph = self.nodes + # return condensed_to_dense(y=self.weights, fill_diagonal=fill_diagonal, index=node_subgraph) - def to_pandas_dataframe(self, index=None, fill_diagonal=None, vertical=False): - df = self.to_dense(index=index, fill_diagonal=fill_diagonal) + def to_pandas_dataframe(self, node_subgraph=None, edge_subgraph=None, fill_diagonal=None, vertical=False, **convert_network_kws): + # df = self.to_dense(node_subgraph=node_subgraph, fill_diagonal=fill_diagonal) + if fill_diagonal is None: + fill_diagonal = self.diagonal + if (node_subgraph is None) & (edge_subgraph is None): + node_subgraph = self.nodes + + df = convert_network( + data=self.weights, + into=pd.DataFrame, + node_subgraph=node_subgraph, + edge_subgraph=edge_subgraph, + fill_diagonal=fill_diagonal, + **convert_network_kws, + ) + if not vertical: return df else: @@ -1062,12 +1120,17 @@ def to_pandas_dataframe(self, index=None, fill_diagonal=None, vertical=False): df.index.name = "Edge_Index" return df - def to_condensed(self): - return self.weights +# def to_condensed(self): +# return self.weights - def to_pandas_series(self): - return self.to_condensed() - + def to_pandas_series(self, node_subgraph=None, edge_subgraph=None, **convert_network_kws): + return convert_network( + data=self.weights, + into=pd.Series, + node_subgraph=node_subgraph, + edge_subgraph=edge_subgraph, + **convert_network_kws, + ) # @check_packages(["ete3", "skbio"]) # def to_tree(self, method="average", into=None, node_prefix="y"): # assert self.association == "dissimilarity", "`association` must be 'dissimilarity' to construct tree" @@ -1082,27 +1145,45 @@ def to_pandas_series(self): # tree = into(newick=self.newick, name=self.name) # return name_tree_nodes(tree, node_prefix) - def to_networkx(self, into=None, **attrs): + def to_networkx(self, into=None, node_subgraph=None, edge_subgraph=None, **attrs): if into is None: into = nx.Graph - metadata = { "node_type":self.node_type, "edge_type":self.edge_type, "func_metric":self.func_metric} + metadata = {"name":self.name, "node_type":self.node_type, "edge_type":self.edge_type, "func_metric":self.func_metric, "association_type":self.association_type} metadata.update(attrs) - graph = into(name=self.name, **metadata) - - for (node_A, node_B), weight in self.weights.iteritems(): - graph.add_edge(node_A, node_B, weight=weight) +# graph = into(name=self.name, **metadata) + +# for (node_A, node_B), weight in self.weights.iteritems(): +# graph.add_edge(node_A, node_B, weight=weight) + graph = convert_network( + data=self.weights, + into=into, + node_subgraph=node_subgraph, + edge_subgraph=edge_subgraph, + ) + graph.graph.update(metadata) return graph - @check_packages(["igraph"]) - def to_igraph(self, directed=False, **attrs): - from igraph import Graph - graph = Graph(directed=directed) - graph.add_vertices(self.nodes) # this adds adjacency.shape[0] vertices - graph.add_edges(self.edges.map(tuple)) - graph.es['weight'] = self.weights.values - - for k, v in { "node_type":self.node_type, "edge_type":self.edge_type, "func_metric":self.func_metric}.items(): + def to_igraph(self, directed=False, node_subgraph=None, edge_subgraph=None, **attrs): + metadata = {"name":self.name, "node_type":self.node_type, "edge_type":self.edge_type, "func_metric":self.func_metric, "association_type":self.association_type} + metadata.update(attrs) + + # graph = ig.Graph(directed=directed) + # graph.add_vertices(self.nodes) # this adds adjacency.shape[0] vertices + # graph.add_edges(self.edges.map(tuple)) + # graph.es['weight'] = self.weights.values + + graph = convert_network( + data=self.weights, + into=ig.Graph, + node_subgraph=node_subgraph, + edge_subgraph=edge_subgraph, + ) + if directed: + graph.to_directed() + + for k, v in metadata.items(): graph[k] = v + return graph def to_file(self, path, **kwargs): @@ -1112,7 +1193,6 @@ def copy(self): return copy.deepcopy(self) - # ============================= # Feature Engineering # ============================= @@ -1575,18 +1655,18 @@ def fit( if metric in {"spearman", "pearson", "kendall"}: association = metric metric = lambda X: self._pandas_association(X=X, metric=association) + # def metric(X, metric): + # return self._pandas_association(X=X, metric=association) assert hasattr(metric, "__call__"), "`metric` must be either one of the following: [{}], \ a custom metric that returns an association (set `function_is_pairwise=False`), or a custom \ metric that returns a 2D square/symmetric pd.DataFrame (set `function_is_pairwise=True`)".format(acceptable_metrics) # Transformations - acceptable_transformations = {"signed", "abs"} + acceptable_transformations = {"abs"} if transformation: if isinstance(transformation, str): assert_acceptable_arguments(transformation, acceptable_transformations) - if transformation == "signed": - transformation = signed if transformation == "abs": transformation = np.abs assert hasattr(transformation, "__call__"), "`transformation` must be either one of the following: [{}] or a function(pd.DataFrame) -> pd.DataFrame".format(acceptable_transformations) @@ -1785,9 +1865,10 @@ def to_condensed(self, weight="mean", into=Symmetric): node_type=self.node_type, edge_type=self.edge_type, func_metric=self.metric_, - association="network", + association_type="network", assert_symmetry=self.assert_symmetry, - nans_ok=self.nans_ok, tol=self.tol, + nans_ok=self.nans_ok, + tol=self.tol, ) if into == Symmetric: return sym_network @@ -1968,6 +2049,7 @@ def fit( transformation=None, random_state=0, with_replacement=False, + include_reference_for_samplespecific=True, function_is_pairwise=True, stats_summary=[np.mean, np.var, stats.kurtosis, stats.skew], # Need to adjust for NaN robust stats_tests=[stats.normaltest], @@ -1993,8 +2075,42 @@ def fit( assert reference in y.unique(), "`reference({}, type:{}) not in `y`".format(reference, type(reference)) assert set(X.index) == set(y.index), "`X.index` must have same keys as `y.index`" - y = y[X.index] - + y = y[X.index].astype(str) + + # Data + self.classes_ = y.value_counts() + + if copy_X: + self.X_ = X.copy() + self.X_memory_ = X.memory_usage().sum() + self.memory_ += self.X_memory_ + + if copy_y: + self.y_ = y.copy() + + # Checks for index + if isinstance(X.index, pd.MultiIndex): # https://github.com/jolespin/ensemble_networkx/issues/2 + warnings.warn("SampleSpecificPerturbationNetwork cannot work with MultiIndex or tupled index. Converting MultiIndex to flat index strings. (i.e., X.index = y.index = X.index.to_flat_index().map(str))") + X.index = y.index = X.index.to_flat_index().map(str) + X.index.name = y.index.name = None # https://github.com/jolespin/ensemble_networkx/issues/1 + + if isinstance(X.columns, pd.MultiIndex): # https://github.com/jolespin/ensemble_networkx/issues/2 + warnings.warn("SampleSpecificPerturbationNetwork cannot work with MultiIndex or tupled index. Converting MultiIndex to flat index strings (i.e., X.columns = X.columns.to_flat_index().map(str))") + X.columns = X.columns.to_flat_index().map(str) + X.columns.name = None # https://github.com/jolespin/ensemble_networkx/issues/1 + + if include_reference_for_samplespecific: + y_clone = y[y == reference] + y_clone[y_clone == reference] = f"Reference({reference}[clone])" + y_clone.index = y_clone.index.map(lambda x: f"{x}[clone]") + y = pd.concat([y, y_clone]) + + X_clone = X.loc[y[y == reference].index] + X_clone.index = X_clone.index.map(lambda x: f"{x}[clone]") + X = pd.concat([X, X_clone], axis=0) + + reference = f"Reference({reference}[clone])" + # Ensemble Reference index_reference = sorted(y[lambda i: i == reference].index) ensemble_reference = EnsembleAssociationNetwork( @@ -2144,14 +2260,7 @@ def fit( self.stats_memory_ = self.stats_.nbytes self.memory_ += self.stats_memory_ - # Data - if copy_X: - self.X_ = X.copy() - self.X_memory_ = X.memory_usage().sum() - self.memory_ += self.X_memory_ - if copy_y: - self.y_ = y.copy() # Reference ensemble self.ensemble_reference_ = ensemble_reference @@ -2170,9 +2279,9 @@ def fit( name=self.name, dims=["Samples", "Iterations", "Edges"], coords={ - "Samples":index_samplespecific, - "Iterations":range(n_iter), - "Edges":edges, + "Samples":list(index_samplespecific), + "Iterations":list(range(n_iter)), + "Edges":list(edges), }, ) self.ensemble_memory_ = self.ensemble_.nbytes @@ -2222,7 +2331,7 @@ def to_condensed(self, sample, weight="mean", into=Symmetric): node_type=self.node_type, edge_type=self.edge_type, func_metric=self.metric_, - association="network", + association_type="network", assert_symmetry=self.assert_symmetry, nans_ok=self.nans_ok, tol=self.tol, @@ -2592,7 +2701,7 @@ def to_condensed(self, weight="mean", into=Symmetric): node_type=self.node_type, edge_type=self.edge_type, func_metric=self.metric_, - association="network", + association_type="network", assert_symmetry=self.assert_symmetry, nans_ok=self.nans_ok, tol=self.tol, diff --git a/setup.py b/setup.py old mode 100755 new mode 100644