diff --git a/network_evaluation_tools/network_propagation.py b/network_evaluation_tools/network_propagation.py index 125f7d6..2aa2bab 100644 --- a/network_evaluation_tools/network_propagation.py +++ b/network_evaluation_tools/network_propagation.py @@ -8,70 +8,76 @@ import pandas as pd import copy + # Normalize network (or network subgraph) for random walk propagation def normalize_network(network, symmetric_norm=False): - adj_mat = nx.adjacency_matrix(network) - adj_array = np.array(adj_mat.todense()) - if symmetric_norm: - D = np.diag(1/np.sqrt(sum(adj_array))) - adj_array_norm = np.dot(np.dot(D, adj_array), D) - else: - degree_norm_array = np.diag(1/sum(adj_array).astype(float)) - sparse_degree_norm_array = scipy.sparse.csr_matrix(degree_norm_array) - adj_array_norm = sparse_degree_norm_array.dot(adj_mat).toarray() - return adj_array_norm + adj_mat = nx.adjacency_matrix(network) + adj_array = np.array(adj_mat.todense()) + if symmetric_norm: + D = np.diag(1 / np.sqrt(sum(adj_array))) + adj_array_norm = np.dot(np.dot(D, adj_array), D) + else: + degree_norm_array = np.diag(1 / sum(adj_array).astype(float)) + sparse_degree_norm_array = scipy.sparse.csr_matrix(degree_norm_array) + adj_array_norm = sparse_degree_norm_array.dot(adj_mat).toarray() + return adj_array_norm + + # Note about normalizing by degree, if multiply by degree_norm_array first (D^-1 * A), then do not need to return # transposed adjacency array, it is already in the correct orientation # Calculate optimal propagation coefficient (updated model) def calculate_alpha(network, m=-0.02935302, b=0.74842057): - log_edge_count = np.log10(len(network.edges())) - alpha_val = round(m*log_edge_count+b,3) - if alpha_val <=0: - raise ValueError('Alpha <= 0 - Network Edge Count is too high') - # There should never be a case where Alpha >= 1, as avg node degree will never be negative - else: - return alpha_val + log_edge_count = np.log10(len(network.edges())) + alpha_val = round(m * log_edge_count + b, 3) + if alpha_val <= 0: + raise ValueError('Alpha <= 0 - Network Edge Count is too high') + # There should never be a case where Alpha >= 1, as avg node degree will never be negative + else: + return alpha_val + # Closed form random-walk propagation (as seen in HotNet2) for each subgraph: Ft = (1-alpha)*Fo * (I-alpha*norm_adj_mat)^-1 # Concatenate to previous set of subgraphs def fast_random_walk(alpha, binary_mat, subgraph_norm, prop_data): - term1=(1-alpha)*binary_mat - term2=np.identity(binary_mat.shape[1])-alpha*subgraph_norm - term2_inv = np.linalg.inv(term2) - subgraph_prop = np.dot(term1, term2_inv) - return np.concatenate((prop_data, subgraph_prop), axis=1) + term1 = (1 - alpha) * binary_mat + term2 = np.identity(binary_mat.shape[1]) - alpha * subgraph_norm + term2_inv = np.linalg.inv(term2) + subgraph_prop = np.dot(term1, term2_inv) + return np.concatenate((prop_data, subgraph_prop), axis=1) + # Wrapper for random walk propagation of full network by subgraphs -def closed_form_network_propagation(network, binary_matrix, network_alpha, symmetric_norm=False, verbose=False, save_path=None): - starttime=time.time() - if verbose: - print 'Alpha:', network_alpha - # Separate network into connected components and calculate propagation values of each sub-sample on each connected component - subgraphs = list(nx.connected_component_subgraphs(network)) - # Initialize propagation results by propagating first subgraph - subgraph = subgraphs[0] - subgraph_nodes = list(subgraph.nodes) - prop_data_node_order = list(subgraph_nodes) - binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) - subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) - prop_data_empty = np.zeros((binary_matrix_filt.shape[0], 1)) - prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data_empty) - # Get propagated results for remaining subgraphs - for subgraph in subgraphs[1:]: - subgraph_nodes = list(subgraph.nodes) - prop_data_node_order = prop_data_node_order + subgraph_nodes - binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) - subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) - prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data) - # Return propagated result as dataframe - prop_data_df = pd.DataFrame(data=prop_data[:,1:], index = binary_matrix.index, columns=prop_data_node_order) - if save_path is None: - if verbose: - print 'Network Propagation Complete:', time.time()-starttime, 'seconds' - return prop_data_df - else: - prop_data_df.to_csv(save_path) - if verbose: - print 'Network Propagation Complete:', time.time()-starttime, 'seconds' - return prop_data_df +def closed_form_network_propagation(network, binary_matrix, network_alpha, symmetric_norm=False, verbose=False, + save_path=None): + starttime = time.time() + if verbose: + print 'Alpha:', network_alpha + # Separate network into connected components and calculate propagation values of each sub-sample on each connected component + subgraphs = list(nx.connected_component_subgraphs(network)) + # Initialize propagation results by propagating first subgraph + subgraph = subgraphs[0] + subgraph_nodes = list(subgraph.nodes) + prop_data_node_order = list(subgraph_nodes) + binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) + subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) + prop_data_empty = np.zeros((binary_matrix_filt.shape[0], 1)) + prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data_empty) + # Get propagated results for remaining subgraphs + for subgraph in subgraphs[1:]: + subgraph_nodes = list(subgraph.nodes) + prop_data_node_order = prop_data_node_order + subgraph_nodes + binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) + subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) + prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data) + # Return propagated result as dataframe + prop_data_df = pd.DataFrame(data=prop_data[:, 1:], index=binary_matrix.index, columns=prop_data_node_order) + if save_path is None: + if verbose: + print 'Network Propagation Complete:', time.time() - starttime, 'seconds' + return prop_data_df + else: + prop_data_df.to_csv(save_path) + if verbose: + print 'Network Propagation Complete:', time.time() - starttime, 'seconds' + return prop_data_df diff --git a/test_suite/__init__.py b/test_suite/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test_suite/test_network_evaluation.py b/test_suite/test_network_evaluation.py new file mode 100644 index 0000000..3b97054 --- /dev/null +++ b/test_suite/test_network_evaluation.py @@ -0,0 +1,52 @@ +import math +import networkx as nx + +import pytest +from network_evaluation_tools import data_import_tools as dit +from network_evaluation_tools import network_evaluation_functions as nef +from network_evaluation_tools import network_propagation as prop +import pandas as pd +import numpy as np +import pickle + +network_test_file = '../Data/Networks/YoungvsOld_UP.csv' +disease_test_file = '../Data/Evaluations/DisGeNET_genesets.txt' +networkx_test_file = '../Data/NetworkCYJS/graph1_Young_Old_Fuzzy_95.pkl' + +AUPRC_values = {'Carcinoma, Lewis Lung': 0.5136054421768708, 'Fanconi Anemia': 0.5048184241212726, + 'Endometrial adenocarcinoma': 0.5036461554318696, 'Follicular adenoma': -1.0, + 'Intracranial Aneurysm': -1.0} +network = dit.load_network_file('../Data/Networks/YoungvsOld_UP.csv', delimiter=',', verbose=True) +genesets = dit.load_node_sets('../Data/Evaluations/DisGeNET_genesets.txt') +genesets = {'Carcinoma, Lewis Lung': genesets['Carcinoma, Lewis Lung'], + 'Fanconi Anemia': genesets['Fanconi Anemia'], + 'Endometrial adenocarcinoma': genesets['Endometrial adenocarcinoma'], + 'Follicular adenoma': genesets['Follicular adenoma'], + 'Intracranial Aneurysm': genesets['Intracranial Aneurysm'], + 'Muscle Weakness': genesets['Muscle Weakness'] + } +genesets_p = {'Carcinoma, Lewis Lung': 0.5921, + 'Fanconi Anemia': 0.5589, + 'Endometrial adenocarcinoma': 0.5921, + 'Follicular adenoma': 0.649, + 'Intracranial Aneurysm': float('inf'), + 'Muscle Weakness': float('inf')} +alpha = 0.684 + + +def test_construct_prop_kernel(): + """ + This test generates the kernel based on a specific network \ + of 206 nodes. + + :return: + """ + _network = dit.load_network_file(network_test_file, delimiter=',', verbose=True) + _gene_sets = dit.load_node_sets(disease_test_file) + _gene_sets_p = nef.calculate_p(_network, _gene_sets) # calculate the sub-sampling rate p for each node set + _alpha = prop.calculate_alpha(_network) # Calculate the Network Alpha + kernel = nef.construct_prop_kernel(_network, alpha=_alpha, verbose=True) + assert isinstance(kernel, pd.DataFrame) + assert kernel.shape == (len(_network.nodes), len(_network.nodes)) # Propagate using the random walk model + +