diff --git a/setup.cfg b/setup.cfg
index 51e1749d..455719a4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@
 # Configuring setup()
 [metadata]
 name = pecanpy
-version = 2.0.2-dev
+version = 2.0.2
 description = A parallelized, efficient, and accelerated node2vec
 long_description = file: README.md
 long_description_content_type = text/markdown
diff --git a/src/pecanpy/cli.py b/src/pecanpy/cli.py
index 30baf85f..b74e6672 100755
--- a/src/pecanpy/cli.py
+++ b/src/pecanpy/cli.py
@@ -144,6 +144,13 @@ def parse_args():
         help="Use node2vec+ extension",
     )
 
+    parser.add_argument(
+        "--gamma",
+        type=float,
+        default=0,
+        help="Noisy edge threshold parameter.",
+    )
+
     return parser.parse_args()
 
 
@@ -234,6 +241,7 @@ def read_graph(args):
     weighted = args.weighted
     directed = args.directed
     extend = args.extend
+    gamma = args.gamma
     mode = args.mode
     task = args.task
 
@@ -250,7 +258,7 @@ def read_graph(args):
         exit()
 
     pecanpy_mode = getattr(pecanpy, mode, None)
-    g = pecanpy_mode(p, q, workers, verbose, extend)
+    g = pecanpy_mode(p, q, workers, verbose, extend, gamma)
 
     read_func = g.read_npz if fp.endswith(".npz") else g.read_edg
     read_func(fp, weighted, directed)
diff --git a/src/pecanpy/pecanpy.py b/src/pecanpy/pecanpy.py
index 58f589b2..4c69f006 100755
--- a/src/pecanpy/pecanpy.py
+++ b/src/pecanpy/pecanpy.py
@@ -41,7 +41,7 @@ class Base:
 
     """
 
-    def __init__(self, p, q, workers, verbose=False, extend=False):
+    def __init__(self, p, q, workers, verbose=False, extend=False, gamma=0):
         """Initializ node2vec base class.
 
         Args:
@@ -53,7 +53,11 @@ def __init__(self, p, q, workers, verbose=False, extend=False):
             workers (int): number of threads to be spawned for runing node2vec
                 including walk generation and word2vec embedding.
             verbose (bool): show progress bar for walk generation.
-            extend (bool): ``True`` if use node2vec+ extension, default is ``False``
+            extend (bool): use node2vec+ extension if set to :obj:`True`
+                (default: :obj:`False`).
+            gamma (float): Multiplication factor for the std term of edge
+                weights added to the average edge weights as the noisy edge
+                threshold, only used by node2vec+ (default: 0)
 
         """
         super().__init__()
@@ -62,6 +66,7 @@ def __init__(self, p, q, workers, verbose=False, extend=False):
         self.workers = workers
         self.verbose = verbose
         self.extend = extend
+        self.gamma = gamma
 
     def _map_walk(self, walk_idx_ary):
         """Map walk from node index to node ID.
@@ -148,16 +153,16 @@ def setup_get_normalized_probs(self):
         probability computation function ``get_extended_normalized_probs``,
         if node2vec+ is used. Otherwise, return the normal transition function
         ``get_noramlized_probs`` with a trivial placeholder for average edge
-        weights array ``avg_wts``.
+        weights array ``noise_thresholds``.
 
         """
         if self.extend:  # use n2v+
             get_normalized_probs = self.get_extended_normalized_probs
-            avg_wts = self.get_average_weights()
+            noise_thresholds = self.get_noise_thresholds()
         else:  # use normal n2v
             get_normalized_probs = self.get_normalized_probs
-            avg_wts = None
-        return get_normalized_probs, avg_wts
+            noise_thresholds = None
+        return get_normalized_probs, noise_thresholds
 
     def preprocess_transition_probs(self):
         """Null default preprocess method."""
@@ -221,9 +226,9 @@ def embed(
 class FirstOrderUnweighted(Base, SparseRWGraph):
     """Directly sample edges for first order random walks."""
 
-    def __init__(self, p, q, workers, verbose=False, extend=False):
+    def __init__(self, *args, **kwargs):
         """Initialize FirstOrderUnweighted mode."""
-        Base.__init__(self, p, q, workers, verbose, extend)
+        Base.__init__(self, *args, **kwargs)
 
     def get_move_forward(self):
         """Wrap ``move_forward``."""
@@ -241,9 +246,9 @@ def move_forward(cur_idx, prev_idx=None):
 class PreCompFirstOrder(Base, SparseRWGraph):
     """Precompute transition probabilities for first order random walks."""
 
-    def __init__(self, p, q, workers, verbose=False, extend=False):
+    def __init__(self, *args, **kwargs):
         """Initialize PreCompFirstOrder mode."""
-        Base.__init__(self, p, q, workers, verbose, extend)
+        Base.__init__(self, *args, **kwargs)
         self.alias_j = self.alias_q = None
 
     def get_move_forward(self):
@@ -304,9 +309,9 @@ class PreComp(Base, SparseRWGraph):
 
     """
 
-    def __init__(self, p, q, workers, verbose=False, extend=False):
+    def __init__(self, *args, **kwargs):
         """Initialize PreComp mode node2vec."""
-        Base.__init__(self, p, q, workers, verbose, extend)
+        Base.__init__(self, *args, **kwargs)
         self.alias_j = self.alias_q = self.alias_indptr = self.alias_dim = None
 
     def get_move_forward(self):
@@ -390,7 +395,7 @@ def preprocess_transition_probs(self):
         q = self.q
 
         # Retrieve transition probability computation callback function
-        get_normalized_probs, avg_wts = self.setup_get_normalized_probs()
+        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()
 
         # Determine the dimensionality of the 2nd order transition probs
         n_nodes = self.indptr.size - 1  # number of nodes
@@ -423,7 +428,7 @@ def compute_all_transition_probs():
                         q,
                         idx,
                         nbr,
-                        avg_wts,
+                        noise_thresholds,
                     )
 
                     start = offset + dim * nbr_idx
@@ -444,9 +449,9 @@ class SparseOTF(Base, SparseRWGraph):
 
     """
 
-    def __init__(self, p, q, workers, verbose=False, extend=False):
+    def __init__(self, *args, **kwargs):
         """Initialize PreComp mode node2vec."""
-        Base.__init__(self, p, q, workers, verbose, extend)
+        Base.__init__(self, *args, **kwargs)
 
     def get_move_forward(self):
         """Wrap ``move_forward``.
@@ -467,7 +472,7 @@ def get_move_forward(self):
         p = self.p
         q = self.q
 
-        get_normalized_probs, avg_wts = self.setup_get_normalized_probs()
+        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()
 
         @njit(nogil=True)
         def move_forward(cur_idx, prev_idx=None):
@@ -480,7 +485,7 @@ def move_forward(cur_idx, prev_idx=None):
                 q,
                 cur_idx,
                 prev_idx,
-                avg_wts,
+                noise_thresholds,
             )
             cdf = np.cumsum(normalized_probs)
             choice = np.searchsorted(cdf, np.random.random())
@@ -499,9 +504,9 @@ class DenseOTF(Base, DenseRWGraph):
 
     """
 
-    def __init__(self, p, q, workers, verbose=False, extend=False):
+    def __init__(self, *args, **kwargs):
         """Initialize DenseOTF mode node2vec."""
-        Base.__init__(self, p, q, workers, verbose, extend)
+        Base.__init__(self, *args, **kwargs)
 
     def get_move_forward(self):
         """Wrap ``move_forward``.
@@ -521,7 +526,7 @@ def get_move_forward(self):
         p = self.p
         q = self.q
 
-        get_normalized_probs, avg_wts = self.setup_get_normalized_probs()
+        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()
 
         @njit(nogil=True)
         def move_forward(cur_idx, prev_idx=None):
@@ -533,7 +538,7 @@ def move_forward(cur_idx, prev_idx=None):
                 q,
                 cur_idx,
                 prev_idx,
-                avg_wts,
+                noise_thresholds,
             )
             cdf = np.cumsum(normalized_probs)
             choice = np.searchsorted(cdf, np.random.random())
diff --git a/src/pecanpy/rw/dense_rw.py b/src/pecanpy/rw/dense_rw.py
index 0ea1fe98..a89a9b6c 100644
--- a/src/pecanpy/rw/dense_rw.py
+++ b/src/pecanpy/rw/dense_rw.py
@@ -7,11 +7,16 @@
 class DenseRWGraph(DenseGraph):
     """Dense Graph object equipped with random walk computation."""
 
-    def get_average_weights(self):
+    def get_noise_thresholds(self):
         """Compute average edge weights."""
-        deg_ary = self.data.sum(axis=1)
-        n_nbrs_ary = self.nonzero.sum(axis=1)
-        return deg_ary / n_nbrs_ary
+        num_nodes = len(self.IDlst)
+        average_weight_ary = np.zeros(num_nodes, dtype=np.float32)
+        for i in range(num_nodes):
+            weights = self.data[i, self.nonzero[i]]
+            average_weight_ary[i] = weights.mean() + self.gamma * weights.std()
+        average_weight_ary = np.maximum(average_weight_ary, 0)
+
+        return average_weight_ary
 
     def get_has_nbrs(self):
         """Wrap ``has_nbrs``."""
@@ -87,14 +92,16 @@ def get_extended_normalized_probs(
 
         if prev_idx is not None:  # 2nd order biased walks
             prev_nbrs_weight = data[prev_idx].copy()
-            inout_ind = cur_nbrs_ind & (prev_nbrs_weight < average_weight_ary)
-            inout_ind[prev_idx] = False  # exclude previous state from out biases
+            # Note: we assume here that the network is undirected, hence the
+            # edge weight connecting next to prev is the same as the reverse.
+            out_ind = cur_nbrs_ind & (prev_nbrs_weight < average_weight_ary)
+            out_ind[prev_idx] = False  # exclude previous state from out biases
 
             # print("CURRENT: ", cur_idx)
-            # print("INOUT: ", np.where(inout_ind)[0])
-            # print("NUM INOUT: ", inout_ind.sum(), "\n")
+            # print("INOUT: ", np.where(out_ind)[0])
+            # print("NUM INOUT: ", out_ind.sum(), "\n")
 
-            t = prev_nbrs_weight[inout_ind] / average_weight_ary[inout_ind]
+            t = prev_nbrs_weight[out_ind] / average_weight_ary[out_ind]
             # optional nonlinear parameterization
             # b = 1; t = b * t / (1 - (b - 1) * t)
 
@@ -103,10 +110,10 @@ def get_extended_normalized_probs(
 
             # suppress noisy edges
             alpha[
-                unnormalized_probs[inout_ind] < average_weight_ary[cur_idx]
+                unnormalized_probs[out_ind] < average_weight_ary[cur_idx]
             ] = np.minimum(1, 1 / q)
 
-            unnormalized_probs[inout_ind] *= alpha  # apply out biases
-            unnormalized_probs[prev_idx] /= p  # apply the return bias
+            unnormalized_probs[out_ind] *= alpha  # apply out biases
+            unnormalized_probs[prev_idx] /= p  # apply the return bias
 
             unnormalized_probs = unnormalized_probs[cur_nbrs_ind]
             normalized_probs = unnormalized_probs / unnormalized_probs.sum()
diff --git a/src/pecanpy/rw/sparse_rw.py b/src/pecanpy/rw/sparse_rw.py
index 265875dd..a04f7ebf 100644
--- a/src/pecanpy/rw/sparse_rw.py
+++ b/src/pecanpy/rw/sparse_rw.py
@@ -17,15 +17,19 @@ def has_nbrs(idx):
 
         return has_nbrs
 
-    def get_average_weights(self):
+    def get_noise_thresholds(self):
         """Compute average edge weights."""
         data = self.data
         indptr = self.indptr
         num_nodes = len(self.IDlst)
 
         average_weight_ary = np.zeros(num_nodes, dtype=np.float32)
-        for idx in range(num_nodes):
-            average_weight_ary[idx] = data[indptr[idx] : indptr[idx + 1]].mean()
+        for i in range(num_nodes):
+            average_weight_ary[i] = (
+                data[indptr[i] : indptr[i + 1]].mean()
+                + self.gamma * data[indptr[i] : indptr[i + 1]].std()
+            )
+        average_weight_ary = np.maximum(average_weight_ary, 0)
 
         return average_weight_ary
 
@@ -226,7 +230,7 @@ def isnotin(ptr_ary1, ptr_ary2):
 
 
 @njit(nogil=True)
-def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
+def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, noise_thresholds):
     """Find node2vec+ out edges.
 
     The node2vec+ out edges is determined by considering the edge weights
@@ -242,8 +246,9 @@ def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
            the neighbors of the previous state
         wts_ary2 (:obj: `numpy.ndarray` of :obj:`float32`): array of edge
             weights of the previous state
-        avg_wts (:obj: `numpy.ndarray` of :obj:`float32`): array of average
-            edge weights of each node
+        noise_thresholds (:obj: `numpy.ndarray` of :obj:`float32`): array of
+            noisy edge thresholds computed based on the average and the std
+            of the edge weights of each node
 
     Return:
         Indicator of whether a neighbor of the current state is considered as
@@ -255,7 +260,7 @@ def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
     t = np.zeros(ptr_ary1.size, dtype=np.float32)
     idx2 = 0
     for idx1 in range(ptr_ary1.size):
-        if idx2 == ptr_ary2.size:  # end of ary2
+        if idx2 >= ptr_ary2.size:  # end of ary2
             break
 
         ptr1 = ptr_ary1[idx1]
@@ -265,21 +270,22 @@ def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
             continue
 
         elif ptr1 == ptr2:  # found a matching value
-            if wts_ary2[idx2] >= avg_wts[ptr2]:  # check if loose
+            # If the connection is not loose, identify it as an in-edge
+            if wts_ary2[idx2] >= noise_thresholds[ptr2]:
                 indicator[idx1] = False
             else:
-                t[idx1] = wts_ary2[idx2] / avg_wts[ptr2]
+                t[idx1] = wts_ary2[idx2] / noise_thresholds[ptr2]
             idx2 += 1
 
         elif ptr1 > ptr2:
-            # sweep through ptr_ary2 until ptr2 catch up on ptr1
-            for j in range(idx2, ptr_ary2.size):
+            # Sweep through ptr_ary2 until ptr2 catches up with ptr1
+            for j in range(idx2 + 1, ptr_ary2.size):
                 ptr2 = ptr_ary2[j]
                 if ptr2 == ptr1:
-                    if wts_ary2[j] >= avg_wts[ptr2]:
+                    if wts_ary2[j] >= noise_thresholds[ptr2]:
                         indicator[idx1] = False
                     else:
-                        t[idx1] = wts_ary2[j] / avg_wts[ptr2]
+                        t[idx1] = wts_ary2[j] / noise_thresholds[ptr2]
                     idx2 = j + 1
                     break
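Note (illustrative sketch, not part of the patch): the snippet below spells out what the new noisy edge threshold evaluates to for a single node (mean edge weight plus gamma times the standard deviation, floored at zero) and how the new gamma parameter is passed through the Python API after this change. The file name "graph.edg" is a hypothetical placeholder; SparseOTF, read_edg, and simulate_walks are the existing pecanpy entry points, and the same value is exposed on the command line through the new --gamma flag (only meaningful together with --extend).

import numpy as np

from pecanpy import pecanpy


def noise_threshold(weights, gamma):
    # Per-node threshold as computed by get_noise_thresholds:
    # mean edge weight plus gamma times the std, floored at zero.
    return max(weights.mean() + gamma * weights.std(), 0)


# With gamma=1, only edges whose weight falls below mean + 1 * std of the
# node's edge weights remain candidates for the node2vec+ out-edge (loose) bias.
w = np.array([0.2, 0.9, 1.1, 3.0], dtype=np.float32)
print(noise_threshold(w, gamma=1.0))  # ~2.34

# Passing gamma through the Python API; node2vec+ is enabled via extend=True.
g = pecanpy.SparseOTF(p=1, q=1, workers=4, verbose=False, extend=True, gamma=1.0)
g.read_edg("graph.edg", weighted=True, directed=False)
walks = g.simulate_walks(num_walks=10, walk_length=80)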