Skip to content

Commit

Permalink
Gamma hyperparam (#60)
Browse files Browse the repository at this point in the history
* Add gamma parameter

* Use gamma to compute noisy edge threshold

* Prevent negative noise threshold, improve comments

* Rename avg_wts to noise_thresholds

* Bump version 2.0.2
  • Loading branch information
RemyLau authored Jan 11, 2022
1 parent abf77fb commit ac71ef7
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 49 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Configuring setup()
[metadata]
name = pecanpy
version = 2.0.2-dev
version = 2.0.2
description = A parallelized, efficient, and accelerated node2vec
long_description = file: README.md
long_description_content_type = text/markdown
Expand Down
10 changes: 9 additions & 1 deletion src/pecanpy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ def parse_args():
help="Use node2vec+ extension",
)

parser.add_argument(
"--gamma",
type=float,
default=0,
help="Noisy edge threshold parameter.",
)

return parser.parse_args()


Expand Down Expand Up @@ -234,6 +241,7 @@ def read_graph(args):
weighted = args.weighted
directed = args.directed
extend = args.extend
gamma = args.gamma
mode = args.mode
task = args.task

Expand All @@ -250,7 +258,7 @@ def read_graph(args):
exit()

pecanpy_mode = getattr(pecanpy, mode, None)
g = pecanpy_mode(p, q, workers, verbose, extend)
g = pecanpy_mode(p, q, workers, verbose, extend, gamma)

read_func = g.read_npz if fp.endswith(".npz") else g.read_edg
read_func(fp, weighted, directed)
Expand Down
49 changes: 27 additions & 22 deletions src/pecanpy/pecanpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class Base:
"""

def __init__(self, p, q, workers, verbose=False, extend=False):
def __init__(self, p, q, workers, verbose=False, extend=False, gamma=0):
"""Initialize node2vec base class.
Args:
Expand All @@ -53,7 +53,11 @@ def __init__(self, p, q, workers, verbose=False, extend=False):
workers (int): number of threads to be spawned for running node2vec
including walk generation and word2vec embedding.
verbose (bool): show progress bar for walk generation.
extend (bool): ``True`` if use node2vec+ extension, default is ``False``
extend (bool): use node2vec+ extension if set to :obj:`True`
(default: :obj:`False`).
gamma (float): Multiplication factor for the std term of edge
weights added to the average edge weights as the noisy edge
threshold, only used by node2vec+ (default: 0)
"""
super().__init__()
Expand All @@ -62,6 +66,7 @@ def __init__(self, p, q, workers, verbose=False, extend=False):
self.workers = workers
self.verbose = verbose
self.extend = extend
self.gamma = gamma

def _map_walk(self, walk_idx_ary):
"""Map walk from node index to node ID.
Expand Down Expand Up @@ -148,16 +153,16 @@ def setup_get_normalized_probs(self):
probability computation function ``get_extended_normalized_probs``,
if node2vec+ is used. Otherwise, return the normal transition function
``get_normalized_probs`` with a trivial placeholder for average edge
weights array ``avg_wts``.
weights array ``noise_thresholds``.
"""
if self.extend: # use n2v+
get_normalized_probs = self.get_extended_normalized_probs
avg_wts = self.get_average_weights()
noise_thresholds = self.get_noise_thresholds()
else: # use normal n2v
get_normalized_probs = self.get_normalized_probs
avg_wts = None
return get_normalized_probs, avg_wts
noise_thresholds = None
return get_normalized_probs, noise_thresholds

def preprocess_transition_probs(self):
"""Null default preprocess method."""
Expand Down Expand Up @@ -221,9 +226,9 @@ def embed(
class FirstOrderUnweighted(Base, SparseRWGraph):
"""Directly sample edges for first order random walks."""

def __init__(self, p, q, workers, verbose=False, extend=False):
def __init__(self, *args, **kwargs):
"""Initialize FirstOrderUnweighted mode."""
Base.__init__(self, p, q, workers, verbose, extend)
Base.__init__(self, *args, **kwargs)

def get_move_forward(self):
"""Wrap ``move_forward``."""
Expand All @@ -241,9 +246,9 @@ def move_forward(cur_idx, prev_idx=None):
class PreCompFirstOrder(Base, SparseRWGraph):
"""Precompute transition probabilities for first order random walks."""

def __init__(self, p, q, workers, verbose=False, extend=False):
def __init__(self, *args, **kwargs):
"""Initialize PreCompFirstOrder mode."""
Base.__init__(self, p, q, workers, verbose, extend)
Base.__init__(self, *args, **kwargs)
self.alias_j = self.alias_q = None

def get_move_forward(self):
Expand Down Expand Up @@ -304,9 +309,9 @@ class PreComp(Base, SparseRWGraph):
"""

def __init__(self, p, q, workers, verbose=False, extend=False):
def __init__(self, *args, **kwargs):
"""Initialize PreComp mode node2vec."""
Base.__init__(self, p, q, workers, verbose, extend)
Base.__init__(self, *args, **kwargs)
self.alias_j = self.alias_q = self.alias_indptr = self.alias_dim = None

def get_move_forward(self):
Expand Down Expand Up @@ -390,7 +395,7 @@ def preprocess_transition_probs(self):
q = self.q

# Retrieve transition probability computation callback function
get_normalized_probs, avg_wts = self.setup_get_normalized_probs()
get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()

# Determine the dimensionality of the 2nd order transition probs
n_nodes = self.indptr.size - 1 # number of nodes
Expand Down Expand Up @@ -423,7 +428,7 @@ def compute_all_transition_probs():
q,
idx,
nbr,
avg_wts,
noise_thresholds,
)

start = offset + dim * nbr_idx
Expand All @@ -444,9 +449,9 @@ class SparseOTF(Base, SparseRWGraph):
"""

def __init__(self, p, q, workers, verbose=False, extend=False):
def __init__(self, *args, **kwargs):
"""Initialize SparseOTF mode node2vec."""
Base.__init__(self, p, q, workers, verbose, extend)
Base.__init__(self, *args, **kwargs)

def get_move_forward(self):
"""Wrap ``move_forward``.
Expand All @@ -467,7 +472,7 @@ def get_move_forward(self):
p = self.p
q = self.q

get_normalized_probs, avg_wts = self.setup_get_normalized_probs()
get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()

@njit(nogil=True)
def move_forward(cur_idx, prev_idx=None):
Expand All @@ -480,7 +485,7 @@ def move_forward(cur_idx, prev_idx=None):
q,
cur_idx,
prev_idx,
avg_wts,
noise_thresholds,
)
cdf = np.cumsum(normalized_probs)
choice = np.searchsorted(cdf, np.random.random())
Expand All @@ -499,9 +504,9 @@ class DenseOTF(Base, DenseRWGraph):
"""

def __init__(self, p, q, workers, verbose=False, extend=False):
def __init__(self, *args, **kwargs):
"""Initialize DenseOTF mode node2vec."""
Base.__init__(self, p, q, workers, verbose, extend)
Base.__init__(self, *args, **kwargs)

def get_move_forward(self):
"""Wrap ``move_forward``.
Expand All @@ -521,7 +526,7 @@ def get_move_forward(self):
p = self.p
q = self.q

get_normalized_probs, avg_wts = self.setup_get_normalized_probs()
get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()

@njit(nogil=True)
def move_forward(cur_idx, prev_idx=None):
Expand All @@ -533,7 +538,7 @@ def move_forward(cur_idx, prev_idx=None):
q,
cur_idx,
prev_idx,
avg_wts,
noise_thresholds,
)
cdf = np.cumsum(normalized_probs)
choice = np.searchsorted(cdf, np.random.random())
Expand Down
31 changes: 19 additions & 12 deletions src/pecanpy/rw/dense_rw.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@
class DenseRWGraph(DenseGraph):
"""Dense Graph object equipped with random walk computation."""

def get_average_weights(self):
def get_noise_thresholds(self):
"""Compute per-node noisy edge thresholds (mean + gamma * std of edge weights, floored at 0)."""
deg_ary = self.data.sum(axis=1)
n_nbrs_ary = self.nonzero.sum(axis=1)
return deg_ary / n_nbrs_ary
num_nodes = len(self.IDlst)
average_weight_ary = np.zeros(num_nodes, dtype=np.float32)
for i in range(num_nodes):
weights = self.data[i, self.nonzero[i]]
average_weight_ary[i] = weights.mean() + self.gamma * weights.std()
average_weight_ary = np.maximum(average_weight_ary, 0)

return average_weight_ary

def get_has_nbrs(self):
"""Wrap ``has_nbrs``."""
Expand Down Expand Up @@ -87,14 +92,16 @@ def get_extended_normalized_probs(
if prev_idx is not None: # 2nd order biased walks
prev_nbrs_weight = data[prev_idx].copy()

inout_ind = cur_nbrs_ind & (prev_nbrs_weight < average_weight_ary)
inout_ind[prev_idx] = False # exclude previous state from out biases
# Note: we assume here the network is undirected, hence the edge
# weight connecting the next to prev is the same as the reverse.
out_ind = cur_nbrs_ind & (prev_nbrs_weight < average_weight_ary)
out_ind[prev_idx] = False # exclude previous state from out biases

# print("CURRENT: ", cur_idx)
# print("INOUT: ", np.where(inout_ind)[0])
# print("NUM INOUT: ", inout_ind.sum(), "\n")
# print("INOUT: ", np.where(out_ind)[0])
# print("NUM INOUT: ", out_ind.sum(), "\n")

t = prev_nbrs_weight[inout_ind] / average_weight_ary[inout_ind]
t = prev_nbrs_weight[out_ind] / average_weight_ary[out_ind]
# optional nonlinear parameterization
# b = 1; t = b * t / (1 - (b - 1) * t)

Expand All @@ -103,10 +110,10 @@ def get_extended_normalized_probs(

# suppress noisy edges
alpha[
unnormalized_probs[inout_ind] < average_weight_ary[cur_idx]
unnormalized_probs[out_ind] < average_weight_ary[cur_idx]
] = np.minimum(1, 1 / q)
unnormalized_probs[inout_ind] *= alpha # apply out biases
unnormalized_probs[prev_idx] /= p # apply the return bias
unnormalized_probs[out_ind] *= alpha # apply out biases
unnormalized_probs[prev_idx] /= p # apply the return bias

unnormalized_probs = unnormalized_probs[cur_nbrs_ind]
normalized_probs = unnormalized_probs / unnormalized_probs.sum()
Expand Down
32 changes: 19 additions & 13 deletions src/pecanpy/rw/sparse_rw.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,19 @@ def has_nbrs(idx):

return has_nbrs

def get_average_weights(self):
def get_noise_thresholds(self):
"""Compute per-node noisy edge thresholds (mean + gamma * std of edge weights, floored at 0)."""
data = self.data
indptr = self.indptr

num_nodes = len(self.IDlst)
average_weight_ary = np.zeros(num_nodes, dtype=np.float32)
for idx in range(num_nodes):
average_weight_ary[idx] = data[indptr[idx] : indptr[idx + 1]].mean()
for i in range(num_nodes):
average_weight_ary[i] = (
data[indptr[i] : indptr[i + 1]].mean()
+ self.gamma * data[indptr[i] : indptr[i + 1]].std()
)
average_weight_ary = np.maximum(average_weight_ary, 0)

return average_weight_ary

Expand Down Expand Up @@ -226,7 +230,7 @@ def isnotin(ptr_ary1, ptr_ary2):


@njit(nogil=True)
def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, noise_thresholds):
"""Find node2vec+ out edges.
The node2vec+ out edges are determined by considering the edge weights
Expand All @@ -242,8 +246,9 @@ def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
the neighbors of the previous state
wts_ary2 (:obj: `numpy.ndarray` of :obj:`float32`): array of edge
weights of the previous state
avg_wts (:obj: `numpy.ndarray` of :obj:`float32`): array of average
edge weights of each node
noise_thresholds (:obj: `numpy.ndarray` of :obj:`float32`): array of
noisy edge threshold computed based on the average and the std of
the edge weights of each node
Return:
Indicator of whether a neighbor of the current state is considered as
Expand All @@ -255,7 +260,7 @@ def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
t = np.zeros(ptr_ary1.size, dtype=np.float32)
idx2 = 0
for idx1 in range(ptr_ary1.size):
if idx2 == ptr_ary2.size: # end of ary2
if idx2 >= ptr_ary2.size: # end of ary2
break

ptr1 = ptr_ary1[idx1]
Expand All @@ -265,21 +270,22 @@ def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, avg_wts):
continue

elif ptr1 == ptr2: # found a matching value
if wts_ary2[idx2] >= avg_wts[ptr2]: # check if loose
# If connection is not loose, identify as an in-edge
if wts_ary2[idx2] >= noise_thresholds[ptr2]:
indicator[idx1] = False
else:
t[idx1] = wts_ary2[idx2] / avg_wts[ptr2]
t[idx1] = wts_ary2[idx2] / noise_thresholds[ptr2]
idx2 += 1

elif ptr1 > ptr2:
# sweep through ptr_ary2 until ptr2 catch up on ptr1
for j in range(idx2, ptr_ary2.size):
# Sweep through ptr_ary2 until ptr2 catches up to ptr1
for j in range(idx2 + 1, ptr_ary2.size):
ptr2 = ptr_ary2[j]
if ptr2 == ptr1:
if wts_ary2[j] >= avg_wts[ptr2]:
if wts_ary2[j] >= noise_thresholds[ptr2]:
indicator[idx1] = False
else:
t[idx1] = wts_ary2[j] / avg_wts[ptr2]
t[idx1] = wts_ary2[j] / noise_thresholds[ptr2]
idx2 = j + 1
break

Expand Down

0 comments on commit ac71ef7

Please sign in to comment.