From 4a0ac3a48075444ec13fdcb9f67428b01d75dffe Mon Sep 17 00:00:00 2001
From: Madongshenme <137074046@qq.com>
Date: Fri, 19 Feb 2021 23:11:06 +0800
Subject: [PATCH 1/3] . .

---
 ge/models/deepwalk.py | 64 -----------------------------
 ge/models/line.py     | 32 ++++++++++-----
 ge/models/node2vec.py | 98 +++++++++++++++++++++++-------------------
 ge/utils.py           |  9 +---
 4 files changed, 79 insertions(+), 124 deletions(-)
 delete mode 100644 ge/models/deepwalk.py

diff --git a/ge/models/deepwalk.py b/ge/models/deepwalk.py
deleted file mode 100644
index d0fadc7..0000000
--- a/ge/models/deepwalk.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# -*- coding:utf-8 -*-
-
-"""
-
-
-
-Author:
-
-    Weichen Shen,wcshen1994@163.com
-
-
-
-Reference:
-
-    [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf)
-
-
-
-"""
-from ..walker import RandomWalker
-from gensim.models import Word2Vec
-import pandas as pd
-
-
-class DeepWalk:
-    def __init__(self, graph, walk_length, num_walks, workers=1):
-
-        self.graph = graph
-        self.w2v_model = None
-        self._embeddings = {}
-
-        self.walker = RandomWalker(
-            graph, p=1, q=1, )
-        self.sentences = self.walker.simulate_walks(
-            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
-
-    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
-
-        kwargs["sentences"] = self.sentences
-        kwargs["min_count"] = kwargs.get("min_count", 0)
-        kwargs["size"] = embed_size
-        kwargs["sg"] = 1  # skip gram
-        kwargs["hs"] = 1  # deepwalk use Hierarchical Softmax
-        kwargs["workers"] = workers
-        kwargs["window"] = window_size
-        kwargs["iter"] = iter
-
-        print("Learning embedding vectors...")
-        model = Word2Vec(**kwargs)
-        print("Learning embedding vectors done!")
-
-        self.w2v_model = model
-        return model
-
-    def get_embeddings(self,):
-        if self.w2v_model is None:
-            print("model not train")
-            return {}
-
-        self._embeddings = {}
-        for word in self.graph.nodes():
-            self._embeddings[word] = self.w2v_model.wv[word]
-
-        return self._embeddings

diff --git a/ge/models/line.py b/ge/models/line.py
index 04c5073..c1634da 100644
--- a/ge/models/line.py
+++ b/ge/models/line.py
@@ -30,11 +30,11 @@ from ..utils import preprocess_nxgraph
 
 
-def line_loss(y_true, y_pred):
+def line_loss(y_true, y_pred): ##no problem
     return -K.mean(K.log(K.sigmoid(y_true*y_pred)))
 
 
-def create_model(numNodes, embedding_size, order='second'):
+def create_model(numNodes, embedding_size, order='second'): ##no problem
 
     v_i = Input(shape=(1,))
     v_j = Input(shape=(1,))
@@ -48,12 +48,18 @@ def create_model(numNodes, embedding_size, order='second'):
     v_i_emb_second = second_emb(v_i)
     v_j_context_emb = context_emb(v_j)
 
-    first = Lambda(lambda x: tf.reduce_sum(
-        x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
-    second = Lambda(lambda x: tf.reduce_sum(
-        x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
-
+    try:
+        first = Lambda(lambda x: tf.reduce_sum(
+            x[0]*x[1], axis=-1, keepdims=False), name='first_order')([v_i_emb, v_j_emb])
+    except TypeError:
+        first = Lambda(lambda x: tf.reduce_sum(
+            x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
+    try:
+        second = Lambda(lambda x: tf.reduce_sum(
+            x[0]*x[1], axis=-1, keepdims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
+    except TypeError:
+        second = Lambda(lambda x: tf.reduce_sum(
+            x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
     if order == 'first':
         output_list = [first]
     elif order == 'second':
@@ -205,9 +211,13 @@ def get_embeddings(self,):
 
         return self._embeddings
 
-    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1):
+    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1, workers=tf.data.experimental.AUTOTUNE, use_multiprocessing=True):
         self.reset_training_config(batch_size, times)
-        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
-                                        verbose=verbose)
+        try:
+            hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
+                                            verbose=verbose, workers=workers, use_multiprocessing=use_multiprocessing)
+        except AttributeError:
+            hist = self.model.fit(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
+                                  verbose=verbose, workers=workers, use_multiprocessing=use_multiprocessing)
 
         return hist
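The two fallbacks above cover separate API breaks: TensorFlow renamed reduce_sum's keep_dims argument to keepdims, and TF 2 removed Model.fit_generator in favor of Model.fit, which accepts generators directly. Dispatching on the installed version is an alternative to catching exceptions; a minimal sketch, assuming only that tensorflow is importable (the helper name fit_compat is hypothetical, not part of this patch):

    import tensorflow as tf

    def fit_compat(model, batch_it, **fit_kwargs):
        # TF 1.x needs fit_generator for generator input; in TF 2.x,
        # fit() accepts generators and fit_generator was later removed.
        if tf.__version__.startswith('1.'):
            return model.fit_generator(batch_it, **fit_kwargs)
        return model.fit(batch_it, **fit_kwargs)

Note also that tf.data.experimental.AUTOTUNE is the constant -1, which Model.fit does not document as a valid value for workers; a small positive integer is the safer default.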
diff --git a/ge/models/node2vec.py b/ge/models/node2vec.py
index 16f86cb..d0d4e7b 100644
--- a/ge/models/node2vec.py
+++ b/ge/models/node2vec.py
@@ -1,62 +1,76 @@
 # -*- coding:utf-8 -*-
-
-"""
-
-
-
-Author:
-
-    Weichen Shen,wcshen1994@163.com
-
-
-
-Reference:
-
-    [1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf)
-
-
-
-"""
-
-from gensim.models import Word2Vec
 import pandas as pd
+import networkx as nx
+import csrgraph as cg
 
-from ..walker import RandomWalker
+import gc
+import numba
+import time
+import numpy as np
+import pandas as pd
+from gensim.models import word2vec
 
 
 class Node2Vec:
 
-    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):
-
-        self.graph = graph
-        self._embeddings = {}
-        self.walker = RandomWalker(
-            graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling)
-
-        print("Preprocess transition probs...")
-        self.walker.preprocess_transition_probs()
-
-        self.sentences = self.walker.simulate_walks(
-            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
+    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, threads=1):
+
+        if type(threads) is not int:
+            raise ValueError('Threads must be int!')
+        if walk_length < 1:
+            raise ValueError('Walk length must be >= 1')
+        if num_walks < 1:
+            raise ValueError('num_walks must be >= 1')
+        if type(walk_length) is not int or type(num_walks) is not int:
+            raise ValueError('Walk length and num_walks must be int')
+
+        self.walk_length = walk_length
+        self.num_walks = num_walks
+        self.p = p
+        self.q = q
+        self.threads = threads
+        # todo: numba-based use_rejection_sampling
+
+        if not isinstance(graph, cg.csrgraph):
+            self.graph = cg.csrgraph(graph, threads=self.threads)
+        else:
+            self.graph = graph
+        if self.graph.threads != self.threads:
+            self.graph.set_threads(self.threads)
+        self.node_names = self.graph.names
+        if type(self.node_names[0]) not in [int, str, np.int32, np.uint32,
+                                            np.int64, np.uint64]:
+            raise ValueError("Graph node names must be int or str!")
 
-    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
+    def train(self, embed_size=128, window_size=5, workers=3, iters=5, **kwargs):
+        print('Start making random walks...')
+        start = time.time()
+        self.sentences = self.graph.random_walks(walklen=self.walk_length, epochs=self.num_walks,
+                                                 return_weight=self.p, neighbor_weight=self.q).astype(str).tolist()  # gensim's Word2Vec accepts only lists of string tokens
+        end = time.time()
+        print('Random walks took ' + str(end - start) + ' seconds')
 
-        kwargs["sentences"] = self.sentences
         kwargs["min_count"] = kwargs.get("min_count", 0)
         kwargs["size"] = embed_size
         kwargs["sg"] = 1
-        kwargs["hs"] = 0  # node2vec not use Hierarchical Softmax
+        kwargs["hs"] = 0  # node2vec does not use Hierarchical Softmax
         kwargs["workers"] = workers
         kwargs["window"] = window_size
-        kwargs["iter"] = iter
+        kwargs["iter"] = iters
 
         print("Learning embedding vectors...")
-        model = Word2Vec(**kwargs)
+        model = word2vec.Word2Vec(sentences=self.sentences, **kwargs)  # pass sentences directly rather than via kwargs, to avoid copying them
         print("Learning embedding vectors done!")
 
         self.w2v_model = model
+        self.node_dict = dict(zip(np.arange(len(self.node_names)).astype(str), self.node_names))  # map walk token ids back to the original node names
 
-        return model
-
     def get_embeddings(self,):
         if self.w2v_model is None:
@@ -64,7 +78,7 @@
             return {}
 
         self._embeddings = {}
-        for word in self.graph.nodes():
-            self._embeddings[word] = self.w2v_model.wv[word]
+        for word in self.node_dict.keys():
+            self._embeddings[self.node_dict[word]] = self.w2v_model.wv[word]
 
         return self._embeddings
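For a quick check of the csrgraph-backed class above, a minimal smoke test; a sketch assuming networkx is installed and gensim is below version 4 (the size/iter keyword names used in train target the pre-4 API), with the graph choice purely illustrative:

    import networkx as nx

    from ge import Node2Vec

    # Zachary's karate club: 34 integer-named nodes, small enough to verify quickly
    G = nx.karate_club_graph()
    model = Node2Vec(G, walk_length=10, num_walks=80, p=0.25, q=4, threads=4)
    model.train(embed_size=64, window_size=5, iters=3)
    embeddings = model.get_embeddings()  # keys are the original node names
    print(len(embeddings))  # 34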
diff --git a/ge/utils.py b/ge/utils.py
index 8929eec..946a691 100644
--- a/ge/utils.py
+++ b/ge/utils.py
@@ -1,11 +1,6 @@
 def preprocess_nxgraph(graph):
-    node2idx = {}
-    idx2node = []
-    node_size = 0
-    for node in graph.nodes():
-        node2idx[node] = node_size
-        idx2node.append(node)
-        node_size += 1
+    idx2node = list(graph.nodes())
+    node2idx = dict(zip(idx2node, list(range(len(idx2node)))))
     return idx2node, node2idx
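The zip over range(len(...)) can also be written with enumerate; an equivalent version of the same function, shown only for comparison:

    def preprocess_nxgraph(graph):
        idx2node = list(graph.nodes())
        node2idx = {node: idx for idx, node in enumerate(idx2node)}
        return idx2node, node2idx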
From 4ea51c3370f77795c17fef49ec9e6fdf5df36746 Mon Sep 17 00:00:00 2001
From: wangbingnan136 <42114835+wangbingnan136@users.noreply.github.com>
Date: Thu, 8 Jul 2021 08:27:43 +0800
Subject: [PATCH 2/3] update deepwalk

---
 ge/models/deepwalk.py | 81 ++++++++++++++++++++++++++++++++++++++
 ge/models/line.py     | 32 +++++----------
 ge/models/node2vec.py | 98 +++++++++++++++++++-----------------------
 ge/utils.py           |  9 +++-
 4 files changed, 141 insertions(+), 79 deletions(-)
 create mode 100644 ge/models/deepwalk.py

diff --git a/ge/models/deepwalk.py b/ge/models/deepwalk.py
new file mode 100644
index 0000000..9aa0dfa
--- /dev/null
+++ b/ge/models/deepwalk.py
@@ -0,0 +1,81 @@
+# -*- coding:utf-8 -*-
+
+"""
+
+
+
+Author:
+
+    Weichen Shen,wcshen1994@163.com
+
+
+
+Reference:
+
+    [1] Perozzi B, Al-Rfou R, Skiena S. Deepwalk: Online learning of social representations[C]//Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2014: 701-710.(http://www.perozzi.net/publications/14_kdd_deepwalk.pdf)
+
+
+
+"""
+from ..walker import RandomWalker
+from gensim.models import Word2Vec
+import pandas as pd
+import numpy as np
+from csrgraph import csrgraph
+class DeepWalk:
+    def __init__(self, graph, walk_length, num_walks, workers=1, use_csrgraph=False):
+        self.use_csrgraph = use_csrgraph
+        self.w2v_model = None
+        self._embeddings = {}
+
+        if self.use_csrgraph:
+            node_names = list(graph.nodes())
+            self.graph = csrgraph(graph, nodenames=node_names, threads=workers)
+
+            self.sentences = pd.DataFrame(self.graph.random_walks(
+                epochs=num_walks, walklen=walk_length, return_weight=1., neighbor_weight=1.))
+            # Map nodeId -> node name
+            node_dict = dict(zip(np.arange(len(node_names)), node_names))
+
+            for col in self.sentences.columns:
+                self.sentences[col] = self.sentences[col].map(node_dict).astype(str)
+            # Somehow gensim only trains on this list iterator;
+            # it silently mistrains on array input
+            self.sentences = [list(x) for x in self.sentences.itertuples(False, None)]
+
+        else:
+            self.graph = graph
+            self.walker = RandomWalker(
+                graph, p=1, q=1, )
+            self.sentences = self.walker.simulate_walks(
+                num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
+
+    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
+
+        kwargs["sentences"] = self.sentences
+        kwargs["min_count"] = kwargs.get("min_count", 0)
+        kwargs["vector_size"] = embed_size
+        kwargs["sg"] = 1  # skip gram
+        kwargs["hs"] = 1  # deepwalk uses Hierarchical Softmax
+        kwargs["workers"] = workers
+        kwargs["window"] = window_size
+        kwargs["epochs"] = iter
+
+        print("Learning embedding vectors...")
+        model = Word2Vec(**kwargs)
+        print("Learning embedding vectors done!")
+
+        self.w2v_model = model
+
+        return model
+
+    def get_embeddings(self,):
+        if self.w2v_model is None:
+            print("model not train")
+            return {}
+
+        self._embeddings = {}
+        for word in self.graph.nodes():
+            self._embeddings[word] = self.w2v_model.wv[word]
+
+        return self._embeddings
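The new deepwalk.py uses the gensim 4 keyword names (vector_size, epochs), while the node2vec.py restored below still uses the pre-4 names (size, iter). If both files must run against one installed gensim, a version shim is one option; a sketch assuming only that gensim is importable (the helper name w2v_kwargs is hypothetical, not part of this patch):

    import gensim

    def w2v_kwargs(embed_size, n_epochs):
        # gensim 4.0 renamed Word2Vec's `size` to `vector_size` and `iter` to `epochs`
        if int(gensim.__version__.split('.')[0]) >= 4:
            return {'vector_size': embed_size, 'epochs': n_epochs}
        return {'size': embed_size, 'iter': n_epochs}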
diff --git a/ge/models/line.py b/ge/models/line.py
index c1634da..04c5073 100644
--- a/ge/models/line.py
+++ b/ge/models/line.py
@@ -30,11 +30,11 @@ from ..utils import preprocess_nxgraph
 
 
-def line_loss(y_true, y_pred): ##no problem
+def line_loss(y_true, y_pred):
     return -K.mean(K.log(K.sigmoid(y_true*y_pred)))
 
 
-def create_model(numNodes, embedding_size, order='second'): ##no problem
+def create_model(numNodes, embedding_size, order='second'):
 
     v_i = Input(shape=(1,))
     v_j = Input(shape=(1,))
@@ -48,18 +48,12 @@ def create_model(numNodes, embedding_size, order='second'): ##no problem
     v_i_emb_second = second_emb(v_i)
     v_j_context_emb = context_emb(v_j)
 
-    try:
-        first = Lambda(lambda x: tf.reduce_sum(
-            x[0]*x[1], axis=-1, keepdims=False), name='first_order')([v_i_emb, v_j_emb])
-    except TypeError:
-        first = Lambda(lambda x: tf.reduce_sum(
-            x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
-    try:
-        second = Lambda(lambda x: tf.reduce_sum(
-            x[0]*x[1], axis=-1, keepdims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
-    except TypeError:
-        second = Lambda(lambda x: tf.reduce_sum(
-            x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
+
+    first = Lambda(lambda x: tf.reduce_sum(
+        x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
+    second = Lambda(lambda x: tf.reduce_sum(
+        x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
+
     if order == 'first':
         output_list = [first]
     elif order == 'second':
@@ -211,13 +205,9 @@ def get_embeddings(self,):
 
         return self._embeddings
 
-    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1, workers=tf.data.experimental.AUTOTUNE, use_multiprocessing=True):
+    def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1):
         self.reset_training_config(batch_size, times)
-        try:
-            hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
-                                            verbose=verbose, workers=workers, use_multiprocessing=use_multiprocessing)
-        except AttributeError:
-            hist = self.model.fit(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
-                                  verbose=verbose, workers=workers, use_multiprocessing=use_multiprocessing)
+        hist = self.model.fit_generator(self.batch_it, epochs=epochs, initial_epoch=initial_epoch, steps_per_epoch=self.steps_per_epoch,
+                                        verbose=verbose)
 
         return hist
diff --git a/ge/models/node2vec.py b/ge/models/node2vec.py
index d0d4e7b..16f86cb 100644
--- a/ge/models/node2vec.py
+++ b/ge/models/node2vec.py
@@ -1,76 +1,62 @@
 # -*- coding:utf-8 -*-
-import pandas as pd
-import networkx as nx
-import csrgraph as cg
+
+"""
+
+
+
+Author:
+
+    Weichen Shen,wcshen1994@163.com
+
+
+
+Reference:
+
+    [1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf)
+
+
+
+"""
+
+from gensim.models import Word2Vec
+import pandas as pd
 
-import gc
-import numba
-import time
-import numpy as np
-import pandas as pd
-from gensim.models import word2vec
+from ..walker import RandomWalker
 
 
 class Node2Vec:
 
-    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, threads=1):
-
-        if type(threads) is not int:
-            raise ValueError('Threads must be int!')
-        if walk_length < 1:
-            raise ValueError('Walk length must be >= 1')
-        if num_walks < 1:
-            raise ValueError('num_walks must be >= 1')
-        if type(walk_length) is not int or type(num_walks) is not int:
-            raise ValueError('Walk length and num_walks must be int')
-
-        self.walk_length = walk_length
-        self.num_walks = num_walks
-        self.p = p
-        self.q = q
-        self.threads = threads
-        # todo: numba-based use_rejection_sampling
-
-        if not isinstance(graph, cg.csrgraph):
-            self.graph = cg.csrgraph(graph, threads=self.threads)
-        else:
-            self.graph = graph
-        if self.graph.threads != self.threads:
-            self.graph.set_threads(self.threads)
-        self.node_names = self.graph.names
-        if type(self.node_names[0]) not in [int, str, np.int32, np.uint32,
-                                            np.int64, np.uint64]:
-            raise ValueError("Graph node names must be int or str!")
+    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):
+
+        self.graph = graph
+        self._embeddings = {}
+        self.walker = RandomWalker(
+            graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling)
+
+        print("Preprocess transition probs...")
+        self.walker.preprocess_transition_probs()
+
+        self.sentences = self.walker.simulate_walks(
+            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
 
-    def train(self, embed_size=128, window_size=5, workers=3, iters=5, **kwargs):
-        print('Start making random walks...')
-        start = time.time()
-        self.sentences = self.graph.random_walks(walklen=self.walk_length, epochs=self.num_walks,
-                                                 return_weight=self.p, neighbor_weight=self.q).astype(str).tolist()  # gensim's Word2Vec accepts only lists of string tokens
-        end = time.time()
-        print('Random walks took ' + str(end - start) + ' seconds')
+    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
 
+        kwargs["sentences"] = self.sentences
         kwargs["min_count"] = kwargs.get("min_count", 0)
         kwargs["size"] = embed_size
         kwargs["sg"] = 1
-        kwargs["hs"] = 0  # node2vec does not use Hierarchical Softmax
+        kwargs["hs"] = 0  # node2vec not use Hierarchical Softmax
         kwargs["workers"] = workers
         kwargs["window"] = window_size
-        kwargs["iter"] = iters
+        kwargs["iter"] = iter
 
         print("Learning embedding vectors...")
-        model = word2vec.Word2Vec(sentences=self.sentences, **kwargs)  # pass sentences directly rather than via kwargs, to avoid copying them
+        model = Word2Vec(**kwargs)
         print("Learning embedding vectors done!")
 
         self.w2v_model = model
-        self.node_dict = dict(zip(np.arange(len(self.node_names)).astype(str), self.node_names))  # map walk token ids back to the original node names
 
+        return model
+
     def get_embeddings(self,):
         if self.w2v_model is None:
@@ -78,7 +64,7 @@
             return {}
 
         self._embeddings = {}
-        for word in self.node_dict.keys():
-            self._embeddings[self.node_dict[word]] = self.w2v_model.wv[word]
+        for word in self.graph.nodes():
+            self._embeddings[word] = self.w2v_model.wv[word]
 
         return self._embeddings

diff --git a/ge/utils.py b/ge/utils.py
index 946a691..8929eec 100644
--- a/ge/utils.py
+++ b/ge/utils.py
@@ -1,6 +1,11 @@
 def preprocess_nxgraph(graph):
-    idx2node = list(graph.nodes())
-    node2idx = dict(zip(idx2node, list(range(len(idx2node)))))
+    node2idx = {}
+    idx2node = []
+    node_size = 0
+    for node in graph.nodes():
+        node2idx[node] = node_size
+        idx2node.append(node)
+        node_size += 1
     return idx2node, node2idx

From 1fae0a956b8dd00531b745f3e1a5730cd3b0b9cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B5=85=E6=A2=A6?=
Date: Sun, 11 Jul 2021 16:19:35 +0800
Subject: [PATCH 3/3] add example of deepwalk with csrgraph

---
 examples/deepwalk_wiki_csrgraph.py | 53 ++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 examples/deepwalk_wiki_csrgraph.py

diff --git a/examples/deepwalk_wiki_csrgraph.py b/examples/deepwalk_wiki_csrgraph.py
new file mode 100644
index 0000000..28a973f
--- /dev/null
+++ b/examples/deepwalk_wiki_csrgraph.py
@@ -0,0 +1,53 @@
+
+import numpy as np
+
+from ge.classify import read_node_label, Classifier
+from ge import DeepWalk
+from sklearn.linear_model import LogisticRegression
+
+import matplotlib.pyplot as plt
+import networkx as nx
+from sklearn.manifold import TSNE
+
+
+def evaluate_embeddings(embeddings):
+    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
+    tr_frac = 0.8
+    print("Training classifier using {:.2f}% nodes...".format(
+        tr_frac * 100))
+    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
+    clf.split_train_evaluate(X, Y, tr_frac)
+
+
+def plot_embeddings(embeddings,):
+    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
+
+    emb_list = []
+    for k in X:
+        emb_list.append(embeddings[k])
+    emb_list = np.array(emb_list)
+
+    model = TSNE(n_components=2)
+    node_pos = model.fit_transform(emb_list)
+
+    color_idx = {}
+    for i in range(len(X)):
+        color_idx.setdefault(Y[i][0], [])
+        color_idx[Y[i][0]].append(i)
+
+    for c, idx in color_idx.items():
+        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
+    plt.legend()
+    plt.show()
+
+
+if __name__ == "__main__":
+    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
+                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
+
+    model = DeepWalk(G, walk_length=10, num_walks=80, workers=1, use_csrgraph=True)
+    model.train(window_size=5, iter=3)
+    embeddings = model.get_embeddings()
+
+    evaluate_embeddings(embeddings)
+    #plot_embeddings(embeddings)
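A natural extension of the example above is to time the two DeepWalk backends on the same graph; a sketch reusing the script's own G and imports (absolute timings vary by machine, so none are claimed here):

    import time

    for use_csr in (False, True):
        t0 = time.time()
        m = DeepWalk(G, walk_length=10, num_walks=80, workers=1, use_csrgraph=use_csr)
        m.train(window_size=5, iter=3)
        print('use_csrgraph=%s: %.1f s' % (use_csr, time.time() - t0))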