Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

. #48

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open

. #48

Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 0 additions & 64 deletions ge/models/deepwalk.py

This file was deleted.

32 changes: 21 additions & 11 deletions ge/models/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@
from ..utils import preprocess_nxgraph


def line_loss(y_true, y_pred):
    """Negative mean log-sigmoid loss used to train the LINE model.

    Evaluates ``-mean(log(sigmoid(y_true * y_pred)))`` with the Keras
    backend ``K``.

    Args:
        y_true: Target tensor. Presumably +1 for observed edges and -1 for
            negative samples -- TODO confirm against the batch generator.
        y_pred: Score tensor produced by the model.

    Returns:
        The backend tensor holding the mean loss.
    """
    signed_score = y_true * y_pred
    log_likelihood = K.log(K.sigmoid(signed_score))
    return -K.mean(log_likelihood)


def create_model(numNodes, embedding_size, order='second'):
def create_model(numNodes, embedding_size, order='second'): ##no problem
shenweichen marked this conversation as resolved.
Show resolved Hide resolved

v_i = Input(shape=(1,))
v_j = Input(shape=(1,))
Expand All @@ -48,12 +48,18 @@ def create_model(numNodes, embedding_size, order='second'):

v_i_emb_second = second_emb(v_i)
v_j_context_emb = context_emb(v_j)

first = Lambda(lambda x: tf.reduce_sum(
x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
second = Lambda(lambda x: tf.reduce_sum(
x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])

try:
first = Lambda(lambda x: tf.reduce_sum(
x[0]*x[1], axis=-1, keepdims=False), name='first_order')([v_i_emb, v_j_emb])
except(TypeError):
first = Lambda(lambda x: tf.reduce_sum(
x[0]*x[1], axis=-1, keep_dims=False), name='first_order')([v_i_emb, v_j_emb])
try:
second = Lambda(lambda x: tf.reduce_sum(
x[0]*x[1], axis=-1, keepdims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
except(TypeError):
second = Lambda(lambda x: tf.reduce_sum(
x[0]*x[1], axis=-1, keep_dims=False), name='second_order')([v_i_emb_second, v_j_context_emb])
if order == 'first':
output_list = [first]
elif order == 'second':
Expand Down Expand Up @@ -205,9 +211,13 @@ def get_embeddings(self,):

return self._embeddings

def train(self, batch_size=1024, epochs=1, initial_epoch=0, verbose=1, times=1,
          workers=tf.data.experimental.AUTOTUNE, use_multiprocessing=True):
    """Train the model on the batches produced by ``self.batch_it``.

    Args:
        batch_size: Forwarded to ``reset_training_config``.
        epochs: Forwarded to Keras as ``epochs``.
        initial_epoch: Forwarded to Keras as ``initial_epoch``.
        verbose: Forwarded to Keras as ``verbose``.
        times: Forwarded to ``reset_training_config``.
        workers: Forwarded to Keras as ``workers``. The default,
            ``tf.data.experimental.AUTOTUNE``, is meant to let the
            framework pick the degree of parallelism; pass an explicit
            count to override it.
            NOTE(review): AUTOTUNE is a ``tf.data`` sentinel -- confirm
            that Keras interprets it as intended for ``workers``.
        use_multiprocessing: Forwarded to Keras as ``use_multiprocessing``.

    Returns:
        Whatever the Keras training call returns (the training history).
    """
    self.reset_training_config(batch_size, times)

    # Pick the entry point by capability instead of wrapping the call in a
    # bare ``except``: a blanket handler would also swallow
    # KeyboardInterrupt and genuine training errors, then silently start a
    # second training run through ``fit``.
    fit_fn = getattr(self.model, "fit_generator", None)
    if fit_fn is None:
        fit_fn = self.model.fit

    return fit_fn(self.batch_it, epochs=epochs, initial_epoch=initial_epoch,
                  steps_per_epoch=self.steps_per_epoch, verbose=verbose,
                  workers=workers, use_multiprocessing=use_multiprocessing)
96 changes: 54 additions & 42 deletions ge/models/node2vec.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,82 @@
# -*- coding:utf-8 -*-

"""



Author:

Weichen Shen,[email protected]



Reference:

[1] Grover A, Leskovec J. node2vec: Scalable feature learning for networks[C]//Proceedings of the 22nd ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2016: 855-864.(https://www.kdd.org/kdd2016/papers/files/rfp0218-groverA.pdf)



"""
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这一大块为啥删除了?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

修改的时候直接复制进来,给替换掉了。。。


from gensim.models import Word2Vec
import pandas as pd
import networkx as nx
import csrgraph as cg

from ..walker import RandomWalker


class Node2Vec:
import gc
import numba
import time
import numpy as np
import pandas as pd
from gensim.models import word2vec

def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, workers=1, use_rejection_sampling=0):
    """Set up a ``RandomWalker`` for ``graph`` and simulate the walks.

    Args:
        graph: Graph handed to ``RandomWalker`` and stored on the instance.
        walk_length: Forwarded to ``simulate_walks`` as ``walk_length``.
        num_walks: Forwarded to ``simulate_walks`` as ``num_walks``.
        p: Forwarded to ``RandomWalker`` as ``p``.
        q: Forwarded to ``RandomWalker`` as ``q``.
        workers: Forwarded to ``simulate_walks`` as ``workers``.
        use_rejection_sampling: Forwarded to ``RandomWalker``.
    """
    self.graph = graph
    self._embeddings = {}

    walker = RandomWalker(
        graph, p=p, q=q, use_rejection_sampling=use_rejection_sampling)
    self.walker = walker

    # Transition probabilities are prepared before any walk is simulated.
    print("Preprocess transition probs...")
    walker.preprocess_transition_probs()

    self.sentences = walker.simulate_walks(
        num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)
class Node2Vec:
    """node2vec embeddings from csrgraph random walks and gensim Word2Vec.

    Walks are generated on the csrgraph representation of the graph and fed
    to Word2Vec as stringified node ids. ``node_dict`` (filled in by
    ``train``) maps those string ids back to the names in ``graph.names``.
    """

    def __init__(self, graph, walk_length, num_walks, p=1.0, q=1.0, threads=1):
        """Validate the walk settings and wrap ``graph`` as a csrgraph.

        Args:
            graph: A ``cg.csrgraph`` instance, or any object that
                ``cg.csrgraph(...)`` accepts.
            walk_length: Passed to ``random_walks`` as ``walklen``; an int >= 1.
            num_walks: Passed to ``random_walks`` as ``epochs``; an int >= 1.
            p: Passed to ``random_walks`` as ``return_weight``.
            q: Passed to ``random_walks`` as ``neighbor_weight``.
            threads: Thread count handed to csrgraph; must be an int.

        Raises:
            ValueError: If ``threads``, ``walk_length`` or ``num_walks`` is
                not an int, if either walk setting is below 1, or if the
                first node name is not an int or str.
        """
        # Type checks come first so a non-int never reaches the ``< 1``
        # comparisons (which would raise TypeError instead of ValueError).
        # Exact-type checks are kept, so bool and int subclasses are rejected.
        if type(threads) is not int:
            raise ValueError('Threads must be int!')
        if type(walk_length) is not int or type(num_walks) is not int:
            raise ValueError('Walk length or num_walks must be int')
        if walk_length < 1:
            raise ValueError('Walk length must be >= 1')
        if num_walks < 1:
            raise ValueError('num_walks must be >= 1')

        self.walk_length = walk_length
        self.num_walks = num_walks
        self.p = p
        self.q = q
        self.threads = threads

        # Initialised here so ``get_embeddings`` can be called before
        # ``train`` without raising AttributeError.
        self._embeddings = {}
        self.sentences = None
        self.w2v_model = None
        self.node_dict = {}

        # TODO: numba-based rejection sampling is not implemented.
        # NOTE(review): the previous constructor also accepted ``workers``
        # and ``use_rejection_sampling``; ``workers`` is now taken by
        # ``train`` and rejection sampling has no equivalent here yet.

        # Always bind ``self.graph``, including when the caller already
        # passed a csrgraph.
        if not isinstance(graph, cg.csrgraph):
            graph = cg.csrgraph(graph, threads=self.threads)
        self.graph = graph
        if self.graph.threads != self.threads:
            self.graph.set_threads(self.threads)

        self.node_names = self.graph.names
        allowed_name_types = (int, str, np.int32, np.uint32, np.int64, np.uint64)
        # Only the first name is inspected; an empty graph skips the check
        # instead of failing on ``node_names[0]``.
        if len(self.node_names) > 0 and type(self.node_names[0]) not in allowed_name_types:
            raise ValueError("Graph node names must be int or str!")

    def train(self, embed_size=128, window_size=5, workers=3, iters=5, **kwargs):
        """Generate random walks and fit a skip-gram Word2Vec model on them.

        Args:
            embed_size: Passed to Word2Vec as ``size``.
            window_size: Passed to Word2Vec as ``window``.
            workers: Passed to Word2Vec as ``workers``.
            iters: Passed to Word2Vec as ``iter`` unless ``iter`` is given
                explicitly in ``kwargs`` (the keyword the earlier signature
                used).
            **kwargs: Extra keyword arguments for ``Word2Vec``. ``min_count``
                defaults to 0; ``size``, ``sg``, ``hs``, ``workers`` and
                ``window`` are always overwritten; ``sentences`` is ignored.

        Returns:
            The trained Word2Vec model (also stored as ``self.w2v_model``).
        """
        print('Start making random walks...')
        start = time.time()
        walks = self.graph.random_walks(
            walklen=self.walk_length,
            epochs=self.num_walks,
            return_weight=self.p,
            neighbor_weight=self.q,
        )
        # Word2Vec is fed lists of string tokens, so the walk array is
        # converted to strings and then to nested lists.
        self.sentences = walks.astype(str).tolist()
        end = time.time()
        print('Random walks uses ' + str(end - start) + ' seconds')

        # ``sentences`` is passed explicitly below; drop any copy in kwargs
        # so the keyword is not supplied twice.
        kwargs.pop("sentences", None)
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["size"] = embed_size
        kwargs["sg"] = 1
        kwargs["hs"] = 0  # node2vec does not use hierarchical softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        # Honour a legacy ``iter=...`` keyword instead of overwriting it.
        kwargs["iter"] = kwargs.get("iter", iters)
        # NOTE(review): ``size`` and ``iter`` are the gensim 3.x keyword
        # names -- confirm against the gensim version this project pins.

        print("Learning embedding vectors...")
        model = word2vec.Word2Vec(sentences=self.sentences, **kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = model
        # Walk tokens are positions "0", "1", ...; map each back to the
        # corresponding entry of ``graph.names``.
        self.node_dict = {
            str(position): name
            for position, name in enumerate(self.node_names)
        }

        return model

    def get_embeddings(self,):
        """Return ``{original node name: embedding vector}``.

        Returns:
            The mapping, or an empty dict (after printing a notice) when no
            model has been trained yet.
        """
        if self.w2v_model is None:
            print("model not train")
            return {}

        vectors = self.w2v_model.wv
        # The model vocabulary holds the string tokens used in the walks, so
        # vectors are looked up by token and stored under the original name.
        self._embeddings = {
            name: vectors[token]
            for token, name in self.node_dict.items()
        }

        return self._embeddings
9 changes: 2 additions & 7 deletions ge/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
def preprocess_nxgraph(graph):
    """Build index lookups for the nodes of ``graph``.

    Args:
        graph: Object whose ``nodes()`` method returns an iterable of
            hashable nodes (for example a networkx graph).

    Returns:
        A tuple ``(idx2node, node2idx)``: ``idx2node`` is the list of nodes
        in iteration order and ``node2idx`` maps each node to its position
        in that list.
    """
    # Materialise the nodes once so both lookups share the same ordering.
    idx2node = list(graph.nodes())
    node2idx = {node: idx for idx, node in enumerate(idx2node)}
    return idx2node, node2idx


Expand Down