Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

. #48

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open

. #48

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions examples/deepwalk_wiki_csrgraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

import numpy as np

from ge.classify import read_node_label, Classifier
from ge import DeepWalk
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE


def evaluate_embeddings(embeddings):
    """Fit a logistic-regression node classifier on the learned
    embeddings and print its evaluation metrics.

    :param embeddings: dict mapping node name -> embedding vector.
    """
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    train_fraction = 0.8
    print("Training classifier using {:.2f}% nodes...".format(train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, train_fraction)


def plot_embeddings(embeddings,):
    """Project the node embeddings to 2-D with t-SNE and scatter-plot
    them, one colour per node label.

    :param embeddings: dict mapping node name -> embedding vector.
    """
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = np.array([embeddings[node] for node in X])

    tsne = TSNE(n_components=2)
    node_pos = tsne.fit_transform(emb_list)

    # Group node indices by their (first) label so each label gets
    # one scatter series.
    color_idx = {}
    for idx, _node in enumerate(X):
        label = Y[idx][0]
        if label not in color_idx:
            color_idx[label] = []
        color_idx[label].append(idx)

    for label, indices in color_idx.items():
        plt.scatter(node_pos[indices, 0], node_pos[indices, 1], label=label)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    # Load the Wiki edgelist as a directed graph; edges carry an
    # integer 'weight' attribute.
    graph = nx.read_edgelist(
        '../data/wiki/Wiki_edgelist.txt',
        create_using=nx.DiGraph(),
        nodetype=None,
        data=[('weight', int)],
    )

    # use_csrgraph=True routes walk generation through csrgraph's
    # CSR-based walker instead of the pure-python RandomWalker.
    deepwalk = DeepWalk(graph, walk_length=10, num_walks=80,
                        workers=1, use_csrgraph=True)
    deepwalk.train(window_size=5, iter=3)
    embeddings = deepwalk.get_embeddings()

    evaluate_embeddings(embeddings)
    # plot_embeddings(embeddings)
84 changes: 59 additions & 25 deletions ge/models/deepwalk.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,36 +20,70 @@
from ..walker import RandomWalker
from gensim.models import Word2Vec
import pandas as pd


import numpy as np
from csrgraph import csrgraph
class DeepWalk:
def __init__(self, graph, walk_length, num_walks, workers=1):

self.graph = graph
def __init__(self, graph, walk_length, num_walks, workers=1,use_csrgraph=False):
self.use_csrgraph=use_csrgraph
self.w2v_model = None
self._embeddings = {}

self.walker = RandomWalker(
graph, p=1, q=1, )
self.sentences = self.walker.simulate_walks(
num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

if self.use_csrgraph:
node_names=list(graph.nodes())
self.graph=csrgraph(graph,nodenames=node_names,threads=workers)

self.sentences = pd.DataFrame(self.graph.random_walks(
epochs=num_walks, walklen=walk_length, return_weight=1.,neighbor_weight=1.))
# Map nodeId -> node name
node_dict = dict(zip(np.arange(len(node_names)), node_names))

for col in self.sentences.columns:
self.sentences[col] = self.sentences[col].map(node_dict).astype(str)
# Somehow gensim only trains on this list iterator
# it silently mistrains on array input
self.sentences = [list(x) for x in self.sentences.itertuples(False, None)]

else:
self.graph = graph
self.walker = RandomWalker(
graph, p=1, q=1, )
self.sentences = self.walker.simulate_walks(
num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

kwargs["sentences"] = self.sentences
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["size"] = embed_size
kwargs["sg"] = 1 # skip gram
kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax
kwargs["workers"] = workers
kwargs["window"] = window_size
kwargs["iter"] = iter

print("Learning embedding vectors...")
model = Word2Vec(**kwargs)
print("Learning embedding vectors done!")

self.w2v_model = model


if self.use_csrgraph:
kwargs["sentences"] = self.sentences
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["vector_size"] = embed_size
kwargs["sg"] = 1 # skip gram
kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax
kwargs["workers"] = workers
kwargs["window"] = window_size
kwargs["epochs"] = iter

print("Learning embedding vectors...")
model = Word2Vec(**kwargs)
print("Learning embedding vectors done!")
self.w2v_model = model

else:
kwargs["sentences"] = self.sentences
kwargs["min_count"] = kwargs.get("min_count", 0)
kwargs["vector_size"] = embed_size
kwargs["sg"] = 1 # skip gram
kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax
kwargs["workers"] = workers
kwargs["window"] = window_size
kwargs["epochs"] = iter

print("Learning embedding vectors...")
model = Word2Vec(**kwargs)
print("Learning embedding vectors done!")

self.w2v_model = model

return model

def get_embeddings(self,):
Expand Down