参数设定
import warnings
import random
warnings.filterwarnings('ignore')
import argparse
import numpy as np
import networkx as nx
#import node2vec
from gensim.models import Word2Vec
import random
np.random.seed(1)
def parse_args():
'''
Parses the node2vec arguments.
'''
# 使用parser加载信息
parser = argparse.ArgumentParser(description="Run node2vec.")
# 输入文件
parser.add_argument('--input', nargs='?', default='../graph/karate.edgelist',
help='Input graph path')
# 输出文件
parser.add_argument('--output', nargs='?', default='../emb/karate.emb',
help='Embeddings path')
# embedding维度
parser.add_argument('--dimensions', type=int, default=128,
help='Number of dimensions. Default is 128.')
# 节点序列长度
parser.add_argument('--walk-length', type=int, default=80,
help='Length of walk per source. Default is 80.')
# 随机游走的次数
parser.add_argument('--num-walks', type=int, default=10,
help='Number of walks per source. Default is 10.')
# word2vec窗口大小,word2vec参数
parser.add_argument('--window-size', type=int, default=10,
help='Context size for optimization. Default is 10.')
# SGD优化时epoch数量,word2vec参数
parser.add_argument('--iter', default=10, type=int,
help='Number of epochs in SGD')
# 并行化核数,word2vec参数
parser.add_argument('--workers', type=int, default=8,
help='Number of parallel workers. Default is 8.')
# 参数p
parser.add_argument('--p', type=float, default=1,
help='Return hyperparameter. Default is 1.')
# 参数q
parser.add_argument('--q', type=float, default=1,
help='Inout hyperparameter. Default is 1.')
# 权重
parser.add_argument('--weighted', dest='weighted', action='store_true',
help='Boolean specifying (un)weighted. Default is unweighted.')
parser.add_argument('--unweighted', dest='unweighted', action='store_false')
parser.set_defaults(weighted=False)
# 有向无向
parser.add_argument('--directed', dest='directed', action='store_true',
help='Graph is (un)directed. Default is undirected.')
parser.add_argument('--undirected', dest='undirected', action='store_false')
parser.set_defaults(directed=False)
return parser.parse_args(args=[])
# return parser.parse_known_args()
def read_graph(file):
graph_x = nx.read_edgelist(file,nodetype = int,create_using = nx.DiGraph())
for edge in graph_x.edges():
graph_x[edge[0]][edge[1]]['weight'] = 1
# 无向操作
if args.undirected:
graph_x = graph_x.to_undirected()
return graph_x
args = parse_args()
nx_G = read_graph(args.input)
nx.draw(nx_G,with_labels = True)
list(nx_G.neighbors(25))
[32, 28, 26]
采样算法
def alias_setup(probs):
smaller = []
larger = []
Q = np.zeros(len(probs),dtype = int)
P = np.zeros(len(probs))
probs = [prob / sum(probs) for prob in probs]
for t,prob in enumerate(probs):
P[t] = prob * len(probs)
if P[t] > 1:
larger.append(t)
else:
smaller.append(t)
# 准备开始采样
while larger and smaller:
large = larger.pop()
small = smaller.pop()
Q[small] = large
P[large] -= (1 - P[small])
if P[large] < 1:
smaller.append(large)
else:
larger.append(large)
return Q,P
#J,q = alias_setup([1/2,1/3,1/12,1/12])
def alias_draw(J, q):
length = len(J)
PP = int(np.floor(np.random.rand() * length))
QQ = int(np.floor(np.random.rand()))
if QQ <= q[PP]:
return PP
else:
return J[QQ]
定义模型的Graph类
class Graph(object):
def __init__(self,nx_G,is_directed, p, q):
self.nx_G = nx_G
self.is_directed = is_directed
self.p = p
self.q = q
def get_alias_edge(self, src, dst):
nx_G = self.nx_G
p = self.p
q = self.q
unnormalized_probs = []
for node in sorted(nx_G.neighbors(dst)):
if node == src:
prob = nx_G[dst][node]['weight'] / p
elif nx_G.has_edge(node,src):
prob = nx_G[dst][node]['weight']
else:
prob = nx_G[dst][node]['weight'] / q
unnormalized_probs.append(prob)
normalized_probs = [float(prob) / sum(unnormalized_probs) for prob in unnormalized_probs]
#print(normalized_probs)
return alias_setup(normalized_probs)
def preprocess_transition_probs(self):
is_directed = self.is_directed
nx_G = self.nx_G
# 结点 的采样映射
alias_nodes = {}
for node in nx_G.nodes():
unnormalized_probs = [nx_G[node][neighbor]['weight'] for neighbor in sorted(nx_G.neighbors(node))]
probs = [prob / sum(unnormalized_probs) for prob in unnormalized_probs]
alias_nodes[node] = alias_setup(probs)
# 边的采样映射
alias_edges = {}
for src,tgt in nx_G.edges():
unnormalized_probs = self.get_alias_edge(src,tgt)
alias_edges[(src,tgt)] = unnormalized_probs
if not is_directed:
unnormalized_probs = self.get_alias_edge(tgt,src)
alias_edges[(tgt,src)] = unnormalized_probs
self.alias_nodes = alias_nodes
self.alias_edges = alias_edges
return alias_nodes,alias_edges
def node2vec_walk(self, walk_length, start_node):
nx_G = self.nx_G
alias_nodes = self.alias_nodes
alias_edges = self.alias_edges
walk = [start_node]
while len(walk) < walk_length:
cur = walk[-1]
neighbors = sorted(nx_G.neighbors(cur))
if neighbors:
if len(walk) == 1:
# 只有一个结点的话就对点采样
q,p = alias_nodes[cur]
sample_index = alias_draw(q,p)
walk.append(neighbors[sample_index])
else:
# 超过两个点,就进行边采样
pre = walk[-2]
q,p = alias_edges[(pre,cur)]
sample_index = alias_draw(q,p)
walk.append(neighbors[sample_index])
else:
break
return walk
def simulate_walks(self, num_walks, walk_length):
nx_G = self.nx_G
nodes = list(nx_G.nodes())
walks = []
for t in range(num_walks):
random.shuffle(nodes)
#print('epoch {}'.format(t))
for node in nodes:
walks.append(self.node2vec_walk(walk_length,node))
return walks
word2vec训练部分代码
def learn_embeddings(walks):
# 将node的类型int转化为string
walk_lol = []
for walk in walks:
tmp = []
for node in walk:
tmp.append(str(node))
walk_lol.append(tmp)
model = Word2Vec(walk_lol,
size = 2,
window = args.window_size,
min_count = 0,
sg = 1,
workers = args.workers,
iter = args.iter)
model.wv.save_word2vec_format(args.output)
return model
聚类
# k-means聚类
from sklearn import cluster
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
import pandas as pd
def draw_cluster(p,q,pos):
g = Graph(nx_G,False,p,q)
g.preprocess_transition_probs()
walks = g.simulate_walks(40,40)
# 训练node2vec模型
model = learn_embeddings(walks)
# 导入节点名称,获取embedding
embedding_node=[]
for i in range(1,35):
j=str(i)
embedding_node.append(model[j])
embedding_node=np.matrix(embedding_node).reshape((34,-1))
y_pred = cluster.KMeans(n_clusters=4).fit_predict(embedding_node) # 调用 test_RandomForestClassifier
print(y_pred)
pos = nx.spring_layout(nx_G)
nx.draw_networkx_nodes(nx_G,pos,node_color = y_pred,label = True)
nx.draw_networkx_edges(nx_G,pos,nodelist = nx_G.edges())
实验效果
# p小 q大,偏向宽度优先搜索,模型更具有同构(注意黄色点,起着桥接的属性!!!)
p = 0.1
q = 20
draw_cluster(p,q)
[2 2 3 2 1 1 1 2 3 3 1 2 2 2 0 0 1 2 0 2 0 2 0 0 0 0 0 0 3 0 3 3 0 0]
# p大 q小,偏向深度优先搜索,模型更具有社群属性
p = 20
q = 0.1
draw_cluster(p,q)
[0 0 3 0 2 2 2 0 3 3 2 0 0 0 1 1 2 0 1 0 1 0 1 1 1 1 1 1 3 1 3 3 1 1]