数据挖掘KDD领域-CELF已成为一种经典的社会网络影响最大化发现算法,用于改进贪心算法的效率(提升700%)。获得KDD 2019的经典论文奖,作者:Jure Leskovec,论文:Cost-effective Outbreak Detection in Networks(2007)。
CELF算法是基于影响力具有子模性特征提出的,即所有节点的影响力随着种子节点集合中节点数目增加在减弱,具有单调递减性。
import matplotlib.pyplot as plt
from random import uniform, seed
import numpy as np
import time
from igraph import *
def IC(g,S,p=0.5,mc=1000):
"""
Input: graph object, set of seed nodes, propagation probability
and the number of Monte-Carlo simulations
Output: average number of nodes influenced by the seed nodes
"""
# Loop over the Monte-Carlo Simulations
spread = []
for i in range(mc):
# Simulate propagation process
new_active, A = S[:], S[:]
while new_active:
# For each newly active node, find its neighbors that become activated
new_ones = []
for node in new_active:
# Determine neighbors that become infected
np.random.seed(i)
success = np.random.uniform(0,1,len(g.neighbors(node,mode="out"))) < p
new_ones += list(np.extract(success, g.neighbors(node,mode="out")))
new_active = list(set(new_ones) - set(A))
# Add newly activated nodes to the set of activated nodes
A += new_active
spread.append(len(A))
return(np.mean(spread))
def celf(g,k,p=0.1,mc=1000):
"""
Input: graph object, number of seed nodes
Output: optimal seed set, resulting spread, time for each iteration
"""
# --------------------
# Find the first node with greedy algorithm
# --------------------
# Calculate the first iteration sorted list
start_time = time.time()
marg_gain = [IC(g,[node],p,mc) for node in range(g.vcount())]
# Create the sorted list of nodes and their marginal gain
Q = sorted(zip(range(g.vcount()),marg_gain), key=lambda x: x[1],reverse=True)
# Select the first node and remove from candidate list
S, spread, SPREAD = [Q[0][0]], Q[0][1], [Q[0][1]]
Q, LOOKUPS, timelapse = Q[1:], [g.vcount()], [time.time()-start_time]
# --------------------
# Find the next k-1 nodes using the list-sorting procedure
# --------------------
for _ in range(k-1):
check, node_lookup = False, 0
while not check:
# Count the number of times the spread is computed
node_lookup += 1
# Recalculate spread of top node
current = Q[0][0]
# Evaluate the spread function and store the marginal gain in the list
Q[0] = (current,IC(g,S+[current],p,mc) - spread)
# Re-sort the list
Q = sorted(Q, key = lambda x: x[1], reverse = True)
# Check if previous top node stayed on top after the sort
check = (Q[0][0] == current)
# Select the next node
spread += Q[0][1]
S.append(Q[0][0])
SPREAD.append(spread)
LOOKUPS.append(node_lookup)
timelapse.append(time.time() - start_time)
# Remove the selected node from the list
Q = Q[1:]
return(S,SPREAD,timelapse,LOOKUPS)
# Create simple network with 0 and 1 as the influential nodes
source = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,3,4,5]
target = [2,3,4,5,6,7,8,9,2,3,4,5,6,7,8,9,6,7,8,9]
g = Graph(directed=True)
g.add_vertices(range(10))
g.add_edges(zip(source,target))
# Plot graph
g.vs["label"], g.es["color"], g.vs["color"] = range(10), "#B3CDE3", "#FBB4AE"
plot(g,bbox = (200,200),margin = 20,layout = g.layout("kk"))
# Run algorithms
celf_output = celf(g,2,p = 0.2,mc = 1000)
# Print results
print("celf output: " + str(celf_output[0]))
运行结果:celf output: [0, 1]
代码来源:https://hautahi.com/im_greedycelf