-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Code cleanup to make use of packaged influence.mining project
- Loading branch information
Showing
21 changed files
with
239 additions
and
1,030,912 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
######################################################################################
##################################### Hypothesis #####################################
# By correlating coreness of nodes with other heuristics and traits, we can discover
# which nodes will not be vital
######################################################################################

# Load required libraries
library(igraph)
library(arules)
library(dplyr)
library(caret)
library(party)
library(doParallel)
library(corrplot)
library(influence.mining)

# Load required source files
source('util/graph_util.R')
source('util/classification_util.R')
source('util/influence_maximization.R')
source('util/heuristics.R')

# Read test data sets; largest_component() is applied to each graph as it is read
author <- largest_component(read.graph("Experiments/data/author_netscience.txt", directed=FALSE))
ita2000 <- largest_component(read.graph("Experiments/data/ita2000.txt", directed=FALSE))
caida <- largest_component(read.graph("Experiments/data/as-caida.txt", directed=FALSE))
jdk <- largest_component(read.graph("Experiments/data/jdk6_dependencies.txt", directed=FALSE))

# For every data set: compute the node traits, build the correlation matrix rounded to
# two decimals, write it to CSV and plot it. The matrix is computed once and the stored
# copy is reused for both the CSV and the plot.

# NOTE(review): the author data set uses get_node_influence_traits() while the other
# three use get_traits() -- confirm whether this difference is intentional.
author_data <- get_node_influence_traits(author)
author_corr <- round(cor(as.data.frame(author_data)), 2)
write.table(author_corr, file="Experiments/results/author_correlation.csv", quote=FALSE, sep=",")
corrplot(author_corr)

ita2000_data <- get_traits(ita2000)
ita2000_corr <- round(cor(as.data.frame(ita2000_data)), 2)
write.table(ita2000_corr, file="Experiments/results/ita2000_correlation.csv", quote=FALSE, sep=",")
corrplot(ita2000_corr)

caida_data <- get_traits(caida)
caida_corr <- round(cor(as.data.frame(caida_data)), 2)
write.table(caida_corr, file="Experiments/results/caida_correlation.csv", quote=FALSE, sep=",")
corrplot(caida_corr)

jdk_data <- get_traits(jdk)
jdk_corr <- round(cor(as.data.frame(jdk_data)), 2)
write.table(jdk_corr, file="Experiments/results/jdk_correlation.csv", quote=FALSE, sep=",")
corrplot(jdk_corr)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
######################################################################################
##################################### Hypothesis #####################################
# Connecting a few nodes that have degree = 1 and large distances between them will
# change the influential nodes
# NOTE(review): the code below links node pairs separated by the longest shortest
# path; it never filters on degree = 1 -- confirm this matches the hypothesis.
######################################################################################

library(igraph)
library(influence.mining)

# Return the influential node set held by a greedy_influential() result.
# The sibling experiment scripts read `$influential_nodes` from that result, so a list
# carrying this element is unwrapped; any other value is returned unchanged.
influential_nodes_of <- function(result) {
  if (is.list(result) && !is.null(result[["influential_nodes"]])) {
    return(result[["influential_nodes"]])
  }
  result
}

# Experiment parameters
seed <- 100
budget <- 5
set.seed(seed)

# Per-size outputs are collected in lists and bound once after the loop
graph_summaries <- list()
result_rows <- list()

for (size in seq(100, 1000, by=100)) {
  # Generate a graph
  # graph <- generate_small_world(size, 3/size)
  graph <- generate_scale_free(size, 1 + (1/log(size)))
  # Remove disconnected nodes
  graph <- largest_component(graph)
  graph_summaries[[length(graph_summaries) + 1]] <- as.data.frame(graph_summary(graph))
  V(graph)$label <- seq_len(vcount(graph))
  plot(graph, vertex.size=2)

  # Extract influential nodes using Greedy method
  influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

  # Find shortest paths for every pair of nodes
  paths <- shortest.paths(graph, V(graph), V(graph))

  # Collect every pair (i <= j) whose distance equals the longest shortest path.
  # Rows are ordered by `from`, then `to`, so the seeded sample below picks the same
  # pairs as a row-by-row scan of the upper triangle would.
  max_path <- max(paths)
  at_max <- which(paths == max_path & upper.tri(paths, diag=TRUE), arr.ind=TRUE)
  at_max <- at_max[order(at_max[, 1], at_max[, 2]), , drop=FALSE]
  distinct <- data.frame(from=unname(at_max[, 1]), to=unname(at_max[, 2]))

  # (Optional) Create a fraction of links instead of all
  set.seed(seed)
  keep <- sample(seq_len(nrow(distinct)), ceiling(sqrt(nrow(distinct))))
  distinct <- distinct[keep, ]

  # Create an edge between each selected pair; add.edges() expects the endpoints
  # interleaved as from1, to1, from2, to2, ...
  edges <- as.vector(rbind(distinct$from, distinct$to))
  graph <- add.edges(graph, edges)
  plot(graph, vertex.size=2)

  # Extract influential nodes again
  new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

  # Compare both influential node sets (the node sets themselves, not the result lists)
  nodes_before <- influential_nodes_of(influential)
  nodes_after <- influential_nodes_of(new_influential)
  changed_nodes <- setdiff(nodes_before, nodes_after)

  # Capture results
  result_rows[[length(result_rows) + 1]] <- data.frame(
    size=size,
    inf_size=length(nodes_before),
    diff=length(changed_nodes)
  )
}

graphs <- do.call(rbind, graph_summaries)
results <- do.call(rbind, result_rows)
# Percentage of the original influential set that is no longer selected
results$change <- round(results$diff / results$inf_size * 100)
graphs <- cbind(graphs, results)
graphs

# Conclusion (small-world):
# NOTE(review): the active generator above is generate_scale_free(); the small-world
# generator is commented out -- confirm which model this conclusion refers to.
# The experiment analyzed the change in influential nodes after connecting distinct nodes in 10 networks of sizes from 100 to 1000.
# Distinct nodes are considered to be those separated by the longest shortest path.
# We create an edge between some (a square-root-sized subset) of the distinct node pairs.
# There was a visible difference in the set of influential nodes before and after rewiring. The average change was measured to be 13%.
# Possible concerns:
# 1. There is still no straightforward way to ensure that only the size of the network grows while other traits remain the same
# 2.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
########################################################################################
##################################### Hypothesis A #####################################
# Attaching a node to all nodes with degree = 1 will not change influential nodes
########################################################################################

library(igraph)
library(influence.mining)

# Generate a graph
set.seed(1)
size <- 100
budget <- 5
#graph <- generate_small_world(size, log(size)/size)
graph <- generate_scale_free(size, 1.5)

# Remove disconnected nodes
graph <- largest_component(graph)
# The remaining graph may hold fewer than `size` nodes, so labels and the budget
# rescaling below are based on the actual node count.
base_size <- vcount(graph)
V(graph)$label <- seq_len(base_size)
plot(graph, vertex.size=2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Algorithm: For each existing node with degree = 1, add a new node to the network
n_new <- sum(degree(graph) == 1)
graph <- add.vertices(graph, n_new)

# Assign missing labels to new nodes (continues the numbering after the existing nodes;
# a no-op when there are no degree-1 nodes)
V(graph)$label[is.na(V(graph)$label)] <- base_size + seq_len(n_new)

# Vertex ids of the degree-1 nodes and of the freshly added (still isolated) nodes
node_degree <- degree(graph)
degree_1 <- unname(which(node_degree == 1))
degree_0 <- unname(which(node_degree == 0))
# The two sets are paired position by position; guard against silent recycling
stopifnot(length(degree_1) == length(degree_0))

# Sequentially create an edge between both node sets; add.edges() expects the
# endpoints interleaved as from1, to1, from2, to2, ...
edges <- as.vector(rbind(degree_1, degree_0))
graph <- add.edges(graph, edges)
plot(graph, vertex.size=2)

# Readjust budget so that the number of nodes stays same: scale it by the ratio of
# the old node count to the new one
# NOTE(review): this assumes greedy_influential() treats budget as a percentage of
# the nodes -- confirm against the package documentation.
budget <- base_size / vcount(graph) * budget

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Compare both influential node sets
influential
new_influential
setdiff(influential$influential_nodes, new_influential$influential_nodes)


########################################################################################
##################################### Hypothesis B #####################################
# Creating a new triad by adding an edge with the third node (connection of connection)
########################################################################################

# Generate a graph
set.seed(1)
size <- 100
budget <- 5
#graph <- generate_small_world(size, log(size)/size)
graph <- generate_scale_free(size, 1.5)

# Remove disconnected nodes
graph <- largest_component(graph)
# The remaining graph may hold fewer than `size` nodes, so labels and the budget
# rescaling below are based on the actual node count.
base_size <- vcount(graph)
V(graph)$label <- seq_len(base_size)
plot(graph, vertex.size=2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Algorithm: For each existing node with degree = 1, add a new node to the network
n_new <- sum(degree(graph) == 1)
graph <- add.vertices(graph, n_new)

# Assign missing labels to new nodes (continues the numbering after the existing nodes;
# a no-op when there are no degree-1 nodes)
V(graph)$label[is.na(V(graph)$label)] <- base_size + seq_len(n_new)

# Vertex ids of the degree-1 nodes and of the freshly added (still isolated) nodes
node_degree <- degree(graph)
degree_1 <- unname(which(node_degree == 1))
degree_0 <- unname(which(node_degree == 0))
# The two sets are paired position by position; guard against silent recycling
stopifnot(length(degree_1) == length(degree_0))

# Fetch the nodes which the degree_1 nodes were initially connected with.
# The order-1 neighbourhood lists the node itself first, so element 2 is its
# single neighbour.
first_node <- vapply(degree_1, function(v) {
  as.integer(unlist(neighborhood(graph, v, order=1))[2])
}, integer(1))

# Sequentially create an edge between both node sets; add.edges() expects the
# endpoints interleaved as from1, to1, from2, to2, ...
edges <- as.vector(rbind(degree_1, degree_0))

# Also create edges with the first_node set, closing the triad
edges <- c(edges, as.vector(rbind(first_node, degree_0)))

graph <- add.edges(graph, edges)
plot(graph, vertex.size=2)

# Readjust budget so that the number of nodes stays same: scale it by the ratio of
# the old node count to the new one
# NOTE(review): this assumes greedy_influential() treats budget as a percentage of
# the nodes -- confirm against the package documentation.
budget <- base_size / vcount(graph) * budget

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Compare both influential node sets
influential
new_influential
setdiff(influential$influential_nodes, new_influential$influential_nodes)

# Conclusions of Hypothesis A (a & b):
# 1. For small-world graphs of size ranging between 100 and 600, no change was noticed in the set of influential nodes (top 5%)
# 2. The possible cause is the formation of a star, since in a star network the only influential node is the core node
# 3. The hypothesis goes against the Fitness model of network theory, in the sense that new nodes are connecting to the most unfit nodes
# 4. In the case of scale-free graphs, a difference of exactly 2 nodes was observed in 3 out of 5 cases
# 5. The above conclusions stayed true even after creating a triad between the newly added node and the node which was the connection of the degree_1 node
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,96 +1 @@ | ||
# Project: influence-mining | ||
### Version: 0.1.2 | ||
|
||
The purpose of this project is to provide an interface to perform influence mining operations on networks, preferably Social networks. | ||
|
||
The source contains the following files: | ||
1. Influence.R | ||
2. Under construction | ||
|
||
``` | ||
Influence.R | ||
``` | ||
The file contains source code for implementation of two basic influence mining models: Independent Cascade model and Linear Threshold model[1] | ||
``` | ||
influence (graph, seed, budget, steps, model, maximize, seed_method, prob) | ||
``` | ||
This is a wrapper function to call influence_LT and influence_IC functions | ||
- graph: is the igraph object | ||
- budget: defines what percentage of most influential nodes out of all nodes is required as output. Default value is 1 | ||
- seed: (optional) is a set of seed (initial nodes). If this parameter is NULL, then seed_method parameter should be given | ||
- steps: is the number of time steps for which the diffusion process should run. If an exhaustive run is required, provide a high value (like 100). Default value is 1 | ||
- model: is influence model to run the dataset on. Value MUST either be "LT" or "IC" | ||
- maximize: should be TRUE if influential nodes are to be derived using Greedy algorithm | ||
- seed_method: is the selection method for seed (initial nodes). Value can be "random", "degree", "closeness", "betweenness", "coreness", "eigenvector", "a-degree", "a-closeness", "a-betweenness", "a-coreness", "a-eigenvector". Default value is "random" | ||
- prob: is the probability of activation of a neighbour node. This is applicable only to IC model currently. Default value is 0.5 | ||
|
||
> Output: summary of influence process, including no. of nodes, edges, seed set size, nodes influenced and time taken | ||
``` | ||
influence_LT | ||
``` | ||
This function calculates influence (number of nodes in the network expected to be activated) under Linear Threshold model. For parameters, see influence function. | ||
|
||
``` | ||
influence_IC | ||
``` | ||
This function calculates influence (number of nodes in the network expected to be activated) under Independent Cascade model. For parameters, see influence function. | ||
|
||
``` | ||
select_seed | ||
``` | ||
This function returns a set of nodes, to be used as seed in influence functions on the basis of given seed selection method | ||
- G: a graph object of library *igraph* | ||
- k: percentage of seed nodes from the network to be chosen | ||
- seed_method: see influence function | ||
|
||
> Output: subset vector of nodes in a graph | ||
``` | ||
select_adaptive_seed | ||
``` | ||
This function returns a set of nodes, to be used as seed in influence functions on the basis of given adaptive method for seed selection | ||
- G: a graph object of library *igraph* | ||
- k: percentage of seed nodes from the network to be chosen | ||
- seed_method: see influence function | ||
|
||
> Output: subset vector of nodes in a graph | ||
``` | ||
find_communities | ||
``` | ||
This method finds communities in the given graph and returns the graph after adding a vector "group" to its vertices | ||
- G: a graph object of library *igraph* | ||
- method: is the method to generate communities. Available algorithms are "multilevel", "edgebetweenness", "fastgreedy", "eigenvector", "spinglass", "walktrap", "labelpropagation", "clique", "largescale" | ||
|
||
> Output: graph object with additional vector "group" to vertices | ||
``` | ||
community.significance.test | ||
``` | ||
This function performs a Wilcoxon rank-sum test on the "internal" and "external" degrees of a community in order to quantify its significance. | ||
|
||
|
||
### Examples: | ||
1. Calculate influence under defaults (model="LT", budget=5, steps=1 and seed_method="random") | ||
``` | ||
influence(edgesFile="C:/Datasets/twitter_edges.csv") | ||
``` | ||
2. Calculate influence under IC model, budget=10% for 2 time steps and seed_method="random" | ||
``` | ||
influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC") | ||
``` | ||
3. Calculate influence under IC model to select 10% nodes for 2 time steps and seed selection criteria to be nodes with highest degree | ||
``` | ||
influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC", seed_method="degree") | ||
``` | ||
4. Calculate influence under LT model to select 5% nodes for 1 time step and seed selection criteria to be nodes with highest betweenness | ||
``` | ||
influence(edgesFile="C:/Datasets/twitter_edges.csv", seed_method="betweenness") | ||
``` | ||
### Libraries used | ||
jsonlite, uuid, sampling, digest, RWeka, doMC, snow, doSNOW, iterpc, foreach, igraph, caret, e1071, party, rpart, rpart.plot, randomForest, RColorBrewer, nnet, rattle, ggplot2, Rcpp | ||
|
||
|
||
### References: | ||
[1] Kempe, D., Kleinberg, J., & Tardos, É. (2003). Maximizing the Spread of Influence through a Social Network. In Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD ’03 (p. 137). New York, New York, USA: ACM Press. doi:10.1145/956755.956769 | ||
# Experiments on influence mining |
Oops, something went wrong.