Code cleanup to make use of packaged influence.mining project

seekme94 · Jul 12, 2020 · 05e75f4 · 05e75f4
1 parent 25861a4
commit 05e75f4
Show file tree

Hide file tree

Showing 21 changed files with 239 additions and 1,030,912 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
diff --git a/Experiments/Sir Faraz.R b/Experiments/Sir Faraz.R
diff --git a/Experiments/changing_dynamics/correlation_analysis.R b/Experiments/changing_dynamics/correlation_analysis.R
@@ -0,0 +1,49 @@
+######################################################################################
+##################################### Hypothesis #####################################
+# By correlating coreness of nodes with other heuristics and traits, we can discover
+# which nodes will not be vital
+######################################################################################
+
+# Load required libraries
+library(igraph)
+library(arules)
+library(dplyr)
+library(caret)
+library(party)
+library(doParallel)
+library(corrplot)
+library(influence.mining)
+
+# Load required source files
+source('util/graph_util.R')
+source('util/classification_util.R')
+source('util/influence_maximization.R')
+source('util/heuristics.R')
+
+
+# Read the test data sets, keeping only the largest connected component of each
+author <- largest_component(read.graph("Experiments/data/author_netscience.txt", directed=FALSE))
+ita2000 <- largest_component(read.graph("Experiments/data/ita2000.txt", directed=FALSE))
+caida <- largest_component(read.graph("Experiments/data/as-caida.txt", directed=FALSE))
+jdk <- largest_component(read.graph("Experiments/data/jdk6_dependencies.txt", directed=FALSE))
+
+# Compute the correlation matrix of a data set's node traits (rounded to two
+# decimals), save it to "Experiments/results/<name>_correlation.csv" and plot it.
+#   traits: node trait table, converted with as.data.frame() before cor()
+#   name:   data set name, used as the prefix of the CSV file name
+# Returns the rounded correlation matrix.
+correlate_traits <- function(traits, name) {
+  corr <- round(cor(as.data.frame(traits)), 2)
+  write.table(corr, file=paste0("Experiments/results/", name, "_correlation.csv"), quote=FALSE, sep=",")
+  # Plot the matrix that was just saved rather than computing it a second time
+  corrplot(corr)
+  corr
+}
+
+# Plot correlation matrix of each dataset
+# NOTE(review): only the author data set is processed with
+# get_node_influence_traits(); the others use get_traits() -- confirm that this
+# difference is intended
+author_data <- get_node_influence_traits(author)
+author_corr <- correlate_traits(author_data, "author")
+
+ita2000_data <- get_traits(ita2000)
+ita2000_corr <- correlate_traits(ita2000_data, "ita2000")
+
+caida_data <- get_traits(caida)
+caida_corr <- correlate_traits(caida_data, "caida")
+
+jdk_data <- get_traits(jdk)
+jdk_corr <- correlate_traits(jdk_data, "jdk")
diff --git a/Experiments/changing_dynamics/distant_edge_attachment.R b/Experiments/changing_dynamics/distant_edge_attachment.R
@@ -0,0 +1,74 @@
+######################################################################################
+##################################### Hypothesis #####################################
+# Connecting a few nodes that have degree = 1 and large distances between them
+# will change the influential nodes
+######################################################################################
+
+library(igraph)
+library(influence.mining)
+
+# Experiment: for scale-free graphs of growing size, connect a random subset of
+# the node pairs that lie farthest apart and measure how many of the influential
+# nodes found by the greedy method change.
+
+# Return the influential node set from a greedy_influential() result.
+# NOTE(review): the sibling script weak_node_attachment.R reads
+# `$influential_nodes` from the same call, so the result is presumably a list;
+# a result that is already a plain node vector is passed through unchanged.
+influential_node_set <- function(result) {
+  if (is.list(result) && !is.null(result$influential_nodes)) {
+    result$influential_nodes
+  } else {
+    result
+  }
+}
+
+# Generate a graph
+seed <- 100
+budget <- 5
+set.seed(seed)
+sizes <- seq(100, 1000, by=100)
+# Rows are collected in preallocated lists and bound once after the loop
+graph_rows <- vector("list", length(sizes))
+result_rows <- vector("list", length(sizes))
+for (k in seq_along(sizes)) {
+  size <- sizes[k]
+  # graph <- generate_small_world(size, 3/size)
+  graph <- generate_scale_free(size, 1 + (1/log(size)))
+  # Remove disconnected nodes
+  graph <- largest_component(graph)
+  graph_rows[[k]] <- as.data.frame(graph_summary(graph))
+  V(graph)$label <- seq_len(length(V(graph)))
+  plot(graph, vertex.size=2)
+
+  # Extract influential nodes using Greedy method
+  influential <- greedy_influential(graph, budget, test_method="RESILIENCE")
+
+  # Find shortest paths for every pair of nodes
+  paths <- shortest.paths(graph, V(graph), V(graph))
+
+  # Longest shortest path in the graph
+  max_path <- max(paths)
+
+  # All pairs (from <= to) that lie max_path apart. which() reports them in
+  # column-major order, so sort by row then column to visit pairs in the same
+  # order as a row-by-row scan; the seeded sample below depends on that order.
+  farthest <- which(paths == max_path & upper.tri(paths, diag=TRUE), arr.ind=TRUE)
+  farthest <- farthest[order(farthest[, "row"], farthest[, "col"]), , drop=FALSE]
+  distinct <- data.frame(from=unname(farthest[, "row"]), to=unname(farthest[, "col"]))
+
+  # (Optional) Create a fraction of links instead of all
+  set.seed(seed)
+  distinct <- distinct[sample(seq_len(nrow(distinct)), ceiling(sqrt(nrow(distinct)))),]
+
+  # Create an edge between each selected pair; add.edges() expects the
+  # endpoints interleaved as from1, to1, from2, to2, ...
+  edges <- as.vector(rbind(distinct$from, distinct$to))
+  graph <- add.edges(graph, edges)
+  plot(graph, vertex.size=2)
+
+  # Extract influential nodes again
+  new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")
+
+  # Compare both influential node sets (the node sets themselves, not the
+  # whole result objects)
+  nodes_before <- influential_node_set(influential)
+  nodes_after <- influential_node_set(new_influential)
+  changed <- setdiff(nodes_before, nodes_after)
+
+  # Capture results
+  result_rows[[k]] <- data.frame(size=size, inf_size=length(nodes_before), diff=length(changed))
+}
+graphs <- do.call(rbind, graph_rows)
+results <- do.call(rbind, result_rows)
+# Percentage of the original influential nodes that were replaced
+results$change <- round(results$diff / results$inf_size * 100)
+graphs <- cbind(graphs, results)
+graphs
+
+# Conclusion (small-world):
+# The experiment analyzed change in influential nodes by connecting distinct nodes of 10 networks of sizes from 100 to 1000.
+# Distinct nodes are considered to be those with the longest shortest path.
+# We create an edge between some (a Sq.root subset) of the distinct nodes.
+# There was a visible difference in the set of influential nodes before and after node wiring. The average change was measured to be 13%
+# Possible concerns:
+# 1. There is still no straight way to ensure that only the size of the network grows, while other traits remain same
+# 2. 
diff --git a/Experiments/changing_dynamics/weak_node_attachment.R b/Experiments/changing_dynamics/weak_node_attachment.R
@@ -0,0 +1,115 @@
+########################################################################################
+##################################### Hypothesis A #####################################
+# Attaching a node to all nodes with degree = 1 will not change influential nodes
+########################################################################################
+
+library(igraph)
+library(influence.mining)
+
+# Generate a graph
+set.seed(1)
+size <- 100
+budget <- 5
+#graph <- generate_small_world(size, log(size)/size)
+graph <- generate_scale_free(size, 1.5)
+
+# Remove disconnected nodes
+graph <- largest_component(graph)
+# Number of nodes actually left in the largest component; this may be smaller
+# than the requested size, so it (not `size`) is used for labels and budget
+original_size <- length(V(graph))
+V(graph)$label <- seq_len(original_size)
+plot(graph, vertex.size=2)
+
+# Extract influential nodes using Greedy method
+influential <- greedy_influential(graph, budget, test_method="RESILIENCE")
+
+# Algorithm: For each existing node with degree = 1, add a new node to the network
+# Fetch nodes with degree = 1
+degree_1 <- V(graph)[which(degree(graph) == 1)]
+new_count <- length(degree_1)
+
+# Attach a new node with each degree_1 node
+graph <- add.vertices(graph, new_count)
+
+# Assign missing labels to new nodes, continuing after the existing labels
+# (seq_len() keeps this a no-op when there are no degree-1 nodes)
+V(graph)$label[is.na(V(graph)$label)] <- original_size + seq_len(new_count)
+
+# The newly added nodes are the only ones with degree 0 at this point
+degree_1 <- V(graph)[which(degree(graph) == 1)]
+degree_0 <- V(graph)[which(degree(graph) == 0)]
+
+# Pair the i-th degree_1 node with the i-th new node; add.edges() expects the
+# endpoints interleaved as from1, to1, from2, to2, ...
+edges <- as.vector(rbind(as.integer(degree_1), as.integer(degree_0)))
+graph <- add.edges(graph, edges)
+plot(graph, vertex.size=2)
+
+# Readjust budget so that the number of nodes stays same
+budget <- original_size / length(V(graph)) * budget
+
+# Extract influential nodes again
+new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")
+
+# Compare both influential node sets
+influential
+new_influential
+setdiff(influential$influential_nodes, new_influential$influential_nodes)
+
+
+########################################################################################
+##################################### Hypothesis B #####################################
+# Creating a new triad by adding an edge with the third node (connection of connection)
+########################################################################################
+
+# Generate a graph
+set.seed(1)
+size <- 100
+budget <- 5
+#graph <- generate_small_world(size, log(size)/size)
+graph <- generate_scale_free(size, 1.5)
+
+# Remove disconnected nodes
+graph <- largest_component(graph)
+# Number of nodes actually left in the largest component; this may be smaller
+# than the requested size, so it (not `size`) is used for labels and budget
+original_size <- length(V(graph))
+V(graph)$label <- seq_len(original_size)
+plot(graph, vertex.size=2)
+
+# Extract influential nodes using Greedy method
+influential <- greedy_influential(graph, budget, test_method="RESILIENCE")
+
+# Algorithm: For each existing node with degree = 1, add a new node to the network
+# Fetch nodes with degree = 1
+degree_1 <- V(graph)[which(degree(graph) == 1)]
+new_count <- length(degree_1)
+
+# Attach a new node with each degree_1 node
+graph <- add.vertices(graph, new_count)
+
+# Assign missing labels to new nodes, continuing after the existing labels
+# (seq_len() keeps this a no-op when there are no degree-1 nodes)
+V(graph)$label[is.na(V(graph)$label)] <- original_size + seq_len(new_count)
+
+# The newly added nodes are the only ones with degree 0 at this point
+degree_1 <- V(graph)[which(degree(graph) == 1)]
+degree_0 <- V(graph)[which(degree(graph) == 0)]
+
+# Fetch the nodes which the degree_1 nodes were initially connected with.
+# The order-1 neighborhood lists the node itself first, so the second entry is
+# its single neighbour; vapply() guarantees one numeric id per node.
+first_node <- V(graph)[vapply(as.integer(degree_1), function(node) {
+  as.numeric(unlist(neighborhood(graph, node, order=1)))[2]
+}, numeric(1))]
+
+# Pair the i-th degree_1 node with the i-th new node; add.edges() expects the
+# endpoints interleaved as from1, to1, from2, to2, ...
+edges <- as.vector(rbind(as.integer(degree_1), as.integer(degree_0)))
+
+# Also create edges with the first_node set, closing a triad through each new node
+edges <- c(edges, as.vector(rbind(as.integer(first_node), as.integer(degree_0))))
+
+graph <- add.edges(graph, edges)
+plot(graph, vertex.size=2)
+
+# Readjust budget so that the number of nodes stays same
+budget <- original_size / length(V(graph)) * budget
+
+# Extract influential nodes again
+new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")
+
+# Compare both influential node sets
+influential
+new_influential
+setdiff(influential$influential_nodes, new_influential$influential_nodes)
+
+# Conclusions of Hypothesis A(a & b):
+# 1. For small-world graphs of size ranging between 100 and 600, no change was noticed in the set of influential nodes (top 5%)
+# 2. The possible cause is the formation of a star, since in a star network, the only influential node is the core node
+# 3. The hypothesis goes against Fitness model of network theory, in a sense that new nodes are connecting to most unfit nodes
+# 4. In case of Scale-free, difference of exactly 2 nodes was observed in 3 out of 5 cases
+# 5. The above conclusions stayed true even after creating a triad between newly added node and the node which was the connection of degree_1 node
diff --git a/README.md b/README.md
@@ -1,96 +1 @@
-# Project: influence-mining
-### Version: 0.1.2
-
-The purpose of this project is to provide an interface to perform influence mining operations on networks, preferably Social networks.
-
-The source contains the following files:
-  1. Influence.R
-  2. Under construction
-
-```
-Influence.R
-```
-The file contains source code for implementation of two basic influence mining models: Independent Cascade model and Linear Threshold model[1]
-```
-influence (graph, seed, budget, steps, model, maximize, seed_method, prob)
-```
-This is a wrapper function to call influence_LT and influence_IC functions
-- graph: is the igraph object
-- budget: defines what percentage of most influential nodes out of all nodes is required as output. Default value is 1
-- seed: (optional) is a set of seed (initial nodes). If this parameter is NULL, then seed_method parameter should be given
-- steps: is the time steps for which, the diffusion process should run. If exhaustive run is required, provide a high value (like 100). Default value is 1
-- model: is influence model to run the dataset on. Value MUST either be "LT" or "IC"
-- maximize: should be TRUE if influential nodes are to be derived using Greedy algorithm
-- seed_method: is the selection method for seed (initial nodes). Value can be "random", "degree", "closeness", "betweenness", "coreness", "eigenvector", "a-degree", "a-closeness", "a-betweenness", "a-coreness", "a-eigenvector". Default value is "random"
-- prob: is the probability of activation of a neighbour node. This is applicable only to IC model currently. Default value is 0.5
-
-> Output: summary of influence process, including no. of nodes, edges, seed set size, nodes influenced and time taken
-
-```
-influence_LT
-```
-This function calculates influence (number of nodes in the network expected to be activated) under Linear Threshold model. For parameters, see influence function.
-
-```
-influence_IC
-```
-This function calculates influence (number of nodes in the network expected to be activated) under Independent Cascade model. For parameters, see influence function.
-
-```
-select_seed
-```
-This function returns a set of nodes, to be used as seed in influence functions on the basis of given seed selection method
-- G: a graph object of library *igraph*
-- k: percentage of seed nodes from the network to be chosen
-- seed_method: see influence function
-
-> Output: subset vector of nodes in a graph
-
-```
-select_adaptive_seed
-```
-This function returns a set of nodes, to be used as seed in influence functions on the basis of given adaptive method for seed selection
-- G: a graph object of library *igraph*
-- k: percentage of seed nodes from the network to be chosen
-- seed_method: see influence function
-
-> Output: subset vector of nodes in a graph
-
-```
-find_communities
-```
-This method finds communities in the given graph and returns the graph after adding a vector "group" to its vertices
-- G: a graph object of library *igraph*
-- method: is the method to generate communities. Available algorithms are "multilevel", "edgebetweenness", "fastgreedy", "eigenvector", "spinglass", "walktrap", "labelpropagation", "clique", "largescale"
-
-> Output: graph object with additional vector "group" to vertices
-
-```
-community.significance.test
-```
-This function performs a Wilcoxon rank-sum test on the "internal" and "external" degrees of a community in order to quantify its significance.
-
-
-### Examples:
-1. Calculate influence under defaults (model="LT", budget=5, steps=1 and seed_method="random")
-```
-influence(edgesFile="C:/Datasets/twitter_edges.csv")
-```
-2. Calculate influence under IC model, budget=10% for 2 time steps and seed_method="random"
-```
-influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC")
-```
-3. Calculate influence under IC model to select 10% nodes for 2 time steps and seed selection criteria to be nodes with highest degree
-```
-influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC", seed_method="degree")
-```
-4. Calculate influence under LT model to select 5% nodes for 1 time steps and seed selection criteria to be nodes with highest betweenness
-```
-influence(edgesFile="C:/Datasets/twitter_edges.csv", seed_method="betweenness")
-```
-### Libraries used
-jsonlite, uuid, sampling, digest, RWeka, doMC, snow, doSNOW, iterpc, foreach, igraph, caret, e1071, party, rpart, rpart.plot, randomForest, RColorBrewer, nnet, rattle, ggplot2, Rcpp
-
-
-### References:
-[1] Kempe, D., Kleinberg, J., & Tardos, É. (2003). Maximizing the Spread of Influence through a Social Network. In Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD ’03 (p. 137). New York, New York, USA: ACM Press. doi:10.1145/956755.956769
+# Experiments on influence mining