Code cleanup to make use of packaged influence.mining project
seekme94 committed Jul 12, 2020
1 parent 25861a4 commit 05e75f4
Showing 21 changed files with 239 additions and 1,030,912 deletions.
DESCRIPTION

This file was deleted.

Experiments/Sir Faraz.R

This file was deleted.

Experiments/changing_dynamics/correlation_analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
##################################### Hypothesis #####################################
# By correlating coreness of nodes with other heuristics and traits, we can discover
# which nodes will not be vital

# Load required libraries

# Load required source files

# Read test data set
# Every edge list is read as an undirected graph and reduced to its largest
# component before any traits are computed.
author <- largest_component(read.graph("Experiments/data/author_netscience.txt", directed=FALSE))
ita2000 <- largest_component(read.graph("Experiments/data/ita2000.txt", directed=FALSE))
caida <- largest_component(read.graph("Experiments/data/as-caida.txt", directed=FALSE))
jdk <- largest_component(read.graph("Experiments/data/jdk6_dependencies.txt", directed=FALSE))

# Plot correlation matrix of dataset
# For each data set: compute the node traits, correlate them (rounded to 2 decimals),
# save the matrix as CSV and draw it with corrplot().
# NOTE(review): the data argument of every cor() call was missing, leaving the
# parentheses unbalanced and the script unparseable. The traits table computed on the
# preceding line is passed in full -- confirm it holds numeric columns only, or
# select the intended columns here.
# NOTE(review): the first data set uses get_node_influence_traits() while the others
# use get_traits() -- confirm both helpers exist and return the same columns.
author_data <- get_node_influence_traits(author)
author_corr <- round(cor(author_data), 2)
write.table(author_corr, file="Experiments/results/author_correlation.csv", quote=FALSE, sep=",")
# NOTE(review): no corrplot() is drawn for the author data set, unlike the others --
# confirm this is intended.

ita2000_data <- get_traits(ita2000)
ita2000_corr <- round(cor(ita2000_data), 2)
write.table(ita2000_corr, file="Experiments/results/ita2000_correlation.csv", quote=FALSE, sep=",")
# Reuse the matrix computed above instead of correlating the same data twice
corrplot(ita2000_corr)

caida_data <- get_traits(caida)
caida_corr <- round(cor(caida_data), 2)
write.table(caida_corr, file="Experiments/results/caida_correlation.csv", quote=FALSE, sep=",")
corrplot(caida_corr)

jdk_data <- get_traits(jdk)
jdk_corr <- round(cor(jdk_data), 2)
write.table(jdk_corr, file="Experiments/results/jdk_correlation.csv", quote=FALSE, sep=",")
corrplot(jdk_corr)
Experiments/changing_dynamics/distant_edge_attachment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
##################################### Hypothesis #####################################
# Connecting few nodes with degree = 1, and have greater distances between them will
# change influential nodes
#
# Script outline: for network sizes 100, 200, ..., 1000, build a graph, record its
# influential nodes, add edges between some of the node pairs separated by the
# longest shortest path, recompute the influential nodes and tabulate how many of
# the original ones are no longer selected.
#
# NOTE(review): this copy of the script does not parse as-is: the rbind() call on
# `graphs` is cut off after its first argument, and no closing braces are visible for
# the `for`/`if` blocks opened below -- restore them from version control before running.
# NOTE(review): the hypothesis mentions nodes with degree = 1, but the visible code
# selects node pairs by distance only -- confirm which is intended.


# Generate a graph
# NOTE(review): `seed` is assigned but not referenced anywhere in the visible code
# (there is no set.seed() call) -- confirm whether runs are meant to be reproducible.
seed <- 100
budget <- 5
# Accumulators, grown inside the loop below
graphs <- data.frame()
results <- data.frame()
for (size in seq(100, 1000, by=100)) {
# graph <- generate_small_world(size, 3/size)
graph <- generate_scale_free(size, 1 + (1/log(size)))
# Remove disconnected nodes
graph <- largest_component(graph)
# NOTE(review): the second argument of this rbind() is missing (line truncated);
# presumably a per-graph summary row was appended here -- verify against the original.
graphs <- rbind(graphs,
# Label the nodes 1..n so they can be identified in the plots
V(graph)$label <- 1:length(V(graph))
plot(graph, vertex.size=2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Find shortest paths for every pair of nodes
paths <- shortest.paths(graph, V(graph), V(graph))

# Select row-wise maximum shortest path
max_paths <- apply(paths, 1, function(x) max(x))
# Largest entry of the whole distance matrix
max_path <- max(max_paths)
# Collect every pair (i, j), j >= i, whose distance equals that maximum
distinct <- data.frame()
for (i in 1:length(paths[1,])) {
for (j in i:length(paths[,1])) {
if (paths[i,j] == max_path) {
distinct <- rbind(distinct, c(i, j))
names(distinct) <- c("from", "to")

# (Optional) Create a fraction of links instead of all
# Keeps a random sample of ceiling(sqrt(n)) of the n collected pairs
distinct <- distinct[sample(1:nrow(distinct), ceiling(sqrt(nrow(distinct)))),]

# Sequentially create an edge between each set of distinct nodes
# mapply(c, ...) pairs up from/to; unlist() flattens the pairs into one vector
edges <- unlist(mapply(c, V(graph)[distinct$from], V(graph)[distinct$to], SIMPLIFY=FALSE))
graph <- add.edges(graph, edges)
plot(graph, vertex.size=2)

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Compare both influential node sets
# NOTE(review): here the result of greedy_influential() is used directly, whereas
# weak_node_attachment.R reads its `$influential_nodes` element -- confirm the
# return type; length() below would count list elements if it is a list.
diff <- setdiff(influential, new_influential)

# Capture results
# One row per graph size: size, number of influential nodes, number that changed
results <- rbind(results, c(size, length(influential), length(diff)))
names(results) <- c("size", "inf_size", "diff")
# Share of the original influential nodes that changed, as a rounded percentage
results$change <- round(results$diff / results$inf_size * 100)
# NOTE(review): cbind() needs `graphs` and `results` to have the same number of rows;
# presumably this runs once after the loop -- confirm once the braces are restored.
graphs <- cbind(graphs, results)

# Conclusion (small-world):
# The experiment analyzed change in influential nodes by connecting distinct nodes of 10 networks of sizes from 100 to 1000.
# Distinct nodes are considered to be those with the longest shortest path.
# We create an edge between some (a Sq.root subset) of the distinct nodes.
# There was a visible difference in the set of influential nodes before and after node wiring. The avg. change was measured to be 13%
# Possible concerns:
# 1. There is still no straight way to ensure that only the size of the network grows, while other traits remain same
# 2.
Experiments/changing_dynamics/weak_node_attachment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
##################################### Hypothesis A #####################################
# Attaching a node to all nodes with degree = 1 will not change influential nodes
#
# Steps: build a graph, record its influential nodes, hang one new node off every
# degree-1 node, shrink the budget in proportion to the growth of the graph,
# recompute the influential nodes and print those that are no longer selected.


# Generate a graph
size <- 100
budget <- 5
#graph <- generate_small_world(size, log(size)/size)
graph <- generate_scale_free(size, 1.5)

# Remove disconnected nodes
graph <- largest_component(graph)
# Pruning can leave fewer than `size` nodes, so labelling and budget scaling below
# work from the actual node count rather than the requested one.
node_count <- length(V(graph))
V(graph)$label <- seq_len(node_count)
plot(graph, vertex.size=2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Algorithm: For each existing node with degree = 1, add a new node to the network
# Fetch nodes with degree = 1
degree_1 <- V(graph)[degree(graph) == 1]

# Attach a new node with each degree_1 node
graph <- add.vertices(graph, length(degree_1))

# Assign missing labels to new nodes
# The new vertices are the ones whose label is still NA. seq_len() yields an empty
# vector when there are no degree-1 nodes, whereas seq(n + 1, n + 0) counts downward.
V(graph)$label[is.na(V(graph)$label)] <- node_count + seq_len(length(degree_1))

# Look both sets up again on the enlarged graph: the new vertices are still isolated
# (degree 0), so degree_1 holds the same nodes as before and both sets have equal length.
degree_1 <- V(graph)[degree(graph) == 1]
degree_0 <- V(graph)[degree(graph) == 0]

# Sequentially create an edge between both node sets
# mapply(c, ...) pairs the i-th leaf with the i-th new node; unlist() flattens the
# pairs into the single vector that add.edges() expects
edges <- unlist(mapply(c, degree_1, degree_0, SIMPLIFY=FALSE))
graph <- add.edges(graph, edges)
plot(graph, vertex.size=2)

# Readjust budget so that the number of nodes stays same
# The budget is scaled by (old node count / new node count)
budget <- node_count / length(V(graph)) * budget

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Compare both influential node sets
# Prints the originally influential nodes that are missing from the new set
setdiff(influential$influential_nodes, new_influential$influential_nodes)

##################################### Hypothesis B #####################################
# Creating a new triad by adding an edge with the third node (connection of connection)
#
# Same procedure as Hypothesis A, except that every new node is also linked to the
# node its degree-1 partner was originally attached to, closing a triangle.

# Generate a graph
size <- 100
budget <- 5
#graph <- generate_small_world(size, log(size)/size)
graph <- generate_scale_free(size, 1.5)

# Remove disconnected nodes
graph <- largest_component(graph)
# Pruning can leave fewer than `size` nodes, so labelling and budget scaling below
# work from the actual node count rather than the requested one.
node_count <- length(V(graph))
V(graph)$label <- seq_len(node_count)
plot(graph, vertex.size=2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Algorithm: For each existing node with degree = 1, add a new node to the network
# Fetch nodes with degree = 1
degree_1 <- V(graph)[degree(graph) == 1]

# Attach a new node with each degree_1 node
graph <- add.vertices(graph, length(degree_1))

# Assign missing labels to new nodes
# The new vertices are the ones whose label is still NA. seq_len() yields an empty
# vector when there are no degree-1 nodes, whereas seq(n + 1, n + 0) counts downward.
V(graph)$label[is.na(V(graph)$label)] <- node_count + seq_len(length(degree_1))

# Look both sets up again on the enlarged graph: the new vertices are still isolated
# (degree 0), so degree_1 holds the same nodes as before and both sets have equal length.
degree_1 <- V(graph)[degree(graph) == 1]
degree_0 <- V(graph)[degree(graph) == 0]

# Fetch the nodes which the degree_1 nodes were initially connected with
# Takes the second entry of each order-1 neighborhood; this must run before the new
# edges are added, while every degree_1 node still has a single neighbour.
# NOTE(review): relies on neighborhood() listing the node itself first -- confirm
# against the igraph version in use.
# vapply() pins the result to one numeric id per node, unlike sapply().
first_node <- V(graph)[vapply(
  degree_1,
  function(x) unlist(neighborhood(graph, x, order=1))[2],
  numeric(1)
)]

# Sequentially create an edge between both node sets
# mapply(c, ...) pairs the i-th leaf with the i-th new node; unlist() flattens the
# pairs into the single vector that add.edges() expects
edges <- unlist(mapply(c, degree_1, degree_0, SIMPLIFY=FALSE))

# Also create edges with the first_node set
edges <- c(edges, unlist(mapply(c, first_node, degree_0, SIMPLIFY=FALSE)))

graph <- add.edges(graph, edges)
plot(graph, vertex.size=2)

# Readjust budget so that the number of nodes stays same
# The budget is scaled by (old node count / new node count)
budget <- node_count / length(V(graph)) * budget

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method="RESILIENCE")

# Compare both influential node sets
# Prints the originally influential nodes that are missing from the new set
setdiff(influential$influential_nodes, new_influential$influential_nodes)

# Conclusions of Hypothesis A(a & b):
# 1. For small-world graphs of size ranging between 100 and 600, no change was noticed in the set of influential nodes (top 5%)
# 2. The possible cause is the formation of star, since in a star network, the only influential node is the core node
# 3. The hypothesis goes against Fitness model of network theory, in a sense that new nodes are connecting to most unfit nodes
# 4. In case of Scale-free, difference of exactly 2 nodes was observed in 3 out of 5 cases
# 5. The above conclusions stayed true even after creating a triad between newly added node and the node which was the connection of degree_1 node
Experiments/README.md
Original file line number Diff line number Diff line change
@@ -1,96 +1 @@
# Project: influence-mining
### Version: 0.1.2

The purpose of this project is to provide an interface to perform influence mining operations on networks, preferably Social networks.

The source contains the following files:
1. Influence.R
2. Under construction

The file contains source code for implementation of two basic influence mining models: Independent Cascade model and Linear Threshold model[1]
influence (graph, seed, budget, steps, model, maximize, seed_method, prob)
This is a wrapper function to call influence_LT and influence_IC functions
- graph: is the igraph object
- budget: defines what percentage of most influential nodes out of all nodes is required as output. Default value is 1
- seed: (optional) is a set of seed (initial nodes). If this parameter is NULL, then seed_method parameter should be given
- steps: is the number of time steps for which the diffusion process should run. If an exhaustive run is required, provide a high value (like 100). Default value is 1
- model: is influence model to run the dataset on. Value MUST either be "LT" or "IC"
- maximize: should be TRUE if influential nodes are to be derived using Greedy algorithm
- seed_method: is the selection method for seed (initial nodes). Value can be "random", "degree", "closeness", "betweenness", "coreness", "eigenvector", "a-degree", "a-closeness", "a-betweenness", "a-coreness", "a-eigenvector". Default value is "random"
- prob: is the probability of activation of a neighbour node. This is applicable only to IC model currently. Default value is 0.5

> Output: summary of influence process, including no. of nodes, edges, seed set size, nodes influenced and time taken
This function calculates influence (number of nodes in the network expected to be activated) under Linear Threshold model. For parameters, see influence function.

This function calculates influence (number of nodes in the network expected to be activated) under Independent Cascade model. For parameters, see influence function.

This function returns a set of nodes, to be used as seed in influence functions on the basis of given seed selection method
- G: a graph object of library *igraph*
- k: percentage of seed nodes from the network to be chosen
- seed_method: see influence function

> Output: subset vector of nodes in a graph
This function returns a set of nodes, to be used as seed in influence functions on the basis of given adaptive method for seed selection
- G: a graph object of library *igraph*
- k: percentage of seed nodes from the network to be chosen
- seed_method: see influence function

> Output: subset vector of nodes in a graph
This method finds communities in the given graph and returns the graph after adding a vector "group" to its vertices
- G: a graph object of library *igraph*
- method: is the method to generate communities. Available algorithms are "multilevel", "edgebetweenness", "fastgreedy", "eigenvector", "spinglass", "walktrap", "labelpropagation", "clique", "largescale"

> Output: graph object with additional vector "group" to vertices
This function performs a Wilcoxon rank-sum test on the "internal" and "external" degrees of a community in order to quantify its significance.

### Examples:
1. Calculate influence under defaults (model="LT", budget=5, steps=1 and seed_method="random")
2. Calculate influence under IC model, budget=10% for 2 time steps and seed_method="random"
influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC")
3. Calculate influence under IC model to select 10% nodes for 2 time steps and seed selection criteria to be nodes with highest degree
influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC", seed_method="degree")
4. Calculate influence under LT model to select 5% nodes for 1 time step and seed selection criteria to be nodes with highest betweenness
influence(edgesFile="C:/Datasets/twitter_edges.csv", seed_method="betweenness")
### Libraries used
jsonlite, uuid, sampling, digest, RWeka, doMC, snow, doSNOW, iterpc, foreach, igraph, caret, e1071, party, rpart, rpart.plot, randomForest, RColorBrewer, nnet, rattle, ggplot2, Rcpp

### References:
[1] Kempe, D., Kleinberg, J., & Tardos, É. (2003). Maximizing the Spread of Influence through a Social Network. In Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD ’03 (p. 137). New York, New York, USA: ACM Press. doi:10.1145/956755.956769
# Experiments on influence mining

