Skip to content

Commit

Permalink
Code cleanup to make use of packaged influence.mining project
Browse files Browse the repository at this point in the history
  • Loading branch information
seekme94 committed Jul 12, 2020
1 parent 25861a4 commit 05e75f4
Show file tree
Hide file tree
Showing 21 changed files with 239 additions and 1,030,912 deletions.
11 changes: 0 additions & 11 deletions DESCRIPTION

This file was deleted.

44 changes: 0 additions & 44 deletions Experiments/Sir Faraz.R

This file was deleted.

49 changes: 49 additions & 0 deletions Experiments/changing_dynamics/correlation_analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
######################################################################################
##################################### Hypothesis #####################################
# By correlating coreness of nodes with other heuristics and traits, we can discover
# which nodes will not be vital
######################################################################################

# Load required libraries
library(igraph)
library(arules)
library(dplyr)
library(caret)
library(party)
library(doParallel)
library(corrplot)
library(influence.mining)

# Load required source files
source('util/graph_util.R')
source('util/classification_util.R')
source('util/influence_maximization.R')
source('util/heuristics.R')


# Read the test data sets, keeping only the largest component of each graph
author <- largest_component(
  read.graph("Experiments/data/author_netscience.txt", directed = FALSE)
)
ita2000 <- largest_component(
  read.graph("Experiments/data/ita2000.txt", directed = FALSE)
)
caida <- largest_component(
  read.graph("Experiments/data/as-caida.txt", directed = FALSE)
)
jdk <- largest_component(
  read.graph("Experiments/data/jdk6_dependencies.txt", directed = FALSE)
)

# Compute the correlation matrix of a trait table (rounded to 2 decimals),
# write it to a CSV file and draw it with corrplot().
#   traits:   object that as.data.frame() can turn into one column per trait
#   csv_path: path of the CSV file to write
# Returns the rounded correlation matrix, so that the matrix written to disk
# and the matrix plotted are guaranteed to be the same object.
report_correlation <- function(traits, csv_path) {
  corr <- round(cor(as.data.frame(traits)), 2)
  write.table(corr, file = csv_path, quote = FALSE, sep = ",")
  corrplot(corr)
  corr
}

# Plot correlation matrix of each dataset
# NOTE(review): the author network uses get_node_influence_traits() while the
# other three use get_traits(); kept as in the original -- confirm whether all
# four are meant to call the same function.
author_data <- get_node_influence_traits(author)
author_corr <- report_correlation(
  author_data, "Experiments/results/author_correlation.csv"
)

ita2000_data <- get_traits(ita2000)
ita2000_corr <- report_correlation(
  ita2000_data, "Experiments/results/ita2000_correlation.csv"
)

caida_data <- get_traits(caida)
caida_corr <- report_correlation(
  caida_data, "Experiments/results/caida_correlation.csv"
)

jdk_data <- get_traits(jdk)
jdk_corr <- report_correlation(
  jdk_data, "Experiments/results/jdk_correlation.csv"
)
74 changes: 74 additions & 0 deletions Experiments/changing_dynamics/distant_edge_attachment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
######################################################################################
##################################### Hypothesis #####################################
# Connecting a few nodes that have degree = 1 and are far apart from each other
# will change the set of influential nodes
######################################################################################

library(igraph)
library(influence.mining)

# Experiment parameters
seed <- 100
# Passed unchanged to greedy_influential(); presumably a percentage of nodes
# (the conclusion of the companion experiment refers to the "top 5%").
budget <- 5
set.seed(seed)
sizes <- seq(100, 1000, by = 100)

# One entry per network size; bound into data frames once after the loop
# instead of growing a data frame with rbind() on every iteration.
graph_rows <- vector("list", length(sizes))
result_rows <- vector("list", length(sizes))

for (k in seq_along(sizes)) {
  size <- sizes[k]

  # Generate a graph
  # graph <- generate_small_world(size, 3/size)
  graph <- generate_scale_free(size, 1 + (1 / log(size)))
  # Remove disconnected nodes
  graph <- largest_component(graph)
  graph_rows[[k]] <- as.data.frame(graph_summary(graph))
  V(graph)$label <- seq_len(vcount(graph))
  plot(graph, vertex.size = 2)

  # Extract influential nodes using Greedy method
  influential <- greedy_influential(graph, budget, test_method = "RESILIENCE")

  # Find shortest paths for every pair of nodes
  paths <- shortest.paths(graph, V(graph), V(graph))

  # Longest shortest path in the graph
  max_path <- max(paths)

  # All node pairs (from <= to) that are exactly max_path apart. which()
  # reports matches column by column, so re-order them by row and then by
  # column to reproduce the order of a row-wise scan; the order matters
  # because the pairs are sampled with a fixed seed below.
  far_pairs <- which(
    paths == max_path & upper.tri(paths, diag = TRUE),
    arr.ind = TRUE
  )
  far_pairs <- far_pairs[order(far_pairs[, 1], far_pairs[, 2]), , drop = FALSE]
  distinct <- data.frame(
    from = unname(far_pairs[, 1]),
    to = unname(far_pairs[, 2])
  )

  # (Optional) Create a fraction of links instead of all
  set.seed(seed)
  n_pairs <- nrow(distinct)
  distinct <- distinct[sample(seq_len(n_pairs), ceiling(sqrt(n_pairs))), ]

  # Create an edge between each pair of distinct nodes. add.edges() expects
  # a flat vector of vertex ids: from1, to1, from2, to2, ...
  edges <- as.vector(rbind(distinct$from, distinct$to))
  graph <- add.edges(graph, edges)
  plot(graph, vertex.size = 2)

  # Extract influential nodes again
  new_influential <- greedy_influential(
    graph, budget, test_method = "RESILIENCE"
  )

  # Compare both influential node sets. greedy_influential() is read through
  # its $influential_nodes element, as in weak_node_attachment.R; comparing
  # the returned objects as a whole would not compare the node sets.
  old_nodes <- as.integer(influential$influential_nodes)
  new_nodes <- as.integer(new_influential$influential_nodes)
  diff <- setdiff(old_nodes, new_nodes)

  # Capture results
  result_rows[[k]] <- c(size, length(old_nodes), length(diff))
}

graphs <- do.call(rbind, graph_rows)
results <- as.data.frame(do.call(rbind, result_rows))
names(results) <- c("size", "inf_size", "diff")
# Percentage of the original influential nodes that were replaced
results$change <- round(results$diff / results$inf_size * 100)
graphs <- cbind(graphs, results)
graphs

# Conclusion (small-world):
# The experiment analyzed change in influential nodes by connecting distinct nodes of 10 networks of sizes from 100 to 1000.
# Distinct nodes are considered to be those with the longest shortest path.
# We create an edge between some (a Sq.root subset) of the distinct nodes.
# There was a visible difference in the set of influential nodes before and after the rewiring. The average change was measured to be 13%
# Possible concerns:
# 1. There is still no straight way to ensure that only the size of the network grows, while other traits remain same
# 2.
115 changes: 115 additions & 0 deletions Experiments/changing_dynamics/weak_node_attachment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
########################################################################################
##################################### Hypothesis A #####################################
# Attaching a node to all nodes with degree = 1 will not change influential nodes
########################################################################################

library(igraph)
library(influence.mining)

# Generate a graph
set.seed(1)
size <- 100
# Passed to greedy_influential(); presumably a percentage of nodes (the
# conclusions at the end of this file refer to the "top 5%").
budget <- 5
#graph <- generate_small_world(size, log(size)/size)
graph <- generate_scale_free(size, 1.5)

# Remove disconnected nodes
graph <- largest_component(graph)
# Number of nodes actually present before any node is attached; this can be
# smaller than `size` if removing disconnected nodes dropped some of them.
original_size <- vcount(graph)
V(graph)$label <- seq_len(original_size)
plot(graph, vertex.size = 2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method = "RESILIENCE")

# Algorithm: For each existing node with degree = 1, add a new node to the
# network
# Fetch nodes with degree = 1
degree_1 <- V(graph)[which(degree(graph) == 1)]
n_new <- length(degree_1)

# Attach a new node with each degree_1 node
graph <- add.vertices(graph, n_new)

# Assign missing labels to new nodes (size + seq_len() also copes with the
# case where there is no degree-1 node, unlike seq(size + 1, size + 0))
V(graph)$label[is.na(V(graph)$label)] <- size + seq_len(n_new)

# Re-select on the enlarged graph: the newly added nodes are still isolated,
# so they are exactly the nodes with degree 0
degree_1 <- V(graph)[which(degree(graph) == 1)]
degree_0 <- V(graph)[which(degree(graph) == 0)]

# Pair the i-th degree-1 node with the i-th new node. add.edges() expects a
# flat vector of vertex ids: from1, to1, from2, to2, ...
edges <- as.vector(rbind(as.integer(degree_1), as.integer(degree_0)))
graph <- add.edges(graph, edges)
plot(graph, vertex.size = 2)

# Readjust budget so that the number of selected nodes stays the same. The
# ratio is taken against the real node count before the attachment rather
# than the requested `size`, which only matches when no node was removed.
budget <- original_size / vcount(graph) * budget

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method = "RESILIENCE")

# Compare both influential node sets
influential
new_influential
setdiff(influential$influential_nodes, new_influential$influential_nodes)


########################################################################################
##################################### Hypothesis B #####################################
# Creating a new triad by adding an edge with the third node (connection of connection)
########################################################################################

# Generate a graph
set.seed(1)
size <- 100
# Passed to greedy_influential(); presumably a percentage of nodes (the
# conclusions at the end of this file refer to the "top 5%").
budget <- 5
#graph <- generate_small_world(size, log(size)/size)
graph <- generate_scale_free(size, 1.5)

# Remove disconnected nodes
graph <- largest_component(graph)
# Number of nodes actually present before any node is attached; this can be
# smaller than `size` if removing disconnected nodes dropped some of them.
original_size <- vcount(graph)
V(graph)$label <- seq_len(original_size)
plot(graph, vertex.size = 2)

# Extract influential nodes using Greedy method
influential <- greedy_influential(graph, budget, test_method = "RESILIENCE")

# Algorithm: For each existing node with degree = 1, add a new node to the
# network and close a triad with the node it was originally connected to
# Fetch nodes with degree = 1
degree_1 <- V(graph)[which(degree(graph) == 1)]
n_new <- length(degree_1)

# Attach a new node with each degree_1 node
graph <- add.vertices(graph, n_new)

# Assign missing labels to new nodes (size + seq_len() also copes with the
# case where there is no degree-1 node, unlike seq(size + 1, size + 0))
V(graph)$label[is.na(V(graph)$label)] <- size + seq_len(n_new)

# Re-select on the enlarged graph: the newly added nodes are still isolated,
# so they are exactly the nodes with degree 0
degree_1 <- V(graph)[which(degree(graph) == 1)]
degree_0 <- V(graph)[which(degree(graph) == 0)]

# Fetch the nodes which the degree_1 nodes were initially connected with.
# The order-1 neighbourhood of a degree-1 node holds the node itself followed
# by its single neighbour, hence element 2. vapply() keeps the result an
# integer vector even when there are no degree-1 nodes.
first_node <- V(graph)[vapply(
  as.integer(degree_1),
  function(v) as.integer(unlist(neighborhood(graph, v, order = 1)))[2],
  integer(1)
)]

# Edges are given to add.edges() as a flat vector of vertex ids:
# from1, to1, from2, to2, ...
# Pair the i-th degree-1 node with the i-th new node
edges <- as.vector(rbind(as.integer(degree_1), as.integer(degree_0)))

# Also create edges with the first_node set, which closes the triad
edges <- c(
  edges,
  as.vector(rbind(as.integer(first_node), as.integer(degree_0)))
)

graph <- add.edges(graph, edges)
plot(graph, vertex.size = 2)

# Readjust budget so that the number of selected nodes stays the same. The
# ratio is taken against the real node count before the attachment rather
# than the requested `size`, which only matches when no node was removed.
budget <- original_size / vcount(graph) * budget

# Extract influential nodes again
new_influential <- greedy_influential(graph, budget, test_method = "RESILIENCE")

# Compare both influential node sets
influential
new_influential
setdiff(influential$influential_nodes, new_influential$influential_nodes)

# Conclusions of Hypothesis A(a & b):
# 1. For small-world graphs of size ranging between 100 and 600, no change was noticed in the set of influential nodes (top 5%)
# 2. The possible cause is the formation of star, since in a star network, the only influential node is the core node
# 3. The hypothesis goes against Fitness model of network theory, in a sense that new nodes are connecting to most unfit nodes
# 4. In case of Scale-free, difference of exactly 2 nodes was observed in 3 out of 5 cases
# 5. The above conclusions stayed true even after creating a triad between newly added node and the node which was the connection of degree_1 node
97 changes: 1 addition & 96 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,96 +1 @@
# Project: influence-mining
### Version: 0.1.2

The purpose of this project is to provide an interface to perform influence mining operations on networks, preferably Social networks.

The source contains the following files:
1. Influence.R
2. Under construction

```
Influence.R
```
The file contains source code for implementation of two basic influence mining models: Independent Cascade model and Linear Threshold model[1]
```
influence (graph, seed, budget, steps, model, maximize, seed_method, prob)
```
This is a wrapper function to call influence_LT and influence_IC functions
- graph: is the igraph object
- budget: defines what percentage of most influential nodes out of all nodes is required as output. Default value is 1
- seed: (optional) is a set of seed (initial nodes). If this parameter is NULL, then seed_method parameter should be given
- steps: is the number of time steps for which the diffusion process should run. If an exhaustive run is required, provide a high value (like 100). Default value is 1
- model: is influence model to run the dataset on. Value MUST either be "LT" or "IC"
- maximize: should be TRUE if influential nodes are to be derived using Greedy algorithm
- seed_method: is the selection method for seed (initial nodes). Value can be "random", "degree", "closeness", "betweenness", "coreness", "eigenvector", "a-degree", "a-closeness", "a-betweenness", "a-coreness", "a-eigenvector". Default value is "random"
- prob: is the probability of activation of a neighbour node. This is applicable only to IC model currently. Default value is 0.5

> Output: summary of influence process, including no. of nodes, edges, seed set size, nodes influenced and time taken
```
influence_LT
```
This function calculates influence (number of nodes in the network expected to be activated) under Linear Threshold model. For parameters, see influence function.

```
influence_IC
```
This function calculates influence (number of nodes in the network expected to be activated) under Independent Cascade model. For parameters, see influence function.

```
select_seed
```
This function returns a set of nodes, to be used as seed in influence functions on the basis of given seed selection method
- G: a graph object of library *igraph*
- k: percentage of seed nodes from the network to be chosen
- seed_method: see influence function

> Output: subset vector of nodes in a graph
```
select_adaptive_seed
```
This function returns a set of nodes, to be used as seed in influence functions on the basis of given adaptive method for seed selection
- G: a graph object of library *igraph*
- k: percentage of seed nodes from the network to be chosen
- seed_method: see influence function

> Output: subset vector of nodes in a graph
```
find_communities
```
This method finds communities in the given graph and returns the graph after adding a vector "group" to its vertices
- G: a graph object of library *igraph*
- method: is the method to generate communities. Available algorithms are "multilevel", "edgebetweenness", "fastgreedy", "eigenvector", "spinglass", "walktrap", "labelpropagation", "clique", "largescale"

> Output: graph object with additional vector "group" to vertices
```
community.significance.test
```
This function performs a Wilcoxon rank-sum test on the "internal" and "external" degrees of a community in order to quantify its significance.


### Examples:
1. Calculate influence under defaults (model="LT", budget=5, steps=1 and seed_method="random")
```
influence(edgesFile="C:/Datasets/twitter_edges.csv")
```
2. Calculate influence under IC model, budget=10% for 2 time steps and seed_method="random"
```
influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC")
```
3. Calculate influence under IC model to select 10% nodes for 2 time steps and seed selection criteria to be nodes with highest degree
```
influence(edgesFile="C:/Datasets/twitter_edges.csv", budget=10, steps=2, model="IC", seed_method="degree")
```
4. Calculate influence under LT model to select 5% nodes for 1 time step and seed selection criteria to be nodes with highest betweenness
```
influence(edgesFile="C:/Datasets/twitter_edges.csv", seed_method="betweenness")
```
### Libraries used
jsonlite, uuid, sampling, digest, RWeka, doMC, snow, doSNOW, iterpc, foreach, igraph, caret, e1071, party, rpart, rpart.plot, randomForest, RColorBrewer, nnet, rattle, ggplot2, Rcpp


### References:
[1] Kempe, D., Kleinberg, J., & Tardos, É. (2003). Maximizing the Spread of Influence through a Social Network. In Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD ’03 (p. 137). New York, New York, USA: ACM Press. doi:10.1145/956755.956769
# Experiments on influence mining
Loading

0 comments on commit 05e75f4

Please sign in to comment.