Skip to content

Commit

Permalink
Switch person graph generation over to networkit.
Browse files Browse the repository at this point in the history
  • Loading branch information
sligocki committed Dec 7, 2021
1 parent f198119 commit 06dc9b4
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 80 deletions.
3 changes: 2 additions & 1 deletion circles_communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
for comm in ["all"] + sorted_comms:
name = f"{args.focus}/Community_{comm}"
circles_json[name] = [comm_circles[comm][i]
for i in range(max(comm_circles[comm].keys()) + 1)]
for i in range(max(comm_circles[comm].keys()) + 1)
if (i % 2) == 0] # TODO: Only do this for bipartite?

with open(args.out_circles, "w") as f:
json.dump(circles_json, f)
Expand Down
15 changes: 13 additions & 2 deletions community_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,18 @@
utils.log("Calculating communities")
communities = nk.community.detectCommunities(G)

utils.log("Writing Communities to file")
nk.community.writeCommunities(communities, args.out_communities)
utils.log("Sorting communities by size")
# Pair every community id with its size. subsetSizeMap() is keyed by the real
# subset id; positions in the list returned by subsetSizes() are not guaranteed
# to correspond to subset ids, so enumerate() over it could mislabel communities.
community_size_index = sorted(
    ((size, index) for (index, size) in communities.subsetSizeMap().items()),
    reverse=True)
# Convert from (arbitrary) starting community ids to rank in descending size
# order: rank 0 is the largest community. Equal sizes are ordered by larger id
# first, because the (size, id) tuples are sorted in reverse.
index2order = {index: order
               for (order, (_, index)) in enumerate(community_size_index)}

utils.log("Writing communities to file")
# One line per node, in G.iterNodes() order, holding that node's community rank.
with open(args.out_communities, "w") as f:
    for node in G.iterNodes():
        f.write(f"{index2order[communities[node]]}\n")

utils.log("Finished")
26 changes: 13 additions & 13 deletions dump_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@ echo "(2) Process new dump"
time python3 csv_to_sqlite.py --version=${TIMESTAMP}
time bash process_categories.sh ${TIMESTAMP}

# time python3 graph_make_family_bipartite.py --version=${TIMESTAMP}
# mkdir -p ${VERSION_DIR}/communities/
# for x in $(seq 0 9); do
# time python3 community_generate.py \
# ${VERSION_DIR}/fam_bipartite.main.graph \
# ${VERSION_DIR}/communities/fam_bipartite.main.comm_${x}
# done
# time python3 community_intersect.py \
# ${VERSION_DIR}/fam_bipartite.main.graph \
# ${VERSION_DIR}/communities/fam_bipartite.main.comm_* \
# --out-communities=${VERSION_DIR}/communities/fam_bipartite.main.inter_10
# time python3 community_analyze.py \
time python3 graph_make_family_bipartite.py --version=${TIMESTAMP}
mkdir -p ${VERSION_DIR}/communities/
for x in $(seq 0 9); do
time python3 community_generate.py \
${VERSION_DIR}/fam_bipartite.main.graph \
${VERSION_DIR}/communities/fam_bipartite.main.comm_${x}
done
time python3 community_intersect.py \
${VERSION_DIR}/fam_bipartite.main.graph \
${VERSION_DIR}/communities/fam_bipartite.main.comm_* \
--out-communities=${VERSION_DIR}/communities/fam_bipartite.main.inter_10
# time python3 community_analyze.py --version=${TIMESTAMP} \
# ${VERSION_DIR}/fam_bipartite.main.graph \
# ${VERSION_DIR}/communities/fam_bipartite.main.inter_10

Expand All @@ -48,7 +48,7 @@ time bash process_categories.sh ${TIMESTAMP}
# ${VERSION_DIR}/family.dist_to_core.db

# Note: Skipping big graphs
# time python3 sqlite_to_graph.py --version=${TIMESTAMP}
# time python3 graph_make_person.py --version=${TIMESTAMP}

# Load connected components of graph
# time python3 csv_to_partitions.py --version=${TIMESTAMP}
Expand Down
43 changes: 22 additions & 21 deletions graph_make_family_bipartite.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,29 @@
# Produces a graph representing "nuclear family units".
#
# Represented as a bipartite graph where each node is either:
# * a person or
# * a family unit (representing 2 parents and all their children
# or a married couple with no children)
# Every person is connected to every family unit they are a member of.
#
# Note: This is similar to the graph created by graph_make_family.py, see the
# comments in that file to understand the difference.
#
# In the traditional connection graph, there are many highly connected blocks.
# For example, given 2 parents with 4 children, all 6 will be directly connected
# to each other. But does this really represent 15 different connections?
#
# Instead, in this representation it would be represented as a star of 6 person
# nodes attached to a central family node.
#
# The resulting graph will have drastically fewer cliques and running through
# graph_core.py will be much more effective.
"""
Produces a graph representing "nuclear family units".
Represented as a bipartite graph where each node is either:
* a person or
* a family unit (representing 2 parents and all their children
or a married couple with no children)
Every person is connected to every family unit they are a member of.
Note: This is similar to the graph created by graph_make_family.py, see the
comments in that file to understand the difference.
In the traditional connection graph, there are many highly connected blocks.
For example, given 2 parents with 4 children, all 6 will be directly connected
to each other. But does this really represent 15 different connections?
Instead, this graph models that family as a star: 6 person nodes, each
attached to a single central family node.
The resulting graph will have drastically fewer cliques and running through
graph_core.py will be much more effective.
"""

import argparse
import collections
from pathlib import Path
import time

import networkit as nk

Expand Down
67 changes: 67 additions & 0 deletions graph_make_person.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Make person graph.
"""

import argparse
from pathlib import Path

import networkit as nk

import data_reader
import graph_tools
import utils


# Command line: optionally select which data version to process.
parser = argparse.ArgumentParser()
parser.add_argument("--version", help="Data version (defaults to most recent).")
args = parser.parse_args()

utils.log("Loading DB")
db = data_reader.Database(args.version)

utils.log("Building list of all nodes and edges")
# Relationship types that count as edges. "coparent" is deliberately left out:
# it is not considered a connection by connection-finder.
CONNECTION_TYPES = ("child", "parent", "sibling", "spouse")
people_ids = set()
edge_ids = set()
i = 0
for person, neigh, rel_type in db.enum_connections():
    # Only rows with person < neigh are kept, so a pair is recorded in a single
    # orientation (presumably enum_connections yields both directions -- TODO confirm).
    if not (person < neigh and rel_type in CONNECTION_TYPES):
        continue
    person_id = db.num2id(person)
    neigh_id = db.num2id(neigh)
    people_ids.add(person_id)
    people_ids.add(neigh_id)
    edge_ids.add((person_id, neigh_id))
    # Progress report every million accepted connections.
    i += 1
    if i % 1000000 == 0:
        utils.log(f" ... {i:,} connections loaded")
utils.log(f"Loaded {len(people_ids):_} nodes / {len(edge_ids):_} edges")

utils.log("Index nodes")
# The position of an id in `ids` is the networkit node index used for it.
ids = list(people_ids)
id2index = {wikitree_id: node_index
            for node_index, wikitree_id in enumerate(ids)}


graph = nk.Graph(len(ids))
utils.log("Building graph")
for edge in edge_ids:
    source_id, target_id = edge
    graph.addEdge(id2index[source_id], id2index[target_id])
utils.log(f"Built graph with {graph.numberOfNodes():_} Nodes / {graph.numberOfEdges():_} Edges")


utils.log("Saving full graph")
data_dir = utils.data_version_dir(args.version)
all_graph_path = Path(data_dir, "person.all.graph")
graph_tools.write_graph_nk(graph, ids, all_graph_path)

utils.log("Finding largest connected component")
main_component = graph_tools.largest_component_nk(graph)
print(f"Main component size: {main_component.numberOfNodes():,} Nodes / {main_component.numberOfEdges():,} Edges")

utils.log("Saving main component")
main_graph_path = Path(data_dir, "person.main.graph")
# Subset ids to those in main_component.
# NOTE(review): this assumes main_component keeps the node indices of `graph`
# (i.e. largest_component_nk does not renumber nodes) -- confirm in graph_tools.
component_ids = [ids[node] for node in main_component.iterNodes()]
graph_tools.write_graph_nk(main_component, component_ids, main_graph_path)

utils.log("Finished")
43 changes: 0 additions & 43 deletions sqlite_to_graph.py

This file was deleted.

0 comments on commit 06dc9b4

Please sign in to comment.