A lot of community cleanup. (1) Only do analysis in community_analyze.py, (2) Switch all community scripts to start with community_.
sligocki committed Dec 6, 2021
1 parent de98681 commit ece54ba
Showing 8 changed files with 138 additions and 345 deletions.
32 changes: 21 additions & 11 deletions circles_communities.py
@@ -1,5 +1,6 @@
 import argparse
 import collections
+import json

 import networkit as nk

@@ -11,7 +12,8 @@
 parser.add_argument("focus")
 parser.add_argument("graph")
 parser.add_argument("communities")
-parser.add_argument("--min-community-size", type=int, default=50_000)
+parser.add_argument("out_circles")
+parser.add_argument("--min-community-size", type=int, default=1000)
 args = parser.parse_args()

 utils.log("Reading graph")
@@ -22,15 +24,14 @@
 communities = nk.community.readCommunities(args.communities)

 community_size_index = [(size, index)
-                        for (index, size) in enumerate(communities.subsetSizes())
-                        if size >= args.min_community_size]
+                        for (index, size) in enumerate(communities.subsetSizes())]
 community_size_index.sort(reverse=True)
-large_communities = [index for (_, index) in community_size_index]
+sorted_comms = [index for (size, index) in community_size_index
+                if size >= args.min_community_size]

 utils.log("Find distances to all nodes from focus")
 comm_circles = {index: collections.Counter()
-                for index in large_communities}
-comm_circles["other"] = collections.Counter()
+                for index in sorted_comms}
+comm_circles["all"] = collections.Counter()

 focus_index = names_db.name2index(args.focus)
@@ -39,18 +40,27 @@

 for dest in G.iterNodes():
   dest_comm = communities[dest]
-  if dest_comm not in comm_circles:
-    dest_comm = "other"
   dist = int(bfs.distance(dest))
-  comm_circles[dest_comm][dist] += 1
+  if dest_comm in comm_circles:
+    comm_circles[dest_comm][dist] += 1
+  comm_circles["all"][dist] += 1

-utils.log("Analyzing each community")
-for comm in ["all"] + large_communities + ["other"]:
+utils.log("Analyzing largest communities")
+for comm in ["all"] + sorted_comms[:20]:
   size = sum(count for (dist, count) in comm_circles[comm].items())
   mean_dist = sum(dist * count for (dist, count) in comm_circles[comm].items()) / size
   mode_count, mode_dist = max((count, dist)
                               for (dist, count) in comm_circles[comm].items())
   print(f"Community {str(comm):>6s} {size=:10_d} / {mode_dist=:3d} {mode_count=:9_d} / {mean_dist=:5.1f}")

+utils.log("Writing circles to file")
+circles_json = {}
+for comm in ["all"] + sorted_comms:
+  name = f"{args.focus}/Community_{comm}"
+  circles_json[name] = [comm_circles[comm][i]
+                        for i in range(max(comm_circles[comm].keys()) + 1)]
+
+with open(args.out_circles, "w") as f:
+  json.dump(circles_json, f)
+
 utils.log("Finished")
3 changes: 2 additions & 1 deletion circles_plot.py
@@ -26,6 +26,7 @@ def median_index(circle_sizes):
 parser = argparse.ArgumentParser()
 parser.add_argument("circles_json", nargs="+", type=Path)
 parser.add_argument("--wikitree-ids", "--ids", nargs="*")
+parser.add_argument("--max-plots", type=int, default=20)

 parser.add_argument("--log-y", action="store_true",
                     help="Plot with log-Y axis.")
@@ -85,7 +86,7 @@ def median_index(circle_sizes):


 utils.log("Plotting Graph")
-ids = args.wikitree_ids if args.wikitree_ids else circle_sizes.keys()
+ids = args.wikitree_ids if args.wikitree_ids else list(circle_sizes.keys())[:args.max_plots]
 for wikitree_id in ids:
   sizes = circle_sizes[wikitree_id]
   xs = list(range(len(sizes)))
15 changes: 10 additions & 5 deletions community_analyze.py
@@ -118,14 +118,20 @@ def summarize_community(index):
   if size <= 50_000:
     closeness = nk.centrality.Closeness(subG, False, nk.centrality.ClosenessVariant.Generalized)
   else:
-    # If we have too many nodes, excact closeness is too slow.
+    # If we have too many nodes, exact closeness is too slow.
     closeness = nk.centrality.ApproxCloseness(subG, 100)
   closeness.run()

+  center, _ = closeness.ranking()[0]
+  bfs = nk.distance.BFS(subG, center)
+  bfs.run()
+
   for node_index, score in closeness.ranking()[:10]:
     node_name = names_db.index2name(node_index)
     user_nums = name2users(node_name)
     id_str = "/".join(db.num2id(user_num) for user_num in user_nums)
-    print(f" - {1/score:6.2f} {id_str}")
+    dist_center = int(bfs.distance(node_index))
+    print(f" - {1/score:6.2f} {dist_center:3d} {id_str}")


 print()
@@ -146,9 +152,8 @@ def summarize_community(index):
     "Gardahaut-1",
     "Vatant-5",
     "Andersson-5056",
-    # "Mars-121",
-    # "Lothrop-29",
-    # "Windsor-1",
+    "Mars-121",
+    "Lothrop-29",
     ]:
   node_index = names_db.name2index(node_name)
   community_index = communities[node_index]
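
The new center/BFS logic above reports, for each top-closeness profile, its graph distance from the community's most central node. A self-contained sketch of that pattern on a random stand-in graph (the generator choice and sizes are illustrative, not from the repo):

import networkit as nk

# Stand-in for a community subgraph.
subG = nk.generators.ErdosRenyiGenerator(200, 0.05).generate()

closeness = nk.centrality.Closeness(subG, False,
                                    nk.centrality.ClosenessVariant.Generalized)
closeness.run()

# The highest-closeness node acts as the community "center".
center, _ = closeness.ranking()[0]
bfs = nk.distance.BFS(subG, center)
bfs.run()

for node, score in closeness.ranking()[:10]:
  # 1/score is the distance-style figure community_analyze.py prints;
  # bfs.distance(node) is the hop count from the center.
  print(f" - {1/score:6.2f} {int(bfs.distance(node)):3d} node={node}")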
28 changes: 28 additions & 0 deletions community_generate.py
@@ -0,0 +1,28 @@
+"""
+Identify communities of graph and save to file.
+"""
+
+import argparse
+
+import networkit as nk
+
+import graph_tools
+import utils
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("graph")
+parser.add_argument("out_communities")
+args = parser.parse_args()
+
+utils.log("Reading graph")
+G, names_db = graph_tools.load_graph_nk(args.graph)
+utils.log(f"Loaded graph with {G.numberOfNodes():_} nodes / {G.numberOfEdges():_} edges")
+
+utils.log("Calculating communities")
+communities = nk.community.detectCommunities(G)
+
+utils.log("Writing Communities to file")
+nk.community.writeCommunities(communities, args.out_communities)
+
+utils.log("Finished")
52 changes: 52 additions & 0 deletions community_intersect.py
@@ -0,0 +1,52 @@
+"""
+Intersect multiple community partitions into a single partition.
+Communities in this "intersection partition" are the collections of people who
+were categorized into the same community in all input partitions.
+"""
+
+import argparse
+import collections
+
+import networkit as nk
+
+import graph_tools
+import utils
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("graph")
+parser.add_argument("in_communities", nargs="+")
+parser.add_argument("--out-communities")
+args = parser.parse_args()
+
+utils.log("Reading graph")
+G, names_db = graph_tools.load_graph_nk(args.graph)
+utils.log(f"Loaded graph with {G.numberOfNodes():_} nodes / {G.numberOfEdges():_} edges")
+
+utils.log(f"Reading {len(args.in_communities)} partitions")
+partitions = [nk.community.readCommunities(community_file)
+              for community_file in args.in_communities]
+
+utils.log("Calculating the intersection of all partitions")
+intersect_partition = collections.defaultdict(list)
+node2comm = {}
+for node in G.iterNodes():
+  all_comms = [partition[node] for partition in partitions]
+  intersect_name = ",".join(str(comm) for comm in all_comms)
+  node2comm[node] = intersect_name
+  intersect_partition[intersect_name].append(node)
+utils.log(f"Found {len(intersect_partition)} partition intersections")
+
+utils.log("Sorting intersections by size")
+size_part = [(len(nodes), name) for (name, nodes) in intersect_partition.items()]
+size_part.sort(reverse=True)
+name2index = {name: index for (index, (_, name)) in enumerate(size_part)}
+
+utils.log("Writing intersections")
+with open(args.out_communities, "w") as f:
+  for node in G.iterNodes():
+    comm_name = node2comm[node]
+    comm_index = name2index[comm_name]
+    f.write(f"{comm_index}\n")
+
+utils.log("Finished")
37 changes: 25 additions & 12 deletions dump_build.sh
@@ -20,25 +20,38 @@ done
 echo
 echo "(2) Process new dump"
 time python3 csv_to_sqlite.py --version=${TIMESTAMP}

 time bash process_categories.sh ${TIMESTAMP}

-time python3 graph_make_family.py --version=${TIMESTAMP}
-time python3 graph_core.py ${VERSION_DIR}/family.main.adj.nx \
-  ${VERSION_DIR}/family.core.adj.nx \
-  ${VERSION_DIR}/family.core.collapse.csv
-time python3 graph_core_annotate.py ${VERSION_DIR}/family.main.adj.nx \
-  ${VERSION_DIR}/family.core.adj.nx \
-  ${VERSION_DIR}/family.core.weighted.edgelist.nx.gz \
-  ${VERSION_DIR}/family.core.collapse.csv \
-  ${VERSION_DIR}/family.dist_to_core.db
-# time python3 graph_make_family_bipartite.py --version=${TIMESTAMP}
+# mkdir -p ${VERSION_DIR}/communities/
+# for x in $(seq 0 9); do
+#   time python3 community_generate.py \
+#     ${VERSION_DIR}/fam_bipartite.main.graph \
+#     ${VERSION_DIR}/communities/fam_bipartite.main.comm_${x}
+# done
+# time python3 community_intersect.py \
+#   ${VERSION_DIR}/fam_bipartite.main.graph \
+#   ${VERSION_DIR}/communities/fam_bipartite.main.comm_* \
+#   --out-communities=${VERSION_DIR}/communities/fam_bipartite.main.inter_10
+# time python3 community_analyze.py \
+#   ${VERSION_DIR}/fam_bipartite.main.graph \
+#   ${VERSION_DIR}/communities/fam_bipartite.main.inter_10
+
+# time python3 graph_make_family.py --version=${TIMESTAMP}
+# time python3 graph_core.py ${VERSION_DIR}/family.main.adj.nx \
+#   ${VERSION_DIR}/family.core.adj.nx \
+#   ${VERSION_DIR}/family.core.collapse.csv
+# time python3 graph_core_annotate.py ${VERSION_DIR}/family.main.adj.nx \
+#   ${VERSION_DIR}/family.core.adj.nx \
+#   ${VERSION_DIR}/family.core.weighted.edgelist.nx.gz \
+#   ${VERSION_DIR}/family.core.collapse.csv \
+#   ${VERSION_DIR}/family.dist_to_core.db

 # Note: Skipping big graphs
 # time python3 sqlite_to_graph.py --version=${TIMESTAMP}
+# time python3 graph_make_family_bipartite.py --version=${TIMESTAMP}

 # Load connected components of graph
-time python3 csv_to_partitions.py --version=${TIMESTAMP}
+# time python3 csv_to_partitions.py --version=${TIMESTAMP}
 # time python3 csv_to_partitions.py --version=${TIMESTAMP} --sibling-in-law

 echo