A lot of community cleanup. (1) Only do analysis in community_analyze.py, (2) Switch all community scripts to start with community_.
sligocki committed Dec 6, 2021
1 parent de98681 commit ece54ba
Showing 8 changed files with 138 additions and 345 deletions.
32 changes: 21 additions & 11 deletions circles_communities.py
@@ -1,5 +1,6 @@
 import argparse
 import collections
+import json

 import networkit as nk

@@ -11,7 +12,8 @@
 parser.add_argument("focus")
 parser.add_argument("graph")
 parser.add_argument("communities")
-parser.add_argument("--min-community-size", type=int, default=50_000)
+parser.add_argument("out_circles")
+parser.add_argument("--min-community-size", type=int, default=1000)
 args = parser.parse_args()

 utils.log("Reading graph")
@@ -22,15 +24,14 @@
 communities = nk.community.readCommunities(args.communities)

 community_size_index = [(size, index)
-                        for (index, size) in enumerate(communities.subsetSizes())
-                        if size >= args.min_community_size]
+                        for (index, size) in enumerate(communities.subsetSizes())]
 community_size_index.sort(reverse=True)
-large_communities = [index for (_, index) in community_size_index]
+sorted_comms = [index for (size, index) in community_size_index
+                if size >= args.min_community_size]

 utils.log("Find distances to all nodes from focus")
 comm_circles = {index: collections.Counter()
-                for index in large_communities}
-comm_circles["other"] = collections.Counter()
+                for index in sorted_comms}
+comm_circles["all"] = collections.Counter()

 focus_index = names_db.name2index(args.focus)
@@ -39,18 +40,27 @@

 for dest in G.iterNodes():
   dest_comm = communities[dest]
-  if dest_comm not in comm_circles:
-    dest_comm = "other"
   dist = int(bfs.distance(dest))
-  comm_circles[dest_comm][dist] += 1
+  if dest_comm in comm_circles:
+    comm_circles[dest_comm][dist] += 1
+  comm_circles["all"][dist] += 1

-utils.log("Analyzing each community")
-for comm in ["all"] + large_communities + ["other"]:
+utils.log("Analyzing largest communities")
+for comm in ["all"] + sorted_comms[:20]:
   size = sum(count for (dist, count) in comm_circles[comm].items())
   mean_dist = sum(dist * count for (dist, count) in comm_circles[comm].items()) / size
   mode_count, mode_dist = max((count, dist)
                               for (dist, count) in comm_circles[comm].items())
   print(f"Community {str(comm):>6s} {size=:10_d} / {mode_dist=:3d} {mode_count=:9_d} / {mean_dist=:5.1f}")

+utils.log("Writing circles to file")
+circles_json = {}
+for comm in ["all"] + sorted_comms:
+  name = f"{args.focus}/Community_{comm}"
+  circles_json[name] = [comm_circles[comm][i]
+                        for i in range(max(comm_circles[comm].keys()) + 1)]
+
+with open(args.out_circles, "w") as f:
+  json.dump(circles_json, f)
+
 utils.log("Finished")
3 changes: 2 additions & 1 deletion circles_plot.py
@@ -26,6 +26,7 @@ def median_index(circle_sizes):
 parser = argparse.ArgumentParser()
 parser.add_argument("circles_json", nargs="+", type=Path)
 parser.add_argument("--wikitree-ids", "--ids", nargs="*")
+parser.add_argument("--max-plots", type=int, default=20)

 parser.add_argument("--log-y", action="store_true",
                     help="Plot with log-Y axis.")
@@ -85,7 +86,7 @@ def median_index(circle_sizes):


 utils.log("Plotting Graph")
-ids = args.wikitree_ids if args.wikitree_ids else circle_sizes.keys()
+ids = args.wikitree_ids if args.wikitree_ids else list(circle_sizes.keys())[:args.max_plots]
 for wikitree_id in ids:
   sizes = circle_sizes[wikitree_id]
   xs = list(range(len(sizes)))
15 changes: 10 additions & 5 deletions community_analyze.py
@@ -118,14 +118,20 @@ def summarize_community(index):
   if size <= 50_000:
     closeness = nk.centrality.Closeness(subG, False, nk.centrality.ClosenessVariant.Generalized)
   else:
-    # If we have too many nodes, excact closeness is too slow.
+    # If we have too many nodes, exact closeness is too slow.
     closeness = nk.centrality.ApproxCloseness(subG, 100)
   closeness.run()

+  center, _ = closeness.ranking()[0]
+  bfs = nk.distance.BFS(subG, center)
+  bfs.run()
+
   for node_index, score in closeness.ranking()[:10]:
     node_name = names_db.index2name(node_index)
     user_nums = name2users(node_name)
     id_str = "/".join(db.num2id(user_num) for user_num in user_nums)
-    print(f" - {1/score:6.2f} {id_str}")
+    dist_center = int(bfs.distance(node_index))
+    print(f" - {1/score:6.2f} {dist_center:3d} {id_str}")


 print()
@@ -146,9 +152,8 @@ def summarize_community(index):
     "Gardahaut-1",
     "Vatant-5",
     "Andersson-5056",
-    # "Mars-121",
-    # "Lothrop-29",
-    # "Windsor-1",
+    "Mars-121",
+    "Lothrop-29",
     ]:
   node_index = names_db.name2index(node_name)
   community_index = communities[node_index]
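
The new center/BFS logic above reports, for each top-closeness profile, its graph distance from the community's most central node. A self-contained sketch of that pattern on a random stand-in graph (the generator choice and sizes are illustrative, not from the repo):

import networkit as nk

# Stand-in for a community subgraph.
subG = nk.generators.ErdosRenyiGenerator(200, 0.05).generate()

closeness = nk.centrality.Closeness(subG, False,
                                    nk.centrality.ClosenessVariant.Generalized)
closeness.run()

# The highest-closeness node acts as the community "center".
center, _ = closeness.ranking()[0]
bfs = nk.distance.BFS(subG, center)
bfs.run()

for node, score in closeness.ranking()[:10]:
  # 1/score is the distance-style figure community_analyze.py prints;
  # bfs.distance(node) is the hop count from the center.
  print(f" - {1/score:6.2f} {int(bfs.distance(node)):3d} node={node}")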
28 changes: 28 additions & 0 deletions community_generate.py
@@ -0,0 +1,28 @@
+"""
+Identify communities of graph and save to file.
+"""
+
+import argparse
+
+import networkit as nk
+
+import graph_tools
+import utils
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("graph")
+parser.add_argument("out_communities")
+args = parser.parse_args()
+
+utils.log("Reading graph")
+G, names_db = graph_tools.load_graph_nk(args.graph)
+utils.log(f"Loaded graph with {G.numberOfNodes():_} nodes / {G.numberOfEdges():_} edges")
+
+utils.log("Calculating communities")
+communities = nk.community.detectCommunities(G)
+
+utils.log("Writing Communities to file")
+nk.community.writeCommunities(communities, args.out_communities)
+
+utils.log("Finished")
52 changes: 52 additions & 0 deletions community_intersect.py
@@ -0,0 +1,52 @@
+"""
+Intersect multiple community partitions into a single partition.
+Communities in this "intersection partition" are the collections of people who
+were categorized into the same community in all input partitions.
+"""
+
+import argparse
+import collections
+
+import networkit as nk
+
+import graph_tools
+import utils
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("graph")
+parser.add_argument("in_communities", nargs="+")
+parser.add_argument("--out-communities")
+args = parser.parse_args()
+
+utils.log("Reading graph")
+G, names_db = graph_tools.load_graph_nk(args.graph)
+utils.log(f"Loaded graph with {G.numberOfNodes():_} nodes / {G.numberOfEdges():_} edges")
+
+utils.log(f"Reading {len(args.in_communities)} partitions")
+partitions = [nk.community.readCommunities(community_file)
+              for community_file in args.in_communities]
+
+utils.log("Calculating the intersection of all partitions")
+intersect_partition = collections.defaultdict(list)
+node2comm = {}
+for node in G.iterNodes():
+  all_comms = [partition[node] for partition in partitions]
+  intersect_name = ",".join(str(comm) for comm in all_comms)
+  node2comm[node] = intersect_name
+  intersect_partition[intersect_name].append(node)
+utils.log(f"Found {len(intersect_partition)} partition intersections")
+
+utils.log("Sorting intersections by size")
+size_part = [(len(nodes), name) for (name, nodes) in intersect_partition.items()]
+size_part.sort(reverse=True)
+name2index = {name: index for (index, (_, name)) in enumerate(size_part)}
+
+utils.log("Writing intersections")
+with open(args.out_communities, "w") as f:
+  for node in G.iterNodes():
+    comm_name = node2comm[node]
+    comm_index = name2index[comm_name]
+    f.write(f"{comm_index}\n")
+
+utils.log("Finished")
37 changes: 25 additions & 12 deletions dump_build.sh
@@ -20,25 +20,38 @@ done
 echo
 echo "(2) Process new dump"
 time python3 csv_to_sqlite.py --version=${TIMESTAMP}

 time bash process_categories.sh ${TIMESTAMP}

-time python3 graph_make_family.py --version=${TIMESTAMP}
-time python3 graph_core.py ${VERSION_DIR}/family.main.adj.nx \
-  ${VERSION_DIR}/family.core.adj.nx \
-  ${VERSION_DIR}/family.core.collapse.csv
-time python3 graph_core_annotate.py ${VERSION_DIR}/family.main.adj.nx \
-  ${VERSION_DIR}/family.core.adj.nx \
-  ${VERSION_DIR}/family.core.weighted.edgelist.nx.gz \
-  ${VERSION_DIR}/family.core.collapse.csv \
-  ${VERSION_DIR}/family.dist_to_core.db
-# time python3 graph_make_family_bipartite.py --version=${TIMESTAMP}
+# mkdir -p ${VERSION_DIR}/communities/
+# for x in $(seq 0 9); do
+#   time python3 community_generate.py \
+#     ${VERSION_DIR}/fam_bipartite.main.graph \
+#     ${VERSION_DIR}/communities/fam_bipartite.main.comm_${x}
+# done
+# time python3 community_intersect.py \
+#   ${VERSION_DIR}/fam_bipartite.main.graph \
+#   ${VERSION_DIR}/communities/fam_bipartite.main.comm_* \
+#   --out-communities=${VERSION_DIR}/communities/fam_bipartite.main.inter_10
+# time python3 community_analyze.py \
+#   ${VERSION_DIR}/fam_bipartite.main.graph \
+#   ${VERSION_DIR}/communities/fam_bipartite.main.inter_10
+
+# time python3 graph_make_family.py --version=${TIMESTAMP}
+# time python3 graph_core.py ${VERSION_DIR}/family.main.adj.nx \
+#   ${VERSION_DIR}/family.core.adj.nx \
+#   ${VERSION_DIR}/family.core.collapse.csv
+# time python3 graph_core_annotate.py ${VERSION_DIR}/family.main.adj.nx \
+#   ${VERSION_DIR}/family.core.adj.nx \
+#   ${VERSION_DIR}/family.core.weighted.edgelist.nx.gz \
+#   ${VERSION_DIR}/family.core.collapse.csv \
+#   ${VERSION_DIR}/family.dist_to_core.db

 # Note: Skipping big graphs
 # time python3 sqlite_to_graph.py --version=${TIMESTAMP}
+# time python3 graph_make_family_bipartite.py --version=${TIMESTAMP}

 # Load connected components of graph
-time python3 csv_to_partitions.py --version=${TIMESTAMP}
+# time python3 csv_to_partitions.py --version=${TIMESTAMP}
 # time python3 csv_to_partitions.py --version=${TIMESTAMP} --sibling-in-law

 echo