Skip to content

Commit

Permalink
mypy passes on all!
Browse files Browse the repository at this point in the history
  • Loading branch information
sligocki committed Feb 27, 2024
1 parent b5c685c commit c6808c3
Show file tree
Hide file tree
Showing 12 changed files with 294 additions and 262 deletions.
97 changes: 50 additions & 47 deletions circles_communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,60 +8,63 @@
import utils


def main():
  """Tally BFS distance "circles" from a focus node, split by community.

  Reads a networkit graph and a community partition, runs a BFS from the
  focus node, counts nodes at each distance per large community (plus an
  "all" bucket for the whole graph), prints summary statistics for the 20
  largest communities, and writes per-community circle sizes to a JSON file.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("focus")
  parser.add_argument("graph")
  parser.add_argument("communities")
  parser.add_argument("out_circles")
  parser.add_argument("--min-community-size", type=int, default=1000)
  args = parser.parse_args()

  utils.log("Reading graph")
  G, names_db = graph_tools.load_graph_nk(args.graph)
  utils.log(f"Loaded graph with {G.numberOfNodes():_} nodes / {G.numberOfEdges():_} edges")

  utils.log("Reading communities")
  communities = nk.community.readCommunities(args.communities)

  # Community indexes ordered by size (largest first), restricted to
  # communities of at least --min-community-size members.
  community_size_index = [(size, index)
                          for (index, size) in enumerate(communities.subsetSizes())]
  community_size_index.sort(reverse=True)
  sorted_comms = [index for (size, index) in community_size_index
                  if size >= args.min_community_size]

  utils.log("Find distances to all nodes from focus")
  # Maps community index -> Counter(distance -> node count).  The special
  # key "all" accumulates counts over every node regardless of community.
  comm_circles = {index: collections.Counter()
                  for index in sorted_comms}
  comm_circles["all"] = collections.Counter()

  focus_index = names_db.name2index(args.focus)
  bfs = nk.distance.BFS(G, focus_index)
  bfs.run()

  for dest in G.iterNodes():
    dest_comm = communities[dest]
    dist = int(bfs.distance(dest))
    # Only large communities are tracked individually; everything counts
    # toward "all".
    if dest_comm in comm_circles:
      comm_circles[dest_comm][dist] += 1
    comm_circles["all"][dist] += 1

  utils.log("Analyzing largest communities")
  for comm in ["all"] + sorted_comms[:20]:
    size = sum(comm_circles[comm].values())
    mean_dist = sum(dist * count for (dist, count) in comm_circles[comm].items()) / size
    # max over (count, dist) pairs picks the modal distance (ties broken by
    # larger distance).
    mode_count, mode_dist = max((count, dist)
                                for (dist, count) in comm_circles[comm].items())
    print(f"Community {str(comm):>6s} {size=:10_d} / {mode_dist=:3d} {mode_count=:9_d} / {mean_dist=:5.1f}")

  utils.log("Writing circles to file")
  circles_json = {}
  for comm in ["all"] + sorted_comms:
    name = f"{args.focus}/Community_{comm}"
    circles_json[name] = [comm_circles[comm][i]
                          for i in range(max(comm_circles[comm].keys()) + 1)
                          if (i % 2) == 0]  # TODO: Only do this for bipartite?

  with open(args.out_circles, "w") as f:
    json.dump(circles_json, f)

  utils.log("Finished")


if __name__ == "__main__":
  main()
89 changes: 46 additions & 43 deletions circles_perspective.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,46 +7,49 @@
import utils


def main():
  """Measure how a BFS circle around one node looks from another node.

  Identifies all nodes at exactly `circle_num` hops from `focus_a`, then
  runs a BFS from `focus_b` and reports the distribution of distances from
  `focus_b` to that circle (histogram, mean, and mode).
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("focus_a")
  parser.add_argument("circle_num", type=int)
  parser.add_argument("focus_b")
  parser.add_argument("--graph")
  args = parser.parse_args()

  utils.log("Reading graph")
  G, names_db = graph_tools.load_graph_nk(args.graph)
  utils.log(f"Loaded graph with {G.numberOfNodes():_} nodes / {G.numberOfEdges():_} edges")

  utils.log("Computing BFS around focus_a")
  focus_index = names_db.name2index(args.focus_a)
  bfs = nk.distance.BFS(G, focus_index)
  bfs.run()

  utils.log(f"Identifying all nodes in circle {args.circle_num}")
  circle = set()
  for node in G.iterNodes():
    if int(bfs.distance(node)) == args.circle_num:
      circle.add(node)
  utils.log(f"Found {len(circle):_} nodes at dist {args.circle_num}")


  utils.log("Computing BFS around focus_b")
  focus_index = names_db.name2index(args.focus_b)
  bfs = nk.distance.BFS(G, focus_index)
  bfs.run()

  utils.log("Collecting distances to identified circle")
  # Counter(distance from focus_b -> number of circle nodes at that distance).
  dists = collections.Counter()
  for node in circle:
    dists[int(bfs.distance(node))] += 1
  print("Distances: ", [dists[i] for i in range(max(dists.keys()) + 1)])

  utils.log("Analyzing distribution")
  size = sum(dists.values())
  mean_dist = sum(dist * count for (dist, count) in dists.items()) / size
  # max over (count, dist) pairs picks the modal distance.
  mode_count, mode_dist = max((count, dist)
                              for (dist, count) in dists.items())
  print(f"Summary: {size=:10_d} / {mode_dist=:3d} {mode_count=:9_d} / {mean_dist=:5.1f}")

  utils.log("Finished")


if __name__ == "__main__":
  main()
101 changes: 52 additions & 49 deletions complete_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,52 +54,55 @@ def plot_deg_dist_single(deg_dist, ax, type, label = "Degree Distribution",
ax.legend(loc = "upper right")


def main():
  """Compare degree distributions across increasing levels of data completeness.

  Builds five nested subsets of people (each a stricter completeness filter
  than the last), computes the degree distribution of each subset, and plots
  them together on a log-linear chart.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("--version", help="Data version (defaults to most recent).")
  args = parser.parse_args()

  db = data_reader.Database(args.version)

  # Levels of completeness; each level is a subset of the previous one.
  people = {}
  utils.log("Loading Level 1: Marked no_more_children")
  people["no_more_children"] = frozenset(db.enum_people_no_more_children())
  utils.log(f"# no_more_children = {len(people['no_more_children']):_}")
  utils.log("Loading Level 2: Have both parents linked")
  people["both_parents"] = frozenset(p for p in people["no_more_children"]
                                     if len(db.parents_of(p)) == 2)
  utils.log(f"# both_parents = {len(people['both_parents']):_}")
  utils.log("Loading Level 3: Have birth and death dates")
  people["vital_dates"] = frozenset(p for p in people["both_parents"]
                                    if db.age_at_death_of(p) is not None)
  utils.log(f"# vital_dates = {len(people['vital_dates']):_}")
  utils.log("Loading Level 4: At least one parent is marked no_more_children (so we guess this means no more siblings)")
  # This is not exactly correct. Technically, we should require both parents to be
  # in people["no_more_children"], but this is a heuristic.
  people["no_more_siblings"] = frozenset(p for p in people["vital_dates"]
                                         if db.mother_of(p) in people["no_more_children"]
                                         or db.father_of(p) in people["no_more_children"])
  utils.log(f"# no_more_siblings = {len(people['no_more_siblings']):_}")
  utils.log("Loading Level 5: Ignore people who died as children, they are over-represented")
  year_delta = datetime.timedelta(days = 365.24)
  people["not_child"] = frozenset(p for p in people["no_more_siblings"]
                                  if db.age_at_death_of(p) > 15 * year_delta)
  utils.log(f"# not_child = {len(people['not_child']):_}")


  utils.log("Processing degree")
  # Maps completeness level -> Counter(degree -> number of people).
  # Note: renamed loop var from `type` to `level` to avoid shadowing the builtin.
  degree_distr = {}
  for level in people.keys():
    degree_distr[level] = collections.Counter()
    for person in people[level]:
      degree = len(db.neighbors_of(person))
      degree_distr[level][degree] += 1

  utils.log("Plotting results")
  fig, ax = plt.subplots()
  for level in people.keys():
    plot_deg_dist_single(degree_distr[level], ax, "Log-Linear", level)

  plt.show()

  utils.log("Done")


if __name__ == "__main__":
  main()
75 changes: 39 additions & 36 deletions connection_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,39 +16,42 @@
import utils


def main():
  """Estimate the mean distance between random node pairs in the main tree.

  Repeatedly samples two random people from the "connected" partition,
  finds the shortest connection between them, and logs a running histogram,
  mean, and standard deviation of the distances.  Runs forever; interrupt
  to stop (results are logged after every sample).
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("--version", help="Data version (defaults to most recent).")
  parser.add_argument("--load-db", action="store_true")
  args = parser.parse_args()

  utils.log("Load DB")
  db = data_reader.Database(version=args.version)
  if args.load_db:
    db.load_connections()
  partition_db = partition_tools.PartitionDb(version=args.version)

  utils.log("Loading all user_nums in main tree")
  focus_id = db.id2num("Lothrop-29")
  rep = partition_db.find_partition_rep("connected", focus_id)
  main_nums = list(partition_db.list_partition("connected", rep))
  utils.log(f"Loaded {len(main_nums):_} nodes")

  hist = collections.Counter()
  total = 0   # running sum of distances
  total2 = 0  # running sum of squared distances (for stddev)
  # Intentionally infinite sampling loop: statistics improve the longer it runs.
  for i in itertools.count():
    start_time = time.time()
    start_num = random.choice(main_nums)
    end_num = random.choice(main_nums)
    paths = connection.find_connections(db, start_num, end_num)
    # First path yielded is a shortest one; distance = edges = nodes - 1.
    dist = len(next(paths)) - 1
    utils.log(f"Distance {i}: {start_num} -> {end_num} = {dist} ({time.time() - start_time:.1f}s)")

    hist[dist] += 1
    total += dist
    total2 += dist**2
    count = i + 1
    mean = total / count
    stddev = math.sqrt(total2 / count - mean**2)
    utils.log(f"Mean dist = {mean:.1f} ± {stddev:.1f}")
    utils.log("Dist", [hist[i] for i in range(max(hist.keys()) + 1)])


if __name__ == "__main__":
  main()
Loading

0 comments on commit c6808c3

Please sign in to comment.