Commit

More type cleanup.

sligocki committed Feb 27, 2024
1 parent daf557f commit b56bcfd
Showing 6 changed files with 125 additions and 112 deletions.
19 changes: 10 additions & 9 deletions csv_to_parquet.py
@@ -66,7 +66,8 @@
 }


-def rename_columns(table, column_map, assert_all_columns):
+def rename_columns(table : pa.Table, column_map : dict[str, str],
+                   assert_all_columns : bool) -> pa.Table:
     """Rename columns from the original CSV"""

     unspecified_column_names = set(table.column_names) - column_map.keys()
@@ -79,7 +80,7 @@ def rename_columns(table, column_map, assert_all_columns):

     return table.rename_columns(new_names)

-def parse_wikitree_dates(table, column_names):
+def parse_wikitree_dates(table : pa.Table, column_names : list[str]) -> pa.Table:
     """Parse WikiTree dates which may be of many formats"""
     for column_name in column_names:
         array = table[column_name].combine_chunks()
@@ -104,7 +105,7 @@ def parse_wikitree_dates(table, column_names):
             table.schema.get_field_index(column_name), column_name, array)
     return table

-def load_person_csv(csv_path, is_custom):
+def load_person_csv(csv_path : Path, is_custom : bool) -> pa.Table:
     utils.log(f"Loading {str(csv_path)}")
     table = pa.csv.read_csv(csv_path,
         parse_options=pa.csv.ParseOptions(
@@ -142,7 +143,7 @@ def load_person_csv(csv_path, is_custom):

     return table

-def load_marriages_csv(csv_path, is_custom):
+def load_marriages_csv(csv_path : Path, is_custom : bool) -> pa.Table:
     utils.log(f"Loading {str(csv_path)}")
     table = pa.csv.read_csv(csv_path,
         parse_options=pa.csv.ParseOptions(
@@ -165,7 +166,7 @@ def load_marriages_csv(csv_path, is_custom):

     return table

-def load_categories_csv(csv_path):
+def load_categories_csv(csv_path : Path) -> pa.Table:
     utils.log(f"Loading {str(csv_path)}")
     table = pa.csv.read_csv(csv_path,
         parse_options=pa.csv.ParseOptions(
@@ -178,7 +179,7 @@ def load_categories_csv(csv_path):

     return table

-def csv_to_parquet(data_dir):
+def csv_to_parquet(data_dir : Path) -> None:
     person_custom_table = load_person_csv(
         Path("data", "custom_users.csv"), is_custom=True)
     person_table = load_person_csv(
@@ -190,7 +191,7 @@ def csv_to_parquet(data_dir):
     utils.log(f" Filtered out duplicates from custom: {person_table.num_rows:_} rows of people")

     person_table = pa.concat_tables([person_table, person_custom_table], promote=True)
-    pq.write_table(person_table, Path(data_dir, "people.parquet"))
+    pq.write_table(person_table, Path(data_dir, "people.parquet"))  # type: ignore[arg-type]

     utils.log(f"Wrote {person_table.num_rows:_} rows of people")

@@ -200,11 +201,11 @@ def csv_to_parquet(data_dir):
     marriages_table = load_marriages_csv(
         Path(data_dir, "dump_people_marriages.csv"), is_custom=False)
     marriages_table = pa.concat_tables([marriages_table, marriage_custom_table], promote=True)
-    pq.write_table(marriages_table, Path(data_dir, "marriages.parquet"))
+    pq.write_table(marriages_table, Path(data_dir, "marriages.parquet"))  # type: ignore[arg-type]
     utils.log(f"Wrote {marriages_table.num_rows:_} rows of marriages")

     categories_table = load_categories_csv(Path(data_dir, "dump_categories.csv"))
-    pq.write_table(categories_table, Path(data_dir, "categories.parquet"))
+    pq.write_table(categories_table, Path(data_dir, "categories.parquet"))  # type: ignore[arg-type]
     utils.log(f"Wrote {categories_table.num_rows:_} rows of categories")

     utils.log("Done")
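For reference, the new rename_columns signature can be exercised on a small in-memory table. This is a hedged sketch, not code from the commit: the column names and the simplified body are invented for illustration.

import pyarrow as pa

def rename_columns(table: pa.Table, column_map: dict[str, str],
                   assert_all_columns: bool) -> pa.Table:
    # Simplified stand-in with the same signature as the commit's function:
    # map any column found in column_map to its new name, keep the rest.
    new_names = [column_map.get(name, name) for name in table.column_names]
    return table.rename_columns(new_names)

people = pa.table({"User ID": ["1", "2"], "WikiTree ID": ["Adams-1", "Baker-2"]})
renamed = rename_columns(people, {"User ID": "user_num", "WikiTree ID": "wikitree_id"},
                         assert_all_columns=False)
print(renamed.column_names)  # ['user_num', 'wikitree_id']

The # type: ignore[arg-type] comments added to the pq.write_table calls use the per-line, per-error-code suppression syntax understood by checkers such as mypy, presumably to silence a complaint about the destination-path argument.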
33 changes: 18 additions & 15 deletions graph_nk2nx.py
@@ -6,22 +6,25 @@
 import utils


-parser = argparse.ArgumentParser()
-parser.add_argument("in_graph")
-parser.add_argument("out_graph")
-args = parser.parse_args()
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("in_graph")
+    parser.add_argument("out_graph")
+    args = parser.parse_args()

-utils.log("Reading graph")
-Gnk, names_db = graph_tools.load_graph_nk(args.in_graph)
+    utils.log("Reading graph")
+    Gnk, names_db = graph_tools.load_graph_nk(args.in_graph)

-utils.log("Converting graph")
-Gnx = nx.Graph()
-for node in Gnk.iterNodes():
-    Gnx.add_node(names_db.index2name(node))
-for (node_a, node_b) in Gnk.iterEdges():
-    Gnx.add_edge(names_db.index2name(node_a), names_db.index2name(node_b))
+    utils.log("Converting graph")
+    Gnx = nx.Graph()
+    for node in Gnk.iterNodes():
+        Gnx.add_node(names_db.index2name(node))
+    for (node_a, node_b) in Gnk.iterEdges():
+        Gnx.add_edge(names_db.index2name(node_a), names_db.index2name(node_b))

-utils.log("Writing graph")
-graph_tools.write_graph(Gnx, args.out_graph)
+    utils.log("Writing graph")
+    graph_tools.write_graph(Gnx, args.out_graph)

-utils.log("Finished")
+    utils.log("Finished")
+
+main()
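The conversion above relies on the repository's graph_tools and names_db helpers. As a self-contained sketch of the same networkit-to-networkx conversion, a plain dict can stand in for the index-to-name lookup; the node names below are invented.

import networkit as nk
import networkx as nx

# Tiny stand-in graph: 3 nodes, 2 edges.
Gnk = nk.Graph(3)
Gnk.addEdge(0, 1)
Gnk.addEdge(1, 2)
index2name = {0: "Adams-1", 1: "Baker-2", 2: "Clark-3"}  # invented IDs

Gnx = nx.Graph()
for node in Gnk.iterNodes():
    Gnx.add_node(index2name[node])
for node_a, node_b in Gnk.iterEdges():
    Gnx.add_edge(index2name[node_a], index2name[node_b])

print(Gnx.number_of_nodes(), Gnx.number_of_edges())  # 3 2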
61 changes: 32 additions & 29 deletions graph_nx2nk.py
@@ -7,32 +7,35 @@
 import utils


-parser = argparse.ArgumentParser()
-parser.add_argument("in_graph")
-parser.add_argument("out_graph")
-args = parser.parse_args()
-
-utils.log("Reading graph")
-Gnx = graph_tools.load_graph(args.in_graph)
-
-utils.log("Create node id -> num conversion")
-name2index = {node: index for index, node in enumerate(Gnx)}
-
-utils.log("Converting graph")
-# Copy directly so we keep track of name2index
-# Gnk = nk.nxadapter.nx2nk(Gnx)
-Gnk = nk.Graph(Gnx.number_of_nodes())
-for (node_a, node_b) in Gnx.edges:
-    Gnk.addEdge(name2index[node_a], name2index[node_b])
-
-utils.log("Writing graph")
-nk.graphio.writeGraph(Gnk, args.out_graph, nk.Format.METIS)
-
-utils.log("Writing node names")
-names_db = graph_tools.NamesDb(f"{args.out_graph}.names.db")
-names_db.create_table()
-for node_name, index in name2index.items():
-    names_db.insert(index, node_name)
-names_db.commit()
-
-utils.log("Finished")
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("in_graph")
+    parser.add_argument("out_graph")
+    args = parser.parse_args()
+
+    utils.log("Reading graph")
+    Gnx = graph_tools.load_graph(args.in_graph)
+
+    utils.log("Create node id -> num conversion")
+    name2index = {node: index for index, node in enumerate(Gnx)}
+
+    utils.log("Converting graph")
+    # Copy directly so we keep track of name2index
+    # Gnk = nk.nxadapter.nx2nk(Gnx)
+    Gnk = nk.Graph(Gnx.number_of_nodes())
+    for (node_a, node_b) in Gnx.edges:
+        Gnk.addEdge(name2index[node_a], name2index[node_b])
+
+    utils.log("Writing graph")
+    nk.graphio.writeGraph(Gnk, args.out_graph, nk.Format.METIS)
+
+    utils.log("Writing node names")
+    names_db = graph_tools.NamesDb(f"{args.out_graph}.names.db")
+    names_db.create_table()
+    for node_name, index in name2index.items():
+        names_db.insert(index, node_name)
+    names_db.commit()
+
+    utils.log("Finished")
+
+main()
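Going the other direction, the script assigns each named node a dense integer index before building the networkit graph, presumably because the METIS output format identifies vertices only by integer position. A self-contained sketch of that step, with invented node names and without the NamesDb bookkeeping:

import networkit as nk
import networkx as nx

Gnx = nx.Graph()
Gnx.add_edge("Adams-1", "Baker-2")  # invented IDs
Gnx.add_edge("Baker-2", "Clark-3")

# Give every node a dense integer index, as the script does.
name2index = {node: index for index, node in enumerate(Gnx)}
Gnk = nk.Graph(Gnx.number_of_nodes())
for node_a, node_b in Gnx.edges:
    Gnk.addEdge(name2index[node_a], name2index[node_b])

print(Gnk.numberOfNodes(), Gnk.numberOfEdges())  # 3 2

Both converted scripts call main() unconditionally at the bottom of the module; an if __name__ == "__main__": guard would be the usual variant if these scripts were ever imported, but that is only a side note, not something this commit changes.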
2 changes: 1 addition & 1 deletion graph_random.py
@@ -16,7 +16,7 @@
 import utils


-def build_exponential(num_nodes, edges_per_node):
+def build_exponential(num_nodes : int, edges_per_node : int) -> nx.Graph:
     # We always add one edge to each added node. Then we can add some extras anywhere.
     extra_edges_per_node = edges_per_node - 1
     assert extra_edges_per_node >= 0
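Only the top of build_exponential is visible in this hunk. Its comment suggests a construction where every newly added node gets one edge to an existing node, and the remaining edges_per_node - 1 edges per node are sprinkled between random pairs. The following is a guess at that shape, purely for illustration, and is not the repository's implementation:

import random
import networkx as nx

def build_exponential_sketch(num_nodes: int, edges_per_node: int) -> nx.Graph:
    extra_edges_per_node = edges_per_node - 1
    assert extra_edges_per_node >= 0
    G = nx.Graph()
    G.add_node(0)
    for new_node in range(1, num_nodes):
        # Each added node gets one edge to a random existing node ...
        G.add_edge(new_node, random.randrange(new_node))
    # ... then the extras can go anywhere in the graph.
    for _ in range(extra_edges_per_node * num_nodes):
        a, b = random.sample(range(num_nodes), 2)
        G.add_edge(a, b)
    return G

G = build_exponential_sketch(1_000, 3)
print(G.number_of_nodes(), G.number_of_edges())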
63 changes: 33 additions & 30 deletions graph_strip_leaves.py
@@ -9,39 +9,42 @@
 import networkx as nx


-parser = argparse.ArgumentParser()
-parser.add_argument("graph_in")
-parser.add_argument("graph_out")
-args = parser.parse_args()
-
-print("Loading graph", time.process_time())
-graph = nx.read_adjlist(args.graph_in)
-
-print("Find leaf nodes", time.process_time())
-to_delete = set()
-for node in graph.nodes():
-    if graph.degree[node] <= 1:
-        to_delete.add(node)
-
-i = 0
-while to_delete:
-    print(f"Stripping graph: Iter {i} # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,} {len(to_delete):,}", time.process_time())
-    i += 1
-    to_check = set()
-    for node in to_delete:
-        to_check.update(graph.neighbors(node))
-        graph.remove_node(node)
-
-    # We only need to check nodes that were adjacent to a deleted node.
-    # All other nodes still have > 1 degree because none of their edges were rm-ed
-    to_delete = set()
-    for node in to_check:
-        if graph.degree[node] <= 1:
-            to_delete.add(node)
-
-print(f"Final graph: # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,}")
-
-print("Saving to disk", time.process_time())
-nx.write_adjlist(graph, args.graph_out)
-
-print("Done", time.process_time())
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("graph_in")
+    parser.add_argument("graph_out")
+    args = parser.parse_args()
+
+    print("Loading graph", time.process_time())
+    graph = nx.read_adjlist(args.graph_in)
+
+    print("Find leaf nodes", time.process_time())
+    to_delete = set()
+    for node in graph.nodes():
+        if graph.degree[node] <= 1:
+            to_delete.add(node)
+
+    i = 0
+    while to_delete:
+        print(f"Stripping graph: Iter {i} # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,} {len(to_delete):,}", time.process_time())
+        i += 1
+        to_check = set()
+        for node in to_delete:
+            to_check.update(graph.neighbors(node))
+            graph.remove_node(node)
+
+        # We only need to check nodes that were adjacent to a deleted node.
+        # All other nodes still have > 1 degree because none of their edges were rm-ed
+        to_delete = set()
+        for node in to_check:
+            if graph.degree[node] <= 1:
+                to_delete.add(node)
+
+    print(f"Final graph: # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,}")
+
+    print("Saving to disk", time.process_time())
+    nx.write_adjlist(graph, args.graph_out)
+
+    print("Done", time.process_time())
+
+main()
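The loop above repeatedly deletes nodes of degree <= 1 and only re-examines the neighbors of what was just deleted, so each pass peels one more layer off every dangling chain; what survives is the graph's 2-core (the same result as nx.k_core(graph, 2)). A minimal sketch of the same idea on a toy graph, a 4-cycle with a 3-node tail:

import networkx as nx

graph = nx.cycle_graph(4)
graph.add_edges_from([(0, "t1"), ("t1", "t2"), ("t2", "t3")])  # the tail

to_delete = {node for node in graph.nodes() if graph.degree[node] <= 1}
while to_delete:
    to_check = set()
    for node in to_delete:
        to_check.update(graph.neighbors(node))
        graph.remove_node(node)
    # Only former neighbors of deleted nodes can have dropped to degree <= 1.
    to_delete = {node for node in to_check
                 if node in graph and graph.degree[node] <= 1}

print(sorted(graph.nodes()))  # [0, 1, 2, 3]: only the cycle survives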
59 changes: 31 additions & 28 deletions graph_to_components.py
@@ -5,33 +5,36 @@
 import utils


-g = nx.read_adjlist("data/version/default/connection_graph.adj.nx")
-
-# Get list of 5 biggest components and one component for each order of magnitude.
-top_n = utils.TopN(5)
-comps_by_size = {}
-num_comps_by_size = {}
-for comp in nx.connected_components(g):
-    sg = g.subgraph(comp)
-    top_n.add(len(comp), sg)
-    oom = int(math.log(len(comp), 10))
-    if oom not in comps_by_size:
-        comps_by_size[oom] = (len(comp), sg)
-        num_comps_by_size[oom] = 0
-    num_comps_by_size[oom] += 1
-
-
-print("Top components")
-for num_nodes, sg in top_n.items:
-    print(num_nodes)
-    filename = "comp-%d.adj.nx" % num_nodes
-    nx.write_adjlist(sg, filename)
-sizes_written = {num_nodes for num_nodes, _ in top_n.items}
-
-print("Num of components by size")
-for oom in sorted(num_comps_by_size.keys()):
-    print("10^%d %d (ex: %d)" % (oom, num_comps_by_size[oom], comps_by_size[oom][0]))
-    num_nodes, sg = comps_by_size[oom]
-    if num_nodes not in sizes_written:
+def main():
+    g = nx.read_adjlist("data/version/default/connection_graph.adj.nx")
+
+    # Get list of 5 biggest components and one component for each order of magnitude.
+    top_n = utils.TopN(5)
+    comps_by_size = {}
+    num_comps_by_size = {}
+    for comp in nx.connected_components(g):
+        sg = g.subgraph(comp)
+        top_n.add(len(comp), sg)
+        oom = int(math.log(len(comp), 10))
+        if oom not in comps_by_size:
+            comps_by_size[oom] = (len(comp), sg)
+            num_comps_by_size[oom] = 0
+        num_comps_by_size[oom] += 1
+
+
+    print("Top components")
+    for num_nodes, sg in top_n.items:
+        print(num_nodes)
+        filename = "comp-%d.adj.nx" % num_nodes
+        nx.write_adjlist(sg, filename)
+    sizes_written = {num_nodes for num_nodes, _ in top_n.items}
+
+    print("Num of components by size")
+    for oom in sorted(num_comps_by_size.keys()):
+        print("10^%d %d (ex: %d)" % (oom, num_comps_by_size[oom], comps_by_size[oom][0]))
+        num_nodes, sg = comps_by_size[oom]
+        if num_nodes not in sizes_written:
+            filename = "comp-%d.adj.nx" % num_nodes
+            nx.write_adjlist(sg, filename)
+
+main()
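The bucketing above keeps the five largest components plus one example component per order of magnitude of component size. A self-contained sketch of the order-of-magnitude bookkeeping on a toy graph (the component sizes are invented, and utils.TopN and the output files are left out):

import math
import networkx as nx

# Toy graph with components of size 2, 12 and 300.
g = nx.path_graph(2)
g = nx.disjoint_union(g, nx.path_graph(12))
g = nx.disjoint_union(g, nx.path_graph(300))

comps_by_size = {}       # order of magnitude -> (size, example subgraph)
num_comps_by_size = {}   # order of magnitude -> count of components
for comp in nx.connected_components(g):
    oom = int(math.log(len(comp), 10))
    if oom not in comps_by_size:
        comps_by_size[oom] = (len(comp), g.subgraph(comp))
        num_comps_by_size[oom] = 0
    num_comps_by_size[oom] += 1

for oom in sorted(num_comps_by_size):
    print("10^%d %d (ex: %d)" % (oom, num_comps_by_size[oom], comps_by_size[oom][0]))
# 10^0 1 (ex: 2)
# 10^1 1 (ex: 12)
# 10^2 1 (ex: 300)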
