diff --git a/csv_to_parquet.py b/csv_to_parquet.py
index 32bb2d4..26b90f2 100644
--- a/csv_to_parquet.py
+++ b/csv_to_parquet.py
@@ -66,7 +66,8 @@
 }
 
 
-def rename_columns(table, column_map, assert_all_columns):
+def rename_columns(table: pa.Table, column_map: dict[str, str],
+                   assert_all_columns: bool) -> pa.Table:
   """Rename columns from the original CSV"""
   unspecified_column_names = set(table.column_names) - column_map.keys()
 
@@ -79,7 +80,7 @@ def rename_columns(table, column_map, assert_all_columns):
   return table.rename_columns(new_names)
 
 
-def parse_wikitree_dates(table, column_names):
+def parse_wikitree_dates(table: pa.Table, column_names: list[str]) -> pa.Table:
   """Parse WikiTree dates which may be of many formats"""
   for column_name in column_names:
     array = table[column_name].combine_chunks()
@@ -104,7 +105,7 @@ def parse_wikitree_dates(table, column_names):
       table.schema.get_field_index(column_name), column_name, array)
   return table
 
-def load_person_csv(csv_path, is_custom):
+def load_person_csv(csv_path: Path, is_custom: bool) -> pa.Table:
   utils.log(f"Loading {str(csv_path)}")
   table = pa.csv.read_csv(csv_path,
                           parse_options=pa.csv.ParseOptions(
@@ -142,7 +143,7 @@ def load_person_csv(csv_path, is_custom):
   return table
 
 
-def load_marriages_csv(csv_path, is_custom):
+def load_marriages_csv(csv_path: Path, is_custom: bool) -> pa.Table:
   utils.log(f"Loading {str(csv_path)}")
   table = pa.csv.read_csv(csv_path,
                           parse_options=pa.csv.ParseOptions(
@@ -165,7 +166,7 @@ def load_marriages_csv(csv_path, is_custom):
   return table
 
 
-def load_categories_csv(csv_path):
+def load_categories_csv(csv_path: Path) -> pa.Table:
   utils.log(f"Loading {str(csv_path)}")
   table = pa.csv.read_csv(csv_path,
                           parse_options=pa.csv.ParseOptions(
@@ -178,7 +179,7 @@ def load_categories_csv(csv_path):
   return table
 
 
-def csv_to_parquet(data_dir):
+def csv_to_parquet(data_dir: Path) -> None:
   person_custom_table = load_person_csv(
     Path("data", "custom_users.csv"), is_custom=True)
   person_table = load_person_csv(
@@ -190,7 +191,7 @@ def csv_to_parquet(data_dir):
   utils.log(f" Filtered out duplicates from custom: {person_table.num_rows:_} rows of people")
   person_table = pa.concat_tables([person_table, person_custom_table], promote=True)
 
-  pq.write_table(person_table, Path(data_dir, "people.parquet"))
+  pq.write_table(person_table, Path(data_dir, "people.parquet"))  # type: ignore[arg-type]
   utils.log(f"Wrote {person_table.num_rows:_} rows of people")
 
 
@@ -200,11 +201,11 @@ def csv_to_parquet(data_dir):
   marriages_table = load_marriages_csv(
     Path(data_dir, "dump_people_marriages.csv"), is_custom=False)
   marriages_table = pa.concat_tables([marriages_table, marriage_custom_table], promote=True)
-  pq.write_table(marriages_table, Path(data_dir, "marriages.parquet"))
+  pq.write_table(marriages_table, Path(data_dir, "marriages.parquet"))  # type: ignore[arg-type]
   utils.log(f"Wrote {marriages_table.num_rows:_} rows of marriages")
 
   categories_table = load_categories_csv(Path(data_dir, "dump_categories.csv"))
-  pq.write_table(categories_table, Path(data_dir, "categories.parquet"))
+  pq.write_table(categories_table, Path(data_dir, "categories.parquet"))  # type: ignore[arg-type]
   utils.log(f"Wrote {categories_table.num_rows:_} rows of categories")
 
   utils.log("Done")
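
Review note on the three `# type: ignore[arg-type]` suppressions: they suggest the
checker's pyarrow stubs declare pq.write_table's destination more narrowly than what
the library accepts at runtime. A minimal sketch of an alternative that keeps the
call sites clean, built around a hypothetical write_parquet helper (not in this patch):

    from pathlib import Path

    import pyarrow as pa
    import pyarrow.parquet as pq

    def write_parquet(table: pa.Table, data_dir: Path, name: str) -> None:
      # Stringifying the Path satisfies stubs that only accept str or a
      # NativeFile; runtime behavior is unchanged.
      pq.write_table(table, str(Path(data_dir, name)))

Either approach works; the helper just concentrates the workaround in one place.
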
diff --git a/graph_nk2nx.py b/graph_nk2nx.py
index fbaecf0..4ea9a5e 100644
--- a/graph_nk2nx.py
+++ b/graph_nk2nx.py
@@ -6,22 +6,25 @@
 import utils
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument("in_graph")
-parser.add_argument("out_graph")
-args = parser.parse_args()
+def main():
+  parser = argparse.ArgumentParser()
+  parser.add_argument("in_graph")
+  parser.add_argument("out_graph")
+  args = parser.parse_args()
 
-utils.log("Reading graph")
-Gnk, names_db = graph_tools.load_graph_nk(args.in_graph)
+  utils.log("Reading graph")
+  Gnk, names_db = graph_tools.load_graph_nk(args.in_graph)
 
-utils.log("Converting graph")
-Gnx = nx.Graph()
-for node in Gnk.iterNodes():
-  Gnx.add_node(names_db.index2name(node))
-for (node_a, node_b) in Gnk.iterEdges():
-  Gnx.add_edge(names_db.index2name(node_a), names_db.index2name(node_b))
+  utils.log("Converting graph")
+  Gnx = nx.Graph()
+  for node in Gnk.iterNodes():
+    Gnx.add_node(names_db.index2name(node))
+  for (node_a, node_b) in Gnk.iterEdges():
+    Gnx.add_edge(names_db.index2name(node_a), names_db.index2name(node_b))
 
-utils.log("Writing graph")
-graph_tools.write_graph(Gnx, args.out_graph)
+  utils.log("Writing graph")
+  graph_tools.write_graph(Gnx, args.out_graph)
 
-utils.log("Finished")
+  utils.log("Finished")
+
+main()
diff --git a/graph_nx2nk.py b/graph_nx2nk.py
index 23a1164..f4eebe2 100644
--- a/graph_nx2nk.py
+++ b/graph_nx2nk.py
@@ -7,32 +7,35 @@
 import utils
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument("in_graph")
-parser.add_argument("out_graph")
-args = parser.parse_args()
-
-utils.log("Reading graph")
-Gnx = graph_tools.load_graph(args.in_graph)
-
-utils.log("Create node id -> num conversion")
-name2index = {node: index for index, node in enumerate(Gnx)}
-
-utils.log("Converting graph")
-# Copy directly so we keep track of name2index
-# Gnk = nk.nxadapter.nx2nk(Gnx)
-Gnk = nk.Graph(Gnx.number_of_nodes())
-for (node_a, node_b) in Gnx.edges:
-  Gnk.addEdge(name2index[node_a], name2index[node_b])
-
-utils.log("Writing graph")
-nk.graphio.writeGraph(Gnk, args.out_graph, nk.Format.METIS)
-
-utils.log("Writing node names")
-names_db = graph_tools.NamesDb(f"{args.out_graph}.names.db")
-names_db.create_table()
-for node_name, index in name2index.items():
-  names_db.insert(index, node_name)
-names_db.commit()
-
-utils.log("Finished")
+def main():
+  parser = argparse.ArgumentParser()
+  parser.add_argument("in_graph")
+  parser.add_argument("out_graph")
+  args = parser.parse_args()
+
+  utils.log("Reading graph")
+  Gnx = graph_tools.load_graph(args.in_graph)
+
+  utils.log("Create node id -> num conversion")
+  name2index = {node: index for index, node in enumerate(Gnx)}
+
+  utils.log("Converting graph")
+  # Copy directly so we keep track of name2index
+  # Gnk = nk.nxadapter.nx2nk(Gnx)
+  Gnk = nk.Graph(Gnx.number_of_nodes())
+  for (node_a, node_b) in Gnx.edges:
+    Gnk.addEdge(name2index[node_a], name2index[node_b])
+
+  utils.log("Writing graph")
+  nk.graphio.writeGraph(Gnk, args.out_graph, nk.Format.METIS)
+
+  utils.log("Writing node names")
+  names_db = graph_tools.NamesDb(f"{args.out_graph}.names.db")
+  names_db.create_table()
+  for node_name, index in name2index.items():
+    names_db.insert(index, node_name)
+  names_db.commit()
+
+  utils.log("Finished")
+
+main()
diff --git a/graph_random.py b/graph_random.py
index 556b306..ab234bb 100644
--- a/graph_random.py
+++ b/graph_random.py
@@ -16,7 +16,7 @@
 import utils
 
 
-def build_exponential(num_nodes, edges_per_node):
+def build_exponential(num_nodes: int, edges_per_node: int) -> nx.Graph:
   # We always add one edge to each added node. Then we can add some extras anywhere.
   extra_edges_per_node = edges_per_node - 1
   assert extra_edges_per_node >= 0
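
Review note on the two converters: both directions hinge on the dense index <-> node
name mapping that graph_nx2nk.py builds with enumerate(Gnx) and persists via
graph_tools.NamesDb. A minimal sketch of that round trip on a toy graph (the graph
and names here are illustrative, not from the repo):

    import networkx as nx

    Gnx = nx.Graph([("alice", "bob"), ("bob", "carol")])
    # Same comprehension as in graph_nx2nk.py: iterating a networkx graph
    # yields its nodes, so indices are dense and follow insertion order.
    name2index = {node: index for index, node in enumerate(Gnx)}
    index2name = {index: node for node, index in name2index.items()}
    assert index2name[name2index["bob"]] == "bob"

graph_nk2nx.py then inverts the mapping through names_db.index2name when
reconstructing the networkx graph.
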
diff --git a/graph_strip_leaves.py b/graph_strip_leaves.py
index d1b0db6..c2a9f51 100644
--- a/graph_strip_leaves.py
+++ b/graph_strip_leaves.py
@@ -9,39 +9,42 @@
 import networkx as nx
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument("graph_in")
-parser.add_argument("graph_out")
-args = parser.parse_args()
-
-print("Loading graph", time.process_time())
-graph = nx.read_adjlist(args.graph_in)
-
-print("Find leaf nodes", time.process_time())
-to_delete = set()
-for node in graph.nodes():
-  if graph.degree[node] <= 1:
-    to_delete.add(node)
-
-i = 0
-while to_delete:
-  print(f"Stripping graph: Iter {i} # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,} {len(to_delete):,}", time.process_time())
-  i += 1
-  to_check = set()
-  for node in to_delete:
-    to_check.update(graph.neighbors(node))
-    graph.remove_node(node)
-
-  # We only need to check nodes that were adjacent to a deleted node.
-  # All other nodes still have > 1 degree because none of their edges were rm-ed
+def main():
+  parser = argparse.ArgumentParser()
+  parser.add_argument("graph_in")
+  parser.add_argument("graph_out")
+  args = parser.parse_args()
+
+  print("Loading graph", time.process_time())
+  graph = nx.read_adjlist(args.graph_in)
+
+  print("Find leaf nodes", time.process_time())
   to_delete = set()
-  for node in to_check:
+  for node in graph.nodes():
     if graph.degree[node] <= 1:
       to_delete.add(node)
 
-print(f"Final graph: # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,}")
+  i = 0
+  while to_delete:
+    print(f"Stripping graph: Iter {i} # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,} {len(to_delete):,}", time.process_time())
+    i += 1
+    to_check = set()
+    for node in to_delete:
+      to_check.update(graph.neighbors(node))
+      graph.remove_node(node)
+
+    # We only need to check nodes that were adjacent to a deleted node.
+    # All other nodes still have degree > 1 because none of their edges were removed.
+    to_delete = set()
+    for node in to_check:
+      if graph.degree[node] <= 1:
+        to_delete.add(node)
+
+  print(f"Final graph: # Nodes: {len(graph.nodes):,} # Edges: {len(graph.edges):,}")
+
+  print("Saving to disk", time.process_time())
+  nx.write_adjlist(graph, args.graph_out)
 
-print("Saving to disk", time.process_time())
-nx.write_adjlist(graph, args.graph_out)
+  print("Done", time.process_time())
 
-print("Done", time.process_time())
+main()
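
Review note on graph_strip_leaves.py: repeatedly deleting nodes of degree <= 1
converges to the graph's 2-core, so networkx's built-in can serve as a cross-check
of the loop above. A sketch on a toy graph (nx.k_core rejects graphs with
self-loops, so this assumes there are none):

    import networkx as nx

    # A triangle with a pendant chain; stripping leaves keeps only the triangle.
    g = nx.Graph([(1, 2), (2, 3), (3, 1), (3, 4), (4, 5)])
    core = nx.k_core(g, k=2)
    assert sorted(core.nodes) == [1, 2, 3]

The hand-rolled loop is still worth keeping here, since it prints per-iteration
progress on very large graphs.
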
diff --git a/graph_to_components.py b/graph_to_components.py
index be263fe..4d32d98 100644
--- a/graph_to_components.py
+++ b/graph_to_components.py
@@ -5,33 +5,36 @@
 import utils
 
 
-g = nx.read_adjlist("data/version/default/connection_graph.adj.nx")
-
-# Get list of 5 biggest components and one component for each order of magnitude.
-top_n = utils.TopN(5)
-comps_by_size = {}
-num_comps_by_size = {}
-for comp in nx.connected_components(g):
-  sg = g.subgraph(comp)
-  top_n.add(len(comp), sg)
-  oom = int(math.log(len(comp), 10))
-  if oom not in comps_by_size:
-    comps_by_size[oom] = (len(comp), sg)
-    num_comps_by_size[oom] = 0
-  num_comps_by_size[oom] += 1
-
-
-print("Top components")
-for num_nodes, sg in top_n.items:
-  print(num_nodes)
-  filename = "comp-%d.adj.nx" % num_nodes
-  nx.write_adjlist(sg, filename)
-sizes_written = {num_nodes for num_nodes, _ in top_n.items}
-
-print("Num of components by size")
-for oom in sorted(num_comps_by_size.keys()):
-  print("10^%d %d (ex: %d)" % (oom, num_comps_by_size[oom], comps_by_size[oom][0]))
-  num_nodes, sg = comps_by_size[oom]
-  if num_nodes not in sizes_written:
+def main():
+  g = nx.read_adjlist("data/version/default/connection_graph.adj.nx")
+
+  # Get list of 5 biggest components and one component for each order of magnitude.
+  top_n = utils.TopN(5)
+  comps_by_size = {}
+  num_comps_by_size = {}
+  for comp in nx.connected_components(g):
+    sg = g.subgraph(comp)
+    top_n.add(len(comp), sg)
+    oom = int(math.log(len(comp), 10))
+    if oom not in comps_by_size:
+      comps_by_size[oom] = (len(comp), sg)
+      num_comps_by_size[oom] = 0
+    num_comps_by_size[oom] += 1
+
+
+  print("Top components")
+  for num_nodes, sg in top_n.items:
+    print(num_nodes)
     filename = "comp-%d.adj.nx" % num_nodes
     nx.write_adjlist(sg, filename)
+  sizes_written = {num_nodes for num_nodes, _ in top_n.items}
+
+  print("Num of components by size")
+  for oom in sorted(num_comps_by_size.keys()):
+    print("10^%d %d (ex: %d)" % (oom, num_comps_by_size[oom], comps_by_size[oom][0]))
+    num_nodes, sg = comps_by_size[oom]
+    if num_nodes not in sizes_written:
+      filename = "comp-%d.adj.nx" % num_nodes
+      nx.write_adjlist(sg, filename)
+
+main()
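
Review note on the main() refactors as a whole: each script now calls main()
unconditionally at the bottom, so importing one of these modules still executes the
script. A common follow-up, sketched here and not part of this patch, is the
standard entry-point guard:

    def main() -> None:
      ...  # script body as in the diffs above

    if __name__ == "__main__":
      main()

With the guard, the scripts behave identically on the command line but become
importable (e.g., for tests) without side effects.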