Skip to content

Commit

Permalink
1) Rename group -> partition since we are using it to mean connected …
Browse files Browse the repository at this point in the history
…tree or sibling-in-law-ism.

2) Allow having multiple versions of data.
  • Loading branch information
sligocki committed May 28, 2021
1 parent a1bddc0 commit 85172a5
Show file tree
Hide file tree
Showing 14 changed files with 202 additions and 152 deletions.
28 changes: 15 additions & 13 deletions connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import graphviz

import data_reader
import group_tools
import partition_tools


class Bfs(object):
Expand Down Expand Up @@ -88,7 +88,8 @@ def find_connections(db, person1, person2, rel_types=frozenset(["parent", "child
len(bfs1.paths), bfs1.num_steps, db.num2id(person1), len(bfs2.paths), bfs2.num_steps, db.num2id(person2)))


def find_connections_group(db, start, group, rel_types=frozenset(["parent", "child", "sibling", "spouse"])):
def find_connections_group(db, start, group,
rel_types=frozenset(["parent", "child", "sibling", "spouse"])):
bfs = Bfs(db, start, rel_types)

found = False
Expand Down Expand Up @@ -149,8 +150,8 @@ def main():
help="Only print the distance (not connection sequence).")
parser.add_argument("--max-dist", type=int)

parser.add_argument("--to-group",
help="Destination is group rather than specific person.")
parser.add_argument("--to-partition",
help="Destination is partition rather than specific person.")

parser.add_argument("--rel-types", nargs='+', default=frozenset(["parent", "child", "sibling", "spouse"]))
parser.add_argument("--genetic", dest="rel_types", action="store_const", const=frozenset(["parent", "child"]),
Expand All @@ -160,19 +161,20 @@ def main():
args = parser.parse_args()

db = data_reader.Database()
partition_db = partition_tools.PartitionDb()

if args.to_group:
# Find shortest connection from person to any member of a group.
group_type, member_id = args.to_group.split(":")
if args.to_partition:
# Find shortest connection from person to any member of a partition.
partition_type, member_id = args.to_partition.split(":")
member_num = db.id2num(member_id)
rep = group_tools.find_group_rep(group_type, member_num)
group_members = group_tools.list_group(group_type, rep)
rep = partition_db.find_partition_rep(partition_type, member_num)
partition_members = partition_db.list_partition(partition_type, rep)
for start_id in args.person_id:
print("Connections from", start_id, "to group", args.to_group)
plot_name = "results/Connections_%s_%s" % (start_id, args.to_group)
connections = find_connections_group(db,
print("Connections from", start_id, "to partition", args.to_partition)
plot_name = "results/Connections_%s_%s" % (start_id, args.to_partition)
connections = find_connections_partition(db,
db.id2num(start_id),
group_members,
partition_members,
args.rel_types)
print_connections(args, db, connections, plot_name)

Expand Down
21 changes: 15 additions & 6 deletions connection_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,34 @@
"""


import argparse
import random

import collections
import connection
import data_reader
import group_tools
import itertools
import math
import partition_tools
import time
import utils


db = data_reader.Database()
db.load_connections()
parser = argparse.ArgumentParser()
parser.add_argument("--version", help="Data version (defaults to most recent).")
parser.add_argument("--load-db", action="store_true")
args = parser.parse_args()

utils.log("Load DB")
db = data_reader.Database(version=args.version)
if args.load_db:
db.load_connections()
partition_db = partition_tools.PartitionDb(version=args.version)

utils.log("Loading all user_nums in main tree")
focus_id = db.id2num("Lothrop-29")
rep = group_tools.find_group_rep("connected", focus_id)
main_nums = list(group_tools.list_group("connected", rep))
rep = partition_db.find_partition_rep("connected", focus_id)
main_nums = list(partition_db.list_partition("connected", rep))
utils.log(f"Loaded {len(main_nums):_} nodes")

hist = collections.Counter()
Expand All @@ -45,5 +54,5 @@
utils.log(f"Mean dist = {mean:.1f} ± {stddev:.1f}")

except KeyboardInterrupt:
utils.log("Dist", sorted(hist.items()))
utils.log("Dist", [hist[i] for i in range(max(hist.keys()) + 1)])
utils.log("Quiting")
15 changes: 11 additions & 4 deletions csv_iterate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import csv
import datetime
from pathlib import Path

import utils


def LoadMin(s):
Expand Down Expand Up @@ -156,10 +159,12 @@ def iterate_users_file(filename):
yield UserRow(row, key)


def iterate_users(*, version=None, only_custom=False):
    """Yield user rows from the versioned people dump, then the custom file.

    Args:
        version: Data version handed to utils.data_version_dir() to locate
            dump_people_users.csv. None presumably selects the most recent
            version (per the --version help text) -- confirm in utils.
        only_custom: If true, skip the versioned dump and yield only the
            rows found in data/custom_users.csv.

    Yields:
        The rows produced by iterate_users_file() for each file, dump first.
    """
    if not only_custom:
        dump_path = Path(utils.data_version_dir(version),
                         "dump_people_users.csv")
        yield from iterate_users_file(dump_path)
    # Note: We don't use version dir for custom_users. Just use global one.
    yield from iterate_users_file("data/custom_users.csv")

Expand Down Expand Up @@ -192,10 +197,12 @@ def iterate_marriages_file(filename):
yield MarriageRow(row, key)


def iterate_marriages(*, version=None, only_custom=False):
    """Yield marriage rows from the versioned dump, then the custom file.

    Args:
        version: Data version handed to utils.data_version_dir() to locate
            dump_people_marriages.csv. None presumably selects the most
            recent version (per the --version help text) -- confirm in utils.
        only_custom: If true, skip the versioned dump and yield only the
            rows found in data/custom_marriages.csv.

    Yields:
        The rows produced by iterate_marriages_file() for each file, dump
        first.
    """
    if not only_custom:
        dump_path = Path(utils.data_version_dir(version),
                         "dump_people_marriages.csv")
        yield from iterate_marriages_file(dump_path)
    # Note: We don't use version dir for custom_marriages. Just use global one.
    yield from iterate_marriages_file("data/custom_marriages.csv")

Expand Down
33 changes: 19 additions & 14 deletions csv_to_groups.py → csv_to_partitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import collections

import data_reader
import group_tools
import partition_tools


def connected_to(start, connections):
Expand All @@ -19,38 +19,43 @@ def connected_to(start, connections):
return visited


def get_connection_partitions(connections):
    """Partition people into groups based on who is connected to who.

    Args:
        connections: Container that is iterated person by person and also
            passed to connected_to(); presumably a map
            {person -> neighbors} -- confirm against load_connections().

    Returns:
        A map {representative -> members}, where members is whatever
        connected_to() returns for that component and the representative
        is the minimum member.
    """
    partitions = {}
    visited = set()
    for person in connections:
        if person in visited:
            # Already assigned to a partition found from an earlier person.
            continue
        partition = connected_to(person, connections)
        visited.update(partition)
        partitions[min(partition)] = partition
    return partitions

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--version", help="Data version (defaults to most recent).")
    parser.add_argument("--sibling-in-law", action="store_true")
    args = parser.parse_args()

    partition_db = partition_tools.PartitionDb(args.version)

    # The two modes differ only in whether parent/child links are followed
    # and in the name the resulting partition is written under.
    if args.sibling_in_law:
        partition_name = "sibling_in_law"
        follow_ancestry = False
    else:
        partition_name = "connected"
        follow_ancestry = True

    connections = data_reader.load_connections(version=args.version,
                                               include_parents=follow_ancestry,
                                               include_children=follow_ancestry,
                                               include_siblings=True,
                                               include_spouses=True)
    partitions = get_connection_partitions(connections)
    partition_db.write_partition(partition_name, partitions)
46 changes: 23 additions & 23 deletions csv_to_sqlite.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,35 @@
import argparse
from pathlib import Path
import sqlite3
import time

import csv_iterate
import utils


def csv_to_sqlite(only_update_custom=False):
conn = sqlite3.connect("data/wikitree_dump.db")
def csv_to_sqlite(args):
data_dir = utils.data_version_dir(args.version)
conn = sqlite3.connect(Path(data_dir, "wikitree_dump.db"))
c = conn.cursor()

if not only_update_custom:
# Create output table.
c.execute("""CREATE TABLE people (
user_num INT, wikitree_id STRING, birth_name STRING,
father_num INT, mother_num INT,
birth_date DATE, death_date DATE,
birth_location STRING, death_location STRING,
gender_code INT, no_more_children BOOL,
registered_time TIMESTAMP, touched_time TIMESTAMP,
edit_count INT, privacy_level INT,
manager_num INT,
PRIMARY KEY (user_num))""")
c.execute("CREATE TABLE relationships (user_num INT, relative_num INT, relationship_type ENUM)")
# Create output table.
c.execute("""CREATE TABLE people (
user_num INT, wikitree_id STRING, birth_name STRING,
father_num INT, mother_num INT,
birth_date DATE, death_date DATE,
birth_location STRING, death_location STRING,
gender_code INT, no_more_children BOOL,
registered_time TIMESTAMP, touched_time TIMESTAMP,
edit_count INT, privacy_level INT,
manager_num INT,
PRIMARY KEY (user_num))""")
c.execute("CREATE TABLE relationships (user_num INT, relative_num INT, relationship_type ENUM)")

# Iterate CSV
i = 0
num_rels = 0
print("Loading people from CSV", time.process_time())
for person in csv_iterate.iterate_users(only_update_custom):
for person in csv_iterate.iterate_users(version=args.version):
try:
c.execute("INSERT INTO people VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
(person.user_num(), person.wikitree_id(),
Expand Down Expand Up @@ -59,7 +61,7 @@ def csv_to_sqlite(only_update_custom=False):
conn.commit()

print("Loading marriages from CSV", time.process_time())
for marriage in csv_iterate.iterate_marriages(only_update_custom):
for marriage in csv_iterate.iterate_marriages(version=args.version):
user1, user2 = marriage.user_nums()
c.execute("INSERT INTO relationships VALUES (?,?,'spouse')",
(user1, user2))
Expand All @@ -70,10 +72,8 @@ def csv_to_sqlite(only_update_custom=False):
print("People: {:,}".format(i), "Relationships: {:,}".format(num_rels), "Runtime:", time.process_time())
conn.commit()

# TODO: Figure out how to update siblings incrementally.
if not only_update_custom:
print("Computing siblings", time.process_time())
c.execute("INSERT INTO relationships SELECT a.relative_num, b.relative_num, 'sibling' FROM relationships AS a, relationships AS b WHERE a.relationship_type = 'child' AND b.relationship_type = 'child' AND a.user_num = b.user_num AND a.relative_num <> b.relative_num")
print("Computing siblings", time.process_time())
c.execute("INSERT INTO relationships SELECT a.relative_num, b.relative_num, 'sibling' FROM relationships AS a, relationships AS b WHERE a.relationship_type = 'child' AND b.relationship_type = 'child' AND a.user_num = b.user_num AND a.relative_num <> b.relative_num")

print("Done", time.process_time())
conn.commit()
Expand All @@ -88,7 +88,7 @@ def csv_to_sqlite(only_update_custom=False):


# Script entry: parse the command line and build the SQLite database for
# the requested data version.
parser = argparse.ArgumentParser()
parser.add_argument("--version", help="Data version (defaults to most recent).")
args = parser.parse_args()

# csv_to_sqlite() reads args.version itself, so pass the whole namespace.
csv_to_sqlite(args)
15 changes: 9 additions & 6 deletions data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import sqlite_reader


def load_connections(include_parents,
def load_connections(version,
include_parents,
include_children,
include_siblings,
include_spouses):
Expand All @@ -18,7 +19,7 @@ def load_connections(include_parents,

print("Loading people", time.process_time())
num_conns = 0
for i, person in enumerate(csv_iterate.iterate_users()):
for i, person in enumerate(csv_iterate.iterate_users(version=version)):
person_num = person.user_num()
for parent_num in (person.father_num(), person.mother_num()):
if parent_num:
Expand All @@ -39,7 +40,7 @@ def load_connections(include_parents,

if include_spouses:
print("Loading marriages", time.process_time())
for marriage in csv_iterate.iterate_marriages():
for marriage in csv_iterate.iterate_marriages(version=version):
user1, user2 = marriage.user_nums()
connections[user1].add(user2)
connections[user2].add(user1)
Expand All @@ -48,8 +49,9 @@ def load_connections(include_parents,
return connections

class Database(sqlite_reader.Database):
def __init__(self, version=None):
    """Open the database for the given data version.

    Args:
        version: Data version forwarded to sqlite_reader.Database and kept
            on the instance. None presumably means the most recent
            version -- confirm in sqlite_reader.
    """
    super(Database, self).__init__(version)
    # Kept so later loads read the same data version as the SQLite DB.
    self.version = version
    # Stays None until load_connections() is called.
    self.connections = None

def neighbors_of(self, person):
Expand All @@ -59,7 +61,8 @@ def neighbors_of(self, person):
return super(Database, self).neighbors_of(person)

def load_connections(self):
    """Load parent, child, sibling and spouse links into self.connections.

    Calls the module-level load_connections() for the data version this
    Database was constructed with and stores the result on the instance.
    """
    # Use the version stored by __init__; there is no `args` in this module,
    # so the previous `args.version` would fail with NameError when called.
    self.connections = load_connections(version=self.version,
                                        include_parents=True,
                                        include_children=True,
                                        include_siblings=True,
                                        include_spouses=True)
Loading

0 comments on commit 85172a5

Please sign in to comment.