Skip to content

Commit

Permalink
Add watchlist filter tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
sligocki committed Apr 21, 2023
1 parent 672e27e commit 81de8de
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 4 deletions.
7 changes: 4 additions & 3 deletions distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ def enum_user_nums(db, args):
parser.add_argument("wikitree_id", nargs="*")
args = parser.parse_args()

utils.log("Loading DB")
db = data_reader.Database(args.version)
# db.load_connections()

ignore_ids = args.ignore_people.split(",") if args.ignore_people else []
ignore_nums = frozenset(db.id2num(id) for id in ignore_ids)

db = data_reader.Database(args.version)
db.load_connections()

circle_sizes = {}
for user_num in enum_user_nums(db, args):
utils.log("Loading distances from", db.num2id(user_num))
Expand Down
2 changes: 1 addition & 1 deletion dump_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ time python3 csv_to_parquet.py --version=${TIMESTAMP}
echo
echo "(3) Compute relationships"
# 6m
time python pq_compute_relatives.py --version=${TIMESTAMP}
time python3 pq_compute_relatives.py --version=${TIMESTAMP}

# TODO: Remove these once we give up on sqlite?
# 30m
Expand Down
1 change: 1 addition & 0 deletions sqlite_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(self, version):
self.cursor = self.conn.cursor()

def get(self, user_num, attribute):
assert isinstance(user_num, int), repr(user_num)
self.cursor.execute(f"SELECT {attribute} FROM people WHERE user_num=?", (user_num,))
rows = self.cursor.fetchall()
if rows:
Expand Down
85 changes: 85 additions & 0 deletions watchlist_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Filter watchlist by various criteria to find profiles that might be worth removing.

import argparse
import json
from pathlib import Path
import random

import data_reader
import distances
import utils


def load_recursive(focus, func, num_iters):
    """Collect everything reachable from `focus` in at most `num_iters` steps.

    Args:
      focus: Starting node (must be hashable). Always included in the result.
      func: Callable mapping a node to an iterable of neighbouring nodes.
      num_iters: Maximum number of times `func` is applied along any path.
        Zero (or a negative value) yields just `{focus}`.

    Returns:
      A set containing `focus` and every node reachable from it by applying
      `func` up to `num_iters` times.
    """
    frontier = {focus}
    reached = set(frontier)
    for _ in range(num_iters):
        discovered = set()
        for node in frontier:
            discovered.update(func(node))
        # Only expand nodes we have not seen before: anything already in
        # `reached` was (or is being) expanded at an earlier or equal depth, so
        # re-expanding it cannot add new members within the remaining steps.
        frontier = discovered - reached
        if not frontier:
            # Nothing new to explore; further iterations would be no-ops.
            break
        reached.update(frontier)
    return reached

def load_ancestors(db, focus, num_gens):
    """Return `focus` plus everyone found by following `db.parents_of`
    up to `num_gens` times (see `load_recursive`)."""
    parent_lookup = db.parents_of
    return load_recursive(focus, parent_lookup, num_iters=num_gens)

def load_descendants(db, focus, num_gens):
    """Return `focus` plus everyone found by following `db.children_of`
    up to `num_gens` times (see `load_recursive`)."""
    child_lookup = db.children_of
    return load_recursive(focus, child_lookup, num_iters=num_gens)

def main():
    """Compare a watchlist against the people "close" to a focus profile.

    Reads a watchlist JSON file, then builds a "good" set made of:
      * everyone returned by `distances.get_distances` with cutoff `--circles`;
      * "kin": ancestors of the focus (up to `--ancestor-gens`), descendants of
        each of those ancestors (up to `--descendant-gens`), and the partners
        of all of those relatives.

    Prints the sizes of the two set differences between the watchlist and the
    good set, followed by a random sample (at most 20) of watchlist IDs that
    are not in the good set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--focus", default="Ligocki-7")
    parser.add_argument("--watchlist", type=Path,
                        default=Path("data/watchlist.json"))

    # Filter parameters
    parser.add_argument("--circles", type=int, default=7)
    parser.add_argument("--ancestor-gens", type=int, default=10)
    parser.add_argument("--descendant-gens", type=int, default=2)

    parser.add_argument("--version", help="Data version (defaults to most recent).")
    args = parser.parse_args()

    db = data_reader.Database(args.version)
    focus_num = db.get_person_num(args.focus)
    focus_id = db.num2id(focus_num)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(args.watchlist, encoding="utf-8") as f:
        js = json.load(f)
    # The file is expected to hold a single-element list whose entry carries
    # both the watchlist and its advertised length.
    assert len(js) == 1, f"Expected exactly one top-level entry, got {len(js)}"
    assert js[0]["watchlistCount"] == len(js[0]["watchlist"]), \
        "watchlistCount does not match the number of watchlist entries"
    watchlist = frozenset(x["Id"] for x in js[0]["watchlist"])
    utils.log(f"Loaded watchlist. Size: {len(watchlist):_}")
    # Keep only entries for which the DB yields a (truthy) ID.
    watchlist = frozenset(x for x in watchlist if db.num2id(x))
    utils.log(f"Filtered watchlist down to: {len(watchlist):_}")

    dists, _, _, _ = distances.get_distances(
        db, focus_num, dist_cutoff=args.circles)
    circles = frozenset(dists.keys())
    utils.log(f"Loaded {len(circles):_} people within {args.circles} of {focus_id}")

    ancestors = frozenset(load_ancestors(
        db, focus_num, args.ancestor_gens))
    utils.log(f"Loaded {len(ancestors):_} ancestors of {focus_id}")

    relatives = ancestors.union(*[
        load_descendants(db, x, args.descendant_gens)
        for x in ancestors])
    utils.log(f"Loaded {len(relatives):_} relatives of {focus_id}")

    kin = relatives.union(*[db.partners_of(x) for x in relatives])
    utils.log(f"Loaded {len(kin):_} kin of {focus_id}")

    good = circles | kin
    utils.log(f" # Kin or in circles: {len(good):_}")

    print()
    print(f" * {len(good - watchlist)=}")
    print(f" * {len(watchlist - good)=}")

    bad = watchlist - good
    # random.sample raises ValueError if asked for more items than exist, so
    # cap the sample size at the number of candidates.
    sample_size = min(20, len(bad))
    print([db.num2id(x) for x in random.sample(list(bad), sample_size)])


# Only run the tool when executed as a script, so that importing this module
# (e.g. to reuse load_ancestors / load_descendants) has no side effects.
if __name__ == "__main__":
    main()

0 comments on commit 81de8de

Please sign in to comment.