
Commit

* Add api_tools.py for calling the WikiTree API. Start with just a single method to see if an id is a redirect (i.e. has been merged into another profile).

* Update graph_compare.py to use networkit.
sligocki committed Feb 11, 2022
1 parent 1cb0dec commit a1c40c0
Showing 6 changed files with 168 additions and 41 deletions.
29 changes: 29 additions & 0 deletions api_tools.py
@@ -0,0 +1,29 @@
"""
Tools for dealing with WikiTree API.
"""

import json
import re
import urllib.parse
import urllib.request


def api_req(**params):
  encoded_params = urllib.parse.urlencode(params)
  resp = urllib.request.urlopen("https://api.wikitree.com/api.php",
                                data=encoded_params.encode("utf-8"))
  return json.loads(resp.read())

def is_redirect(profile_num_or_id):
  """Look up a profile by # or id and figure out whether it is a redirect.
  If it is, return the id of the profile it now redirects to."""
  resp = api_req(action="getBio", key=profile_num_or_id)
  # status == 0 is success. On failure, we see things like:
  #   status == "Invalid page id"
  if resp[0]["status"] == 0:
    m = re.fullmatch(r"#REDIRECT \[\[(.*)\]\]", resp[0]["bio"])
    if m:
      # Return the wikitree_id of the profile this one redirects to.
      return m.group(1)
  # If not a redirect, return nothing.
  return None
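
For orientation, a minimal usage sketch of the new helper; the profile id below is purely illustrative, and since the API call is live the result depends on the current state of WikiTree:

import api_tools

# Hypothetical profile id, for illustration only.
target = api_tools.is_redirect("Smith-42")
if target:
  print(f"Merged: now redirects to {target}")
else:
  print("Not a redirect (or the lookup failed)")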
66 changes: 44 additions & 22 deletions connected_diff.py
@@ -5,22 +5,37 @@

import argparse
import csv
import partition_tools
from pathlib import Path
import random


def load_connection_status(version, debug_limit_read=None):
def load_connection_status(use_dump_conn, version, debug_limit_read=None):
connected = set()
unconnected = set()
with open(Path("data", "version", version, "dump_people_users.csv"), "r") as f:
csv_reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in csv_reader:
if row["Connected"] == "1":
connected.add(row["User ID"])
if use_dump_conn:
# Use computed network connectivity from data dump
p_db = partition_tools.PartitionDb(version)
main_component_rep = p_db.main_component_rep("connected")
for row in p_db.enum_all("connected"):
if row["rep"] == main_component_rep:
connected.add(row["user_num"])
else:
assert row["Connected"] == "0", row
unconnected.add(row["User ID"])
unconnected.add(row["user_num"])
if debug_limit_read and len(unconnected) + len(connected) >= debug_limit_read:
return connected, unconnected
else:
# Use boolean in the data dump
with open(Path("data", "version", version, "dump_people_users.csv"), "r") as f:
csv_reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in csv_reader:
if row["Connected"] == "1":
connected.add(row["User ID"])
else:
assert row["Connected"] == "0", row
unconnected.add(row["User ID"])
if debug_limit_read and len(unconnected) + len(connected) >= debug_limit_read:
return connected, unconnected
return connected, unconnected

def count_overlap(a, b1, b2):
@@ -30,27 +45,34 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("old_version")
parser.add_argument("new_version")
parser.add_argument("--use-dump-conn", action="store_true",
help="If true, calculate connectivity via the dump network itself. Note that since the dump does not contain any private profiles, this will show fewer profiles connected to the global tree. If false, we instead use the boolean set in the dump.")
parser.add_argument("--debug-limit-read", type=int)
args = parser.parse_args()

old_connected, old_unconnected = load_connection_status(args.old_version, args.debug_limit_read)
new_connected, new_unconnected = load_connection_status(args.new_version, args.debug_limit_read)
old_connected, old_unconnected = load_connection_status(args.use_dump_conn, args.old_version, args.debug_limit_read)
old_all = old_connected | old_unconnected
new_connected, new_unconnected = load_connection_status(args.use_dump_conn, args.new_version, args.debug_limit_read)
new_all = new_connected | new_unconnected

con_con, con_uncon, con_added = \
count_overlap(new_connected, old_connected, old_unconnected)
uncon_con, uncon_uncon, uncon_added = \
count_overlap(new_unconnected, old_connected, old_unconnected)
print(f"""
Version {args.new_version} vs. {args.old_version}
* Totals {len(new_connected) + len(new_unconnected):_} vs. {len(old_connected) + len(old_unconnected):_}
* Connected: Total: {len(new_connected):_} ({len(new_connected) / (len(new_connected) + len(new_unconnected)):.1%})
- New profiles: {len(con_added):_} ({len(con_added) / len(new_connected):.1%})
- Previously unconnected: {len(con_uncon):_} ({len(con_uncon) / len(new_connected):.1%})
- Previously connected: {len(con_con):_} ({len(con_con) / len(new_connected):.1%})
* Unconnected: Total: {len(new_unconnected):_} ({len(new_unconnected) / (len(new_connected) + len(new_unconnected)):.1%})
- New profiles: {len(uncon_added):_} ({len(uncon_added) / len(new_unconnected):.1%})
- Previously unconnected: {len(uncon_uncon):_} ({len(uncon_uncon) / len(new_unconnected):.1%})
- Previously connected: {len(uncon_con):_} ({len(uncon_con) / len(new_unconnected):.1%})
""")

print(f"Version {args.new_version} vs. {args.old_version}")
print(f" * Totals {len(new_all):_} vs. {len(old_all):_}")
print(f" - Added: {len(new_all - old_all):_}")
print(f" - Deleted/Merged: {len(old_all - new_all):_}")
print(f" * Connected: Total: {len(new_connected):_} ({len(new_connected) / (len(new_all)):.1%})")
if new_connected:
print(f" - New profiles: {len(con_added):_} ({len(con_added) / len(new_connected):.1%})")
print(f" - Previously unconnected: {len(con_uncon):_} ({len(con_uncon) / len(new_connected):.1%})")
print(f" - Previously connected: {len(con_con):_} ({len(con_con) / len(new_connected):.1%})")
print(f" * Unconnected: Total: {len(new_unconnected):_} ({len(new_unconnected) / (len(new_all)):.1%})")
if new_unconnected:
print(f" - New profiles: {len(uncon_added):_} ({len(uncon_added) / len(new_unconnected):.1%})")
print(f" - Previously unconnected: {len(uncon_uncon):_} ({len(uncon_uncon) / len(new_unconnected):.1%})")
print(f" - Previously connected: {len(uncon_con):_} ({len(uncon_con) / len(new_unconnected):.1%})")

main()
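
As a usage sketch (the version labels are placeholders for whatever dump versions exist locally), the comparison could be run as:

python connected_diff.py OLD_VERSION NEW_VERSION --use-dump-conn

With --use-dump-conn, connectivity is read from partitions.db via partition_tools; without it, the script trusts the Connected column in dump_people_users.csv.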
50 changes: 50 additions & 0 deletions dump_diff.py
@@ -0,0 +1,50 @@
"""
Compare two dumps to see details about how profiles were added/deleted.
"""

import argparse
import csv
import partition_tools
from pathlib import Path
import random

import api_tools


def load_all_profiles(version, debug_limit_read=None):
  all_profiles = set()
  # Use boolean in the data dump
  with open(Path("data", "version", version, "dump_people_users.csv"), "r") as f:
    csv_reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in csv_reader:
      all_profiles.add(row["User ID"])
  return all_profiles

def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("old_version")
  parser.add_argument("new_version")
  parser.add_argument("--sample-api", type=int, default=1000,
                      help="Number of profiles to try looking up via API.")
  args = parser.parse_args()

  old_profiles = load_all_profiles(args.old_version)
  new_profiles = load_all_profiles(args.new_version)

  added_profiles = new_profiles - old_profiles
  deleted_profiles = old_profiles - new_profiles

  print(f"Version {args.new_version} vs. {args.old_version}")
  print(f" * {len(old_profiles)=:_}")
  print(f" * {len(new_profiles)=:_}")
  print(f" * {len(added_profiles)=:_}")
  print(f" * {len(deleted_profiles)=:_}")

  sample_deleted = random.sample(list(deleted_profiles), args.sample_api)
  num_redirects = 0
  for profile_num in sample_deleted:
    if api_tools.is_redirect(profile_num):
      num_redirects += 1
  print(f'Of "deleted" profiles, {num_redirects / len(sample_deleted):.0%} were actually merges')

main()
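
A hedged example invocation (version labels and sample size are placeholders):

python dump_diff.py OLD_VERSION NEW_VERSION --sample-api 500

Note that random.sample raises ValueError if fewer than --sample-api profiles were deleted between the two dumps, so the sample size may need to be lowered when the dumps are close together.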
22 changes: 13 additions & 9 deletions graph_compare.py
@@ -6,30 +6,34 @@
import collections
from pathlib import Path

import networkx as nx
import networkit as nk

import graph_tools
import utils


def load_node_edge_sets(filename):
  graph, names_db = graph_tools.load_graph_nk(filename)
  names = names_db.all_index2names()

  node_set = set(names[node] for node in graph.iterNodes())
  edge_set = set(frozenset([names[u], names[v]])
                 for (u, v) in graph.iterEdges())
  return node_set, edge_set

def main():
parser = argparse.ArgumentParser()
parser.add_argument("graph_before", type=Path)
parser.add_argument("graph_after", type=Path)
args = parser.parse_args()

utils.log("Load graph_before")
graph_before = nx.read_adjlist(args.graph_before)
nodes_before = set(graph_before.nodes())
edges_before = set(frozenset(edge) for edge in graph_before.edges())
del graph_before
nodes_before, edges_before = load_node_edge_sets(args.graph_before)
utils.log(f"Loaded graph with {len(nodes_before):_} nodes and {len(edges_before):_} edges")

print()
utils.log("Load graph_after")
graph_after = nx.read_adjlist(args.graph_after)
nodes_after = set(graph_after.nodes())
edges_after = set(frozenset(edge) for edge in graph_after.edges())
del graph_after
nodes_after, edges_after = load_node_edge_sets(args.graph_after)
utils.log(f"Loaded graph with {len(nodes_after):_} nodes and {len(edges_after):_} edges")

print()
6 changes: 6 additions & 0 deletions graph_tools.py
@@ -42,6 +42,12 @@ def name2index(self, node_name):
rows = cursor.fetchall()
assert len(rows) == 1, (node_name, rows)
return rows[0]["graph_index"]

def all_index2names(self):
cursor = self.conn.cursor()
cursor.execute("SELECT graph_index, node_name FROM nodes")
rows = cursor.fetchall()
return {row["graph_index"]: row["node_name"] for row in rows}


def load_graph_nk(filename):
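
For context, a minimal sketch of how the names database pairs with the networkit graph; the file path is hypothetical, and this mirrors what load_node_edge_sets does in graph_compare.py above:

import graph_tools

# Hypothetical graph file path.
graph, names_db = graph_tools.load_graph_nk("data/version/NEW_VERSION/connection_graph")
index2name = names_db.all_index2names()
# index2name maps networkit node indices back to WikiTree node names,
# so graph.iterNodes() / graph.iterEdges() results can be reported by name.
first = next(iter(graph.iterNodes()))
print(index2name[first])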
36 changes: 26 additions & 10 deletions partition_tools.py
@@ -17,37 +17,53 @@ def __init__(self, version):
self.filename = Path(utils.data_version_dir(version), "partitions.db")
self.conn = sqlite3.connect(self.filename)
self.conn.row_factory = sqlite3.Row
self.cursor = self.conn.cursor()

# Readers
def find_partition_rep(self, table, person):
self.cursor.execute(f"SELECT rep FROM {table} WHERE user_num=?",
cursor = self.conn.cursor()
cursor.execute(f"SELECT rep FROM {table} WHERE user_num=?",
(person,))
rows = self.cursor.fetchall()
rows = cursor.fetchall()
assert len(rows) == 1, (person, rows)
return rows[0]["rep"]


def list_partition(self, table, rep):
self.cursor.execute(f"SELECT user_num FROM {table} WHERE rep=?", (rep,))
return frozenset(row["user_num"] for row in self.cursor.fetchall())
cursor = self.conn.cursor()
cursor.execute(f"SELECT user_num FROM {table} WHERE rep=?", (rep,))
return frozenset(row["user_num"] for row in cursor.fetchall())

def main_component_rep(self, table):
# Note: I just pick the component that Samuel Lothrop (Lothrop-29) belongs
# to. He is one of the most central profiles on WikiTree. This is certainly
# the correct component for the `connected` graph. For other partitions,
# it may not be the largest component ...
return self.find_partition_rep(table, 142891) # Lothrop-29

def enum_all(self, table):
cursor = self.conn.cursor()
cursor.execute(f"SELECT user_num, rep FROM {table}")
while row := cursor.fetchone():
yield row


# Writers
def write_partition(self, table, partitions):
# TODO: Maybe restructure this so that all partitions use the same table with a partition_type field.
self.cursor.execute(f"DROP TABLE IF EXISTS {table}")
self.cursor.execute(f"CREATE TABLE {table} (user_num INT, rep INT, PRIMARY KEY (user_num))")
cursor = self.conn.cursor()
cursor.execute(f"DROP TABLE IF EXISTS {table}")
cursor.execute(f"CREATE TABLE {table} (user_num INT, rep INT, PRIMARY KEY (user_num))")

i = 0
for rep in partitions:
for person in partitions[rep]:
self.cursor.execute(f"INSERT INTO {table} VALUES (?,?)",
cursor.execute(f"INSERT INTO {table} VALUES (?,?)",
(person, rep))
i += 1
if i % 1000000 == 0:
self.conn.commit()
self.conn.commit()

self.cursor.execute(f"CREATE INDEX idx_{table}_rep ON {table} (rep)")
cursor.execute(f"CREATE INDEX idx_{table}_rep ON {table} (rep)")
self.conn.commit()


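
To round this out, a minimal sketch of the input shape write_partition expects; the version label, table name, and partition data are purely illustrative:

import partition_tools

db = partition_tools.PartitionDb("NEW_VERSION")  # hypothetical version label
# partitions maps each component representative to the user_nums in that component.
partitions = {
  142891: {142891, 1000001, 1000002},  # e.g. the component containing Lothrop-29
  2000000: {2000000},                  # a singleton component
}
db.write_partition("example_table", partitions)
print(db.find_partition_rep("example_table", 1000001))  # -> 142891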
