
Commit

* Add api_tools.py for calling the WikiTree API. Start with just a single method to see if an id is a redirect (i.e. has been merged into another profile).

* Update graph_compare.py to use networkit.
sligocki committed Feb 11, 2022
1 parent 1cb0dec commit a1c40c0
Showing 6 changed files with 168 additions and 41 deletions.
29 changes: 29 additions & 0 deletions api_tools.py
@@ -0,0 +1,29 @@
"""
Tools for dealing with WikiTree API.
"""

import json
import re
import urllib.parse
import urllib.request


def api_req(**params):
  encoded_params = urllib.parse.urlencode(params)
  resp = urllib.request.urlopen("https://api.wikitree.com/api.php",
                                data=encoded_params.encode("utf-8"))
  return json.loads(resp.read())

def is_redirect(profile_num_or_id):
  """Look up a profile by # or id and figure out whether it is a redirect.
  If it is, return the id of the profile it now redirects to."""
  resp = api_req(action="getBio", key=profile_num_or_id)
  # status == 0 is success. On failure, we see things like:
  #   status == "Invalid page id"
  if resp[0]["status"] == 0:
    m = re.fullmatch(r"#REDIRECT \[\[(.*)\]\]", resp[0]["bio"])
    if m:
      # Return the wikitree_id of the profile this one redirects to.
      return m.group(1)
  # If not a redirect, return nothing.
  return None
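
For orientation, a minimal usage sketch of the new helper; the profile id below is purely illustrative, and since the API call is live the result depends on the current state of WikiTree:

import api_tools

# Hypothetical profile id, for illustration only.
target = api_tools.is_redirect("Smith-42")
if target:
  print(f"Merged: now redirects to {target}")
else:
  print("Not a redirect (or the lookup failed)")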
66 changes: 44 additions & 22 deletions connected_diff.py
@@ -5,22 +5,37 @@

import argparse
import csv
import partition_tools
from pathlib import Path
import random


def load_connection_status(version, debug_limit_read=None):
def load_connection_status(use_dump_conn, version, debug_limit_read=None):
connected = set()
unconnected = set()
with open(Path("data", "version", version, "dump_people_users.csv"), "r") as f:
csv_reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in csv_reader:
if row["Connected"] == "1":
connected.add(row["User ID"])
if use_dump_conn:
# Use computed network connectivity from data dump
p_db = partition_tools.PartitionDb(version)
main_component_rep = p_db.main_component_rep("connected")
for row in p_db.enum_all("connected"):
if row["rep"] == main_component_rep:
connected.add(row["user_num"])
else:
assert row["Connected"] == "0", row
unconnected.add(row["User ID"])
unconnected.add(row["user_num"])
if debug_limit_read and len(unconnected) + len(connected) >= debug_limit_read:
return connected, unconnected
else:
# Use boolean in the data dump
with open(Path("data", "version", version, "dump_people_users.csv"), "r") as f:
csv_reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in csv_reader:
if row["Connected"] == "1":
connected.add(row["User ID"])
else:
assert row["Connected"] == "0", row
unconnected.add(row["User ID"])
if debug_limit_read and len(unconnected) + len(connected) >= debug_limit_read:
return connected, unconnected
return connected, unconnected

def count_overlap(a, b1, b2):
@@ -30,27 +45,34 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("old_version")
parser.add_argument("new_version")
parser.add_argument("--use-dump-conn", action="store_true",
help="If true, calculate connectivity via the dump network itself. Note that since the dump does not contain any private profiles, this will show fewer profiles connected to the global tree. If false, we instead use the boolean set in the dump.")
parser.add_argument("--debug-limit-read", type=int)
args = parser.parse_args()

old_connected, old_unconnected = load_connection_status(args.old_version, args.debug_limit_read)
new_connected, new_unconnected = load_connection_status(args.new_version, args.debug_limit_read)
old_connected, old_unconnected = load_connection_status(args.use_dump_conn, args.old_version, args.debug_limit_read)
old_all = old_connected | old_unconnected
new_connected, new_unconnected = load_connection_status(args.use_dump_conn, args.new_version, args.debug_limit_read)
new_all = new_connected | new_unconnected

con_con, con_uncon, con_added = \
count_overlap(new_connected, old_connected, old_unconnected)
uncon_con, uncon_uncon, uncon_added = \
count_overlap(new_unconnected, old_connected, old_unconnected)
print(f"""
Version {args.new_version} vs. {args.old_version}
* Totals {len(new_connected) + len(new_unconnected):_} vs. {len(old_connected) + len(old_unconnected):_}
* Connected: Total: {len(new_connected):_} ({len(new_connected) / (len(new_connected) + len(new_unconnected)):.1%})
- New profiles: {len(con_added):_} ({len(con_added) / len(new_connected):.1%})
- Previously unconnected: {len(con_uncon):_} ({len(con_uncon) / len(new_connected):.1%})
- Previously connected: {len(con_con):_} ({len(con_con) / len(new_connected):.1%})
* Unconnected: Total: {len(new_unconnected):_} ({len(new_unconnected) / (len(new_connected) + len(new_unconnected)):.1%})
- New profiles: {len(uncon_added):_} ({len(uncon_added) / len(new_unconnected):.1%})
- Previously unconnected: {len(uncon_uncon):_} ({len(uncon_uncon) / len(new_unconnected):.1%})
- Previously connected: {len(uncon_con):_} ({len(uncon_con) / len(new_unconnected):.1%})
""")

print(f"Version {args.new_version} vs. {args.old_version}")
print(f" * Totals {len(new_all):_} vs. {len(old_all):_}")
print(f" - Added: {len(new_all - old_all):_}")
print(f" - Deleted/Merged: {len(old_all - new_all):_}")
print(f" * Connected: Total: {len(new_connected):_} ({len(new_connected) / (len(new_all)):.1%})")
if new_connected:
print(f" - New profiles: {len(con_added):_} ({len(con_added) / len(new_connected):.1%})")
print(f" - Previously unconnected: {len(con_uncon):_} ({len(con_uncon) / len(new_connected):.1%})")
print(f" - Previously connected: {len(con_con):_} ({len(con_con) / len(new_connected):.1%})")
print(f" * Unconnected: Total: {len(new_unconnected):_} ({len(new_unconnected) / (len(new_all)):.1%})")
if new_unconnected:
print(f" - New profiles: {len(uncon_added):_} ({len(uncon_added) / len(new_unconnected):.1%})")
print(f" - Previously unconnected: {len(uncon_uncon):_} ({len(uncon_uncon) / len(new_unconnected):.1%})")
print(f" - Previously connected: {len(uncon_con):_} ({len(uncon_con) / len(new_unconnected):.1%})")

main()
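
As a usage sketch (the version labels are placeholders for whatever dump versions exist locally), the comparison could be run as:

python connected_diff.py OLD_VERSION NEW_VERSION --use-dump-conn

With --use-dump-conn, connectivity is read from partitions.db via partition_tools; without it, the script trusts the Connected column in dump_people_users.csv.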
50 changes: 50 additions & 0 deletions dump_diff.py
@@ -0,0 +1,50 @@
"""
Compare two dumps to see details about how profiles were added/deleted.
"""

import argparse
import csv
import partition_tools
from pathlib import Path
import random

import api_tools


def load_all_profiles(version, debug_limit_read=None):
  all_profiles = set()
  # Use boolean in the data dump
  with open(Path("data", "version", version, "dump_people_users.csv"), "r") as f:
    csv_reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in csv_reader:
      all_profiles.add(row["User ID"])
  return all_profiles

def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("old_version")
  parser.add_argument("new_version")
  parser.add_argument("--sample-api", type=int, default=1000,
                      help="Number of profiles to try looking up via API.")
  args = parser.parse_args()

  old_profiles = load_all_profiles(args.old_version)
  new_profiles = load_all_profiles(args.new_version)

  added_profiles = new_profiles - old_profiles
  deleted_profiles = old_profiles - new_profiles

  print(f"Version {args.new_version} vs. {args.old_version}")
  print(f" * {len(old_profiles)=:_}")
  print(f" * {len(new_profiles)=:_}")
  print(f" * {len(added_profiles)=:_}")
  print(f" * {len(deleted_profiles)=:_}")

  sample_deleted = random.sample(list(deleted_profiles), args.sample_api)
  num_redirects = 0
  for profile_num in sample_deleted:
    if api_tools.is_redirect(profile_num):
      num_redirects += 1
  print(f'Of "deleted" profiles, {num_redirects / len(sample_deleted):.0%} were actually merges')

main()
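
A hedged example invocation (version labels and sample size are placeholders):

python dump_diff.py OLD_VERSION NEW_VERSION --sample-api 500

Note that random.sample raises ValueError if fewer than --sample-api profiles were deleted between the two dumps, so the sample size may need to be lowered when the dumps are close together.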
22 changes: 13 additions & 9 deletions graph_compare.py
@@ -6,30 +6,34 @@
import collections
from pathlib import Path

import networkx as nx
import networkit as nk

import graph_tools
import utils


def load_node_edge_sets(filename):
  graph, names_db = graph_tools.load_graph_nk(filename)
  names = names_db.all_index2names()

  node_set = set(names[node] for node in graph.iterNodes())
  edge_set = set(frozenset([names[u], names[v]])
                 for (u, v) in graph.iterEdges())
  return node_set, edge_set

def main():
parser = argparse.ArgumentParser()
parser.add_argument("graph_before", type=Path)
parser.add_argument("graph_after", type=Path)
args = parser.parse_args()

utils.log("Load graph_before")
graph_before = nx.read_adjlist(args.graph_before)
nodes_before = set(graph_before.nodes())
edges_before = set(frozenset(edge) for edge in graph_before.edges())
del graph_before
nodes_before, edges_before = load_node_edge_sets(args.graph_before)
utils.log(f"Loaded graph with {len(nodes_before):_} nodes and {len(edges_before):_} edges")

print()
utils.log("Load graph_after")
graph_after = nx.read_adjlist(args.graph_after)
nodes_after = set(graph_after.nodes())
edges_after = set(frozenset(edge) for edge in graph_after.edges())
del graph_after
nodes_after, edges_after = load_node_edge_sets(args.graph_after)
utils.log(f"Loaded graph with {len(nodes_after):_} nodes and {len(edges_after):_} edges")

print()
6 changes: 6 additions & 0 deletions graph_tools.py
@@ -42,6 +42,12 @@ def name2index(self, node_name):
rows = cursor.fetchall()
assert len(rows) == 1, (node_name, rows)
return rows[0]["graph_index"]

def all_index2names(self):
cursor = self.conn.cursor()
cursor.execute("SELECT graph_index, node_name FROM nodes")
rows = cursor.fetchall()
return {row["graph_index"]: row["node_name"] for row in rows}


def load_graph_nk(filename):
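
For context, a minimal sketch of how the names database pairs with the networkit graph; the file path is hypothetical, and this mirrors what load_node_edge_sets does in graph_compare.py above:

import graph_tools

# Hypothetical graph file path.
graph, names_db = graph_tools.load_graph_nk("data/version/NEW_VERSION/connection_graph")
index2name = names_db.all_index2names()
# index2name maps networkit node indices back to WikiTree node names,
# so graph.iterNodes() / graph.iterEdges() results can be reported by name.
first = next(iter(graph.iterNodes()))
print(index2name[first])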
36 changes: 26 additions & 10 deletions partition_tools.py
@@ -17,37 +17,53 @@ def __init__(self, version):
self.filename = Path(utils.data_version_dir(version), "partitions.db")
self.conn = sqlite3.connect(self.filename)
self.conn.row_factory = sqlite3.Row
self.cursor = self.conn.cursor()

# Readers
def find_partition_rep(self, table, person):
self.cursor.execute(f"SELECT rep FROM {table} WHERE user_num=?",
cursor = self.conn.cursor()
cursor.execute(f"SELECT rep FROM {table} WHERE user_num=?",
(person,))
rows = self.cursor.fetchall()
rows = cursor.fetchall()
assert len(rows) == 1, (person, rows)
return rows[0]["rep"]


def list_partition(self, table, rep):
self.cursor.execute(f"SELECT user_num FROM {table} WHERE rep=?", (rep,))
return frozenset(row["user_num"] for row in self.cursor.fetchall())
cursor = self.conn.cursor()
cursor.execute(f"SELECT user_num FROM {table} WHERE rep=?", (rep,))
return frozenset(row["user_num"] for row in cursor.fetchall())

def main_component_rep(self, table):
# Note: I just pick the component that Samuel Lothrop (Lothrop-29) belongs
# to. He is one of the most central profiles on WikiTree. This is certainly
# the correct component for the `connected` graph. For other partitions,
# it may not be the largest component ...
return self.find_partition_rep(table, 142891) # Lothrop-29

def enum_all(self, table):
cursor = self.conn.cursor()
cursor.execute(f"SELECT user_num, rep FROM {table}")
while row := cursor.fetchone():
yield row


# Writers
def write_partition(self, table, partitions):
# TODO: Maybe restructure this so that all partitions use the same table with a partition_type field.
self.cursor.execute(f"DROP TABLE IF EXISTS {table}")
self.cursor.execute(f"CREATE TABLE {table} (user_num INT, rep INT, PRIMARY KEY (user_num))")
cursor = self.conn.cursor()
cursor.execute(f"DROP TABLE IF EXISTS {table}")
cursor.execute(f"CREATE TABLE {table} (user_num INT, rep INT, PRIMARY KEY (user_num))")

i = 0
for rep in partitions:
for person in partitions[rep]:
self.cursor.execute(f"INSERT INTO {table} VALUES (?,?)",
cursor.execute(f"INSERT INTO {table} VALUES (?,?)",
(person, rep))
i += 1
if i % 1000000 == 0:
self.conn.commit()
self.conn.commit()

self.cursor.execute(f"CREATE INDEX idx_{table}_rep ON {table} (rep)")
cursor.execute(f"CREATE INDEX idx_{table}_rep ON {table} (rep)")
self.conn.commit()


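
To round this out, a minimal sketch of the input shape write_partition expects; the version label, table name, and partition data are purely illustrative:

import partition_tools

db = partition_tools.PartitionDb("NEW_VERSION")  # hypothetical version label
# partitions maps each component representative to the user_nums in that component.
partitions = {
  142891: {142891, 1000001, 1000002},  # e.g. the component containing Lothrop-29
  2000000: {2000000},                  # a singleton component
}
db.write_partition("example_table", partitions)
print(db.find_partition_rep("example_table", 1000001))  # -> 142891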
