Skip to content

Commit

Permalink
* Update api_tools.is_redirect to return info about why it isn't.
Browse files Browse the repository at this point in the history
* Actually delete old dumps.
  • Loading branch information
sligocki committed Apr 4, 2023
1 parent 2553c95 commit 5c4ffe6
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 32 deletions.
27 changes: 16 additions & 11 deletions api_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,25 @@

def api_req(**params):
    """POST `params` to the WikiTree API and return the decoded JSON reply.

    The keyword arguments are form-encoded and sent as the request body
    (supplying `data` makes urllib issue a POST). The response body is
    parsed with `json.loads` and returned unchanged.
    """
    encoded_params = urllib.parse.urlencode(params)
    # Use the response as a context manager so the connection is closed
    # even if reading or JSON decoding raises.
    with urllib.request.urlopen("https://api.wikitree.com/api.php",
                                data=encoded_params.encode("utf-8")) as resp:
        return json.loads(resp.read())

def is_redirect(profile_num_or_id):

class RedirectInfo:
    """Result of a getBio lookup, reduced to redirect information.

    Attributes are set from the first element of the JSON response:
      status: the API status value. 0 is success; on failure it is a
        message such as "Invalid page id".
      redirects_to: the target named inside "#REDIRECT [[...]]" when the
        lookup succeeded and the whole bio is such a marker, else None.
    """

    # Bio text of a redirected profile, e.g. "#REDIRECT [[Smith-1]]".
    _REDIRECT_RE = re.compile(r"#REDIRECT \[\[(.*)\]\]")

    def __init__(self, json_response):
        result = json_response[0]
        self.status = result["status"]

        # Save Redirection info
        self.redirects_to = None
        if self.status == 0:
            # Be tolerant of a successful reply that carries no usable bio
            # text instead of raising KeyError / TypeError.
            bio = result.get("bio")
            if isinstance(bio, str):
                m = self._REDIRECT_RE.fullmatch(bio)
                if m:
                    self.redirects_to = m.group(1)

    def __repr__(self):
        return (f"{type(self).__name__}(status={self.status!r}, "
                f"redirects_to={self.redirects_to!r})")


def redirect_info(profile_num_or_id):
    """Look up a profile by # or id and report whether it is a redirect.

    Issues a "getBio" API request for `profile_num_or_id` and wraps the
    reply in a RedirectInfo, whose `redirects_to` is the id of the profile
    this one now redirects to (None when it is not a redirect) and whose
    `status` is the API status (0 on success; on failure something like
    "Invalid page id").
    """
    resp = api_req(action="getBio", key=profile_num_or_id)
    return RedirectInfo(resp)
4 changes: 3 additions & 1 deletion dump_cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ set -e
echo Starting Cleanup
# Save 4 most recent versions.
# Version directories are named like 20YY-MM-DD, so a plain sort is
# chronological and `head --lines=-4` yields everything except the newest 4.
# NOTE: grep -E (POSIX ERE) has no \d digit class, so spell out [0-9].
for x in $(ls data/version/ | grep -E '^20[0-9]{2}-[0-9]{2}-[0-9]{2}$' | sort | head --lines=-4); do
  echo "Removing data/version/$x"
  rm -rf -- "data/version/$x"
done
echo Finished Cleanup
echo
df -h .
67 changes: 48 additions & 19 deletions dump_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,77 @@
"""

import argparse
import collections
import csv
import partition_tools
from pathlib import Path
import random

import api_tools

import pandas as pd


# Name of the dump column that is read as each profile's identifier.
WIKITREE_ID_COL = "WikiTree ID_DB"


def load_all_profiles(version, debug_limit_read=None):
    """Return the set of profile ids found in one data dump version.

    Reads only the WIKITREE_ID_COL column of the tab-separated file
    data/version/<version>/dump_people_users.csv (quoting disabled) and
    returns its values as a set.

    `debug_limit_read` is accepted for interface compatibility but is
    currently unused.
    """
    in_path = Path("data", "version", version, "dump_people_users.csv")
    df = pd.read_csv(in_path, delimiter='\t', quoting=csv.QUOTE_NONE,
                     usecols=[WIKITREE_ID_COL])
    return set(df[WIKITREE_ID_COL])


def main():
    """Compare two dump versions and summarize added / deleted profiles.

    Command line: OLD_VERSION NEW_VERSION [--sample-api N].

    Prints the profile counts for both versions plus the added and deleted
    sets. If --sample-api is positive, up to that many "deleted" profiles
    are looked up through the API and classified as moves / renames,
    merges, or non-redirects.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("old_version")
    parser.add_argument("new_version")
    parser.add_argument("--sample-api", type=int, default=100,
                        help="Number of profiles to try looking up via API.")
    args = parser.parse_args()

    old_profiles = load_all_profiles(args.old_version)
    new_profiles = load_all_profiles(args.new_version)

    added_profiles = new_profiles - old_profiles
    deleted_profiles = old_profiles - new_profiles

    print(f"Version {args.new_version} vs. {args.old_version}")
    print(f" * {len(old_profiles)=:_}")
    print(f" * {len(new_profiles)=:_}")
    print(f" * {len(added_profiles)=:_}")
    print(f" * {len(deleted_profiles)=:_} (includes merges and moves)")

    if args.sample_api > 0:
        # random.sample raises ValueError if asked for more items than
        # exist, so never request more than the number of deleted profiles.
        sample_size = min(args.sample_api, len(deleted_profiles))
        sample_deleted = random.sample(list(deleted_profiles), sample_size)

        # Keep counter so that we keep track if multiple profiles
        # were merged into a single destination.
        redirect_dst_counts = collections.Counter()
        # Reasons that deleted profiles could not be accessed.
        non_redirect_reasons = collections.Counter()
        for profile_id in sample_deleted:
            redirect_info = api_tools.redirect_info(profile_id)
            if redirect_info.redirects_to:
                redirect_dst_counts[redirect_info.redirects_to] += 1
            else:
                non_redirect_reasons[redirect_info.status] += 1

        # If destination pre-existed, this is a merge
        num_merged = sum(cnt for dst, cnt in redirect_dst_counts.items()
                         if dst in old_profiles)
        # If destination did not pre-exist, this is a move/rename.
        num_moved = sum(cnt for dst, cnt in redirect_dst_counts.items()
                        if dst not in old_profiles)

        print()
        print('Of "deleted" profiles:')
        if len(sample_deleted) < len(deleted_profiles):
            # Only a sample was checked: report estimated fractions.
            # The sample is non-empty here, so the divisions are safe.
            print(f" * ~{num_moved / len(sample_deleted):.0%} were moves / renames")
            print(f" * ~{num_merged / len(sample_deleted):.0%} were merges")
            print(f" * ~{non_redirect_reasons.total() / len(sample_deleted):.0%} were not redirects (deleted / made private)")
        else:
            # Every deleted profile was checked: report exact counts.
            print(f" * {num_moved=:_}")
            print(f" * {num_merged=:_}")
            print(f" * {non_redirect_reasons.total()=:_}")
        print(" * Non-redirects:", non_redirect_reasons)


# Run only when executed as a script, so that importing this module does not
# parse the command line or hit the API.
if __name__ == "__main__":
    main()
4 changes: 3 additions & 1 deletion dump_update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ source dump_download.sh
# $DOWNLOADED is executed as a command here; presumably dump_download.sh
# (sourced earlier) sets it to `true` or `false` -- confirm there.
if $DOWNLOADED; then
  time bash dump_build.sh "$TIMESTAMP"

  echo
  echo "Update default version to $TIMESTAMP"
  # Re-point the `default` symlink at the freshly built version.
  rm -f data/version/default
  ln -s "$TIMESTAMP" data/version/default

  echo
  bash dump_cleanup.sh
fi

0 comments on commit 5c4ffe6

Please sign in to comment.