Skip to content

Commit

Permalink
Automate update_data_dump completely
Browse files Browse the repository at this point in the history
  • Loading branch information
sligocki committed Feb 3, 2020
1 parent 20a441d commit 06dca99
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 12 deletions.
1 change: 1 addition & 0 deletions connection.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
Find all of the shortest length connections between two people.
"""
Expand Down
47 changes: 47 additions & 0 deletions process_ls_date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Convert the modification date in `ls -l` output (read from stdin) to ISO 8601 form (YYYY-MM-DD).
"""

import datetime
import sys

# Maps the three-letter English month abbreviations used by `ls -l` to
# month numbers.
month_name2num = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


def parse_ls_date(text, today=None):
    """Return the modification date found in `ls -l` style output.

    The date is taken from the three whitespace-separated fields that
    precede the final field (the file name) of `text`.  Expected formats:
        -rw-r--r-- ? 517 517 995891602 Jan 26 14:21 foo.txt
    or
        -rw-r--r-- 1 sl929 wheel 0 Dec 13 1985 /tmp/foo

    Args:
        text: Output containing the listing line; only its last four
            whitespace-separated fields are examined.
        today: Reference date used to infer the year when the listing shows
            a time instead of a year.  Defaults to the local current date.

    Returns:
        A datetime.date.

    Raises:
        ValueError: If the input has too few fields, names an unknown
            month, has a non-numeric day/year, or forms no valid date.
    """
    fields = text.split()
    if len(fields) < 4:
        raise ValueError(
            "expected `ls -l` output, got {!r}".format(text))
    month_str, day_str, year_or_time_str = fields[-4:-1]
    if month_str not in month_name2num:
        raise ValueError("unrecognized month {!r}".format(month_str))
    month_num = month_name2num[month_str]
    day = int(day_str)

    if ":" not in year_or_time_str:
        # Year is explicitly in ls output.
        return datetime.date(int(year_or_time_str), month_num, day)

    # Year is implicit: a time is shown instead, so the file is recent.
    # Pick the latest year that does not put the date in the future.  One
    # day of slack is allowed because the server's clock/timezone may be
    # ahead of ours (e.g. it is already Jan 1 there while still Dec 31 here).
    if today is None:
        today = datetime.date.today()
    latest_plausible = today + datetime.timedelta(days=1)
    for year in (today.year + 1, today.year, today.year - 1):
        try:
            candidate = datetime.date(year, month_num, day)
        except ValueError:
            # E.g. Feb 29 in a non-leap year; try the next candidate year.
            continue
        if candidate <= latest_plausible:
            return candidate
    raise ValueError(
        "cannot infer year for {} {}".format(month_str, day_str))


def main():
    """Read `ls -l` output from stdin and print the file date in ISO form."""
    try:
        file_date = parse_ls_date(sys.stdin.read())
    except ValueError as err:
        # Message goes to stderr and the exit status is non-zero, so callers
        # capturing stdout get nothing rather than a bogus date.
        sys.exit("process_ls_date.py: {}".format(err))
    print(file_date.isoformat())


if __name__ == "__main__":
    main()
24 changes: 12 additions & 12 deletions update_data_dump.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/bin/bash
# Script for updating data dump from apps.wikitree.com

set -u
Expand All @@ -9,31 +10,28 @@ PW_FILE=$(realpath data/apps.password.txt)

echo "(1) Check version of data dump"
# TODO: Put password in file.
echo ls -l dumps/dump_people_users.csv.gz | sshpass -f "$PW_FILE" sftp ${USERNAME}@apps.wikitree.com

echo "(2) Figure out timestamp from ls"
# TODO: Auto-extract timestamp from ls format.
echo "Input timestamp now: [^C to cancel]"
read TIMESTAMP
echo "You entered [$TIMESTAMP]"

# Ask the server for the long listing of the users dump and convert the
# date in that listing to YYYY-MM-DD; this date identifies the dump version.
TIMESTAMP=$( \
  echo ls -l dumps/dump_people_users.csv.gz \
  | sshpass -f "$PW_FILE" sftp "${USERNAME}@apps.wikitree.com" \
  | python3 process_ls_date.py)
# If sftp or the date parser failed, TIMESTAMP is empty (or garbage).  An
# empty value would make the directory test below match data/dumps/ itself
# and wrongly report that the dump is already present, so stop here instead.
if [[ ! "$TIMESTAMP" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
  echo "Could not determine data dump version (got [$TIMESTAMP])" >&2
  exit 1
fi
echo "Data dump version: $TIMESTAMP"
if [ -d "data/dumps/$TIMESTAMP" ]; then
  echo "We already have this dump"
  echo "Done"
  exit 0
fi

echo "(3) If we don't yet have this dump, download it"
echo "(2) If we don't yet have this dump, download it"
mkdir data/dumps/$TIMESTAMP
cd data/dumps/$TIMESTAMP
echo get dumps/*.csv.gz | sshpass -f "$PW_FILE" sftp ${USERNAME}@apps.wikitree.com

echo "(4) Unzip (overwriting previous CSVs)"
echo "(3) Unzip (overwriting previous CSVs)"
for x in users marriages; do
gunzip dump_people_${x}.csv.gz -c > ../../dump_people_${x}.csv
done

echo "(5) Process new dump"
echo "(4) Process new dump"
cd ../../.. # Back to main repo
rm -f data/wikitree_dump.db
echo csv_to_sqlite.py
Expand All @@ -45,5 +43,7 @@ python csv_to_groups.py
echo "csv_to_groups.py --sibling-in-law"
python csv_to_groups.py --sibling-in-law

echo "(6) Print stats about new dump"
echo "(5) Print stats about new dump"
# TODO

echo "Done"

0 comments on commit 06dca99

Please sign in to comment.