twitter-to-sqlite import command, closes #4

dogsheep · Oct 11, 2019 · 2019ee9 · 2019ee9
1 parent 436a170
commit 2019ee9
Show file tree

Hide file tree

Showing 13 changed files with 389 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -215,6 +215,22 @@ Here's how to start following tweets from every user ID currently represented as
         --sql="select distinct followed_id from following" \
         --ids
 
+## Importing data from your Twitter archive
+
+You can request an archive of your Twitter data by [following these instructions](https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive).
+
+Twitter will send you a link to download a `.zip` file. You can import the contents of that file into a set of tables (each beginning with the `archive-` prefix) using the `import` command:
+
+    $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2.zip
+
+This command does not populate any of the regular tables, since Twitter's export data does not exactly match the schema returned by the Twitter API.
+
+You may want to use other commands to populate tables based on data from the archive. For example, to retrieve full API versions of each of the tweets you have favourited in your archive, you could run the following:
+
+    $ twitter-to-sqlite statuses-lookup archive.db \
+        --sql='select tweetId from [archive-like]' \
+        --skip-existing
+
 ## Design notes
 
 * Tweet IDs are stored as integers, to afford sorting by ID in a sensible way

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_create_zip.py b/tests/test_create_zip.py
@@ -0,0 +1,10 @@
+import pathlib
+
+from .utils import create_zip
+
+
+def test_create_zip():
+    """The fixture archive holds exactly the four sample .js files."""
+    expected = {"account.js", "saved-search.js", "following.js", "follower.js"}
+    names = set(create_zip().namelist())
+    assert expected == names
diff --git a/tests/test_import.py b/tests/test_import.py
@@ -0,0 +1,51 @@
+import io
+
+import pytest
+import sqlite_utils
+from click.testing import CliRunner
+from twitter_to_sqlite import cli
+
+from .utils import create_zip
+
+
+def test_cli_import(tmpdir):
+    """Run the ``import`` command on the fixture archive and check every table."""
+    archive = str(tmpdir / "archive.zip")
+    output = str(tmpdir / "output.db")
+    buf = io.BytesIO()
+    zf = create_zip(buf)
+    zf.close()
+    # Close the file explicitly so the archive is fully flushed to disk
+    # before the CLI opens it.
+    with open(archive, "wb") as fp:
+        fp.write(buf.getbuffer())
+    result = CliRunner().invoke(cli.cli, ["import", output, archive])
+    # Report result.output on failure: result.stderr raises ValueError on
+    # Click versions where stderr is not captured separately, which would
+    # hide the real failure.
+    assert 0 == result.exit_code, result.output
+    db = sqlite_utils.Database(output)
+    assert {
+        "archive-follower",
+        "archive-saved-search",
+        "archive-account",
+        "archive-following",
+    } == set(db.table_names())
+
+    assert [{"accountId": "73747798"}, {"accountId": "386025404"}] == list(
+        db["archive-follower"].rows
+    )
+    assert [{"accountId": "547842573"}, {"accountId": "12158"}] == list(
+        db["archive-following"].rows
+    )
+
+    assert [
+        {"savedSearchId": "42214", "query": "simonw"},
+        {"savedSearchId": "55814", "query": "django"},
+    ] == list(db["archive-saved-search"].rows)
+    account_rows = list(db["archive-account"].rows)
+    assert [
+        {
+            "pk": "c4e32e91742df2331ef3ad1e481d1a64d781183a",
+            "phoneNumber": "+15555555555",
+            "email": "[email protected]",
+            "createdVia": "web",
+            "username": "simonw",
+            "accountId": "12497",
+            "createdAt": "2006-11-15T13:18:50.000Z",
+            "accountDisplayName": "Simon Willison",
+        }
+    ] == account_rows
diff --git a/tests/test_save_tweets.py b/tests/test_save_tweets.py
@@ -1,8 +1,9 @@
-from twitter_to_sqlite import utils
-import pytest
+import json
 import pathlib
+
+import pytest
 import sqlite_utils
-import json
+from twitter_to_sqlite import utils
 
 
 @pytest.fixture

diff --git a/tests/utils.py b/tests/utils.py
@@ -0,0 +1,14 @@
+import io
+import pathlib
+import zipfile
+
+
+def create_zip(buf=None):
+    """Zip every file found under the ``zip_contents`` directory next to this module.
+
+    Writes into ``buf`` when one is given, otherwise into a fresh in-memory
+    ``BytesIO``. Members are named by their path relative to ``zip_contents``.
+    The returned ``ZipFile`` is still open in write mode.
+    """
+    target = io.BytesIO() if buf is None else buf
+    root = pathlib.Path(__file__).parent / "zip_contents"
+    archive = zipfile.ZipFile(target, "w")
+    for entry in root.glob("**/*"):
+        if not entry.is_file():
+            # Directories are matched by the glob too; only files are added
+            continue
+        member_name = str(entry.relative_to(root))
+        archive.write(entry, member_name)
+    return archive
diff --git a/tests/zip_contents/account.js b/tests/zip_contents/account.js
@@ -0,0 +1,11 @@
+window.YTD.account.part0 = [ {
+  "account" : {
+    "phoneNumber" : "+15555555555",
+    "email" : "[email protected]",
+    "createdVia" : "web",
+    "username" : "simonw",
+    "accountId" : "12497",
+    "createdAt" : "2006-11-15T13:18:50.000Z",
+    "accountDisplayName" : "Simon Willison"
+  }
+} ]
diff --git a/tests/zip_contents/follower.js b/tests/zip_contents/follower.js
@@ -0,0 +1,9 @@
+window.YTD.follower.part0 = [ {
+  "follower" : {
+    "accountId" : "73747798"
+  }
+}, {
+  "follower" : {
+    "accountId" : "386025404"
+  }
+} ]
diff --git a/tests/zip_contents/following.js b/tests/zip_contents/following.js
@@ -0,0 +1,9 @@
+window.YTD.following.part0 = [ {
+  "following" : {
+    "accountId" : "547842573"
+  }
+}, {
+  "following" : {
+    "accountId" : "12158"
+  }
+} ]
diff --git a/tests/zip_contents/saved-search.js b/tests/zip_contents/saved-search.js
@@ -0,0 +1,11 @@
+window.YTD.saved_search.part0 = [ {
+  "savedSearch" : {
+    "savedSearchId" : "42214",
+    "query" : "simonw"
+  }
+}, {
+  "savedSearch" : {
+    "savedSearchId" : "55814",
+    "query" : "django"
+  }
+} ]
diff --git a/twitter_to_sqlite/archive.py b/twitter_to_sqlite/archive.py
@@ -0,0 +1,203 @@
+# Utilities for dealing with Twitter archives
+import json
+
+# Goal is to have a mapping of filename to a
+# (callable, pk) pair, where the callable
+# takes the JSON from that file and returns a dictionary
+# of tables that should be created {"table": [rows-to-upsert]}
+transformers = {}
+
+
+def register(filename, each, pk=None):
+    """Register a transformer that unwraps one key from every item of a file.
+
+    The stored callback returns ``{filename: rows}``, where each row is
+    ``item.get(each)`` (``None`` when an item lacks that key). ``pk`` is
+    stored next to the callback in ``transformers``.
+    """
+
+    def unwrap(data):
+        rows = []
+        for item in data:
+            rows.append(item.get(each))
+        return {filename: rows}
+
+    transformers[filename] = (unwrap, pk)
+
+
+def register_each(filename, pk=None):
+    """Decorator factory: register ``fn`` as a per-item transformer for ``filename``.
+
+    The stored callback applies ``fn`` to every item of the parsed file and
+    returns ``{filename: [fn(item), ...]}``. ``pk`` is stored alongside the
+    callback in ``transformers``.
+    """
+
+    def inner(fn):
+        def callback(data):
+            return {filename: [fn(item) for item in data]}
+
+        transformers[filename] = (callback, pk)
+        # Hand the function back so the decorated name stays bound to it
+        # instead of being rebound to None.
+        return fn
+
+    return inner
+
+
+def register_multi(filename):
+    """Decorator factory: register ``fn`` as the whole-file transformer for ``filename``.
+
+    ``fn`` is stored as-is in ``transformers`` with no pk; it receives the
+    parsed file and returns the dict of tables itself.
+    """
+
+    def inner(fn):
+        transformers[filename] = (fn, None)
+        # Hand the function back so the decorated name stays bound to it
+        # instead of being rebound to None.
+        return fn
+
+    return inner
+
+
+def register_all(filename):
+    """Decorator factory: register ``fn`` to handle all of ``filename``'s data at once.
+
+    ``fn`` is stored unchanged in ``transformers`` with no pk; it is called
+    with the parsed file and returns the dict of tables itself.
+    """
+
+    def inner(fn):
+        transformers[filename] = (fn, None)
+        # Hand the function back so the decorated name stays bound to it
+        # instead of being rebound to None.
+        return fn
+
+    return inner
+
+
+def extract_json(contents):
+    """Parse the JSON payload of an archive ``.js`` file.
+
+    Archive files look like ``window.YTD.account_creation_ip.part0 = [ ... ]``.
+    When the content starts with ``window.``, everything up to and including
+    the first ``=`` is dropped before parsing; otherwise the content is parsed
+    as plain JSON. Accepts ``bytes`` or ``str``.
+
+    Raises ``ValueError`` if a ``window.`` prefix is present without any ``=``,
+    or if the remaining content is not valid JSON.
+    """
+    if isinstance(contents, str):
+        contents = contents.encode("utf-8")
+    contents = contents.strip()
+    if contents.startswith(b"window."):
+        # Split on the bare "=" so "part0=[...]" works as well as "part0 = [...]"
+        _, sep, payload = contents.partition(b"=")
+        if not sep:
+            raise ValueError("No '=' found after the window. prefix")
+        contents = payload.strip()
+    return json.loads(contents)
+
+
+# Account files: each item wraps its row under a single key, which
+# register() unwraps. No pk is passed for these.
+register("account-creation-ip", each="accountCreationIp")
+register("account-suspension", each="accountSuspension")
+register("account-timezone", each="accountTimezone")
+register("account", each="account")
+
+
+@register_each("ad-engagements")
+def ad_engagements(item):
+    """Return the ``adEngagements`` value nested under ``ad.adsUserData``."""
+    user_data = item["ad"]["adsUserData"]
+    return user_data["adEngagements"]
+
+
+@register_each("ad-impressions")
+def ad_impressions(item):
+    """Return the ``adImpressions`` value nested under ``ad.adsUserData``."""
+    user_data = item["ad"]["adsUserData"]
+    return user_data["adImpressions"]
+
+
+@register_each("ad-mobile-conversions-attributed")
+def ad_mobile_conversions_attributed(item):
+    """Return ``attributedMobileAppConversions`` from under ``ad.adsUserData``."""
+    user_data = item["ad"]["adsUserData"]
+    return user_data["attributedMobileAppConversions"]
+
+
+@register_each("ad-mobile-conversions-unattributed")
+def ad_mobile_conversions_unattributed(item):
+    """Return ``unattributedMobileAppConversions`` from under ``ad.adsUserData``."""
+    user_data = item["ad"]["adsUserData"]
+    return user_data["unattributedMobileAppConversions"]
+
+
+@register_each("ad-online-conversions-attributed")
+def ad_online_conversions_attributed(item):
+    """Return ``attributedOnlineConversions`` from under ``ad.adsUserData``."""
+    user_data = item["ad"]["adsUserData"]
+    return user_data["attributedOnlineConversions"]
+
+
+@register_each("ad-online-conversions-unattributed")
+def ad_online_conversions_unattributed(item):
+    """Return ``unattributedOnlineConversions`` from under ``ad.adsUserData``."""
+    user_data = item["ad"]["adsUserData"]
+    return user_data["unattributedOnlineConversions"]
+
+
+@register_each("ageinfo")
+def ageinfo(item):
+    """Return the ``ageInfo`` value nested under ``ageMeta``."""
+    age_meta = item["ageMeta"]
+    return age_meta["ageInfo"]
+
+
+# One-key unwraps. `pk` is stored with the transformer in `transformers`;
+# presumably the importer uses it as the table's primary key - TODO confirm.
+register("block", each="blocking", pk="accountId")
+register("connected-applications", each="connectedApplication", pk="id")
+# Not handled yet:
+# register("contact", ...)
+register("direct-message-group-headers", each="dmConversation", pk="conversationId")
+register("direct-message-group", each="dmConversation", pk="conversationId")
+register("direct-message-headers", each="dmConversation", pk="conversationId")
+# pk for this one is NOT set, because there are dupes:
+# TODO: These actually do warrant separate tables:
+register("direct-message", each="dmConversation")
+
+register("email-address-change", each="emailAddressChange")
+register("follower", each="follower", pk="accountId")
+register("following", each="following", pk="accountId")
+register("ip-audit", each="ipAudit")
+register("like", each="like", pk="tweetId")
+
+
+@register_all("lists-created")
+def lists_created(data):
+    """Map the file's list URLs to rows of a single ``lists-created`` table."""
+    rows = _list_from_common(data)
+    return {"lists-created": rows}
+
+
+@register_all("lists-member")
+def lists_member(data):
+    """Map the file's list URLs to rows of a single ``lists-member`` table."""
+    rows = _list_from_common(data)
+    return {"lists-member": rows}
+
+
+@register_all("lists-subscribed")
+def lists_subscribed(data):
+    """Map the file's list URLs to rows of a single ``lists-subscribed`` table."""
+    rows = _list_from_common(data)
+    return {"lists-subscribed": rows}
+
+
+# Each item wraps its row under the "moment" key; "momentId" is stored as pk.
+register("moment", each="moment", pk="momentId")
+# Not handled yet:
+# register("mute", ...)
+
+
+@register_all("ni-devices")
+def ni_devices(data):
+    """Flatten the ni-devices file into rows of a single ``ni-devices`` table.
+
+    Each block's ``niDeviceResponse`` is a dict; only its first key/value
+    pair is used. The key is written into the value dict as ``category``
+    (the dict is modified in place) and that dict becomes the row.
+    """
+    devices = []
+    for block in data:
+        response = block["niDeviceResponse"]
+        # Only the first entry of the response is read
+        category, details = list(response.items())[0]
+        details["category"] = category
+        devices.append(details)
+    # Table name matches the registered filename
+    return {"ni-devices": devices}
+
+
+# Skipped all the periscope- stuff for the moment
+
+
+@register_multi("personalization")
+def personalization(data):
+    """Split the personalization file into several tables.
+
+    Only the first element of ``data`` is read. Returns a dict mapping each
+    table name to the list of rows for that table; plain string lists are
+    wrapped as ``{"name": ...}`` rows and single objects as one-row lists.
+    """
+    p13n = data[0]["p13nData"]
+    demographics = p13n["demographics"]
+    tables = {
+        "personalization-demographics-languages": demographics["languages"],
+        "personalization-demographics-genderInfo": [demographics["genderInfo"]],
+    }
+
+    interests = p13n["interests"]
+    tables["personalization-interests"] = interests["interests"]
+    tables["personalization-partnerInterests"] = interests["partnerInterests"]
+
+    audience = interests["audienceAndAdvertisers"]
+    tables["personalization-advertisers"] = [
+        {"name": advertiser} for advertiser in audience["advertisers"]
+    ]
+    tables["personalization-num-audiences"] = [
+        {"numAudiences": audience["numAudiences"]}
+    ]
+
+    tables["personalization-shows"] = [
+        {"name": show} for show in interests["shows"]
+    ]
+    tables["personalization-locationHistory"] = [
+        {"name": place} for place in p13n["locationHistory"]
+    ]
+    tables["personalization-inferredAgeInfo"] = [p13n["inferredAgeInfo"]]
+    return tables
+
+
+# One-key unwraps; the commented filenames below have no transformer
+# registered here.
+register("phone-number", each="device")
+register("profile", each="profile")
+# protected-history.js
+
+register("saved-search", each="savedSearch", pk="savedSearchId")
+# screen-name-change.js
+
+
+@register_each("tweet", pk="id")
+def tweet(item):
+    """Convert ``id`` and every ``*_id`` value of a tweet to ``int``, in place.
+
+    Returns the same dict that was passed in.
+    """
+    id_keys = [name for name in item if name == "id" or name.endswith("_id")]
+    for name in id_keys:
+        item[name] = int(item[name])
+    return item
+
+
+# Each item wraps its row under the "verified" key; no pk is passed.
+register("verified", each="verified")
+
+
+def _list_from_common(data):
+    """Turn the list URLs in each block into ``screen_name``/``list_slug`` rows.
+
+    Every URL under ``block["userListInfo"]["urls"]`` is split on ``/``; the
+    third-from-last piece becomes ``screen_name`` and the last ``list_slug``.
+    """
+    return [
+        {"screen_name": pieces[-3], "list_slug": pieces[-1]}
+        for block in data
+        for pieces in (url.split("/") for url in block["userListInfo"]["urls"])
+    ]