Skip to content

Commit

Permalink
twitter-to-sqlite import command, closes #4
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw authored Oct 11, 2019
1 parent 436a170 commit 2019ee9
Show file tree
Hide file tree
Showing 13 changed files with 389 additions and 10 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,22 @@ Here's how to start following tweets from every user ID currently represented as
--sql="select distinct followed_id from following" \
--ids

## Importing data from your Twitter archive

You can request an archive of your Twitter data by [following these instructions](https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive).

Twitter will send you a link to download a `.zip` file. You can import the contents of that file into a set of tables (each beginning with the `archive-` prefix) using the `import` command:

    $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2.zip

This command does not populate any of the regular tables, since Twitter's export data does not exactly match the schema returned by the Twitter API.

You may want to use other commands to populate tables based on data from the archive. For example, to retrieve full API versions of each of the tweets you have favourited in your archive, you could run the following:

    $ twitter-to-sqlite statuses-lookup archive.db \
        --sql='select tweetId from [archive-like]' \
        --skip-existing

## Design notes

* Tweet IDs are stored as integers, to afford sorting by ID in a sensible way
Expand Down
Empty file added tests/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions tests/test_create_zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pathlib

from .utils import create_zip


def test_create_zip():
    """The fixture zip contains exactly the four expected archive files."""
    expected_names = {"account.js", "saved-search.js", "following.js", "follower.js"}
    zf = create_zip()
    actual_names = set()
    for f in zf.filelist:
        actual_names.add(f.filename)
    assert expected_names == actual_names
51 changes: 51 additions & 0 deletions tests/test_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import io

import pytest
import sqlite_utils
from click.testing import CliRunner
from twitter_to_sqlite import cli

from .utils import create_zip


def test_cli_import(tmpdir):
    """Run ``import`` against the fixture zip and check the resulting tables.

    The fixture archive is written to a temporary file, imported into a
    fresh database through the CLI, and the created tables and their rows
    are compared with the contents of the fixture files.
    """
    archive = str(tmpdir / "archive.zip")
    output = str(tmpdir / "output.db")
    buf = io.BytesIO()
    zf = create_zip(buf)
    # Closing the ZipFile finalizes the archive data held in buf.
    zf.close()
    # Use a context manager so the file is flushed and closed before the
    # CLI opens it, instead of relying on garbage collection.
    with open(archive, "wb") as fp:
        fp.write(buf.getbuffer())
    result = CliRunner().invoke(cli.cli, ["import", output, archive])
    # Report result.output on failure: result.stderr raises on Click
    # versions where stderr is not captured separately, which would hide
    # the real failure behind a second exception.
    assert 0 == result.exit_code, result.output
    db = sqlite_utils.Database(output)
    assert {
        "archive-follower",
        "archive-saved-search",
        "archive-account",
        "archive-following",
    } == set(db.table_names())

    assert [{"accountId": "73747798"}, {"accountId": "386025404"}] == list(
        db["archive-follower"].rows
    )
    assert [{"accountId": "547842573"}, {"accountId": "12158"}] == list(
        db["archive-following"].rows
    )

    assert [
        {"savedSearchId": "42214", "query": "simonw"},
        {"savedSearchId": "55814", "query": "django"},
    ] == list(db["archive-saved-search"].rows)
    # "account" is registered without a pk, so the row carries a "pk"
    # column holding a hash-like value in addition to the fixture fields.
    assert [
        {
            "pk": "c4e32e91742df2331ef3ad1e481d1a64d781183a",
            "phoneNumber": "+15555555555",
            "email": "[email protected]",
            "createdVia": "web",
            "username": "simonw",
            "accountId": "12497",
            "createdAt": "2006-11-15T13:18:50.000Z",
            "accountDisplayName": "Simon Willison",
        }
    ] == list(db["archive-account"].rows)
7 changes: 4 additions & 3 deletions tests/test_save_tweets.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from twitter_to_sqlite import utils
import pytest
import json
import pathlib

import pytest
import sqlite_utils
import json
from twitter_to_sqlite import utils


@pytest.fixture
Expand Down
14 changes: 14 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import io
import pathlib
import zipfile


def create_zip(buf=None):
    """Build a zip archive from the ``zip_contents`` directory next to this module.

    ``buf`` is the file-like object the archive is written to; a new
    ``io.BytesIO`` is used when it is omitted. Every regular file found
    recursively under the directory is added under its path relative to
    that directory.

    Returns the still-open ``zipfile.ZipFile``; the caller closes it.
    """
    target = io.BytesIO() if buf is None else buf
    root = pathlib.Path(__file__).parent / "zip_contents"
    archive = zipfile.ZipFile(target, "w")
    regular_files = (entry for entry in root.glob("**/*") if entry.is_file())
    for entry in regular_files:
        archive.write(entry, str(entry.relative_to(root)))
    return archive
11 changes: 11 additions & 0 deletions tests/zip_contents/account.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
window.YTD.account.part0 = [ {
"account" : {
"phoneNumber" : "+15555555555",
"email" : "[email protected]",
"createdVia" : "web",
"username" : "simonw",
"accountId" : "12497",
"createdAt" : "2006-11-15T13:18:50.000Z",
"accountDisplayName" : "Simon Willison"
}
} ]
9 changes: 9 additions & 0 deletions tests/zip_contents/follower.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
window.YTD.follower.part0 = [ {
"follower" : {
"accountId" : "73747798"
}
}, {
"follower" : {
"accountId" : "386025404"
}
} ]
9 changes: 9 additions & 0 deletions tests/zip_contents/following.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
window.YTD.following.part0 = [ {
"following" : {
"accountId" : "547842573"
}
}, {
"following" : {
"accountId" : "12158"
}
} ]
11 changes: 11 additions & 0 deletions tests/zip_contents/saved-search.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
window.YTD.saved_search.part0 = [ {
"savedSearch" : {
"savedSearchId" : "42214",
"query" : "simonw"
}
}, {
"savedSearch" : {
"savedSearchId" : "55814",
"query" : "django"
}
} ]
203 changes: 203 additions & 0 deletions twitter_to_sqlite/archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# Utilities for dealing with Twitter archives
import json

# Registry mapping a filename key (e.g. "account") to a (callable, pk) pair.
# The callable takes the JSON data loaded from that file and returns a
# dictionary of the tables that should be created from it, in the form
# {"table": [rows-to-upsert]}. pk is the optional primary key name given
# at registration time, or None when no key was given.
transformers = {}


def register(filename, each, pk=None):
    """Register a transformer that unwraps one key from every entry.

    The stored callable maps the list of entries loaded from ``filename``
    to ``{filename: rows}``, where each row is ``entry.get(each)`` (so an
    entry without that key contributes ``None``). The callable is stored
    in ``transformers`` together with ``pk``.
    """

    def unwrap(entries):
        rows = []
        for entry in entries:
            rows.append(entry.get(each))
        return {filename: rows}

    transformers[filename] = (unwrap, pk)


def register_each(filename, pk=None):
    """Decorator: register ``fn`` as the per-entry transformer for ``filename``.

    The decorated function receives one entry at a time and returns the
    row for it. The callable stored in ``transformers`` applies it to
    every entry and returns ``{filename: rows}``; it is stored together
    with ``pk``.

    The decorator returns the original function unchanged, so the
    decorated name stays bound to the function rather than to ``None``.
    """

    def inner(fn):
        def callback(data):
            return {filename: [fn(item) for item in data]}

        transformers[filename] = (callback, pk)
        # Hand the function back; without this the decorated name would
        # be rebound to None at module level.
        return fn

    return inner


def register_multi(filename):
    """Decorator: register ``fn`` as the whole-file transformer for ``filename``.

    ``fn`` is stored in ``transformers`` as-is with no primary key; it
    receives the complete loaded data and must itself return the
    ``{"table": [rows]}`` dictionary, which may contain several tables.

    The decorator returns the original function unchanged, so the
    decorated name stays bound to the function rather than to ``None``.
    """

    def inner(fn):
        transformers[filename] = (fn, None)
        # Hand the function back; without this the decorated name would
        # be rebound to None at module level.
        return fn

    return inner


def register_all(filename):
    """Decorator: register ``fn`` to transform all of ``filename``'s data at once.

    ``fn`` is stored in ``transformers`` as-is with no primary key; it
    receives the complete loaded data and must return the
    ``{"table": [rows]}`` dictionary itself.

    The decorator returns the original function unchanged, so the
    decorated name stays bound to the function rather than to ``None``.
    """

    def inner(fn):
        transformers[filename] = (fn, None)
        # Hand the function back; without this the decorated name would
        # be rebound to None at module level.
        return fn

    return inner


def extract_json(contents):
    """Parse the JSON payload out of the bytes of an archive file.

    Archive files wrap their JSON in a JavaScript assignment, e.g.
    ``window.YTD.account_creation_ip.part0 = [ ... data ...]``. When
    ``contents`` (bytes) starts with ``window.`` everything up to and
    including the first ``=`` is discarded; otherwise the bytes are
    parsed as-is.

    Returns the decoded JSON value. Raises ``json.JSONDecodeError`` if
    what remains is not valid JSON.
    """
    contents = contents.strip()
    if contents.startswith(b"window."):
        # Partition on the bare "=" rather than " = " so an assignment
        # written without surrounding spaces is handled too; json.loads
        # ignores the whitespace left in front of the payload.
        contents = contents.partition(b"=")[2]
    return json.loads(contents)


# Account-related files: every entry wraps its record under a single key
# (given as ``each``), and that record becomes the row. No pk is given.
register("account-creation-ip", each="accountCreationIp")
register("account-suspension", each="accountSuspension")
register("account-timezone", each="accountTimezone")
register("account", each="account")


@register_each("ad-engagements")
def ad_engagements(item):
    """Return the ``adEngagements`` value nested under ``ad`` / ``adsUserData``."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["adEngagements"]


@register_each("ad-impressions")
def ad_impressions(item):
    """Return the ``adImpressions`` value nested under ``ad`` / ``adsUserData``."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["adImpressions"]


@register_each("ad-mobile-conversions-attributed")
def ad_mobile_conversions_attributed(item):
    """Return the ``attributedMobileAppConversions`` value of one entry."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["attributedMobileAppConversions"]


@register_each("ad-mobile-conversions-unattributed")
def ad_mobile_conversions_unattributed(item):
    """Return the ``unattributedMobileAppConversions`` value of one entry."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["unattributedMobileAppConversions"]


@register_each("ad-online-conversions-attributed")
def ad_online_conversions_attributed(item):
    """Return the ``attributedOnlineConversions`` value of one entry."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["attributedOnlineConversions"]


@register_each("ad-online-conversions-unattributed")
def ad_online_conversions_unattributed(item):
    """Return the ``unattributedOnlineConversions`` value of one entry."""
    ads_user_data = item["ad"]["adsUserData"]
    return ads_user_data["unattributedOnlineConversions"]


@register_each("ageinfo")
def ageinfo(item):
    """Return the ``ageInfo`` value nested under ``ageMeta`` in one entry."""
    age_meta = item["ageMeta"]
    return age_meta["ageInfo"]


# Files whose entries wrap one record under the ``each`` key. Where a pk
# is given it names the field passed along as the primary key.
register("block", each="blocking", pk="accountId")
register("connected-applications", each="connectedApplication", pk="id")
# TODO: "contact" is not registered yet:
# register("contact", ...)
register("direct-message-group-headers", each="dmConversation", pk="conversationId")
register("direct-message-group", each="dmConversation", pk="conversationId")
register("direct-message-headers", each="dmConversation", pk="conversationId")
# No pk is set for "direct-message", because its entries contain
# duplicates.
# TODO: These actually do warrant separate tables.
register("direct-message", each="dmConversation")

register("email-address-change", each="emailAddressChange")
register("follower", each="follower", pk="accountId")
register("following", each="following", pk="accountId")
register("ip-audit", each="ipAudit")
register("like", each="like", pk="tweetId")


@register_all("lists-created")
def lists_created(data):
    """Return the "lists-created" table built from the entries' list URLs."""
    rows = _list_from_common(data)
    return {"lists-created": rows}


@register_all("lists-member")
def lists_member(data):
    """Return the "lists-member" table built from the entries' list URLs."""
    rows = _list_from_common(data)
    return {"lists-member": rows}


@register_all("lists-subscribed")
def lists_subscribed(data):
    """Return the "lists-subscribed" table built from the entries' list URLs."""
    rows = _list_from_common(data)
    return {"lists-subscribed": rows}


# Each "moment" entry wraps its record under the "moment" key; momentId is
# given as the pk.
register("moment", each="moment", pk="momentId")
# TODO: "mute" is not registered yet:
# register("mute", ...)


@register_all("ni-devices")
def ni_devices(data):
    """Flatten the ``niDeviceResponse`` entries into one "ni-devices" table.

    Each entry's ``niDeviceResponse`` is a mapping whose first key is used
    as the device category and whose first value holds the device
    details. The returned rows are those details with the category added
    under a ``category`` key.

    Returns ``{"ni-devices": rows}``. Raises ``KeyError`` when an entry
    has no ``niDeviceResponse`` and ``IndexError`` when that mapping is
    empty.
    """
    devices = []
    for block in data:
        response = block["niDeviceResponse"]
        # Only the first {category: details} pair of the response is used.
        category, details = list(response.items())[0]
        # Build a new row instead of writing "category" into the parsed
        # input, so the caller's data is left untouched.
        devices.append({**details, "category": category})
    # The table is named after the registered file ("ni-devices").
    return {"ni-devices": devices}


# Skipped all the periscope- stuff for the moment


@register_multi("personalization")
def personalization(data):
    """Split the first personalization entry into several tables.

    Only ``data[0]["p13nData"]`` is read. As a multi transformer this
    returns a dictionary of table name => list of rows to insert. Plain
    string lists (advertisers, shows, location history) become rows of
    the form ``{"name": value}``; single objects (gender info, inferred
    age info, audience count) become one-row tables.
    """
    p13n = data[0]["p13nData"]

    demographics = p13n["demographics"]
    tables = {
        "personalization-demographics-languages": demographics["languages"],
        "personalization-demographics-genderInfo": [demographics["genderInfo"]],
    }

    interests = p13n["interests"]
    tables["personalization-interests"] = interests["interests"]
    tables["personalization-partnerInterests"] = interests["partnerInterests"]

    audience = interests["audienceAndAdvertisers"]
    tables["personalization-advertisers"] = [
        {"name": advertiser} for advertiser in audience["advertisers"]
    ]
    tables["personalization-num-audiences"] = [
        {"numAudiences": audience["numAudiences"]}
    ]

    tables["personalization-shows"] = [
        {"name": show} for show in interests["shows"]
    ]
    tables["personalization-locationHistory"] = [
        {"name": location} for location in p13n["locationHistory"]
    ]
    tables["personalization-inferredAgeInfo"] = [p13n["inferredAgeInfo"]]
    return tables


# "phone-number" entries wrap their record under the "device" key.
register("phone-number", each="device")
register("profile", each="profile")
# TODO: protected-history.js is not registered yet

# savedSearchId is given as the pk for saved searches.
register("saved-search", each="savedSearch", pk="savedSearchId")
# TODO: screen-name-change.js is not registered yet


@register_each("tweet", pk="id")
def tweet(item):
    """Convert the ``id`` value and every ``*_id`` value of ``item`` to int.

    The conversion happens in place and the same dictionary is returned.
    """
    id_keys = [name for name in item if name == "id" or name.endswith("_id")]
    for name in id_keys:
        item[name] = int(item[name])
    return item


# "verified" entries wrap their record under the "verified" key; no pk is
# given.
register("verified", each="verified")


def _list_from_common(data):
lists = []
for block in data:
for url in block["userListInfo"]["urls"]:
bits = url.split("/")
lists.append({"screen_name": bits[-3], "list_slug": bits[-1]})
return lists
Loading

0 comments on commit 2019ee9

Please sign in to comment.