diff --git a/docs/twarc2_en_us.md b/docs/twarc2_en_us.md index eabe7601..90d1adcc 100644 --- a/docs/twarc2_en_us.md +++ b/docs/twarc2_en_us.md @@ -227,6 +227,32 @@ conversation: twarc2 conversation 266031293945503744 > conversation.jsonl +## Likes + +Twarc supports the two approaches that the Twitter API exposes for collecting likes via the `liked-tweets` and `liking-users` commands. + +The `liked-tweets` command returns the tweets that have been liked by a specific account. The account is specified by the user ID of that account; in the following example it is the account of Twitter's founder: + + twarc2 liked-tweets 12 jacks-likes.jsonl + +In this case the output file contains all of the likes of publicly accessible tweets. Note that the order of likes is not guaranteed by the API, but is probably reverse chronological, i.e. the most recent likes by that account first. The underlying tweet objects contain no information about when the tweet was liked. + +The `liking-users` command returns the user profiles of the accounts that have liked a specific tweet (specified by the ID of the tweet): + + twarc2 liking-users 1460417326130421765 liking-users.jsonl + +In this example the output file contains all of the user profiles of the publicly accessible accounts that have liked that specific tweet. Note that the order of profiles is not guaranteed by the API, but is probably reverse chronological, i.e. the profile of the account that most recently liked the tweet first. The underlying profile objects contain no information about when the tweet was liked. + +Note that likes of tweets that are not publicly accessible, or likes by accounts that are protected, will not be retrieved by either of these methods. Therefore, the metrics available on a tweet object (under the `public_metrics.like_count` field) will likely be higher than the number of likes you can retrieve via the Twitter API using these endpoints.
+ +## Retweets + +You can retrieve the user profiles of publicly accessible accounts that have retweeted a specific tweet, using the `retweeted-by` command and the ID of the tweet as an identifier. For example: + + twarc2 retweeted-by 1460417326130421765 retweeting-users.jsonl + +Unfortunately this only returns the user profiles (presumably in reverse chronological order) of the retweeters of that tweet - this means that important information, like when the tweet was retweeted, is not present in the returned object. + ## Dehydrate The `dehydrate` command generates an id list from a file of tweets: diff --git a/test_twarc2.py b/test_twarc2.py index 5b06d0a0..3dec7c10 100644 --- a/test_twarc2.py +++ b/test_twarc2.py @@ -505,6 +505,54 @@ def test_ensure_user_id(): assert T._ensure_user_id(1033441111677788160) == "1033441111677788160" +def test_liking_users(): + + # This is one of @jack's tweets about the Twitter API + likes = T.liking_users(1460417326130421765) + + like_count = 0 + + for page in likes: + assert "data" in page + # These should be user objects. + assert "description" in page["data"][0] + like_count += len(page["data"]) + if like_count > 300: + break + + +def test_retweeted_by(): + + # This is one of @jack's tweets about the Twitter API + retweet_users = T.retweeted_by(1460417326130421765) + + retweet_count = 0 + + for page in retweet_users: + assert "data" in page + # These should be user objects. + assert "description" in page["data"][0] + retweet_count += len(page["data"]) + if retweet_count > 150: + break + + +def test_liked_tweets(): + + # What has @jack liked? + liked_tweets = T.liked_tweets(12) + + like_count = 0 + + for page in liked_tweets: + assert "data" in page + # These should be tweet objects.
def liked_tweets(
    self,
    user_id,
    max_results=100,
    expansions=None,
    tweet_fields=None,
    user_fields=None,
    media_fields=None,
    poll_fields=None,
    place_fields=None,
    pagination_token=None,
):
    """
    Retrieve the tweets liked by the given user_id.

    Requests the `/2/users/{user_id}/liked_tweets` endpoint through
    `get_paginated` and yields every page that contains a `data` key.
    Pages without `data` are not yielded; they are only logged.

    Args:
        user_id: ID of the user whose liked tweets are requested. It is
            interpolated directly into the endpoint URL.
        max_results (int): Forwarded to `_prepare_params`. Default 100.
        expansions: Forwarded to `_prepare_params`.
        tweet_fields: Forwarded to `_prepare_params`.
        user_fields: Forwarded to `_prepare_params`.
        media_fields: Forwarded to `_prepare_params`.
        poll_fields: Forwarded to `_prepare_params`.
        place_fields: Forwarded to `_prepare_params`.
        pagination_token: Forwarded to `_prepare_params`; presumably the
            token to resume paging from -- confirm against
            `get_paginated`.

    Returns:
        generator[dict]: The pages of results that contain a `data` key.
    """
    url = f"https://api.twitter.com/2/users/{user_id}/liked_tweets"

    # Pass the caller's values through. The previous code handed
    # `_prepare_params` hard-coded constants (100 / None), which silently
    # discarded every argument given to this method, including
    # `pagination_token`.
    params = self._prepare_params(
        max_results=max_results,
        expansions=expansions,
        tweet_fields=tweet_fields,
        user_fields=user_fields,
        media_fields=media_fields,
        poll_fields=poll_fields,
        place_fields=place_fields,
        pagination_token=pagination_token,
    )

    for page in self.get_paginated(url, params=params):
        if "data" in page:
            yield page
        else:
            # A page without "data" carries no tweets, so it is logged
            # rather than handed to the caller.
            log.info(
                f"Retrieved an empty page of results for liked_tweets of {user_id}"
            )
@twarc2.command("retweeted-by")
@click.option(
    "--limit",
    default=0,
    help="Maximum number of retweeting users to retrieve. Increments of 100 or --max-results if set.",
    type=int,
)
@click.option(
    "--max-results",
    default=100,
    help="Maximum number of users (retweets) per page of results. Default and maximum is 100.",
    type=int,
)
@command_line_progressbar_option
@click.argument("tweet_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
@cli_api_error
def retweeted_by(T, tweet_id, outfile, limit, max_results, hide_progress):
    """
    Get the users that retweeted a specific tweet.

    Note that the progress bar is approximate.

    """
    # NOTE: the docstring above is shown by click as the command's help
    # text, so implementation notes are kept in comments instead.
    #
    # Flow: validate the tweet ID, optionally look up the tweet's
    # retweet_count to size the progress bar, then write each page of
    # retweeting users to `outfile` until the pages run out or --limit is
    # reached (checked once per page, hence "increments" in the help).
    count = 0
    lookup_total = 0

    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    if not re.match(r"^\d+$", str(tweet_id)):
        click.echo(click.style("Please enter a tweet ID", fg="red"), err=True)
        # Stop here; a malformed ID can only produce API errors below.
        return

    # The default "-" for OUTFILE makes click hand us the stdout stream,
    # whose name is "<stdout>". Comparing against an empty string could
    # never match, so the progress bar was never hidden for stdout output.
    if outfile is not None and (outfile.name == "<stdout>"):
        hide_progress = True

    if not hide_progress:
        # TODO: we could probably do this everytime, and avoid doing any lookups
        # for tweets that don't exist anymore.
        # Take only the first page and tolerate an empty lookup instead of
        # raising IndexError on `list(...)[0]`.
        target_tweet = next(iter(T.tweet_lookup([tweet_id])), {})
        if "data" in target_tweet:
            lookup_total = target_tweet["data"][0]["public_metrics"]["retweet_count"]

    with tqdm(disable=hide_progress, total=lookup_total) as progress:
        for result in T.retweeted_by(tweet_id, max_results=max_results):
            _write(result, outfile)
            page_size = len(result.get("data", []))
            count += page_size
            progress.update(page_size)
            if limit != 0 and count >= limit:
                progress.desc = f"Set --limit of {limit} reached"
                break
+ + """ + count = 0 + lookup_total = 0 + + if not re.match("^\d+$", str(user_id)): + click.echo(click.style("Please enter a user ID", fg="red"), err=True) + + if outfile is not None and (outfile.name == ""): + hide_progress = True + + # NB: there doesn't appear to be anyway to get the total count of likes + # a user has made, so the progress bar isn't very useful in this case... + + with tqdm(disable=hide_progress) as progress: + for result in T.liked_tweets(user_id, max_results=max_results): + _write(result, outfile) + count += len(result.get("data", [])) + progress.update(len(result.get("data", []))) + if limit != 0 and count >= limit: + progress.desc = f"Set --limit of {limit} reached" + break + + @twarc2.command("sample") @command_line_expansions_shortcuts @command_line_expansions_options