diff --git a/setup.py b/setup.py index 91d32459..0d96c2c6 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ setup_requires=["pytest-runner"], tests_require=[ "pytest", + "black", "pytest-black", "python-dotenv", "pytz", diff --git a/test_twarc.py b/test_twarc.py index 5cff4a24..e6d396e6 100644 --- a/test_twarc.py +++ b/test_twarc.py @@ -337,7 +337,6 @@ def test_follower_ids_with_user_id(): def test_follower_ids_max_pages(): - ids = list(T.follower_ids(813286, max_pages=1)) assert 0 < len(ids) <= 5000 ids = list(T.follower_ids(813286, max_pages=2)) @@ -363,7 +362,6 @@ def test_friend_ids_with_user_id(): def test_friend_ids_max_pages(): - ids = list(T.friend_ids(27260086, max_pages=1)) assert 0 < len(ids) <= 5000 ids = list(T.friend_ids(27260086, max_pages=2)) @@ -799,7 +797,6 @@ def test_csv_retweet(): def test_csv_retweet_hashtag(): - toplevel_hashtags = 0 rt_hashtags = 0 diff --git a/test_twarc2.py b/test_twarc2.py index 715fe86d..43f08adc 100644 --- a/test_twarc2.py +++ b/test_twarc2.py @@ -31,7 +31,7 @@ ) -def atest_version(): +def test_version(): import setup assert setup.version == version @@ -40,7 +40,7 @@ def atest_version(): assert f"twarc/{version}" in user_agent -def atest_auth_types_interaction(): +def test_auth_types_interaction(): """ Test the various options for configuration work as expected. """ @@ -81,7 +81,7 @@ def atest_auth_types_interaction(): tw.sample() -def atest_sample(): +def test_sample(): # event to tell the filter stream to close event = threading.Event() @@ -101,7 +101,6 @@ def atest_sample(): @pytest.mark.parametrize("sort_order", ["recency", "relevancy"]) def test_search_recent(sort_order): - found_tweets = 0 pages = 0 @@ -116,8 +115,7 @@ def test_search_recent(sort_order): assert 100 <= found_tweets <= 200 -def atest_counts_recent(): - +def test_counts_recent(): found_counts = 0 for response_page in T.counts_recent("twitter is:verified", granularity="day"): @@ -132,8 +130,7 @@ def atest_counts_recent(): os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") != None, reason="No Academic Research Product Track access", ) -def atest_counts_empty_page(): - +def test_counts_empty_page(): found_counts = 0 for response_page in T.counts_all( @@ -148,7 +145,7 @@ def atest_counts_empty_page(): assert found_counts == 72 -def atest_search_times(): +def test_search_times(): found = False now = datetime.datetime.now(tz=pytz.timezone("Australia/Melbourne")) # twitter api doesn't resolve microseconds so strip them for comparison @@ -169,12 +166,11 @@ def atest_search_times(): assert found -def atest_user_ids_lookup(): +def test_user_ids_lookup(): users_found = 0 users_not_found = 0 for response in T.user_lookup(range(1, 1000)): - for profile in response["data"]: users_found += 1 @@ -189,7 +185,7 @@ def atest_user_ids_lookup(): assert users_found + users_not_found == 999 -def atest_usernames_lookup(): +def test_usernames_lookup(): users_found = 0 usernames = ["jack", "barackobama", "rihanna"] for response in T.user_lookup(usernames, usernames=True): @@ -198,13 +194,11 @@ def atest_usernames_lookup(): assert users_found == 3 -def atest_tweet_lookup(): - +def test_tweet_lookup(): tweets_found = 0 tweets_not_found = 0 for response in T.tweet_lookup(range(1000, 2000)): - for tweet in response["data"]: tweets_found += 1 @@ -227,7 +221,7 @@ def atest_tweet_lookup(): os.environ.get("GITHUB_ACTIONS") != None, reason="stream() seems to throw a 400 error under GitHub Actions?!", ) -def atest_stream(): +def test_stream(): # remove any active stream rules rules = T.get_stream_rules() if "data" in rules and len(rules["data"]) > 0: @@ -280,7 +274,7 @@ def atest_stream(): assert "data" not in rules -def atest_timeline(): +def test_timeline(): """ Test the user timeline endpoints. @@ -301,7 +295,7 @@ def atest_timeline(): assert found >= 200 -def atest_timeline_username(): +def test_timeline_username(): """ Test the user timeline endpoints with username. @@ -322,12 +316,12 @@ def atest_timeline_username(): assert found >= 200 -def atest_missing_timeline(): +def test_missing_timeline(): results = T.timeline(1033441111677788160) assert len(list(results)) == 0 -def atest_follows(): +def test_follows(): """ Test followers and and following. @@ -349,7 +343,7 @@ def atest_follows(): assert found >= 1000 -def atest_follows_username(): +def test_follows_username(): """ Test followers and and following by username. @@ -371,7 +365,7 @@ def atest_follows_username(): assert found >= 1000 -def atest_flattened(): +def test_flattened(): """ This test uses the search API to test response flattening. It will look at each tweet to find evidence that all the expansions have worked. Once it @@ -457,7 +451,7 @@ def atest_flattened(): assert found_referenced_tweets, "found referenced tweets" -def atest_ensure_flattened(): +def test_ensure_flattened(): resp = next(T.search_recent("twitter", max_results=20)) # flatten a response @@ -510,7 +504,7 @@ def atest_ensure_flattened(): twarc.expansions.ensure_flattened([[{"data": {"fake": "list_of_lists"}}]]) -def atest_ensure_flattened_errors(): +def test_ensure_flattened_errors(): """ Test that ensure_flattened doesn't return tweets for API responses that only contain errors. """ @@ -518,7 +512,7 @@ def atest_ensure_flattened_errors(): assert twarc.expansions.ensure_flattened(data) == [] -def atest_ensure_user_id(): +def test_ensure_user_id(): """ Test _ensure_user_id's ability to discriminate correctly between IDs and screen names. @@ -538,8 +532,7 @@ def atest_ensure_user_id(): assert T._ensure_user_id(1033441111677788160) == "1033441111677788160" -def atest_liking_users(): - +def test_liking_users(): # This is one of @jack's tweets about the Twitter API likes = T.liking_users(1460417326130421765) @@ -554,8 +547,7 @@ def atest_liking_users(): break -def atest_retweeted_by(): - +def test_retweeted_by(): # This is one of @jack's tweets about the Twitter API retweet_users = T.retweeted_by(1460417326130421765) @@ -570,8 +562,7 @@ def atest_retweeted_by(): break -def atest_liked_tweets(): - +def test_liked_tweets(): # What has @jack liked? liked_tweets = T.liked_tweets(12) @@ -586,62 +577,61 @@ def atest_liked_tweets(): break -def atest_list_lookup(): +def test_list_lookup(): parks_list = T.list_lookup(715919216927322112) assert "data" in parks_list assert parks_list["data"]["name"] == "National-parks" -def atest_list_members(): +def test_list_members(): response = list(T.list_members(715919216927322112)) assert len(response) == 1 members = twarc.expansions.flatten(response[0]) assert len(members) == 8 -def atest_list_followers(): +def test_list_followers(): response = list(T.list_followers(715919216927322112)) assert len(response) >= 2 followers = twarc.expansions.flatten(response[0]) assert len(followers) > 50 -def atest_list_memberships(): +def test_list_memberships(): response = list(T.list_memberships("64flavors")) assert len(response) == 1 lists = twarc.expansions.flatten(response[0]) assert len(lists) >= 9 -def atest_followed_lists(): +def test_followed_lists(): response = list(T.followed_lists("nasa")) assert len(response) == 1 lists = twarc.expansions.flatten(response[0]) assert len(lists) >= 1 -def atest_owned_lists(): +def test_owned_lists(): response = list(T.owned_lists("nasa")) assert len(response) >= 1 lists = twarc.expansions.flatten(response[0]) assert len(lists) >= 11 -def atest_list_tweets(): +def test_list_tweets(): response = next(T.list_tweets(715919216927322112)) assert "data" in response tweets = twarc.expansions.flatten(response) assert len(tweets) >= 90 -def atest_user_lookup_non_existent(): +def test_user_lookup_non_existent(): with pytest.raises(ValueError): # This user does not exist, and a value error should be raised T._ensure_user("noasdfasdf") -def atest_twarc_metadata(): - +def test_twarc_metadata(): # With metadata (default) event = threading.Event() for i, response in enumerate(T.sample(event=event)): @@ -667,7 +657,7 @@ def atest_twarc_metadata(): T.metadata = True -def atest_docs_requirements(): +def test_docs_requirements(): """ Make sure that the mkdocs requirements has everything that is in the twarc requirements so the readthedocs build doesn't fail. @@ -678,7 +668,7 @@ def atest_docs_requirements(): assert twarc_reqs.issubset(mkdocs_reqs) -def atest_geo(): +def test_geo(): print(T.geo(query="Silver Spring")) diff --git a/twarc/client.py b/twarc/client.py index 00b30533..7ec218ca 100644 --- a/twarc/client.py +++ b/twarc/client.py @@ -139,7 +139,6 @@ def search( reached_end = False while True: - # note: max_id changes as results are retrieved if max_id: params["max_id"] = max_id @@ -724,7 +723,6 @@ def replies(self, tweet, recursive=False, prune=()): tweet_id = tweet["id_str"] log.info("looking for replies to: %s", tweet_id) for reply in self.search("to:%s" % screen_name, since_id=tweet_id): - if reply["in_reply_to_status_id_str"] != tweet_id: continue diff --git a/twarc/client2.py b/twarc/client2.py index a6a0058c..5bd3e5cb 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -256,7 +256,6 @@ def _search( if using_counts: while True: for response in self.get_paginated(url, params=params): - # Note that we're ensuring the appropriate amount of sleep is # taken before yielding every item. This ensures that we won't # exceed the rate limit even in cases where a response generator @@ -309,7 +308,6 @@ def _search( else: for response in self.get_paginated(url, params=params): - # Note that we're ensuring the appropriate amount of sleep is # taken before yielding every item. This ensures that we won't # exceed the rate limit even in cases where a response generator @@ -914,7 +912,6 @@ def tweet_lookup( """ def lookup_batch(tweet_id): - url = "https://api.twitter.com/2/tweets" params = self._prepare_params( @@ -1653,7 +1650,6 @@ def get_paginated(self, *args, **kwargs): token_param = "next_token" while "meta" in page and "next_token" in page["meta"]: - if "params" in kwargs: kwargs["params"][token_param] = page["meta"]["next_token"] else: diff --git a/twarc/command.py b/twarc/command.py index e2056dc1..85f0977a 100644 --- a/twarc/command.py +++ b/twarc/command.py @@ -291,7 +291,6 @@ def stop(signal, frame): line_count = 0 file_count = 0 for thing in things: - # rotate the files if necessary if args.output and args.split and line_count % args.split == 0: file_count += 1 diff --git a/twarc/command2.py b/twarc/command2.py index 15ec79ec..e2be2f3b 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -416,7 +416,6 @@ def _validate_max_results(context, parameter, value): ) if value: - if not archive_set and value > 100: raise click.BadParameter( "--max-results cannot be greater than 100 when using Standard Access. Specify --archive if you have Academic Access." @@ -431,7 +430,6 @@ def _validate_max_results(context, parameter, value): return value else: - if archive_set and ( no_context_annotations_set or minimal_fields_set @@ -1490,7 +1488,6 @@ def timelines( break for user in users: - # only process a given user once if user in seen: log.info("already processed %s, skipping", user) @@ -1704,7 +1701,6 @@ def searches( # TODO: Needs an inputlines progress bar instead, as the queries are variable # size. with FileLineProgressBar(infile, outfile, disable=hide_progress) as progress: - merged_query = "" extended_query = None query = None @@ -1755,7 +1751,6 @@ def searches( response = api_method(issue_query, **kwargs) for result in response: - if counts_only: for r in result["data"]: click.echo( @@ -1780,7 +1775,6 @@ def searches( response = api_method(merged_query, **kwargs) for result in response: - if counts_only: for r in result["data"]: click.echo( @@ -1902,7 +1896,6 @@ def f(): conv_count = 0 for conv_id in conv_ids: - if conv_id in seen: log.info(f"already fetched conversation_id {conv_id}") seen.add(conv_id) @@ -2805,7 +2798,6 @@ def _wait_for_job(T, job, hide_progress=False): disable=hide_progress, bar_format="{l_bar}{bar}| Waiting {n_time}/{total_time}{postfix}", ) as pbar: - while True: try: pbar.refresh() diff --git a/twarc/decorators2.py b/twarc/decorators2.py index 451546a4..d3107271 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -29,7 +29,6 @@ def new_f(*args, **kwargs): errors = 0 return resp elif resp.status_code == 429: - # Check the headers, and try to infer why we're hitting the # rate limit. Because the search/all endpoints also have a # 1r/s rate limit that isn't obvious in the headers, we need @@ -132,7 +131,6 @@ def new_f(self, *args, **kwargs): errors = 0 return resp except (requests.exceptions.RequestException, ConnectionError) as e: - # don't catch any HTTP errors since those are handled separately if isinstance(e, requests.exceptions.HTTPError): raise e diff --git a/twarc/expansions.py b/twarc/expansions.py index 62f25634..8f01310a 100644 --- a/twarc/expansions.py +++ b/twarc/expansions.py @@ -65,6 +65,7 @@ "source", "withheld", "edit_controls", + "edit_history_tweet_ids", ] MEDIA_FIELDS = [ @@ -76,6 +77,7 @@ "type", "url", "width", + "variants", # "non_public_metrics", # private # "organic_metrics", # private # "promoted_metrics", # private @@ -96,6 +98,8 @@ ] LIST_FIELDS = [ + "id", + "name", "owner_id", "created_at", "member_count", diff --git a/twarc/handshake.py b/twarc/handshake.py index 62cca714..21da4d46 100644 --- a/twarc/handshake.py +++ b/twarc/handshake.py @@ -9,7 +9,6 @@ def handshake(): - # Default empty keys consumer_key = "" consumer_secret = "" diff --git a/twarc/version.py b/twarc/version.py index a4500f77..a1dac848 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1,5 +1,5 @@ import platform -version = "2.12.0" +version = "2.12.1" user_agent = f"twarc/{version} ({platform.system()} {platform.machine()}) {platform.python_implementation()}/{platform.python_version()}" diff --git a/utils/extractor.py b/utils/extractor.py index ae203361..5ae9aafb 100755 --- a/utils/extractor.py +++ b/utils/extractor.py @@ -119,7 +119,6 @@ def extract(json_object, args, csv_writer): found = found1 for row in found: - csv_writer.writerow(row) return len(found) diff --git a/utils/media2warc.py b/utils/media2warc.py index 1ccaef9c..1c15c925 100755 --- a/utils/media2warc.py +++ b/utils/media2warc.py @@ -76,7 +76,6 @@ def __init__(self, out_queue, warcfile): self.dedup = Dedup() def run(self): - with open(self.warcfile, "ab") as output: while True: self.lock.acquire() @@ -157,7 +156,6 @@ def parse_extended_entities(extended_entities_dict): if "media" in extended_entities_dict.keys(): for item in extended_entities_dict["media"]: - # add static image urls.append(item["media_url_https"]) diff --git a/utils/media_urls.py b/utils/media_urls.py index a7cebe11..7eebceb3 100755 --- a/utils/media_urls.py +++ b/utils/media_urls.py @@ -22,7 +22,6 @@ if "extended_entities" in tweet and "media" in tweet["extended_entities"]: for media in tweet["extended_entities"]["media"]: - if media["type"] == "animated_gif": print(id, media["media_url_https"])