From 4a3ccd0e3478fc610f95513b3acf7a8958389d5b Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Mon, 6 Nov 2023 13:54:01 -0500 Subject: [PATCH 01/10] Add semaphore of 15 to get_analytics_codes and process_url --- wayback_google_analytics/scraper.py | 76 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/wayback_google_analytics/scraper.py b/wayback_google_analytics/scraper.py index 8a597b6..dab4d89 100644 --- a/wayback_google_analytics/scraper.py +++ b/wayback_google_analytics/scraper.py @@ -38,7 +38,7 @@ async def get_html(session, url): return None -async def process_url(session, url, start_date, end_date, frequency, limit): +async def process_url(session, url, start_date, end_date, frequency, limit, semaphore): """Returns a dictionary of current and archived UA/GA codes for a single url. Args: @@ -74,42 +74,42 @@ async def process_url(session, url, start_date, end_date, frequency, limit): }, """ + async with semaphore: + # Initialize dict for entry + curr_entry = {url: {}} + + # Get html + current codes + html = await get_html(session, url) + print("Retrieving current codes for: ", url) + if html: + curr_entry[url]["current_UA_code"] = get_UA_code(html) + curr_entry[url]["current_GA_code"] = get_GA_code(html) + curr_entry[url]["current_GTM_code"] = get_GTM_code(html) + curr_entry[url]["current_GTM_code"] = get_GTM_code(html) + print("Finished gathering current codes for: ", url) + + # Get snapshots for Wayback Machine + print("Retrieving archived codes for: ", url) + archived_snapshots = await get_snapshot_timestamps( + session=session, + url=url, + start_date=start_date, + end_date=end_date, + frequency=frequency, + limit=limit, + ) + + # Get historic codes from archived snapshots, appending them to curr_entry + archived_codes = await get_codes_from_snapshots( + session=session, url=url, timestamps=archived_snapshots + ) + curr_entry[url]["archived_UA_codes"] = archived_codes["UA_codes"] + 
curr_entry[url]["archived_GA_codes"] = archived_codes["GA_codes"] + curr_entry[url]["archived_GTM_codes"] = archived_codes["GTM_codes"] + + print("Finished retrieving archived codes for: ", url) - # Initialize dict for entry - curr_entry = {url: {}} - - # Get html + current codes - html = await get_html(session, url) - print("Retrieving current codes for: ", url) - if html: - curr_entry[url]["current_UA_code"] = get_UA_code(html) - curr_entry[url]["current_GA_code"] = get_GA_code(html) - curr_entry[url]["current_GTM_code"] = get_GTM_code(html) - curr_entry[url]["current_GTM_code"] = get_GTM_code(html) - print("Finished gathering current codes for: ", url) - - # Get snapshots for Wayback Machine - print("Retrieving archived codes for: ", url) - archived_snapshots = await get_snapshot_timestamps( - session=session, - url=url, - start_date=start_date, - end_date=end_date, - frequency=frequency, - limit=limit, - ) - - # Get historic codes from archived snapshots, appending them to curr_entry - archived_codes = await get_codes_from_snapshots( - session=session, url=url, timestamps=archived_snapshots - ) - curr_entry[url]["archived_UA_codes"] = archived_codes["UA_codes"] - curr_entry[url]["archived_GA_codes"] = archived_codes["GA_codes"] - curr_entry[url]["archived_GTM_codes"] = archived_codes["GTM_codes"] - - print("Finished retrieving archived codes for: ", url) - - return curr_entry + return curr_entry async def get_analytics_codes( @@ -163,6 +163,7 @@ async def get_analytics_codes( """ # Comprehension to create list of tasks for asyncio.gather() + semaphore = asyncio.Semaphore(15) tasks = [ process_url( @@ -172,10 +173,11 @@ async def get_analytics_codes( end_date=end_date, frequency=frequency, limit=limit, + semaphore=semaphore, ) for url in urls ] # Process urls concurrently and return results results = await asyncio.gather(*tasks) - return results \ No newline at end of file + return results From 16c20336a30bf65be8570c3ac951d796ee5e1a07 Mon Sep 17 00:00:00 2001 
From: Justin Clark Date: Tue, 7 Nov 2023 11:54:37 -0500 Subject: [PATCH 02/10] Add global semaphore to app + add 5 sec delay between requests to CDX api --- wayback_google_analytics/async_utils.py | 14 +++--- wayback_google_analytics/main.py | 26 +++++++---- wayback_google_analytics/scraper.py | 61 +++++++++++++------------ wayback_google_analytics/utils.py | 16 +++++++ 4 files changed, 73 insertions(+), 44 deletions(-) diff --git a/wayback_google_analytics/async_utils.py b/wayback_google_analytics/async_utils.py index 8cfa5a7..3306ab5 100644 --- a/wayback_google_analytics/async_utils.py +++ b/wayback_google_analytics/async_utils.py @@ -14,6 +14,7 @@ async def get_snapshot_timestamps( end_date, frequency, limit, + semaphore, ): """Takes a url and returns an array of snapshot timestamps for a given time range. @@ -52,8 +53,9 @@ async def get_snapshot_timestamps( pattern = re.compile(r"\d{14}") # Use session to get timestamps - async with session.get(cdx_url, headers=DEFAULT_HEADERS) as response: - timestamps = pattern.findall(await response.text()) + async with semaphore: + async with session.get(cdx_url, headers=DEFAULT_HEADERS) as response: + timestamps = pattern.findall(await response.text()) print("Timestamps from CDX api: ", timestamps) @@ -61,7 +63,7 @@ async def get_snapshot_timestamps( return sorted(timestamps) -async def get_codes_from_snapshots(session, url, timestamps): +async def get_codes_from_snapshots(session, url, timestamps, semaphore): """Returns an array of UA/GA codes for a given url using the Archive.org Wayback Machine. Args: @@ -103,7 +105,7 @@ async def get_codes_from_snapshots(session, url, timestamps): # Get codes from each timestamp with asyncio.gather(). 
tasks = [ - get_codes_from_single_timestamp(session, base_url, timestamp, results) + get_codes_from_single_timestamp(session, base_url, timestamp, results, semaphore) for timestamp in timestamps ] await asyncio.gather(*tasks) @@ -120,7 +122,7 @@ async def get_codes_from_snapshots(session, url, timestamps): return results -async def get_codes_from_single_timestamp(session, base_url, timestamp, results): +async def get_codes_from_single_timestamp(session, base_url, timestamp, results, semaphore): """Returns UA/GA codes from a single archive.org snapshot and adds it to the results dictionary. Args: @@ -134,7 +136,7 @@ async def get_codes_from_single_timestamp(session, base_url, timestamp, results) """ # Use semaphore to limit number of concurrent requests - async with sem: + async with semaphore: async with session.get( base_url.format(timestamp=timestamp), headers=DEFAULT_HEADERS ) as response: diff --git a/wayback_google_analytics/main.py b/wayback_google_analytics/main.py index c18f921..d0ff890 100644 --- a/wayback_google_analytics/main.py +++ b/wayback_google_analytics/main.py @@ -66,16 +66,20 @@ async def main(args): ) args.frequency = COLLAPSE_OPTIONS[args.frequency] - async with aiohttp.ClientSession() as session: - results = await get_analytics_codes( - session=session, - urls=args.urls, - start_date=args.start_date, - end_date=args.end_date, - frequency=args.frequency, - limit=args.limit, - ) - print(results) + semaphore = asyncio.Semaphore(15) + + async with semaphore: + async with aiohttp.ClientSession() as session: + results = await get_analytics_codes( + session=session, + urls=args.urls, + start_date=args.start_date, + end_date=args.end_date, + frequency=args.frequency, + limit=args.limit, + semaphore=semaphore, + ) + print(results) # handle printing the output if args.output: @@ -147,9 +151,11 @@ def setup_args(): return parser.parse_args() + def main_entrypoint(): args = setup_args() asyncio.run(main(args)) + if __name__ == "__main__": 
main_entrypoint() diff --git a/wayback_google_analytics/scraper.py b/wayback_google_analytics/scraper.py index dab4d89..e633b0d 100644 --- a/wayback_google_analytics/scraper.py +++ b/wayback_google_analytics/scraper.py @@ -1,5 +1,7 @@ import aiohttp import asyncio +from aiohttp_retry import RetryClient, ExponentialRetry +import backoff from wayback_google_analytics.codes import ( get_UA_code, get_GA_code, @@ -15,7 +17,8 @@ ) -async def get_html(session, url): +# @backoff.on_exception(backoff.expo, aiohttp.ClientConnectorError, max_tries=10) +async def get_html(session, url, semaphore): """Returns html from a single url. Args: @@ -25,17 +28,17 @@ async def get_html(session, url): Returns: html (str): html from url. """ - - try: - async with session.get(url, headers=DEFAULT_HEADERS) as response: - return await response.text() - except aiohttp.ServerTimeoutError as e: - print(f"Request to {url} timed out", e) - except aiohttp.ClientError as e: - print(f"Failed to reach {url}", e) - except Exception as e: - print(f"Error getting data from {url}", e) - return None + async with semaphore: + try: + async with session.get(url, headers=DEFAULT_HEADERS) as response: + return await response.text() + except aiohttp.ServerTimeoutError as e: + print(f"Request to {url} timed out", e) + except aiohttp.ClientError as e: + print(f"Failed to reach {url}", e) + except Exception as e: + print(f"Error getting data from {url}", e) + return None async def process_url(session, url, start_date, end_date, frequency, limit, semaphore): @@ -79,7 +82,7 @@ async def process_url(session, url, start_date, end_date, frequency, limit, sema curr_entry = {url: {}} # Get html + current codes - html = await get_html(session, url) + html = await get_html(session, url, semaphore) print("Retrieving current codes for: ", url) if html: curr_entry[url]["current_UA_code"] = get_UA_code(html) @@ -97,11 +100,12 @@ async def process_url(session, url, start_date, end_date, frequency, limit, sema end_date=end_date, 
frequency=frequency, limit=limit, + semaphore=semaphore, ) # Get historic codes from archived snapshots, appending them to curr_entry archived_codes = await get_codes_from_snapshots( - session=session, url=url, timestamps=archived_snapshots + session=session, url=url, timestamps=archived_snapshots, semaphore=semaphore ) curr_entry[url]["archived_UA_codes"] = archived_codes["UA_codes"] curr_entry[url]["archived_GA_codes"] = archived_codes["GA_codes"] @@ -119,6 +123,7 @@ async def get_analytics_codes( end_date=None, frequency=None, limit=None, + semaphore=None, ): """Takes array of urls and returns array of dictionaries with all found analytics codes for a given time range. @@ -162,21 +167,21 @@ async def get_analytics_codes( } """ - # Comprehension to create list of tasks for asyncio.gather() - semaphore = asyncio.Semaphore(15) - - tasks = [ - process_url( - session=session, - url=url, - start_date=start_date, - end_date=end_date, - frequency=frequency, - limit=limit, - semaphore=semaphore, + tasks = [] + for url in urls: + task = asyncio.create_task( + process_url( + session=session, + url=url, + start_date=start_date, + end_date=end_date, + frequency=frequency, + limit=limit, + semaphore=semaphore, + ) ) - for url in urls - ] + tasks.append(task) + await asyncio.sleep(5) # Process urls concurrently and return results results = await asyncio.gather(*tasks) diff --git a/wayback_google_analytics/utils.py b/wayback_google_analytics/utils.py index 6702cba..0b719d5 100644 --- a/wayback_google_analytics/utils.py +++ b/wayback_google_analytics/utils.py @@ -136,3 +136,19 @@ def get_14_digit_timestamp(date): # Convert datetime object to 14-digit timestamp return date.strftime("%Y%m%d%H%M%S") + +def generate_semaphore(url_list, limit): + """Generates appropriate semaphore given a list of urls and a limit.""" + + url_count = len(url_list) + + operations = url_count * limit + + if operations <= 100: + return 10 + + if operations <= 1000: + return 5 + + if operations <= 10000: 
+ return 1 \ No newline at end of file From cf20bfddfc5b8191f7919e6b54fc15c64ca74b6e Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 13:46:33 -0500 Subject: [PATCH 03/10] Remove mocked semaphore as no longer global variable in async_utils.py --- tests/test_async_utils.py | 7 +------ tests/test_main.py | 4 ++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_async_utils.py b/tests/test_async_utils.py index b0cef01..5c79892 100644 --- a/tests/test_async_utils.py +++ b/tests/test_async_utils.py @@ -62,16 +62,11 @@ async def mock_text_method(): @patch("wayback_google_analytics.async_utils.get_UA_code") @patch("wayback_google_analytics.async_utils.get_GA_code") @patch("wayback_google_analytics.async_utils.get_GTM_code") - @patch("wayback_google_analytics.async_utils.sem", new_callable=MagicMock()) async def test_get_codes_from_single_timestamp( - self, mock_sem, mock_GTM, mock_GA, mock_UA, mock_get + self, mock_GTM, mock_GA, mock_UA, mock_get ): """Does get_codes_from_single_timestamp return correct codes from a single archive.org snapshot?""" - # Mock semaphore - mock_sem.__aenter__.return_value = MagicMock() - mock_sem.__aexit__.return_value = MagicMock() - # Mock the response from the server mock_response = MagicMock() diff --git a/tests/test_main.py b/tests/test_main.py index e23bcc0..de8237b 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -89,6 +89,7 @@ def test_setup_args_valid_args(self): "daily", "--limit", "10", + "--skip_current", ] args = setup_args() @@ -102,6 +103,7 @@ def test_setup_args_valid_args(self): self.assertEqual(args.end_date, "01/01/2013:12:00") self.assertEqual(args.frequency, "daily") self.assertEqual(args.limit, "10") + self.assertEqual(args.skip_current, True) def test_setup_args_valid_args_shorthand(self): """Does setup_args return args if valid args provided using shorthand commands?""" @@ -120,6 +122,7 @@ def test_setup_args_valid_args_shorthand(self): "daily", "-l", "10", + "-sc", ] 
args = setup_args() @@ -133,3 +136,4 @@ def test_setup_args_valid_args_shorthand(self): self.assertEqual(args.end_date, "01/01/2013:12:00") self.assertEqual(args.frequency, "daily") self.assertEqual(args.limit, "10") + self.assertEqual(args.skip_current, True) From ff5c35ca4571d1d5c5efb97cd14f4d340420d127 Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 13:50:06 -0500 Subject: [PATCH 04/10] Add semaphore + improve arg handling - Add semaphore of 10 to main.py - Add --skip_current flag to arguments that skips scraping of current GA code data (improves speed when working with dead links) - Add warning/confirmation message if limit > 500 or urls > 10. --- wayback_google_analytics/main.py | 57 ++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/wayback_google_analytics/main.py b/wayback_google_analytics/main.py index d0ff890..ae2ee61 100644 --- a/wayback_google_analytics/main.py +++ b/wayback_google_analytics/main.py @@ -66,24 +66,38 @@ async def main(args): ) args.frequency = COLLAPSE_OPTIONS[args.frequency] - semaphore = asyncio.Semaphore(15) - - async with semaphore: - async with aiohttp.ClientSession() as session: - results = await get_analytics_codes( - session=session, - urls=args.urls, - start_date=args.start_date, - end_date=args.end_date, - frequency=args.frequency, - limit=args.limit, - semaphore=semaphore, - ) - print(results) - - # handle printing the output - if args.output: - write_output(output_file, args.output, results) + semaphore = asyncio.Semaphore(10) + + # Warn user if large request + if abs(int(args.limit)) > 500 or len(args.urls) > 9: + response = input(f"""Large requests can lead to being rate limited by archive.org.\n\n Current limit: {args.limit} (Recommended < 500) \n\n Current # of urls: {len(args.urls)} (Recommended < 10, unless limit < 50) + + Do you wish to proceed? 
(Yes/no) + """) + if response.lower() not in ('yes', 'y'): + print("Request cancelled.") + exit() + + try: + async with semaphore: + async with aiohttp.ClientSession() as session: + results = await get_analytics_codes( + session=session, + urls=args.urls, + start_date=args.start_date, + end_date=args.end_date, + frequency=args.frequency, + limit=args.limit, + semaphore=semaphore, + skip_current=args.skip_current, + ) + print(results) + + # handle printing the output + if args.output: + write_output(output_file, args.output, results) + except aiohttp.ClientError as e: + print("Your request was rate limited. Wait 5 minutes and try again and consider reducing the limit and # of numbers.") def setup_args(): @@ -95,6 +109,7 @@ def setup_args(): --end_date: End date for time range. Defaults to None. --frequency: Can limit snapshots to remove duplicates (1 per hr, day, month, etc). Defaults to None. --limit: Limit number of snapshots returned. Defaults to None. + --skip_current: Add this flag to skip current UA/GA codes when getting archived codes. Returns: Command line arguments (argparse) @@ -148,6 +163,12 @@ def setup_args(): default=-100, help="Limits number of snapshots returned. 
Defaults to -100 (most recent 100 snapshots).", ) + parser.add_argument( + "-sc", + "--skip_current", + action='store_true', + help="Add this flag to skip current UA/GA codes when getting archived codes.", + ) return parser.parse_args() From 4e05b80332aac28cf20e5d0b2a5136e03fbb494d Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 13:50:34 -0500 Subject: [PATCH 05/10] Fix formatting --- wayback_google_analytics/main.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/wayback_google_analytics/main.py b/wayback_google_analytics/main.py index ae2ee61..f984d9f 100644 --- a/wayback_google_analytics/main.py +++ b/wayback_google_analytics/main.py @@ -70,11 +70,13 @@ async def main(args): # Warn user if large request if abs(int(args.limit)) > 500 or len(args.urls) > 9: - response = input(f"""Large requests can lead to being rate limited by archive.org.\n\n Current limit: {args.limit} (Recommended < 500) \n\n Current # of urls: {len(args.urls)} (Recommended < 10, unless limit < 50) + response = input( + f"""Large requests can lead to being rate limited by archive.org.\n\n Current limit: {args.limit} (Recommended < 500) \n\n Current # of urls: {len(args.urls)} (Recommended < 10, unless limit < 50) Do you wish to proceed? (Yes/no) - """) - if response.lower() not in ('yes', 'y'): + """ + ) + if response.lower() not in ("yes", "y"): print("Request cancelled.") exit() @@ -97,7 +99,9 @@ async def main(args): if args.output: write_output(output_file, args.output, results) except aiohttp.ClientError as e: - print("Your request was rate limited. Wait 5 minutes and try again and consider reducing the limit and # of numbers.") + print( + "Your request was rate limited. Wait 5 minutes and try again and consider reducing the limit and # of numbers." 
+ ) def setup_args(): @@ -166,7 +170,7 @@ def setup_args(): parser.add_argument( "-sc", "--skip_current", - action='store_true', + action="store_true", help="Add this flag to skip current UA/GA codes when getting archived codes.", ) From c0606fea1b587fd748ad49134e81e4bd9f5f44ca Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 13:55:09 -0500 Subject: [PATCH 06/10] Add semaphore + minor refactor - Add sleep of 5 seconds when using asyncio.gather() on tasks to avoid 443 from archive.org - Add semaphore to function args - Add skip_current to function args + update process_url to skip current codes if -sc flag enabled --- wayback_google_analytics/scraper.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/wayback_google_analytics/scraper.py b/wayback_google_analytics/scraper.py index e633b0d..a4dd757 100644 --- a/wayback_google_analytics/scraper.py +++ b/wayback_google_analytics/scraper.py @@ -1,7 +1,5 @@ import aiohttp import asyncio -from aiohttp_retry import RetryClient, ExponentialRetry -import backoff from wayback_google_analytics.codes import ( get_UA_code, get_GA_code, @@ -17,13 +15,13 @@ ) -# @backoff.on_exception(backoff.expo, aiohttp.ClientConnectorError, max_tries=10) async def get_html(session, url, semaphore): """Returns html from a single url. Args: session (aiohttp.ClientSession) url (str): Url to scrape html from. + semaphore: asyncio.semaphore Returns: html (str): html from url. @@ -41,7 +39,9 @@ async def get_html(session, url, semaphore): return None -async def process_url(session, url, start_date, end_date, frequency, limit, semaphore): +async def process_url( + session, url, start_date, end_date, frequency, limit, semaphore, skip_current +): """Returns a dictionary of current and archived UA/GA codes for a single url. 
Args: @@ -51,6 +51,8 @@ async def process_url(session, url, start_date, end_date, frequency, limit, sema end_date (str): End date for time range frequency (int): limit (int): + semaphore: asyncio.semaphore + skip_current (bool): Determine whether to skip getting current codes Returns: "someurl.com": { @@ -82,14 +84,15 @@ async def process_url(session, url, start_date, end_date, frequency, limit, sema curr_entry = {url: {}} # Get html + current codes - html = await get_html(session, url, semaphore) - print("Retrieving current codes for: ", url) - if html: - curr_entry[url]["current_UA_code"] = get_UA_code(html) - curr_entry[url]["current_GA_code"] = get_GA_code(html) - curr_entry[url]["current_GTM_code"] = get_GTM_code(html) - curr_entry[url]["current_GTM_code"] = get_GTM_code(html) - print("Finished gathering current codes for: ", url) + if not skip_current: + html = await get_html(session, url, semaphore) + print("Retrieving current codes for: ", url) + if html: + curr_entry[url]["current_UA_code"] = get_UA_code(html) + curr_entry[url]["current_GA_code"] = get_GA_code(html) + curr_entry[url]["current_GTM_code"] = get_GTM_code(html) + curr_entry[url]["current_GTM_code"] = get_GTM_code(html) + print("Finished gathering current codes for: ", url) # Get snapshots for Wayback Machine print("Retrieving archived codes for: ", url) @@ -124,6 +127,7 @@ async def get_analytics_codes( frequency=None, limit=None, semaphore=None, + skip_current=False, ): """Takes array of urls and returns array of dictionaries with all found analytics codes for a given time range. 
@@ -178,6 +182,7 @@ async def get_analytics_codes( frequency=frequency, limit=limit, semaphore=semaphore, + skip_current=skip_current, ) ) tasks.append(task) From b8a9711623f86caa5969da62511ad46895e363d6 Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 14:05:29 -0500 Subject: [PATCH 07/10] Add default value for semaphore + update docstrings --- wayback_google_analytics/async_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wayback_google_analytics/async_utils.py b/wayback_google_analytics/async_utils.py index 3306ab5..7c7651f 100644 --- a/wayback_google_analytics/async_utils.py +++ b/wayback_google_analytics/async_utils.py @@ -3,9 +3,6 @@ from wayback_google_analytics.codes import get_UA_code, get_GA_code, get_GTM_code from wayback_google_analytics.utils import get_date_from_timestamp, DEFAULT_HEADERS -# Semaphore to limit number of concurrent requests (10-15 appears to work fine. 20+ causes 443 error from web.archive.org) -sem = asyncio.Semaphore(10) - async def get_snapshot_timestamps( session, @@ -14,7 +11,7 @@ async def get_snapshot_timestamps( end_date, frequency, limit, - semaphore, + semaphore=asyncio.Semaphore(10), ): """Takes a url and returns an array of snapshot timestamps for a given time range. @@ -25,6 +22,7 @@ async def get_snapshot_timestamps( end_date (str, optional): End date for time range. frequency (str, optional): Can limit snapshots to remove duplicates (1 per hr, day, week, etc). limit (int, optional): Limit number of snapshots returned. + semaphore: asyncio.Semaphore() Returns: Array of timestamps: @@ -63,13 +61,14 @@ async def get_snapshot_timestamps( return sorted(timestamps) -async def get_codes_from_snapshots(session, url, timestamps, semaphore): +async def get_codes_from_snapshots(session, url, timestamps, semaphore=asyncio.Semaphore(10)): """Returns an array of UA/GA codes for a given url using the Archive.org Wayback Machine. 
Args: session (aiohttp.ClientSession) url (str) timestamps (list): List of timestamps to get codes from. + semaphore: asyncio.Semaphore() Returns: { @@ -122,7 +121,7 @@ async def get_codes_from_snapshots(session, url, timestamps, semaphore): return results -async def get_codes_from_single_timestamp(session, base_url, timestamp, results, semaphore): +async def get_codes_from_single_timestamp(session, base_url, timestamp, results, semaphore=asyncio.Semaphore(10)): """Returns UA/GA codes from a single archive.org snapshot and adds it to the results dictionary. Args: @@ -130,6 +129,7 @@ async def get_codes_from_single_timestamp(session, base_url, timestamp, results, base_url (str): Base url for archive.org snapshot. timestamp (str): 14-digit timestamp. results (dict): Dictionary to add codes to (inherited from get_codes_from_snapshots()). + semaphore: asyncio.Semaphore() Returns: None From 14a21516f70293c104e255534c8bd9739eca9522 Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 14:21:40 -0500 Subject: [PATCH 08/10] Improve readme - Add Limitations section - Update CLI options with --skip_current --- README.md | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index cc0d6b3..af612d2 100644 --- a/README.md +++ b/README.md @@ -227,27 +227,28 @@ Options list (run `wayback-google-analytics -h` to see in terminal): options: -h, --help show this help message and exit -i INPUT_FILE, --input_file INPUT_FILE - Enter a file path to a list of urls in a readable file - type (e.g. .txt, .csv, .md) + Enter a file path to a list of urls in a readable file type + (e.g. .txt, .csv, .md) -u URLS [URLS ...], --urls URLS [URLS ...] - Enter a list of urls separated by spaces to get their - UA/GA codes (e.g. --urls https://www.google.com + Enter a list of urls separated by spaces to get their UA/GA + codes (e.g. 
--urls https://www.google.com https://www.facebook.com) -o {csv,txt,json,xlsx}, --output {csv,txt,json,xlsx} - Enter an output type to write results to file. - Defaults to json. + Enter an output type to write results to file. Defaults to + json. -s START_DATE, --start_date START_DATE - Start date for time range (dd/mm/YYYY:HH:MM) Defaults - to 01/10/2012:00:00, when UA codes were adopted. + Start date for time range (dd/mm/YYYY:HH:MM) Defaults to + 01/10/2012:00:00, when UA codes were adopted. -e END_DATE, --end_date END_DATE - End date for time range (dd/mm/YYYY:HH:MM). Defaults - to None. + End date for time range (dd/mm/YYYY:HH:MM). Defaults to None. -f {yearly,monthly,daily,hourly}, --frequency {yearly,monthly,daily,hourly} - Can limit snapshots to remove duplicates (1 per hr, - day, month, etc). Defaults to None. + Can limit snapshots to remove duplicates (1 per hr, day, month, + etc). Defaults to None. -l LIMIT, --limit LIMIT - Limits number of snapshots returned. Defaults to -100 - (most recent 100 snapshots). + Limits number of snapshots returned. Defaults to -100 (most + recent 100 snapshots). + -sc, --skip_current Add this flag to skip current UA/GA codes when getting archived + codes. ``` @@ -289,7 +290,15 @@ Ordered by code:

(back to top)

+ +## Limitations & Rate Limits +We recommend that you limit your list of URLs to ~10 and your max snapshot limit to <500 during queries. While Wayback Google Analytics doesn't have any hardcoded limitations regarding how many URLs or snapshots you can request, large queries can cause 443 errors (rate limiting). Being rate limited can result in a temporary 5-10 minute ban from web.archive.org and the CDX API. + +The app currently uses `asyncio.Semaphore()` along with delays between requests, but large queries or operations that take a long time can still result in a 443. Use your judgment and break large queries into smaller, more manageable pieces if you find yourself getting rate limited. + + +

(back to top)

## Contributing @@ -325,8 +334,6 @@ Distributed under the MIT License. See `LICENSE.txt` for more information.

(back to top)

- - ## Contact From 48546ef96fb5aeff6b3950757684cf71572f594b Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 14:34:15 -0500 Subject: [PATCH 09/10] bump version to 0.2.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 292b4dc..2b2713b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "wayback-google-analytics" -version = "0.1.6" +version = "0.2.0" description = "A tool for gathering current and historic google analytics ids from multiple websites" authors = ["Justin Clark "] license = "MIT" From 4f6b0970766e880f1d2b1ed551cf403fefe2b259 Mon Sep 17 00:00:00 2001 From: Justin Clark Date: Tue, 7 Nov 2023 15:20:44 -0500 Subject: [PATCH 10/10] Update commands for downloading from source --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index af612d2..0cd2d0e 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ You can also clone and download the repo from github and use the tool locally. 3. Get a high-level overview: ```terminal - python main.py -h + python -m wayback_google_analytics.main -h ```

(back to top)