From 026a0dc409eea977d4ab60f92e2da8bf3dcd60b6 Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Sun, 14 May 2023 20:30:40 -0500
Subject: [PATCH 1/5] Use full s3 URL for connection timeout errors

---
 data_analysis/compare_scheduled_and_rt.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py
index 7ff0652..8a3a2bd 100644
--- a/data_analysis/compare_scheduled_and_rt.py
+++ b/data_analysis/compare_scheduled_and_rt.py
@@ -1,4 +1,5 @@
 import os
+
 from dataclasses import dataclass, field
 from typing import List, Tuple
 import logging
@@ -10,6 +11,7 @@ import pendulum
 from tqdm import tqdm
 from dotenv import load_dotenv
+import botocore
 
 import data_analysis.static_gtfs_analysis as static_gtfs_analysis
 from scrape_data.scrape_schedule_versions import create_schedule_list
@@ -232,11 +234,17 @@ def combine_real_time_rt_comparison(
 
     # Use low_memory option to avoid warning about columns
     # with mixed dtypes.
-    daily_data = pd.read_csv(
-        (BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")
-        .as_uri(),
-        low_memory=False
-    )
+    try:
+        daily_data = pd.read_csv(
+            (BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")
+            .as_uri(),
+            low_memory=False
+        )
+    except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
+        daily_data = pd.read_csv(
+            f'https://chn-ghost-buses-public.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{date_str}.csv',
+            low_memory=False
+        )
 
     daily_data = make_daily_summary(daily_data)
 

From 2583c4c79f416aaa59baa58398c6580b00167a74 Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Mon, 15 May 2023 18:57:46 -0500
Subject: [PATCH 2/5] Change URL to take name of bucket in f-string

---
 data_analysis/compare_scheduled_and_rt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py
index 8a3a2bd..61a01b5 100644
--- a/data_analysis/compare_scheduled_and_rt.py
+++ b/data_analysis/compare_scheduled_and_rt.py
@@ -242,7 +242,7 @@ def combine_real_time_rt_comparison(
         )
     except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
         daily_data = pd.read_csv(
-            f'https://chn-ghost-buses-public.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{date_str}.csv',
+            f'https://{BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{date_str}.csv',
             low_memory=False
         )
 

From 4b950bf5ac74b5837fc0e171c858d626a1117096 Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Mon, 6 Nov 2023 21:21:28 -0600
Subject: [PATCH 3/5] Add try-except block for reading csv

---
 scrape_data/cta_data_downloads.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py
index 75f10d3..808c18e 100644
--- a/scrape_data/cta_data_downloads.py
+++ b/scrape_data/cta_data_downloads.py
@@ -5,7 +5,7 @@
 import pendulum
 from io import StringIO
 import pandas as pd
-
+import botocore
 
 ACCESS_KEY = sys.argv[1]
 SECRET_KEY = sys.argv[2]
@@ -89,11 +89,17 @@ def save_realtime_daily_summary() -> None:
 
     end_date = end_date.to_date_string()
 
-    daily_data = pd.read_csv(
+    try:
+        daily_data = pd.read_csv(
         (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")
         .as_uri(),
         low_memory=False
     )
+    except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
+        daily_data = pd.read_csv(
+            f'https://{csrt.BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{end_date}.csv',
+            low_memory=False
+        )
 
     daily_data = csrt.make_daily_summary(daily_data)
     filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv'
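
A note on the pattern patches 1-3 introduce: both call sites now wrap the s3:// read in the same try/except and retry against the bucket's public HTTPS endpoint. A minimal standalone sketch of that shape (the function name and parameters here are illustrative, not project code; reading an s3:// URI with pandas assumes s3fs is installed):

    import botocore.exceptions
    import pandas as pd

    def read_csv_with_fallback(s3_uri: str, https_url: str) -> pd.DataFrame:
        # low_memory=False avoids the mixed-dtype warning on wide CSVs.
        try:
            # Normal path: read straight from the bucket via its s3:// URI.
            return pd.read_csv(s3_uri, low_memory=False)
        except (botocore.exceptions.ConnectTimeoutError,
                botocore.exceptions.EndpointConnectionError):
            # Fallback: fetch the same object over the public HTTPS endpoint.
            return pd.read_csv(https_url, low_memory=False)
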
From 24105a066df5f31188ee6ff883c8b2d8db207975 Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Wed, 8 Nov 2023 20:18:48 -0600
Subject: [PATCH 4/5] Create new function for reading csvs from S3

---
 data_analysis/compare_scheduled_and_rt.py | 20 ++++----------------
 scrape_data/cta_data_downloads.py         | 22 +++++++---------------
 utils/s3_csv_reader.py                    | 26 ++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 31 deletions(-)
 create mode 100644 utils/s3_csv_reader.py

diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py
index 61a01b5..509fe38 100644
--- a/data_analysis/compare_scheduled_and_rt.py
+++ b/data_analysis/compare_scheduled_and_rt.py
@@ -11,10 +11,10 @@ import pendulum
 from tqdm import tqdm
 from dotenv import load_dotenv
-import botocore
 
 import data_analysis.static_gtfs_analysis as static_gtfs_analysis
 from scrape_data.scrape_schedule_versions import create_schedule_list
+from utils import s3_csv_reader
 
 load_dotenv()
@@ -232,20 +232,7 @@ def combine_real_time_rt_comparison(
         f"{pendulum.now().to_datetime_string()}"
     )
 
-    # Use low_memory option to avoid warning about columns
-    # with mixed dtypes.
-    try:
-        daily_data = pd.read_csv(
-            (BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")
-            .as_uri(),
-            low_memory=False
-        )
-    except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
-        daily_data = pd.read_csv(
-            f'https://{BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{date_str}.csv',
-            low_memory=False
-        )
-
+    daily_data = s3_csv_reader.read_csv(BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")
     daily_data = make_daily_summary(daily_data)
 
     rt_raw = pd.concat([rt_raw, daily_data])
@@ -352,7 +339,8 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
     logger.info("\nExtracting data")
     data = static_gtfs_analysis.GTFSFeed.extract_data(
         CTA_GTFS,
-        version_id=schedule_version
+        version_id=schedule_version,
+        cta_download=False
     )
     data = static_gtfs_analysis.format_dates_hours(data)
 
diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py
index 808c18e..9e02e31 100644
--- a/scrape_data/cta_data_downloads.py
+++ b/scrape_data/cta_data_downloads.py
@@ -1,11 +1,13 @@
 import boto3
 import sys
-import data_analysis.static_gtfs_analysis as sga
-import data_analysis.compare_scheduled_and_rt as csrt
+
 import pendulum
 from io import StringIO
 import pandas as pd
-import botocore
+
+import data_analysis.static_gtfs_analysis as sga
+import data_analysis.compare_scheduled_and_rt as csrt
+from utils import s3_csv_reader
 
 ACCESS_KEY = sys.argv[1]
 SECRET_KEY = sys.argv[2]
@@ -89,18 +91,8 @@ def save_realtime_daily_summary() -> None:
 
     end_date = end_date.to_date_string()
 
-    try:
-        daily_data = pd.read_csv(
-        (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")
-        .as_uri(),
-        low_memory=False
-    )
-    except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
-        daily_data = pd.read_csv(
-            f'https://{csrt.BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{end_date}.csv',
-            low_memory=False
-        )
-
+    daily_data = s3_csv_reader.read_csv(csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")
+
     daily_data = csrt.make_daily_summary(daily_data)
     filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv'
     save_csv_to_bucket(daily_data, filename=filename)
diff --git a/utils/s3_csv_reader.py b/utils/s3_csv_reader.py
new file mode 100644
index 0000000..98e96b8
--- /dev/null
+++ b/utils/s3_csv_reader.py
@@ -0,0 +1,26 @@
+import pandas as pd
+from pathlib import Path
+import botocore
+import data_analysis.compare_scheduled_and_rt as csrt
+
+def read_csv(filename: str | Path) -> pd.DataFrame:
+    """Read pandas csv from S3 using multiple methods
+
+    Args:
+        filename (str | Path): file to download from S3.
+
+    Returns:
+        pd.DataFrame: A Pandas DataFrame from the S3 file.
+    """
+    if isinstance(filename, str):
+        filename = Path(filename)
+    # Use low_memory option to avoid warning about columns with mixed dtypes.
+    try:
+        df = pd.read_csv(filename.as_uri(), low_memory=False)
+    except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
+        s3_filename = '/'.join(filename.parts[-2:])
+        df = pd.read_csv(
+            f'https://{csrt.BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/{s3_filename}',
+            low_memory=False
+        )
+    return df
\ No newline at end of file
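
With patch 4 the duplicated try/except moves into utils/s3_csv_reader.py, and both modules call the shared helper. Usage as wired up by this patch (the date below is illustrative; assumes the repo root is on PYTHONPATH):

    import data_analysis.compare_scheduled_and_rt as csrt
    from utils import s3_csv_reader

    # Accepts a str or Path, tries the s3:// URI first, and falls back to
    # the public HTTPS URL on connection errors.
    daily_data = s3_csv_reader.read_csv(
        csrt.BASE_PATH / "bus_full_day_data_v2/2023-11-08.csv"
    )
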
From 3305dd8990bdc4fe4a685651dcb6dfe2b04bc88e Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Sun, 26 Nov 2023 18:24:03 -0600
Subject: [PATCH 5/5] Remove try-except block

---
 utils/s3_csv_reader.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/utils/s3_csv_reader.py b/utils/s3_csv_reader.py
index 98e96b8..ae1d63c 100644
--- a/utils/s3_csv_reader.py
+++ b/utils/s3_csv_reader.py
@@ -1,10 +1,9 @@
 import pandas as pd
 from pathlib import Path
-import botocore
 import data_analysis.compare_scheduled_and_rt as csrt
 
 def read_csv(filename: str | Path) -> pd.DataFrame:
-    """Read pandas csv from S3 using multiple methods
+    """Read pandas csv from S3
 
     Args:
         filename (str | Path): file to download from S3.
@@ -14,13 +13,10 @@ def read_csv(filename: str | Path) -> pd.DataFrame:
     """
     if isinstance(filename, str):
         filename = Path(filename)
-    # Use low_memory option to avoid warning about columns with mixed dtypes.
-    try:
-        df = pd.read_csv(filename.as_uri(), low_memory=False)
-    except (botocore.exceptions.ConnectTimeoutError, botocore.exceptions.EndpointConnectionError):
-        s3_filename = '/'.join(filename.parts[-2:])
-        df = pd.read_csv(
+    s3_filename = '/'.join(filename.parts[-2:])
+    df = pd.read_csv(
             f'https://{csrt.BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/{s3_filename}',
             low_memory=False
         )
-    return df
\ No newline at end of file
+    return df
+ 
\ No newline at end of file
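
After patch 5 the helper no longer touches the s3:// URI at all: every read goes through the public HTTPS endpoint, with the object key rebuilt from the last two components of the given path. Roughly (bucket and date illustrative):

    from pathlib import Path

    filename = Path("chn-ghost-buses-public/bus_full_day_data_v2/2023-11-26.csv")
    s3_filename = '/'.join(filename.parts[-2:])
    # s3_filename == 'bus_full_day_data_v2/2023-11-26.csv', so read_csv fetches
    # https://chn-ghost-buses-public.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/2023-11-26.csv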