diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py index 7ff0652..509fe38 100644 --- a/data_analysis/compare_scheduled_and_rt.py +++ b/data_analysis/compare_scheduled_and_rt.py @@ -1,4 +1,5 @@ import os + from dataclasses import dataclass, field from typing import List, Tuple import logging @@ -13,6 +14,7 @@ import data_analysis.static_gtfs_analysis as static_gtfs_analysis from scrape_data.scrape_schedule_versions import create_schedule_list +from utils import s3_csv_reader load_dotenv() @@ -230,14 +232,7 @@ def combine_real_time_rt_comparison( f"{pendulum.now().to_datetime_string()}" ) - # Use low_memory option to avoid warning about columns - # with mixed dtypes. - daily_data = pd.read_csv( - (BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv") - .as_uri(), - low_memory=False - ) - + daily_data = s3_csv_reader.read_csv(BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv") daily_data = make_daily_summary(daily_data) rt_raw = pd.concat([rt_raw, daily_data]) @@ -344,7 +339,8 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: logger.info("\nExtracting data") data = static_gtfs_analysis.GTFSFeed.extract_data( CTA_GTFS, - version_id=schedule_version + version_id=schedule_version, + cta_download=False ) data = static_gtfs_analysis.format_dates_hours(data) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 75f10d3..9e02e31 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -1,11 +1,13 @@ import boto3 import sys -import data_analysis.static_gtfs_analysis as sga -import data_analysis.compare_scheduled_and_rt as csrt + import pendulum from io import StringIO import pandas as pd +import data_analysis.static_gtfs_analysis as sga +import data_analysis.compare_scheduled_and_rt as csrt +from utils import s3_csv_reader ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -89,12 +91,8 @@ def save_realtime_daily_summary() -> None: 
     end_date = end_date.to_date_string()
-    daily_data = pd.read_csv(
-        (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")
-        .as_uri(),
-        low_memory=False
-    )
-
+    daily_data = s3_csv_reader.read_csv(csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")
+
     daily_data = csrt.make_daily_summary(daily_data)
     filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv'
     save_csv_to_bucket(daily_data, filename=filename)
diff --git a/utils/s3_csv_reader.py b/utils/s3_csv_reader.py
new file mode 100644
index 0000000..ae1d63c
--- /dev/null
+++ b/utils/s3_csv_reader.py
@@ -0,0 +1,25 @@
+import pandas as pd
+from pathlib import Path
+
+
+def read_csv(filename: str | Path) -> pd.DataFrame:
+    """Read pandas csv from S3
+
+    Args:
+        filename (str | Path): file to download from S3.
+
+    Returns:
+        pd.DataFrame: A Pandas DataFrame from the S3 file.
+    """
+    # Imported here, not at module top, to avoid a circular import:
+    # data_analysis.compare_scheduled_and_rt itself imports this module.
+    import data_analysis.compare_scheduled_and_rt as csrt
+
+    if isinstance(filename, str):
+        filename = Path(filename)
+    s3_filename = '/'.join(filename.parts[-2:])
+    df = pd.read_csv(
+        f'https://{csrt.BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/{s3_filename}',
+        low_memory=False
+    )
+    return df