Commit
Merge pull request #60 from dcjohnson24/connection-timeout-workaround
Use full s3 URL for ConnectionTimeoutError
dcjohnson24 authored Jan 15, 2024
2 parents 423f4bd + 3305dd8 commit b51491b
Showing 3 changed files with 33 additions and 17 deletions.
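
In short, both call sites stop reading CSVs through the S3 path's .as_uri() form and instead go through a new utils/s3_csv_reader.read_csv helper that builds the bucket's full public HTTPS URL, working around the connection-timeout errors named in the PR title. A minimal before/after sketch of the pattern, assuming BASE_PATH is an S3-backed Path object as in the diff below:

    # Before: pandas read via the path's URI form, which was hitting
    # ConnectionTimeoutError.
    daily_data = pd.read_csv(
        (BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv").as_uri(),
        low_memory=False
    )

    # After: delegate to the helper, which fetches the same file over the
    # bucket's public HTTPS endpoint.
    daily_data = s3_csv_reader.read_csv(BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")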
14 changes: 5 additions & 9 deletions data_analysis/compare_scheduled_and_rt.py
@@ -1,4 +1,5 @@
import os

from dataclasses import dataclass, field
from typing import List, Tuple
import logging
@@ -13,6 +14,7 @@

import data_analysis.static_gtfs_analysis as static_gtfs_analysis
from scrape_data.scrape_schedule_versions import create_schedule_list
from utils import s3_csv_reader

load_dotenv()

@@ -230,14 +232,7 @@ def combine_real_time_rt_comparison(
f"{pendulum.now().to_datetime_string()}"
)

# Use low_memory option to avoid warning about columns
# with mixed dtypes.
daily_data = pd.read_csv(
(BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")
.as_uri(),
low_memory=False
)

daily_data = s3_csv_reader.read_csv(BASE_PATH / f"bus_full_day_data_v2/{date_str}.csv")
daily_data = make_daily_summary(daily_data)

rt_raw = pd.concat([rt_raw, daily_data])
@@ -344,7 +339,8 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
logger.info("\nExtracting data")
data = static_gtfs_analysis.GTFSFeed.extract_data(
CTA_GTFS,
version_id=schedule_version
version_id=schedule_version,
cta_download=False
)
data = static_gtfs_analysis.format_dates_hours(data)

14 changes: 6 additions & 8 deletions scrape_data/cta_data_downloads.py
@@ -1,11 +1,13 @@
import boto3
import sys
import data_analysis.static_gtfs_analysis as sga
import data_analysis.compare_scheduled_and_rt as csrt

import pendulum
from io import StringIO
import pandas as pd

import data_analysis.static_gtfs_analysis as sga
import data_analysis.compare_scheduled_and_rt as csrt
from utils import s3_csv_reader

ACCESS_KEY = sys.argv[1]
SECRET_KEY = sys.argv[2]
@@ -89,12 +91,8 @@ def save_realtime_daily_summary() -> None:

end_date = end_date.to_date_string()

daily_data = pd.read_csv(
(csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")
.as_uri(),
low_memory=False
)

daily_data = s3_csv_reader.read_csv(csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv")

daily_data = csrt.make_daily_summary(daily_data)
filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv'
save_csv_to_bucket(daily_data, filename=filename)
22 changes: 22 additions & 0 deletions utils/s3_csv_reader.py
@@ -0,0 +1,22 @@
import pandas as pd
from pathlib import Path
import data_analysis.compare_scheduled_and_rt as csrt

def read_csv(filename: str | Path) -> pd.DataFrame:
"""Read pandas csv from S3
Args:
filename (str | Path): file to download from S3.
Returns:
pd.DataFrame: A Pandas DataFrame from the S3 file.
"""
if isinstance(filename, str):
filename = Path(filename)
s3_filename = '/'.join(filename.parts[-2:])
df = pd.read_csv(
f'https://{csrt.BUCKET_PUBLIC}.s3.us-east-2.amazonaws.com/{s3_filename}',
low_memory=False
)
return df
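
For reference, a hedged usage sketch of the new helper; the call mirrors the updated sites in this commit, and the date string is purely illustrative:

    import data_analysis.compare_scheduled_and_rt as csrt
    from utils import s3_csv_reader

    # read_csv keeps the last two path parts, so this fetches
    # https://<BUCKET_PUBLIC>.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/2024-01-15.csv
    df = s3_csv_reader.read_csv(csrt.BASE_PATH / "bus_full_day_data_v2/2024-01-15.csv")
    print(df.shape)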
