From d4152992372aef02daaebe0deedf72355f4495c6 Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Mon, 10 Oct 2022 17:32:28 -0500 Subject: [PATCH 1/6] backfill data 2022-08-08 through 2022-10-08 into public bucket --- .gitignore | 1 + data_analysis/__init__.py | 0 data_analysis/requirements.txt | 3 +- data_analysis/rt_daily_aggregations.py | 127 +------- scrape_data/__init__.py | 0 scrape_data/combine_daily_files.py | 7 +- utils/copy_raw_data_to_public_bucket.ipynb | 321 +++++++++++++++++++++ 7 files changed, 336 insertions(+), 123 deletions(-) create mode 100644 data_analysis/__init__.py create mode 100644 scrape_data/__init__.py create mode 100644 utils/copy_raw_data_to_public_bucket.ipynb diff --git a/.gitignore b/.gitignore index 97d9349..f623156 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .DS_Store +__pycache__/ .ipynb_checkpoints/ venv .venv diff --git a/data_analysis/__init__.py b/data_analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_analysis/requirements.txt b/data_analysis/requirements.txt index 91558f9..8440841 100644 --- a/data_analysis/requirements.txt +++ b/data_analysis/requirements.txt @@ -3,4 +3,5 @@ pandas==1.4.3 geopandas==0.11.1 s3fs==2022.7.1 shapely==1.8.4 -jupyter==1.0.0 \ No newline at end of file +jupyter==1.0.0 +typer[all]==0.6.1 \ No newline at end of file diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py index c26964d..d21b1dd 100644 --- a/data_analysis/rt_daily_aggregations.py +++ b/data_analysis/rt_daily_aggregations.py @@ -7,32 +7,20 @@ import pandas as pd import pendulum from tqdm import tqdm +import typer from dotenv import load_dotenv +# have to run from root directory for this to work +from scrape_data import combine_daily_files load_dotenv() +app = typer.Typer() -BUCKET_PUBLIC = os.getenv('BUCKET_PUBLIC', 'chn-ghost-buses-public') logger = logging.getLogger() logging.basicConfig(level=logging.INFO) +@app.command() +def combine_daily_totals(start_date: str, end_date: str): -def compute_hourly_totals( - start_data: str, end_date: str, bucket_type: str = None -) -> Tuple[pd.DataFrame]: - """Aggregate route data to hourly totals. - - Args: - start_data (str): starting date of the route - end_date (str): ending date of route - bucket_type (str, optional): which bucket to pull data from. If - 'public', BUCKET_PUBLIC is used. If 'private', BUCKET_PRIVATE. - Defaults to None. - - Returns: - Tuple[pd.DataFrame]: a tuple of the data, errors, and combined - hourly summaries DataFrames. 
- """ - hourly_summary_combined = pd.DataFrame() date_range = [ d for d in pendulum.period( @@ -40,110 +28,11 @@ def compute_hourly_totals( pendulum.from_format(end_date, "YYYY-MM-DD"), ).range("days") ] - s3 = boto3.resource("s3") - if bucket_type is None or bucket_type == "public": - bucket = s3.Bucket(BUCKET_PUBLIC) - else: - bucket = s3.Bucket(bucket_type) pbar = tqdm(date_range) for day in pbar: - date_str = day.to_date_string() - pbar.set_description( - f"Processing {date_str} at {pendulum.now().to_datetime_string()}") - objects = bucket.objects.filter(Prefix=f"bus_data/{date_str}") - - logger.info(f"------- loading data at" - f"{pendulum.now().to_datetime_string()}") - - # load data - data_dict = {} - - # Access denied for public bucket - obj_pbar = tqdm(objects) - for obj in obj_pbar: - obj_pbar.set_description(f"loading {obj}") - obj_name = obj.key - # https://stackoverflow.com/questions/31976273/open-s3-object-as-a-string-with-boto3 - obj_body = json.loads(obj.get()["Body"].read().decode("utf-8")) - data_dict[obj_name] = obj_body - - # parse data into actual vehicle locations and errors - - logger.info(f"------- parsing data at" - f"{pendulum.now().to_datetime_string()}") - - data = pd.DataFrame() - errors = pd.DataFrame() - - # k, v here are filename: full dict of JSON - data_dict_pbar = tqdm(data_dict.items()) - for k, v in data_dict_pbar: - data_dict_pbar.set_description(f"processing {k}") - filename = k - new_data = pd.DataFrame() - new_errors = pd.DataFrame() - # expect ~12 "chunks" per JSON - for chunk, contents in v.items(): - if "vehicle" in v[chunk]["bustime-response"].keys(): - new_data = new_data.append( - pd.DataFrame(v[chunk]["bustime-response"]["vehicle"]) - ) - if "error" in v[chunk]["bustime-response"].keys(): - new_errors = new_errors.append( - pd.DataFrame(v[chunk]["bustime-response"]["error"]) - ) - new_data["scrape_file"] = filename - new_errors["scrape_file"] = filename - data = data.append(new_data) - errors = errors.append(new_errors) - - logger.info(f"------- saving data at" - f"{pendulum.now().to_datetime_string()}") - - if len(errors) > 0: - bucket.put_object( - Body=errors.to_csv(index=False), - Key=f"bus_full_day_errors_v2/{date_str}.csv", - ) - - if len(data) > 0: - # convert data time to actual datetime - data["data_time"] = pd.to_datetime( - data["tmstmp"], format="%Y%m%d %H:%M") - - data["data_hour"] = data.data_time.dt.hour - data["data_date"] = data.data_time.dt.date - - bucket.put_object( - Body=data.to_csv(index=False), - Key=f"bus_full_day_data_v2/{date_str}.csv", - ) - - # combine vids into a set (drops duplicates): - # https://stackoverflow.com/a/45925961 - hourly_summary = ( - data.groupby(["data_date", "data_hour", "rt", "des"]) - .agg({"vid": set, "tatripid": set, "tablockid": set}) - .reset_index() - ) - # get number of vehicles per hour per route - hourly_summary["vh_count"] = hourly_summary["vid"].apply(len) - hourly_summary["trip_count"] = hourly_summary["tatripid"].apply( - len) - hourly_summary["block_count"] = hourly_summary["tablockid"].apply( - len) - - bucket.put_object( - Body=hourly_summary.to_csv(index=False), - Key=f"bus_hourly_summary_v2/{date_str}.csv", - ) - pd.concat([hourly_summary_combined, hourly_summary]) - return data, errors, hourly_summary + combine_daily_files.combine_daily_files(day.to_date_string(), ['chn-ghost-buses-private']) if __name__ == "__main__": - start_date = "2022-07-17" - end_date = "2022-08-07" - - data, errors, hourly_summary = compute_hourly_totals(start_date, end_date) + app() diff --git 
a/scrape_data/__init__.py b/scrape_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrape_data/combine_daily_files.py b/scrape_data/combine_daily_files.py index 90e84ee..9b0b93a 100644 --- a/scrape_data/combine_daily_files.py +++ b/scrape_data/combine_daily_files.py @@ -1,5 +1,6 @@ import os import logging +from typing import List import boto3 import json @@ -20,7 +21,7 @@ logging.basicConfig(level=logging.INFO) -def combine_daily_files(date: str): +def combine_daily_files(date: str, bucket_list: List[str]): """Combine raw JSON files returned by API into daily CSVs. Args: @@ -28,7 +29,7 @@ def combine_daily_files(date: str): """ s3 = boto3.resource("s3") - for bucket_name in [BUCKET_PRIVATE, BUCKET_PUBLIC]: + for bucket_name in bucket_list: logging.info(f"processing data from {bucket_name}") bucket = s3.Bucket(bucket_name) objects = bucket.objects.filter(Prefix=f"bus_data/{date}") @@ -104,4 +105,4 @@ def combine_daily_files(date: str): def lambda_handler(event, context): date = pendulum.yesterday("America/Chicago").to_date_string() - combine_daily_files(date) + combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC]) diff --git a/utils/copy_raw_data_to_public_bucket.ipynb b/utils/copy_raw_data_to_public_bucket.ipynb new file mode 100644 index 0000000..2d847d7 --- /dev/null +++ b/utils/copy_raw_data_to_public_bucket.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "f25eb6c8", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import pendulum" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d480191d", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.resource('s3')\n", + "private_bucket = s3.Bucket('chn-ghost-buses-private')\n", + "public_bucket = s3.Bucket('chn-ghost-buses-public')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1211fabe", + "metadata": {}, + "outputs": [], + "source": [ + "# copy raw data from the private bucket to the public bucket \n", + "start_date = '2022-08-10'\n", + "end_date = '2022-10-08'\n", + "\n", + "date_range = [d for d in pendulum.period(pendulum.from_format(start_date, 'YYYY-MM-DD'), pendulum.from_format(end_date, 'YYYY-MM-DD')).range('days')]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "174dd1e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 2022-05-19 at 2022-10-10 11:22:23\n", + "Processing 2022-05-20 at 2022-10-10 11:22:28\n", + "Processing 2022-05-21 at 2022-10-10 11:23:36\n", + "Processing 2022-05-22 at 2022-10-10 11:24:37\n", + "Processing 2022-05-23 at 2022-10-10 11:25:35\n", + "Processing 2022-05-24 at 2022-10-10 11:26:41\n", + "Processing 2022-05-25 at 2022-10-10 11:27:48\n", + "Processing 2022-05-26 at 2022-10-10 11:28:56\n", + "Processing 2022-05-27 at 2022-10-10 11:30:06\n", + "Processing 2022-05-28 at 2022-10-10 11:31:15\n", + "Processing 2022-05-29 at 2022-10-10 11:32:19\n", + "Processing 2022-05-30 at 2022-10-10 11:33:18\n", + "Processing 2022-05-31 at 2022-10-10 11:34:17\n", + "Processing 2022-06-01 at 2022-10-10 11:35:24\n", + "Processing 2022-06-02 at 2022-10-10 11:36:31\n", + "Processing 2022-06-03 at 2022-10-10 11:37:36\n", + "Processing 2022-06-04 at 2022-10-10 11:38:41\n", + "Processing 2022-06-05 at 2022-10-10 11:39:42\n", + "Processing 2022-06-06 at 2022-10-10 11:40:41\n", + "Processing 2022-06-07 at 2022-10-10 11:41:47\n", + "Processing 2022-06-08 at 2022-10-10 11:42:51\n", + "Processing 
2022-06-09 at 2022-10-10 11:43:56\n", + "Processing 2022-06-10 at 2022-10-10 11:45:01\n", + "Processing 2022-06-11 at 2022-10-10 11:46:05\n", + "Processing 2022-06-12 at 2022-10-10 11:47:04\n", + "Processing 2022-06-13 at 2022-10-10 11:48:00\n", + "Processing 2022-06-14 at 2022-10-10 11:49:03\n", + "Processing 2022-06-15 at 2022-10-10 11:50:08\n", + "Processing 2022-06-16 at 2022-10-10 11:51:11\n", + "Processing 2022-06-17 at 2022-10-10 11:52:13\n", + "Processing 2022-06-18 at 2022-10-10 11:53:16\n", + "Processing 2022-06-19 at 2022-10-10 11:54:16\n", + "Processing 2022-06-20 at 2022-10-10 11:55:13\n", + "Processing 2022-06-21 at 2022-10-10 11:56:17\n", + "Processing 2022-06-22 at 2022-10-10 11:57:22\n", + "Processing 2022-06-23 at 2022-10-10 11:58:24\n", + "Processing 2022-06-24 at 2022-10-10 11:59:29\n", + "Processing 2022-06-25 at 2022-10-10 12:00:34\n", + "Processing 2022-06-26 at 2022-10-10 12:01:39\n", + "Processing 2022-06-27 at 2022-10-10 12:02:37\n", + "Processing 2022-06-28 at 2022-10-10 12:03:41\n", + "Processing 2022-06-29 at 2022-10-10 12:04:46\n", + "Processing 2022-06-30 at 2022-10-10 12:05:54\n", + "Processing 2022-07-01 at 2022-10-10 12:06:59\n", + "Processing 2022-07-02 at 2022-10-10 12:08:03\n", + "Processing 2022-07-03 at 2022-10-10 12:09:11\n", + "Processing 2022-07-04 at 2022-10-10 12:10:07\n", + "Processing 2022-07-05 at 2022-10-10 12:11:06\n", + "Processing 2022-07-06 at 2022-10-10 12:12:12\n", + "Processing 2022-07-07 at 2022-10-10 12:13:17\n", + "Processing 2022-07-08 at 2022-10-10 12:14:23\n", + "Processing 2022-07-09 at 2022-10-10 12:15:30\n", + "Processing 2022-07-10 at 2022-10-10 12:16:31\n", + "Processing 2022-07-11 at 2022-10-10 12:17:28\n", + "Processing 2022-07-12 at 2022-10-10 12:18:32\n", + "Processing 2022-07-13 at 2022-10-10 12:19:38\n", + "Processing 2022-07-14 at 2022-10-10 12:20:47\n", + "Processing 2022-07-15 at 2022-10-10 12:21:52\n", + "Processing 2022-07-16 at 2022-10-10 12:22:57\n", + "Processing 2022-07-17 at 2022-10-10 12:23:57\n", + "Processing 2022-07-18 at 2022-10-10 12:24:54\n", + "Processing 2022-07-19 at 2022-10-10 12:25:58\n", + "Processing 2022-07-20 at 2022-10-10 12:27:03\n", + "Processing 2022-07-21 at 2022-10-10 12:28:07\n", + "Processing 2022-07-22 at 2022-10-10 12:29:15\n", + "Processing 2022-07-23 at 2022-10-10 12:30:23\n", + "Processing 2022-07-24 at 2022-10-10 12:31:26\n", + "Processing 2022-07-25 at 2022-10-10 12:32:23\n", + "Processing 2022-07-26 at 2022-10-10 12:33:30\n", + "Processing 2022-07-27 at 2022-10-10 12:34:37\n", + "Processing 2022-07-28 at 2022-10-10 12:35:45\n", + "Processing 2022-07-29 at 2022-10-10 12:36:52\n", + "Processing 2022-07-30 at 2022-10-10 12:38:00\n", + "Processing 2022-07-31 at 2022-10-10 12:39:01\n", + "Processing 2022-08-01 at 2022-10-10 12:39:59\n", + "Processing 2022-08-02 at 2022-10-10 12:41:05\n", + "Processing 2022-08-03 at 2022-10-10 12:42:11\n", + "Processing 2022-08-04 at 2022-10-10 12:43:16\n", + "Processing 2022-08-05 at 2022-10-10 12:44:21\n", + "Processing 2022-08-06 at 2022-10-10 12:45:26\n", + "Processing 2022-08-07 at 2022-10-10 12:46:27\n", + "Processing 2022-08-08 at 2022-10-10 12:47:26\n", + "Processing 2022-08-09 at 2022-10-10 12:48:32\n", + "Processing 2022-08-10 at 2022-10-10 12:49:37\n", + "Processing 2022-08-11 at 2022-10-10 12:50:44\n", + "Processing 2022-08-12 at 2022-10-10 12:51:49\n", + "Processing 2022-08-13 at 2022-10-10 12:52:55\n", + "Processing 2022-08-14 at 2022-10-10 12:53:55\n", + "Processing 2022-08-15 at 2022-10-10 12:54:51\n", + "Processing 2022-08-16 at 
2022-10-10 12:55:56\n", + "Processing 2022-08-17 at 2022-10-10 12:57:01\n", + "Processing 2022-08-18 at 2022-10-10 12:58:08\n", + "Processing 2022-08-19 at 2022-10-10 12:59:15\n", + "Processing 2022-08-20 at 2022-10-10 13:00:25\n", + "Processing 2022-08-21 at 2022-10-10 13:01:26\n", + "Processing 2022-08-22 at 2022-10-10 13:02:23\n", + "Processing 2022-08-23 at 2022-10-10 13:03:29\n", + "Processing 2022-08-24 at 2022-10-10 13:04:34\n", + "Processing 2022-08-25 at 2022-10-10 13:05:43\n", + "Processing 2022-08-26 at 2022-10-10 13:06:50\n", + "Processing 2022-08-27 at 2022-10-10 13:07:53\n", + "Processing 2022-08-28 at 2022-10-10 13:08:57\n", + "Processing 2022-08-29 at 2022-10-10 13:09:59\n", + "Processing 2022-08-30 at 2022-10-10 13:11:04\n", + "Processing 2022-08-31 at 2022-10-10 13:12:12\n", + "Processing 2022-09-01 at 2022-10-10 13:13:22\n", + "Processing 2022-09-02 at 2022-10-10 13:14:33\n", + "Processing 2022-09-03 at 2022-10-10 13:15:44\n", + "Processing 2022-09-04 at 2022-10-10 13:16:54\n", + "Processing 2022-09-05 at 2022-10-10 13:17:56\n", + "Processing 2022-09-06 at 2022-10-10 13:19:01\n", + "Processing 2022-09-07 at 2022-10-10 13:20:13\n", + "Processing 2022-09-08 at 2022-10-10 13:21:20\n", + "Processing 2022-09-09 at 2022-10-10 13:22:29\n", + "Processing 2022-09-10 at 2022-10-10 13:23:35\n", + "Processing 2022-09-11 at 2022-10-10 13:24:41\n", + "Processing 2022-09-12 at 2022-10-10 13:25:41\n", + "Processing 2022-09-13 at 2022-10-10 13:26:49\n", + "Processing 2022-09-14 at 2022-10-10 13:27:59\n", + "Processing 2022-09-15 at 2022-10-10 13:29:08\n", + "Processing 2022-09-16 at 2022-10-10 13:30:16\n", + "Processing 2022-09-17 at 2022-10-10 13:31:27\n", + "Processing 2022-09-18 at 2022-10-10 13:32:34\n", + "Processing 2022-09-19 at 2022-10-10 13:33:35\n", + "Processing 2022-09-20 at 2022-10-10 13:34:45\n", + "Processing 2022-09-21 at 2022-10-10 13:35:57\n", + "Processing 2022-09-22 at 2022-10-10 13:37:08\n", + "Processing 2022-09-23 at 2022-10-10 13:38:17\n", + "Processing 2022-09-24 at 2022-10-10 13:39:25\n", + "Processing 2022-09-25 at 2022-10-10 13:40:33\n", + "Processing 2022-09-26 at 2022-10-10 13:41:37\n", + "Processing 2022-09-27 at 2022-10-10 13:42:46\n", + "Processing 2022-09-28 at 2022-10-10 13:43:56\n", + "Processing 2022-09-29 at 2022-10-10 13:45:05\n", + "Processing 2022-09-30 at 2022-10-10 13:46:14\n", + "Processing 2022-10-01 at 2022-10-10 13:47:20\n", + "Processing 2022-10-02 at 2022-10-10 13:48:28\n", + "Processing 2022-10-03 at 2022-10-10 13:49:30\n", + "Processing 2022-10-04 at 2022-10-10 13:50:43\n", + "Processing 2022-10-05 at 2022-10-10 13:51:52\n", + "Processing 2022-10-06 at 2022-10-10 13:52:59\n", + "Processing 2022-10-07 at 2022-10-10 13:54:06\n", + "Processing 2022-10-08 at 2022-10-10 13:55:15\n", + "Processing 2022-10-09 at 2022-10-10 13:56:22\n" + ] + } + ], + "source": [ + "for day in date_range:\n", + " date_str = day.to_date_string()\n", + " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", + " objects = private_bucket.objects.filter(Prefix = f'bus_data/{date_str}')\n", + " for obj in objects:\n", + " public_bucket.copy({'Bucket': private_bucket.name, 'Key': obj.key}, obj.key)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "433829c8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 2022-08-10 at 2022-10-10 17:01:41\n", + "Processing 2022-08-11 at 2022-10-10 17:01:43\n", + "Processing 2022-08-12 at 2022-10-10 17:01:44\n", + "Processing 
2022-08-13 at 2022-10-10 17:01:45\n", + "Processing 2022-08-14 at 2022-10-10 17:01:46\n", + "Processing 2022-08-15 at 2022-10-10 17:01:47\n", + "Processing 2022-08-16 at 2022-10-10 17:01:48\n", + "Processing 2022-08-17 at 2022-10-10 17:01:49\n", + "Processing 2022-08-18 at 2022-10-10 17:01:50\n", + "Processing 2022-08-19 at 2022-10-10 17:01:51\n", + "Processing 2022-08-20 at 2022-10-10 17:01:52\n", + "Processing 2022-08-21 at 2022-10-10 17:01:53\n", + "Processing 2022-08-22 at 2022-10-10 17:01:54\n", + "Processing 2022-08-23 at 2022-10-10 17:01:55\n", + "Processing 2022-08-24 at 2022-10-10 17:01:57\n", + "Processing 2022-08-25 at 2022-10-10 17:01:57\n", + "Processing 2022-08-26 at 2022-10-10 17:01:58\n", + "Processing 2022-08-27 at 2022-10-10 17:01:59\n", + "Processing 2022-08-28 at 2022-10-10 17:02:01\n", + "Processing 2022-08-29 at 2022-10-10 17:02:03\n", + "Processing 2022-08-30 at 2022-10-10 17:02:04\n", + "Processing 2022-08-31 at 2022-10-10 17:02:05\n", + "Processing 2022-09-01 at 2022-10-10 17:02:06\n", + "Processing 2022-09-02 at 2022-10-10 17:02:07\n", + "Processing 2022-09-03 at 2022-10-10 17:02:08\n", + "Processing 2022-09-04 at 2022-10-10 17:02:09\n", + "Processing 2022-09-05 at 2022-10-10 17:02:10\n", + "Processing 2022-09-06 at 2022-10-10 17:02:11\n", + "Processing 2022-09-07 at 2022-10-10 17:02:13\n", + "Processing 2022-09-08 at 2022-10-10 17:02:14\n", + "Processing 2022-09-09 at 2022-10-10 17:02:15\n", + "Processing 2022-09-10 at 2022-10-10 17:02:16\n", + "Processing 2022-09-11 at 2022-10-10 17:02:17\n", + "Processing 2022-09-12 at 2022-10-10 17:02:18\n", + "Processing 2022-09-13 at 2022-10-10 17:02:19\n", + "Processing 2022-09-14 at 2022-10-10 17:02:20\n", + "Processing 2022-09-15 at 2022-10-10 17:02:21\n", + "Processing 2022-09-16 at 2022-10-10 17:02:22\n", + "Processing 2022-09-17 at 2022-10-10 17:02:23\n", + "Processing 2022-09-18 at 2022-10-10 17:02:24\n", + "Processing 2022-09-19 at 2022-10-10 17:02:25\n", + "Processing 2022-09-20 at 2022-10-10 17:02:27\n", + "Processing 2022-09-21 at 2022-10-10 17:02:28\n", + "Processing 2022-09-22 at 2022-10-10 17:02:29\n", + "Processing 2022-09-23 at 2022-10-10 17:02:30\n", + "Processing 2022-09-24 at 2022-10-10 17:02:31\n", + "Processing 2022-09-25 at 2022-10-10 17:02:32\n", + "Processing 2022-09-26 at 2022-10-10 17:02:33\n", + "Processing 2022-09-27 at 2022-10-10 17:02:34\n", + "Processing 2022-09-28 at 2022-10-10 17:02:35\n", + "Processing 2022-09-29 at 2022-10-10 17:02:36\n", + "Processing 2022-09-30 at 2022-10-10 17:02:37\n", + "Processing 2022-10-01 at 2022-10-10 17:02:38\n", + "Processing 2022-10-02 at 2022-10-10 17:02:39\n", + "Processing 2022-10-03 at 2022-10-10 17:02:40\n", + "Processing 2022-10-04 at 2022-10-10 17:02:42\n", + "Processing 2022-10-05 at 2022-10-10 17:02:43\n", + "Processing 2022-10-06 at 2022-10-10 17:02:44\n", + "Processing 2022-10-07 at 2022-10-10 17:02:45\n", + "Processing 2022-10-08 at 2022-10-10 17:02:47\n" + ] + } + ], + "source": [ + "for day in date_range:\n", + " date_str = day.to_date_string()\n", + " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", + " data_objects = private_bucket.objects.filter(Prefix = f'bus_full_day_data_v2/{date_str}.csv')\n", + " error_objects = private_bucket.objects.filter(Prefix = f'bus_full_day_errors_v2/{date_str}.csv')\n", + " for obj in data_objects:\n", + " public_bucket.copy({'Bucket': private_bucket.name, 'Key': obj.key}, obj.key)\n", + " for obj in error_objects: \n", + " public_bucket.copy({'Bucket': private_bucket.name, 'Key': 
obj.key}, obj.key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acf91f7e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 58110ac843296675af7630129c9aa1156ad4834a Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Mon, 10 Oct 2022 17:39:28 -0500 Subject: [PATCH 2/6] remove output cells and comment a little --- utils/copy_raw_data_to_public_bucket.ipynb | 246 ++------------------- 1 file changed, 15 insertions(+), 231 deletions(-) diff --git a/utils/copy_raw_data_to_public_bucket.ipynb b/utils/copy_raw_data_to_public_bucket.ipynb index 2d847d7..c40067e 100644 --- a/utils/copy_raw_data_to_public_bucket.ipynb +++ b/utils/copy_raw_data_to_public_bucket.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, - "id": "f25eb6c8", + "execution_count": null, + "id": "12ebecef", "metadata": {}, "outputs": [], "source": [ @@ -13,8 +13,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "d480191d", + "execution_count": null, + "id": "dc86c046", "metadata": {}, "outputs": [], "source": [ @@ -25,8 +25,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "1211fabe", + "execution_count": null, + "id": "4b83b078", "metadata": {}, "outputs": [], "source": [ @@ -39,162 +39,12 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "174dd1e4", + "execution_count": null, + "id": "204bb9d3", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing 2022-05-19 at 2022-10-10 11:22:23\n", - "Processing 2022-05-20 at 2022-10-10 11:22:28\n", - "Processing 2022-05-21 at 2022-10-10 11:23:36\n", - "Processing 2022-05-22 at 2022-10-10 11:24:37\n", - "Processing 2022-05-23 at 2022-10-10 11:25:35\n", - "Processing 2022-05-24 at 2022-10-10 11:26:41\n", - "Processing 2022-05-25 at 2022-10-10 11:27:48\n", - "Processing 2022-05-26 at 2022-10-10 11:28:56\n", - "Processing 2022-05-27 at 2022-10-10 11:30:06\n", - "Processing 2022-05-28 at 2022-10-10 11:31:15\n", - "Processing 2022-05-29 at 2022-10-10 11:32:19\n", - "Processing 2022-05-30 at 2022-10-10 11:33:18\n", - "Processing 2022-05-31 at 2022-10-10 11:34:17\n", - "Processing 2022-06-01 at 2022-10-10 11:35:24\n", - "Processing 2022-06-02 at 2022-10-10 11:36:31\n", - "Processing 2022-06-03 at 2022-10-10 11:37:36\n", - "Processing 2022-06-04 at 2022-10-10 11:38:41\n", - "Processing 2022-06-05 at 2022-10-10 11:39:42\n", - "Processing 2022-06-06 at 2022-10-10 11:40:41\n", - "Processing 2022-06-07 at 2022-10-10 11:41:47\n", - "Processing 2022-06-08 at 2022-10-10 11:42:51\n", - "Processing 2022-06-09 at 2022-10-10 11:43:56\n", - "Processing 2022-06-10 at 2022-10-10 11:45:01\n", - "Processing 2022-06-11 at 2022-10-10 11:46:05\n", - "Processing 2022-06-12 at 2022-10-10 11:47:04\n", - "Processing 2022-06-13 at 2022-10-10 11:48:00\n", - "Processing 2022-06-14 at 2022-10-10 11:49:03\n", - "Processing 2022-06-15 at 2022-10-10 11:50:08\n", - "Processing 2022-06-16 at 2022-10-10 11:51:11\n", - "Processing 2022-06-17 at 2022-10-10 11:52:13\n", - "Processing 2022-06-18 at 2022-10-10 11:53:16\n", - "Processing 2022-06-19 at 
2022-10-10 11:54:16\n", - "Processing 2022-06-20 at 2022-10-10 11:55:13\n", - "Processing 2022-06-21 at 2022-10-10 11:56:17\n", - "Processing 2022-06-22 at 2022-10-10 11:57:22\n", - "Processing 2022-06-23 at 2022-10-10 11:58:24\n", - "Processing 2022-06-24 at 2022-10-10 11:59:29\n", - "Processing 2022-06-25 at 2022-10-10 12:00:34\n", - "Processing 2022-06-26 at 2022-10-10 12:01:39\n", - "Processing 2022-06-27 at 2022-10-10 12:02:37\n", - "Processing 2022-06-28 at 2022-10-10 12:03:41\n", - "Processing 2022-06-29 at 2022-10-10 12:04:46\n", - "Processing 2022-06-30 at 2022-10-10 12:05:54\n", - "Processing 2022-07-01 at 2022-10-10 12:06:59\n", - "Processing 2022-07-02 at 2022-10-10 12:08:03\n", - "Processing 2022-07-03 at 2022-10-10 12:09:11\n", - "Processing 2022-07-04 at 2022-10-10 12:10:07\n", - "Processing 2022-07-05 at 2022-10-10 12:11:06\n", - "Processing 2022-07-06 at 2022-10-10 12:12:12\n", - "Processing 2022-07-07 at 2022-10-10 12:13:17\n", - "Processing 2022-07-08 at 2022-10-10 12:14:23\n", - "Processing 2022-07-09 at 2022-10-10 12:15:30\n", - "Processing 2022-07-10 at 2022-10-10 12:16:31\n", - "Processing 2022-07-11 at 2022-10-10 12:17:28\n", - "Processing 2022-07-12 at 2022-10-10 12:18:32\n", - "Processing 2022-07-13 at 2022-10-10 12:19:38\n", - "Processing 2022-07-14 at 2022-10-10 12:20:47\n", - "Processing 2022-07-15 at 2022-10-10 12:21:52\n", - "Processing 2022-07-16 at 2022-10-10 12:22:57\n", - "Processing 2022-07-17 at 2022-10-10 12:23:57\n", - "Processing 2022-07-18 at 2022-10-10 12:24:54\n", - "Processing 2022-07-19 at 2022-10-10 12:25:58\n", - "Processing 2022-07-20 at 2022-10-10 12:27:03\n", - "Processing 2022-07-21 at 2022-10-10 12:28:07\n", - "Processing 2022-07-22 at 2022-10-10 12:29:15\n", - "Processing 2022-07-23 at 2022-10-10 12:30:23\n", - "Processing 2022-07-24 at 2022-10-10 12:31:26\n", - "Processing 2022-07-25 at 2022-10-10 12:32:23\n", - "Processing 2022-07-26 at 2022-10-10 12:33:30\n", - "Processing 2022-07-27 at 2022-10-10 12:34:37\n", - "Processing 2022-07-28 at 2022-10-10 12:35:45\n", - "Processing 2022-07-29 at 2022-10-10 12:36:52\n", - "Processing 2022-07-30 at 2022-10-10 12:38:00\n", - "Processing 2022-07-31 at 2022-10-10 12:39:01\n", - "Processing 2022-08-01 at 2022-10-10 12:39:59\n", - "Processing 2022-08-02 at 2022-10-10 12:41:05\n", - "Processing 2022-08-03 at 2022-10-10 12:42:11\n", - "Processing 2022-08-04 at 2022-10-10 12:43:16\n", - "Processing 2022-08-05 at 2022-10-10 12:44:21\n", - "Processing 2022-08-06 at 2022-10-10 12:45:26\n", - "Processing 2022-08-07 at 2022-10-10 12:46:27\n", - "Processing 2022-08-08 at 2022-10-10 12:47:26\n", - "Processing 2022-08-09 at 2022-10-10 12:48:32\n", - "Processing 2022-08-10 at 2022-10-10 12:49:37\n", - "Processing 2022-08-11 at 2022-10-10 12:50:44\n", - "Processing 2022-08-12 at 2022-10-10 12:51:49\n", - "Processing 2022-08-13 at 2022-10-10 12:52:55\n", - "Processing 2022-08-14 at 2022-10-10 12:53:55\n", - "Processing 2022-08-15 at 2022-10-10 12:54:51\n", - "Processing 2022-08-16 at 2022-10-10 12:55:56\n", - "Processing 2022-08-17 at 2022-10-10 12:57:01\n", - "Processing 2022-08-18 at 2022-10-10 12:58:08\n", - "Processing 2022-08-19 at 2022-10-10 12:59:15\n", - "Processing 2022-08-20 at 2022-10-10 13:00:25\n", - "Processing 2022-08-21 at 2022-10-10 13:01:26\n", - "Processing 2022-08-22 at 2022-10-10 13:02:23\n", - "Processing 2022-08-23 at 2022-10-10 13:03:29\n", - "Processing 2022-08-24 at 2022-10-10 13:04:34\n", - "Processing 2022-08-25 at 2022-10-10 13:05:43\n", - "Processing 2022-08-26 at 2022-10-10 
13:06:50\n", - "Processing 2022-08-27 at 2022-10-10 13:07:53\n", - "Processing 2022-08-28 at 2022-10-10 13:08:57\n", - "Processing 2022-08-29 at 2022-10-10 13:09:59\n", - "Processing 2022-08-30 at 2022-10-10 13:11:04\n", - "Processing 2022-08-31 at 2022-10-10 13:12:12\n", - "Processing 2022-09-01 at 2022-10-10 13:13:22\n", - "Processing 2022-09-02 at 2022-10-10 13:14:33\n", - "Processing 2022-09-03 at 2022-10-10 13:15:44\n", - "Processing 2022-09-04 at 2022-10-10 13:16:54\n", - "Processing 2022-09-05 at 2022-10-10 13:17:56\n", - "Processing 2022-09-06 at 2022-10-10 13:19:01\n", - "Processing 2022-09-07 at 2022-10-10 13:20:13\n", - "Processing 2022-09-08 at 2022-10-10 13:21:20\n", - "Processing 2022-09-09 at 2022-10-10 13:22:29\n", - "Processing 2022-09-10 at 2022-10-10 13:23:35\n", - "Processing 2022-09-11 at 2022-10-10 13:24:41\n", - "Processing 2022-09-12 at 2022-10-10 13:25:41\n", - "Processing 2022-09-13 at 2022-10-10 13:26:49\n", - "Processing 2022-09-14 at 2022-10-10 13:27:59\n", - "Processing 2022-09-15 at 2022-10-10 13:29:08\n", - "Processing 2022-09-16 at 2022-10-10 13:30:16\n", - "Processing 2022-09-17 at 2022-10-10 13:31:27\n", - "Processing 2022-09-18 at 2022-10-10 13:32:34\n", - "Processing 2022-09-19 at 2022-10-10 13:33:35\n", - "Processing 2022-09-20 at 2022-10-10 13:34:45\n", - "Processing 2022-09-21 at 2022-10-10 13:35:57\n", - "Processing 2022-09-22 at 2022-10-10 13:37:08\n", - "Processing 2022-09-23 at 2022-10-10 13:38:17\n", - "Processing 2022-09-24 at 2022-10-10 13:39:25\n", - "Processing 2022-09-25 at 2022-10-10 13:40:33\n", - "Processing 2022-09-26 at 2022-10-10 13:41:37\n", - "Processing 2022-09-27 at 2022-10-10 13:42:46\n", - "Processing 2022-09-28 at 2022-10-10 13:43:56\n", - "Processing 2022-09-29 at 2022-10-10 13:45:05\n", - "Processing 2022-09-30 at 2022-10-10 13:46:14\n", - "Processing 2022-10-01 at 2022-10-10 13:47:20\n", - "Processing 2022-10-02 at 2022-10-10 13:48:28\n", - "Processing 2022-10-03 at 2022-10-10 13:49:30\n", - "Processing 2022-10-04 at 2022-10-10 13:50:43\n", - "Processing 2022-10-05 at 2022-10-10 13:51:52\n", - "Processing 2022-10-06 at 2022-10-10 13:52:59\n", - "Processing 2022-10-07 at 2022-10-10 13:54:06\n", - "Processing 2022-10-08 at 2022-10-10 13:55:15\n", - "Processing 2022-10-09 at 2022-10-10 13:56:22\n" - ] - } - ], + "outputs": [], "source": [ + "# copy fully raw JSON data from private to public\n", "for day in date_range:\n", " date_str = day.to_date_string()\n", " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", @@ -205,78 +55,12 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "433829c8", + "execution_count": null, + "id": "d2ac2752", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing 2022-08-10 at 2022-10-10 17:01:41\n", - "Processing 2022-08-11 at 2022-10-10 17:01:43\n", - "Processing 2022-08-12 at 2022-10-10 17:01:44\n", - "Processing 2022-08-13 at 2022-10-10 17:01:45\n", - "Processing 2022-08-14 at 2022-10-10 17:01:46\n", - "Processing 2022-08-15 at 2022-10-10 17:01:47\n", - "Processing 2022-08-16 at 2022-10-10 17:01:48\n", - "Processing 2022-08-17 at 2022-10-10 17:01:49\n", - "Processing 2022-08-18 at 2022-10-10 17:01:50\n", - "Processing 2022-08-19 at 2022-10-10 17:01:51\n", - "Processing 2022-08-20 at 2022-10-10 17:01:52\n", - "Processing 2022-08-21 at 2022-10-10 17:01:53\n", - "Processing 2022-08-22 at 2022-10-10 17:01:54\n", - "Processing 2022-08-23 at 2022-10-10 17:01:55\n", - "Processing 2022-08-24 at 2022-10-10 
17:01:57\n", - "Processing 2022-08-25 at 2022-10-10 17:01:57\n", - "Processing 2022-08-26 at 2022-10-10 17:01:58\n", - "Processing 2022-08-27 at 2022-10-10 17:01:59\n", - "Processing 2022-08-28 at 2022-10-10 17:02:01\n", - "Processing 2022-08-29 at 2022-10-10 17:02:03\n", - "Processing 2022-08-30 at 2022-10-10 17:02:04\n", - "Processing 2022-08-31 at 2022-10-10 17:02:05\n", - "Processing 2022-09-01 at 2022-10-10 17:02:06\n", - "Processing 2022-09-02 at 2022-10-10 17:02:07\n", - "Processing 2022-09-03 at 2022-10-10 17:02:08\n", - "Processing 2022-09-04 at 2022-10-10 17:02:09\n", - "Processing 2022-09-05 at 2022-10-10 17:02:10\n", - "Processing 2022-09-06 at 2022-10-10 17:02:11\n", - "Processing 2022-09-07 at 2022-10-10 17:02:13\n", - "Processing 2022-09-08 at 2022-10-10 17:02:14\n", - "Processing 2022-09-09 at 2022-10-10 17:02:15\n", - "Processing 2022-09-10 at 2022-10-10 17:02:16\n", - "Processing 2022-09-11 at 2022-10-10 17:02:17\n", - "Processing 2022-09-12 at 2022-10-10 17:02:18\n", - "Processing 2022-09-13 at 2022-10-10 17:02:19\n", - "Processing 2022-09-14 at 2022-10-10 17:02:20\n", - "Processing 2022-09-15 at 2022-10-10 17:02:21\n", - "Processing 2022-09-16 at 2022-10-10 17:02:22\n", - "Processing 2022-09-17 at 2022-10-10 17:02:23\n", - "Processing 2022-09-18 at 2022-10-10 17:02:24\n", - "Processing 2022-09-19 at 2022-10-10 17:02:25\n", - "Processing 2022-09-20 at 2022-10-10 17:02:27\n", - "Processing 2022-09-21 at 2022-10-10 17:02:28\n", - "Processing 2022-09-22 at 2022-10-10 17:02:29\n", - "Processing 2022-09-23 at 2022-10-10 17:02:30\n", - "Processing 2022-09-24 at 2022-10-10 17:02:31\n", - "Processing 2022-09-25 at 2022-10-10 17:02:32\n", - "Processing 2022-09-26 at 2022-10-10 17:02:33\n", - "Processing 2022-09-27 at 2022-10-10 17:02:34\n", - "Processing 2022-09-28 at 2022-10-10 17:02:35\n", - "Processing 2022-09-29 at 2022-10-10 17:02:36\n", - "Processing 2022-09-30 at 2022-10-10 17:02:37\n", - "Processing 2022-10-01 at 2022-10-10 17:02:38\n", - "Processing 2022-10-02 at 2022-10-10 17:02:39\n", - "Processing 2022-10-03 at 2022-10-10 17:02:40\n", - "Processing 2022-10-04 at 2022-10-10 17:02:42\n", - "Processing 2022-10-05 at 2022-10-10 17:02:43\n", - "Processing 2022-10-06 at 2022-10-10 17:02:44\n", - "Processing 2022-10-07 at 2022-10-10 17:02:45\n", - "Processing 2022-10-08 at 2022-10-10 17:02:47\n" - ] - } - ], + "outputs": [], "source": [ + "# copy daily CSVs from private to public\n", "for day in date_range:\n", " date_str = day.to_date_string()\n", " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", @@ -291,7 +75,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf91f7e", + "id": "397493c2", "metadata": {}, "outputs": [], "source": [] From 0bf4a0260248fef6b582acdb7556b3b731749bbb Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Tue, 11 Oct 2022 21:46:07 -0500 Subject: [PATCH 3/6] remove unused imports --- data_analysis/rt_daily_aggregations.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py index d21b1dd..7f81a0b 100644 --- a/data_analysis/rt_daily_aggregations.py +++ b/data_analysis/rt_daily_aggregations.py @@ -1,10 +1,6 @@ -import os -from typing import Tuple import logging import boto3 -import json -import pandas as pd import pendulum from tqdm import tqdm import typer From 8a6f012087413e6478c71607cd0c39ee79bc24f9 Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Tue, 11 Oct 2022 22:10:01 -0500 Subject: [PATCH 4/6] PR review comments: add 
help text

---
 data_analysis/rt_daily_aggregations.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py
index 7f81a0b..d835bdc 100644
--- a/data_analysis/rt_daily_aggregations.py
+++ b/data_analysis/rt_daily_aggregations.py
@@ -15,8 +15,16 @@ logging.basicConfig(level=logging.INFO)
 
 
 @app.command()
-def combine_daily_totals(start_date: str, end_date: str):
-
+def combine_daily_totals(
+    start_date: str = typer.Argument(..., help = "Start date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be before end date."),
+    end_date: str = typer.Argument(..., help = "End date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be after start date."),
+    bucket_type: str = typer.Argument(..., help = "Either 'public' or 'private' -- bucket to read data from. User running must have read access to the bucket in question, so general users should list 'public'.")):
+    """
+    Take all the raw JSON files for each date in a date range from one of the CHN ghost buses S3 buckets (private or public)
+    and aggregate them into daily CSV files.
+    TODO: Add save parameter so these can be saved locally rather than just back to the bucket.
+    """
+
     date_range = [
         d
         for d in pendulum.period(
@@ -27,7 +35,7 @@ def combine_daily_totals(
     pbar = tqdm(date_range)
 
     for day in pbar:
-        combine_daily_files.combine_daily_files(day.to_date_string(), ['chn-ghost-buses-private'])
+        combine_daily_files.combine_daily_files(day.to_date_string(), [f'chn-ghost-buses-{bucket_type}'])
 
 
 if __name__ == "__main__":

From 3ccb283e3331c76a32c3d02d32b80bd11a7977b0 Mon Sep 17 00:00:00 2001
From: laurie merrell
Date: Sun, 16 Oct 2022 21:24:46 -0500
Subject: [PATCH 5/6] add save parameter to combine_daily_files

---
 scrape_data/combine_daily_files.py | 47 ++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/scrape_data/combine_daily_files.py b/scrape_data/combine_daily_files.py
index 9b0b93a..0b8ad1b 100644
--- a/scrape_data/combine_daily_files.py
+++ b/scrape_data/combine_daily_files.py
@@ -1,6 +1,6 @@
 import os
 import logging
-from typing import List
+from typing import List, Optional
 
 import boto3
 import json
@@ -21,7 +21,7 @@
 logging.basicConfig(level=logging.INFO)
 
 
-def combine_daily_files(date: str, bucket_list: List[str]):
+def combine_daily_files(date: str, bucket_list: List[str], save: Optional[str] = None):
     """Combine raw JSON files returned by API into daily CSVs.
 
     Args:
@@ -78,13 +78,22 @@ def combine_daily_files(date: str, bucket_list: List[str]):
         data = pd.concat(data_list, ignore_index=True)
         errors = pd.concat(errors_list, ignore_index=True)
 
+        logging.info(f"found {len(errors)} errors and {len(data)} data points for {date}")
+
         if len(errors) > 0:
-            error_key = f"bus_full_day_errors_v2/{date}.csv"
-            logging.info(f"saving errors to {bucket}/{error_key}")
-            bucket.put_object(
-                Body=errors.to_csv(index=False),
-                Key=error_key,
-            )
+            if save == "bucket":
+                error_key = f"bus_full_day_errors_v2/{date}.csv"
+                logging.info(f"saving errors to {bucket}/{error_key}")
+                bucket.put_object(
+                    Body=errors.to_csv(index=False),
+                    Key=error_key,
+                )
+            if save == "local":
+                local_filename = f"ghost_buses_full_day_errors_from_{bucket}_{date}.csv"
+                logging.info(f"saving errors to {local_filename}")
+                errors.to_csv(local_filename, index = False)
+        else:
+            logging.info(f"no errors found for {date}, not saving any error file")
 
         if len(data) > 0:
             # convert data time to actual datetime
@@ -94,15 +103,23 @@
             data["data_hour"] = data.data_time.dt.hour
             data["data_date"] = data.data_time.dt.date
 
+            if save == "bucket":
+                data_key = f"bus_full_day_data_v2/{date}.csv"
+                logging.info(f"saving data to {bucket}/{data_key}")
+                bucket.put_object(
+                    Body=data.to_csv(index=False),
+                    Key=data_key,
+                )
+            if save == "local":
+                local_filename = f"ghost_buses_full_day_data_from_{bucket}_{date}.csv"
+                logging.info(f"saving errors to {local_filename}")
+                data.to_csv(local_filename, index = False)
+        else:
+            logging.info(f"no data found for {date}, not saving any data file")
-            data_key = f"bus_full_day_data_v2/{date}.csv"
-            logging.info(f"saving data to {bucket}/{data_key}")
-            bucket.put_object(
-                Body=data.to_csv(index=False),
-                Key=data_key,
-            )
 
+def add_daily_data_to_combined_file(content: pd.DataFrame, )
 
 def lambda_handler(event, context):
     date = pendulum.yesterday("America/Chicago").to_date_string()
-    combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC])
+    combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC], save = "bucket")

From ae03fc09b1b3d00f29abba9091d440958dbff48f Mon Sep 17 00:00:00 2001
From: laurie merrell
Date: Sun, 16 Oct 2022 21:52:33 -0500
Subject: [PATCH 6/6] actually add save parameter, make combine daily files return results

---
 data_analysis/rt_daily_aggregations.py | 9 ++++++---
 scrape_data/combine_daily_files.py     | 9 ++++-----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py
index d835bdc..14bcddd 100644
--- a/data_analysis/rt_daily_aggregations.py
+++ b/data_analysis/rt_daily_aggregations.py
@@ -18,7 +18,9 @@ def combine_daily_totals(
     start_date: str = typer.Argument(..., help = "Start date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be before end date."),
     end_date: str = typer.Argument(..., help = "End date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be after start date."),
-    bucket_type: str = typer.Argument(..., help = "Either 'public' or 'private' -- bucket to read data from. User running must have read access to the bucket in question, so general users should list 'public'.")):
+    bucket_type: str = typer.Argument(..., help = "Either 'public' or 'private' -- bucket to read data from. User running must have read access to the bucket in question, so general users should list 'public'."),
+    save: str = typer.Argument(..., help = "Optional string 'bucket' or 'local' indicating whether you want to save your combined files in the bucket they're being read from or locally. If empty, files will not be saved.")
+    ):
     """
     Take all the raw JSON files for each date in a date range from one of the CHN ghost buses S3 buckets (private or public)
     and aggregate them into daily CSV files.
     TODO: Add save parameter so these can be saved locally rather than just back to the bucket.
     """
@@ -35,8 +37,9 @@ def combine_daily_totals(
     pbar = tqdm(date_range)
 
     for day in pbar:
-        combine_daily_files.combine_daily_files(day.to_date_string(), [f'chn-ghost-buses-{bucket_type}'])
-
+        data, errors = combine_daily_files.combine_daily_files(day.to_date_string(), [f'chn-ghost-buses-{bucket_type}'], save)
+# can run from the root of the repo like:
+# python3 -m data_analysis.rt_daily_aggregations
 
 if __name__ == "__main__":
     app()
diff --git a/scrape_data/combine_daily_files.py b/scrape_data/combine_daily_files.py
index 0b8ad1b..8ce1a71 100644
--- a/scrape_data/combine_daily_files.py
+++ b/scrape_data/combine_daily_files.py
@@ -89,7 +89,7 @@ def combine_daily_files(date: str, bucket_list: List[str], save: Optional[str] =
                     Key=error_key,
                 )
             if save == "local":
-                local_filename = f"ghost_buses_full_day_errors_from_{bucket}_{date}.csv"
+                local_filename = f"ghost_buses_full_day_errors_from_{bucket.name}_{date}.csv"
                 logging.info(f"saving errors to {local_filename}")
                 errors.to_csv(local_filename, index = False)
         else:
@@ -111,15 +111,14 @@ def combine_daily_files(date: str, bucket_list: List[str], save: Optional[str] =
             if save == "local":
-                local_filename = f"ghost_buses_full_day_data_from_{bucket}_{date}.csv"
+                local_filename = f"ghost_buses_full_day_data_from_{bucket.name}_{date}.csv"
                 logging.info(f"saving errors to {local_filename}")
                 data.to_csv(local_filename, index = False)
         else:
             logging.info(f"no data found for {date}, not saving any data file")
 
-def add_daily_data_to_combined_file(content: pd.DataFrame, )
-
+    return data, errors
 
 def lambda_handler(event, context):
     date = pendulum.yesterday("America/Chicago").to_date_string()
-    combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC], save = "bucket")
+    data, errors = combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC], save = "bucket")
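
A quick usage sketch of the aggregation entry point these patches produce (assuming the final patch is applied, the command is run from the repo root as the in-code comment notes, and the caller has AWS credentials with read access to the chosen bucket; the dates, bucket, and save mode below are only illustrative examples, not values from the patches):

# The Typer CLI added in patches 4-6 can be invoked roughly like:
#   python3 -m data_analysis.rt_daily_aggregations 2022-10-01 2022-10-08 public local
# which combines each day's raw JSON into daily CSVs and, with save='local', writes them locally.
# The same work can be done directly from Python by calling the helper the CLI wraps:
from scrape_data import combine_daily_files

# combine one example day of raw JSON from the public bucket, keeping the CSVs locally
data, errors = combine_daily_files.combine_daily_files(
    "2022-10-01", ["chn-ghost-buses-public"], save="local"
)
print(f"{len(data)} vehicle records, {len(errors)} API error records")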