From d4152992372aef02daaebe0deedf72355f4495c6 Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Mon, 10 Oct 2022 17:32:28 -0500 Subject: [PATCH 1/6] backfill data 2022-08-08 through 2022-10-08 into public bucket --- .gitignore | 1 + data_analysis/__init__.py | 0 data_analysis/requirements.txt | 3 +- data_analysis/rt_daily_aggregations.py | 127 +------- scrape_data/__init__.py | 0 scrape_data/combine_daily_files.py | 7 +- utils/copy_raw_data_to_public_bucket.ipynb | 321 +++++++++++++++++++++ 7 files changed, 336 insertions(+), 123 deletions(-) create mode 100644 data_analysis/__init__.py create mode 100644 scrape_data/__init__.py create mode 100644 utils/copy_raw_data_to_public_bucket.ipynb diff --git a/.gitignore b/.gitignore index 97d9349..f623156 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .DS_Store +__pycache__/ .ipynb_checkpoints/ venv .venv diff --git a/data_analysis/__init__.py b/data_analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_analysis/requirements.txt b/data_analysis/requirements.txt index 91558f9..8440841 100644 --- a/data_analysis/requirements.txt +++ b/data_analysis/requirements.txt @@ -3,4 +3,5 @@ pandas==1.4.3 geopandas==0.11.1 s3fs==2022.7.1 shapely==1.8.4 -jupyter==1.0.0 \ No newline at end of file +jupyter==1.0.0 +typer[all]==0.6.1 \ No newline at end of file diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py index c26964d..d21b1dd 100644 --- a/data_analysis/rt_daily_aggregations.py +++ b/data_analysis/rt_daily_aggregations.py @@ -7,32 +7,20 @@ import pandas as pd import pendulum from tqdm import tqdm +import typer from dotenv import load_dotenv +# have to run from root directory for this to work +from scrape_data import combine_daily_files load_dotenv() +app = typer.Typer() -BUCKET_PUBLIC = os.getenv('BUCKET_PUBLIC', 'chn-ghost-buses-public') logger = logging.getLogger() logging.basicConfig(level=logging.INFO) +@app.command() +def combine_daily_totals(start_date: str, end_date: str): -def compute_hourly_totals( - start_data: str, end_date: str, bucket_type: str = None -) -> Tuple[pd.DataFrame]: - """Aggregate route data to hourly totals. - - Args: - start_data (str): starting date of the route - end_date (str): ending date of route - bucket_type (str, optional): which bucket to pull data from. If - 'public', BUCKET_PUBLIC is used. If 'private', BUCKET_PRIVATE. - Defaults to None. - - Returns: - Tuple[pd.DataFrame]: a tuple of the data, errors, and combined - hourly summaries DataFrames. 
- """ - hourly_summary_combined = pd.DataFrame() date_range = [ d for d in pendulum.period( @@ -40,110 +28,11 @@ def compute_hourly_totals( pendulum.from_format(end_date, "YYYY-MM-DD"), ).range("days") ] - s3 = boto3.resource("s3") - if bucket_type is None or bucket_type == "public": - bucket = s3.Bucket(BUCKET_PUBLIC) - else: - bucket = s3.Bucket(bucket_type) pbar = tqdm(date_range) for day in pbar: - date_str = day.to_date_string() - pbar.set_description( - f"Processing {date_str} at {pendulum.now().to_datetime_string()}") - objects = bucket.objects.filter(Prefix=f"bus_data/{date_str}") - - logger.info(f"------- loading data at" - f"{pendulum.now().to_datetime_string()}") - - # load data - data_dict = {} - - # Access denied for public bucket - obj_pbar = tqdm(objects) - for obj in obj_pbar: - obj_pbar.set_description(f"loading {obj}") - obj_name = obj.key - # https://stackoverflow.com/questions/31976273/open-s3-object-as-a-string-with-boto3 - obj_body = json.loads(obj.get()["Body"].read().decode("utf-8")) - data_dict[obj_name] = obj_body - - # parse data into actual vehicle locations and errors - - logger.info(f"------- parsing data at" - f"{pendulum.now().to_datetime_string()}") - - data = pd.DataFrame() - errors = pd.DataFrame() - - # k, v here are filename: full dict of JSON - data_dict_pbar = tqdm(data_dict.items()) - for k, v in data_dict_pbar: - data_dict_pbar.set_description(f"processing {k}") - filename = k - new_data = pd.DataFrame() - new_errors = pd.DataFrame() - # expect ~12 "chunks" per JSON - for chunk, contents in v.items(): - if "vehicle" in v[chunk]["bustime-response"].keys(): - new_data = new_data.append( - pd.DataFrame(v[chunk]["bustime-response"]["vehicle"]) - ) - if "error" in v[chunk]["bustime-response"].keys(): - new_errors = new_errors.append( - pd.DataFrame(v[chunk]["bustime-response"]["error"]) - ) - new_data["scrape_file"] = filename - new_errors["scrape_file"] = filename - data = data.append(new_data) - errors = errors.append(new_errors) - - logger.info(f"------- saving data at" - f"{pendulum.now().to_datetime_string()}") - - if len(errors) > 0: - bucket.put_object( - Body=errors.to_csv(index=False), - Key=f"bus_full_day_errors_v2/{date_str}.csv", - ) - - if len(data) > 0: - # convert data time to actual datetime - data["data_time"] = pd.to_datetime( - data["tmstmp"], format="%Y%m%d %H:%M") - - data["data_hour"] = data.data_time.dt.hour - data["data_date"] = data.data_time.dt.date - - bucket.put_object( - Body=data.to_csv(index=False), - Key=f"bus_full_day_data_v2/{date_str}.csv", - ) - - # combine vids into a set (drops duplicates): - # https://stackoverflow.com/a/45925961 - hourly_summary = ( - data.groupby(["data_date", "data_hour", "rt", "des"]) - .agg({"vid": set, "tatripid": set, "tablockid": set}) - .reset_index() - ) - # get number of vehicles per hour per route - hourly_summary["vh_count"] = hourly_summary["vid"].apply(len) - hourly_summary["trip_count"] = hourly_summary["tatripid"].apply( - len) - hourly_summary["block_count"] = hourly_summary["tablockid"].apply( - len) - - bucket.put_object( - Body=hourly_summary.to_csv(index=False), - Key=f"bus_hourly_summary_v2/{date_str}.csv", - ) - pd.concat([hourly_summary_combined, hourly_summary]) - return data, errors, hourly_summary + combine_daily_files.combine_daily_files(day.to_date_string(), ['chn-ghost-buses-private']) if __name__ == "__main__": - start_date = "2022-07-17" - end_date = "2022-08-07" - - data, errors, hourly_summary = compute_hourly_totals(start_date, end_date) + app() diff --git 
a/scrape_data/__init__.py b/scrape_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrape_data/combine_daily_files.py b/scrape_data/combine_daily_files.py index 90e84ee..9b0b93a 100644 --- a/scrape_data/combine_daily_files.py +++ b/scrape_data/combine_daily_files.py @@ -1,5 +1,6 @@ import os import logging +from typing import List import boto3 import json @@ -20,7 +21,7 @@ logging.basicConfig(level=logging.INFO) -def combine_daily_files(date: str): +def combine_daily_files(date: str, bucket_list: List[str]): """Combine raw JSON files returned by API into daily CSVs. Args: @@ -28,7 +29,7 @@ def combine_daily_files(date: str): """ s3 = boto3.resource("s3") - for bucket_name in [BUCKET_PRIVATE, BUCKET_PUBLIC]: + for bucket_name in bucket_list: logging.info(f"processing data from {bucket_name}") bucket = s3.Bucket(bucket_name) objects = bucket.objects.filter(Prefix=f"bus_data/{date}") @@ -104,4 +105,4 @@ def combine_daily_files(date: str): def lambda_handler(event, context): date = pendulum.yesterday("America/Chicago").to_date_string() - combine_daily_files(date) + combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC]) diff --git a/utils/copy_raw_data_to_public_bucket.ipynb b/utils/copy_raw_data_to_public_bucket.ipynb new file mode 100644 index 0000000..2d847d7 --- /dev/null +++ b/utils/copy_raw_data_to_public_bucket.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "f25eb6c8", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import pendulum" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d480191d", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.resource('s3')\n", + "private_bucket = s3.Bucket('chn-ghost-buses-private')\n", + "public_bucket = s3.Bucket('chn-ghost-buses-public')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1211fabe", + "metadata": {}, + "outputs": [], + "source": [ + "# copy raw data from the private bucket to the public bucket \n", + "start_date = '2022-08-10'\n", + "end_date = '2022-10-08'\n", + "\n", + "date_range = [d for d in pendulum.period(pendulum.from_format(start_date, 'YYYY-MM-DD'), pendulum.from_format(end_date, 'YYYY-MM-DD')).range('days')]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "174dd1e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 2022-05-19 at 2022-10-10 11:22:23\n", + "Processing 2022-05-20 at 2022-10-10 11:22:28\n", + "Processing 2022-05-21 at 2022-10-10 11:23:36\n", + "Processing 2022-05-22 at 2022-10-10 11:24:37\n", + "Processing 2022-05-23 at 2022-10-10 11:25:35\n", + "Processing 2022-05-24 at 2022-10-10 11:26:41\n", + "Processing 2022-05-25 at 2022-10-10 11:27:48\n", + "Processing 2022-05-26 at 2022-10-10 11:28:56\n", + "Processing 2022-05-27 at 2022-10-10 11:30:06\n", + "Processing 2022-05-28 at 2022-10-10 11:31:15\n", + "Processing 2022-05-29 at 2022-10-10 11:32:19\n", + "Processing 2022-05-30 at 2022-10-10 11:33:18\n", + "Processing 2022-05-31 at 2022-10-10 11:34:17\n", + "Processing 2022-06-01 at 2022-10-10 11:35:24\n", + "Processing 2022-06-02 at 2022-10-10 11:36:31\n", + "Processing 2022-06-03 at 2022-10-10 11:37:36\n", + "Processing 2022-06-04 at 2022-10-10 11:38:41\n", + "Processing 2022-06-05 at 2022-10-10 11:39:42\n", + "Processing 2022-06-06 at 2022-10-10 11:40:41\n", + "Processing 2022-06-07 at 2022-10-10 11:41:47\n", + "Processing 2022-06-08 at 2022-10-10 11:42:51\n", + "Processing 
2022-06-09 at 2022-10-10 11:43:56\n", + "Processing 2022-06-10 at 2022-10-10 11:45:01\n", + "Processing 2022-06-11 at 2022-10-10 11:46:05\n", + "Processing 2022-06-12 at 2022-10-10 11:47:04\n", + "Processing 2022-06-13 at 2022-10-10 11:48:00\n", + "Processing 2022-06-14 at 2022-10-10 11:49:03\n", + "Processing 2022-06-15 at 2022-10-10 11:50:08\n", + "Processing 2022-06-16 at 2022-10-10 11:51:11\n", + "Processing 2022-06-17 at 2022-10-10 11:52:13\n", + "Processing 2022-06-18 at 2022-10-10 11:53:16\n", + "Processing 2022-06-19 at 2022-10-10 11:54:16\n", + "Processing 2022-06-20 at 2022-10-10 11:55:13\n", + "Processing 2022-06-21 at 2022-10-10 11:56:17\n", + "Processing 2022-06-22 at 2022-10-10 11:57:22\n", + "Processing 2022-06-23 at 2022-10-10 11:58:24\n", + "Processing 2022-06-24 at 2022-10-10 11:59:29\n", + "Processing 2022-06-25 at 2022-10-10 12:00:34\n", + "Processing 2022-06-26 at 2022-10-10 12:01:39\n", + "Processing 2022-06-27 at 2022-10-10 12:02:37\n", + "Processing 2022-06-28 at 2022-10-10 12:03:41\n", + "Processing 2022-06-29 at 2022-10-10 12:04:46\n", + "Processing 2022-06-30 at 2022-10-10 12:05:54\n", + "Processing 2022-07-01 at 2022-10-10 12:06:59\n", + "Processing 2022-07-02 at 2022-10-10 12:08:03\n", + "Processing 2022-07-03 at 2022-10-10 12:09:11\n", + "Processing 2022-07-04 at 2022-10-10 12:10:07\n", + "Processing 2022-07-05 at 2022-10-10 12:11:06\n", + "Processing 2022-07-06 at 2022-10-10 12:12:12\n", + "Processing 2022-07-07 at 2022-10-10 12:13:17\n", + "Processing 2022-07-08 at 2022-10-10 12:14:23\n", + "Processing 2022-07-09 at 2022-10-10 12:15:30\n", + "Processing 2022-07-10 at 2022-10-10 12:16:31\n", + "Processing 2022-07-11 at 2022-10-10 12:17:28\n", + "Processing 2022-07-12 at 2022-10-10 12:18:32\n", + "Processing 2022-07-13 at 2022-10-10 12:19:38\n", + "Processing 2022-07-14 at 2022-10-10 12:20:47\n", + "Processing 2022-07-15 at 2022-10-10 12:21:52\n", + "Processing 2022-07-16 at 2022-10-10 12:22:57\n", + "Processing 2022-07-17 at 2022-10-10 12:23:57\n", + "Processing 2022-07-18 at 2022-10-10 12:24:54\n", + "Processing 2022-07-19 at 2022-10-10 12:25:58\n", + "Processing 2022-07-20 at 2022-10-10 12:27:03\n", + "Processing 2022-07-21 at 2022-10-10 12:28:07\n", + "Processing 2022-07-22 at 2022-10-10 12:29:15\n", + "Processing 2022-07-23 at 2022-10-10 12:30:23\n", + "Processing 2022-07-24 at 2022-10-10 12:31:26\n", + "Processing 2022-07-25 at 2022-10-10 12:32:23\n", + "Processing 2022-07-26 at 2022-10-10 12:33:30\n", + "Processing 2022-07-27 at 2022-10-10 12:34:37\n", + "Processing 2022-07-28 at 2022-10-10 12:35:45\n", + "Processing 2022-07-29 at 2022-10-10 12:36:52\n", + "Processing 2022-07-30 at 2022-10-10 12:38:00\n", + "Processing 2022-07-31 at 2022-10-10 12:39:01\n", + "Processing 2022-08-01 at 2022-10-10 12:39:59\n", + "Processing 2022-08-02 at 2022-10-10 12:41:05\n", + "Processing 2022-08-03 at 2022-10-10 12:42:11\n", + "Processing 2022-08-04 at 2022-10-10 12:43:16\n", + "Processing 2022-08-05 at 2022-10-10 12:44:21\n", + "Processing 2022-08-06 at 2022-10-10 12:45:26\n", + "Processing 2022-08-07 at 2022-10-10 12:46:27\n", + "Processing 2022-08-08 at 2022-10-10 12:47:26\n", + "Processing 2022-08-09 at 2022-10-10 12:48:32\n", + "Processing 2022-08-10 at 2022-10-10 12:49:37\n", + "Processing 2022-08-11 at 2022-10-10 12:50:44\n", + "Processing 2022-08-12 at 2022-10-10 12:51:49\n", + "Processing 2022-08-13 at 2022-10-10 12:52:55\n", + "Processing 2022-08-14 at 2022-10-10 12:53:55\n", + "Processing 2022-08-15 at 2022-10-10 12:54:51\n", + "Processing 2022-08-16 at 
2022-10-10 12:55:56\n", + "Processing 2022-08-17 at 2022-10-10 12:57:01\n", + "Processing 2022-08-18 at 2022-10-10 12:58:08\n", + "Processing 2022-08-19 at 2022-10-10 12:59:15\n", + "Processing 2022-08-20 at 2022-10-10 13:00:25\n", + "Processing 2022-08-21 at 2022-10-10 13:01:26\n", + "Processing 2022-08-22 at 2022-10-10 13:02:23\n", + "Processing 2022-08-23 at 2022-10-10 13:03:29\n", + "Processing 2022-08-24 at 2022-10-10 13:04:34\n", + "Processing 2022-08-25 at 2022-10-10 13:05:43\n", + "Processing 2022-08-26 at 2022-10-10 13:06:50\n", + "Processing 2022-08-27 at 2022-10-10 13:07:53\n", + "Processing 2022-08-28 at 2022-10-10 13:08:57\n", + "Processing 2022-08-29 at 2022-10-10 13:09:59\n", + "Processing 2022-08-30 at 2022-10-10 13:11:04\n", + "Processing 2022-08-31 at 2022-10-10 13:12:12\n", + "Processing 2022-09-01 at 2022-10-10 13:13:22\n", + "Processing 2022-09-02 at 2022-10-10 13:14:33\n", + "Processing 2022-09-03 at 2022-10-10 13:15:44\n", + "Processing 2022-09-04 at 2022-10-10 13:16:54\n", + "Processing 2022-09-05 at 2022-10-10 13:17:56\n", + "Processing 2022-09-06 at 2022-10-10 13:19:01\n", + "Processing 2022-09-07 at 2022-10-10 13:20:13\n", + "Processing 2022-09-08 at 2022-10-10 13:21:20\n", + "Processing 2022-09-09 at 2022-10-10 13:22:29\n", + "Processing 2022-09-10 at 2022-10-10 13:23:35\n", + "Processing 2022-09-11 at 2022-10-10 13:24:41\n", + "Processing 2022-09-12 at 2022-10-10 13:25:41\n", + "Processing 2022-09-13 at 2022-10-10 13:26:49\n", + "Processing 2022-09-14 at 2022-10-10 13:27:59\n", + "Processing 2022-09-15 at 2022-10-10 13:29:08\n", + "Processing 2022-09-16 at 2022-10-10 13:30:16\n", + "Processing 2022-09-17 at 2022-10-10 13:31:27\n", + "Processing 2022-09-18 at 2022-10-10 13:32:34\n", + "Processing 2022-09-19 at 2022-10-10 13:33:35\n", + "Processing 2022-09-20 at 2022-10-10 13:34:45\n", + "Processing 2022-09-21 at 2022-10-10 13:35:57\n", + "Processing 2022-09-22 at 2022-10-10 13:37:08\n", + "Processing 2022-09-23 at 2022-10-10 13:38:17\n", + "Processing 2022-09-24 at 2022-10-10 13:39:25\n", + "Processing 2022-09-25 at 2022-10-10 13:40:33\n", + "Processing 2022-09-26 at 2022-10-10 13:41:37\n", + "Processing 2022-09-27 at 2022-10-10 13:42:46\n", + "Processing 2022-09-28 at 2022-10-10 13:43:56\n", + "Processing 2022-09-29 at 2022-10-10 13:45:05\n", + "Processing 2022-09-30 at 2022-10-10 13:46:14\n", + "Processing 2022-10-01 at 2022-10-10 13:47:20\n", + "Processing 2022-10-02 at 2022-10-10 13:48:28\n", + "Processing 2022-10-03 at 2022-10-10 13:49:30\n", + "Processing 2022-10-04 at 2022-10-10 13:50:43\n", + "Processing 2022-10-05 at 2022-10-10 13:51:52\n", + "Processing 2022-10-06 at 2022-10-10 13:52:59\n", + "Processing 2022-10-07 at 2022-10-10 13:54:06\n", + "Processing 2022-10-08 at 2022-10-10 13:55:15\n", + "Processing 2022-10-09 at 2022-10-10 13:56:22\n" + ] + } + ], + "source": [ + "for day in date_range:\n", + " date_str = day.to_date_string()\n", + " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", + " objects = private_bucket.objects.filter(Prefix = f'bus_data/{date_str}')\n", + " for obj in objects:\n", + " public_bucket.copy({'Bucket': private_bucket.name, 'Key': obj.key}, obj.key)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "433829c8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 2022-08-10 at 2022-10-10 17:01:41\n", + "Processing 2022-08-11 at 2022-10-10 17:01:43\n", + "Processing 2022-08-12 at 2022-10-10 17:01:44\n", + "Processing 
2022-08-13 at 2022-10-10 17:01:45\n", + "Processing 2022-08-14 at 2022-10-10 17:01:46\n", + "Processing 2022-08-15 at 2022-10-10 17:01:47\n", + "Processing 2022-08-16 at 2022-10-10 17:01:48\n", + "Processing 2022-08-17 at 2022-10-10 17:01:49\n", + "Processing 2022-08-18 at 2022-10-10 17:01:50\n", + "Processing 2022-08-19 at 2022-10-10 17:01:51\n", + "Processing 2022-08-20 at 2022-10-10 17:01:52\n", + "Processing 2022-08-21 at 2022-10-10 17:01:53\n", + "Processing 2022-08-22 at 2022-10-10 17:01:54\n", + "Processing 2022-08-23 at 2022-10-10 17:01:55\n", + "Processing 2022-08-24 at 2022-10-10 17:01:57\n", + "Processing 2022-08-25 at 2022-10-10 17:01:57\n", + "Processing 2022-08-26 at 2022-10-10 17:01:58\n", + "Processing 2022-08-27 at 2022-10-10 17:01:59\n", + "Processing 2022-08-28 at 2022-10-10 17:02:01\n", + "Processing 2022-08-29 at 2022-10-10 17:02:03\n", + "Processing 2022-08-30 at 2022-10-10 17:02:04\n", + "Processing 2022-08-31 at 2022-10-10 17:02:05\n", + "Processing 2022-09-01 at 2022-10-10 17:02:06\n", + "Processing 2022-09-02 at 2022-10-10 17:02:07\n", + "Processing 2022-09-03 at 2022-10-10 17:02:08\n", + "Processing 2022-09-04 at 2022-10-10 17:02:09\n", + "Processing 2022-09-05 at 2022-10-10 17:02:10\n", + "Processing 2022-09-06 at 2022-10-10 17:02:11\n", + "Processing 2022-09-07 at 2022-10-10 17:02:13\n", + "Processing 2022-09-08 at 2022-10-10 17:02:14\n", + "Processing 2022-09-09 at 2022-10-10 17:02:15\n", + "Processing 2022-09-10 at 2022-10-10 17:02:16\n", + "Processing 2022-09-11 at 2022-10-10 17:02:17\n", + "Processing 2022-09-12 at 2022-10-10 17:02:18\n", + "Processing 2022-09-13 at 2022-10-10 17:02:19\n", + "Processing 2022-09-14 at 2022-10-10 17:02:20\n", + "Processing 2022-09-15 at 2022-10-10 17:02:21\n", + "Processing 2022-09-16 at 2022-10-10 17:02:22\n", + "Processing 2022-09-17 at 2022-10-10 17:02:23\n", + "Processing 2022-09-18 at 2022-10-10 17:02:24\n", + "Processing 2022-09-19 at 2022-10-10 17:02:25\n", + "Processing 2022-09-20 at 2022-10-10 17:02:27\n", + "Processing 2022-09-21 at 2022-10-10 17:02:28\n", + "Processing 2022-09-22 at 2022-10-10 17:02:29\n", + "Processing 2022-09-23 at 2022-10-10 17:02:30\n", + "Processing 2022-09-24 at 2022-10-10 17:02:31\n", + "Processing 2022-09-25 at 2022-10-10 17:02:32\n", + "Processing 2022-09-26 at 2022-10-10 17:02:33\n", + "Processing 2022-09-27 at 2022-10-10 17:02:34\n", + "Processing 2022-09-28 at 2022-10-10 17:02:35\n", + "Processing 2022-09-29 at 2022-10-10 17:02:36\n", + "Processing 2022-09-30 at 2022-10-10 17:02:37\n", + "Processing 2022-10-01 at 2022-10-10 17:02:38\n", + "Processing 2022-10-02 at 2022-10-10 17:02:39\n", + "Processing 2022-10-03 at 2022-10-10 17:02:40\n", + "Processing 2022-10-04 at 2022-10-10 17:02:42\n", + "Processing 2022-10-05 at 2022-10-10 17:02:43\n", + "Processing 2022-10-06 at 2022-10-10 17:02:44\n", + "Processing 2022-10-07 at 2022-10-10 17:02:45\n", + "Processing 2022-10-08 at 2022-10-10 17:02:47\n" + ] + } + ], + "source": [ + "for day in date_range:\n", + " date_str = day.to_date_string()\n", + " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", + " data_objects = private_bucket.objects.filter(Prefix = f'bus_full_day_data_v2/{date_str}.csv')\n", + " error_objects = private_bucket.objects.filter(Prefix = f'bus_full_day_errors_v2/{date_str}.csv')\n", + " for obj in data_objects:\n", + " public_bucket.copy({'Bucket': private_bucket.name, 'Key': obj.key}, obj.key)\n", + " for obj in error_objects: \n", + " public_bucket.copy({'Bucket': private_bucket.name, 'Key': 
obj.key}, obj.key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acf91f7e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 58110ac843296675af7630129c9aa1156ad4834a Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Mon, 10 Oct 2022 17:39:28 -0500 Subject: [PATCH 2/6] remove output cells and comment a little --- utils/copy_raw_data_to_public_bucket.ipynb | 246 ++------------------- 1 file changed, 15 insertions(+), 231 deletions(-) diff --git a/utils/copy_raw_data_to_public_bucket.ipynb b/utils/copy_raw_data_to_public_bucket.ipynb index 2d847d7..c40067e 100644 --- a/utils/copy_raw_data_to_public_bucket.ipynb +++ b/utils/copy_raw_data_to_public_bucket.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, - "id": "f25eb6c8", + "execution_count": null, + "id": "12ebecef", "metadata": {}, "outputs": [], "source": [ @@ -13,8 +13,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "d480191d", + "execution_count": null, + "id": "dc86c046", "metadata": {}, "outputs": [], "source": [ @@ -25,8 +25,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "1211fabe", + "execution_count": null, + "id": "4b83b078", "metadata": {}, "outputs": [], "source": [ @@ -39,162 +39,12 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "174dd1e4", + "execution_count": null, + "id": "204bb9d3", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing 2022-05-19 at 2022-10-10 11:22:23\n", - "Processing 2022-05-20 at 2022-10-10 11:22:28\n", - "Processing 2022-05-21 at 2022-10-10 11:23:36\n", - "Processing 2022-05-22 at 2022-10-10 11:24:37\n", - "Processing 2022-05-23 at 2022-10-10 11:25:35\n", - "Processing 2022-05-24 at 2022-10-10 11:26:41\n", - "Processing 2022-05-25 at 2022-10-10 11:27:48\n", - "Processing 2022-05-26 at 2022-10-10 11:28:56\n", - "Processing 2022-05-27 at 2022-10-10 11:30:06\n", - "Processing 2022-05-28 at 2022-10-10 11:31:15\n", - "Processing 2022-05-29 at 2022-10-10 11:32:19\n", - "Processing 2022-05-30 at 2022-10-10 11:33:18\n", - "Processing 2022-05-31 at 2022-10-10 11:34:17\n", - "Processing 2022-06-01 at 2022-10-10 11:35:24\n", - "Processing 2022-06-02 at 2022-10-10 11:36:31\n", - "Processing 2022-06-03 at 2022-10-10 11:37:36\n", - "Processing 2022-06-04 at 2022-10-10 11:38:41\n", - "Processing 2022-06-05 at 2022-10-10 11:39:42\n", - "Processing 2022-06-06 at 2022-10-10 11:40:41\n", - "Processing 2022-06-07 at 2022-10-10 11:41:47\n", - "Processing 2022-06-08 at 2022-10-10 11:42:51\n", - "Processing 2022-06-09 at 2022-10-10 11:43:56\n", - "Processing 2022-06-10 at 2022-10-10 11:45:01\n", - "Processing 2022-06-11 at 2022-10-10 11:46:05\n", - "Processing 2022-06-12 at 2022-10-10 11:47:04\n", - "Processing 2022-06-13 at 2022-10-10 11:48:00\n", - "Processing 2022-06-14 at 2022-10-10 11:49:03\n", - "Processing 2022-06-15 at 2022-10-10 11:50:08\n", - "Processing 2022-06-16 at 2022-10-10 11:51:11\n", - "Processing 2022-06-17 at 2022-10-10 11:52:13\n", - "Processing 2022-06-18 at 2022-10-10 11:53:16\n", - "Processing 2022-06-19 at 
2022-10-10 11:54:16\n", - "Processing 2022-06-20 at 2022-10-10 11:55:13\n", - "Processing 2022-06-21 at 2022-10-10 11:56:17\n", - "Processing 2022-06-22 at 2022-10-10 11:57:22\n", - "Processing 2022-06-23 at 2022-10-10 11:58:24\n", - "Processing 2022-06-24 at 2022-10-10 11:59:29\n", - "Processing 2022-06-25 at 2022-10-10 12:00:34\n", - "Processing 2022-06-26 at 2022-10-10 12:01:39\n", - "Processing 2022-06-27 at 2022-10-10 12:02:37\n", - "Processing 2022-06-28 at 2022-10-10 12:03:41\n", - "Processing 2022-06-29 at 2022-10-10 12:04:46\n", - "Processing 2022-06-30 at 2022-10-10 12:05:54\n", - "Processing 2022-07-01 at 2022-10-10 12:06:59\n", - "Processing 2022-07-02 at 2022-10-10 12:08:03\n", - "Processing 2022-07-03 at 2022-10-10 12:09:11\n", - "Processing 2022-07-04 at 2022-10-10 12:10:07\n", - "Processing 2022-07-05 at 2022-10-10 12:11:06\n", - "Processing 2022-07-06 at 2022-10-10 12:12:12\n", - "Processing 2022-07-07 at 2022-10-10 12:13:17\n", - "Processing 2022-07-08 at 2022-10-10 12:14:23\n", - "Processing 2022-07-09 at 2022-10-10 12:15:30\n", - "Processing 2022-07-10 at 2022-10-10 12:16:31\n", - "Processing 2022-07-11 at 2022-10-10 12:17:28\n", - "Processing 2022-07-12 at 2022-10-10 12:18:32\n", - "Processing 2022-07-13 at 2022-10-10 12:19:38\n", - "Processing 2022-07-14 at 2022-10-10 12:20:47\n", - "Processing 2022-07-15 at 2022-10-10 12:21:52\n", - "Processing 2022-07-16 at 2022-10-10 12:22:57\n", - "Processing 2022-07-17 at 2022-10-10 12:23:57\n", - "Processing 2022-07-18 at 2022-10-10 12:24:54\n", - "Processing 2022-07-19 at 2022-10-10 12:25:58\n", - "Processing 2022-07-20 at 2022-10-10 12:27:03\n", - "Processing 2022-07-21 at 2022-10-10 12:28:07\n", - "Processing 2022-07-22 at 2022-10-10 12:29:15\n", - "Processing 2022-07-23 at 2022-10-10 12:30:23\n", - "Processing 2022-07-24 at 2022-10-10 12:31:26\n", - "Processing 2022-07-25 at 2022-10-10 12:32:23\n", - "Processing 2022-07-26 at 2022-10-10 12:33:30\n", - "Processing 2022-07-27 at 2022-10-10 12:34:37\n", - "Processing 2022-07-28 at 2022-10-10 12:35:45\n", - "Processing 2022-07-29 at 2022-10-10 12:36:52\n", - "Processing 2022-07-30 at 2022-10-10 12:38:00\n", - "Processing 2022-07-31 at 2022-10-10 12:39:01\n", - "Processing 2022-08-01 at 2022-10-10 12:39:59\n", - "Processing 2022-08-02 at 2022-10-10 12:41:05\n", - "Processing 2022-08-03 at 2022-10-10 12:42:11\n", - "Processing 2022-08-04 at 2022-10-10 12:43:16\n", - "Processing 2022-08-05 at 2022-10-10 12:44:21\n", - "Processing 2022-08-06 at 2022-10-10 12:45:26\n", - "Processing 2022-08-07 at 2022-10-10 12:46:27\n", - "Processing 2022-08-08 at 2022-10-10 12:47:26\n", - "Processing 2022-08-09 at 2022-10-10 12:48:32\n", - "Processing 2022-08-10 at 2022-10-10 12:49:37\n", - "Processing 2022-08-11 at 2022-10-10 12:50:44\n", - "Processing 2022-08-12 at 2022-10-10 12:51:49\n", - "Processing 2022-08-13 at 2022-10-10 12:52:55\n", - "Processing 2022-08-14 at 2022-10-10 12:53:55\n", - "Processing 2022-08-15 at 2022-10-10 12:54:51\n", - "Processing 2022-08-16 at 2022-10-10 12:55:56\n", - "Processing 2022-08-17 at 2022-10-10 12:57:01\n", - "Processing 2022-08-18 at 2022-10-10 12:58:08\n", - "Processing 2022-08-19 at 2022-10-10 12:59:15\n", - "Processing 2022-08-20 at 2022-10-10 13:00:25\n", - "Processing 2022-08-21 at 2022-10-10 13:01:26\n", - "Processing 2022-08-22 at 2022-10-10 13:02:23\n", - "Processing 2022-08-23 at 2022-10-10 13:03:29\n", - "Processing 2022-08-24 at 2022-10-10 13:04:34\n", - "Processing 2022-08-25 at 2022-10-10 13:05:43\n", - "Processing 2022-08-26 at 2022-10-10 
13:06:50\n", - "Processing 2022-08-27 at 2022-10-10 13:07:53\n", - "Processing 2022-08-28 at 2022-10-10 13:08:57\n", - "Processing 2022-08-29 at 2022-10-10 13:09:59\n", - "Processing 2022-08-30 at 2022-10-10 13:11:04\n", - "Processing 2022-08-31 at 2022-10-10 13:12:12\n", - "Processing 2022-09-01 at 2022-10-10 13:13:22\n", - "Processing 2022-09-02 at 2022-10-10 13:14:33\n", - "Processing 2022-09-03 at 2022-10-10 13:15:44\n", - "Processing 2022-09-04 at 2022-10-10 13:16:54\n", - "Processing 2022-09-05 at 2022-10-10 13:17:56\n", - "Processing 2022-09-06 at 2022-10-10 13:19:01\n", - "Processing 2022-09-07 at 2022-10-10 13:20:13\n", - "Processing 2022-09-08 at 2022-10-10 13:21:20\n", - "Processing 2022-09-09 at 2022-10-10 13:22:29\n", - "Processing 2022-09-10 at 2022-10-10 13:23:35\n", - "Processing 2022-09-11 at 2022-10-10 13:24:41\n", - "Processing 2022-09-12 at 2022-10-10 13:25:41\n", - "Processing 2022-09-13 at 2022-10-10 13:26:49\n", - "Processing 2022-09-14 at 2022-10-10 13:27:59\n", - "Processing 2022-09-15 at 2022-10-10 13:29:08\n", - "Processing 2022-09-16 at 2022-10-10 13:30:16\n", - "Processing 2022-09-17 at 2022-10-10 13:31:27\n", - "Processing 2022-09-18 at 2022-10-10 13:32:34\n", - "Processing 2022-09-19 at 2022-10-10 13:33:35\n", - "Processing 2022-09-20 at 2022-10-10 13:34:45\n", - "Processing 2022-09-21 at 2022-10-10 13:35:57\n", - "Processing 2022-09-22 at 2022-10-10 13:37:08\n", - "Processing 2022-09-23 at 2022-10-10 13:38:17\n", - "Processing 2022-09-24 at 2022-10-10 13:39:25\n", - "Processing 2022-09-25 at 2022-10-10 13:40:33\n", - "Processing 2022-09-26 at 2022-10-10 13:41:37\n", - "Processing 2022-09-27 at 2022-10-10 13:42:46\n", - "Processing 2022-09-28 at 2022-10-10 13:43:56\n", - "Processing 2022-09-29 at 2022-10-10 13:45:05\n", - "Processing 2022-09-30 at 2022-10-10 13:46:14\n", - "Processing 2022-10-01 at 2022-10-10 13:47:20\n", - "Processing 2022-10-02 at 2022-10-10 13:48:28\n", - "Processing 2022-10-03 at 2022-10-10 13:49:30\n", - "Processing 2022-10-04 at 2022-10-10 13:50:43\n", - "Processing 2022-10-05 at 2022-10-10 13:51:52\n", - "Processing 2022-10-06 at 2022-10-10 13:52:59\n", - "Processing 2022-10-07 at 2022-10-10 13:54:06\n", - "Processing 2022-10-08 at 2022-10-10 13:55:15\n", - "Processing 2022-10-09 at 2022-10-10 13:56:22\n" - ] - } - ], + "outputs": [], "source": [ + "# copy fully raw JSON data from private to public\n", "for day in date_range:\n", " date_str = day.to_date_string()\n", " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", @@ -205,78 +55,12 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "433829c8", + "execution_count": null, + "id": "d2ac2752", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing 2022-08-10 at 2022-10-10 17:01:41\n", - "Processing 2022-08-11 at 2022-10-10 17:01:43\n", - "Processing 2022-08-12 at 2022-10-10 17:01:44\n", - "Processing 2022-08-13 at 2022-10-10 17:01:45\n", - "Processing 2022-08-14 at 2022-10-10 17:01:46\n", - "Processing 2022-08-15 at 2022-10-10 17:01:47\n", - "Processing 2022-08-16 at 2022-10-10 17:01:48\n", - "Processing 2022-08-17 at 2022-10-10 17:01:49\n", - "Processing 2022-08-18 at 2022-10-10 17:01:50\n", - "Processing 2022-08-19 at 2022-10-10 17:01:51\n", - "Processing 2022-08-20 at 2022-10-10 17:01:52\n", - "Processing 2022-08-21 at 2022-10-10 17:01:53\n", - "Processing 2022-08-22 at 2022-10-10 17:01:54\n", - "Processing 2022-08-23 at 2022-10-10 17:01:55\n", - "Processing 2022-08-24 at 2022-10-10 
17:01:57\n", - "Processing 2022-08-25 at 2022-10-10 17:01:57\n", - "Processing 2022-08-26 at 2022-10-10 17:01:58\n", - "Processing 2022-08-27 at 2022-10-10 17:01:59\n", - "Processing 2022-08-28 at 2022-10-10 17:02:01\n", - "Processing 2022-08-29 at 2022-10-10 17:02:03\n", - "Processing 2022-08-30 at 2022-10-10 17:02:04\n", - "Processing 2022-08-31 at 2022-10-10 17:02:05\n", - "Processing 2022-09-01 at 2022-10-10 17:02:06\n", - "Processing 2022-09-02 at 2022-10-10 17:02:07\n", - "Processing 2022-09-03 at 2022-10-10 17:02:08\n", - "Processing 2022-09-04 at 2022-10-10 17:02:09\n", - "Processing 2022-09-05 at 2022-10-10 17:02:10\n", - "Processing 2022-09-06 at 2022-10-10 17:02:11\n", - "Processing 2022-09-07 at 2022-10-10 17:02:13\n", - "Processing 2022-09-08 at 2022-10-10 17:02:14\n", - "Processing 2022-09-09 at 2022-10-10 17:02:15\n", - "Processing 2022-09-10 at 2022-10-10 17:02:16\n", - "Processing 2022-09-11 at 2022-10-10 17:02:17\n", - "Processing 2022-09-12 at 2022-10-10 17:02:18\n", - "Processing 2022-09-13 at 2022-10-10 17:02:19\n", - "Processing 2022-09-14 at 2022-10-10 17:02:20\n", - "Processing 2022-09-15 at 2022-10-10 17:02:21\n", - "Processing 2022-09-16 at 2022-10-10 17:02:22\n", - "Processing 2022-09-17 at 2022-10-10 17:02:23\n", - "Processing 2022-09-18 at 2022-10-10 17:02:24\n", - "Processing 2022-09-19 at 2022-10-10 17:02:25\n", - "Processing 2022-09-20 at 2022-10-10 17:02:27\n", - "Processing 2022-09-21 at 2022-10-10 17:02:28\n", - "Processing 2022-09-22 at 2022-10-10 17:02:29\n", - "Processing 2022-09-23 at 2022-10-10 17:02:30\n", - "Processing 2022-09-24 at 2022-10-10 17:02:31\n", - "Processing 2022-09-25 at 2022-10-10 17:02:32\n", - "Processing 2022-09-26 at 2022-10-10 17:02:33\n", - "Processing 2022-09-27 at 2022-10-10 17:02:34\n", - "Processing 2022-09-28 at 2022-10-10 17:02:35\n", - "Processing 2022-09-29 at 2022-10-10 17:02:36\n", - "Processing 2022-09-30 at 2022-10-10 17:02:37\n", - "Processing 2022-10-01 at 2022-10-10 17:02:38\n", - "Processing 2022-10-02 at 2022-10-10 17:02:39\n", - "Processing 2022-10-03 at 2022-10-10 17:02:40\n", - "Processing 2022-10-04 at 2022-10-10 17:02:42\n", - "Processing 2022-10-05 at 2022-10-10 17:02:43\n", - "Processing 2022-10-06 at 2022-10-10 17:02:44\n", - "Processing 2022-10-07 at 2022-10-10 17:02:45\n", - "Processing 2022-10-08 at 2022-10-10 17:02:47\n" - ] - } - ], + "outputs": [], "source": [ + "# copy daily CSVs from private to public\n", "for day in date_range:\n", " date_str = day.to_date_string()\n", " print(f\"Processing {date_str} at {pendulum.now().to_datetime_string()}\")\n", @@ -291,7 +75,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf91f7e", + "id": "397493c2", "metadata": {}, "outputs": [], "source": [] From 0bf4a0260248fef6b582acdb7556b3b731749bbb Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Tue, 11 Oct 2022 21:46:07 -0500 Subject: [PATCH 3/6] remove unused imports --- data_analysis/rt_daily_aggregations.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py index d21b1dd..7f81a0b 100644 --- a/data_analysis/rt_daily_aggregations.py +++ b/data_analysis/rt_daily_aggregations.py @@ -1,10 +1,6 @@ -import os -from typing import Tuple import logging import boto3 -import json -import pandas as pd import pendulum from tqdm import tqdm import typer From 8a6f012087413e6478c71607cd0c39ee79bc24f9 Mon Sep 17 00:00:00 2001 From: laurie merrell Date: Tue, 11 Oct 2022 22:10:01 -0500 Subject: [PATCH 4/6] PR review comments: add 
help text

---
 data_analysis/rt_daily_aggregations.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py
index 7f81a0b..d835bdc 100644
--- a/data_analysis/rt_daily_aggregations.py
+++ b/data_analysis/rt_daily_aggregations.py
@@ -15,8 +15,16 @@ logging.basicConfig(level=logging.INFO)
 
 
 @app.command()
-def combine_daily_totals(start_date: str, end_date: str):
-
+def combine_daily_totals(
+    start_date: str = typer.Argument(..., help = "Start date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be before end date."),
+    end_date: str = typer.Argument(..., help = "End date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be after start date."),
+    bucket_type: str = typer.Argument(..., help = "Either 'public' or 'private' -- bucket to read data from. User running must have read access to the bucket in question, so general users should list 'public'.")):
+    """
+    Take all the raw JSON files for each date in a date range from one of the CHN ghost buses S3 buckets (private or public)
+    and aggregate them into daily CSV files.
+    TODO: Add save parameter so these can be saved locally rather than just back to the bucket.
+    """
+
     date_range = [
         d
         for d in pendulum.period(
@@ -27,7 +35,7 @@ def combine_daily_totals(
     pbar = tqdm(date_range)
 
     for day in pbar:
-        combine_daily_files.combine_daily_files(day.to_date_string(), ['chn-ghost-buses-private'])
+        combine_daily_files.combine_daily_files(day.to_date_string(), [f'chn-ghost-buses-{bucket_type}'])
 
 
 if __name__ == "__main__":

From 3ccb283e3331c76a32c3d02d32b80bd11a7977b0 Mon Sep 17 00:00:00 2001
From: laurie merrell
Date: Sun, 16 Oct 2022 21:24:46 -0500
Subject: [PATCH 5/6] add save parameter to combine_daily_files

---
 scrape_data/combine_daily_files.py | 47 ++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/scrape_data/combine_daily_files.py b/scrape_data/combine_daily_files.py
index 9b0b93a..0b8ad1b 100644
--- a/scrape_data/combine_daily_files.py
+++ b/scrape_data/combine_daily_files.py
@@ -1,6 +1,6 @@
 import os
 import logging
-from typing import List
+from typing import List, Optional
 
 import boto3
 import json
@@ -21,7 +21,7 @@
 logging.basicConfig(level=logging.INFO)
 
 
-def combine_daily_files(date: str, bucket_list: List[str]):
+def combine_daily_files(date: str, bucket_list: List[str], save: Optional[str] = None):
     """Combine raw JSON files returned by API into daily CSVs.
 
     Args:
@@ -78,13 +78,22 @@ def combine_daily_files(date: str, bucket_list: List[str]):
         data = pd.concat(data_list, ignore_index=True)
         errors = pd.concat(errors_list, ignore_index=True)
 
+        logging.info(f"found {len(errors)} errors and {len(data)} data points for {date}")
+
         if len(errors) > 0:
-            error_key = f"bus_full_day_errors_v2/{date}.csv"
-            logging.info(f"saving errors to {bucket}/{error_key}")
-            bucket.put_object(
-                Body=errors.to_csv(index=False),
-                Key=error_key,
-            )
+            if save == "bucket":
+                error_key = f"bus_full_day_errors_v2/{date}.csv"
+                logging.info(f"saving errors to {bucket}/{error_key}")
+                bucket.put_object(
+                    Body=errors.to_csv(index=False),
+                    Key=error_key,
+                )
+            if save == "local":
+                local_filename = f"ghost_buses_full_day_errors_from_{bucket}_{date}.csv"
+                logging.info(f"saving errors to {local_filename}")
+                errors.to_csv(local_filename, index = False)
+        else:
+            logging.info(f"no errors found for {date}, not saving any error file")
 
         if len(data) > 0:
             # convert data time to actual datetime
@@ -94,15 +103,23 @@
             data["data_hour"] = data.data_time.dt.hour
             data["data_date"] = data.data_time.dt.date
 
+            if save == "bucket":
+                data_key = f"bus_full_day_data_v2/{date}.csv"
+                logging.info(f"saving data to {bucket}/{data_key}")
+                bucket.put_object(
+                    Body=data.to_csv(index=False),
+                    Key=data_key,
+                )
+            if save == "local":
+                local_filename = f"ghost_buses_full_day_data_from_{bucket}_{date}.csv"
+                logging.info(f"saving errors to {local_filename}")
+                data.to_csv(local_filename, index = False)
+        else:
+            logging.info(f"no data found for {date}, not saving any data file")
-            data_key = f"bus_full_day_data_v2/{date}.csv"
-            logging.info(f"saving data to {bucket}/{data_key}")
-            bucket.put_object(
-                Body=data.to_csv(index=False),
-                Key=data_key,
-            )
 
+def add_daily_data_to_combined_file(content: pd.DataFrame, )
 
 def lambda_handler(event, context):
     date = pendulum.yesterday("America/Chicago").to_date_string()
-    combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC])
+    combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC], save = "bucket")

From ae03fc09b1b3d00f29abba9091d440958dbff48f Mon Sep 17 00:00:00 2001
From: laurie merrell
Date: Sun, 16 Oct 2022 21:52:33 -0500
Subject: [PATCH 6/6] actually add save parameter, make combine daily files return results

---
 data_analysis/rt_daily_aggregations.py | 9 ++++++---
 scrape_data/combine_daily_files.py     | 9 ++++-----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/data_analysis/rt_daily_aggregations.py b/data_analysis/rt_daily_aggregations.py
index d835bdc..14bcddd 100644
--- a/data_analysis/rt_daily_aggregations.py
+++ b/data_analysis/rt_daily_aggregations.py
@@ -18,7 +18,9 @@ def combine_daily_totals(
     start_date: str = typer.Argument(..., help = "Start date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be before end date."),
     end_date: str = typer.Argument(..., help = "End date in format YYYY-MM-DD. Acceptable dates are any date between 2022-05-19 and yesterday (so, if today is 2022-10-11, can input any date up to 2022-10-10. Must be after start date."),
-    bucket_type: str = typer.Argument(..., help = "Either 'public' or 'private' -- bucket to read data from. User running must have read access to the bucket in question, so general users should list 'public'.")):
+    bucket_type: str = typer.Argument(..., help = "Either 'public' or 'private' -- bucket to read data from. User running must have read access to the bucket in question, so general users should list 'public'."),
+    save: str = typer.Argument(..., help = "Optional string 'bucket' or 'local' indicating whether you want to save your combined files in the bucket they're being read from or locally. If empty, files will not be saved.")
+    ):
     """
     Take all the raw JSON files for each date in a date range from one of the CHN ghost buses S3 buckets (private or public)
     and aggregate them into daily CSV files.
     TODO: Add save parameter so these can be saved locally rather than just back to the bucket.
     """
@@ -35,8 +37,9 @@ def combine_daily_totals(
     pbar = tqdm(date_range)
 
     for day in pbar:
-        combine_daily_files.combine_daily_files(day.to_date_string(), [f'chn-ghost-buses-{bucket_type}'])
-
+        data, errors = combine_daily_files.combine_daily_files(day.to_date_string(), [f'chn-ghost-buses-{bucket_type}'], save)
+# can run from the root of the repo like:
+# python3 -m data_analysis.rt_daily_aggregations
 
 if __name__ == "__main__":
     app()
diff --git a/scrape_data/combine_daily_files.py b/scrape_data/combine_daily_files.py
index 0b8ad1b..8ce1a71 100644
--- a/scrape_data/combine_daily_files.py
+++ b/scrape_data/combine_daily_files.py
@@ -89,7 +89,7 @@ def combine_daily_files(date: str, bucket_list: List[str], save: Optional[str] =
                     Key=error_key,
                 )
             if save == "local":
-                local_filename = f"ghost_buses_full_day_errors_from_{bucket}_{date}.csv"
+                local_filename = f"ghost_buses_full_day_errors_from_{bucket.name}_{date}.csv"
                 logging.info(f"saving errors to {local_filename}")
                 errors.to_csv(local_filename, index = False)
         else:
@@ -111,15 +111,14 @@ def combine_daily_files(date: str, bucket_list: List[str], save: Optional[str] =
             if save == "local":
-                local_filename = f"ghost_buses_full_day_data_from_{bucket}_{date}.csv"
+                local_filename = f"ghost_buses_full_day_data_from_{bucket.name}_{date}.csv"
                 logging.info(f"saving errors to {local_filename}")
                 data.to_csv(local_filename, index = False)
         else:
             logging.info(f"no data found for {date}, not saving any data file")
 
-def add_daily_data_to_combined_file(content: pd.DataFrame, )
-
+    return data, errors
 
 def lambda_handler(event, context):
     date = pendulum.yesterday("America/Chicago").to_date_string()
-    combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC], save = "bucket")
+    data, errors = combine_daily_files(date, [BUCKET_PRIVATE, BUCKET_PUBLIC], save = "bucket")
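
A quick usage sketch of the aggregation entry point these patches produce (assuming the final patch is applied, the command is run from the repo root as the in-code comment notes, and the caller has AWS credentials with read access to the chosen bucket; the dates, bucket, and save mode below are only illustrative examples, not values from the patches):

# The Typer CLI added in patches 4-6 can be invoked roughly like:
#   python3 -m data_analysis.rt_daily_aggregations 2022-10-01 2022-10-08 public local
# which combines each day's raw JSON into daily CSVs and, with save='local', writes them locally.
# The same work can be done directly from Python by calling the helper the CLI wraps:
from scrape_data import combine_daily_files

# combine one example day of raw JSON from the public bucket, keeping the CSVs locally
data, errors = combine_daily_files.combine_daily_files(
    "2022-10-01", ["chn-ghost-buses-public"], save="local"
)
print(f"{len(data)} vehicle records, {len(errors)} API error records")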