From 865f4ea2404e18b7ab7fc5692c79a187744f2615 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 15 Jul 2019 14:46:17 -0500 Subject: [PATCH 01/46] create folder and save functions --- .../get_single_tidepool_dataset.py | 91 ++++++++++++++----- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py index 290b5324..8d89c2c5 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py @@ -56,7 +56,7 @@ parser.add_argument( "-u", "--userid", - dest="userid_of_shared_user", + dest="userid", default=np.nan, help="userid of account shared with the donor group or master account" ) @@ -111,6 +111,55 @@ def make_folder_if_doesnt_exist(folder_paths): return +def create_output_folder( + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="not-specified", + phi=True +): + if phi: + date_stamp = "PHI-" + date_stamp + donor_folder = os.path.join(data_path, date_stamp + "-donor-data") + dataset_path = os.path.join( + donor_folder, + date_stamp + "-" + folder_name + ) + make_folder_if_doesnt_exist(dataset_path) + + return dataset_path + + +def save_df( + df, + userid=args.userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="not-specified", + phi=True +): + + output_folder = create_output_folder( + data_path=data_path, + date_stamp=date_stamp, + folder_name=folder_name, + phi=phi + ) + + # if the data contains phi, add prefix to the file + if phi: + phi_prefix = 'PHI-' + else: + phi_prefix = '' + output_path = os.path.join( + output_folder, + phi_prefix + userid + "-dataSummary.csv.gz" + ) + + df.to_csv(output_path) + + return output_path + + def get_data_api(userid, startDate, endDate, headers): startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" @@ -145,7 +194,7 @@ def get_data_api(userid, startDate, endDate, headers): def get_data( weeks_of_data=10*52, donor_group=np.nan, - userid_of_shared_user=np.nan, + userid=np.nan, auth=np.nan, email=np.nan, password=np.nan, @@ -180,8 +229,8 @@ def get_data( else: sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code)) - if pd.isnull(userid_of_shared_user): - userid_of_shared_user = userid_master + if pd.isnull(userid): + userid = userid_master print( "getting data for the master account since no shared " + "user account was given" @@ -204,7 +253,7 @@ def get_data( endDate.day + 1 ) year_df, endDate = get_data_api( - userid_of_shared_user, + userid, startDate, endDate, headers @@ -222,7 +271,7 @@ def get_data( ) df, _ = get_data_api( - userid_of_shared_user, + userid, startDate, endDate, headers @@ -241,7 +290,7 @@ def get_data( auth[0] + ":" + str(api_response.status_code) ) - return df, userid_of_shared_user + return df, userid # %% START OF CODE @@ -250,40 +299,32 @@ def get_and_save_dataset( data_path=args.data_path, weeks_of_data=args.weeks_of_data, donor_group=args.donor_group, - userid_of_shared_user=args.userid_of_shared_user, + userid=args.userid, auth=args.auth, email=args.email, password=args.password ): - # create output folders if they don't exist - - phi_date_stamp = "PHI-" + date_stamp - donor_folder = os.path.join(data_path, phi_date_stamp + "-donor-data") - - dataset_path = os.path.join( - donor_folder, - phi_date_stamp + "-csvData" - ) - make_folder_if_doesnt_exist(dataset_path) # 
get dataset data, userid = get_data( weeks_of_data=weeks_of_data, donor_group=donor_group, - userid_of_shared_user=userid_of_shared_user, + userid=userid, auth=auth, email=email, password=password ) # save data - dataset_output_path = os.path.join( - dataset_path, - 'PHI-' + userid + ".csv" + _ = save_df( + data, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="csvData", + phi=True ) - data.to_csv(dataset_output_path) - if __name__ == "__main__": get_and_save_dataset( @@ -291,7 +332,7 @@ def get_and_save_dataset( data_path=args.data_path, weeks_of_data=args.weeks_of_data, donor_group=args.donor_group, - userid_of_shared_user=args.userid_of_shared_user, + userid=args.userid, auth=args.auth, email=args.email, password=args.password From 10116db6b847e3ce43d2fa16c78111f4c8a14539 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 15 Jul 2019 22:02:35 -0500 Subject: [PATCH 02/46] save as gzipped csv --- .../get_all_donor_data_batch_process.py | 36 ++++++++++++++++--- .../get_single_donor_metadata.py | 2 +- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py index 15daa252..8e81b372 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py @@ -117,12 +117,11 @@ def get_all_data(userid, donor_group): metadata_path = os.path.join( args.data_path, - "PHI-" + "2019-07-13" + "-donor-data", - "PHI-" + "2019-07-13" + "-metadata" - + phi_date_stamp + "-donor-data", + phi_date_stamp + "-metadata" ) -all_files = glob.glob(os.path.join(metadata_path, "*.csv")) +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) all_metadata = pd.DataFrame() for f in all_files: temp_meta = pd.read_csv(f) @@ -137,3 +136,32 @@ def get_all_data(userid, donor_group): os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv") ) print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DATASET INFO (METADATA) +print("combining all dataset metadata") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-datasetSummary" +) + +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +dataset_metadata = pd.DataFrame() +for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True) + userid = f[-32:-22] + temp_meta["userid"] = userid + dataset_metadata = pd.concat( + [dataset_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +dataset_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv") +) +print("saving all-dataset-info-metadata...code complete") + diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py index 3135ff41..e02708a9 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py @@ -233,7 +233,7 @@ def get_and_save_metadata( # save data meta_output_path = os.path.join( metadata_path, - 'PHI-' + userid + ".csv" + 'PHI-' + userid + ".csv.gz" ) meta_df.to_csv(meta_output_path) From fe9a04a62777ef4a8124beb4e89649bf5e249188 Mon Sep 17 00:00:00 2001 From: 
Ed Nykaza Date: Mon, 15 Jul 2019 22:04:09 -0500 Subject: [PATCH 03/46] capture dataset info --- .../get-donor-data/get_single_dataset_info.py | 357 ++++++++++++++++++ .../get_single_tidepool_dataset.py | 293 +++++++------- 2 files changed, 496 insertions(+), 154 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py new file mode 100644 index 00000000..d1ddab75 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py @@ -0,0 +1,357 @@ +# -*- coding: utf-8 -*- +"""get_donor_data_and_metadata.py +This code takes a tidepool dataset as input, and gives +a description of the type of data in the dataset. +""" + + +# %% REQUIRED LIBRARIES +import pandas as pd +import datetime as dt +import numpy as np +import os +import ast +import argparse + + +# %% FUNCTIONS +def get_type(val): + return type(val).__name__ + + +def get_len(val): + return len(val) + + +def get_val(val, k): + return val[k] + + +def literal_return(val): + try: + return ast.literal_eval(val) + except (ValueError, SyntaxError): + return val + + +def remove_cols(df, cols_to_remove): + + temp_remove_cols = list(set(df) & set(cols_to_remove)) + tempDf = df[temp_remove_cols] + df = df.drop(columns=temp_remove_cols) + + return df, tempDf + + +def make_folder_if_doesnt_exist(folder_paths): + ''' function requires a single path or a list of paths''' + if not isinstance(folder_paths, list): + folder_paths = [folder_paths] + for folder_path in folder_paths: + if not os.path.exists(folder_path): + os.makedirs(folder_path) + return + + +def create_output_folder( + data_path, + date_stamp, + folder_name, + phi=True +): + if phi: + date_stamp = "PHI-" + date_stamp + donor_folder = os.path.join(data_path, date_stamp + "-donor-data") + dataset_path = os.path.join( + donor_folder, + date_stamp + "-" + folder_name + ) + make_folder_if_doesnt_exist(dataset_path) + + return dataset_path + + +def save_df( + df, + userid, + data_path, + date_stamp, + folder_name, + phi=True, + name_suffix="", +): + + output_folder = create_output_folder( + data_path=data_path, + date_stamp=date_stamp, + folder_name=folder_name, + phi=phi + ) + + # if the data contains phi, add prefix to the file + if phi: + phi_prefix = 'PHI-' + else: + phi_prefix = '' + output_path = os.path.join( + output_folder, + phi_prefix + userid + "{}.csv.gz".format(name_suffix) + ) + + df.to_csv(output_path) + + return output_path + + +def expand_df(df, do_not_expand_list=[]): + + # remove fields that we don't want to flatten + df, hold_df = remove_cols(df, do_not_expand_list) + + # get a description of the original columns + col_df = pd.DataFrame(df.dtypes, columns=["dtype"]) + + # go through each dtype that is an object to see if it + # contains strings, mixed datatypes, embedded json, or lists + col_df["nObjectTypes"] = np.nan + col_df["objectType"] = np.nan + + new_df = pd.DataFrame() + for col in col_df[col_df["dtype"] == "object"].index: + rows = df.index[df[col].notnull()].tolist() + + # sometimes the object gets wrapped in a string + literal_df = pd.DataFrame(df.loc[rows, col].apply(literal_return)) + + # see if there are mixed ojbect types + type_df = pd.DataFrame(literal_df.loc[rows, col].apply(get_type)) + unique_types = type_df[col].unique() + col_df.loc[col, "nObjectTypes"] = len(unique_types) + col_df.loc[col, 
"objectType"] = str(unique_types) + + # USE UNDERSCORE FOR LIST EXPANSION + if "list" in col_df.loc[col, "objectType"]: + list_df = pd.DataFrame(literal_df.loc[type_df[col] == "list", col]) + list_df["len"] = list_df[col].apply(get_len) + + for i in np.arange(1, list_df["len"].max() + 1): + blob_df = pd.DataFrame( + list_df.loc[ + list_df["len"] >= i, col + ].apply(get_val, k=i-1) + ).add_suffix('_' + str(i)) + + new_df = pd.concat([new_df, blob_df], axis=1) + + # USE DOT FOR JSON (DICT) EXPANSION + if "dict" in col_df.loc[col, "objectType"]: + json_blob = literal_df.loc[type_df[col] == "dict", col] + blob_df = pd.DataFrame( + json_blob.tolist(), + index=json_blob.index + ).add_prefix(col + '.') + new_df = pd.concat([new_df, blob_df], axis=1) + + # merge the dataframes together + df = pd.concat([df, new_df, hold_df], axis=1) + + df.sort_index(axis=1, inplace=True) + + return df, col_df + + +def expand_data(starting_df, depth=10): + print("\ninitial df has {} columns".format(len(starting_df.columns))) + print("starting expansion ...") + temp_df, temp_col = expand_df(starting_df) + col_df = temp_col.copy() + skip_columns = starting_df.columns.tolist() + d = 1 + n_col_expanded = len(list(temp_df)) - len(list(starting_df)) + print("{} columns added". format(n_col_expanded)) + + while not ((d >= depth) | (len(temp_col) == 0)): + print("expanding layer {} ... ".format(d)) + next_skip_columns = temp_df.columns.tolist() + temp_df, temp_col = expand_df(temp_df, skip_columns) + skip_columns = next_skip_columns.copy() + + col_df = pd.concat([col_df, temp_col]) + n_col_expanded = len(list(temp_df)) - len(next_skip_columns) + print("{} columns added". format(n_col_expanded)) + d += 1 + + print("expansion complete...getting dataset summary info...") + + col_df.sort_index(inplace=True) + + # get the start and end time for each data type + print("getting data start and end times for each data type ...") + col_df["startTime"] = np.nan + col_df["endTime"] = np.nan + for col in col_df.index: + try: + start_time = temp_df.loc[temp_df[col].notnull(), ["time"]].min() + end_time = temp_df.loc[temp_df[col].notnull(), ["time"]].max() + col_df.loc[col, "startTime"] = start_time.values[0] + col_df.loc[col, "endTime"] = end_time.values[0] + except: + print(col, "missing timestamp") + + # get summary information + print("getting summary information ...") + df_info = pd.DataFrame(temp_df.describe(include='all').T) + df_info.loc["_all", ["count", "unique"]] = temp_df.shape + df_info.sort_index(inplace=True) + + # add which type (or subtype) each column comes from + for typeType in ["type", "subType"]: + if typeType in list(starting_df): + type_groups = temp_df.groupby(by=typeType) + not_null_index = temp_df[typeType].notnull() + for type_ in temp_df.loc[not_null_index, typeType].unique(): + type_df = type_groups.get_group(type_).dropna( + axis=1, + how="all" + ) + df_info.loc[type_df.columns, typeType + "=" + type_] = type_ + + # get memory size of each data type + print("getting memory information ...") + mem_usage = pd.DataFrame( + temp_df.memory_usage(index=True, deep=True), + columns=["memorySize"] + ) + mem_usage.rename(index={"Index": "_all"}, inplace=True) + df_info["memorySize"] = mem_usage["memorySize"] + df_info.loc["_all", "memorySize"] = temp_df.memory_usage( + index=True, deep=True + ).sum() + + # combine with col_summary + summary_df = pd.concat([col_df, df_info], axis=1, sort=True) + + # get/add a list of string values + print("getting a a list of string values ...") + str_cols = summary_df[ + 
((summary_df["objectType"] == "['str']") & + (summary_df["unique"] > 1) & + (summary_df["unique"] < 50) + ) + ].index + for str_col in str_cols: + not_null_index = temp_df[str_col].notnull() + str_vals = temp_df.loc[not_null_index, str_col].unique().tolist() + summary_df.loc[str_col, "strVals"] = str(str_vals) + + print("dataset summary info complete\n") + + return summary_df, temp_df + + +# %% START OF CODE +def get_dataset_info( + data, + date_stamp, + data_path, + userid, + save_expanded +): + + if userid == "not-specified": + userid = input("Enter userid of dataset you want info on:\n") + + if type(data) is float: # np.nan is a float + dataset_folder = create_output_folder( + data_path, + date_stamp, + "csvData" + ) + dataset_path = os.path.join( + dataset_folder, + "PHI-{}.csv.gz".format(userid) + ) + data = pd.read_csv(dataset_path, low_memory=False, index_col=0) + + # expand embedded lists and json within dataset + summary_df, expanded_df = expand_data(data.copy(), depth=10) + + # save summary data + _ = save_df( + summary_df, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="datasetSummary", + phi=True, + name_suffix="-datasetSummary" + ) + + if save_expanded: + # save expanded data + _ = save_df( + expanded_df, + userid=userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="expandedData", + phi=True, + name_suffix="-expandedData" + ) + + +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get an overview of the columns and data in the dataset" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default="not-specified", + help="userid of the dataset you are interested in" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-s", + "--save-expanded-dataset", + dest="save_expanded", + default=True, + help=( + "specify if you want to save the expanded datafram (True/False)" + + "NOTE: these files can be rather large" + ) + ) + + args = parser.parse_args() + + # main function + get_dataset_info( + data=np.nan, + date_stamp=args.date_stamp, + data_path=args.data_path, + userid=args.userid, + save_expanded=args.save_expanded + ) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py index 8d89c2c5..7526fe77 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py @@ -8,6 +8,7 @@ """ # %% REQUIRED LIBRARIES +from get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt import numpy as np @@ -16,7 +17,6 @@ import getpass import requests import json -import pdb import argparse envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if envPath not in sys.path: @@ -24,142 +24,7 @@ import environmentalVariables -# %% USER INPUTS (choices to be made in order to run the code) -codeDescription = "get 
donor metadata" -parser = argparse.ArgumentParser(description=codeDescription) - -parser.add_argument( - "-d", - "--date-stamp", - dest="date_stamp", - default=dt.datetime.now().strftime("%Y-%m-%d"), - help="date, in '%Y-%m-%d' format, of the date when " + - "donors were accepted" -) - -parser.add_argument( - "-w", - "--weeks-of-data", - dest="weeks_of_data", - default=52*10, - help="enter the number of weeks of data you want to download" -) - -parser.add_argument( - "-dg", - "--donor-group", - dest="donor_group", - default=np.nan, - help="name of the donor group in the tidepool .env file" -) - -parser.add_argument( - "-u", - "--userid", - dest="userid", - default=np.nan, - help="userid of account shared with the donor group or master account" -) - -parser.add_argument( - "-a", - "--auth", - dest="auth", - default=np.nan, - help="tuple that contains (email, password)" -) - -parser.add_argument( - "-e", - "--email", - dest="email", - default=np.nan, - help="email address of the master account" -) - -parser.add_argument( - "-p", - "--password", - dest="password", - default=np.nan, - help="password of the master account" -) - -parser.add_argument( - "-o", - "--output-data-path", - dest="data_path", - default=os.path.abspath( - os.path.join( - os.path.dirname(__file__), "..", "data" - ) - ), - help="the output path where the data is stored" -) - -args = parser.parse_args() - - # %% FUNCTIONS -def make_folder_if_doesnt_exist(folder_paths): - ''' function requires a single path or a list of paths''' - if not isinstance(folder_paths, list): - folder_paths = [folder_paths] - for folder_path in folder_paths: - if not os.path.exists(folder_path): - os.makedirs(folder_path) - return - - -def create_output_folder( - data_path=args.data_path, - date_stamp=args.date_stamp, - folder_name="not-specified", - phi=True -): - if phi: - date_stamp = "PHI-" + date_stamp - donor_folder = os.path.join(data_path, date_stamp + "-donor-data") - dataset_path = os.path.join( - donor_folder, - date_stamp + "-" + folder_name - ) - make_folder_if_doesnt_exist(dataset_path) - - return dataset_path - - -def save_df( - df, - userid=args.userid, - data_path=args.data_path, - date_stamp=args.date_stamp, - folder_name="not-specified", - phi=True -): - - output_folder = create_output_folder( - data_path=data_path, - date_stamp=date_stamp, - folder_name=folder_name, - phi=phi - ) - - # if the data contains phi, add prefix to the file - if phi: - phi_prefix = 'PHI-' - else: - phi_prefix = '' - output_path = os.path.join( - output_folder, - phi_prefix + userid + "-dataSummary.csv.gz" - ) - - df.to_csv(output_path) - - return output_path - - def get_data_api(userid, startDate, endDate, headers): startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" @@ -295,14 +160,15 @@ def get_data( # %% START OF CODE def get_and_save_dataset( - date_stamp=args.date_stamp, - data_path=args.data_path, - weeks_of_data=args.weeks_of_data, - donor_group=args.donor_group, - userid=args.userid, - auth=args.auth, - email=args.email, - password=args.password + date_stamp, + data_path, + weeks_of_data, + donor_group, + userid, + auth, + email, + password, + expand_dataset ): # get dataset @@ -315,18 +181,136 @@ def get_and_save_dataset( password=password ) - # save data - _ = save_df( - data, - userid=userid, - data_path=data_path, - date_stamp=date_stamp, - folder_name="csvData", - phi=True - ) + # if the there is data + if len(data) > 1: + # save data + print("saving csv data...") + _ = save_df( + data, + userid=userid, + data_path=data_path, + 
date_stamp=date_stamp, + folder_name="csvData", + phi=True + ) + + # get dataset info + if expand_dataset: + summary_df, expanded_df = expand_data(data) + print("saving summary data...") + _ = save_df( + summary_df, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="datasetSummary", + phi=True, + name_suffix="-datasetSummary" + ) + + # save expanded data + print("saving expanded data...") + _ = save_df( + expanded_df, + userid=userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="expandedData", + phi=True, + name_suffix="-expandedData" + ) + else: + print("{} has no data".format(userid)) if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor metadata" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-w", + "--weeks-of-data", + dest="weeks_of_data", + default=52*10, + help="enter the number of weeks of data you want to download" + ) + + parser.add_argument( + "-dg", + "--donor-group", + dest="donor_group", + default=np.nan, + help="name of the donor group in the tidepool .env file" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid of account shared with the donor group or master account" + ) + + parser.add_argument( + "-a", + "--auth", + dest="auth", + default=np.nan, + help="tuple that contains (email, password)" + ) + + parser.add_argument( + "-e", + "--email", + dest="email", + default=np.nan, + help="email address of the master account" + ) + + parser.add_argument( + "-p", + "--password", + dest="password", + default=np.nan, + help="password of the master account" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-ex", + "--expand-dataset", + dest="expand_dataset", + default=True, + help=( + "specify if you want to get/save the expanded datafram (True/False)" + + "NOTE: this process is time consuming" + ) + ) + + args = parser.parse_args() + + # the main function get_and_save_dataset( date_stamp=args.date_stamp, data_path=args.data_path, @@ -335,5 +319,6 @@ def get_and_save_dataset( userid=args.userid, auth=args.auth, email=args.email, - password=args.password + password=args.password, + expand_dataset=args.expand_dataset ) From c80ca20c0e9205578d54bde7b1a92d87e9755ab3 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 17 Jul 2019 09:45:44 -0500 Subject: [PATCH 04/46] sort by userid instead of donor group --- .../get-donor-data/accept_new_donors_and_get_donor_list.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py b/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py index 0d8c4a41..b17f5c9e 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py @@ -16,6 +16,7 @@ import requests import json import argparse +import pdb envPath = 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if envPath not in sys.path: sys.path.insert(0, envPath) @@ -247,7 +248,7 @@ def accept_and_get_list(args): ) # polish up the final donor list - final_donor_list.sort_values(by="donorGroup", inplace=True) + final_donor_list.sort_values(by="userID", inplace=True) final_donor_list.reset_index(drop=True, inplace=True) if args.save_donor_list: From 7340a5c3abc474984958cd0e967cf92ec12b046b Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 17 Jul 2019 09:46:44 -0500 Subject: [PATCH 05/46] just get json file --- .../get_all_donor_data_batch_process_json.py | 138 ++++++++ .../get_single_tidepool_dataset_json.py | 299 ++++++++++++++++++ 2 files changed, 437 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py new file mode 100644 index 00000000..d43b8e9a --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that accepts all bigdata donation project donors, +and then pulls of their datasets for further processing. +""" + +# %% REQUIRED LIBRARIES +from accept_new_donors_and_get_donor_list import accept_and_get_list +import datetime as dt +import pandas as pd +import subprocess as sub +import os +import glob +import time +import argparse +from multiprocessing import Pool + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "accepts new donors (shares) and grab their data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +parser.add_argument( + "-s", + "--save-donor-list", + dest="save_donor_list", + default=True, + help="specify if you want to save the donor list (True/False)" +) + +args = parser.parse_args() + + +# %% FUNCTIONS +def run_process(func_name, userid, donor_group): + func_path = os.path.join(".", func_name) + + p = sub.Popen( + [ + "python", func_path, + "-d", args.date_stamp, + "-dg", donor_group, + "-u", userid, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") + + if errors == '': + print(output) + else: + print(errors) + + return + + +def get_all_data(userid, donor_group): + + run_process("get_single_donor_metadata.py", userid, donor_group) + run_process("get_single_tidepool_dataset_json.py", userid, donor_group) + + return + + +# %% GET LATEST DONOR LIST +final_donor_list = accept_and_get_list(args) + + +# %% GET DONOR META DATA AND DATASETS +# use multiple cores to process +startTime = time.time() +print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) +pool = 
Pool(os.cpu_count()) +pool.starmap(get_all_data, zip( + final_donor_list["userID"], + final_donor_list["donorGroup"] +)) +pool.close() +endTime = time.time() +print( + "finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +) +total_duration = round((endTime - startTime) / 60, 1) +print("total duration was %s minutes" % total_duration) + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-metadata" +) + +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +all_metadata = pd.DataFrame() +for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "userid"}, inplace=True) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv") +) +print("saving metadata...code complete") diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py new file mode 100644 index 00000000..5a17d6d8 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py @@ -0,0 +1,299 @@ +# -*- coding: utf-8 -*- +"""get_donor_data_and_metadata.py +In the context of the big data donation +project, this code grabs donor data and metadata. + +This code calls accept_new_donors_and_get_donor_list.py +to get the most recent donor list +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import datetime as dt +import numpy as np +import os +import sys +import time +import getpass +import requests +import json +import argparse +import pdb +envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if envPath not in sys.path: + sys.path.insert(0, envPath) +import environmentalVariables + + +# %% FUNCTIONS +def make_folder_if_doesnt_exist(folder_paths): + ''' function requires a single path or a list of paths''' + if not isinstance(folder_paths, list): + folder_paths = [folder_paths] + for folder_path in folder_paths: + if not os.path.exists(folder_path): + os.makedirs(folder_path) + return + + +def get_data_api(userid, startDate, endDate, headers): + + startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" + endDate = endDate.strftime("%Y-%m-%d") + "T23:59:59.999Z" + + api_call = ( + "https://api.tidepool.org/data/" + userid + "?" 
+ + "endDate=" + endDate + "&" + + "startDate=" + startDate + "&" + + "dexcom=true" + "&" + + "medtronic=true" + "&" + + "carelink=true" + ) + + api_response = requests.get(api_call, headers=headers) + if(api_response.ok): + print("getting data between %s and %s" % (startDate, endDate)) + json_data = json.loads(api_response.content.decode()) + + else: + sys.exit( + "ERROR in getting data between %s and %s" % (startDate, endDate), + api_response.status_code + ) + + endDate = pd.to_datetime(startDate) - pd.Timedelta(1, unit="d") + + return json_data, endDate + + +def get_data( + weeks_of_data=10*52, + save_data_path=os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "data" + ) + ), + overwrite_hours=24, + donor_group=np.nan, + userid=np.nan, + auth=np.nan, + email=np.nan, + password=np.nan, +): + # login + if pd.notnull(donor_group): + if donor_group == "bigdata": + dg = "" + else: + dg = donor_group + + auth = environmentalVariables.get_environmental_variables(dg) + + if pd.isnull(auth): + if pd.isnull(email): + email = input("Enter Tidepool email address:\n") + + if pd.isnull(password): + password = getpass.getpass("Enter password:\n") + + auth = (email, password) + + api_call = "https://api.tidepool.org/auth/login" + api_response = requests.post(api_call, auth=auth) + if(api_response.ok): + xtoken = api_response.headers["x-tidepool-session-token"] + userid_master = json.loads(api_response.content.decode())["userid"] + headers = { + "x-tidepool-session-token": xtoken, + "Content-Type": "application/json" + } + else: + sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code)) + + if pd.isnull(userid): + userid = userid_master + print( + "getting data for the master account since no shared " + + "user account was given" + ) + + print("logging into", auth[0], "...") + + # download user data + print("downloading data for {} ...".format(userid)) + endDate = pd.datetime.now() + pd.Timedelta(1, unit="d") + + output_folder = os.path.join( + save_data_path, + "dremio", + userid, + ) + + output_file_path = os.path.join( + output_folder, + "PHI-{}.json".format(userid) + ) + + download_ = True + for f in [output_folder, output_file_path]: + path_exist = os.path.exists(f) + if path_exist: + last_save = os.path.getmtime(f) + time_threshold = time.time() - (overwrite_hours * 3600) + within_time_threshold = last_save > time_threshold + if within_time_threshold: + download_ = False + + if download_: + make_folder_if_doesnt_exist(output_folder) + big_json_file = [] + + if weeks_of_data > 52: + years_of_data = int(np.floor(weeks_of_data/52)) + + for years in range(0, years_of_data + 1): + startDate = pd.datetime( + endDate.year - 1, + endDate.month, + endDate.day + 1 + ) + json_data, endDate = get_data_api( + userid, + startDate, + endDate, + headers + ) + + big_json_file = big_json_file + json_data + + else: + startDate = ( + pd.to_datetime(endDate) - pd.Timedelta(weeks_of_data*7, "d") + ) + + json_data, _ = get_data_api( + userid, + startDate, + endDate, + headers + ) + + big_json_file = big_json_file + json_data + + # save data + if len(big_json_file) > 1: + print("saving data for {}".format(userid)) + with open(output_file_path, 'w') as outfile: + json.dump(big_json_file, outfile) + else: + print("{} has no data".format(userid)) + + # logout + api_call = "https://api.tidepool.org/auth/logout" + api_response = requests.post(api_call, auth=auth) + + if(api_response.ok): + print("successfully logged out of", auth[0]) + + else: + sys.exit( + "Error with logging out for " + 
+ auth[0] + ":" + str(api_response.status_code) + ) + else: + print("skipping bc {}'s data was downloaded (attempted)".format(userid) + + " within the last {} hours".format(overwrite_hours) + ) + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor json file" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-w", + "--weeks-of-data", + dest="weeks_of_data", + default=52*10, # go back the last 10 years as default + help="enter the number of weeks of data you want to download" + ) + + parser.add_argument( + "-ow", + "--over-write", + dest="overwrite_hours", + default=24, + help="if data was downloaded in the last <24> hours, skip download" + ) + + parser.add_argument( + "-dg", + "--donor-group", + dest="donor_group", + default=np.nan, + help="name of the donor group in the tidepool .env file" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid of account shared with the donor group or master account" + ) + + parser.add_argument( + "-a", + "--auth", + dest="auth", + default=np.nan, + help="tuple that contains (email, password)" + ) + + parser.add_argument( + "-e", + "--email", + dest="email", + default=np.nan, + help="email address of the master account" + ) + + parser.add_argument( + "-p", + "--password", + dest="password", + default=np.nan, + help="password of the master account" + ) + + args = parser.parse_args() + + # the main function + get_data( + save_data_path=args.data_path, + weeks_of_data=args.weeks_of_data, + overwrite_hours=args.overwrite_hours, + donor_group=args.donor_group, + userid=args.userid, + auth=args.auth, + email=args.email, + password=args.password, + ) From 0d08b821ac1d16f76799162523c00515c8f3e259 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sun, 21 Jul 2019 11:44:33 -0500 Subject: [PATCH 06/46] get interim data summaries --- .../get_interim_dataset_summaries.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py b/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py new file mode 100644 index 00000000..0fa04201 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +# %% REQUIRED LIBRARIES +import datetime as dt +import pandas as pd +import os +import glob +import argparse + + +# %% FUNCTIONS +def get_dataset_summaries( + save_data_path=os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "data" + ) + ), + date_stamp=dt.datetime.now().strftime("%Y-%m-%d"), +): + + + + phi_date_stamp = "PHI-" + args.date_stamp + donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + + print("combining all dataset metadata") + + metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-datasetSummary" + ) + + all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) + dataset_metadata = pd.DataFrame() + n_files = len(all_files) + 
print("there are {} files".format(n_files)) + f_counter = 1 + for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True) + userid = f[-32:-22] + temp_meta["userid"] = userid + dataset_metadata = pd.concat( + [dataset_metadata, temp_meta], + ignore_index=True, + sort=False + ) + + if f_counter % 10 == 0: + print("completed file {} of {}".format(f_counter, n_files)) + f_counter = f_counter + 1 + dataset_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv.gz") + ) + print("saving all-dataset-info-metadata...code complete") + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor json file" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + args = parser.parse_args() + + # the main function + get_dataset_summaries( + save_data_path=args.data_path, + date_stamp=args.date_stamp + ) From 7b28164a4d40db60ff21f00b1ae59702911ad82d Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 07:38:28 -0500 Subject: [PATCH 07/46] update env to include latest spyder --- projects/bigdata-processing-pipeline/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/bigdata-processing-pipeline/environment.yml b/projects/bigdata-processing-pipeline/environment.yml index 4c945436..64ef3601 100644 --- a/projects/bigdata-processing-pipeline/environment.yml +++ b/projects/bigdata-processing-pipeline/environment.yml @@ -3,9 +3,8 @@ channels: - defaults dependencies: - python=3.7.3 - - numpy=1.16.4 - pandas=0.24.2 + - spyder=3.3.6 - pip=19.1.1 - - spyder=3.3.5 - pip: - python-dotenv==0.10.3 From 09eaca08429db8407f7e9f5e54a79f46ec7e15cd Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 08:33:52 -0500 Subject: [PATCH 08/46] add __init__.py and rename folders --- projects/bigdata-processing-pipeline/__init__.py | 0 .../README.md | 0 .../anonymize-and-export.py | 0 .../example-data/dataFieldExportList.csv | 0 .../example-data/jill-jellyfish-lite.csv | 0 .../example-data/jill-jellyfish-lite.json | 0 .../example-data/jill-jellyfish-lite.xlsx | Bin .../.gitignore | 0 .../README.md | 0 .../estimate-local-time.py | 0 .../estimateLocalTime-batchProcess.py | 0 .../example-csv.csv | 0 .../example-json.json | 0 .../example-xlsx.xlsx | Bin .../wikipedia-timezone-aliases-2018-04-28.csv | 0 .../{get-donor-data => get_donor_data}/README.md | 0 .../get_donor_data/__init__.py | 0 .../accept_new_donors_and_get_donor_list.py | 0 .../deprecated/accept-new-donors.py | 0 .../deprecated/get-all-col-headings.py | 0 .../deprecated/get-donor-json-files.py | 0 .../deprecated/get-donor-list.py | 0 .../deprecated/get_all_donor_data.py | 0 .../example_get_all_data_for_single_user.py | 2 +- .../get_all_donor_data_batch_process.py | 0 .../get_all_donor_data_batch_process_json.py | 0 .../get_interim_dataset_summaries.py | 0 .../get_single_dataset_info.py | 0 .../get_single_donor_metadata.py | 0 .../get_single_tidepool_dataset.py | 5 ++++- 
.../get_single_tidepool_dataset_json.py | 0 .../get_stats/__init__.py | 0 .../{qualify-data => qualify_data}/README.md | 0 .../deprecated/qualify-data.py | 0 .../qualify_all_donor_data_batch_process.py | 0 .../qualify_single_dataset.py | 0 .../tidepool-qualification-criteria.json | 0 37 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/__init__.py rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/README.md (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/anonymize-and-export.py (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/dataFieldExportList.csv (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/jill-jellyfish-lite.csv (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/jill-jellyfish-lite.json (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/jill-jellyfish-lite.xlsx (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/.gitignore (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/README.md (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/estimate-local-time.py (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/estimateLocalTime-batchProcess.py (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/example-csv.csv (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/example-json.json (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/example-xlsx.xlsx (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/wikipedia-timezone-aliases-2018-04-28.csv (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/README.md (100%) create mode 100644 projects/bigdata-processing-pipeline/get_donor_data/__init__.py rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/accept_new_donors_and_get_donor_list.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/accept-new-donors.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get-all-col-headings.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get-donor-json-files.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get-donor-list.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get_all_donor_data.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/example_get_all_data_for_single_user.py (95%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_all_donor_data_batch_process.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_all_donor_data_batch_process_json.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => 
get_donor_data}/get_interim_dataset_summaries.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_dataset_info.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_donor_metadata.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_tidepool_dataset.py (98%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_tidepool_dataset_json.py (100%) create mode 100644 projects/bigdata-processing-pipeline/get_stats/__init__.py rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/README.md (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/deprecated/qualify-data.py (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/qualify_all_donor_data_batch_process.py (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/qualify_single_dataset.py (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/tidepool-qualification-criteria.json (100%) diff --git a/projects/bigdata-processing-pipeline/__init__.py b/projects/bigdata-processing-pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md b/projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py b/projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json diff --git 
a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/.gitignore b/projects/bigdata-processing-pipeline/estimate_local_time/.gitignore similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/.gitignore rename to projects/bigdata-processing-pipeline/estimate_local_time/.gitignore diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/README.md b/projects/bigdata-processing-pipeline/estimate_local_time/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/README.md rename to projects/bigdata-processing-pipeline/estimate_local_time/README.md diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py rename to projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py rename to projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv b/projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv rename to projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-json.json b/projects/bigdata-processing-pipeline/estimate_local_time/example-json.json similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-json.json rename to projects/bigdata-processing-pipeline/estimate_local_time/example-json.json diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx b/projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx rename to projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv rename to projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv diff --git a/projects/bigdata-processing-pipeline/get-donor-data/README.md 
b/projects/bigdata-processing-pipeline/get_donor_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/README.md rename to projects/bigdata-processing-pipeline/get_donor_data/README.md diff --git a/projects/bigdata-processing-pipeline/get_donor_data/__init__.py b/projects/bigdata-processing-pipeline/get_donor_data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py b/projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py rename to projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py similarity index 95% rename from projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py rename to projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py index 14767119..3a0966d9 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py @@ -25,6 +25,6 @@ ) data, _ = get_data( 
donor_group="bigdata", - userid_of_shared_user="0d4524bc11", + userid="0d4524bc11", weeks_of_data=4 ) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py b/projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py similarity index 98% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py index 7526fe77..84563d9c 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py @@ -8,7 +8,10 @@ """ # %% REQUIRED LIBRARIES -from get_single_dataset_info import expand_data, save_df +try: + from get_single_dataset_info import expand_data, save_df +except: + from get_donor_data.get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt import numpy as np diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py diff --git a/projects/bigdata-processing-pipeline/get_stats/__init__.py 
b/projects/bigdata-processing-pipeline/get_stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/qualify-data/README.md b/projects/bigdata-processing-pipeline/qualify_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/README.md rename to projects/bigdata-processing-pipeline/qualify_data/README.md diff --git a/projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py b/projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py rename to projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py rename to projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py rename to projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json b/projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json rename to projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json From a1ce9911897d72e4786eec0578c31b8edeac1cc3 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 08:34:33 -0500 Subject: [PATCH 09/46] get data from api --- .../get_stats/get_cgm_stats.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py new file mode 100644 index 00000000..32f6e824 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm statsistics for a single tidepool (donor) dataset +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +# TODO: figure out how to get rid of these path dependcies +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +from get_donor_data.get_single_donor_metadata import get_shared_metadata +from get_donor_data.get_single_tidepool_dataset import get_data + + +# %% GET DATA FROM API +''' +get metadata and data for a donor that has shared with bigdata +NOTE: functions assume you have an .env with bigdata account credentials +''' + +userid = "0d4524bc11" +donor_group = "bigdata" + +metadata, _ = get_shared_metadata( + donor_group=donor_group, + userid_of_shared_user=userid +) +data, _ = get_data( + donor_group=donor_group, + userid=userid, + weeks_of_data=52 + 
) + + From 5629152d446a73de213d9ce0efa2088f1cf5602d Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 08:37:16 -0500 Subject: [PATCH 10/46] add path if needed --- .../get_donor_data/get_single_tidepool_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py index 84563d9c..0b3e384f 100644 --- a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py @@ -10,7 +10,7 @@ # %% REQUIRED LIBRARIES try: from get_single_dataset_info import expand_data, save_df -except: +except: # TODO: there has to be a better way to do this from get_donor_data.get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt From 8fa5f2c465f87f2bc2c91de6962f35bac3911958 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 19:39:19 -0500 Subject: [PATCH 11/46] initial commit WIP --- .../get_stats/get_cgm_stats.py | 268 +++++++++++++++++- 1 file changed, 266 insertions(+), 2 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 32f6e824..2ede6fda 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -8,15 +8,201 @@ # %% REQUIRED LIBRARIES import os import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt + + # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") ) if get_donor_data_path not in sys.path: sys.path.insert(0, get_donor_data_path) +import environmentalVariables from get_donor_data.get_single_donor_metadata import get_shared_metadata from get_donor_data.get_single_tidepool_dataset import get_data +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + ''' + if "duration" in list(df): + type_ = df["duration"].apply(get_type) + valid_index = ((type_ == "int") & (df["duration"].notnull())) + n_negative_durations = sum(df.loc[valid_index, "duration"] < 0) + if n_negative_durations > 0: + df = df[~(df.loc[valid_index, "duration"] < 0)] + else: + n_negative_durations = np.nan + + return df, n_negative_durations + + +def expand_embedded_dict(df, field, key_): + ''' + this is new, should be refactored for speed as the current process + 
creates a dataframe of all of keys instead of just the key of interest + ''' + if field in list(df): + notnull_idx = df[field].notnull() + temp_df = pd.DataFrame(df.loc[notnull_idx, field].tolist()) # TODO: this can be sped up by only getting the field key of interest + if key_ in list(temp_df): + df[field + "." + key_] = temp_df[key_] + return df + + +def tslim_calibration_fix(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored to only expand one field + ''' + + # expand payload field one level + df = expand_embedded_dict(df, "payload", "calibration_reading") + + if "payload.calibration_reading" in list(df): + + search_for = ['tan'] + tandem_data_index = ( + (df["deviceId"].str.contains('|'.join(search_for))) + & (df["type"] == "deviceEvent") + ) + + cal_index = df["payload.calibration_reading"].notnull() + valid_index = tandem_data_index & cal_index + + n_cal_readings = sum(valid_index) + + if n_cal_readings > 0: + # if reading is > 30 then it is in the wrong units + if df["payload.calibration_reading"].min() > 30: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + / MGDL_PER_MMOLL + ) + else: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + ) + else: + n_cal_readings = 0 + return df, n_cal_readings + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = expand_embedded_dict(df, "payload", "HKTimeZone") + if "timezone" not in list(df): + if "payload.HKTimeZone" in list(df): + df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + else: + df["timezone"] = np.nan + else: + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "timezone"] = ( + df.loc[hk_tz_idx, "payload.HKTimeZone"] + ) + + df["timezone"].fillna(method='ffill', inplace=True) + df["timezone"].fillna(method='bfill', inplace=True) + + return df["timezone"] + + +def make_tz_unaware(date_time): + return date_time.replace(tzinfo=None) + + +def to_utc_datetime(df): + ''' + this is new to deal with perfomance issue with the previous method + of converting to string to datetime with pd.to_datetime() + ''' + utc_time_tz_aware = pd.to_datetime( + df["time"], + format="%Y-%m-%dT%H:%M:%S", + utc=True + ) + utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) + + return utc_tz_unaware + + +def get_timezone_offset(currentDate, currentTimezone): + + # edge case for 'US/Pacific-New' + if currentTimezone == 'US/Pacific-New': + currentTimezone = 'US/Pacific' + + tz = pytz.timezone(currentTimezone) + + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def get_local_time(df): + + tzo = df[['utcTime', 'inferredTimezone']].apply( + lambda x: get_timezone_offset(*x), axis=1 + ) + local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") + + return local_time + # %% GET DATA FROM API ''' @@ -29,12 +215,90 @@ metadata, _ = get_shared_metadata( donor_group=donor_group, - userid_of_shared_user=userid + 
userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid ) data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=52 + weeks_of_data=4 + ) + + +# %% CREATE META DATAFRAME (metadata) +metadata = pd.DataFrame(index=[userid]) + + +# %% HASH USER ID +hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) +data["userid"] = userid +data["hashid"] = hashid + + +# %% CLEAN DATA +data_fields = list(data) +# remove negative durations +if "duration" in data_fields: + data["duration"], n_negative_durations = ( + remove_negative_durations(data[["duration"]].copy()) ) +else: + n_negative_durations = np.nan +metadata["nNegativeDurations"] = n_negative_durations + +# Tslim calibration bug fix +data, n_cal_readings = tslim_calibration_fix(data) +metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + +# %% TIME RELATED ITEMS +data["utcTime"] = to_utc_datetime(data[["time"]].copy()) +if "timezone" not in list(data): + data["timezone"] = np.nan +data["inferredTimezone"] = get_and_fill_timezone( + data[["timezone", "payload"]].copy() +) +# estimate local time (simple method) +# TODO: this really needs to be sped up +data["localTime"] = get_local_time( + data[['utcTime', 'inferredTimezone']].copy() +) + + + + + +#data["day"] = pd.DatetimeIndex(data["localTime"]).date +# +## round to the nearest 5 minutes +## TODO: once roundTime is pushed to tidals repository then this line can be replaced +## with td.clean.round_time +#data = round_time(data, timeIntervalMinutes=5, timeField="time", +# roundedTimeFieldName="roundedTime", startWithFirstRecord=True, +# verbose=False) +# +#data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m") +#data.sort_values("uploadTime", ascending=False, inplace=True) +# +## AGE, & YLW +#data["age"] = np.floor((data["localTime"] - bDate).dt.days/365.25).astype(int) +#data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int) + + +# %% CGM DATA + +#def removeInvalidCgmValues(df): +# +# nBefore = len(df) +# # remove values < 38 and > 402 mg/dL +# df = df.drop(df[((df.type == "cbg") & +# (df.value < 2.109284236597303))].index) +# df = df.drop(df[((df.type == "cbg") & +# (df.value > 22.314006924003046))].index) +# nRemoved = nBefore - len(df) +# +# return df, nRemoved +# get rid of cgm values too low/high (< 38 & > 402 mg/dL) +#data, nInvalidCgmValues = removeInvalidCgmValues(data) +#metadata["nInvalidCgmValues"] = nInvalidCgmValues From fae47883954898e322031b4e266d4ff726723f83 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 09:48:53 -0500 Subject: [PATCH 12/46] distinguish donor metadata from data metadata --- projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 2ede6fda..ad9fe110 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -213,7 +213,7 @@ def get_local_time(df): userid = "0d4524bc11" donor_group = "bigdata" -metadata, _ = get_shared_metadata( +donor_metadata, _ = get_shared_metadata( donor_group=donor_group, userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid ) From 9ef98f1e79cd658c495397cb912e3f1ef63d64be Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 11:51:27 -0500 Subject: [PATCH 
13/46] refactor round_time --- .../get_stats/get_cgm_stats.py | 126 +++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index ad9fe110..b831e2d5 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -8,12 +8,13 @@ # %% REQUIRED LIBRARIES import os import sys +import sys import hashlib import pytz import numpy as np import pandas as pd import datetime as dt - +import pdb # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( @@ -204,6 +205,117 @@ def get_local_time(df): return local_time +def round_time( + df, + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +): + ''' + A general purpose round time function that rounds the "time" + field to nearest minutes + INPUTS: + * a dataframe (df) or time series that contains only one time field + that you want to round + * time_interval_minutes (defaults to 5 minutes given that most cgms + output every 5 minutes) + * start_with_first_record starts the rounding with the first record + if True, and the last record if False (defaults to True) + * return_calculation_columns specifies whether the extra columns + used to make calculations are returned + refactored name(s) to meet style guide + ''' + # if a time series is passed in, convert to dataframe + if "Series" in get_type(df): + df = pd.DataFrame(df) + columns_ = list(df) + if len(columns_) > 1: + sys.exit( + "Error: df should only have one time column" + ) + else: + df.rename(columns={columns_[0]: "t"}, inplace=True) + + df.sort_values( + by="t", + ascending=start_with_first_record, + inplace=True + ) + + df.reset_index(drop=False, inplace=True) + df.rename(columns={"index": "originalIndex"}, inplace=True) + + # calculate the time between consecutive records + df["t_shift"] = df["t"].shift(1) + df["timeBetweenRecords"] = round( + (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) + + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) + ) * time_interval_minutes + + # separate the data into chunks if timeBetweenRecords is greater than + # 2 times the minutes so the rounding process + # starts over + big_gaps = list( + df.query("abs(timeBetweenRecords) > " + + str(time_interval_minutes * 2)).index + ) + big_gaps.insert(0, 0) + big_gaps.append(len(df)) + + for gap_index in range(0, len(big_gaps) - 1): + chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] + first_chunk = df["t"][big_gaps[gap_index]] + + # calculate the time difference between + # each time record and the first record + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] = ( + (chunk - first_chunk).dt.days*(86400/60) + + (chunk - first_chunk).dt.seconds/60 + ) + + # then round to the nearest X Minutes + # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up. 
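# Worked example of why the epsilon is needed (Python 3's round() uses
# banker's rounding): a half-interval offset of 12.5 minutes gives
# round(12.5 / 5) == round(2.5) == 2, i.e. it would snap down to 10 minutes,
# whereas round(12.5 / 5 + 0.000001) == 3 snaps it up to 15 minutes,
# which is the behavior the NOTE above intends.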
+ df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ] = round( + (df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] / time_interval_minutes) + 0.000001 + ) * (time_interval_minutes) + + rounded_first_record = ( + first_chunk + pd.Timedelta("1microseconds") + ).round(str(time_interval_minutes) + "min") + + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedTime" + ] = rounded_first_record + pd.to_timedelta( + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ], unit="m" + ) + + if return_calculation_columns is False: + df.drop( + columns=[ + "timeBetweenRecords", + "minutesFromFirstRecord", + "roundedMinutesFromFirstRecord" + ], inplace=True + ) + # sort back to the original index + df.sort_values(by="originalIndex", inplace=True) + + return df["roundedTime"].values + + # %% GET DATA FROM API ''' get metadata and data for a donor that has shared with bigdata @@ -263,6 +375,14 @@ def get_local_time(df): data[['utcTime', 'inferredTimezone']].copy() ) +# round all data to the nearest 5 minutes +data["roundedTime"] = round_time( + data["localTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +) + @@ -272,8 +392,8 @@ def get_local_time(df): ## round to the nearest 5 minutes ## TODO: once roundTime is pushed to tidals repository then this line can be replaced ## with td.clean.round_time -#data = round_time(data, timeIntervalMinutes=5, timeField="time", -# roundedTimeFieldName="roundedTime", startWithFirstRecord=True, +#data = round_time(data, time_interval_minutes=5, time_field="time", +# rounded_field_name="roundedTime", start_with_first_record=True, # verbose=False) # #data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m") From 302e45b7edbee9dd86bacd06044d7b699828aded Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 15:16:15 -0500 Subject: [PATCH 14/46] add upload time to data --- .../get_stats/get_cgm_stats.py | 65 +++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index b831e2d5..45bc7079 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -316,6 +316,43 @@ def round_time( return df["roundedTime"].values +def add_upload_time(df): + ''' + this is taken from a colab notebook that is not in our github + given that it has been refactored to account for bug where there are + no upload records + NOTE: this is a new fix introduced with healthkit data...we now have + data that does not have an upload record + + ''' + + if "upload" in df.type.unique(): + upload_times = pd.DataFrame( + df[df.type == "upload"].groupby("uploadId")["utcTime"].max() + ) + else: + upload_times = pd.DataFrame(columns=["utcTime"]) + + unique_uploadIds = set(df["uploadId"].unique()) + unique_uploadRecords = set( + df.loc[df["type"] == "upload", "uploadId"].unique() + ) + uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords + + for upId in uploadIds_missing_uploadRecords: + last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() + upload_times.loc[upId, "utcTime"] = last_upload_time + + upload_times.reset_index(inplace=True) + upload_times.rename( + columns={"utcTime": "uploadTime"}, + inplace=True + ) + df = 
pd.merge(df, upload_times, how='left', on='uploadId') + + return df["uploadTime"].values + + # %% GET DATA FROM API ''' get metadata and data for a donor that has shared with bigdata @@ -362,6 +399,7 @@ def round_time( metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + # %% TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) if "timezone" not in list(data): @@ -383,20 +421,17 @@ def round_time( return_calculation_columns=False ) +# add upload time to the data, which is needed to get rid of duplicates +data["uploadTime"] = add_upload_time(data[ + ["type", "uploadId", "utcTime"] +].copy()) +# %% TIME CATEGORIES - -#data["day"] = pd.DatetimeIndex(data["localTime"]).date -# -## round to the nearest 5 minutes -## TODO: once roundTime is pushed to tidals repository then this line can be replaced -## with td.clean.round_time -#data = round_time(data, time_interval_minutes=5, time_field="time", -# rounded_field_name="roundedTime", start_with_first_record=True, -# verbose=False) -# -#data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m") +# add the day of the localTime that starts at 12am +#data["day12AM"] = pd.DatetimeIndex(data["localTime"]).date +#data["day6AM"] = data["localTime"] - pd.Timedelta(6, unit="hours") #data.sort_values("uploadTime", ascending=False, inplace=True) # ## AGE, & YLW @@ -404,6 +439,14 @@ def round_time( #data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int) +## group data by type +#if "uploadId" not in data: +# sys.exit( +# "Error: expected that uploadId is in data" +# ) +# +#type_groups = data.groupby("type") + # %% CGM DATA #def removeInvalidCgmValues(df): From 75bb4ff4217b0b941e9d01e618bc8d4da0fed340 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 15:45:48 -0500 Subject: [PATCH 15/46] handle edge case where uploadId is not given --- .../bigdata-processing-pipeline/get_stats/get_cgm_stats.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 45bc7079..deb2a107 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -345,9 +345,11 @@ def add_upload_time(df): upload_times.reset_index(inplace=True) upload_times.rename( - columns={"utcTime": "uploadTime"}, + columns={"utcTime": "uploadTime", + "index": "uploadId"}, inplace=True ) + df = pd.merge(df, upload_times, how='left', on='uploadId') return df["uploadTime"].values From 61ae41c6075c29478713f1708839bfd408921e5f Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 15:46:25 -0500 Subject: [PATCH 16/46] apply timezoneOffset correction --- .../get_stats/get_cgm_stats.py | 46 +++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index deb2a107..00015b61 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -176,8 +176,39 @@ def to_utc_datetime(df): return utc_tz_unaware -def get_timezone_offset(currentDate, currentTimezone): +# apply the large timezone offset correction (AKA Darin's fix) +def timezone_offset_bug_fix(df): + ''' + this is taken from estimate-local-time.py + TODO: add in unit testing where there is no 
TZP that is > 840 or < -720 + ''' + + if "timezoneOffset" in list(df): + + while ((df.timezoneOffset > 840).sum() > 0): + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] + - (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 + ) + + while ((df.timezoneOffset < -720).sum() > 0): + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] + + (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 + ) + + return df + +def get_timezone_offset(currentDate, currentTimezone): # edge case for 'US/Pacific-New' if currentTimezone == 'US/Pacific-New': currentTimezone = 'US/Pacific' @@ -371,7 +402,7 @@ def add_upload_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=4 + weeks_of_data=52 ) @@ -400,17 +431,23 @@ def add_upload_time(df): data, n_cal_readings = tslim_calibration_fix(data) metadata["nTandemAndPayloadCalReadings"] = n_cal_readings +# fix large timzoneOffset bug +data = timezone_offset_bug_fix(data) # %% TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) if "timezone" not in list(data): data["timezone"] = np.nan + + + + +# estimate local time (simple method) data["inferredTimezone"] = get_and_fill_timezone( data[["timezone", "payload"]].copy() ) -# estimate local time (simple method) -# TODO: this really needs to be sped up +# TODO: this really needs to be sped up AND/OR use complex version data["localTime"] = get_local_time( data[['utcTime', 'inferredTimezone']].copy() ) @@ -430,6 +467,7 @@ def add_upload_time(df): # %% TIME CATEGORIES +contiguousDays = createContiguousDaySeries(data) # add the day of the localTime that starts at 12am #data["day12AM"] = pd.DatetimeIndex(data["localTime"]).date From 4b86a07a331bb7ab6ac6844aaf6cd084ae08ab24 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 21:29:49 -0500 Subject: [PATCH 17/46] refactor of estimate-local-time to handle healthkit data --- .../get_stats/get_cgm_stats.py | 829 ++++++++++++++++-- 1 file changed, 742 insertions(+), 87 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 00015b61..be3dcecf 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -8,13 +8,11 @@ # %% REQUIRED LIBRARIES import os import sys -import sys import hashlib import pytz import numpy as np import pandas as pd import datetime as dt -import pdb # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( @@ -26,6 +24,7 @@ from get_donor_data.get_single_donor_metadata import get_shared_metadata from get_donor_data.get_single_tidepool_dataset import get_data + # %% CONSTANTS MGDL_PER_MMOLL = 18.01559 @@ -133,23 +132,37 @@ def tslim_calibration_fix(df): return df, n_cal_readings -def get_and_fill_timezone(df): - ''' - this is new to deal with healthkit data - requires that a data frame that contains payload and HKTimeZone is passed - ''' +def get_healthkit_timezone(df): df = expand_embedded_dict(df, "payload", "HKTimeZone") if "timezone" not in list(df): if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + 
df.loc[hk_tz_idx, "deviceType"] = "healthkit" df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + else: df["timezone"] = np.nan + df["deviceType"] = np.nan else: if "payload.HKTimeZone" in list(df): hk_tz_idx = df["payload.HKTimeZone"].notnull() df.loc[hk_tz_idx, "timezone"] = ( df.loc[hk_tz_idx, "payload.HKTimeZone"] ) + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + return df[["timezone", "deviceType"]] + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = get_healthkit_timezone(df) df["timezone"].fillna(method='ffill', inplace=True) df["timezone"].fillna(method='bfill', inplace=True) @@ -208,24 +221,6 @@ def timezone_offset_bug_fix(df): return df -def get_timezone_offset(currentDate, currentTimezone): - # edge case for 'US/Pacific-New' - if currentTimezone == 'US/Pacific-New': - currentTimezone = 'US/Pacific' - - tz = pytz.timezone(currentTimezone) - - tzoNum = int( - tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") - ) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - - return tzo - - def get_local_time(df): tzo = df[['utcTime', 'inferredTimezone']].apply( @@ -386,6 +381,706 @@ def add_upload_time(df): return df["uploadTime"].values +# %% ESTIMATE LOCAL TIME FUNCTIONS +def create_contiguous_day_series(df): + first_day = df["date"].min() + last_day = df["date"].max() + rng = pd.date_range(first_day, last_day).date + contiguousDaySeries = \ + pd.DataFrame(rng, columns=["date"]).sort_values( + "date", ascending=False).reset_index(drop=True) + + return contiguousDaySeries + + +def add_device_type(df): + col_headings = list(df) + if "deviceType" not in col_headings: + df["deviceType"] = np.nan + if "deviceTags" in col_headings: + # first make sure deviceTag is in string format + df["deviceTags"] = df.deviceTags.astype(str) + # filter by type not null device tags + ud = df[df["deviceTags"].notnull()].copy() + # define a device type (e.g., pump, cgm, or healthkit) + ud.loc[ + ((ud["deviceTags"].str.contains("pump")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "pump" + + # define a device type (e.g., cgm) + ud.loc[ + ((ud["deviceTags"].str.contains("cgm")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "cgm" + + return ud["deviceType"] + else: + return np.nan + + +def get_timezone_offset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def add_device_day_series(df, dfContDays, deviceTypeName): + if len(df) > 0: + dfDayGroups = df.groupby("date") + if "timezoneOffset" in df: + dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) + else: + dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) + + if "upload" in deviceTypeName: + if "timezone" in df: +# if dfDayGroups.timezone.count().values[0] > 0: # NOT SURE WHY THIS IS HERE + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + 
for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan + + dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \ + rename(columns={deviceTypeName + ".date": "date"}) + + dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), + on="date", how="left") + + else: + dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan + + return dfContDays + + +def impute_upload_records(df, contDays, deviceTypeName): + daySeries = \ + add_device_day_series(df, contDays, deviceTypeName) + + if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): + for i in daySeries.index[1:]: + if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): + daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( + daySeries.loc[i-1, deviceTypeName + ".timezone"] + ) + if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): + tz = daySeries.loc[i, deviceTypeName + ".timezone"] + tzo = get_timezone_offset( + pd.to_datetime(daySeries.loc[i, "date"]), + tz + ) + daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo + + if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): + daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ + daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] + + else: + daySeries[deviceTypeName + ".timezone"] = np.nan + daySeries[deviceTypeName + ".timeProcessing"] = np.nan + + return daySeries + + +def add_home_timezone(df, contDays): + + if "timezone" in df: + homeTimezone = df["timezone"].describe()["top"] + tzo = contDays.date.apply( + lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) + + contDays["home.imputed.timezoneOffset"] = tzo + contDays["home.imputed.timezone"] = homeTimezone + + else: + contDays["home.imputed.timezoneOffset"] = np.nan + contDays["home.imputed.timezone"] = np.nan + contDays["home.imputed.timeProcessing"] = np.nan + + return contDays + + +def estimateTzAndTzoWithUploadRecords(cDF): + + cDF["est.type"] = np.nan + cDF["est.gapSize"] = np.nan + cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] + cDF["est.annotations"] = np.nan + + if "upload.timezone" in cDF: + cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" + cDF["est.timezone"] = cDF["upload.timezone"] + cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] + else: + cDF["est.timezone"] = np.nan + cDF["est.timeProcessing"] = np.nan + + cDF.loc[((cDF["est.timezoneOffset"] != + cDF["home.imputed.timezoneOffset"]) & + (pd.notnull(cDF["est.timezoneOffset"]))), + "est.annotations"] = "travel" + + return cDF + + +def assignTzoFromImputedSeries(df, i, imputedSeries): + df.loc[i, ["est.type"]] = "DEVICE" + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, imputedSeries + ".timezoneOffset"] + + df.loc[i, ["est.timezone"]] = \ + df.loc[i, imputedSeries + ".timezone"] + + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, imputedSeries + ".timeProcessing"] + + return df + + +def compareDeviceTzoToImputedSeries(df, sIdx, device): + for i in sIdx: + # if the device tzo = imputed tzo, then chose the imputed tz and tzo + # note, dst is accounted for in the imputed tzo + for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", + "healthkit.upload.imputed", "home.imputed"]: + # if the estimate has not already been made + if 
pd.isnull(df.loc[i, "est.timezone"]): + + if df.loc[i, device + ".timezoneOffset"] == \ + df.loc[i, imputedSeries + ".timezoneOffset"]: + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, + "tz-inferred-from-" + imputedSeries) + + # if the imputed series has a timezone estimate, then see if + # the current day is a dst change day + elif (pd.notnull(df.loc[i, imputedSeries + ".timezone"])): + imputedTimezone = df.loc[i, imputedSeries + ".timezone"] + if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): + + dstRange = getRangeOfTZOsForTimezone(imputedTimezone) + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, "dst-change-day") + df = addAnnotation( + df, i, "tz-inferred-from-" + imputedSeries) + + return df + + +def estimateTzAndTzoWithDeviceRecords(cDF): + + # 2A. use the TZO of the pump or cgm device if it exists on a given day. In + # addition, compare the TZO to one of the imputed day series (i.e., the + # upload and home series to see if the TZ can be inferred) + for deviceType in ["pump", "cgm"]: + # find the indices of days where a TZO estimate has not been made AND + # where the device (e.g., pump or cgm) TZO has data + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + # compare the device TZO to the imputed series to infer time zone + cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) + + # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be + # inferred from the previous day's TZO. If the device TZO is equal to the + # previous day's TZO, AND if the previous day has a TZ estimate, use the + # previous day's TZ estimate for the current day's TZ estimate + for deviceType in ["pump", "cgm"]: + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + + cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) + + # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the + # pump and cgm tzo do not differ by more than 60 minutes. If they differ + # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we + # allow the estimates to be off by 60 minutes as there are a lot of cases + # where the devices are off because the user changes the time for DST, + # at different times + sIndices = cDF[((cDF["est.type"] == "DEVICE") & + (cDF["pump.timezoneOffset"].notnull()) & + (cDF["cgm.timezoneOffset"].notnull()) & + (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) + )].index + + tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - + cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 + + idx = tzoDiffGT60.index[tzoDiffGT60] + + cDF.loc[idx, ["est.type"]] = "UNCERTAIN" + for i in idx: + cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") + + return cDF + + +def imputeTzAndTzo(cDF): + + sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + if len(hasTzoIndices) > 0: + if len(sIndices) > 0: + lastDay = max(sIndices) + + while ((sIndices.min() < max(hasTzoIndices)) & + (len(sIndices) > 0)): + + currentDay, prevDayWithDay, nextDayIdx = \ + getImputIndices(cDF, sIndices, hasTzoIndices) + + cDF = imputeByTimezone(cDF, currentDay, + prevDayWithDay, nextDayIdx) + + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (~cDF["est.annotations"].str.contains( + "unable-to-impute-tzo").fillna(False)))].index + + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + + # try to impute to the last day (earliest day) in the dataset + # if the last record has a timezone that is the home record, then + # impute using the home timezone + if len(sIndices) > 0: + currentDay = min(sIndices) + prevDayWithDay = currentDay - 1 + gapSize = lastDay - currentDay + + for i in range(currentDay, lastDay + 1): + if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ + cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = localTime + df.loc[dIdx, ["est.timezoneOffset"]] = tzo + return df + + +def applyLocalTimeEstimates(df, cDF): + df = pd.merge(df, cDF, how="left", on="date") + df["est.localTime"] = \ + df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") + + df = correctEstimatesAroundDst(df, cDF) + + return df["est.localTime"].values + + +def isDSTChangeDay(currentDate, currentTimezone): + tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), + currentTimezone) + tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + + dt.timedelta(days=-1), currentTimezone) + + return (tzoCurrentDay != tzoPreviousDay) + + +def tzoRangeWithComparisonTz(df, i, comparisonTz): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + if pd.notnull(comparisonTz): + rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) + else: + comparisonTz = np.nan + rangeTzos = np.array([]) + + return rangeTzos + + +def tzAndTzoRangePreviousDay(df, i): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + comparisonTz = df.loc[i-1, "est.timezone"] + + rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) + + return comparisonTz, rangeTzos + + +def assignTzoFromPreviousDay(df, i, previousDayTz): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezone"]] = previousDayTz + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) + + df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] + df = addAnnotation(df, i, "tz-inferred-from-prev-day") + + return df + + +def assignTzoFromDeviceTzo(df, i, device): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + df = addAnnotation(df, i, "likely-travel") + df = addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. + if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
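# Illustrative note on the check below: for a home timezone such as
# 'US/Pacific', getRangeOfTZOsForTimezone() (defined earlier in this file)
# returns [-480, -465, -450, -435, -420], i.e. PST to PDT in 15-minute
# steps. A device offset of -450 next to a home-imputed offset of -480
# falls inside that range, but the two values are not both at the range
# endpoints, so the day is marked UNCERTAIN with "likely-15-min-dst-error";
# only when both offsets sit at the endpoints is it annotated
# "likely-dst-error-OR-travel".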
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i, "home.imputed.timezoneOffset"] == + min(dstRange)) | + (df.loc[i, "home.imputed.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + + else: + df = assignTzoFromDeviceTzo(df, i, device) + + return df + + +def getImputIndices(df, sIdx, hIdx): + + lastDayIdx = len(df) - 1 + + currentDayIdx = sIdx.min() + tempList = pd.Series(hIdx) - currentDayIdx + prevDayIdx = currentDayIdx - 1 + nextDayIdx = \ + min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) + + return currentDayIdx, prevDayIdx, nextDayIdx + + +def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): + + gapSize = (nextDaywData - currentDay) + + if prevDaywData >= 0: + + if df.loc[prevDaywData, "est.timezone"] == \ + df.loc[nextDaywData, "est.timezone"]: + + tz = df.loc[prevDaywData, "est.timezone"] + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezone"]] = tz + + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + # TODO: this logic should be updated to handle the edge case + # where the day before and after the gap have differing TZ, but + # the same TZO. 
In that case the gap should be marked as UNCERTAIN + elif df.loc[prevDaywData, "est.timezoneOffset"] == \ + df.loc[nextDaywData, "est.timezoneOffset"]: + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[prevDaywData, "est.timezoneOffset"] + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + return df + + +def addAnnotation(df, idx, annotationMessage): + if pd.notnull(df.loc[idx, "est.annotations"]): + df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ + ", " + annotationMessage + else: + df.loc[idx, ["est.annotations"]] = annotationMessage + + return df + + +def getTimezoneOffset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def estimate_local_time(df): + df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later + contiguous_days = create_contiguous_day_series(df) + + df["deviceType"] = add_device_type(df) + cDays = add_device_day_series(df, contiguous_days, "upload") + + # create day series for cgm df + if "timezoneOffset" not in list(df): + df["timezoneOffset"] = np.nan + + cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + # %% GET DATA FROM API ''' get metadata and data for a donor that has shared with bigdata @@ -402,11 +1097,14 @@ def add_upload_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=52 - ) + weeks_of_data=52*10 +) # %% CREATE META DATAFRAME (metadata) +''' +this is useful for keeping track of the type and amount of cleaning done +''' metadata = pd.DataFrame(index=[userid]) @@ -428,32 +1126,30 @@ def add_upload_time(df): metadata["nNegativeDurations"] = n_negative_durations # Tslim calibration bug fix -data, n_cal_readings = tslim_calibration_fix(data) +data, n_cal_readings = tslim_calibration_fix(data.copy()) metadata["nTandemAndPayloadCalReadings"] = n_cal_readings -# fix large timzoneOffset bug -data = timezone_offset_bug_fix(data) +# fix large timzoneOffset bug in utcbootstrapping +data = timezone_offset_bug_fix(data.copy()) + +# add healthkit timezome information +data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) # %% TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) -if "timezone" not in list(data): - data["timezone"] = np.nan - - +# add upload time to the data, which is needed for: +# getting rid of duplicates and useful for getting local time +data["uploadTime"] = add_upload_time(data[ + ["type", "uploadId", "utcTime"] +].copy()) -# estimate local time (simple method) -data["inferredTimezone"] = get_and_fill_timezone( - data[["timezone", "payload"]].copy() -) -# TODO: this really needs to be sped up AND/OR use complex version -data["localTime"] = get_local_time( - data[['utcTime', 'inferredTimezone']].copy() -) +# estimate local time (refactor of estimate-local-time.py) +data["localTime"], local_time_metadata = estimate_local_time(data.copy()) # round all data to the nearest 5 minutes -data["roundedTime"] = round_time( +data["roundedLocalTime"] = round_time( data["localTime"].copy(), time_interval_minutes=5, start_with_first_record=True, @@ -461,47 +1157,6 @@ def add_upload_time(df): ) # add upload time to the data, which is needed to get rid of duplicates -data["uploadTime"] = add_upload_time(data[ - ["type", "uploadId", "utcTime"] -].copy()) - - -# %% TIME CATEGORIES -contiguousDays = createContiguousDaySeries(data) - -# add the day of the localTime that starts at 12am -#data["day12AM"] = pd.DatetimeIndex(data["localTime"]).date -#data["day6AM"] = data["localTime"] - pd.Timedelta(6, unit="hours") -#data.sort_values("uploadTime", ascending=False, inplace=True) -# -## AGE, & YLW -#data["age"] = np.floor((data["localTime"] - bDate).dt.days/365.25).astype(int) -#data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int) - - -## group data by type -#if "uploadId" not in data: -# sys.exit( -# "Error: expected that uploadId is in data" -# ) -# -#type_groups = data.groupby("type") - -# %% CGM DATA - -#def removeInvalidCgmValues(df): -# -# nBefore = len(df) -# # remove values < 38 and > 402 mg/dL -# df = df.drop(df[((df.type == "cbg") & -# (df.value < 2.109284236597303))].index) -# df = df.drop(df[((df.type == "cbg") & -# (df.value > 22.314006924003046))].index) -# nRemoved = nBefore - len(df) -# -# return df, nRemoved - -# get rid of cgm values too low/high (< 38 & > 402 mg/dL) -#data, nInvalidCgmValues = removeInvalidCgmValues(data) -#metadata["nInvalidCgmValues"] = nInvalidCgmValues - +data["uploadTime"] = add_upload_time( + data[["type", "uploadId", "utcTime"]].copy() +) 
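A minimal usage sketch of two of the time helpers introduced above, assuming pandas and pytz are installed and that round_time() and get_timezone_offset() are defined as in get_cgm_stats.py; the timestamps, timezone, and expected values are illustrative only.

import pandas as pd

# 5-minute rounding: the grid is anchored on the rounded first record
toy_times = pd.Series(pd.to_datetime([
    "2019-07-29 12:01:10",
    "2019-07-29 12:06:40",
    "2019-07-29 12:12:30",
]))
rounded = round_time(
    toy_times.copy(),
    time_interval_minutes=5,
    start_with_first_record=True,
    return_calculation_columns=False,
)
# expected values: 2019-07-29 12:00:00, 12:05:00, 12:10:00

# timezone offset in minutes, on either side of the US daylight-saving switch
print(get_timezone_offset(pd.to_datetime("2019-01-15"), "US/Pacific"))  # -480 (PST)
print(get_timezone_offset(pd.to_datetime("2019-07-15"), "US/Pacific"))  # -420 (PDT)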
From 21ff818ae6c2b89bb5577db2e861524ddee6eabf Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 22:36:30 -0500 Subject: [PATCH 18/46] clean cgm data --- .../get_stats/get_cgm_stats.py | 100 +++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index be3dcecf..329e6b75 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -381,6 +381,48 @@ def add_upload_time(df): return df["uploadTime"].values +def remove_invalid_cgm_values(df): + + nBefore = len(df) + # remove values < 38 and > 402 mg/dL + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] > 402))].index) + nRemoved = nBefore - len(df) + + return df, nRemoved + + +def removeDuplicates(df, criteriaDF): + nBefore = len(df) + df = df.loc[~(df[criteriaDF].duplicated())] + df = df.reset_index(drop=True) + nDuplicatesRemoved = nBefore - len(df) + + return df, nDuplicatesRemoved + + +def removeCgmDuplicates(df, timeCriterion): + if timeCriterion in df: + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + dfIsNull = df[df[timeCriterion].isnull()] + dfNotNull = df[df[timeCriterion].notnull()] + dfNotNull, nDuplicatesRemoved = ( + removeDuplicates(dfNotNull, [timeCriterion, "value"]) + ) + df = pd.concat([dfIsNull, dfNotNull]) + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + else: + nDuplicatesRemoved = 0 + + return df, nDuplicatesRemoved + + # %% ESTIMATE LOCAL TIME FUNCTIONS def create_contiguous_day_series(df): first_day = df["date"].min() @@ -1097,7 +1139,7 @@ def estimate_local_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=52*10 + weeks_of_data=4 # 52*10 ) @@ -1160,3 +1202,59 @@ def estimate_local_time(df): data["uploadTime"] = add_upload_time( data[["type", "uploadId", "utcTime"]].copy() ) + +# %% TIME CATEGORIES +# AGE, & YLW +bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) +dDate = pd.to_datetime(donor_metadata["diagnosisDate"].values[0][0:7]) +data["age"] = np.floor((data["roundedLocalTime"] - bDate).dt.days/365.25) +data["ylw"] = np.floor((data["roundedLocalTime"] - dDate).dt.days/365.25) + +# hour of the day +data["hour"] = data["roundedLocalTime"].dt.hour + +# add the day of the localTime that starts at 12am +data["day12AM"] = data["roundedLocalTime"].dt.date +# NOTE: for day of week Monday = 0 and Sunday = 6 +data["dayofweek12AM"] = data["roundedLocalTime"].dt.dayofweek +data["weekend12AM"] = data["dayofweek12AM"] > 4 + +# day that starts at 6am +data["6amTime"] = data["roundedLocalTime"] - pd.Timedelta(6, unit="hours") +data["day6AM"] = data["6amTime"].dt.date +data["dayofweek6AM"] = data["6amTime"].dt.dayofweek +data["weekend6AM"] = data["dayofweek6AM"] > 4 + + +# %% GROUP DATA BY TYPE +# first sort by upload time (used when removing dumplicates) +data.sort_values("uploadTime", ascending=False, inplace=True) +groups = data.groupby(by="type") + + +# %% CGM DATA +# filter by cgm +cgm = groups.get_group("cbg").dropna(axis=1, how="all") + +# calculate cgm in mg/dL +cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL).astype(int) + +# get rid of cgm values too low/high (< 38 & > 402 mg/dL) +cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) 
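# Unit sanity check (illustrative): the 38 / 402 mg/dL bounds used by
# remove_invalid_cgm_values() above correspond to the mmol/L thresholds in
# the commented-out removeInvalidCgmValues() from an earlier commit:
#   38 / MGDL_PER_MMOLL  = 38 / 18.01559  ≈ 2.1093 mmol/L
#   402 / MGDL_PER_MMOLL = 402 / 18.01559 ≈ 22.3140 mmol/L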
+metadata["nInvalidCgmValues"] = nInvalidCgmValues + +# get rid of duplicates that have the same ["deviceTime", "value"] +cgm, n_cgm_dups_removed = (removeCgmDuplicates(cgm, "deviceTime")) +metadata["nCgmDuplicatesRemovedDeviceTime"] = n_cgm_dups_removed + +# get rid of duplicates that have the same ["time", "value"] +cgm, n_cgm_dups_removed = removeCgmDuplicates(cgm, "time") +metadata["nCgmDuplicatesRemovedUtcTime"] = n_cgm_dups_removed + +# get rid of duplicates that have the same "roundedTime" +cgm, n_cgm_dups_removed = removeDuplicates(cgm, "roundedLocalTime") +metadata["nCgmDuplicatesRemovedRoundedTime"] = n_cgm_dups_removed + + +# %% GET CGM STATS + From 8dd746518c0c4de508febc500603396c33ae8951 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 30 Jul 2019 11:41:43 -0500 Subject: [PATCH 19/46] get cgm 5 minute time series --- .../get_stats/get_cgm_stats.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 329e6b75..1f5f61b6 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -1139,7 +1139,7 @@ def estimate_local_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=4 # 52*10 + weeks_of_data=52*10 ) @@ -1198,10 +1198,6 @@ def estimate_local_time(df): return_calculation_columns=False ) -# add upload time to the data, which is needed to get rid of duplicates -data["uploadTime"] = add_upload_time( - data[["type", "uploadId", "utcTime"]].copy() -) # %% TIME CATEGORIES # AGE, & YLW @@ -1237,7 +1233,7 @@ def estimate_local_time(df): cgm = groups.get_group("cbg").dropna(axis=1, how="all") # calculate cgm in mg/dL -cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL).astype(int) +cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) # get rid of cgm values too low/high (< 38 & > 402 mg/dL) cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) @@ -1257,4 +1253,22 @@ def estimate_local_time(df): # %% GET CGM STATS +# create a contiguous 5 minute time series +first_day = cgm["roundedLocalTime"].min() +last_day = cgm["roundedLocalTime"].max() +rng = pd.date_range(first_day, last_day, freq="5min") +contiguous_data = ( + pd.DataFrame(rng, columns=["roundedLocalTime"]).sort_values( + "roundedLocalTime", ascending=False + ).reset_index(drop=True) +) + +# merge with cgm data +cgm_series = pd.merge( + contiguous_data, + cgm, + on="roundedLocalTime", + how="left" +) +#cgm_series["hourly.mean"] = cgm_series["mg/dL"].rolling(12).mean() From ef40d18538bf3c7922beba51c9d9191a5790f8f6 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 7 Aug 2019 14:08:17 -0500 Subject: [PATCH 20/46] add new functions that get embedded json data also refactor existing functions that used old get embedded data functions --- .../get_stats/get_cgm_stats.py | 81 ++++++++++++++++--- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 1f5f61b6..0fcc1c3c 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -76,17 +76,67 @@ def remove_negative_durations(df): return df, n_negative_durations -def expand_embedded_dict(df, field, key_): +def expand_embedded_dict(ts, key_): + '''Expanded a single field 
that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process ''' - this is new, should be refactored for speed as the current process - creates a dataframe of all of keys instead of just the key of interest + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? + # though, the current method is fairly quick and compact + temp_df = pd.DataFrame(ts[notnull_idx].tolist()) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." notation is used to reference nested json + ''' - if field in list(df): - notnull_idx = df[field].notnull() - temp_df = pd.DataFrame(df.loc[notnull_idx, field].tolist()) # TODO: this can be sped up by only getting the field key of interest - if key_ in list(temp_df): - df[field + "." 
+ key_] = temp_df[key_] - return df + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts def tslim_calibration_fix(df): @@ -101,9 +151,11 @@ def tslim_calibration_fix(df): ''' # expand payload field one level - df = expand_embedded_dict(df, "payload", "calibration_reading") + df["payload.calibration_reading"] = ( + expand_embedded_dict(df["payload"], "calibration_reading") + ) - if "payload.calibration_reading" in list(df): + if df["payload.calibration_reading"].notnull().sum() > 0: search_for = ['tan'] tandem_data_index = ( @@ -133,7 +185,12 @@ def tslim_calibration_fix(df): def get_healthkit_timezone(df): - df = expand_embedded_dict(df, "payload", "HKTimeZone") + ''' + TODO: refactor to account for more efficient way to get embedded json + ''' + df["payload.HKTimeZone"] = ( + expand_embedded_dict(df["payload"], "HKTimeZone") + ) if "timezone" not in list(df): if "payload.HKTimeZone" in list(df): hk_tz_idx = df["payload.HKTimeZone"].notnull() From dc9f19e19016d8946c51927b2dc3915278c1b3a3 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 7 Aug 2019 20:17:57 -0500 Subject: [PATCH 21/46] remove spike data --- .../get_stats/get_cgm_stats.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 0fcc1c3c..91bf2d9b 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -480,6 +480,27 @@ def removeCgmDuplicates(df, timeCriterion): return df, nDuplicatesRemoved +# get rid of spike data +def remove_spike_data(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + + df[spike_loc] = get_embedded_field(df["origin"], spike_loc) + spike_idx = df.loc[ + df[spike_loc].notnull(), + spike_loc + ].str.lower().str.contains("spike") + df.drop(df.iloc[np.where(spike_idx)[0]].index, inplace=True) + nRemoved = nBefore - len(df) + + return df, nRemoved + + # %% ESTIMATE LOCAL TIME FUNCTIONS def create_contiguous_day_series(df): first_day = df["date"].min() @@ -1292,6 +1313,10 @@ def estimate_local_time(df): # calculate cgm in mg/dL cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) +# get rid of spike data +cgm, nSpike = remove_spike_data(cgm) +metadata["nSpike"] = nSpike + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) metadata["nInvalidCgmValues"] = nInvalidCgmValues From be7177ef3e911284f64787489e25cc1d40af4e37 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 12 Aug 2019 10:20:14 -0500 Subject: [PATCH 22/46] make sure there is timezone information --- .../get_stats/get_cgm_stats.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 91bf2d9b..a36e70bc 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -566,22 +566,22 @@ def add_device_day_series(df, 
dfContDays, deviceTypeName): if "upload" in deviceTypeName: if "timezone" in df: -# if dfDayGroups.timezone.count().values[0] > 0: # NOT SURE WHY THIS IS HERE - dfDaySeries["timezone"] = ( - dfDayGroups.timezone.describe()["top"] - ) - # get the timezone offset for the timezone - for i in dfDaySeries.index: - if pd.notnull(dfDaySeries.loc[i, "timezone"]): - tzo = get_timezone_offset( - pd.to_datetime(i), - dfDaySeries.loc[i, "timezone"]) - dfDaySeries.loc[i, ["timezoneOffset"]] = tzo - if "timeProcessing" in dfDaySeries: - dfDaySeries["timeProcessing"] = \ - dfDayGroups.timeProcessing.describe()["top"] - else: - dfDaySeries["timeProcessing"] = np.nan + if dfDayGroups.timezone.count().max() > 0: + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \ rename(columns={deviceTypeName + ".date": "date"}) From 07b211e77f1578fe65029a2d75c794158ffc8c26 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 12 Aug 2019 19:06:18 -0500 Subject: [PATCH 23/46] refactor remove spike data --- .../get_stats/get_cgm_stats.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index a36e70bc..172f4784 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -482,21 +482,26 @@ def removeCgmDuplicates(df, timeCriterion): # get rid of spike data def remove_spike_data(df): - nBefore = len(df) - spike_locations = [ - "origin.payload.device.name", - "origin.payload.device.manufacturer", - "origin.payload.sourceRevision.source.name", - ] - for spike_loc in spike_locations: - - df[spike_loc] = get_embedded_field(df["origin"], spike_loc) - spike_idx = df.loc[ - df[spike_loc].notnull(), - spike_loc - ].str.lower().str.contains("spike") - df.drop(df.iloc[np.where(spike_idx)[0]].index, inplace=True) - nRemoved = nBefore - len(df) + if "origin" in list(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + + df[spike_loc] = get_embedded_field(df["origin"], spike_loc) + spike_idx = df.loc[ + df[spike_loc].notnull(), + spike_loc + ].astype(str).str.lower().str.contains("spike") + + df.drop((spike_idx == True).index, inplace=True) + nRemoved = nBefore - len(df) + + else: + nRemoved = np.nan return df, nRemoved From a240bd7625ae43d8c394a7c5990ff5e7863ba341 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Fri, 16 Aug 2019 12:18:24 -0500 Subject: [PATCH 24/46] wip cgm distributions --- .gitignore | 2 + .../get_stats/get_cgm_distributions_v3.py | 1777 +++++++++++++++++ 2 files changed, 1779 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py diff --git a/.gitignore b/.gitignore index 0c1ca188..f4cf204c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ 
projects/loop-algorithm/figures/ projects/parsers/output/ projects/get-donors-pump-settings/temp-plot\.html + +projects/bigdata-processing-pipeline/get_stats/debug/ diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py new file mode 100644 index 00000000..5e670608 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -0,0 +1,1777 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm statsistics for a single tidepool (donor) dataset +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt +import ast +import pdb +# TODO: figure out how to get rid of these path dependcies +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +import environmentalVariables +from get_donor_data.get_single_donor_metadata import get_shared_metadata +from get_donor_data.get_single_tidepool_dataset import get_data + + +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def get_slope(y): + if "array" not in type(y).__name__: + raise TypeError('Expecting a numpy array') + + count_ = len(y) + + x = np.arange(start=0, stop=count_*5, step=5) + + sum_x = x.sum() + sum_y = y.sum() + sum_xy = (x * y).sum() + sum_x_squared = (x * x).sum() + + slope = ( + ((count_ * sum_xy) - (sum_x * sum_y)) + / ((count_ * sum_x_squared) - (sum_x * sum_x)) + ) + + return slope + + +def expand_entire_dict(ts): + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + notnull_idx = ts.index[ts.notnull()] + temp_df = pd.DataFrame( + ts[notnull_idx].tolist(), + index=notnull_idx + ) + + return temp_df + + +def expand_embedded_dict(ts, key_): + '''Expanded a single field that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process + ''' + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? + # though, the current method is fairly quick and compact + temp_df = expand_entire_dict(ts) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." 
notation is used to reference nested json + + ''' + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts + + +def add_upload_info_to_cgm_records(groups, df): + upload_locations = [ + "uploadId", + "deviceManufacturers", + "deviceModel", + "deviceSerialNumber", + "deviceTags" + ] + + if "upload" in groups["type"].unique(): + upload = groups.get_group("upload").dropna(axis=1, how="all") + df = pd.merge( + left=df, + right=upload[list(set(upload_locations) & set(list(upload)))], + on="uploadId", + how="left" + ) + + return df + + +def expand_heathkit_cgm_fields(df): + healthkit_locations = [ + "origin", + "origin.payload", + "origin.payload.device", + "origin.payload.sourceRevision", + "origin.payload.sourceRevision.source", + "payload", + ] + + for hk_loc in healthkit_locations: + if hk_loc in list(df): + temp_df = ( + expand_entire_dict(df[hk_loc].copy()).add_prefix(hk_loc + ".") + ) + df = pd.concat([df, temp_df], axis=1) + + return df + + +def get_dexcom_cgm_model(df): + # add cgm model + # put this list in order of precedence when choosing sensor version + # NOTE: there is an edge case where "origin.payload.device.model" = G5/G6, + # which can be eliminated by getting model from HKMetadataKeySyncIdentifier + dexcom_model_locations = [ + "deviceId", + "deviceManufacturers", + "deviceModel", + "deviceSerialNumber", + "payload.HKMetadataKeySyncIdentifier", # do this before "origin.payload.device.model" bc there is an edge case + "origin.payload.device.model", + "origin.payload.sourceRevision.source.name", + "payload.transmitterGeneration", + "payload.transmitterId", + ] + + for model_location in dexcom_model_locations: + if model_location in list(df): + # only consider cells where the model location is not null + notnull_idx = df[model_location].notnull() + if notnull_idx.sum() > 0: + for dex_model in ["G4", "G5", "G6"]: + # define a pandas stringMethod + str_list = df[model_location].astype(str).str + # if model has already been determined, then skip + missing_model_idx = df["cgmModel"].isnull() + # get index that matches model + model_idx = str_list.upper().str.contains(dex_model) + + m_idx = ( + missing_model_idx & notnull_idx & model_idx + ) + df.loc[m_idx, "cgmModel"] = dex_model + + # case of "payload.transmitterId" + if ( + ("payload.transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # get string length (need 5 digits for G4 and 6 for G5, G6) + if "G4" in dex_model: + model_idx = str_list.len() == 5 + elif "G5" in dex_model: + model_idx = str_list.startswith("4") + elif "G6" in dex_model: + model_idx = ( + (str_list.startswith("8")) + | (str_list.startswith("2")) + ) + m_idx = ( + missing_model_idx & notnull_idx & model_idx + ) + df.loc[m_idx, "cgmModel"] = dex_model + + return df["cgmModel"] + + +def get_non_dexcom_cgm_model(df): + # non-dexcom cgm model query + model_locations = ["deviceId"] + models_670G = "MMT-158|MMT-178" + models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" + models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" + models_530G = ( + "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754" + ) + models_523_723 = "MedT-523|MedT-723|Revel - 523|Revel - 723" # 523/723 + models_libre = "AbbottFreeStyleLibre" + models_animas = "IR1295" + # NOTE: the tandem G4 will first be written as 
G5_G6, + # but the logic should overwrite back to G4 + models_tandem_G5_G6 = "tandem" + models_tandem_G4 = "4628003|5448003" + + non_dex_models = [ + models_670G, models_640G, models_630G, models_530G, models_523_723, + models_libre, models_animas, models_tandem_G5_G6, models_tandem_G4 + ] + + non_dex_model_names = [ + "670G", "640G", "630G", "530G", "523_723", + "LIBRE", "G4", "G5_G6", "G4" + ] + + for model_loc in model_locations: + if model_loc in list(df): + # only consider cells where the model location is not null + # and we are missing a cgm model + notnull_idx = df[model_loc].notnull() + if notnull_idx.sum() > 0: + missing_model_idx = df["cgmModel"].isnull() + if missing_model_idx.sum() > 0: + # define a pandas stringMethod + str_list = df[model_loc].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): + model_idx = str_list.contains(non_dex_model) + m_idx = (missing_model_idx & notnull_idx & model_idx) + df.loc[m_idx, "cgmModel"] = model_name + + return df["cgmModel"] + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + TODO: I think that durations are coming in as floats too, so we need + to refactor to account for that. 
+ ''' + if "duration" in list(df): + type_ = df["duration"].apply(get_type) + valid_index = ((type_ == "int") & (df["duration"].notnull())) + n_negative_durations = sum(df.loc[valid_index, "duration"] < 0) + if n_negative_durations > 0: + df = df[~(df.loc[valid_index, "duration"] < 0)] + else: + n_negative_durations = np.nan + + return df, n_negative_durations + + + + + +def tslim_calibration_fix(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored to only expand one field + ''' + + # expand payload field one level + if "payload" in list(df): + df["payload.calibration_reading"] = ( + expand_embedded_dict(df["payload"], "calibration_reading") + ) + + if df["payload.calibration_reading"].notnull().sum() > 0: + + search_for = ['tan'] + tandem_data_index = ( + (df["deviceId"].str.contains('|'.join(search_for))) + & (df["type"] == "deviceEvent") + ) + + cal_index = df["payload.calibration_reading"].notnull() + valid_index = tandem_data_index & cal_index + + n_cal_readings = sum(valid_index) + + if n_cal_readings > 0: + # if reading is > 30 then it is in the wrong units + if df["payload.calibration_reading"].min() > 30: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + / MGDL_PER_MMOLL + ) + else: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + ) + else: + n_cal_readings = 0 + else: + n_cal_readings = 0 + return df, n_cal_readings + + +def replace_smoothed_cgm_values(df): + + if 'payload.realTimeValue' in list(df): + raw_val_idx = df['payload.realTimeValue'].notnull() + n_replaced = raw_val_idx.sum() + df.loc[raw_val_idx, "mg/dL"] = ( + df.loc[raw_val_idx, "payload.realTimeValue"] + ) + else: + n_replaced = np.nan + + raw_values = df["mg/dL"] + + return raw_values, n_replaced + + +def get_healthkit_timezone(df): + ''' + TODO: refactor to account for more efficient way to get embedded json + ''' + if "payload" in list(df): + df["payload.HKTimeZone"] = ( + expand_embedded_dict(df["payload"], "HKTimeZone") + ) + if "timezone" not in list(df): + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + else: + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "timezone"] = ( + df.loc[hk_tz_idx, "payload.HKTimeZone"] + ) + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + return df[["timezone", "deviceType"]] + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = get_healthkit_timezone(df) + + df["timezone"].fillna(method='ffill', inplace=True) + df["timezone"].fillna(method='bfill', inplace=True) + + return df["timezone"] + + +def make_tz_unaware(date_time): + return date_time.replace(tzinfo=None) + + +def to_utc_datetime(df): + ''' + this is new to deal with perfomance issue with the previous method + of converting to string to datetime with pd.to_datetime() + ''' + 
utc_time_tz_aware = pd.to_datetime( + df["time"], + format="%Y-%m-%dT%H:%M:%S", + utc=True + ) + utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) + + return utc_tz_unaware + + +# apply the large timezone offset correction (AKA Darin's fix) +def timezone_offset_bug_fix(df): + ''' + this is taken from estimate-local-time.py + TODO: add in unit testing where there is no TZP that is > 840 or < -720 + ''' + + if "timezoneOffset" in list(df): + + while ((df.timezoneOffset > 840).sum() > 0): + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] + - (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 + ) + + while ((df.timezoneOffset < -720).sum() > 0): + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] + + (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 + ) + + return df + + +def get_local_time(df): + + tzo = df[['utcTime', 'inferredTimezone']].apply( + lambda x: get_timezone_offset(*x), axis=1 + ) + local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") + + return local_time + + +def round_time( + df, + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +): + ''' + A general purpose round time function that rounds the "time" + field to nearest minutes + INPUTS: + * a dataframe (df) or time series that contains only one time field + that you want to round + * time_interval_minutes (defaults to 5 minutes given that most cgms + output every 5 minutes) + * start_with_first_record starts the rounding with the first record + if True, and the last record if False (defaults to True) + * return_calculation_columns specifies whether the extra columns + used to make calculations are returned + refactored name(s) to meet style guide + ''' + # if a time series is passed in, convert to dataframe + if "Series" in get_type(df): + df = pd.DataFrame(df) + columns_ = list(df) + if len(columns_) > 1: + sys.exit( + "Error: df should only have one time column" + ) + else: + df.rename(columns={columns_[0]: "t"}, inplace=True) + + df.sort_values( + by="t", + ascending=start_with_first_record, + inplace=True + ) + + df.reset_index(drop=False, inplace=True) + df.rename(columns={"index": "originalIndex"}, inplace=True) + + # calculate the time between consecutive records + df["t_shift"] = df["t"].shift(1) + df["timeBetweenRecords"] = round( + (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) + + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) + ) * time_interval_minutes + + # separate the data into chunks if timeBetweenRecords is greater than + # 2 times the minutes so the rounding process + # starts over + big_gaps = list( + df.query("abs(timeBetweenRecords) > " + + str(time_interval_minutes * 2)).index + ) + big_gaps.insert(0, 0) + big_gaps.append(len(df)) + + for gap_index in range(0, len(big_gaps) - 1): + chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] + first_chunk = df["t"][big_gaps[gap_index]] + + # calculate the time difference between + # each time record and the first record + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] = ( + (chunk - first_chunk).dt.days*(86400/60) + + (chunk - first_chunk).dt.seconds/60 + ) + + # then round to the nearest X Minutes + # 
NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up. + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ] = round( + (df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] / time_interval_minutes) + 0.000001 + ) * (time_interval_minutes) + + rounded_first_record = ( + first_chunk + pd.Timedelta("1microseconds") + ).round(str(time_interval_minutes) + "min") + + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedTime" + ] = rounded_first_record + pd.to_timedelta( + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ], unit="m" + ) + + if return_calculation_columns is False: + df.drop( + columns=[ + "timeBetweenRecords", + "minutesFromFirstRecord", + "roundedMinutesFromFirstRecord" + ], inplace=True + ) + # sort back to the original index + df.sort_values(by="originalIndex", inplace=True) + + return df["roundedTime"].values + + +def add_upload_time(df): + ''' + this is taken from a colab notebook that is not in our github + given that it has been refactored to account for bug where there are + no upload records + NOTE: this is a new fix introduced with healthkit data...we now have + data that does not have an upload record + + ''' + + if "upload" in df.type.unique(): + upload_times = pd.DataFrame( + df[df.type == "upload"].groupby("uploadId")["utcTime"].max() + ) + else: + upload_times = pd.DataFrame(columns=["utcTime"]) + + unique_uploadIds = set(df["uploadId"].unique()) + unique_uploadRecords = set( + df.loc[df["type"] == "upload", "uploadId"].unique() + ) + uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords + + for upId in uploadIds_missing_uploadRecords: + last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() + upload_times.loc[upId, "utcTime"] = last_upload_time + + upload_times.reset_index(inplace=True) + upload_times.rename( + columns={"utcTime": "uploadTime", + "index": "uploadId"}, + inplace=True + ) + + df = pd.merge(df, upload_times, how='left', on='uploadId') + + return df["uploadTime"].values + + +def remove_invalid_cgm_values(df): + + nBefore = len(df) + # remove values < 38 and > 402 mg/dL + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] > 402))].index) + nRemoved = nBefore - len(df) + + return df, nRemoved + + +def removeDuplicates(df, criteriaDF): + nBefore = len(df) + df = df.loc[~(df[criteriaDF].duplicated())] + df = df.reset_index(drop=True) + nDuplicatesRemoved = nBefore - len(df) + + return df, nDuplicatesRemoved + + +def removeCgmDuplicates(df, timeCriterion, valueCriterion="value"): + if timeCriterion in df: + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + dfIsNull = df[df[timeCriterion].isnull()] + dfNotNull = df[df[timeCriterion].notnull()] + dfNotNull, nDuplicatesRemoved = ( + removeDuplicates(dfNotNull, [timeCriterion, valueCriterion]) + ) + df = pd.concat([dfIsNull, dfNotNull]) + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + else: + nDuplicatesRemoved = 0 + + return df, nDuplicatesRemoved + + +# get rid of spike data +def remove_spike_data(df): + if "origin" in list(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + df[spike_loc] = get_embedded_field(df["origin"], 
spike_loc) + + spike_idx = df.loc[ + df[spike_loc].notnull(), + spike_loc + ].astype(str).str.lower().str.contains("spike") + + df.drop((spike_idx == True).index, inplace=True) + + nRemoved = nBefore - len(df) + + else: + nRemoved = np.nan + + return df, nRemoved + + +# %% ESTIMATE LOCAL TIME FUNCTIONS +def convert_deprecated_timezone_to_alias(df, tzAlias): + if "timezone" in df: + uniqueTimezones = df.timezone.unique() + uniqueTimezones = uniqueTimezones[pd.notnull(df.timezone.unique())] + + for uniqueTimezone in uniqueTimezones: + alias = tzAlias.loc[tzAlias.tz.str.endswith(uniqueTimezone), + ["alias"]].values + if len(alias) == 1: + df.loc[df.timezone == uniqueTimezone, ["timezone"]] = alias + + return df + + +def create_contiguous_day_series(df): + first_day = df["date"].min() + last_day = df["date"].max() + rng = pd.date_range(first_day, last_day).date + contiguousDaySeries = \ + pd.DataFrame(rng, columns=["date"]).sort_values( + "date", ascending=False).reset_index(drop=True) + + return contiguousDaySeries + + +def add_device_type(df): + col_headings = list(df) + if "deviceType" not in col_headings: + df["deviceType"] = np.nan + if "deviceTags" in col_headings: + # first make sure deviceTag is in string format + df["deviceTags"] = df.deviceTags.astype(str) + # filter by type not null device tags + ud = df[df["deviceTags"].notnull()].copy() + # define a device type (e.g., pump, cgm, or healthkit) + ud.loc[ + ((ud["deviceTags"].str.contains("pump")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "pump" + + # define a device type (e.g., cgm) + ud.loc[ + ((ud["deviceTags"].str.contains("cgm")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "cgm" + + return ud["deviceType"] + else: + return np.nan + + +def get_timezone_offset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def add_device_day_series(df, dfContDays, deviceTypeName): + if len(df) > 0: + dfDayGroups = df.groupby("date") + if "timezoneOffset" in df: + dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) + else: + dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) + dfDaySeries.index.name = "date" + + if "upload" in deviceTypeName: + if (("timezone" in df) & (df["timezone"].notnull().sum() > 0)): + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan + + + dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). 
\ + rename(columns={deviceTypeName + ".date": "date"}) + + dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), + on="date", how="left") + + else: + dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan + + return dfContDays + + +def impute_upload_records(df, contDays, deviceTypeName): + daySeries = \ + add_device_day_series(df, contDays, deviceTypeName) + + if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): + for i in daySeries.index[1:]: + if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): + daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( + daySeries.loc[i-1, deviceTypeName + ".timezone"] + ) + if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): + tz = daySeries.loc[i, deviceTypeName + ".timezone"] + tzo = get_timezone_offset( + pd.to_datetime(daySeries.loc[i, "date"]), + tz + ) + daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo + + if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): + daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ + daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] + + else: + daySeries[deviceTypeName + ".timezone"] = np.nan + daySeries[deviceTypeName + ".timeProcessing"] = np.nan + + return daySeries + + +def add_home_timezone(df, contDays): + + if (("timezone" in df) & (df["timezone"].notnull().sum()> 0)): + homeTimezone = df["timezone"].describe()["top"] + tzo = contDays.date.apply( + lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) + + contDays["home.imputed.timezoneOffset"] = tzo + contDays["home.imputed.timezone"] = homeTimezone + + else: + contDays["home.imputed.timezoneOffset"] = np.nan + contDays["home.imputed.timezone"] = np.nan + contDays["home.imputed.timeProcessing"] = np.nan + + return contDays + + +def estimateTzAndTzoWithUploadRecords(cDF): + + cDF["est.type"] = np.nan + cDF["est.gapSize"] = np.nan + cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] + cDF["est.annotations"] = np.nan + + if "upload.timezone" in cDF: + cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" + cDF["est.timezone"] = cDF["upload.timezone"] + cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] + else: + cDF["est.timezone"] = np.nan + cDF["est.timeProcessing"] = np.nan + + cDF.loc[((cDF["est.timezoneOffset"] != + cDF["home.imputed.timezoneOffset"]) & + (pd.notnull(cDF["est.timezoneOffset"]))), + "est.annotations"] = "travel" + + return cDF + + +def assignTzoFromImputedSeries(df, i, imputedSeries): + df.loc[i, ["est.type"]] = "DEVICE" + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, imputedSeries + ".timezoneOffset"] + + df.loc[i, ["est.timezone"]] = \ + df.loc[i, imputedSeries + ".timezone"] + + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, imputedSeries + ".timeProcessing"] + + return df + + +def compareDeviceTzoToImputedSeries(df, sIdx, device): + for i in sIdx: + # if the device tzo = imputed tzo, then chose the imputed tz and tzo + # note, dst is accounted for in the imputed tzo + for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", + "healthkit.upload.imputed", "home.imputed"]: + # if the estimate has not already been made + if pd.isnull(df.loc[i, "est.timezone"]): + + if df.loc[i, device + ".timezoneOffset"] == \ + df.loc[i, imputedSeries + ".timezoneOffset"]: + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, + "tz-inferred-from-" + imputedSeries) + + # if the imputed series has a timezone estimate, then see if + # the current day is a dst change day + elif (pd.notnull(df.loc[i, imputedSeries + 
".timezone"])): + imputedTimezone = df.loc[i, imputedSeries + ".timezone"] + if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): + + dstRange = getRangeOfTZOsForTimezone(imputedTimezone) + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, "dst-change-day") + df = addAnnotation( + df, i, "tz-inferred-from-" + imputedSeries) + + return df + + +def estimateTzAndTzoWithDeviceRecords(cDF): + + # 2A. use the TZO of the pump or cgm device if it exists on a given day. In + # addition, compare the TZO to one of the imputed day series (i.e., the + # upload and home series to see if the TZ can be inferred) + for deviceType in ["pump", "cgm"]: + # find the indices of days where a TZO estimate has not been made AND + # where the device (e.g., pump or cgm) TZO has data + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + # compare the device TZO to the imputed series to infer time zone + cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) + + # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be + # inferred from the previous day's TZO. If the device TZO is equal to the + # previous day's TZO, AND if the previous day has a TZ estimate, use the + # previous day's TZ estimate for the current day's TZ estimate + for deviceType in ["pump", "cgm"]: + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + + cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) + + # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the + # pump and cgm tzo do not differ by more than 60 minutes. If they differ + # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we + # allow the estimates to be off by 60 minutes as there are a lot of cases + # where the devices are off because the user changes the time for DST, + # at different times + sIndices = cDF[((cDF["est.type"] == "DEVICE") & + (cDF["pump.timezoneOffset"].notnull()) & + (cDF["cgm.timezoneOffset"].notnull()) & + (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) + )].index + + tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - + cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 + + idx = tzoDiffGT60.index[tzoDiffGT60] + + cDF.loc[idx, ["est.type"]] = "UNCERTAIN" + for i in idx: + cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") + + return cDF + + +def imputeTzAndTzo(cDF): + + sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + if len(hasTzoIndices) > 0: + if len(sIndices) > 0: + lastDay = max(sIndices) + + while ((sIndices.min() < max(hasTzoIndices)) & + (len(sIndices) > 0)): + + currentDay, prevDayWithDay, nextDayIdx = \ + getImputIndices(cDF, sIndices, hasTzoIndices) + + cDF = imputeByTimezone(cDF, currentDay, + prevDayWithDay, nextDayIdx) + + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (~cDF["est.annotations"].str.contains( + "unable-to-impute-tzo").fillna(False)))].index + + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + + # try to impute to the last day (earliest day) in the dataset + # if the last record has a timezone that is the home record, then + # impute using the home timezone + if len(sIndices) > 0: + currentDay = min(sIndices) + prevDayWithDay = currentDay - 1 + gapSize = lastDay - currentDay + + for i in range(currentDay, lastDay + 1): + if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ + cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = localTime + df.loc[dIdx, ["est.timezoneOffset"]] = tzo + return df + + +def applyLocalTimeEstimates(df, cDF): + df = pd.merge(df, cDF, how="left", on="date") + df["est.localTime"] = \ + df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") + + df = correctEstimatesAroundDst(df, cDF) + + return df["est.localTime"].values + + +def isDSTChangeDay(currentDate, currentTimezone): + tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), + currentTimezone) + tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + + dt.timedelta(days=-1), currentTimezone) + + return (tzoCurrentDay != tzoPreviousDay) + + +def tzoRangeWithComparisonTz(df, i, comparisonTz): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + if pd.notnull(comparisonTz): + rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) + else: + comparisonTz = np.nan + rangeTzos = np.array([]) + + return rangeTzos + + +def tzAndTzoRangePreviousDay(df, i): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + comparisonTz = df.loc[i-1, "est.timezone"] + + rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) + + return comparisonTz, rangeTzos + + +def assignTzoFromPreviousDay(df, i, previousDayTz): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezone"]] = previousDayTz + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) + + df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] + df = addAnnotation(df, i, "tz-inferred-from-prev-day") + + return df + + +def assignTzoFromDeviceTzo(df, i, device): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + df = addAnnotation(df, i, "likely-travel") + df = addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. + if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i, "home.imputed.timezoneOffset"] == + min(dstRange)) | + (df.loc[i, "home.imputed.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + + else: + df = assignTzoFromDeviceTzo(df, i, device) + + return df + + +def getImputIndices(df, sIdx, hIdx): + + lastDayIdx = len(df) - 1 + + currentDayIdx = sIdx.min() + tempList = pd.Series(hIdx) - currentDayIdx + prevDayIdx = currentDayIdx - 1 + nextDayIdx = \ + min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) + + return currentDayIdx, prevDayIdx, nextDayIdx + + +def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): + + gapSize = (nextDaywData - currentDay) + + if prevDaywData >= 0: + + if df.loc[prevDaywData, "est.timezone"] == \ + df.loc[nextDaywData, "est.timezone"]: + + tz = df.loc[prevDaywData, "est.timezone"] + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezone"]] = tz + + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + # TODO: this logic should be updated to handle the edge case + # where the day before and after the gap have differing TZ, but + # the same TZO. 
In that case the gap should be marked as UNCERTAIN + elif df.loc[prevDaywData, "est.timezoneOffset"] == \ + df.loc[nextDaywData, "est.timezoneOffset"]: + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[prevDaywData, "est.timezoneOffset"] + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + return df + + +def addAnnotation(df, idx, annotationMessage): + if pd.notnull(df.loc[idx, "est.annotations"]): + df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ + ", " + annotationMessage + else: + df.loc[idx, ["est.annotations"]] = annotationMessage + + return df + + +def getTimezoneOffset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def estimate_local_time(df): + df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later + contiguous_days = create_contiguous_day_series(df) + + df["deviceType"] = add_device_type(df) + cDays = add_device_day_series(df, contiguous_days, "upload") + + # create day series for cgm df + if "timezoneOffset" not in list(df): + df["timezoneOffset"] = np.nan + + cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + +# %% START OF CODE +all_metadata = pd.DataFrame() + +timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False +) + + +# %% GET DATA FROM API +#''' +#get metadata and data for a donor that has shared with bigdata +#NOTE: functions assume you have an .env with bigdata account credentials +#''' +# +#userid = "" +#donor_group = "" +# +#donor_metadata, _ = get_shared_metadata( +# donor_group=donor_group, +# userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid +#) +#data, _ = get_data( +# donor_group=donor_group, +# userid=userid, +# weeks_of_data=52*10 +#) +# +## this is a dummy loop +#for i in [0]: + + +# %% GET DATA FROM JSON FILE + +data_path = os.path.join("..", "data") +all_donor_metadata = pd.read_csv( + os.path.join( + data_path, + "PHI-2019-07-17-donor-metadata.csv"), + low_memory=False +) + +# glob through the json files that are available +import glob +all_files = glob.glob(os.path.join( + data_path, + "PHI-2019-07-17-json-data", + "*.json" +)) + +# %% +for d_idx in range(0, len(all_files)): + data = pd.read_json(all_files[d_idx]) + userid = all_files[d_idx][-15:-5] + donor_metadata = all_donor_metadata[ + all_donor_metadata["userid"] == userid + ] + print("\n", "starting", userid) + + # CREATE META DATAFRAME (metadata) + ''' + this is useful for keeping track of the type and amount of cleaning done + ''' + metadata = pd.DataFrame(index=[userid]) + + # HASH USER ID + hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) + data["userid"] = userid + data["hashid"] = hashid + + + # CLEAN DATA + data_fields = list(data) + + # NOTE: moving remove negative durations to type specific cleaning + # TODO: ask backend to change "duration" field to only include one object type + + # Tslim calibration bug fix + data, n_cal_readings = tslim_calibration_fix(data.copy()) + metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + # fix large timzoneOffset bug in utcbootstrapping + data = timezone_offset_bug_fix(data.copy()) + + # add healthkit timezome information + # TODO: refactor this function to only require fields that might have hk tz + data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) + + # convert deprecated timezones to their aliases + data = convert_deprecated_timezone_to_alias(data, timezone_aliases) + + + # TIME RELATED ITEMS + data["utcTime"] = to_utc_datetime(data[["time"]].copy()) + + # add upload time to the data, which is needed for: + # getting rid of duplicates and useful for getting local time + + data["uploadTime"] = ( + add_upload_time(data[["type", "uploadId", "utcTime"]].copy()) + ) + +# # estimate local time (refactor of estimate-local-time.py) +# data["localTime"], local_time_metadata = estimate_local_time(data.copy()) + + # TODO: fix this issue with estimate local time + ''' + //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649: + FutureWarning: elementwise comparison failed; returning scalar instead, + but in the future will perform elementwise comparison result = method(y) + ''' + + # round all data to the nearest 5 minutes + data["roundedUtcTime"] = round_time( + data["utcTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False + ) + + + # TIME CATEGORIES + # AGE, & YLW + bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) + if 
data["roundedUtcTime"].notnull().sum() == 0: + data["age"] = np.nan + else: + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + + + # GROUP DATA BY TYPE + # first sort by upload time (used when removing dumplicates) + data.sort_values("uploadTime", ascending=False, inplace=True) + groups = data.groupby(by="type") + + + # CGM DATA + if "cbg" in data["type"].unique(): + # filter by cgm + cgm = groups.get_group("cbg").dropna(axis=1, how="all") + + # calculate cgm in mg/dL + cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) + + # get rid of spike data + cgm, nSpike = remove_spike_data(cgm.copy()) + metadata["nSpike"] = nSpike + + # TODO: refactor (above) so you don't need to drop columns + drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(cgm): + cgm.drop(columns=[drop_col], inplace=True) + + # assign upload cgm device info to cgm records in that upload + cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) + + # check to see if cgm info exists in healthkit locations + cgm = expand_heathkit_cgm_fields(cgm.copy()) + + # replace smoothed cgm values with raw values (if they exist) + # this must run after expand_heathkit_cgm_fields _ + cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = ( + replace_smoothed_cgm_values(cgm.copy()) + ) + + # get cgm models + cgm["cgmModel"] = np.nan + + # dexcom cgm models (G4, G5, G6) + cgm["cgmModel"] = get_dexcom_cgm_model(cgm.copy()) + + # for non dexcom cgms + # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem + cgm["cgmModel"] = get_non_dexcom_cgm_model(cgm.copy()) + + # get metadata on cgm models and devices + metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() + metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique()) + if "deviceId" in list(cgm): + metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) + + # %% clean distributions + # break up all traces by cgm model + all_cgm_series = pd.DataFrame() + cgm_models = cgm.groupby(by="cgmModel") +# for cgm_model in cgm["cgmModel"].unique(): + for cgm_model in cgm_models.groups.keys(): + print(cgm_model) + temp_cgm = cgm_models.get_group(cgm_model) + + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) + temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm) + metadata["nInvalidCgmValues." + cgm_model] = nInvalidCgmValues + + # sort by upload time before getting rid of duplicates + temp_cgm.sort_values("uploadTime", ascending=False, inplace=True) + + # get rid of duplicates that have the same ["deviceTime", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same ["time", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same roundedTime + temp_cgm, n_cgm_dups_removed = ( + removeDuplicates(temp_cgm, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime." 
+ cgm_model] = ( + n_cgm_dups_removed + ) + + # create a contiguous 5 minute time series + first_day = temp_cgm["roundedUtcTime"].min() + last_day = temp_cgm["roundedUtcTime"].max() + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=False + ).reset_index(drop=True) + + # merge with cgm data + cgm_series = pd.merge( + contiguous_data, + temp_cgm[ + ["roundedUtcTime", "hashid", "cgmModel", "age", "mg/dL"] + ], + on="roundedUtcTime", + how="left" + ) + # + cgm_series.sort_values( + "roundedUtcTime", ascending=True, inplace=True + ) + cgm_series.reset_index(drop=True, inplace=True) + + # get dexcom icgm bins + value_bins = np.array( + [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403] + ) + value_bin_names = ( + "< 40", "40-60", "61-80", "81-120", "121-160", "161-200", + "201-250", "251-300", "301-350", "351-400", "> 400" + ) + cgm_series["valueBin"] = pd.cut( + cgm_series["mg/dL"], value_bins, labels=value_bin_names + ) + + + # get the previous val + cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) + + # get difference between current and previous val + cgm_series["diffFromPrevVal"] = ( + cgm_series["mg/dL"] - cgm_series["previousVal"] + ) + + # calculate the rate from previous value (mg/dL/min) + cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5 + + # get dexcom icgm rate bins + rate_bins = np.array( + [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100] + ) + # NOTE: bracket means include, parentheses means exclude + rate_bin_names = ( + "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2", + ) + cgm_series["rateBin"] = pd.cut( + cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names + ) + + # through in the join category + cgm_series["valAndRateBin"] = ( + cgm_series["valueBin"].astype(str) + + " & " + + cgm_series["rateBin"].astype(str) + ) + + # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes + cgm_series["slope15"] = ( + cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True) + ) + + cgm_series["slope30"] = ( + cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True) + ) + + cgm_series["slope60"] = ( + cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True) + ) + + # add in the next value + cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1) + + # get difference or relative increase/decrease of next value + cgm_series["relativeNextValue"] = ( + cgm_series["nextVal"] - cgm_series["mg/dL"] + ) + + # rate of next value + cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5 + + # drop rows where there is no information + cgm_series.dropna(subset=['hashid'], inplace=True) + metadata["nCgmDataPoints." 
+ cgm_model] = len(cgm_series) + + # append cgm model to a larger table + all_cgm_series = pd.concat( + [all_cgm_series, cgm_series], + ignore_index=True + ) + + + # %% END OF CODE + print(metadata.T) + + else: + print(d_idx, "no cgm data") + + # combine metadata + all_metadata = pd.concat([all_metadata, metadata], sort=False) + print("finished", d_idx, userid) + + + From 890bac8d3a796d5038f8e6c868cbde7408a7e0e6 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 07:42:34 -0500 Subject: [PATCH 25/46] save output --- .../get_stats/get_cgm_distributions_v3.py | 61 ++++-- .../wikipedia-timezone-aliases-2018-04-28.csv | 206 ++++++++++++++++++ 2 files changed, 248 insertions(+), 19 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 5e670608..9f2bee79 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -13,7 +13,7 @@ import numpy as np import pandas as pd import datetime as dt -import ast +import glob import pdb # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( @@ -24,7 +24,7 @@ import environmentalVariables from get_donor_data.get_single_donor_metadata import get_shared_metadata from get_donor_data.get_single_tidepool_dataset import get_data - +from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist # %% CONSTANTS MGDL_PER_MMOLL = 18.01559 @@ -1465,22 +1465,39 @@ def estimate_local_time(df): # %% GET DATA FROM JSON FILE - data_path = os.path.join("..", "data") all_donor_metadata = pd.read_csv( os.path.join( data_path, + "PHI-2019-07-17-donor-data", "PHI-2019-07-17-donor-metadata.csv"), low_memory=False ) # glob through the json files that are available -import glob -all_files = glob.glob(os.path.join( +all_files = glob.glob( + os.path.join( + data_path, + "dremio", + "**", + "*.json" + ), + recursive=True +) + +output_metadata = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-metadata" +) +output_distribution = os.path.join( data_path, - "PHI-2019-07-17-json-data", - "*.json" -)) + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-distributions" +) + +make_folder_if_doesnt_exist([output_metadata, output_distribution]) + # %% for d_idx in range(0, len(all_files)): @@ -1502,7 +1519,6 @@ def estimate_local_time(df): data["userid"] = userid data["hashid"] = hashid - # CLEAN DATA data_fields = list(data) @@ -1523,7 +1539,6 @@ def estimate_local_time(df): # convert deprecated timezones to their aliases data = convert_deprecated_timezone_to_alias(data, timezone_aliases) - # TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) @@ -1552,7 +1567,6 @@ def estimate_local_time(df): return_calculation_columns=False ) - # TIME CATEGORIES # AGE, & YLW bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) @@ -1561,15 +1575,15 @@ def estimate_local_time(df): else: data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) - # GROUP DATA BY TYPE # first sort by upload time (used when removing dumplicates) data.sort_values("uploadTime", ascending=False, inplace=True) groups = data.groupby(by="type") - # CGM DATA if "cbg" in data["type"].unique(): + metadata["cgmData"] = True + # filter by 
cgm cgm = groups.get_group("cbg").dropna(axis=1, how="all") @@ -1622,7 +1636,7 @@ def estimate_local_time(df): # break up all traces by cgm model all_cgm_series = pd.DataFrame() cgm_models = cgm.groupby(by="cgmModel") -# for cgm_model in cgm["cgmModel"].unique(): + for cgm_model in cgm_models.groups.keys(): print(cgm_model) temp_cgm = cgm_models.get_group(cgm_model) @@ -1697,13 +1711,12 @@ def estimate_local_time(df): cgm_series["mg/dL"], value_bins, labels=value_bin_names ) - # get the previous val cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) # get difference between current and previous val cgm_series["diffFromPrevVal"] = ( - cgm_series["mg/dL"] - cgm_series["previousVal"] + cgm_series["mg/dL"] - cgm_series["previousVal"] ) # calculate the rate from previous value (mg/dL/min) @@ -1746,7 +1759,7 @@ def estimate_local_time(df): # get difference or relative increase/decrease of next value cgm_series["relativeNextValue"] = ( - cgm_series["nextVal"] - cgm_series["mg/dL"] + cgm_series["nextVal"] - cgm_series["mg/dL"] ) # rate of next value @@ -1762,15 +1775,25 @@ def estimate_local_time(df): ignore_index=True ) - - # %% END OF CODE + # save distribution data + all_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv" + )) print(metadata.T) else: + metadata["cgmData"] = False print(d_idx, "no cgm data") # combine metadata all_metadata = pd.concat([all_metadata, metadata], sort=False) + + # save metadata + all_metadata.to_csv(os.path.join( + output_metadata, + "PHI-" + userid + "-cgm-metadata.csv" + )) print("finished", d_idx, userid) diff --git a/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv new file mode 100644 index 00000000..01370b69 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv @@ -0,0 +1,206 @@ +tz,alias +Africa/Addis_Ababa,Africa/Nairobi +Africa/Asmara,Africa/Nairobi +Africa/Bamako,Africa/Abidjan +Africa/Bangui,Africa/Lagos +Africa/Banjul,Africa/Abidjan +Africa/Blantyre,Africa/Maputo +Africa/Brazzaville,Africa/Lagos +Africa/Bujumbura,Africa/Maputo +Africa/Conakry,Africa/Abidjan +Africa/Dakar,Africa/Abidjan +Africa/Dar_es_Salaam,Africa/Nairobi +Africa/Djibouti,Africa/Nairobi +Africa/Douala,Africa/Lagos +Africa/Freetown,Africa/Abidjan +Africa/Gaborone,Africa/Maputo +Africa/Harare,Africa/Maputo +Africa/Kampala,Africa/Nairobi +Africa/Kigali,Africa/Maputo +Africa/Kinshasa,Africa/Lagos +Africa/Libreville,Africa/Lagos +Africa/Lome,Africa/Abidjan +Africa/Luanda,Africa/Lagos +Africa/Lubumbashi,Africa/Maputo +Africa/Lusaka,Africa/Maputo +Africa/Malabo,Africa/Lagos +Africa/Maseru,Africa/Johannesburg +Africa/Mbabane,Africa/Johannesburg +Africa/Mogadishu,Africa/Nairobi +Africa/Niamey,Africa/Lagos +Africa/Nouakchott,Africa/Abidjan +Africa/Ouagadougou,Africa/Abidjan +Africa/Porto-Novo,Africa/Lagos +Africa/Sao_Tome,Africa/Lagos +Africa/Timbuktu,Africa/Abidjan +America/Anguilla,America/Port_of_Spain +America/Antigua,America/Port_of_Spain +America/Argentina/ComodRivadavia,America/Argentina/Catamarca +America/Aruba,America/Curacao +America/Atka,America/Adak +America/Buenos_Aires,America/Argentina/Buenos_Aires +America/Catamarca,America/Argentina/Catamarca +America/Cayman,America/Panama +America/Coral_Harbour,America/Atikokan +America/Cordoba,America/Argentina/Cordoba +America/Dominica,America/Port_of_Spain +America/Ensenada,America/Tijuana 
+America/Fort_Wayne,America/Indiana/Indianapolis +America/Grenada,America/Port_of_Spain +America/Guadeloupe,America/Port_of_Spain +America/Indianapolis,America/Indiana/Indianapolis +America/Jujuy,America/Argentina/Jujuy +America/Knox_IN,America/Indiana/Knox +America/Kralendijk,America/Curacao +America/Louisville,America/Kentucky/Louisville +America/Lower_Princes,America/Curacao +America/Marigot,America/Port_of_Spain +America/Mendoza,America/Argentina/Mendoza +America/Montreal,America/Toronto +America/Montserrat,America/Port_of_Spain +America/Porto_Acre,America/Rio_Branco +America/Rosario,America/Argentina/Cordoba +America/Santa_Isabel,America/Tijuana +America/Shiprock,America/Denver +America/St_Barthelemy,America/Port_of_Spain +America/St_Kitts,America/Port_of_Spain +America/St_Lucia,America/Port_of_Spain +America/St_Thomas,America/Port_of_Spain +America/St_Vincent,America/Port_of_Spain +America/Tortola,America/Port_of_Spain +America/Virgin,America/Port_of_Spain +Antarctica/McMurdo,Pacific/Auckland +Antarctica/South_Pole,Pacific/Auckland +Arctic/Longyearbyen,Europe/Oslo +Asia/Aden,Asia/Riyadh +Asia/Ashkhabad,Asia/Ashgabat +Asia/Bahrain,Asia/Qatar +Asia/Calcutta,Asia/Kolkata +Asia/Chongqing,Asia/Shanghai +Asia/Chungking,Asia/Shanghai +Asia/Dacca,Asia/Dhaka +Asia/Harbin,Asia/Shanghai +Asia/Istanbul,Europe/Istanbul +Asia/Kashgar,Asia/Urumqi[note1] +Asia/Katmandu,Asia/Kathmandu +Asia/Kuwait,Asia/Riyadh +Asia/Macao,Asia/Macau +Asia/Muscat,Asia/Dubai +Asia/Phnom_Penh,Asia/Bangkok +Asia/Rangoon,Asia/Yangon +Asia/Saigon,Asia/Ho_Chi_Minh +Asia/Tel_Aviv,Asia/Jerusalem +Asia/Thimbu,Asia/Thimphu +Asia/Ujung_Pandang,Asia/Makassar +Asia/Ulan_Bator,Asia/Ulaanbaatar +Asia/Vientiane,Asia/Bangkok +Atlantic/Faeroe,Atlantic/Faroe +Atlantic/Jan_Mayen,Europe/Oslo +Atlantic/St_Helena,Africa/Abidjan +Australia/ACT,Australia/Sydney +Australia/Canberra,Australia/Sydney +Australia/LHI,Australia/Lord_Howe +Australia/North,Australia/Darwin +Australia/NSW,Australia/Sydney +Australia/Queensland,Australia/Brisbane +Australia/South,Australia/Adelaide +Australia/Tasmania,Australia/Hobart +Australia/Victoria,Australia/Melbourne +Australia/West,Australia/Perth +Australia/Yancowinna,Australia/Broken_Hill +Brazil/Acre,America/Rio_Branco +Brazil/DeNoronha,America/Noronha +Brazil/East,America/Sao_Paulo +Brazil/West,America/Manaus +Canada/Atlantic,America/Halifax +Canada/Central,America/Winnipeg +Canada/Eastern,America/Toronto +Canada/Mountain,America/Edmonton +Canada/Newfoundland,America/St_Johns +Canada/Pacific,America/Vancouver +Canada/Saskatchewan,America/Regina +Canada/Yukon,America/Whitehorse +Chile/Continental,America/Santiago +Chile/EasterIsland,Pacific/Easter +Cuba,America/Havana +Egypt,Africa/Cairo +Eire,Europe/Dublin +Etc/GMT+0,Etc/GMT +Etc/GMT-0,Etc/GMT +Etc/GMT0,Etc/GMT +Etc/Greenwich,Etc/GMT +Etc/Universal,Etc/UTC +Etc/Zulu,Etc/UTC +Europe/Belfast,Europe/London +Europe/Bratislava,Europe/Prague +Europe/Busingen,Europe/Zurich +Europe/Guernsey,Europe/London +Europe/Isle_of_Man,Europe/London +Europe/Jersey,Europe/London +Europe/Ljubljana,Europe/Belgrade +Europe/Mariehamn,Europe/Helsinki +Europe/Nicosia,Asia/Nicosia +Europe/Podgorica,Europe/Belgrade +Europe/San_Marino,Europe/Rome +Europe/Sarajevo,Europe/Belgrade +Europe/Skopje,Europe/Belgrade +Europe/Tiraspol,Europe/Chisinau +Europe/Vaduz,Europe/Zurich +Europe/Vatican,Europe/Rome +Europe/Zagreb,Europe/Belgrade +GB,Europe/London +GB-Eire,Europe/London +GMT,Etc/GMT +GMT+0,Etc/GMT +GMT0,Etc/GMT +GMT−0,Etc/GMT +Greenwich,Etc/GMT +Hongkong,Asia/Hong_Kong 
+Iceland,Atlantic/Reykjavik +Indian/Antananarivo,Africa/Nairobi +Indian/Comoro,Africa/Nairobi +Indian/Mayotte,Africa/Nairobi +Iran,Asia/Tehran +Israel,Asia/Jerusalem +Jamaica,America/Jamaica +Japan,Asia/Tokyo +Kwajalein,Pacific/Kwajalein +Libya,Africa/Tripoli +Mexico/BajaNorte,America/Tijuana +Mexico/BajaSur,America/Mazatlan +Mexico/General,America/Mexico_City +Navajo,America/Denver +NZ,Pacific/Auckland +NZ-CHAT,Pacific/Chatham +Pacific/Johnston,Pacific/Honolulu +Pacific/Midway,Pacific/Pago_Pago +Pacific/Ponape,Pacific/Pohnpei +Pacific/Saipan,Pacific/Guam +Pacific/Samoa,Pacific/Pago_Pago +Pacific/Truk,Pacific/Chuuk +Pacific/Yap,Pacific/Chuuk +Poland,Europe/Warsaw +Portugal,Europe/Lisbon +PRC,Asia/Shanghai +ROC,Asia/Taipei +ROK,Asia/Seoul +Singapore,Asia/Singapore +Turkey,Europe/Istanbul +UCT,Etc/UCT +Universal,Etc/UTC +US/Alaska,America/Anchorage +US/Aleutian,America/Adak +US/Arizona,America/Phoenix +US/Central,America/Chicago +US/East-Indiana,America/Indiana/Indianapolis +US/Eastern,America/New_York +US/Hawaii,Pacific/Honolulu +US/Indiana-Starke,America/Indiana/Knox +US/Michigan,America/Detroit +US/Mountain,America/Denver +US/Pacific,America/Los_Angeles +US/Pacific-New,America/Los_Angeles +US/Samoa,Pacific/Pago_Pago +UTC,Etc/UTC +W-SU,Europe/Moscow +Zulu,Etc/UTC \ No newline at end of file From 1d4ef81fef542549c9eda72ec12de3f2e32eced7 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 08:49:26 -0500 Subject: [PATCH 26/46] remove collecting all metadata instead collect metadata for single user --- .../get_stats/get_cgm_distributions_v3.py | 59 +++++++++++-------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 9f2bee79..0d12c57c 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -1432,15 +1432,6 @@ def estimate_local_time(df): return local_time, cDays -# %% START OF CODE -all_metadata = pd.DataFrame() - -timezone_aliases = pd.read_csv( - "wikipedia-timezone-aliases-2018-04-28.csv", - low_memory=False -) - - # %% GET DATA FROM API #''' #get metadata and data for a donor that has shared with bigdata @@ -1499,25 +1490,38 @@ def estimate_local_time(df): make_folder_if_doesnt_exist([output_metadata, output_distribution]) +# %% START OF CODE +timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False +) + +donor_metadata_columns = [ + 'userid', + 'diagnosisType', + 'diagnosisDate', + 'biologicalSex', + 'birthday', + 'targetTimezone', + 'targetDevices', + 'isOtherPerson', +] + # %% for d_idx in range(0, len(all_files)): data = pd.read_json(all_files[d_idx]) userid = all_files[d_idx][-15:-5] - donor_metadata = all_donor_metadata[ - all_donor_metadata["userid"] == userid + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns ] print("\n", "starting", userid) - # CREATE META DATAFRAME (metadata) - ''' - this is useful for keeping track of the type and amount of cleaning done - ''' - metadata = pd.DataFrame(index=[userid]) - # HASH USER ID hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) data["userid"] = userid data["hashid"] = hashid + metadata["hashid"] = hashid # CLEAN DATA data_fields = list(data) @@ -1569,11 +1573,18 @@ def estimate_local_time(df): # TIME CATEGORIES # AGE, & YLW - bDate = 
pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) - if data["roundedUtcTime"].notnull().sum() == 0: + # TODO: make this a function + if metadata["birthday"].values is not None: + bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + else: data["age"] = np.nan + + if metadata["diagnosisDate"].values is not None: + dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) + data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) else: - data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + data["ylw"] = np.nan # GROUP DATA BY TYPE # first sort by upload time (used when removing dumplicates) @@ -1786,15 +1797,11 @@ def estimate_local_time(df): metadata["cgmData"] = False print(d_idx, "no cgm data") - # combine metadata - all_metadata = pd.concat([all_metadata, metadata], sort=False) - # save metadata - all_metadata.to_csv(os.path.join( + metadata.to_csv(os.path.join( output_metadata, "PHI-" + userid + "-cgm-metadata.csv" )) - print("finished", d_idx, userid) - + print("finished", d_idx, userid) From 264a79edda08fe9124ffce651003f45fb27b3ac8 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 09:18:01 -0500 Subject: [PATCH 27/46] add additional metadata to output --- .../get_stats/get_cgm_distributions_v3.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 0d12c57c..25006b98 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -1698,13 +1698,15 @@ def estimate_local_time(df): # merge with cgm data cgm_series = pd.merge( contiguous_data, - temp_cgm[ - ["roundedUtcTime", "hashid", "cgmModel", "age", "mg/dL"] - ], + temp_cgm[[ + "roundedUtcTime", "hashid", + "cgmModel", "age", "ylw", "mg/dL" + ]], on="roundedUtcTime", how="left" ) - # + + # sort so that the oldest data point is on top cgm_series.sort_values( "roundedUtcTime", ascending=True, inplace=True ) @@ -1786,6 +1788,24 @@ def estimate_local_time(df): ignore_index=True ) + # sort so that the oldest data point is on top + all_cgm_series.sort_values( + "roundedUtcTime", ascending=False, inplace=True + ) + all_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(all_cgm_series["roundedUtcTime"].unique()) + metadata["duplicateCgmDataIssue"] = ( + nUnique_cgm_times != len(all_cgm_series) + ) + + # get metadata for cgm stats + metadata["lastCgm.date"] = all_cgm_series.loc[0, "roundedUtcTime"] + metadata["lastCgm.age"] = all_cgm_series.loc[0, "age"] + metadata["lastCgm.ylw"] = all_cgm_series.loc[0, "ylw"] + + pdb.set_trace() # save distribution data all_cgm_series.to_csv(os.path.join( output_distribution, From 20bb215bb3b6f0a3f27abef0f863598d7dbd2977 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 11:29:15 -0500 Subject: [PATCH 28/46] fix spike data drop bug the wrong index was getting deleted --- .../get_stats/get_cgm_distributions_v3.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 
25006b98..2966fa5f 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -652,10 +652,8 @@ def remove_invalid_cgm_values(df): nBefore = len(df) # remove values < 38 and > 402 mg/dL - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] < 38))].index) - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] > 402))].index) + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] > 402))].index) nRemoved = nBefore - len(df) return df, nRemoved @@ -701,13 +699,11 @@ def remove_spike_data(df): ] for spike_loc in spike_locations: df[spike_loc] = get_embedded_field(df["origin"], spike_loc) - - spike_idx = df.loc[ - df[spike_loc].notnull(), - spike_loc - ].astype(str).str.lower().str.contains("spike") - - df.drop((spike_idx == True).index, inplace=True) + notnull_idx = df[spike_loc].notnull() + df_notnull = df[notnull_idx] + is_spike = df_notnull[spike_loc].astype(str).str.lower().str.contains("spike") + spike_idx = df_notnull[is_spike].index + df.drop(spike_idx, inplace=True) nRemoved = nBefore - len(df) From 241987bf5bc5bc8f382ac208fa26a6666bd548d4 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 08:56:28 -0500 Subject: [PATCH 29/46] refactor sensing cgmModel --- .../get_stats/get_cgm_distributions_v3.py | 268 ++++++++++-------- 1 file changed, 152 insertions(+), 116 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 2966fa5f..e3fdf2d4 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -136,19 +136,20 @@ def get_embedded_field(ts, embedded_field): def add_upload_info_to_cgm_records(groups, df): upload_locations = [ - "uploadId", - "deviceManufacturers", - "deviceModel", - "deviceSerialNumber", - "deviceTags" + "upload.uploadId", + "upload.deviceManufacturers", + "upload.deviceModel", + "upload.deviceSerialNumber", + "upload.deviceTags" ] if "upload" in groups["type"].unique(): - upload = groups.get_group("upload").dropna(axis=1, how="all") + upload = groups.get_group("upload").dropna(axis=1, how="all").add_prefix("upload.") df = pd.merge( left=df, right=upload[list(set(upload_locations) & set(list(upload)))], - on="uploadId", + left_on="uploadId", + right_on="upload.uploadId", how="left" ) @@ -156,6 +157,18 @@ def add_upload_info_to_cgm_records(groups, df): def expand_heathkit_cgm_fields(df): + # TODO: refactor the code/function that originally grabs + # these fields, so we are only doing it once, and so + # we don't have to drop the columns for the code below to work. 
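As an aside to the TODO above: if the only goal is to keep the re-derived HealthKit columns from colliding, pandas' errors="ignore" option on DataFrame.drop covers the membership-check loop that follows. A minimal, self-contained sketch; the toy frame and its values are invented for illustration and are not pipeline data.

import pandas as pd

# stand-in for the cgm frame passed to expand_heathkit_cgm_fields
df = pd.DataFrame({"origin.payload.device.name": ["CGM"], "value": [5.5]})

drop_columns = [
    'origin.payload.device.name',
    'origin.payload.device.manufacturer',         # absent, silently skipped
    'origin.payload.sourceRevision.source.name',  # absent, silently skipped
]
df = df.drop(columns=drop_columns, errors="ignore")
# only the "value" column remains; missing names raise no KeyError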
+ drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(df): + df.drop(columns=[drop_col], inplace=True) + healthkit_locations = [ "origin", "origin.payload", @@ -177,65 +190,85 @@ def expand_heathkit_cgm_fields(df): def get_dexcom_cgm_model(df): # add cgm model - # put this list in order of precedence when choosing sensor version - # NOTE: there is an edge case where "origin.payload.device.model" = G5/G6, - # which can be eliminated by getting model from HKMetadataKeySyncIdentifier + dexcom_model_locations = [ "deviceId", "deviceManufacturers", + "upload.deviceManufacturers", "deviceModel", + "upload.deviceModel", "deviceSerialNumber", - "payload.HKMetadataKeySyncIdentifier", # do this before "origin.payload.device.model" bc there is an edge case - "origin.payload.device.model", + "upload.deviceSerialNumber", "origin.payload.sourceRevision.source.name", "payload.transmitterGeneration", + "payload.HKMetadataKeySyncIdentifier", "payload.transmitterId", ] for model_location in dexcom_model_locations: - if model_location in list(df): - # only consider cells where the model location is not null - notnull_idx = df[model_location].notnull() - if notnull_idx.sum() > 0: - for dex_model in ["G4", "G5", "G6"]: - # define a pandas stringMethod - str_list = df[model_location].astype(str).str - # if model has already been determined, then skip - missing_model_idx = df["cgmModel"].isnull() - # get index that matches model - model_idx = str_list.upper().str.contains(dex_model) - - m_idx = ( - missing_model_idx & notnull_idx & model_idx - ) - df.loc[m_idx, "cgmModel"] = dex_model - - # case of "payload.transmitterId" - if ( - ("payload.transmitterId" in model_location) - | ("payload.HKMetadataKeySyncIdentifier" in model_location) - ): - # get string length (need 5 digits for G4 and 6 for G5, G6) - if "G4" in dex_model: - model_idx = str_list.len() == 5 - elif "G5" in dex_model: - model_idx = str_list.startswith("4") - elif "G6" in dex_model: - model_idx = ( - (str_list.startswith("8")) - | (str_list.startswith("2")) - ) - m_idx = ( - missing_model_idx & notnull_idx & model_idx - ) - df.loc[m_idx, "cgmModel"] = dex_model - - return df["cgmModel"] + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + # G4 + g4_idx = str_list.contains("G4", case=False, na=False) + df.loc[g4_idx, "cgmModel"] = "G4" + df.loc[g4_idx, "cgmModelSensedFrom"] = model_location + + # G5 + g5_idx = str_list.contains("G5", case=False, na=False) + df.loc[g5_idx, "cgmModel"] = "G5" + df.loc[g5_idx, "cgmModelSensedFrom"] = model_location + + # G6 + g6_idx = str_list.contains("G6", case=False, na=False) + df.loc[g6_idx, "cgmModel"] = "G6" + df.loc[g6_idx, "cgmModelSensedFrom"] = model_location + + # edge case of g5 and g6 + g5_g6_idx = (g5_idx & g6_idx) + df.loc[g5_g6_idx, "cgmModel"] = "G5_G6" + df.loc[g5_g6_idx, "cgmModelSensedFrom"] = model_location + + # case of "transmitterId" + if ( + ("transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # if length of string is 5, then it is likely a G4 sensor + length5_idx = str_list.len() == 5 + df.loc[length5_idx, "cgmModel"] = "G4" + df.loc[length5_idx, 
"cgmModelSensedFrom"] = model_location + + # if length of string > 5 then might be G5 or G6 + length_gt5_idx = str_list.len() > 5 + + # if sensor stats with 4 then likely G5 + starts4_idx = str_list.startswith("4") + df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5" + df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location + + # if sensor stats with 2 or 8 then likely G6 + starts2_6_idx = ( + (str_list.startswith("2")) | (str_list.startswith("8")) + ) + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6" + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] def get_non_dexcom_cgm_model(df): # non-dexcom cgm model query model_locations = ["deviceId"] + + # model types (NOTE: for medtronic getting pump type not cgm) models_670G = "MMT-158|MMT-178" models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" @@ -260,25 +293,26 @@ def get_non_dexcom_cgm_model(df): "LIBRE", "G4", "G5_G6", "G4" ] - for model_loc in model_locations: - if model_loc in list(df): - # only consider cells where the model location is not null - # and we are missing a cgm model - notnull_idx = df[model_loc].notnull() - if notnull_idx.sum() > 0: - missing_model_idx = df["cgmModel"].isnull() - if missing_model_idx.sum() > 0: - # define a pandas stringMethod - str_list = df[model_loc].astype(str).str + for model_location in model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): - for non_dex_model, model_name in zip( - non_dex_models, non_dex_model_names - ): - model_idx = str_list.contains(non_dex_model) - m_idx = (missing_model_idx & notnull_idx & model_idx) - df.loc[m_idx, "cgmModel"] = model_name + model_idx = str_list.contains(non_dex_model, na=False) + df.loc[model_idx, "cgmModel"] = model_name + df.loc[model_idx, "cgmModelSensedFrom"] = model_location - return df["cgmModel"] + return df[["cgmModel", "cgmModelSensedFrom"]] def hash_userid(userid, salt): @@ -322,9 +356,6 @@ def remove_negative_durations(df): return df, n_negative_durations - - - def tslim_calibration_fix(df): ''' taken from https://github.com/tidepool-org/data-analytics/blob/ @@ -1428,29 +1459,6 @@ def estimate_local_time(df): return local_time, cDays -# %% GET DATA FROM API -#''' -#get metadata and data for a donor that has shared with bigdata -#NOTE: functions assume you have an .env with bigdata account credentials -#''' -# -#userid = "" -#donor_group = "" -# -#donor_metadata, _ = get_shared_metadata( -# donor_group=donor_group, -# userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid -#) -#data, _ = get_data( -# donor_group=donor_group, -# userid=userid, -# weeks_of_data=52*10 -#) -# -## this is a dummy loop -#for i in [0]: - - # %% GET DATA FROM JSON FILE data_path = os.path.join("..", "data") all_donor_metadata = pd.read_csv( @@ -1523,7 +1531,7 @@ def estimate_local_time(df): data_fields = list(data) # NOTE: moving remove negative durations to type specific cleaning - # TODO: ask backend to change "duration" field to only include one object type + # TODO: ask backend to change "duration" to only include 
one object type # Tslim calibration bug fix data, n_cal_readings = tslim_calibration_fix(data.copy()) @@ -1551,13 +1559,13 @@ def estimate_local_time(df): # # estimate local time (refactor of estimate-local-time.py) # data["localTime"], local_time_metadata = estimate_local_time(data.copy()) - - # TODO: fix this issue with estimate local time - ''' - //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649: - FutureWarning: elementwise comparison failed; returning scalar instead, - but in the future will perform elementwise comparison result = method(y) - ''' +# +# TODO: fix this issue with estimate local time +# ''' +# //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649 +# FutureWarning: elementwise comparison failed; returning scalar instead, +# but in the future will perform elementwise comparison result = method(y) +# ''' # round all data to the nearest 5 minutes data["roundedUtcTime"] = round_time( @@ -1587,12 +1595,46 @@ def estimate_local_time(df): data.sort_values("uploadTime", ascending=False, inplace=True) groups = data.groupby(by="type") - # CGM DATA + # check to see if person is looping + if "basal" in data["type"].unique(): + basal = groups.get_group("basal").dropna(axis=1, how="all") + if "deliveryType" in list(basal): + bd = basal.loc[ + basal["deliveryType"] == "temp", + ["date", "deliveryType"] + ] + temp_basal_counts = ( + pd.DataFrame( + bd.groupby("date").deliveryType.count() + ).reset_index() + ) + temp_basal_counts.rename( + {"deliveryType": "tempBasalCounts"}, + axis=1, + inplace=True + ) + data = pd.merge(data, temp_basal_counts, on="date", how="left") + # >= 25 temp basals per day is likely looping + data["isLoopDay"] = data["tempBasalCounts"] >= 25 + # redefine groups with the new data + groups = data.groupby(by="type") + + else: + data["isLoopDay"] = np.nan + else: + data["isLoopDay"] = np.nan + + # %% CGM DATA if "cbg" in data["type"].unique(): + # sort data with metadata["cgmData"] = True # filter by cgm - cgm = groups.get_group("cbg").dropna(axis=1, how="all") + cgm = groups.get_group("cbg").copy() + + # sort data + cgm.sort_values("roundedUtcTime", ascending=False, inplace=True) + cgm.reset_index(drop=False, inplace=True) # calculate cgm in mg/dL cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) @@ -1601,16 +1643,6 @@ def estimate_local_time(df): cgm, nSpike = remove_spike_data(cgm.copy()) metadata["nSpike"] = nSpike - # TODO: refactor (above) so you don't need to drop columns - drop_columns = [ - 'origin.payload.device.name', - 'origin.payload.device.manufacturer', - 'origin.payload.sourceRevision.source.name' - ] - for drop_col in drop_columns: - if drop_col in list(cgm): - cgm.drop(columns=[drop_col], inplace=True) - # assign upload cgm device info to cgm records in that upload cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) @@ -1624,14 +1656,18 @@ def estimate_local_time(df): ) # get cgm models - cgm["cgmModel"] = np.nan + cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan # dexcom cgm models (G4, G5, G6) - cgm["cgmModel"] = get_dexcom_cgm_model(cgm.copy()) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_dexcom_cgm_model(cgm.copy()) + ) # for non dexcom cgms # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem - cgm["cgmModel"] = get_non_dexcom_cgm_model(cgm.copy()) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_non_dexcom_cgm_model(cgm.copy()) + ) # get metadata on cgm models and devices metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() From 
e324f779450479625a4afcd1be43f76a06c06cba Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 08:57:04 -0500 Subject: [PATCH 30/46] first commit of cgm stats --- .../get_stats/get_cgm_distributions_v3.py | 478 ++++++++++++++++-- 1 file changed, 442 insertions(+), 36 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index e3fdf2d4..90e93cc4 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -1490,8 +1490,21 @@ def estimate_local_time(df): "PHI-2019-07-17-donor-data", "PHI-2019-07-17-cgm-distributions" ) +debug_duplicates = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-debug-cgm-duplicates" +) +output_stats = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-stats" +) + -make_folder_if_doesnt_exist([output_metadata, output_distribution]) +make_folder_if_doesnt_exist( + [output_metadata, output_distribution, debug_duplicates, output_stats] +) # %% START OF CODE @@ -1511,10 +1524,21 @@ def estimate_local_time(df): 'isOtherPerson', ] -# %% -for d_idx in range(0, len(all_files)): - data = pd.read_json(all_files[d_idx]) - userid = all_files[d_idx][-15:-5] + +## %% load test data on my computer +## TODO: if data comes in as a .csv, the embedded json fields +## get saved as a string and need to be unwrapped before those fields +## can be expanded. IN OTHER WORDS: this code only works with .json data +for d_idx in [0]: + userid = "0d4524bc11" + data = pd.read_json(os.path.join( + "..", "data", "dremio", userid, "PHI-{}.json".format(userid) + )) + +## %% +#for d_idx in range(0, len(all_files)): +# data = pd.read_json(all_files[d_idx]) +# userid = all_files[d_idx][-15:-5] metadata = all_donor_metadata.loc[ all_donor_metadata["userid"] == userid, donor_metadata_columns @@ -1576,15 +1600,17 @@ def estimate_local_time(df): ) # TIME CATEGORIES + data["date"] = data["roundedUtcTime"].dt.date + # AGE, & YLW # TODO: make this a function - if metadata["birthday"].values is not None: + if metadata["birthday"].values[0] is not np.nan: bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) else: data["age"] = np.nan - if metadata["diagnosisDate"].values is not None: + if metadata["diagnosisDate"].values[0] is not np.nan: dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) else: @@ -1675,13 +1701,13 @@ def estimate_local_time(df): if "deviceId" in list(cgm): metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) - # %% clean distributions + # clean distributions # break up all traces by cgm model - all_cgm_series = pd.DataFrame() + combined_cgm_series = pd.DataFrame() cgm_models = cgm.groupby(by="cgmModel") for cgm_model in cgm_models.groups.keys(): - print(cgm_model) + print("working on", cgm_model) temp_cgm = cgm_models.get_group(cgm_model) # get rid of cgm values too low/high (< 38 & > 402 mg/dL) @@ -1717,7 +1743,11 @@ def estimate_local_time(df): # create a contiguous 5 minute time series first_day = temp_cgm["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + last_day = temp_cgm["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + rng = pd.date_range(first_day, last_day, freq="5min") contiguous_data = pd.DataFrame( rng, @@ -1731,7 +1761,7 @@ def estimate_local_time(df): cgm_series = pd.merge( contiguous_data, temp_cgm[[ - "roundedUtcTime", "hashid", + "roundedUtcTime", "hashid", "isLoopDay", "cgmModel", "age", "ylw", "mg/dL" ]], on="roundedUtcTime", @@ -1815,34 +1845,411 @@ def estimate_local_time(df): metadata["nCgmDataPoints." + cgm_model] = len(cgm_series) # append cgm model to a larger table - all_cgm_series = pd.concat( - [all_cgm_series, cgm_series], + combined_cgm_series = pd.concat( + [combined_cgm_series, cgm_series], ignore_index=True ) + if len(combined_cgm_series) > 0: + # sort so that the oldest data point is on top + # and that the G5_G6 get deleted if they are apart of a duplicate + combined_cgm_series["cgmModel_G5_and_G6"] = ( + combined_cgm_series["cgmModel"] == "G5_G6" + ) + combined_cgm_series.sort_values( + by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"], + ascending=[False, True, False], + inplace=True + ) + combined_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique()) + cgm_len = len(combined_cgm_series) + metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len + + nDuplicate_cgm = cgm_len - nUnique_cgm_times + metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm + + # if there are still duplicates, get rid of them + if nDuplicate_cgm > 0: + # save the duplicates for further examination + combined_cgm_series.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz" + )) + + cgm.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz" + )) + + # get rid of duplicates + combined_cgm_series, n_cgm_dups_removed = ( + removeDuplicates(combined_cgm_series, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = ( + n_cgm_dups_removed + ) + metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series) - # sort so that the oldest data point is on top - all_cgm_series.sort_values( - "roundedUtcTime", ascending=False, inplace=True - ) - all_cgm_series.reset_index(drop=True, inplace=True) + # add whether data is dexcom cgm or not + combined_cgm_series["dexcomCgm"] = ( + combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6") + ) - # add in check to see if there are duplicates between cgm devices - nUnique_cgm_times = len(all_cgm_series["roundedUtcTime"].unique()) - metadata["duplicateCgmDataIssue"] = ( - nUnique_cgm_times != len(all_cgm_series) - ) + # save distribution data + combined_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv.gz" + )) + + + # %% get cgm stats + # create a contiguous 5 minute time series of ALL cgm data + first_day = combined_cgm_series["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = combined_cgm_series["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=True + ).reset_index(drop=True) + + # merge with combined_cgm_series data + all_cgm = pd.merge( + contiguous_data, + combined_cgm_series[[ + 'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm', + 'age', 'ylw', 'isLoopDay', 'mg/dL', + ]], + on="roundedUtcTime", + how="left" + ) + + # get cgm stats + # get a binary (T/F) of whether we have a cgm value + all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull() + + # fill isLoopDay nan with False + all_cgm["isLoopDay"].fillna(False, inplace=True) + + # has loop and cgm + all_cgm["hasLoopAndCgm"] = ( + (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + all_cgm["hasCgmWithoutLoop"] = ( + (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + # make this a function and round ascendingly + ts39_401 = all_cgm["mg/dL"].copy() + + # for all the less than (<) criteria + for cgm_threshold in [40, 54, 70]: + all_cgm["cgm < " + str(cgm_threshold)] = ( + ts39_401.lt(cgm_threshold) + ) + # for all the greter than or equal to (>=) criteria + all_cgm["cgm >= " + str(cgm_threshold)] = ( + ts39_401.ge(cgm_threshold) + ) + + # for all the the less than or equal to (<=) criteria + for cgm_threshold in [140, 180, 250, 300, 400]: + all_cgm["cgm <= " + str(cgm_threshold)] = ( + ts39_401.le(cgm_threshold) + ) + # for all the the greter than (>) criteria + all_cgm["cgm > " + str(cgm_threshold)] = ( + ts39_401.gt(cgm_threshold) + ) + + # get all of the cgm ranges + # (cgm >= 40) & (cgm < 54) + all_cgm["40 <= cgm < 54"] = ( + (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"]) + ) + + # (cgm >= 54) & (cgm < 70) + all_cgm["54 <= cgm < 70"] = ( + (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"]) + ) + + # (cgm >= 70) & (cgm <= 140) + all_cgm["70 <= cgm <= 140"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"]) + ) + + # (cgm >= 70) & (cgm <= 180) + all_cgm["70 <= cgm <= 180"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"]) + ) + + # (cgm > 180) & (cgm <= 250) + all_cgm["180 < cgm <= 250"] = ( + (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"]) + ) + + # (cgm > 250) & (cgm <= 400) + all_cgm["250 < cgm <= 400"] = ( + (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"]) + ) + + # derfine the windows to calculate the stats over + window_names = ["hour", "day", "week", "month", "quarter", "year"] + window_lengths = [12, 288, 288*7, 288*7*4, 288*90, 288*365] + + for w_name, w_len in zip(window_names, window_lengths): + # require lenth of window for percent calculations + w_min = w_len + + # get the start and end times for each window + all_cgm[w_name + ".startTime"] = ( + all_cgm["roundedUtcTime"].shift(w_len - 1) + ) + all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"] + + # add majority age for the time period + all_cgm[w_name + ".age"] = np.round( + all_cgm["age"].rolling( + min_periods=1, + window=w_len + ).mean() + ) + + # add majority ylw for the time period + all_cgm[w_name + ".ylw"] = np.round( + all_cgm["ylw"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get percent time cgm used + all_cgm[w_name + ".cgmPercent"] = ( + all_cgm["hasCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".missingCgmPercent"] = ( + 1 - all_cgm[w_name + ".cgmPercent"] + ) + + # create (T/F) 70 and 80 percent available thresholds + # which will be useful for processing later 
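The rolling "percent of window" pattern used throughout this section (cgm coverage, time in range, percent of time looping) is easier to see on a toy contiguous series. Everything below is illustrative only: the frame, the column names, and the numbers are made up and do not come from the pipeline.

import pandas as pd

rng = pd.date_range("2019-01-01", periods=12, freq="5min")  # one hour of 5-minute slots
toy = pd.DataFrame({"roundedUtcTime": rng})
toy["hasCgm"] = [True] * 9 + [False] * 3                    # 45 of 60 minutes have data
w_len = 12                                                  # the "hour" window
toy["hour.cgmPercent"] = (
    toy["hasCgm"].rolling(window=w_len, min_periods=w_len).sum() / w_len
)
# only the last row has a complete window; it reports 9/12 = 0.75 coverage,
# earlier rows are NaN because min_periods equals the window length

Requiring min_periods equal to w_len (w_min = w_len above) is what makes incomplete windows report NaN rather than an optimistic percentage.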
+ all_cgm[w_name + ".ge70Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.7 + ) + + all_cgm[w_name + ".ge80Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.8 + ) + + # get percent time Loop was used NOTE: this is + # approximate because we use > 24 temp basals per day + # ALSO: this is percent time Loop was used while cgm in use + all_cgm[w_name + ".loopingAndCgmPercent"] = ( + all_cgm["hasLoopAndCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent of time cgm without loop + all_cgm[w_name + ".cgmWithoutLoopPercent"] = ( + all_cgm["hasCgmWithoutLoop"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get percent time in different ranges + # % Time < 54 + all_cgm[w_name + ".lt54Percent"] = ( + all_cgm["cgm < 54"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 54-70 (cgm >= 54) & (cgm < 70) + all_cgm[w_name + ".bt54_70Percent"] = ( + all_cgm["54 <= cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 180) + all_cgm[w_name + ".bt70_180Percent"] = ( + all_cgm["70 <= cgm <= 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 180-250 (cgm > 180) & (cgm <= 250) + all_cgm[w_name + ".bt180_250Percent"] = ( + all_cgm["180 < cgm <= 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time > 250 + all_cgm[w_name + ".gt250Percent"] = ( + all_cgm["cgm > 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # check that all of the percentages add of to 1 or 100% + all_cgm[w_name + ".percentCheck"] = ( + all_cgm[w_name + ".missingCgmPercent"] + + all_cgm[w_name + ".lt54Percent"] + + all_cgm[w_name + ".bt54_70Percent"] + + all_cgm[w_name + ".bt70_180Percent"] + + all_cgm[w_name + ".bt180_250Percent"] + + all_cgm[w_name + ".gt250Percent"] + ) + + # here are some other less common percent time in ranges + # % Time < 70 + all_cgm[w_name + ".lt70Percent"] = ( + all_cgm["cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 140) + all_cgm[w_name + ".tir70to140Percent"] = ( + all_cgm["70 <= cgm <= 140"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent time above a threshold + # % Time > 180 + all_cgm[w_name + ".gt180Percent"] = ( + all_cgm["cgm > 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # points that are 39 or 401 should NOT be used most + # calculations because the actual number is <= 39 or >= 401 + # (cgm < 40) OR (cgm > 400) + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + # redefine the time series (ts) for the following stats + ts40_400 = all_cgm["mg/dL.40to400"].copy() + # require at least 3 points to make a stats calculation + w_min = 3 + + # recalcuate percent of measurements available + all_cgm[w_name + ".40to400availablePercent"] = ( + ts40_400.rolling(min_periods=w_min, window=w_len).count() + ) / w_len + + # get the total number of non-null values over this time period + all_cgm[w_name + ".40to400missingPercent"] = ( + 1 - all_cgm[w_name + ".40to400availablePercent"] + ) + + all_cgm[w_name + ".40to400ge70Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.7 + ) + + all_cgm[w_name + ".40to400ge80Available"] = ( + all_cgm[w_name + 
".40to400availablePercent"] >= 0.8 + ) + + # create a rolling object + roll40_400 = ts40_400.rolling(min_periods=w_min, window=w_len) + + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + + # min + all_cgm[w_name + ".min"] = roll40_400.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll40_400.quantile(0.10) + all_cgm[w_name + ".25th"] = roll40_400.quantile(0.25) + all_cgm[w_name + ".75th"] = roll40_400.quantile(0.75) + all_cgm[w_name + ".90th"] = roll40_400.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll40_400.max() + + # median + all_cgm[w_name + ".median"] = roll40_400.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + + # mean + all_cgm[w_name + ".mean"] = roll40_400.mean() + + # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL] + all_cgm[w_name + ".gmi"] = ( + 3.31 + (0.02392 * all_cgm[w_name + ".mean"]) + ) + + # standard deviation (std) + all_cgm[w_name + ".std"] = roll40_400.std() + + # coefficient of variation (cov) = std / mean + all_cgm[w_name + ".cov"] = ( + all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] + ) + + # %% make an episodes dataframe, and then get stats + all_cgm["notnull"] = all_cgm["mg/dL"].notnull() + all_cgm["hypoEpisodeStart"] = ( + (all_cgm["cgm < 54"]) & (all_cgm["cgm >= 54"].shift(1)) + & (all_cgm["notnull"]) & (all_cgm["notnull"].shift(1)) + ) +# ts["startCrossPoint"] = ((df.mg_dL.shift(1) >= episodeThreshold) & +# (df.mg_dL < episodeThreshold)) +# +# df["endCrossPoint"] = ((df.mg_dL.shift(1) < episodeThreshold) & +# (df.mg_dL >= episodeThreshold)) + + + # save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) - # get metadata for cgm stats - metadata["lastCgm.date"] = all_cgm_series.loc[0, "roundedUtcTime"] - metadata["lastCgm.age"] = all_cgm_series.loc[0, "age"] - metadata["lastCgm.ylw"] = all_cgm_series.loc[0, "ylw"] - - pdb.set_trace() - # save distribution data - all_cgm_series.to_csv(os.path.join( - output_distribution, - "PHI-" + userid + "-cgm-distribution.csv" - )) print(metadata.T) else: @@ -1852,8 +2259,7 @@ def estimate_local_time(df): # save metadata metadata.to_csv(os.path.join( output_metadata, - "PHI-" + userid + "-cgm-metadata.csv" + "PHI-" + userid + "-cgm-metadata.csv.gz" )) print("finished", d_idx, userid) - From ddabb815d5d95832d4985c1a779c1fbcee13c662 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 11:13:17 -0500 Subject: [PATCH 31/46] initial commit of episodes --- .../get_stats/get_cgm_distributions_v3.py | 52 +++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 90e93cc4..0090557e 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2232,19 +2232,51 @@ def estimate_local_time(df): ) # %% make an episodes dataframe, and then get stats - all_cgm["notnull"] = all_cgm["mg/dL"].notnull() - all_cgm["hypoEpisodeStart"] = ( - (all_cgm["cgm < 54"]) & (all_cgm["cgm >= 54"].shift(1)) - & (all_cgm["notnull"]) & (all_cgm["notnull"].shift(1)) + episode_ts = all_cgm[[ + "roundedUtcTime", "mg/dL", "hasCgm", + "cgm < 54", "cgm >= 
54" + ]].copy() + + # put consecutive data that matches in groups + episode_ts["tempGroups"] = ( + (episode_ts["cgm < 54"] != episode_ts["cgm < 54"].shift()).cumsum() + ) + episode_ts["episodeGroup"] = ( + episode_ts["tempGroups"] * episode_ts["cgm < 54"] + ) + episode_groups = episode_ts.groupby("episodeGroup") + episodes = ( + episode_groups["roundedUtcTime"].count().reset_index() + ) + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename( + columns={"roundedUtcTime": "episodeCounts"}, inplace=True + ) + + episode_ts = pd.merge( + episode_ts, + episodes, + on="episodeGroup", + how="left" + ) + episode_ts["episodeDuration"] = ( + episode_ts["duration"] * episode_ts["cgm < 54"] + ) + + # merge episodes back into all_cgm + all_cgm = pd.merge( + all_cgm, + episode_ts[[ + 'roundedUtcTime', + 'episodeGroup', + 'episodeDuration' + ]], + on="roundedUtcTime", + how="left" ) -# ts["startCrossPoint"] = ((df.mg_dL.shift(1) >= episodeThreshold) & -# (df.mg_dL < episodeThreshold)) -# -# df["endCrossPoint"] = ((df.mg_dL.shift(1) < episodeThreshold) & -# (df.mg_dL >= episodeThreshold)) - # save cgm stats data + # %% save cgm stats data all_cgm.to_csv(os.path.join( output_stats, "PHI-" + userid + "-cgm-stats.csv.gz" From cded1ccc7ff50e380de9a00a0281b7c3c74da87e Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 14:21:07 -0500 Subject: [PATCH 32/46] next increment of episodes --- .../get_stats/get_cgm_distributions_v3.py | 101 ++++++++++-------- 1 file changed, 59 insertions(+), 42 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 0090557e..295cc3f6 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -38,6 +38,49 @@ ''' +def get_episodes(df, episode_criterion, min_duration): + + # put consecutive data that matches in groups + df["tempGroups"] = (( + df[episode_criterion] != df[episode_criterion].shift() + ).cumsum()) + + df["episodeId"] = ( + df["tempGroups"] * df[episode_criterion] + ) + + # group by the episode groups + episode_groups = df.groupby("episodeId") + episodes = episode_groups["roundedUtcTime"].count().reset_index() + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True) + + df = pd.merge(df, episodes, on="episodeId", how="left") + df["episodeDuration"] = ( + df["duration"] * df[episode_criterion] + ) + + # get rolling stats on episodes + df["isEpisode"] = ( + df["episodeDuration"] >= min_duration + ) + + # get the hypo episode starts so we only count each episode once + df["episodeStart"] = ( + (df[episode_criterion]) + & (~df[episode_criterion].shift(1).fillna(False)) + & (df["hasCgm"]) + & (df["hasCgm"].shift(1)) + ) + + df = df[[ + "isEpisode", "episodeStart", + "episodeId", "episodeDuration" + ]].add_prefix("episode." + episode_criterion + ".") + + return df + + def get_slope(y): if "array" not in type(y).__name__: raise TypeError('Expecting a numpy array') @@ -1903,8 +1946,7 @@ def estimate_local_time(df): "PHI-" + userid + "-cgm-distribution.csv.gz" )) - - # %% get cgm stats + # get cgm stats # create a contiguous 5 minute time series of ALL cgm data first_day = combined_cgm_series["roundedUtcTime"].min() metadata["firstCgm." 
+ cgm_model] = first_day @@ -2231,50 +2273,25 @@ def estimate_local_time(df): all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] ) - # %% make an episodes dataframe, and then get stats - episode_ts = all_cgm[[ - "roundedUtcTime", "mg/dL", "hasCgm", - "cgm < 54", "cgm >= 54" - ]].copy() - - # put consecutive data that matches in groups - episode_ts["tempGroups"] = ( - (episode_ts["cgm < 54"] != episode_ts["cgm < 54"].shift()).cumsum() - ) - episode_ts["episodeGroup"] = ( - episode_ts["tempGroups"] * episode_ts["cgm < 54"] - ) - episode_groups = episode_ts.groupby("episodeGroup") - episodes = ( - episode_groups["roundedUtcTime"].count().reset_index() - ) - episodes["duration"] = episodes["roundedUtcTime"] * 5 - episodes.rename( - columns={"roundedUtcTime": "episodeCounts"}, inplace=True - ) - - episode_ts = pd.merge( - episode_ts, - episodes, - on="episodeGroup", - how="left" - ) - episode_ts["episodeDuration"] = ( - episode_ts["duration"] * episode_ts["cgm < 54"] + # make an episodes dataframe, and then get stats + # get episodes < 54 + episode_ts = get_episodes( + all_cgm[["roundedUtcTime", "hasCgm", "cgm < 54"]].copy(), + "cgm < 54", + 15 ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) - # merge episodes back into all_cgm - all_cgm = pd.merge( - all_cgm, - episode_ts[[ - 'roundedUtcTime', - 'episodeGroup', - 'episodeDuration' - ]], - on="roundedUtcTime", - how="left" + # get episodes < 70 + episode_ts = get_episodes( + all_cgm[["roundedUtcTime", "hasCgm", "cgm < 70"]].copy(), + "cgm < 70", + 15 ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + # get rolling stats on episodes + pdb.set_trace() # %% save cgm stats data all_cgm.to_csv(os.path.join( From 0b9a3b429d8b03663f4d7e9e7fb1daa881933358 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 14:21:35 -0500 Subject: [PATCH 33/46] move percentile calculations to full range of data section --- .../get_stats/get_cgm_distributions_v3.py | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 295cc3f6..caf73951 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2192,6 +2192,33 @@ def estimate_local_time(df): ).sum() / w_len ) + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + # create a rolling object + roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + + # min + all_cgm[w_name + ".min"] = roll39_401.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10) + all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25) + all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75) + all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll39_401.max() + + # median + all_cgm[w_name + ".median"] = roll39_401.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + # points that are 39 or 401 should NOT be used most # calculations because the actual number is <= 39 or >= 401 # (cgm < 40) OR (cgm > 400) @@ -2232,31 +2259,6 @@ def estimate_local_time(df): # create a rolling object roll40_400 = ts40_400.rolling(min_periods=w_min, 
window=w_len) - # quantiles - # NOTE: this will increase run time, so only run if you need - # 3-4X the processing time since it has to sort the data - # TODO: make this an option to the function, once it is made - - # min - all_cgm[w_name + ".min"] = roll40_400.min() - - # 10, 25, 75, and 90th percentiles - all_cgm[w_name + ".10th"] = roll40_400.quantile(0.10) - all_cgm[w_name + ".25th"] = roll40_400.quantile(0.25) - all_cgm[w_name + ".75th"] = roll40_400.quantile(0.75) - all_cgm[w_name + ".90th"] = roll40_400.quantile(0.90) - - # max - all_cgm[w_name + ".max"] = roll40_400.max() - - # median - all_cgm[w_name + ".median"] = roll40_400.median() - - # iqr - all_cgm[w_name + ".iqr"] = ( - all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] - ) - # mean all_cgm[w_name + ".mean"] = roll40_400.mean() From 4c9714aec96f0712265d0fecd67e3330b324589d Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 05:52:58 -0500 Subject: [PATCH 34/46] get episode stats --- .../get_stats/get_cgm_distributions_v3.py | 139 +++++++++++++----- 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index caf73951..70d32771 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -38,7 +38,14 @@ ''' -def get_episodes(df, episode_criterion, min_duration): +def get_episodes( + df, + episode_criterion="cgm < 54", + min_duration=5, +): + # TODO: deal with case where there are nan's in the middle of an episode + # it probably makes sense to interpolate between values iff the gap is + # <= 1 to 6 points (5 to 30 minutes) # put consecutive data that matches in groups df["tempGroups"] = (( @@ -60,7 +67,7 @@ def get_episodes(df, episode_criterion, min_duration): df["duration"] * df[episode_criterion] ) - # get rolling stats on episodes + # mark record as belonging to an episode df["isEpisode"] = ( df["episodeDuration"] >= min_duration ) @@ -69,14 +76,25 @@ def get_episodes(df, episode_criterion, min_duration): df["episodeStart"] = ( (df[episode_criterion]) & (~df[episode_criterion].shift(1).fillna(False)) - & (df["hasCgm"]) - & (df["hasCgm"].shift(1)) +# & (df["hasCgm"]) +# & (df["hasCgm"].shift(1)) + ) + + # calculate the total duration and attach to start record + # which is needed to get the average duration per episode + df["episodeTotalDuration"] = ( + df["episodeStart"] * df["episodeDuration"] + ) + df["episodeTotalDuration"].replace(0, np.nan, inplace=True) + + episode_prefix = ( + "episode." + episode_criterion + + ".durationThreshold=" + str(min_duration) + "." ) df = df[[ - "isEpisode", "episodeStart", - "episodeId", "episodeDuration" - ]].add_prefix("episode." + episode_criterion + ".") + "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration" + ]].add_prefix(episode_prefix) return df @@ -1572,16 +1590,16 @@ def estimate_local_time(df): ## TODO: if data comes in as a .csv, the embedded json fields ## get saved as a string and need to be unwrapped before those fields ## can be expanded. 
IN OTHER WORDS: this code only works with .json data -for d_idx in [0]: - userid = "0d4524bc11" - data = pd.read_json(os.path.join( - "..", "data", "dremio", userid, "PHI-{}.json".format(userid) - )) - -## %% -#for d_idx in range(0, len(all_files)): -# data = pd.read_json(all_files[d_idx]) -# userid = all_files[d_idx][-15:-5] +#for d_idx in [0]: +# userid = "0d4524bc11" +# data = pd.read_json(os.path.join( +# "..", "data", "dremio", userid, "PHI-{}.json".format(userid) +# )) + +# %% +for d_idx in range(0, len(all_files)): + data = pd.read_json(all_files[d_idx]) + userid = all_files[d_idx][-15:-5] metadata = all_donor_metadata.loc[ all_donor_metadata["userid"] == userid, donor_metadata_columns @@ -1946,7 +1964,7 @@ def estimate_local_time(df): "PHI-" + userid + "-cgm-distribution.csv.gz" )) - # get cgm stats + # %% get cgm stats # create a contiguous 5 minute time series of ALL cgm data first_day = combined_cgm_series["roundedUtcTime"].min() metadata["firstCgm." + cgm_model] = first_day @@ -1990,7 +2008,7 @@ def estimate_local_time(df): (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) ) - # make this a function and round ascendingly + # work with all of the non-null data, even 39 = LOW and 401 = HIGH ts39_401 = all_cgm["mg/dL"].copy() # for all the less than (<) criteria @@ -1998,6 +2016,19 @@ def estimate_local_time(df): all_cgm["cgm < " + str(cgm_threshold)] = ( ts39_401.lt(cgm_threshold) ) + # get episodes below these thresholds + for min_duration in [5, 15]: + episode_ts = get_episodes( + all_cgm[[ + "roundedUtcTime", + "hasCgm", + "cgm < " + str(cgm_threshold) + ]].copy(), + episode_criterion="cgm < " + str(cgm_threshold), + min_duration=min_duration + ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + # for all the greter than or equal to (>=) criteria all_cgm["cgm >= " + str(cgm_threshold)] = ( ts39_401.ge(cgm_threshold) @@ -2115,6 +2146,56 @@ def estimate_local_time(df): ).sum() / w_len ) + # get episode stats + # TODO: add in hyper events + # get episodes below these thresholds + for cgm_threshold in [40, 54, 70]: + # get number of episodes per time window + for min_duration in [5, 15]: + "cgm < " + str(cgm_threshold) + episode_name = ( + "episode.cgm < " + str(cgm_threshold) + + ".durationThreshold=" + str(min_duration) + ) + all_cgm[w_name + ".count." + episode_name] = ( + all_cgm[episode_name + ".episodeStart"].rolling( + min_periods=1, + window=w_len + ).sum() + ) + + # get avg. duration of each episode per time window + all_cgm[w_name + ".avgDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).sum() / all_cgm[w_name + ".count." + episode_name] + ) + + # get min duration of each episode per time window + all_cgm[w_name + ".minDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).min() + ) + + # get median duration of each episode per time window + all_cgm[w_name + ".medianDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get max duration of each episode per time window + all_cgm[w_name + ".maxDuration." 
+ episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).max() + ) + # get percent time in different ranges # % Time < 54 all_cgm[w_name + ".lt54Percent"] = ( @@ -2275,26 +2356,6 @@ def estimate_local_time(df): all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] ) - # make an episodes dataframe, and then get stats - # get episodes < 54 - episode_ts = get_episodes( - all_cgm[["roundedUtcTime", "hasCgm", "cgm < 54"]].copy(), - "cgm < 54", - 15 - ) - all_cgm = pd.concat([all_cgm, episode_ts], axis=1) - - # get episodes < 70 - episode_ts = get_episodes( - all_cgm[["roundedUtcTime", "hasCgm", "cgm < 70"]].copy(), - "cgm < 70", - 15 - ) - all_cgm = pd.concat([all_cgm, episode_ts], axis=1) - - # get rolling stats on episodes - pdb.set_trace() - # %% save cgm stats data all_cgm.to_csv(os.path.join( output_stats, From 2fe6065b68f3ebd1df0b0150147cbe5fb2f1f2f9 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 07:27:19 -0500 Subject: [PATCH 35/46] minor refactor --- .../get_stats/get_cgm_distributions_v3.py | 70 +++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 70d32771..740128f7 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2011,6 +2011,21 @@ def estimate_local_time(df): # work with all of the non-null data, even 39 = LOW and 401 = HIGH ts39_401 = all_cgm["mg/dL"].copy() + # some stats should NOT include 39 or 401 + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + ts40_400 = all_cgm["mg/dL.40to400"].copy() + + # for all the less than (<) criteria for cgm_threshold in [40, 54, 70]: all_cgm["cgm < " + str(cgm_threshold)] = ( @@ -2278,7 +2293,10 @@ def estimate_local_time(df): # 3-4X the processing time since it has to sort the data # TODO: make this an option to the function, once it is made # create a rolling object + + # NOTE: these calculations only require 3 points to make roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + roll40_400 = ts40_400.rolling(min_periods=3, window=w_len) # min all_cgm[w_name + ".min"] = roll39_401.min() @@ -2300,29 +2318,10 @@ def estimate_local_time(df): all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] ) - # points that are 39 or 401 should NOT be used most - # calculations because the actual number is <= 39 or >= 401 - # (cgm < 40) OR (cgm > 400) - all_cgm["mg/dL.40to400"] = ( - ts39_401.replace(to_replace=39, value=np.nan) - ) - - all_cgm["mg/dL.40to400"] = ( - all_cgm["mg/dL.40to400"].replace( - to_replace=401, - value=np.nan - ) - ) - - # redefine the time series (ts) for the following stats - ts40_400 = all_cgm["mg/dL.40to400"].copy() - # require at least 3 points to make a stats calculation - w_min = 3 - # recalcuate percent of measurements available all_cgm[w_name + ".40to400availablePercent"] = ( - ts40_400.rolling(min_periods=w_min, window=w_len).count() - ) / w_len + roll40_400.count() / w_len + ) # get the total number of non-null values over this time period all_cgm[w_name + ".40to400missingPercent"] = ( @@ -2337,9 +2336,6 @@ def estimate_local_time(df): all_cgm[w_name + ".40to400availablePercent"] >= 0.8 ) - # create 
a rolling object - roll40_400 = ts40_400.rolling(min_periods=w_min, window=w_len) - # mean all_cgm[w_name + ".mean"] = roll40_400.mean() @@ -2356,11 +2352,27 @@ def estimate_local_time(df): all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] ) - # %% save cgm stats data - all_cgm.to_csv(os.path.join( - output_stats, - "PHI-" + userid + "-cgm-stats.csv.gz" - )) + # %% save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) + # write the most recent example of the 90 day stats + # to the metadata + quarter_ge80Available_idx = ( + all_cgm[all_cgm["quarter.ge80Available"]] + ).index.max() + most_recent_quarter = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + + metadata = pd.merge( + metadata, + most_recent_quarter, + on="hashid", + how="left" + ) print(metadata.T) From 665598d49c90e889b81f17e7990378347bf4d468 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 07:49:45 -0500 Subject: [PATCH 36/46] resolve edge case of not having quarterly stats --- .../get_stats/get_cgm_distributions_v3.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 740128f7..f691f506 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2362,14 +2362,22 @@ def estimate_local_time(df): quarter_ge80Available_idx = ( all_cgm[all_cgm["quarter.ge80Available"]] ).index.max() - most_recent_quarter = all_cgm.loc[ - [quarter_ge80Available_idx], - all_cgm.columns - ] + + if pd.notnull(quarter_ge80Available_idx): + # get the most recent quarter + most_recent = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + else: + most_recent = all_cgm.loc[ + [all_cgm.index.max()], + all_cgm.columns + ] metadata = pd.merge( metadata, - most_recent_quarter, + most_recent, on="hashid", how="left" ) From 5432b3332705951b50f97b00a7c1ff9f0b51cadd Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 10:16:09 -0500 Subject: [PATCH 37/46] initial commit of batch process all cgm distribution and stats --- .../batch_get_cgm_distributions_and_stats.py | 160 ++ .../get_cgm_distributions_and_stats.py | 2437 +++++++++++++++++ 2 files changed, 2597 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py create mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py new file mode 100644 index 00000000..2830fe03 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import datetime as dt +import pandas as pd +import subprocess as sub +import os +import glob +import time +import argparse +from multiprocessing import Pool + + +# %% USER INPUTS (choices to be made in order to run the code) 
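# A minimal sketch of the file-naming assumption behind the defaults in this
# section: the --input-json-data-path default (defined just below) is a
# recursive glob, and run_process() further down recovers the userid with the
# slice json_data_path[-15:-5]. That slice only works for paths ending in a
# 10-character userid followed by ".json" (e.g. "PHI-0d4524bc11.json"). A
# hypothetical helper that makes the same assumption explicit:
#
#     def userid_from_path(json_path):
#         # hypothetical helper: ".../PHI-0d4524bc11.json" -> "0d4524bc11"
#         filename = os.path.basename(json_path)
#         assert filename.endswith(".json") and len(filename) >= 15
#         return filename[-15:-5]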
+codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-i", + "--input-json-data-path", + dest="json_data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data", "dremio", "**", "*.json" + ), + ), + help="the path where json data is located" +) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% FUNCTIONS +def run_process(json_data_path): + userid = json_data_path[-15:-5] + + p = sub.Popen( + [ + "python", "get_cgm_distributions_and_stats.py", + "-i", json_data_path, + "-u", userid, + "-d", args.date_stamp, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") + + if errors == '': + print(output) + else: + print(errors) + + return + + +# %% GET A LIST OF DONOR JSON FILE LOCATIONS +all_files = glob.glob(args.json_data_path, recursive=True) + +# this is a good test to make sure run process is working before running +#run_process(all_files[0]) +#pdb.set_trace() + +# use multiple cores to process +startTime = time.time() +print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) +pool = Pool(os.cpu_count()) +pool.map(run_process, all_files) +pool.close() +endTime = time.time() +print( + "finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +) +total_duration = round((endTime - startTime) / 60, 1) +print("total duration was %s minutes" % total_duration) + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +all_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") +) +print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA +print("combining all distribution data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +distribution_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f, index_col=[0]) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +distribution_metadata.to_csv( + os.path.join( + donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" + ) +) +print("saving all-dataset-info-metadata...code complete") diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py 
b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py new file mode 100644 index 00000000..af0d0d50 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -0,0 +1,2437 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm distributions and stats for a single tidepool (donor) dataset +from a data that comes from a json file (does NOT work with data save as csv) +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt +import argparse +import pdb + +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist + +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def get_episodes( + df, + episode_criterion="cgm < 54", + min_duration=5, +): + # TODO: deal with case where there are nan's in the middle of an episode + # it probably makes sense to interpolate between values iff the gap is + # <= 1 to 6 points (5 to 30 minutes) + + # put consecutive data that matches in groups + df["tempGroups"] = (( + df[episode_criterion] != df[episode_criterion].shift() + ).cumsum()) + + df["episodeId"] = ( + df["tempGroups"] * df[episode_criterion] + ) + + # group by the episode groups + episode_groups = df.groupby("episodeId") + episodes = episode_groups["roundedUtcTime"].count().reset_index() + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True) + + df = pd.merge(df, episodes, on="episodeId", how="left") + df["episodeDuration"] = ( + df["duration"] * df[episode_criterion] + ) + + # mark record as belonging to an episode + df["isEpisode"] = ( + df["episodeDuration"] >= min_duration + ) + + # get the hypo episode starts so we only count each episode once + df["episodeStart"] = ( + (df[episode_criterion]) + & (~df[episode_criterion].shift(1).fillna(False)) + ) + + # calculate the total duration and attach to start record + # which is needed to get the average duration per episode + df["episodeTotalDuration"] = ( + df["episodeStart"] * df["episodeDuration"] + ) + df["episodeTotalDuration"].replace(0, np.nan, inplace=True) + + episode_prefix = ( + "episode." + episode_criterion + + ".durationThreshold=" + str(min_duration) + "." 
+ ) + + df = df[[ + "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration" + ]].add_prefix(episode_prefix) + + return df + + +def get_slope(y): + if "array" not in type(y).__name__: + raise TypeError('Expecting a numpy array') + + count_ = len(y) + + x = np.arange(start=0, stop=count_*5, step=5) + + sum_x = x.sum() + sum_y = y.sum() + sum_xy = (x * y).sum() + sum_x_squared = (x * x).sum() + + slope = ( + ((count_ * sum_xy) - (sum_x * sum_y)) + / ((count_ * sum_x_squared) - (sum_x * sum_x)) + ) + + return slope + + +def expand_entire_dict(ts): + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + notnull_idx = ts.index[ts.notnull()] + temp_df = pd.DataFrame( + ts[notnull_idx].tolist(), + index=notnull_idx + ) + + return temp_df + + +def expand_embedded_dict(ts, key_): + '''Expanded a single field that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process + ''' + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? + # though, the current method is fairly quick and compact + temp_df = expand_entire_dict(ts) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." notation is used to reference nested json + + ''' + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts + + +def add_upload_info_to_cgm_records(groups, df): + upload_locations = [ + "upload.uploadId", + "upload.deviceManufacturers", + "upload.deviceModel", + "upload.deviceSerialNumber", + "upload.deviceTags" + ] + + if "upload" in groups["type"].unique(): + upload = groups.get_group("upload").dropna(axis=1, how="all").add_prefix("upload.") + df = pd.merge( + left=df, + right=upload[list(set(upload_locations) & set(list(upload)))], + left_on="uploadId", + right_on="upload.uploadId", + how="left" + ) + + return df + + +def expand_heathkit_cgm_fields(df): + # TODO: refactor the code/function that originally grabs + # these fields, so we are only doing it once, and so + # we don't have to drop the columns for the code below to work. 
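    # Illustrative sketch (hypothetical values) of what the expansions below
    # produce: expand_entire_dict() turns a column of dicts into one column
    # per key, and add_prefix() namespaces those keys so that deeper levels
    # can be expanded on a later pass, e.g.
    #
    #     toy = pd.Series([{"name": "CGM", "payload": {"model": "G6"}}])
    #     flat = pd.DataFrame(toy.tolist()).add_prefix("origin.")
    #     # flat now has columns "origin.name" and "origin.payload"; the
    #     # embedded "origin.payload" dict can then be expanded the same way.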
+ drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(df): + df.drop(columns=[drop_col], inplace=True) + + healthkit_locations = [ + "origin", + "origin.payload", + "origin.payload.device", + "origin.payload.sourceRevision", + "origin.payload.sourceRevision.source", + "payload", + ] + + for hk_loc in healthkit_locations: + if hk_loc in list(df): + temp_df = ( + expand_entire_dict(df[hk_loc].copy()).add_prefix(hk_loc + ".") + ) + df = pd.concat([df, temp_df], axis=1) + + return df + + +def get_dexcom_cgm_model(df): + # add cgm model + + dexcom_model_locations = [ + "deviceId", + "deviceManufacturers", + "upload.deviceManufacturers", + "deviceModel", + "upload.deviceModel", + "deviceSerialNumber", + "upload.deviceSerialNumber", + "origin.payload.sourceRevision.source.name", + "payload.transmitterGeneration", + "payload.HKMetadataKeySyncIdentifier", + "payload.transmitterId", + ] + + for model_location in dexcom_model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + # G4 + g4_idx = str_list.contains("G4", case=False, na=False) + df.loc[g4_idx, "cgmModel"] = "G4" + df.loc[g4_idx, "cgmModelSensedFrom"] = model_location + + # G5 + g5_idx = str_list.contains("G5", case=False, na=False) + df.loc[g5_idx, "cgmModel"] = "G5" + df.loc[g5_idx, "cgmModelSensedFrom"] = model_location + + # G6 + g6_idx = str_list.contains("G6", case=False, na=False) + df.loc[g6_idx, "cgmModel"] = "G6" + df.loc[g6_idx, "cgmModelSensedFrom"] = model_location + + # edge case of g5 and g6 + g5_g6_idx = (g5_idx & g6_idx) + df.loc[g5_g6_idx, "cgmModel"] = "G5_G6" + df.loc[g5_g6_idx, "cgmModelSensedFrom"] = model_location + + # case of "transmitterId" + if ( + ("transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # if length of string is 5, then it is likely a G4 sensor + length5_idx = str_list.len() == 5 + df.loc[length5_idx, "cgmModel"] = "G4" + df.loc[length5_idx, "cgmModelSensedFrom"] = model_location + + # if length of string > 5 then might be G5 or G6 + length_gt5_idx = str_list.len() > 5 + + # if sensor stats with 4 then likely G5 + starts4_idx = str_list.startswith("4") + df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5" + df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location + + # if sensor stats with 2 or 8 then likely G6 + starts2_6_idx = ( + (str_list.startswith("2")) | (str_list.startswith("8")) + ) + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6" + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] + + +def get_non_dexcom_cgm_model(df): + # non-dexcom cgm model query + model_locations = ["deviceId"] + + # model types (NOTE: for medtronic getting pump type not cgm) + models_670G = "MMT-158|MMT-178" + models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" + models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" + models_530G = ( + "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754" + ) + models_523_723 = "MedT-523|MedT-723|Revel - 523|Revel - 723" # 523/723 + models_libre = "AbbottFreeStyleLibre" + models_animas = "IR1295" + # NOTE: 
the tandem G4 will first be written as G5_G6, + # but the logic should overwrite back to G4 + models_tandem_G5_G6 = "tandem" + models_tandem_G4 = "4628003|5448003" + + non_dex_models = [ + models_670G, models_640G, models_630G, models_530G, models_523_723, + models_libre, models_animas, models_tandem_G5_G6, models_tandem_G4 + ] + + non_dex_model_names = [ + "670G", "640G", "630G", "530G", "523_723", + "LIBRE", "G4", "G5_G6", "G4" + ] + + for model_location in model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): + + model_idx = str_list.contains(non_dex_model, na=False) + df.loc[model_idx, "cgmModel"] = model_name + df.loc[model_idx, "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + TODO: I think that durations are coming in as floats too, so we need + to refactor to account for that. 
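    Illustrative example (hypothetical values): a physical activity record
    can carry an embedded dict rather than a number in "duration", e.g.
        {"units": "minutes", "value": 30}
    which is why the comparison below is restricted to rows whose duration
    has type "int"; dict-valued (and null) durations are never compared
    against zero.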
+ ''' + if "duration" in list(df): + type_ = df["duration"].apply(get_type) + valid_index = ((type_ == "int") & (df["duration"].notnull())) + n_negative_durations = sum(df.loc[valid_index, "duration"] < 0) + if n_negative_durations > 0: + df = df[~(df.loc[valid_index, "duration"] < 0)] + else: + n_negative_durations = np.nan + + return df, n_negative_durations + + +def tslim_calibration_fix(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored to only expand one field + ''' + + # expand payload field one level + if "payload" in list(df): + df["payload.calibration_reading"] = ( + expand_embedded_dict(df["payload"], "calibration_reading") + ) + + if df["payload.calibration_reading"].notnull().sum() > 0: + + search_for = ['tan'] + tandem_data_index = ( + (df["deviceId"].str.contains('|'.join(search_for))) + & (df["type"] == "deviceEvent") + ) + + cal_index = df["payload.calibration_reading"].notnull() + valid_index = tandem_data_index & cal_index + + n_cal_readings = sum(valid_index) + + if n_cal_readings > 0: + # if reading is > 30 then it is in the wrong units + if df["payload.calibration_reading"].min() > 30: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + / MGDL_PER_MMOLL + ) + else: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + ) + else: + n_cal_readings = 0 + else: + n_cal_readings = 0 + return df, n_cal_readings + + +def replace_smoothed_cgm_values(df): + + if 'payload.realTimeValue' in list(df): + raw_val_idx = df['payload.realTimeValue'].notnull() + n_replaced = raw_val_idx.sum() + df.loc[raw_val_idx, "mg/dL"] = ( + df.loc[raw_val_idx, "payload.realTimeValue"] + ) + else: + n_replaced = np.nan + + raw_values = df["mg/dL"] + + return raw_values, n_replaced + + +def get_healthkit_timezone(df): + ''' + TODO: refactor to account for more efficient way to get embedded json + ''' + if "payload" in list(df): + df["payload.HKTimeZone"] = ( + expand_embedded_dict(df["payload"], "HKTimeZone") + ) + if "timezone" not in list(df): + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + else: + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "timezone"] = ( + df.loc[hk_tz_idx, "payload.HKTimeZone"] + ) + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + return df[["timezone", "deviceType"]] + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = get_healthkit_timezone(df) + + df["timezone"].fillna(method='ffill', inplace=True) + df["timezone"].fillna(method='bfill', inplace=True) + + return df["timezone"] + + +def make_tz_unaware(date_time): + return date_time.replace(tzinfo=None) + + +def to_utc_datetime(df): + ''' + this is new to deal with perfomance issue with the previous method + of converting to string to datetime with pd.to_datetime() + ''' + utc_time_tz_aware 
= pd.to_datetime( + df["time"], + format="%Y-%m-%dT%H:%M:%S", + utc=True + ) + utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) + + return utc_tz_unaware + + +# apply the large timezone offset correction (AKA Darin's fix) +def timezone_offset_bug_fix(df): + ''' + this is taken from estimate-local-time.py + TODO: add in unit testing where there is no TZP that is > 840 or < -720 + ''' + + if "timezoneOffset" in list(df): + + while ((df.timezoneOffset > 840).sum() > 0): + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] + - (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 + ) + + while ((df.timezoneOffset < -720).sum() > 0): + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] + + (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 + ) + + return df + + +def get_local_time(df): + + tzo = df[['utcTime', 'inferredTimezone']].apply( + lambda x: get_timezone_offset(*x), axis=1 + ) + local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") + + return local_time + + +def round_time( + df, + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +): + ''' + A general purpose round time function that rounds the "time" + field to nearest minutes + INPUTS: + * a dataframe (df) or time series that contains only one time field + that you want to round + * time_interval_minutes (defaults to 5 minutes given that most cgms + output every 5 minutes) + * start_with_first_record starts the rounding with the first record + if True, and the last record if False (defaults to True) + * return_calculation_columns specifies whether the extra columns + used to make calculations are returned + refactored name(s) to meet style guide + ''' + # if a time series is passed in, convert to dataframe + if "Series" in get_type(df): + df = pd.DataFrame(df) + columns_ = list(df) + if len(columns_) > 1: + sys.exit( + "Error: df should only have one time column" + ) + else: + df.rename(columns={columns_[0]: "t"}, inplace=True) + + df.sort_values( + by="t", + ascending=start_with_first_record, + inplace=True + ) + + df.reset_index(drop=False, inplace=True) + df.rename(columns={"index": "originalIndex"}, inplace=True) + + # calculate the time between consecutive records + df["t_shift"] = df["t"].shift(1) + df["timeBetweenRecords"] = round( + (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) + + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) + ) * time_interval_minutes + + # separate the data into chunks if timeBetweenRecords is greater than + # 2 times the minutes so the rounding process + # starts over + big_gaps = list( + df.query("abs(timeBetweenRecords) > " + + str(time_interval_minutes * 2)).index + ) + big_gaps.insert(0, 0) + big_gaps.append(len(df)) + + for gap_index in range(0, len(big_gaps) - 1): + chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] + first_chunk = df["t"][big_gaps[gap_index]] + + # calculate the time difference between + # each time record and the first record + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] = ( + (chunk - first_chunk).dt.days*(86400/60) + + (chunk - first_chunk).dt.seconds/60 + ) + + # then round to the nearest X Minutes + # NOTE: the ".000001" 
ensures that mulitples of 2:30 always rounds up. + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ] = round( + (df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] / time_interval_minutes) + 0.000001 + ) * (time_interval_minutes) + + rounded_first_record = ( + first_chunk + pd.Timedelta("1microseconds") + ).round(str(time_interval_minutes) + "min") + + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedTime" + ] = rounded_first_record + pd.to_timedelta( + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ], unit="m" + ) + + if return_calculation_columns is False: + df.drop( + columns=[ + "timeBetweenRecords", + "minutesFromFirstRecord", + "roundedMinutesFromFirstRecord" + ], inplace=True + ) + # sort back to the original index + df.sort_values(by="originalIndex", inplace=True) + + return df["roundedTime"].values + + +def add_upload_time(df): + ''' + this is taken from a colab notebook that is not in our github + given that it has been refactored to account for bug where there are + no upload records + NOTE: this is a new fix introduced with healthkit data...we now have + data that does not have an upload record + + ''' + + if "upload" in df.type.unique(): + upload_times = pd.DataFrame( + df[df.type == "upload"].groupby("uploadId")["utcTime"].max() + ) + else: + upload_times = pd.DataFrame(columns=["utcTime"]) + + unique_uploadIds = set(df["uploadId"].unique()) + unique_uploadRecords = set( + df.loc[df["type"] == "upload", "uploadId"].unique() + ) + uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords + + for upId in uploadIds_missing_uploadRecords: + last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() + upload_times.loc[upId, "utcTime"] = last_upload_time + + upload_times.reset_index(inplace=True) + upload_times.rename( + columns={"utcTime": "uploadTime", + "index": "uploadId"}, + inplace=True + ) + + df = pd.merge(df, upload_times, how='left', on='uploadId') + + return df["uploadTime"].values + + +def remove_invalid_cgm_values(df): + + nBefore = len(df) + # remove values < 38 and > 402 mg/dL + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] > 402))].index) + nRemoved = nBefore - len(df) + + return df, nRemoved + + +def removeDuplicates(df, criteriaDF): + nBefore = len(df) + df = df.loc[~(df[criteriaDF].duplicated())] + df = df.reset_index(drop=True) + nDuplicatesRemoved = nBefore - len(df) + + return df, nDuplicatesRemoved + + +def removeCgmDuplicates(df, timeCriterion, valueCriterion="value"): + if timeCriterion in df: + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + dfIsNull = df[df[timeCriterion].isnull()] + dfNotNull = df[df[timeCriterion].notnull()] + dfNotNull, nDuplicatesRemoved = ( + removeDuplicates(dfNotNull, [timeCriterion, valueCriterion]) + ) + df = pd.concat([dfIsNull, dfNotNull]) + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + else: + nDuplicatesRemoved = 0 + + return df, nDuplicatesRemoved + + +# get rid of spike data +def remove_spike_data(df): + if "origin" in list(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + df[spike_loc] = get_embedded_field(df["origin"], spike_loc) + notnull_idx = 
df[spike_loc].notnull() + df_notnull = df[notnull_idx] + is_spike = df_notnull[spike_loc].astype(str).str.lower().str.contains("spike") + spike_idx = df_notnull[is_spike].index + df.drop(spike_idx, inplace=True) + + nRemoved = nBefore - len(df) + + else: + nRemoved = np.nan + + return df, nRemoved + + +# %% ESTIMATE LOCAL TIME FUNCTIONS +def convert_deprecated_timezone_to_alias(df, tzAlias): + if "timezone" in df: + uniqueTimezones = df.timezone.unique() + uniqueTimezones = uniqueTimezones[pd.notnull(df.timezone.unique())] + + for uniqueTimezone in uniqueTimezones: + alias = tzAlias.loc[tzAlias.tz.str.endswith(uniqueTimezone), + ["alias"]].values + if len(alias) == 1: + df.loc[df.timezone == uniqueTimezone, ["timezone"]] = alias + + return df + + +def create_contiguous_day_series(df): + first_day = df["date"].min() + last_day = df["date"].max() + rng = pd.date_range(first_day, last_day).date + contiguousDaySeries = \ + pd.DataFrame(rng, columns=["date"]).sort_values( + "date", ascending=False).reset_index(drop=True) + + return contiguousDaySeries + + +def add_device_type(df): + col_headings = list(df) + if "deviceType" not in col_headings: + df["deviceType"] = np.nan + if "deviceTags" in col_headings: + # first make sure deviceTag is in string format + df["deviceTags"] = df.deviceTags.astype(str) + # filter by type not null device tags + ud = df[df["deviceTags"].notnull()].copy() + # define a device type (e.g., pump, cgm, or healthkit) + ud.loc[ + ((ud["deviceTags"].str.contains("pump")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "pump" + + # define a device type (e.g., cgm) + ud.loc[ + ((ud["deviceTags"].str.contains("cgm")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "cgm" + + return ud["deviceType"] + else: + return np.nan + + +def get_timezone_offset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def add_device_day_series(df, dfContDays, deviceTypeName): + if len(df) > 0: + dfDayGroups = df.groupby("date") + if "timezoneOffset" in df: + dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) + else: + dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) + dfDaySeries.index.name = "date" + + if "upload" in deviceTypeName: + if (("timezone" in df) & (df["timezone"].notnull().sum() > 0)): + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan + + + dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). 
\ + rename(columns={deviceTypeName + ".date": "date"}) + + dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), + on="date", how="left") + + else: + dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan + + return dfContDays + + +def impute_upload_records(df, contDays, deviceTypeName): + daySeries = \ + add_device_day_series(df, contDays, deviceTypeName) + + if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): + for i in daySeries.index[1:]: + if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): + daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( + daySeries.loc[i-1, deviceTypeName + ".timezone"] + ) + if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): + tz = daySeries.loc[i, deviceTypeName + ".timezone"] + tzo = get_timezone_offset( + pd.to_datetime(daySeries.loc[i, "date"]), + tz + ) + daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo + + if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): + daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ + daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] + + else: + daySeries[deviceTypeName + ".timezone"] = np.nan + daySeries[deviceTypeName + ".timeProcessing"] = np.nan + + return daySeries + + +def add_home_timezone(df, contDays): + + if (("timezone" in df) & (df["timezone"].notnull().sum()> 0)): + homeTimezone = df["timezone"].describe()["top"] + tzo = contDays.date.apply( + lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) + + contDays["home.imputed.timezoneOffset"] = tzo + contDays["home.imputed.timezone"] = homeTimezone + + else: + contDays["home.imputed.timezoneOffset"] = np.nan + contDays["home.imputed.timezone"] = np.nan + contDays["home.imputed.timeProcessing"] = np.nan + + return contDays + + +def estimateTzAndTzoWithUploadRecords(cDF): + + cDF["est.type"] = np.nan + cDF["est.gapSize"] = np.nan + cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] + cDF["est.annotations"] = np.nan + + if "upload.timezone" in cDF: + cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" + cDF["est.timezone"] = cDF["upload.timezone"] + cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] + else: + cDF["est.timezone"] = np.nan + cDF["est.timeProcessing"] = np.nan + + cDF.loc[((cDF["est.timezoneOffset"] != + cDF["home.imputed.timezoneOffset"]) & + (pd.notnull(cDF["est.timezoneOffset"]))), + "est.annotations"] = "travel" + + return cDF + + +def assignTzoFromImputedSeries(df, i, imputedSeries): + df.loc[i, ["est.type"]] = "DEVICE" + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, imputedSeries + ".timezoneOffset"] + + df.loc[i, ["est.timezone"]] = \ + df.loc[i, imputedSeries + ".timezone"] + + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, imputedSeries + ".timeProcessing"] + + return df + + +def compareDeviceTzoToImputedSeries(df, sIdx, device): + for i in sIdx: + # if the device tzo = imputed tzo, then chose the imputed tz and tzo + # note, dst is accounted for in the imputed tzo + for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", + "healthkit.upload.imputed", "home.imputed"]: + # if the estimate has not already been made + if pd.isnull(df.loc[i, "est.timezone"]): + + if df.loc[i, device + ".timezoneOffset"] == \ + df.loc[i, imputedSeries + ".timezoneOffset"]: + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, + "tz-inferred-from-" + imputedSeries) + + # if the imputed series has a timezone estimate, then see if + # the current day is a dst change day + elif (pd.notnull(df.loc[i, imputedSeries + 
".timezone"])): + imputedTimezone = df.loc[i, imputedSeries + ".timezone"] + if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): + + dstRange = getRangeOfTZOsForTimezone(imputedTimezone) + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, "dst-change-day") + df = addAnnotation( + df, i, "tz-inferred-from-" + imputedSeries) + + return df + + +def estimateTzAndTzoWithDeviceRecords(cDF): + + # 2A. use the TZO of the pump or cgm device if it exists on a given day. In + # addition, compare the TZO to one of the imputed day series (i.e., the + # upload and home series to see if the TZ can be inferred) + for deviceType in ["pump", "cgm"]: + # find the indices of days where a TZO estimate has not been made AND + # where the device (e.g., pump or cgm) TZO has data + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + # compare the device TZO to the imputed series to infer time zone + cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) + + # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be + # inferred from the previous day's TZO. If the device TZO is equal to the + # previous day's TZO, AND if the previous day has a TZ estimate, use the + # previous day's TZ estimate for the current day's TZ estimate + for deviceType in ["pump", "cgm"]: + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + + cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) + + # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the + # pump and cgm tzo do not differ by more than 60 minutes. If they differ + # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we + # allow the estimates to be off by 60 minutes as there are a lot of cases + # where the devices are off because the user changes the time for DST, + # at different times + sIndices = cDF[((cDF["est.type"] == "DEVICE") & + (cDF["pump.timezoneOffset"].notnull()) & + (cDF["cgm.timezoneOffset"].notnull()) & + (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) + )].index + + tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - + cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 + + idx = tzoDiffGT60.index[tzoDiffGT60] + + cDF.loc[idx, ["est.type"]] = "UNCERTAIN" + for i in idx: + cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") + + return cDF + + +def imputeTzAndTzo(cDF): + + sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + if len(hasTzoIndices) > 0: + if len(sIndices) > 0: + lastDay = max(sIndices) + + while ((sIndices.min() < max(hasTzoIndices)) & + (len(sIndices) > 0)): + + currentDay, prevDayWithDay, nextDayIdx = \ + getImputIndices(cDF, sIndices, hasTzoIndices) + + cDF = imputeByTimezone(cDF, currentDay, + prevDayWithDay, nextDayIdx) + + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (~cDF["est.annotations"].str.contains( + "unable-to-impute-tzo").fillna(False)))].index + + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + + # try to impute to the last day (earliest day) in the dataset + # if the last record has a timezone that is the home record, then + # impute using the home timezone + if len(sIndices) > 0: + currentDay = min(sIndices) + prevDayWithDay = currentDay - 1 + gapSize = lastDay - currentDay + + for i in range(currentDay, lastDay + 1): + if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ + cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = localTime + df.loc[dIdx, ["est.timezoneOffset"]] = tzo + return df + + +def applyLocalTimeEstimates(df, cDF): + df = pd.merge(df, cDF, how="left", on="date") + df["est.localTime"] = \ + df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") + + df = correctEstimatesAroundDst(df, cDF) + + return df["est.localTime"].values + + +def isDSTChangeDay(currentDate, currentTimezone): + tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), + currentTimezone) + tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + + dt.timedelta(days=-1), currentTimezone) + + return (tzoCurrentDay != tzoPreviousDay) + + +def tzoRangeWithComparisonTz(df, i, comparisonTz): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + if pd.notnull(comparisonTz): + rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) + else: + comparisonTz = np.nan + rangeTzos = np.array([]) + + return rangeTzos + + +def tzAndTzoRangePreviousDay(df, i): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + comparisonTz = df.loc[i-1, "est.timezone"] + + rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) + + return comparisonTz, rangeTzos + + +def assignTzoFromPreviousDay(df, i, previousDayTz): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezone"]] = previousDayTz + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) + + df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] + df = addAnnotation(df, i, "tz-inferred-from-prev-day") + + return df + + +def assignTzoFromDeviceTzo(df, i, device): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + df = addAnnotation(df, i, "likely-travel") + df = addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. + if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
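+                    # (illustrative note added for clarity, not part of the
+                    # original logic: for a home timezone such as
+                    # "US/Pacific", the dstRange returned by
+                    # getRangeOfTZOsForTimezone spans -480 (PST) to -420
+                    # (PDT) in 15-minute steps, so a device tzo of -480
+                    # paired with a home-imputed tzo of -420 on a day that
+                    # is not a DST change day falls into the first branch
+                    # below and is annotated "likely-dst-error-OR-travel")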
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i, "home.imputed.timezoneOffset"] == + min(dstRange)) | + (df.loc[i, "home.imputed.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + + else: + df = assignTzoFromDeviceTzo(df, i, device) + + return df + + +def getImputIndices(df, sIdx, hIdx): + + lastDayIdx = len(df) - 1 + + currentDayIdx = sIdx.min() + tempList = pd.Series(hIdx) - currentDayIdx + prevDayIdx = currentDayIdx - 1 + nextDayIdx = \ + min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) + + return currentDayIdx, prevDayIdx, nextDayIdx + + +def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): + + gapSize = (nextDaywData - currentDay) + + if prevDaywData >= 0: + + if df.loc[prevDaywData, "est.timezone"] == \ + df.loc[nextDaywData, "est.timezone"]: + + tz = df.loc[prevDaywData, "est.timezone"] + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezone"]] = tz + + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + # TODO: this logic should be updated to handle the edge case + # where the day before and after the gap have differing TZ, but + # the same TZO. 
In that case the gap should be marked as UNCERTAIN + elif df.loc[prevDaywData, "est.timezoneOffset"] == \ + df.loc[nextDaywData, "est.timezoneOffset"]: + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[prevDaywData, "est.timezoneOffset"] + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + return df + + +def addAnnotation(df, idx, annotationMessage): + if pd.notnull(df.loc[idx, "est.annotations"]): + df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ + ", " + annotationMessage + else: + df.loc[idx, ["est.annotations"]] = annotationMessage + + return df + + +def getTimezoneOffset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def estimate_local_time(df): + df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later + contiguous_days = create_contiguous_day_series(df) + + df["deviceType"] = add_device_type(df) + cDays = add_device_day_series(df, contiguous_days, "upload") + + # create day series for cgm df + if "timezoneOffset" not in list(df): + df["timezoneOffset"] = np.nan + + cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + +# %% MAIN FUNCTION +def get_distribution_and_stats( + json_data_path, + userid, + date_stamp, + save_data_path +): + + phi_date = "PHI-" + date_stamp + + output_metadata = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-metadata" + ) + + output_distribution = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-distributions" + ) + debug_duplicates = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-debug-cgm-duplicates" + ) + output_stats = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-stats" + ) + + make_folder_if_doesnt_exist( + [output_metadata, output_distribution, debug_duplicates, output_stats] + ) + + timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False + ) + + donor_metadata_columns = [ + 'userid', + 'diagnosisType', + 'diagnosisDate', + 'biologicalSex', + 'birthday', + 'targetTimezone', + 'targetDevices', + 'isOtherPerson', + ] + + # load in data + data = pd.read_json(json_data_path) + + # load in donor metadata + all_donor_metadata = pd.read_csv( + os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-donor-metadata.csv"), + low_memory=False + ) + + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns + ] + + print("starting", userid) + + # HASH USER ID + hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) + data["userid"] = userid + data["hashid"] = hashid + metadata["hashid"] = hashid + + # CLEAN DATA + + # NOTE: moving remove negative durations to type specific cleaning + # TODO: ask backend to change "duration" to only include one object type + + # Tslim calibration bug fix + data, n_cal_readings = tslim_calibration_fix(data.copy()) + metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + # fix large timzoneOffset bug in utcbootstrapping + data = timezone_offset_bug_fix(data.copy()) + + # add healthkit timezome information + # TODO: refactor this function to only require fields that might have hk tz + data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) + + # convert deprecated timezones to their aliases + data = convert_deprecated_timezone_to_alias(data, timezone_aliases) + + # TIME RELATED ITEMS + data["utcTime"] = to_utc_datetime(data[["time"]].copy()) + + # add upload time to the data, which is needed for: + # getting rid of duplicates and useful for getting local time + + data["uploadTime"] = ( + add_upload_time(data[["type", "uploadId", "utcTime"]].copy()) + ) + +# # estimate local time (refactor of estimate-local-time.py) +# data["localTime"], local_time_metadata = estimate_local_time(data.copy()) +# +# TODO: fix this issue with estimate local time +# ''' +# //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649 +# FutureWarning: elementwise comparison failed; returning scalar instead, +# but in the future will perform elementwise comparison result = method(y) +# ''' + + # round all data to the nearest 5 minutes + data["roundedUtcTime"] = round_time( + data["utcTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False + ) + + # TIME CATEGORIES + data["date"] = data["roundedUtcTime"].dt.date + + # AGE, & YLW + # TODO: make this a function + if metadata["birthday"].values[0] is not np.nan: + bDate = 
pd.to_datetime(metadata["birthday"].values[0][0:7]) + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + else: + data["age"] = np.nan + + if metadata["diagnosisDate"].values[0] is not np.nan: + dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) + data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) + else: + data["ylw"] = np.nan + + # GROUP DATA BY TYPE + # first sort by upload time (used when removing dumplicates) + data.sort_values("uploadTime", ascending=False, inplace=True) + groups = data.groupby(by="type") + + # check to see if person is looping + if "basal" in data["type"].unique(): + basal = groups.get_group("basal").dropna(axis=1, how="all") + if "deliveryType" in list(basal): + bd = basal.loc[ + basal["deliveryType"] == "temp", + ["date", "deliveryType"] + ] + temp_basal_counts = ( + pd.DataFrame( + bd.groupby("date").deliveryType.count() + ).reset_index() + ) + temp_basal_counts.rename( + {"deliveryType": "tempBasalCounts"}, + axis=1, + inplace=True + ) + data = pd.merge(data, temp_basal_counts, on="date", how="left") + # >= 25 temp basals per day is likely looping + data["isLoopDay"] = data["tempBasalCounts"] >= 25 + # redefine groups with the new data + groups = data.groupby(by="type") + + else: + data["isLoopDay"] = np.nan + else: + data["isLoopDay"] = np.nan + + # %% CGM DATA + if "cbg" in data["type"].unique(): + # sort data with + metadata["cgmData"] = True + + # filter by cgm + cgm = groups.get_group("cbg").copy() + + # sort data + cgm.sort_values("roundedUtcTime", ascending=False, inplace=True) + cgm.reset_index(drop=False, inplace=True) + + # calculate cgm in mg/dL + cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) + + # get rid of spike data + cgm, nSpike = remove_spike_data(cgm.copy()) + metadata["nSpike"] = nSpike + + # assign upload cgm device info to cgm records in that upload + cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) + + # check to see if cgm info exists in healthkit locations + cgm = expand_heathkit_cgm_fields(cgm.copy()) + + # replace smoothed cgm values with raw values (if they exist) + # this must run after expand_heathkit_cgm_fields _ + cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = ( + replace_smoothed_cgm_values(cgm.copy()) + ) + + # get cgm models + cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan + + # dexcom cgm models (G4, G5, G6) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_dexcom_cgm_model(cgm.copy()) + ) + + # for non dexcom cgms + # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_non_dexcom_cgm_model(cgm.copy()) + ) + + # get metadata on cgm models and devices + metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() + metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique()) + if "deviceId" in list(cgm): + metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) + + # clean distributions + # break up all traces by cgm model + combined_cgm_series = pd.DataFrame() + cgm_models = cgm.groupby(by="cgmModel") + + for cgm_model in cgm_models.groups.keys(): + print("working on", cgm_model) + temp_cgm = cgm_models.get_group(cgm_model) + + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) + temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm) + metadata["nInvalidCgmValues." 
+ cgm_model] = nInvalidCgmValues + + # sort by upload time before getting rid of duplicates + temp_cgm.sort_values("uploadTime", ascending=False, inplace=True) + + # get rid of duplicates that have the same ["deviceTime", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same ["time", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same roundedTime + temp_cgm, n_cgm_dups_removed = ( + removeDuplicates(temp_cgm, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # create a contiguous 5 minute time series + first_day = temp_cgm["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = temp_cgm["roundedUtcTime"].max() + metadata["lastCgm." + cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=False + ).reset_index(drop=True) + + # merge with cgm data + cgm_series = pd.merge( + contiguous_data, + temp_cgm[[ + "roundedUtcTime", "hashid", "isLoopDay", + "cgmModel", "age", "ylw", "mg/dL" + ]], + on="roundedUtcTime", + how="left" + ) + + # sort so that the oldest data point is on top + cgm_series.sort_values( + "roundedUtcTime", ascending=True, inplace=True + ) + cgm_series.reset_index(drop=True, inplace=True) + + # get dexcom icgm bins + value_bins = np.array( + [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403] + ) + value_bin_names = ( + "< 40", "40-60", "61-80", "81-120", "121-160", "161-200", + "201-250", "251-300", "301-350", "351-400", "> 400" + ) + cgm_series["valueBin"] = pd.cut( + cgm_series["mg/dL"], value_bins, labels=value_bin_names + ) + + # get the previous val + cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) + + # get difference between current and previous val + cgm_series["diffFromPrevVal"] = ( + cgm_series["mg/dL"] - cgm_series["previousVal"] + ) + + # calculate the rate from previous value (mg/dL/min) + cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5 + + # get dexcom icgm rate bins + rate_bins = np.array( + [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100] + ) + # NOTE: bracket means include, parentheses means exclude + rate_bin_names = ( + "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2", + ) + cgm_series["rateBin"] = pd.cut( + cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names + ) + + # through in the join category + cgm_series["valAndRateBin"] = ( + cgm_series["valueBin"].astype(str) + + " & " + + cgm_series["rateBin"].astype(str) + ) + + # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes + cgm_series["slope15"] = ( + cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True) + ) + + cgm_series["slope30"] = ( + cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True) + ) + + cgm_series["slope60"] = ( + cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True) + ) + + # add in the next value + cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1) + + # get difference or relative increase/decrease of next value + cgm_series["relativeNextValue"] = ( + cgm_series["nextVal"] - cgm_series["mg/dL"] + ) + + # rate of next 
value + cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5 + + # drop rows where there is no information + cgm_series.dropna(subset=['hashid'], inplace=True) + metadata["nCgmDataPoints." + cgm_model] = len(cgm_series) + + # append cgm model to a larger table + combined_cgm_series = pd.concat( + [combined_cgm_series, cgm_series], + ignore_index=True + ) + if len(combined_cgm_series) > 0: + # sort so that the oldest data point is on top + # and that the G5_G6 get deleted if they are apart of a duplicate + combined_cgm_series["cgmModel_G5_and_G6"] = ( + combined_cgm_series["cgmModel"] == "G5_G6" + ) + combined_cgm_series.sort_values( + by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"], + ascending=[False, True, False], + inplace=True + ) + combined_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique()) + cgm_len = len(combined_cgm_series) + metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len + + nDuplicate_cgm = cgm_len - nUnique_cgm_times + metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm + + # if there are still duplicates, get rid of them + if nDuplicate_cgm > 0: + # save the duplicates for further examination + combined_cgm_series.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz" + )) + + cgm.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz" + )) + + # get rid of duplicates + combined_cgm_series, n_cgm_dups_removed = ( + removeDuplicates(combined_cgm_series, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = ( + n_cgm_dups_removed + ) + metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series) + + # add whether data is dexcom cgm or not + combined_cgm_series["dexcomCgm"] = ( + combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6") + ) + + # save distribution data + combined_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv.gz" + )) + + # %% get cgm stats + # create a contiguous 5 minute time series of ALL cgm data + first_day = combined_cgm_series["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = combined_cgm_series["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=True + ).reset_index(drop=True) + + # merge with combined_cgm_series data + all_cgm = pd.merge( + contiguous_data, + combined_cgm_series[[ + 'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm', + 'age', 'ylw', 'isLoopDay', 'mg/dL', + ]], + on="roundedUtcTime", + how="left" + ) + + # get cgm stats + # get a binary (T/F) of whether we have a cgm value + all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull() + + # fill isLoopDay nan with False + all_cgm["isLoopDay"].fillna(False, inplace=True) + + # has loop and cgm + all_cgm["hasLoopAndCgm"] = ( + (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + all_cgm["hasCgmWithoutLoop"] = ( + (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + # work with all of the non-null data, even 39 = LOW and 401 = HIGH + ts39_401 = all_cgm["mg/dL"].copy() + + # some stats should NOT include 39 or 401 + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + ts40_400 = all_cgm["mg/dL.40to400"].copy() + + + # for all the less than (<) criteria + for cgm_threshold in [40, 54, 70]: + all_cgm["cgm < " + str(cgm_threshold)] = ( + ts39_401.lt(cgm_threshold) + ) + # get episodes below these thresholds + for min_duration in [5, 15]: + episode_ts = get_episodes( + all_cgm[[ + "roundedUtcTime", + "hasCgm", + "cgm < " + str(cgm_threshold) + ]].copy(), + episode_criterion="cgm < " + str(cgm_threshold), + min_duration=min_duration + ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + + # for all the greter than or equal to (>=) criteria + all_cgm["cgm >= " + str(cgm_threshold)] = ( + ts39_401.ge(cgm_threshold) + ) + + # for all the the less than or equal to (<=) criteria + for cgm_threshold in [140, 180, 250, 300, 400]: + all_cgm["cgm <= " + str(cgm_threshold)] = ( + ts39_401.le(cgm_threshold) + ) + # for all the the greter than (>) criteria + all_cgm["cgm > " + str(cgm_threshold)] = ( + ts39_401.gt(cgm_threshold) + ) + + # get all of the cgm ranges + # (cgm >= 40) & (cgm < 54) + all_cgm["40 <= cgm < 54"] = ( + (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"]) + ) + + # (cgm >= 54) & (cgm < 70) + all_cgm["54 <= cgm < 70"] = ( + (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"]) + ) + + # (cgm >= 70) & (cgm <= 140) + all_cgm["70 <= cgm <= 140"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"]) + ) + + # (cgm >= 70) & (cgm <= 180) + all_cgm["70 <= cgm <= 180"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"]) + ) + + # (cgm > 180) & (cgm <= 250) + all_cgm["180 < cgm <= 250"] = ( + (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"]) + ) + + # (cgm > 250) & (cgm <= 400) + all_cgm["250 < cgm <= 400"] = ( + (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"]) + ) + + # derfine the windows to calculate the stats over + window_names = ["hour", "day", "week", "month", "quarter", "year"] + window_lengths = [12, 288, 288*7, 288*7*4, 288*90, 288*365] + + for w_name, w_len in zip(window_names, window_lengths): + # require lenth of window for percent calculations + w_min = w_len + + # get the start and end times for each window + all_cgm[w_name + ".startTime"] = ( + all_cgm["roundedUtcTime"].shift(w_len - 1) + ) + all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"] + + # add majority age for the time period + all_cgm[w_name + ".age"] = np.round( + 
all_cgm["age"].rolling( + min_periods=1, + window=w_len + ).mean() + ) + + # add majority ylw for the time period + all_cgm[w_name + ".ylw"] = np.round( + all_cgm["ylw"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get percent time cgm used + all_cgm[w_name + ".cgmPercent"] = ( + all_cgm["hasCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".missingCgmPercent"] = ( + 1 - all_cgm[w_name + ".cgmPercent"] + ) + + # create (T/F) 70 and 80 percent available thresholds + # which will be useful for processing later + all_cgm[w_name + ".ge70Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.7 + ) + + all_cgm[w_name + ".ge80Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.8 + ) + + # get percent time Loop was used NOTE: this is + # approximate because we use > 24 temp basals per day + # ALSO: this is percent time Loop was used while cgm in use + all_cgm[w_name + ".loopingAndCgmPercent"] = ( + all_cgm["hasLoopAndCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent of time cgm without loop + all_cgm[w_name + ".cgmWithoutLoopPercent"] = ( + all_cgm["hasCgmWithoutLoop"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get episode stats + # TODO: add in hyper events + # get episodes below these thresholds + for cgm_threshold in [40, 54, 70]: + # get number of episodes per time window + for min_duration in [5, 15]: + "cgm < " + str(cgm_threshold) + episode_name = ( + "episode.cgm < " + str(cgm_threshold) + + ".durationThreshold=" + str(min_duration) + ) + all_cgm[w_name + ".count." + episode_name] = ( + all_cgm[episode_name + ".episodeStart"].rolling( + min_periods=1, + window=w_len + ).sum() + ) + + # get avg. duration of each episode per time window + all_cgm[w_name + ".avgDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).sum() / all_cgm[w_name + ".count." + episode_name] + ) + + # get min duration of each episode per time window + all_cgm[w_name + ".minDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).min() + ) + + # get median duration of each episode per time window + all_cgm[w_name + ".medianDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get max duration of each episode per time window + all_cgm[w_name + ".maxDuration." 
+ episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).max() + ) + + # get percent time in different ranges + # % Time < 54 + all_cgm[w_name + ".lt54Percent"] = ( + all_cgm["cgm < 54"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 54-70 (cgm >= 54) & (cgm < 70) + all_cgm[w_name + ".bt54_70Percent"] = ( + all_cgm["54 <= cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 180) + all_cgm[w_name + ".bt70_180Percent"] = ( + all_cgm["70 <= cgm <= 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 180-250 (cgm > 180) & (cgm <= 250) + all_cgm[w_name + ".bt180_250Percent"] = ( + all_cgm["180 < cgm <= 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time > 250 + all_cgm[w_name + ".gt250Percent"] = ( + all_cgm["cgm > 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # check that all of the percentages add of to 1 or 100% + all_cgm[w_name + ".percentCheck"] = ( + all_cgm[w_name + ".missingCgmPercent"] + + all_cgm[w_name + ".lt54Percent"] + + all_cgm[w_name + ".bt54_70Percent"] + + all_cgm[w_name + ".bt70_180Percent"] + + all_cgm[w_name + ".bt180_250Percent"] + + all_cgm[w_name + ".gt250Percent"] + ) + + # here are some other less common percent time in ranges + # % Time < 70 + all_cgm[w_name + ".lt70Percent"] = ( + all_cgm["cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 140) + all_cgm[w_name + ".tir70to140Percent"] = ( + all_cgm["70 <= cgm <= 140"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent time above a threshold + # % Time > 180 + all_cgm[w_name + ".gt180Percent"] = ( + all_cgm["cgm > 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + # create a rolling object + + # NOTE: these calculations only require 3 points to make + roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + roll40_400 = ts40_400.rolling(min_periods=3, window=w_len) + + # min + all_cgm[w_name + ".min"] = roll39_401.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10) + all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25) + all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75) + all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll39_401.max() + + # median + all_cgm[w_name + ".median"] = roll39_401.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + + # recalcuate percent of measurements available + all_cgm[w_name + ".40to400availablePercent"] = ( + roll40_400.count() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".40to400missingPercent"] = ( + 1 - all_cgm[w_name + ".40to400availablePercent"] + ) + + all_cgm[w_name + ".40to400ge70Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.7 + ) + + all_cgm[w_name + ".40to400ge80Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.8 + ) + + # mean + all_cgm[w_name + ".mean"] = roll40_400.mean() + + # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL] 
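+            # (worked example added for clarity: a rolling mean glucose of
+            # 150 mg/dL gives 3.31 + (0.02392 * 150) = 6.898, i.e., a GMI
+            # of roughly 6.9%)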
+ all_cgm[w_name + ".gmi"] = ( + 3.31 + (0.02392 * all_cgm[w_name + ".mean"]) + ) + + # standard deviation (std) + all_cgm[w_name + ".std"] = roll40_400.std() + + # coefficient of variation (cov) = std / mean + all_cgm[w_name + ".cov"] = ( + all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] + ) + + # %% save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) + # write the most recent example of the 90 day stats + # to the metadata + quarter_ge80Available_idx = ( + all_cgm[all_cgm["quarter.ge80Available"]] + ).index.max() + + if pd.notnull(quarter_ge80Available_idx): + # get the most recent quarter + most_recent = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + else: + most_recent = all_cgm.loc[ + [all_cgm.index.max()], + all_cgm.columns + ] + + metadata = pd.merge( + metadata, + most_recent, + on="hashid", + how="left" + ) + + print(metadata.T) + + else: + metadata["cgmData"] = False + print(userid, " has no cgm data") + + # save metadata + metadata.to_csv(os.path.join( + output_metadata, + "PHI-" + userid + "-cgm-metadata.csv.gz" + )) + + print("finished with", userid, "\n") + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get distribution and stats for donor json data" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-i", + "--input-json-data-path", + dest="json_data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the path where the json data is located" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid and filename" + ) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + args = parser.parse_args() + + # the main function + get_distribution_and_stats( + json_data_path=args.json_data_path, + userid=args.userid, + date_stamp=args.date_stamp, + save_data_path=args.data_path, + ) From 4b44457fe251605e87175f99ed3393c8c6368938 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 17:16:50 -0500 Subject: [PATCH 38/46] skip already processed and use 1/2 of processors --- .../batch_get_cgm_distributions_and_stats.py | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index 2830fe03..502a4b47 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -61,26 +61,40 @@ def run_process(json_data_path): userid = json_data_path[-15:-5] - p = sub.Popen( - [ - "python", "get_cgm_distributions_and_stats.py", - "-i", json_data_path, - "-u", userid, - "-d", args.date_stamp, - "-o", args.data_path - ], - stdout=sub.PIPE, - stderr=sub.PIPE + # check to see if the file was already processed + phi_date_stamp = "PHI-" + args.date_stamp + + metadata_path = 
os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" ) - output, errors = p.communicate() - output = output.decode("utf-8") - errors = errors.decode("utf-8") + all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) + if userid not in str(all_metadata_files): + + p = sub.Popen( + [ + "python", "get_cgm_distributions_and_stats.py", + "-i", json_data_path, + "-u", userid, + "-d", args.date_stamp, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") - if errors == '': - print(output) + if errors == '': + print(output) + else: + print(errors) else: - print(errors) + print(userid, "was already processed") return @@ -89,13 +103,16 @@ def run_process(json_data_path): all_files = glob.glob(args.json_data_path, recursive=True) # this is a good test to make sure run process is working before running +#import pdb +#args.date_stamp = "2019-07-17" #run_process(all_files[0]) #pdb.set_trace() + # use multiple cores to process startTime = time.time() print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) -pool = Pool(os.cpu_count()) +pool = Pool(int(os.cpu_count()/2)) pool.map(run_process, all_files) pool.close() endTime = time.time() From 57dc7fd622ba759206943d1d10e834a52e52a79f Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 07:24:53 -0500 Subject: [PATCH 39/46] get results script --- ...e_cgm_distribution_and_metadata_results.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py new file mode 100644 index 00000000..e548115b --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import os +import glob +import argparse + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default="2019-07-17", + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +all_metadata = pd.DataFrame() +for f in 
all_metadata_files: + temp_meta = pd.read_csv(f) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") +) +print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA +print("combining all distribution data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +distribution_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f, index_col=[0]) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +distribution_metadata.to_csv( + os.path.join( + donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" + ) +) +print("saving all-dataset-info-metadata...code complete") From 60e6c228d81ae385e43562a405de61b66ef35f87 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 08:32:30 -0500 Subject: [PATCH 40/46] pull files into pandas with low_memory flag to make sure that the column datatypes are sensed correctly --- .../combine_cgm_distribution_and_metadata_results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py index e548115b..1ed4ad00 100644 --- a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -55,7 +55,7 @@ all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) all_metadata = pd.DataFrame() for f in all_metadata_files: - temp_meta = pd.read_csv(f) + temp_meta = pd.read_csv(f, low_memory=False) all_metadata = pd.concat( [all_metadata, temp_meta], ignore_index=True, @@ -80,7 +80,7 @@ all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) distribution_metadata = pd.DataFrame() for f in all_metadata_files: - temp_meta = pd.read_csv(f, index_col=[0]) + temp_meta = pd.read_csv(f, index_col=[0], low_memory=False) distribution_metadata = pd.concat( [distribution_metadata, temp_meta], ignore_index=True, From a984e128fe2f52dc3b9964058fdb4c1328175716 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 09:51:36 -0500 Subject: [PATCH 41/46] save results in chunks --- ...e_cgm_distribution_and_metadata_results.py | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py index 1ed4ad00..a14fd57d 100644 --- a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -8,6 +8,7 @@ # %% REQUIRED LIBRARIES import pandas as pd +import numpy as np import os import glob import argparse @@ -38,6 +39,15 @@ help="the output path where the data is stored" ) + +parser.add_argument( + "-c", + "--chunk-size", + dest="chunk_size", + default=50, + help="the output path where the data is stored" +) + args = parser.parse_args() @@ -63,9 +73,13 
@@ ) all_metadata.to_csv( - os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") + os.path.join( + donor_folder, + phi_date_stamp + + "-cgm-metadata-0-{}.csv.gz".format(str(len(all_metadata_files))) + ) ) -print("saving metadata...code complete") +print("finished saving metadata...starting distribution data...") # %% COMBINE AND SAVE ALL DISTRIBUTION DATA @@ -78,18 +92,30 @@ ) all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -distribution_metadata = pd.DataFrame() -for f in all_metadata_files: - temp_meta = pd.read_csv(f, index_col=[0], low_memory=False) - distribution_metadata = pd.concat( - [distribution_metadata, temp_meta], - ignore_index=True, - sort=False - ) - -distribution_metadata.to_csv( - os.path.join( - donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" +chunks = np.arange(0, len(all_metadata_files), args.chunk_size) +chunks = np.append(chunks, len(all_metadata_files)) +for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): + print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata = pd.DataFrame() + for c_idx in np.arange(chunk_start, chunk_end): + temp_meta = pd.read_csv( + all_metadata_files[c_idx], + index_col=[0], + low_memory=False + ) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + # save chunk + print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format( + str(chunk_start), + str(chunk_end)) + ) ) -) -print("saving all-dataset-info-metadata...code complete") +print("finished saving all-dataset-distribution-data...code complete") From 858372a9e5029aa352e9d3ac0d74fe2876df0a29 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 11:20:45 -0500 Subject: [PATCH 42/46] move files and modify print statements --- .../batch_get_cgm_distributions_and_stats.py | 1 - ...e_cgm_distribution_and_metadata_results.py | 6 +- .../combine_cgm_distribution_results.py | 92 ++ .../get_cgm_distributions_v3.py | 0 .../get_cgm_distributions_and_stats.py | 1 + .../get_stats/get_cgm_stats.py | 1361 ----------------- 6 files changed, 96 insertions(+), 1365 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py rename projects/bigdata-processing-pipeline/get_stats/{ => development-versions}/get_cgm_distributions_v3.py (100%) delete mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index 502a4b47..61894b9c 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -108,7 +108,6 @@ def run_process(json_data_path): #run_process(all_files[0]) #pdb.set_trace() - # use multiple cores to process startTime = time.time() print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py index a14fd57d..b8bac502 100644 --- 
a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -63,6 +63,7 @@ ) all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} metaata files".format(len(all_metadata_files))) all_metadata = pd.DataFrame() for f in all_metadata_files: temp_meta = pd.read_csv(f, low_memory=False) @@ -83,8 +84,6 @@ # %% COMBINE AND SAVE ALL DISTRIBUTION DATA -print("combining all distribution data") - metadata_path = os.path.join( args.data_path, phi_date_stamp + "-donor-data", @@ -92,7 +91,8 @@ ) all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -chunks = np.arange(0, len(all_metadata_files), args.chunk_size) +print("combining {} distribution data files".format(len(all_metadata_files))) +chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size)) chunks = np.append(chunks, len(all_metadata_files)) for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py new file mode 100644 index 00000000..12abb350 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import numpy as np +import os +import glob +import argparse + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default="2019-07-17", + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +parser.add_argument( + "-c", + "--chunk-size", + dest="chunk_size", + default=50, + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA + +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} distribution data files".format(len(all_metadata_files))) +chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size)) +chunks = np.append(chunks, len(all_metadata_files)) +for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): + print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata = pd.DataFrame() + for c_idx in np.arange(chunk_start, chunk_end): + temp_meta = pd.read_csv( + all_metadata_files[c_idx], + index_col=[0], + low_memory=False + ) + distribution_metadata = 
pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + # save chunk + print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format( + str(chunk_start), + str(chunk_end)) + ) + ) +print("finished saving all-dataset-distribution-data...code complete") diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py similarity index 100% rename from projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py rename to projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py index af0d0d50..8a6cf7d5 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -1901,6 +1901,7 @@ def get_distribution_and_stats( ascending=[False, True, False], inplace=True ) + combined_cgm_series.reset_index(drop=True, inplace=True) # add in check to see if there are duplicates between cgm devices diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py deleted file mode 100644 index 172f4784..00000000 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ /dev/null @@ -1,1361 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -calculate cgm statsistics for a single tidepool (donor) dataset -''' - - -# %% REQUIRED LIBRARIES -import os -import sys -import hashlib -import pytz -import numpy as np -import pandas as pd -import datetime as dt - -# TODO: figure out how to get rid of these path dependcies -get_donor_data_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..") -) -if get_donor_data_path not in sys.path: - sys.path.insert(0, get_donor_data_path) -import environmentalVariables -from get_donor_data.get_single_donor_metadata import get_shared_metadata -from get_donor_data.get_single_tidepool_dataset import get_data - - -# %% CONSTANTS -MGDL_PER_MMOLL = 18.01559 - - -# %% FUNCTIONS -''' -the functions that are called in this script, -which includes notes of where the functions came from, -and whether they were refactored -''' - - -def hash_userid(userid, salt): - ''' - taken from anonymize-and-export.py - refactored name(s) to meet style guide - ''' - usr_string = userid + salt - hash_user = hashlib.sha256(usr_string.encode()) - hashid = hash_user.hexdigest() - - return hashid - - -def get_type(val): - return type(val).__name__ - - -def remove_negative_durations(df): - ''' - taken from https://github.com/tidepool-org/data-analytics/blob/ - etn/get-settings-and-events/projects/get-donors-pump-settings/ - get-users-settings-and-events.py - - refactored name(s) to meet style guide - refactored pandas field call to df["field"] instead of df.field - refactored because physical activity includes embedded json, whereas - the other fields in the data model require a integer - ''' - if "duration" in list(df): - type_ = df["duration"].apply(get_type) - valid_index = ((type_ == "int") & (df["duration"].notnull())) - n_negative_durations = 
sum(df.loc[valid_index, "duration"] < 0) - if n_negative_durations > 0: - df = df[~(df.loc[valid_index, "duration"] < 0)] - else: - n_negative_durations = np.nan - - return df, n_negative_durations - - -def expand_embedded_dict(ts, key_): - '''Expanded a single field that has embedded json - - Args: - ts: a pandas time series of the field that has embedded json - key_: the key that you want to expand - - Raise: - TypeError: if you don't pass in a pandas time series - - Returns: - key_ts: a new time series of the key of interest - - NOTE: - this is new function - TODO: - could be refactored to allow multiple keys or all keys to be returned - could be refactored for speed as the current process - ''' - - if "Series" not in type(ts).__name__: - raise TypeError('Expecting a pandas time series object') - key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) - notnull_idx = ts.notnull() - # TODO: maybe sped up by only getting the one field of interest? - # though, the current method is fairly quick and compact - temp_df = pd.DataFrame(ts[notnull_idx].tolist()) - if key_ in list(temp_df): - key_ts[notnull_idx] = temp_df[key_].values - - return key_ts - - -def get_embedded_field(ts, embedded_field): - '''get a field that is nested in more than 1 embedded dictionary (json) - - Args: - ts: a pandas time series of the field that has embedded json - embedded_field (str): the location of the field that is deeply nested - (e.g., "origin.payload.device.model") - - Raise: - ValueError: if you don't pass in a pandas time series - - Returns: - new_ts: a new time series of the key of interest - - NOTE: - this is new function - the "." notation is used to reference nested json - - ''' - field_list = embedded_field.split(".") - if len(field_list) < 2: - raise ValueError('Expecting at least 1 embedded field') - - new_ts = expand_embedded_dict(ts, field_list[1]) - for i in range(2, len(field_list)): - new_ts = expand_embedded_dict(new_ts, field_list[i]) - - return new_ts - - -def tslim_calibration_fix(df): - ''' - taken from https://github.com/tidepool-org/data-analytics/blob/ - etn/get-settings-and-events/projects/get-donors-pump-settings/ - get-users-settings-and-events.py - - refactored name(s) to meet style guide - refactored pandas field call to df["field"] instead of df.field - refactored to only expand one field - ''' - - # expand payload field one level - df["payload.calibration_reading"] = ( - expand_embedded_dict(df["payload"], "calibration_reading") - ) - - if df["payload.calibration_reading"].notnull().sum() > 0: - - search_for = ['tan'] - tandem_data_index = ( - (df["deviceId"].str.contains('|'.join(search_for))) - & (df["type"] == "deviceEvent") - ) - - cal_index = df["payload.calibration_reading"].notnull() - valid_index = tandem_data_index & cal_index - - n_cal_readings = sum(valid_index) - - if n_cal_readings > 0: - # if reading is > 30 then it is in the wrong units - if df["payload.calibration_reading"].min() > 30: - df.loc[cal_index, "value"] = ( - df.loc[valid_index, "payload.calibration_reading"] - / MGDL_PER_MMOLL - ) - else: - df.loc[cal_index, "value"] = ( - df.loc[valid_index, "payload.calibration_reading"] - ) - else: - n_cal_readings = 0 - return df, n_cal_readings - - -def get_healthkit_timezone(df): - ''' - TODO: refactor to account for more efficient way to get embedded json - ''' - df["payload.HKTimeZone"] = ( - expand_embedded_dict(df["payload"], "HKTimeZone") - ) - if "timezone" not in list(df): - if "payload.HKTimeZone" in list(df): - hk_tz_idx = 
df["payload.HKTimeZone"].notnull() - df.loc[hk_tz_idx, "deviceType"] = "healthkit" - df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) - - else: - df["timezone"] = np.nan - df["deviceType"] = np.nan - else: - if "payload.HKTimeZone" in list(df): - hk_tz_idx = df["payload.HKTimeZone"].notnull() - df.loc[hk_tz_idx, "timezone"] = ( - df.loc[hk_tz_idx, "payload.HKTimeZone"] - ) - df.loc[hk_tz_idx, "deviceType"] = "healthkit" - else: - df["timezone"] = np.nan - df["deviceType"] = np.nan - - return df[["timezone", "deviceType"]] - - -def get_and_fill_timezone(df): - ''' - this is new to deal with healthkit data - requires that a data frame that contains payload and HKTimeZone is passed - ''' - df = get_healthkit_timezone(df) - - df["timezone"].fillna(method='ffill', inplace=True) - df["timezone"].fillna(method='bfill', inplace=True) - - return df["timezone"] - - -def make_tz_unaware(date_time): - return date_time.replace(tzinfo=None) - - -def to_utc_datetime(df): - ''' - this is new to deal with perfomance issue with the previous method - of converting to string to datetime with pd.to_datetime() - ''' - utc_time_tz_aware = pd.to_datetime( - df["time"], - format="%Y-%m-%dT%H:%M:%S", - utc=True - ) - utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) - - return utc_tz_unaware - - -# apply the large timezone offset correction (AKA Darin's fix) -def timezone_offset_bug_fix(df): - ''' - this is taken from estimate-local-time.py - TODO: add in unit testing where there is no TZP that is > 840 or < -720 - ''' - - if "timezoneOffset" in list(df): - - while ((df.timezoneOffset > 840).sum() > 0): - df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( - df.loc[df.timezoneOffset > 840, ["conversionOffset"]] - - (1440 * 60 * 1000) - ) - - df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( - df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 - ) - - while ((df.timezoneOffset < -720).sum() > 0): - df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( - df.loc[df.timezoneOffset < -720, ["conversionOffset"]] - + (1440 * 60 * 1000) - ) - - df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( - df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 - ) - - return df - - -def get_local_time(df): - - tzo = df[['utcTime', 'inferredTimezone']].apply( - lambda x: get_timezone_offset(*x), axis=1 - ) - local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") - - return local_time - - -def round_time( - df, - time_interval_minutes=5, - start_with_first_record=True, - return_calculation_columns=False -): - ''' - A general purpose round time function that rounds the "time" - field to nearest minutes - INPUTS: - * a dataframe (df) or time series that contains only one time field - that you want to round - * time_interval_minutes (defaults to 5 minutes given that most cgms - output every 5 minutes) - * start_with_first_record starts the rounding with the first record - if True, and the last record if False (defaults to True) - * return_calculation_columns specifies whether the extra columns - used to make calculations are returned - refactored name(s) to meet style guide - ''' - # if a time series is passed in, convert to dataframe - if "Series" in get_type(df): - df = pd.DataFrame(df) - columns_ = list(df) - if len(columns_) > 1: - sys.exit( - "Error: df should only have one time column" - ) - else: - df.rename(columns={columns_[0]: "t"}, inplace=True) - - df.sort_values( - by="t", - ascending=start_with_first_record, - inplace=True - ) - - df.reset_index(drop=False, 
inplace=True) - df.rename(columns={"index": "originalIndex"}, inplace=True) - - # calculate the time between consecutive records - df["t_shift"] = df["t"].shift(1) - df["timeBetweenRecords"] = round( - (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) - + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) - ) * time_interval_minutes - - # separate the data into chunks if timeBetweenRecords is greater than - # 2 times the minutes so the rounding process - # starts over - big_gaps = list( - df.query("abs(timeBetweenRecords) > " - + str(time_interval_minutes * 2)).index - ) - big_gaps.insert(0, 0) - big_gaps.append(len(df)) - - for gap_index in range(0, len(big_gaps) - 1): - chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] - first_chunk = df["t"][big_gaps[gap_index]] - - # calculate the time difference between - # each time record and the first record - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "minutesFromFirstRecord" - ] = ( - (chunk - first_chunk).dt.days*(86400/60) - + (chunk - first_chunk).dt.seconds/60 - ) - - # then round to the nearest X Minutes - # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up. - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "roundedMinutesFromFirstRecord" - ] = round( - (df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "minutesFromFirstRecord" - ] / time_interval_minutes) + 0.000001 - ) * (time_interval_minutes) - - rounded_first_record = ( - first_chunk + pd.Timedelta("1microseconds") - ).round(str(time_interval_minutes) + "min") - - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "roundedTime" - ] = rounded_first_record + pd.to_timedelta( - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "roundedMinutesFromFirstRecord" - ], unit="m" - ) - - if return_calculation_columns is False: - df.drop( - columns=[ - "timeBetweenRecords", - "minutesFromFirstRecord", - "roundedMinutesFromFirstRecord" - ], inplace=True - ) - # sort back to the original index - df.sort_values(by="originalIndex", inplace=True) - - return df["roundedTime"].values - - -def add_upload_time(df): - ''' - this is taken from a colab notebook that is not in our github - given that it has been refactored to account for bug where there are - no upload records - NOTE: this is a new fix introduced with healthkit data...we now have - data that does not have an upload record - - ''' - - if "upload" in df.type.unique(): - upload_times = pd.DataFrame( - df[df.type == "upload"].groupby("uploadId")["utcTime"].max() - ) - else: - upload_times = pd.DataFrame(columns=["utcTime"]) - - unique_uploadIds = set(df["uploadId"].unique()) - unique_uploadRecords = set( - df.loc[df["type"] == "upload", "uploadId"].unique() - ) - uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords - - for upId in uploadIds_missing_uploadRecords: - last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() - upload_times.loc[upId, "utcTime"] = last_upload_time - - upload_times.reset_index(inplace=True) - upload_times.rename( - columns={"utcTime": "uploadTime", - "index": "uploadId"}, - inplace=True - ) - - df = pd.merge(df, upload_times, how='left', on='uploadId') - - return df["uploadTime"].values - - -def remove_invalid_cgm_values(df): - - nBefore = len(df) - # remove values < 38 and > 402 mg/dL - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] < 38))].index) - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] > 402))].index) - nRemoved = nBefore - len(df) - - return df, nRemoved - - -def 
removeDuplicates(df, criteriaDF): - nBefore = len(df) - df = df.loc[~(df[criteriaDF].duplicated())] - df = df.reset_index(drop=True) - nDuplicatesRemoved = nBefore - len(df) - - return df, nDuplicatesRemoved - - -def removeCgmDuplicates(df, timeCriterion): - if timeCriterion in df: - df.sort_values(by=[timeCriterion, "uploadTime"], - ascending=[False, False], - inplace=True) - dfIsNull = df[df[timeCriterion].isnull()] - dfNotNull = df[df[timeCriterion].notnull()] - dfNotNull, nDuplicatesRemoved = ( - removeDuplicates(dfNotNull, [timeCriterion, "value"]) - ) - df = pd.concat([dfIsNull, dfNotNull]) - df.sort_values(by=[timeCriterion, "uploadTime"], - ascending=[False, False], - inplace=True) - else: - nDuplicatesRemoved = 0 - - return df, nDuplicatesRemoved - - -# get rid of spike data -def remove_spike_data(df): - if "origin" in list(df): - nBefore = len(df) - spike_locations = [ - "origin.payload.device.name", - "origin.payload.device.manufacturer", - "origin.payload.sourceRevision.source.name", - ] - for spike_loc in spike_locations: - - df[spike_loc] = get_embedded_field(df["origin"], spike_loc) - spike_idx = df.loc[ - df[spike_loc].notnull(), - spike_loc - ].astype(str).str.lower().str.contains("spike") - - df.drop((spike_idx == True).index, inplace=True) - nRemoved = nBefore - len(df) - - else: - nRemoved = np.nan - - return df, nRemoved - - -# %% ESTIMATE LOCAL TIME FUNCTIONS -def create_contiguous_day_series(df): - first_day = df["date"].min() - last_day = df["date"].max() - rng = pd.date_range(first_day, last_day).date - contiguousDaySeries = \ - pd.DataFrame(rng, columns=["date"]).sort_values( - "date", ascending=False).reset_index(drop=True) - - return contiguousDaySeries - - -def add_device_type(df): - col_headings = list(df) - if "deviceType" not in col_headings: - df["deviceType"] = np.nan - if "deviceTags" in col_headings: - # first make sure deviceTag is in string format - df["deviceTags"] = df.deviceTags.astype(str) - # filter by type not null device tags - ud = df[df["deviceTags"].notnull()].copy() - # define a device type (e.g., pump, cgm, or healthkit) - ud.loc[ - ((ud["deviceTags"].str.contains("pump")) - & (ud["deviceType"].isnull())), - ["deviceType"] - ] = "pump" - - # define a device type (e.g., cgm) - ud.loc[ - ((ud["deviceTags"].str.contains("cgm")) - & (ud["deviceType"].isnull())), - ["deviceType"] - ] = "cgm" - - return ud["deviceType"] - else: - return np.nan - - -def get_timezone_offset(currentDate, currentTimezone): - - tz = pytz.timezone(currentTimezone) - # here we add 1 day to the current date to account for changes to/from DST - tzoNum = int( - tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") - ) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - - return tzo - - -def add_device_day_series(df, dfContDays, deviceTypeName): - if len(df) > 0: - dfDayGroups = df.groupby("date") - if "timezoneOffset" in df: - dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) - else: - dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) - - if "upload" in deviceTypeName: - if "timezone" in df: - if dfDayGroups.timezone.count().max() > 0: - dfDaySeries["timezone"] = ( - dfDayGroups.timezone.describe()["top"] - ) - # get the timezone offset for the timezone - for i in dfDaySeries.index: - if pd.notnull(dfDaySeries.loc[i, "timezone"]): - tzo = get_timezone_offset( - pd.to_datetime(i), - dfDaySeries.loc[i, "timezone"]) - 
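# A note on the tzo value computed here: get_timezone_offset() (defined just
# above) localizes the day *after* the given date, so the offset reflects that
# day's DST state, and it returns whole minutes, so half-hour zones survive.
# Two illustrative calls, with results that assume the standard IANA rules for
# these zones:
#     get_timezone_offset(pd.to_datetime("2019-07-01"), "America/New_York")
#     # -> -240  (UTC-4:00, DST in effect)
#     get_timezone_offset(pd.to_datetime("2019-07-01"), "Asia/Kolkata")
#     # -> 330   (UTC+5:30)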
dfDaySeries.loc[i, ["timezoneOffset"]] = tzo - if "timeProcessing" in dfDaySeries: - dfDaySeries["timeProcessing"] = \ - dfDayGroups.timeProcessing.describe()["top"] - else: - dfDaySeries["timeProcessing"] = np.nan - - dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \ - rename(columns={deviceTypeName + ".date": "date"}) - - dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), - on="date", how="left") - - else: - dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan - - return dfContDays - - -def impute_upload_records(df, contDays, deviceTypeName): - daySeries = \ - add_device_day_series(df, contDays, deviceTypeName) - - if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): - for i in daySeries.index[1:]: - if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): - daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( - daySeries.loc[i-1, deviceTypeName + ".timezone"] - ) - if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): - tz = daySeries.loc[i, deviceTypeName + ".timezone"] - tzo = get_timezone_offset( - pd.to_datetime(daySeries.loc[i, "date"]), - tz - ) - daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo - - if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): - daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ - daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] - - else: - daySeries[deviceTypeName + ".timezone"] = np.nan - daySeries[deviceTypeName + ".timeProcessing"] = np.nan - - return daySeries - - -def add_home_timezone(df, contDays): - - if "timezone" in df: - homeTimezone = df["timezone"].describe()["top"] - tzo = contDays.date.apply( - lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) - - contDays["home.imputed.timezoneOffset"] = tzo - contDays["home.imputed.timezone"] = homeTimezone - - else: - contDays["home.imputed.timezoneOffset"] = np.nan - contDays["home.imputed.timezone"] = np.nan - contDays["home.imputed.timeProcessing"] = np.nan - - return contDays - - -def estimateTzAndTzoWithUploadRecords(cDF): - - cDF["est.type"] = np.nan - cDF["est.gapSize"] = np.nan - cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] - cDF["est.annotations"] = np.nan - - if "upload.timezone" in cDF: - cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" - cDF["est.timezone"] = cDF["upload.timezone"] - cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] - else: - cDF["est.timezone"] = np.nan - cDF["est.timeProcessing"] = np.nan - - cDF.loc[((cDF["est.timezoneOffset"] != - cDF["home.imputed.timezoneOffset"]) & - (pd.notnull(cDF["est.timezoneOffset"]))), - "est.annotations"] = "travel" - - return cDF - - -def assignTzoFromImputedSeries(df, i, imputedSeries): - df.loc[i, ["est.type"]] = "DEVICE" - - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[i, imputedSeries + ".timezoneOffset"] - - df.loc[i, ["est.timezone"]] = \ - df.loc[i, imputedSeries + ".timezone"] - - df.loc[i, ["est.timeProcessing"]] = \ - df.loc[i, imputedSeries + ".timeProcessing"] - - return df - - -def compareDeviceTzoToImputedSeries(df, sIdx, device): - for i in sIdx: - # if the device tzo = imputed tzo, then chose the imputed tz and tzo - # note, dst is accounted for in the imputed tzo - for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", - "healthkit.upload.imputed", "home.imputed"]: - # if the estimate has not already been made - if pd.isnull(df.loc[i, "est.timezone"]): - - if df.loc[i, device + ".timezoneOffset"] == \ - df.loc[i, imputedSeries + ".timezoneOffset"]: - - assignTzoFromImputedSeries(df, i, 
imputedSeries) - - df = addAnnotation(df, i, - "tz-inferred-from-" + imputedSeries) - - # if the imputed series has a timezone estimate, then see if - # the current day is a dst change day - elif (pd.notnull(df.loc[i, imputedSeries + ".timezone"])): - imputedTimezone = df.loc[i, imputedSeries + ".timezone"] - if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): - - dstRange = getRangeOfTZOsForTimezone(imputedTimezone) - if ((df.loc[i, device + ".timezoneOffset"] in dstRange) - & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): - - assignTzoFromImputedSeries(df, i, imputedSeries) - - df = addAnnotation(df, i, "dst-change-day") - df = addAnnotation( - df, i, "tz-inferred-from-" + imputedSeries) - - return df - - -def estimateTzAndTzoWithDeviceRecords(cDF): - - # 2A. use the TZO of the pump or cgm device if it exists on a given day. In - # addition, compare the TZO to one of the imputed day series (i.e., the - # upload and home series to see if the TZ can be inferred) - for deviceType in ["pump", "cgm"]: - # find the indices of days where a TZO estimate has not been made AND - # where the device (e.g., pump or cgm) TZO has data - sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & - (cDF[deviceType + ".timezoneOffset"].notnull()))].index - # compare the device TZO to the imputed series to infer time zone - cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) - - # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be - # inferred from the previous day's TZO. If the device TZO is equal to the - # previous day's TZO, AND if the previous day has a TZ estimate, use the - # previous day's TZ estimate for the current day's TZ estimate - for deviceType in ["pump", "cgm"]: - sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & - (cDF[deviceType + ".timezoneOffset"].notnull()))].index - - cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) - - # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the - # pump and cgm tzo do not differ by more than 60 minutes. If they differ - # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we - # allow the estimates to be off by 60 minutes as there are a lot of cases - # where the devices are off because the user changes the time for DST, - # at different times - sIndices = cDF[((cDF["est.type"] == "DEVICE") & - (cDF["pump.timezoneOffset"].notnull()) & - (cDF["cgm.timezoneOffset"].notnull()) & - (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) - )].index - - tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - - cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 - - idx = tzoDiffGT60.index[tzoDiffGT60] - - cDF.loc[idx, ["est.type"]] = "UNCERTAIN" - for i in idx: - cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") - - return cDF - - -def imputeTzAndTzo(cDF): - - sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index - hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index - if len(hasTzoIndices) > 0: - if len(sIndices) > 0: - lastDay = max(sIndices) - - while ((sIndices.min() < max(hasTzoIndices)) & - (len(sIndices) > 0)): - - currentDay, prevDayWithDay, nextDayIdx = \ - getImputIndices(cDF, sIndices, hasTzoIndices) - - cDF = imputeByTimezone(cDF, currentDay, - prevDayWithDay, nextDayIdx) - - sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & - (~cDF["est.annotations"].str.contains( - "unable-to-impute-tzo").fillna(False)))].index - - hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index - - # try to impute to the last day (earliest day) in the dataset - # if the last record has a timezone that is the home record, then - # impute using the home timezone - if len(sIndices) > 0: - currentDay = min(sIndices) - prevDayWithDay = currentDay - 1 - gapSize = lastDay - currentDay - - for i in range(currentDay, lastDay + 1): - if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ - cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: - - cDF.loc[i, ["est.type"]] = "IMPUTE" - - cDF.loc[i, ["est.timezoneOffset"]] = \ - cDF.loc[i, "home.imputed.timezoneOffset"] - - cDF.loc[i, ["est.timezone"]] = \ - cDF.loc[i, "home.imputed.timezone"] - - cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) - cDF.loc[i, ["est.gapSize"]] = gapSize - - else: - cDF.loc[i, ["est.type"]] = "UNCERTAIN" - cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") - else: - cDF["est.type"] = "UNCERTAIN" - cDF["est.annotations"] = "unable-to-impute-tzo" - - return cDF - - -def getRangeOfTZOsForTimezone(tz): - minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), - getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] - - rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) - - return rangeOfTzo - - -def getListOfDSTChangeDays(cDF): - - # get a list of DST change days for the home time zone - dstChangeDays = \ - cDF[abs(cDF["home.imputed.timezoneOffset"] - - cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date - - return dstChangeDays - - -def correctEstimatesAroundDst(df, cDF): - - # get a list of DST change days for the home time zone - dstChangeDays = getListOfDSTChangeDays(cDF) - - # loop through the df within 2 days of a daylight savings time change - for d in dstChangeDays: - dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & - (df.date < (d + dt.timedelta(days=2)))].index - for dIdx in dstIndex: - if pd.notnull(df.loc[dIdx, "est.timezone"]): - tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) - tzRange = getRangeOfTZOsForTimezone(str(tz)) - minHoursToLocal = min(tzRange)/60 - tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + - dt.timedelta(hours=minHoursToLocal)).strftime("%z")) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - localTime = \ - df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") - df.loc[dIdx, ["est.localTime"]] = localTime - df.loc[dIdx, ["est.timezoneOffset"]] = tzo - return df - - -def applyLocalTimeEstimates(df, cDF): - df = pd.merge(df, cDF, how="left", on="date") - df["est.localTime"] = \ - df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") - - df = correctEstimatesAroundDst(df, cDF) - - return df["est.localTime"].values - - -def isDSTChangeDay(currentDate, currentTimezone): - tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), - currentTimezone) - tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + - dt.timedelta(days=-1), currentTimezone) - - return (tzoCurrentDay != tzoPreviousDay) - - -def tzoRangeWithComparisonTz(df, i, comparisonTz): - # if we have a previous timezone estimate, then calcuate the range of - # timezone offset values for that time zone - if pd.notnull(comparisonTz): - rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) - else: - comparisonTz = np.nan - rangeTzos = np.array([]) - - return rangeTzos - - -def tzAndTzoRangePreviousDay(df, i): - # if we have a previous timezone estimate, then calcuate the range of - # timezone offset values for that time zone - comparisonTz = df.loc[i-1, "est.timezone"] - - rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) - - return comparisonTz, rangeTzos - - -def assignTzoFromPreviousDay(df, i, previousDayTz): - - df.loc[i, ["est.type"]] = "DEVICE" - df.loc[i, ["est.timezone"]] = previousDayTz - df.loc[i, ["est.timezoneOffset"]] = \ - getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) - - df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] - df = addAnnotation(df, i, "tz-inferred-from-prev-day") - - return df - - -def assignTzoFromDeviceTzo(df, i, device): - - df.loc[i, ["est.type"]] = "DEVICE" - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[i, device + ".timezoneOffset"] - df.loc[i, ["est.timeProcessing"]] = \ - df.loc[i, device + ".upload.imputed.timeProcessing"] - - df = addAnnotation(df, i, "likely-travel") - df = addAnnotation(df, i, "tzo-from-" + device) - - return df - - -def compareDeviceTzoToPrevDayTzo(df, sIdx, device): - - for i in sIdx[sIdx > 0]: - - # first see if the previous record has a tzo - if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): - - previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) - timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - - df.loc[i-1, "est.timezoneOffset"]) - - # next see if the previous record has a tz - if (pd.notnull(df.loc[i-1, "est.timezone"])): - - if timeDiff == 0: - assignTzoFromPreviousDay(df, i, previousDayTz) - - # see if the previous day's tzo and device tzo are within the - # dst range (as that is a common problem with this data) - elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) - & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): - - # then see if it is DST change day - if isDSTChangeDay(df.loc[i, "date"], previousDayTz): - - df = addAnnotation(df, i, "dst-change-day") - assignTzoFromPreviousDay(df, i, previousDayTz) - - # if it is not DST change day, then mark this as uncertain - else: - # also, check to see if the difference between device. - # tzo and prev.tzo is less than the expected dst - # difference. 
There is a known issue where the BtUTC - # procedure puts clock drift into the device.tzo, - # and as a result the tzo can be off by 15, 30, - # or 45 minutes. - if (((df.loc[i, device + ".timezoneOffset"] == - min(dstRange)) | - (df.loc[i, device + ".timezoneOffset"] == - max(dstRange))) & - ((df.loc[i-1, "est.timezoneOffset"] == - min(dstRange)) | - (df.loc[i-1, "est.timezoneOffset"] == - max(dstRange)))): - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, - "likely-dst-error-OR-travel") - - else: - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, - "likely-15-min-dst-error") - - # next see if time difference between device.tzo and prev.tzo - # is off by 720 minutes, which is indicative of a common - # user AM/PM error - elif timeDiff == 720: - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-AM-PM-error") - - # if it doesn't fall into any of these cases, then the - # tzo difference is likely due to travel - else: - df = assignTzoFromDeviceTzo(df, i, device) - - elif timeDiff == 0: - df = assignTzoFromDeviceTzo(df, i, device) - - # if there is no previous record to compare with check for dst errors, - # and if there are no errors, it is likely a travel day - else: - - comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) - timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - - df.loc[i, "home.imputed.timezoneOffset"]) - - if ((df.loc[i, device + ".timezoneOffset"] in dstRange) - & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): - - # see if it is DST change day - if isDSTChangeDay(df.loc[i, "date"], comparisonTz): - - df = addAnnotation(df, i, "dst-change-day") - df.loc[i, ["est.type"]] = "DEVICE" - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[i, device + ".timezoneOffset"] - df.loc[i, ["est.timezone"]] = \ - df.loc[i, "home.imputed.timezone"] - df.loc[i, ["est.timeProcessing"]] = \ - df.loc[i, device + ".upload.imputed.timeProcessing"] - - # if it is not DST change day, then mark this as uncertain - else: - # also, check to see if the difference between device. - # tzo and prev.tzo is less than the expected dst - # difference. There is a known issue where the BtUTC - # procedure puts clock drift into the device.tzo, - # and as a result the tzo can be off by 15, 30, - # or 45 minutes. 
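# To make the check below concrete: for America/New_York the dst range
# computed above works out to [-300, -285, -270, -255, -240] minutes, so a
# device tzo of -240 paired with a home-imputed tzo of -300 (both range
# endpoints) is read as a plausible DST error or travel, while an in-between
# value such as -255 gets flagged as a likely 15/30/45-minute clock-drift
# error from the BtUTC procedure.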
- if (((df.loc[i, device + ".timezoneOffset"] == - min(dstRange)) | - (df.loc[i, device + ".timezoneOffset"] == - max(dstRange))) & - ((df.loc[i, "home.imputed.timezoneOffset"] == - min(dstRange)) | - (df.loc[i, "home.imputed.timezoneOffset"] == - max(dstRange)))): - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-dst-error-OR-travel") - - else: - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-15-min-dst-error") - - # next see if time difference between device.tzo and prev.tzo - # is off by 720 minutes, which is indicative of a common - # user AM/PM error - elif timeDiff == 720: - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-AM-PM-error") - - # if it doesn't fall into any of these cases, then the - # tzo difference is likely due to travel - - else: - df = assignTzoFromDeviceTzo(df, i, device) - - return df - - -def getImputIndices(df, sIdx, hIdx): - - lastDayIdx = len(df) - 1 - - currentDayIdx = sIdx.min() - tempList = pd.Series(hIdx) - currentDayIdx - prevDayIdx = currentDayIdx - 1 - nextDayIdx = \ - min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) - - return currentDayIdx, prevDayIdx, nextDayIdx - - -def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): - - gapSize = (nextDaywData - currentDay) - - if prevDaywData >= 0: - - if df.loc[prevDaywData, "est.timezone"] == \ - df.loc[nextDaywData, "est.timezone"]: - - tz = df.loc[prevDaywData, "est.timezone"] - - for i in range(currentDay, nextDaywData): - - df.loc[i, ["est.timezone"]] = tz - - df.loc[i, ["est.timezoneOffset"]] = \ - getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) - - df.loc[i, ["est.type"]] = "IMPUTE" - - df = addAnnotation(df, i, "gap=" + str(gapSize)) - df.loc[i, ["est.gapSize"]] = gapSize - - # TODO: this logic should be updated to handle the edge case - # where the day before and after the gap have differing TZ, but - # the same TZO. 
In that case the gap should be marked as UNCERTAIN - elif df.loc[prevDaywData, "est.timezoneOffset"] == \ - df.loc[nextDaywData, "est.timezoneOffset"]: - - for i in range(currentDay, nextDaywData): - - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[prevDaywData, "est.timezoneOffset"] - - df.loc[i, ["est.type"]] = "IMPUTE" - - df = addAnnotation(df, i, "gap=" + str(gapSize)) - df.loc[i, ["est.gapSize"]] = gapSize - - else: - for i in range(currentDay, nextDaywData): - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "unable-to-impute-tzo") - - else: - for i in range(currentDay, nextDaywData): - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "unable-to-impute-tzo") - - return df - - -def addAnnotation(df, idx, annotationMessage): - if pd.notnull(df.loc[idx, "est.annotations"]): - df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ - ", " + annotationMessage - else: - df.loc[idx, ["est.annotations"]] = annotationMessage - - return df - - -def getTimezoneOffset(currentDate, currentTimezone): - - tz = pytz.timezone(currentTimezone) - # here we add 1 day to the current date to account for changes to/from DST - tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - - return tzo - - -def estimate_local_time(df): - df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later - contiguous_days = create_contiguous_day_series(df) - - df["deviceType"] = add_device_type(df) - cDays = add_device_day_series(df, contiguous_days, "upload") - - # create day series for cgm df - if "timezoneOffset" not in list(df): - df["timezoneOffset"] = np.nan - - cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() - cDays = add_device_day_series(cgmdf, cDays, "cgm") - - # create day series for pump df - pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() - cDays = add_device_day_series(pumpdf, cDays, "pump") - - # interpolate between upload records of the same deviceType, and create a - # day series for interpolated pump, non-hk-cgm, and healthkit uploads - for deviceType in ["pump", "cgm", "healthkit"]: - tempUploaddf = df[df["deviceType"] == deviceType].copy() - cDays = impute_upload_records( - tempUploaddf, cDays, deviceType + ".upload.imputed" - ) - - # add a home timezone that also accounts for daylight savings time changes - cDays = add_home_timezone(df, cDays) - - # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO - cDays = estimateTzAndTzoWithUploadRecords(cDays) - - # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) - # estimates can be made from pump and cgm df that have a TZO - # NOTE: the healthkit and dexcom-api cgm df are excluded - cDays = estimateTzAndTzoWithDeviceRecords(cDays) - - # 3. impute, infer, or interpolate gaps in the estimated tzo and tz - cDays = imputeTzAndTzo(cDays) - - # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df - local_time = applyLocalTimeEstimates(df, cDays) - - return local_time, cDays - - -# %% GET DATA FROM API -''' -get metadata and data for a donor that has shared with bigdata -NOTE: functions assume you have an .env with bigdata account credentials -''' - -userid = "0d4524bc11" -donor_group = "bigdata" - -donor_metadata, _ = get_shared_metadata( - donor_group=donor_group, - userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid -) -data, _ = get_data( - donor_group=donor_group, - userid=userid, - weeks_of_data=52*10 -) - - -# %% CREATE META DATAFRAME (metadata) -''' -this is useful for keeping track of the type and amount of cleaning done -''' -metadata = pd.DataFrame(index=[userid]) - - -# %% HASH USER ID -hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) -data["userid"] = userid -data["hashid"] = hashid - - -# %% CLEAN DATA -data_fields = list(data) -# remove negative durations -if "duration" in data_fields: - data["duration"], n_negative_durations = ( - remove_negative_durations(data[["duration"]].copy()) - ) -else: - n_negative_durations = np.nan -metadata["nNegativeDurations"] = n_negative_durations - -# Tslim calibration bug fix -data, n_cal_readings = tslim_calibration_fix(data.copy()) -metadata["nTandemAndPayloadCalReadings"] = n_cal_readings - -# fix large timzoneOffset bug in utcbootstrapping -data = timezone_offset_bug_fix(data.copy()) - -# add healthkit timezome information -data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) - - -# %% TIME RELATED ITEMS -data["utcTime"] = to_utc_datetime(data[["time"]].copy()) - -# add upload time to the data, which is needed for: -# getting rid of duplicates and useful for getting local time -data["uploadTime"] = add_upload_time(data[ - ["type", "uploadId", "utcTime"] -].copy()) - -# estimate local time (refactor of estimate-local-time.py) -data["localTime"], local_time_metadata = estimate_local_time(data.copy()) - -# round all data to the nearest 5 minutes -data["roundedLocalTime"] = round_time( - data["localTime"].copy(), - time_interval_minutes=5, - start_with_first_record=True, - return_calculation_columns=False -) - - -# %% TIME CATEGORIES -# AGE, & YLW -bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) -dDate = pd.to_datetime(donor_metadata["diagnosisDate"].values[0][0:7]) -data["age"] = np.floor((data["roundedLocalTime"] - bDate).dt.days/365.25) -data["ylw"] = np.floor((data["roundedLocalTime"] - dDate).dt.days/365.25) - -# hour of the day -data["hour"] = data["roundedLocalTime"].dt.hour - -# add the day of the localTime that starts at 12am -data["day12AM"] = data["roundedLocalTime"].dt.date -# NOTE: for day of week Monday = 0 and Sunday = 6 -data["dayofweek12AM"] = data["roundedLocalTime"].dt.dayofweek -data["weekend12AM"] = data["dayofweek12AM"] > 4 - -# day that starts at 6am -data["6amTime"] = data["roundedLocalTime"] - pd.Timedelta(6, unit="hours") -data["day6AM"] = data["6amTime"].dt.date -data["dayofweek6AM"] = data["6amTime"].dt.dayofweek -data["weekend6AM"] = data["dayofweek6AM"] > 4 - - -# %% GROUP DATA BY TYPE -# first sort by upload time (used when removing dumplicates) -data.sort_values("uploadTime", ascending=False, inplace=True) -groups = data.groupby(by="type") - - -# %% CGM DATA -# filter by cgm -cgm = groups.get_group("cbg").dropna(axis=1, how="all") - -# calculate cgm in mg/dL -cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) - -# get rid of spike data -cgm, nSpike = remove_spike_data(cgm) 
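# At this step cgm["mg/dL"] holds glucose converted from mmol/L (for example,
# 5.5 mmol/L * 18.01559 rounds to 99 mg/dL), and nSpike counts the rows
# dropped by remove_spike_data (NaN when the dataset has no "origin" column).
# One caveat worth a second look: inside remove_spike_data,
# df.drop((spike_idx == True).index, inplace=True) appears to drop every row
# where the inspected origin.payload field is non-null, not only the rows
# whose device/source name contains "spike"; selecting spike_idx[spike_idx].index
# (or keeping rows with a boolean mask) is likely what was intended, e.g.:
#     keep = ~df[spike_loc].astype(str).str.lower().str.contains("spike", na=False)
#     df = df[keep]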
-metadata["nSpike"] = nSpike - -# get rid of cgm values too low/high (< 38 & > 402 mg/dL) -cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) -metadata["nInvalidCgmValues"] = nInvalidCgmValues - -# get rid of duplicates that have the same ["deviceTime", "value"] -cgm, n_cgm_dups_removed = (removeCgmDuplicates(cgm, "deviceTime")) -metadata["nCgmDuplicatesRemovedDeviceTime"] = n_cgm_dups_removed - -# get rid of duplicates that have the same ["time", "value"] -cgm, n_cgm_dups_removed = removeCgmDuplicates(cgm, "time") -metadata["nCgmDuplicatesRemovedUtcTime"] = n_cgm_dups_removed - -# get rid of duplicates that have the same "roundedTime" -cgm, n_cgm_dups_removed = removeDuplicates(cgm, "roundedLocalTime") -metadata["nCgmDuplicatesRemovedRoundedTime"] = n_cgm_dups_removed - - -# %% GET CGM STATS -# create a contiguous 5 minute time series -first_day = cgm["roundedLocalTime"].min() -last_day = cgm["roundedLocalTime"].max() -rng = pd.date_range(first_day, last_day, freq="5min") -contiguous_data = ( - pd.DataFrame(rng, columns=["roundedLocalTime"]).sort_values( - "roundedLocalTime", ascending=False - ).reset_index(drop=True) -) - -# merge with cgm data -cgm_series = pd.merge( - contiguous_data, - cgm, - on="roundedLocalTime", - how="left" -) - -#cgm_series["hourly.mean"] = cgm_series["mg/dL"].rolling(12).mean() From 4206d722c3dbf55803e189715c75052ea02c4aa2 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 14:55:48 -0500 Subject: [PATCH 43/46] remove combine files --- .../batch_get_cgm_distributions_and_stats.py | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index 61894b9c..c8cbb5dc 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -120,57 +120,3 @@ def run_process(json_data_path): ) total_duration = round((endTime - startTime) / 60, 1) print("total duration was %s minutes" % total_duration) - - -# %% COMBINE AND SAVE ALL DONOR METADATA -print("combining all metadata") -phi_date_stamp = "PHI-" + args.date_stamp -donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") - -metadata_path = os.path.join( - args.data_path, - phi_date_stamp + "-donor-data", - phi_date_stamp + "-cgm-metadata" -) - -all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -all_metadata = pd.DataFrame() -for f in all_metadata_files: - temp_meta = pd.read_csv(f) - all_metadata = pd.concat( - [all_metadata, temp_meta], - ignore_index=True, - sort=False - ) - -all_metadata.to_csv( - os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") -) -print("saving metadata...code complete") - - -# %% COMBINE AND SAVE ALL DISTRIBUTION DATA -print("combining all distribution data") - -metadata_path = os.path.join( - args.data_path, - phi_date_stamp + "-donor-data", - phi_date_stamp + "-cgm-distributions" -) - -all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -distribution_metadata = pd.DataFrame() -for f in all_metadata_files: - temp_meta = pd.read_csv(f, index_col=[0]) - distribution_metadata = pd.concat( - [distribution_metadata, temp_meta], - ignore_index=True, - sort=False - ) - -distribution_metadata.to_csv( - os.path.join( - donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" - ) -) 
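# The combine-and-save step being removed in this patch follows a pattern used
# elsewhere in the pipeline: glob the per-user "*.csv.gz" outputs, concatenate
# them, and write one gzipped summary (pandas infers gzip compression from the
# ".gz" suffix on both read and write). A minimal sketch of that pattern, with
# the folder names made up for illustration:
#     import glob, os
#     import pandas as pd
#     in_dir = "per-user-outputs"   # illustrative folder of per-user csv.gz files
#     files = glob.glob(os.path.join(in_dir, "*.csv.gz"))
#     combined = pd.concat(
#         (pd.read_csv(f) for f in files), ignore_index=True, sort=False
#     )
#     combined.to_csv(os.path.join(in_dir, "combined.csv.gz"))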
-print("saving all-dataset-info-metadata...code complete") From 3f1ea02b6cd15e2e05f25547942711bab3f7b5bb Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 15:13:58 -0500 Subject: [PATCH 44/46] use all processors --- .../get_stats/batch_get_cgm_distributions_and_stats.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index c8cbb5dc..3fe2fef9 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -102,16 +102,10 @@ def run_process(json_data_path): # %% GET A LIST OF DONOR JSON FILE LOCATIONS all_files = glob.glob(args.json_data_path, recursive=True) -# this is a good test to make sure run process is working before running -#import pdb -#args.date_stamp = "2019-07-17" -#run_process(all_files[0]) -#pdb.set_trace() - # use multiple cores to process startTime = time.time() print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) -pool = Pool(int(os.cpu_count()/2)) +pool = Pool(int(os.cpu_count())) pool.map(run_process, all_files) pool.close() endTime = time.time() From 8a7f86effaeb11409ea5fd4041e4a66e39b752ab Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Thu, 22 Aug 2019 09:24:56 -0500 Subject: [PATCH 45/46] save json data in folder that is compatible with old pipeline --- .../get_single_tidepool_dataset_json.py | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py index 5a17d6d8..34c0abe3 100644 --- a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py @@ -81,6 +81,7 @@ def get_data( auth=np.nan, email=np.nan, password=np.nan, + save_file="True", ): # login if pd.notnull(donor_group): @@ -127,8 +128,7 @@ def get_data( output_folder = os.path.join( save_data_path, - "dremio", - userid, + "PHI-" + userid, ) output_file_path = os.path.join( @@ -147,7 +147,7 @@ def get_data( download_ = False if download_: - make_folder_if_doesnt_exist(output_folder) + big_json_file = [] if weeks_of_data > 52: @@ -184,9 +184,13 @@ def get_data( # save data if len(big_json_file) > 1: - print("saving data for {}".format(userid)) - with open(output_file_path, 'w') as outfile: - json.dump(big_json_file, outfile) + if "T" in str(save_file).upper(): + make_folder_if_doesnt_exist(output_folder) + print("saving data for {}".format(userid)) + with open(output_file_path, 'w') as outfile: + json.dump(big_json_file, outfile) + else: + print("{} has data, but will not be saved".format(userid)) else: print("{} has no data".format(userid)) @@ -203,8 +207,9 @@ def get_data( auth[0] + ":" + str(api_response.status_code) ) else: - print("skipping bc {}'s data was downloaded (attempted)".format(userid) - + " within the last {} hours".format(overwrite_hours) + print( + "skipping bc {}'s data was downloaded (attempted)".format(userid) + + " within the last {} hours".format(overwrite_hours) ) return @@ -215,6 +220,7 @@ def get_data( # USER INPUTS (choices to be made in order to run the code) codeDescription = "get donor json file" 
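# For the argparse block that follows (as revised in this patch), a likely
# invocation using only the flags defined below, with a made-up output path,
# would be:
#     python get_single_tidepool_dataset_json.py -w 4 -s false -o /tmp/donor-json
# Two caveats: save_file is treated as true whenever the string contains a
# "t"/"T" (not only for the literal "true"), and -w arrives as a string because
# no type=int is given, so the weeks_of_data > 52 check inside get_data may
# need an explicit int() cast unless one is added upstream.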
parser = argparse.ArgumentParser(description=codeDescription) + current_date = dt.datetime.now().strftime("%Y-%m-%d") parser.add_argument( "-o", @@ -222,7 +228,11 @@ def get_data( dest="data_path", default=os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "data" + os.path.dirname(__file__), + "..", + "data", + "PHI-" + current_date + "-donor-data", + "PHI-" + current_date + "-jsonData", ) ), help="the output path where the data is stored" @@ -232,7 +242,7 @@ def get_data( "-w", "--weeks-of-data", dest="weeks_of_data", - default=52*10, # go back the last 10 years as default + default=2, # 52*10, # go back the last 10 years as default help="enter the number of weeks of data you want to download" ) @@ -284,6 +294,14 @@ def get_data( help="password of the master account" ) + parser.add_argument( + "-s", + "--save_file", + dest="save_file", + default="true", + help="specify whether to save the downloaded donor data" + ) + args = parser.parse_args() # the main function @@ -296,4 +314,5 @@ def get_data( auth=args.auth, email=args.email, password=args.password, + save_file=args.save_file, ) From 01d6dc15010501d07fef5c8994bcb414eb5e20f2 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Thu, 22 Aug 2019 10:44:57 -0500 Subject: [PATCH 46/46] refactor to download data if json data path is not provided also removing hardcoding to dremio folder --- .../get_single_tidepool_dataset_json.py | 17 ++++-- .../get_cgm_distributions_and_stats.py | 61 ++++++++++++------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py index 34c0abe3..d8496891 100644 --- a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py @@ -24,6 +24,8 @@ sys.path.insert(0, envPath) import environmentalVariables +# %% GLOBAL VARIABLES +current_date = dt.datetime.now().strftime("%Y-%m-%d") # %% FUNCTIONS def make_folder_if_doesnt_exist(folder_paths): @@ -72,7 +74,9 @@ def get_data( os.path.join( os.path.dirname(__file__), "..", - "data" + "data", + "PHI-" + current_date + "-donor-data", + "PHI-" + current_date + "-jsonData", ) ), overwrite_hours=24, @@ -81,7 +85,7 @@ def get_data( auth=np.nan, email=np.nan, password=np.nan, - save_file="True", + save_file="False", ): # login if pd.notnull(donor_group): @@ -212,7 +216,11 @@ def get_data( + " within the last {} hours".format(overwrite_hours) ) - return + if "T" in str(save_file).upper(): + return np.nan, userid + else: + df = pd.DataFrame(big_json_file) + return df, userid # %% MAIN @@ -220,7 +228,6 @@ def get_data( # USER INPUTS (choices to be made in order to run the code) codeDescription = "get donor json file" parser = argparse.ArgumentParser(description=codeDescription) - current_date = dt.datetime.now().strftime("%Y-%m-%d") parser.add_argument( "-o", @@ -305,7 +312,7 @@ def get_data( args = parser.parse_args() # the main function - get_data( + data, userid = get_data( save_data_path=args.data_path, weeks_of_data=args.weeks_of_data, overwrite_hours=args.overwrite_hours, diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py index 8a6cf7d5..4da725b1 100644 --- 
a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -22,7 +22,10 @@ ) if get_donor_data_path not in sys.path: sys.path.insert(0, get_donor_data_path) -from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist +from get_donor_data.get_single_tidepool_dataset_json import ( + make_folder_if_doesnt_exist, get_data +) +from get_donor_data.get_single_donor_metadata import get_shared_metadata # %% CONSTANTS MGDL_PER_MMOLL = 18.01559 @@ -35,7 +38,6 @@ and whether they were refactored ''' - def get_episodes( df, episode_criterion="cgm < 54", @@ -1568,22 +1570,37 @@ def get_distribution_and_stats( 'isOtherPerson', ] - # load in data - data = pd.read_json(json_data_path) + # load in data or pull in data + if pd.notnull(json_data_path): + data = pd.read_json(json_data_path) + + else: + data, userid = get_data( + save_file="false" + ) # load in donor metadata - all_donor_metadata = pd.read_csv( - os.path.join( - save_data_path, - phi_date + "-donor-data", - phi_date + "-donor-metadata.csv"), - low_memory=False + donor_meta_path = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-donor-metadata.csv" ) + if os.path.exists(donor_meta_path): - metadata = all_donor_metadata.loc[ - all_donor_metadata["userid"] == userid, - donor_metadata_columns - ] + all_donor_metadata = pd.read_csv( + donor_meta_path, + low_memory=False + ) + + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns + ] + else: + metadata, _ = get_shared_metadata( + donor_group="bigdata", + userid_of_shared_user=userid + ) print("starting", userid) @@ -1645,13 +1662,13 @@ def get_distribution_and_stats( # AGE, & YLW # TODO: make this a function - if metadata["birthday"].values[0] is not np.nan: + if pd.notnull(metadata["birthday"].values[0]): bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) else: data["age"] = np.nan - if metadata["diagnosisDate"].values[0] is not np.nan: + if pd.notnull(metadata["diagnosisDate"].values[0]): dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) else: @@ -2006,7 +2023,6 @@ def get_distribution_and_stats( ts40_400 = all_cgm["mg/dL.40to400"].copy() - # for all the less than (<) criteria for cgm_threshold in [40, 54, 70]: all_cgm["cgm < " + str(cgm_threshold)] = ( @@ -2390,12 +2406,11 @@ def get_distribution_and_stats( "-i", "--input-json-data-path", dest="json_data_path", - default=os.path.abspath( - os.path.join( - os.path.dirname(__file__), "..", "data" - ) - ), - help="the path where the json data is located" + default=np.nan, + help=( + "the path where the json data is located, defaults to none and" + + " will download your data using your Tidepool credentials" + ) ) parser.add_argument(