From 865f4ea2404e18b7ab7fc5692c79a187744f2615 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 15 Jul 2019 14:46:17 -0500 Subject: [PATCH 01/46] create folder and save functions --- .../get_single_tidepool_dataset.py | 91 ++++++++++++++----- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py index 290b5324..8d89c2c5 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py @@ -56,7 +56,7 @@ parser.add_argument( "-u", "--userid", - dest="userid_of_shared_user", + dest="userid", default=np.nan, help="userid of account shared with the donor group or master account" ) @@ -111,6 +111,55 @@ def make_folder_if_doesnt_exist(folder_paths): return +def create_output_folder( + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="not-specified", + phi=True +): + if phi: + date_stamp = "PHI-" + date_stamp + donor_folder = os.path.join(data_path, date_stamp + "-donor-data") + dataset_path = os.path.join( + donor_folder, + date_stamp + "-" + folder_name + ) + make_folder_if_doesnt_exist(dataset_path) + + return dataset_path + + +def save_df( + df, + userid=args.userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="not-specified", + phi=True +): + + output_folder = create_output_folder( + data_path=data_path, + date_stamp=date_stamp, + folder_name=folder_name, + phi=phi + ) + + # if the data contains phi, add prefix to the file + if phi: + phi_prefix = 'PHI-' + else: + phi_prefix = '' + output_path = os.path.join( + output_folder, + phi_prefix + userid + "-dataSummary.csv.gz" + ) + + df.to_csv(output_path) + + return output_path + + def get_data_api(userid, startDate, endDate, headers): startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" @@ -145,7 +194,7 @@ def get_data_api(userid, startDate, endDate, headers): def get_data( weeks_of_data=10*52, donor_group=np.nan, - userid_of_shared_user=np.nan, + userid=np.nan, auth=np.nan, email=np.nan, password=np.nan, @@ -180,8 +229,8 @@ def get_data( else: sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code)) - if pd.isnull(userid_of_shared_user): - userid_of_shared_user = userid_master + if pd.isnull(userid): + userid = userid_master print( "getting data for the master account since no shared " + "user account was given" @@ -204,7 +253,7 @@ def get_data( endDate.day + 1 ) year_df, endDate = get_data_api( - userid_of_shared_user, + userid, startDate, endDate, headers @@ -222,7 +271,7 @@ def get_data( ) df, _ = get_data_api( - userid_of_shared_user, + userid, startDate, endDate, headers @@ -241,7 +290,7 @@ def get_data( auth[0] + ":" + str(api_response.status_code) ) - return df, userid_of_shared_user + return df, userid # %% START OF CODE @@ -250,40 +299,32 @@ def get_and_save_dataset( data_path=args.data_path, weeks_of_data=args.weeks_of_data, donor_group=args.donor_group, - userid_of_shared_user=args.userid_of_shared_user, + userid=args.userid, auth=args.auth, email=args.email, password=args.password ): - # create output folders if they don't exist - - phi_date_stamp = "PHI-" + date_stamp - donor_folder = os.path.join(data_path, phi_date_stamp + "-donor-data") - - dataset_path = os.path.join( - donor_folder, - phi_date_stamp + "-csvData" - ) - make_folder_if_doesnt_exist(dataset_path) # 
get dataset data, userid = get_data( weeks_of_data=weeks_of_data, donor_group=donor_group, - userid_of_shared_user=userid_of_shared_user, + userid=userid, auth=auth, email=email, password=password ) # save data - dataset_output_path = os.path.join( - dataset_path, - 'PHI-' + userid + ".csv" + _ = save_df( + data, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="csvData", + phi=True ) - data.to_csv(dataset_output_path) - if __name__ == "__main__": get_and_save_dataset( @@ -291,7 +332,7 @@ def get_and_save_dataset( data_path=args.data_path, weeks_of_data=args.weeks_of_data, donor_group=args.donor_group, - userid_of_shared_user=args.userid_of_shared_user, + userid=args.userid, auth=args.auth, email=args.email, password=args.password From 10116db6b847e3ce43d2fa16c78111f4c8a14539 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 15 Jul 2019 22:02:35 -0500 Subject: [PATCH 02/46] save as gzipped csv --- .../get_all_donor_data_batch_process.py | 36 ++++++++++++++++--- .../get_single_donor_metadata.py | 2 +- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py index 15daa252..8e81b372 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py @@ -117,12 +117,11 @@ def get_all_data(userid, donor_group): metadata_path = os.path.join( args.data_path, - "PHI-" + "2019-07-13" + "-donor-data", - "PHI-" + "2019-07-13" + "-metadata" - + phi_date_stamp + "-donor-data", + phi_date_stamp + "-metadata" ) -all_files = glob.glob(os.path.join(metadata_path, "*.csv")) +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) all_metadata = pd.DataFrame() for f in all_files: temp_meta = pd.read_csv(f) @@ -137,3 +136,32 @@ def get_all_data(userid, donor_group): os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv") ) print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DATASET INFO (METADATA) +print("combining all dataset metadata") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-datasetSummary" +) + +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +dataset_metadata = pd.DataFrame() +for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True) + userid = f[-32:-22] + temp_meta["userid"] = userid + dataset_metadata = pd.concat( + [dataset_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +dataset_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv") +) +print("saving all-dataset-info-metadata...code complete") + diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py index 3135ff41..e02708a9 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py @@ -233,7 +233,7 @@ def get_and_save_metadata( # save data meta_output_path = os.path.join( metadata_path, - 'PHI-' + userid + ".csv" + 'PHI-' + userid + ".csv.gz" ) meta_df.to_csv(meta_output_path) From fe9a04a62777ef4a8124beb4e89649bf5e249188 Mon Sep 17 00:00:00 2001 From: 
Ed Nykaza Date: Mon, 15 Jul 2019 22:04:09 -0500 Subject: [PATCH 03/46] capture dataset info --- .../get-donor-data/get_single_dataset_info.py | 357 ++++++++++++++++++ .../get_single_tidepool_dataset.py | 293 +++++++------- 2 files changed, 496 insertions(+), 154 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py new file mode 100644 index 00000000..d1ddab75 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py @@ -0,0 +1,357 @@ +# -*- coding: utf-8 -*- +"""get_donor_data_and_metadata.py +This code takes a tidepool dataset as input, and gives +a description of the type of data in the dataset. +""" + + +# %% REQUIRED LIBRARIES +import pandas as pd +import datetime as dt +import numpy as np +import os +import ast +import argparse + + +# %% FUNCTIONS +def get_type(val): + return type(val).__name__ + + +def get_len(val): + return len(val) + + +def get_val(val, k): + return val[k] + + +def literal_return(val): + try: + return ast.literal_eval(val) + except (ValueError, SyntaxError): + return val + + +def remove_cols(df, cols_to_remove): + + temp_remove_cols = list(set(df) & set(cols_to_remove)) + tempDf = df[temp_remove_cols] + df = df.drop(columns=temp_remove_cols) + + return df, tempDf + + +def make_folder_if_doesnt_exist(folder_paths): + ''' function requires a single path or a list of paths''' + if not isinstance(folder_paths, list): + folder_paths = [folder_paths] + for folder_path in folder_paths: + if not os.path.exists(folder_path): + os.makedirs(folder_path) + return + + +def create_output_folder( + data_path, + date_stamp, + folder_name, + phi=True +): + if phi: + date_stamp = "PHI-" + date_stamp + donor_folder = os.path.join(data_path, date_stamp + "-donor-data") + dataset_path = os.path.join( + donor_folder, + date_stamp + "-" + folder_name + ) + make_folder_if_doesnt_exist(dataset_path) + + return dataset_path + + +def save_df( + df, + userid, + data_path, + date_stamp, + folder_name, + phi=True, + name_suffix="", +): + + output_folder = create_output_folder( + data_path=data_path, + date_stamp=date_stamp, + folder_name=folder_name, + phi=phi + ) + + # if the data contains phi, add prefix to the file + if phi: + phi_prefix = 'PHI-' + else: + phi_prefix = '' + output_path = os.path.join( + output_folder, + phi_prefix + userid + "{}.csv.gz".format(name_suffix) + ) + + df.to_csv(output_path) + + return output_path + + +def expand_df(df, do_not_expand_list=[]): + + # remove fields that we don't want to flatten + df, hold_df = remove_cols(df, do_not_expand_list) + + # get a description of the original columns + col_df = pd.DataFrame(df.dtypes, columns=["dtype"]) + + # go through each dtype that is an object to see if it + # contains strings, mixed datatypes, embedded json, or lists + col_df["nObjectTypes"] = np.nan + col_df["objectType"] = np.nan + + new_df = pd.DataFrame() + for col in col_df[col_df["dtype"] == "object"].index: + rows = df.index[df[col].notnull()].tolist() + + # sometimes the object gets wrapped in a string + literal_df = pd.DataFrame(df.loc[rows, col].apply(literal_return)) + + # see if there are mixed ojbect types + type_df = pd.DataFrame(literal_df.loc[rows, col].apply(get_type)) + unique_types = type_df[col].unique() + col_df.loc[col, "nObjectTypes"] = len(unique_types) + col_df.loc[col, 
"objectType"] = str(unique_types) + + # USE UNDERSCORE FOR LIST EXPANSION + if "list" in col_df.loc[col, "objectType"]: + list_df = pd.DataFrame(literal_df.loc[type_df[col] == "list", col]) + list_df["len"] = list_df[col].apply(get_len) + + for i in np.arange(1, list_df["len"].max() + 1): + blob_df = pd.DataFrame( + list_df.loc[ + list_df["len"] >= i, col + ].apply(get_val, k=i-1) + ).add_suffix('_' + str(i)) + + new_df = pd.concat([new_df, blob_df], axis=1) + + # USE DOT FOR JSON (DICT) EXPANSION + if "dict" in col_df.loc[col, "objectType"]: + json_blob = literal_df.loc[type_df[col] == "dict", col] + blob_df = pd.DataFrame( + json_blob.tolist(), + index=json_blob.index + ).add_prefix(col + '.') + new_df = pd.concat([new_df, blob_df], axis=1) + + # merge the dataframes together + df = pd.concat([df, new_df, hold_df], axis=1) + + df.sort_index(axis=1, inplace=True) + + return df, col_df + + +def expand_data(starting_df, depth=10): + print("\ninitial df has {} columns".format(len(starting_df.columns))) + print("starting expansion ...") + temp_df, temp_col = expand_df(starting_df) + col_df = temp_col.copy() + skip_columns = starting_df.columns.tolist() + d = 1 + n_col_expanded = len(list(temp_df)) - len(list(starting_df)) + print("{} columns added". format(n_col_expanded)) + + while not ((d >= depth) | (len(temp_col) == 0)): + print("expanding layer {} ... ".format(d)) + next_skip_columns = temp_df.columns.tolist() + temp_df, temp_col = expand_df(temp_df, skip_columns) + skip_columns = next_skip_columns.copy() + + col_df = pd.concat([col_df, temp_col]) + n_col_expanded = len(list(temp_df)) - len(next_skip_columns) + print("{} columns added". format(n_col_expanded)) + d += 1 + + print("expansion complete...getting dataset summary info...") + + col_df.sort_index(inplace=True) + + # get the start and end time for each data type + print("getting data start and end times for each data type ...") + col_df["startTime"] = np.nan + col_df["endTime"] = np.nan + for col in col_df.index: + try: + start_time = temp_df.loc[temp_df[col].notnull(), ["time"]].min() + end_time = temp_df.loc[temp_df[col].notnull(), ["time"]].max() + col_df.loc[col, "startTime"] = start_time.values[0] + col_df.loc[col, "endTime"] = end_time.values[0] + except: + print(col, "missing timestamp") + + # get summary information + print("getting summary information ...") + df_info = pd.DataFrame(temp_df.describe(include='all').T) + df_info.loc["_all", ["count", "unique"]] = temp_df.shape + df_info.sort_index(inplace=True) + + # add which type (or subtype) each column comes from + for typeType in ["type", "subType"]: + if typeType in list(starting_df): + type_groups = temp_df.groupby(by=typeType) + not_null_index = temp_df[typeType].notnull() + for type_ in temp_df.loc[not_null_index, typeType].unique(): + type_df = type_groups.get_group(type_).dropna( + axis=1, + how="all" + ) + df_info.loc[type_df.columns, typeType + "=" + type_] = type_ + + # get memory size of each data type + print("getting memory information ...") + mem_usage = pd.DataFrame( + temp_df.memory_usage(index=True, deep=True), + columns=["memorySize"] + ) + mem_usage.rename(index={"Index": "_all"}, inplace=True) + df_info["memorySize"] = mem_usage["memorySize"] + df_info.loc["_all", "memorySize"] = temp_df.memory_usage( + index=True, deep=True + ).sum() + + # combine with col_summary + summary_df = pd.concat([col_df, df_info], axis=1, sort=True) + + # get/add a list of string values + print("getting a a list of string values ...") + str_cols = summary_df[ + 
((summary_df["objectType"] == "['str']") & + (summary_df["unique"] > 1) & + (summary_df["unique"] < 50) + ) + ].index + for str_col in str_cols: + not_null_index = temp_df[str_col].notnull() + str_vals = temp_df.loc[not_null_index, str_col].unique().tolist() + summary_df.loc[str_col, "strVals"] = str(str_vals) + + print("dataset summary info complete\n") + + return summary_df, temp_df + + +# %% START OF CODE +def get_dataset_info( + data, + date_stamp, + data_path, + userid, + save_expanded +): + + if userid == "not-specified": + userid = input("Enter userid of dataset you want info on:\n") + + if type(data) is float: # np.nan is a float + dataset_folder = create_output_folder( + data_path, + date_stamp, + "csvData" + ) + dataset_path = os.path.join( + dataset_folder, + "PHI-{}.csv.gz".format(userid) + ) + data = pd.read_csv(dataset_path, low_memory=False, index_col=0) + + # expand embedded lists and json within dataset + summary_df, expanded_df = expand_data(data.copy(), depth=10) + + # save summary data + _ = save_df( + summary_df, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="datasetSummary", + phi=True, + name_suffix="-datasetSummary" + ) + + if save_expanded: + # save expanded data + _ = save_df( + expanded_df, + userid=userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="expandedData", + phi=True, + name_suffix="-expandedData" + ) + + +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get an overview of the columns and data in the dataset" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default="not-specified", + help="userid of the dataset you are interested in" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-s", + "--save-expanded-dataset", + dest="save_expanded", + default=True, + help=( + "specify if you want to save the expanded datafram (True/False)" + + "NOTE: these files can be rather large" + ) + ) + + args = parser.parse_args() + + # main function + get_dataset_info( + data=np.nan, + date_stamp=args.date_stamp, + data_path=args.data_path, + userid=args.userid, + save_expanded=args.save_expanded + ) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py index 8d89c2c5..7526fe77 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py @@ -8,6 +8,7 @@ """ # %% REQUIRED LIBRARIES +from get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt import numpy as np @@ -16,7 +17,6 @@ import getpass import requests import json -import pdb import argparse envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if envPath not in sys.path: @@ -24,142 +24,7 @@ import environmentalVariables -# %% USER INPUTS (choices to be made in order to run the code) -codeDescription = "get 
donor metadata" -parser = argparse.ArgumentParser(description=codeDescription) - -parser.add_argument( - "-d", - "--date-stamp", - dest="date_stamp", - default=dt.datetime.now().strftime("%Y-%m-%d"), - help="date, in '%Y-%m-%d' format, of the date when " + - "donors were accepted" -) - -parser.add_argument( - "-w", - "--weeks-of-data", - dest="weeks_of_data", - default=52*10, - help="enter the number of weeks of data you want to download" -) - -parser.add_argument( - "-dg", - "--donor-group", - dest="donor_group", - default=np.nan, - help="name of the donor group in the tidepool .env file" -) - -parser.add_argument( - "-u", - "--userid", - dest="userid", - default=np.nan, - help="userid of account shared with the donor group or master account" -) - -parser.add_argument( - "-a", - "--auth", - dest="auth", - default=np.nan, - help="tuple that contains (email, password)" -) - -parser.add_argument( - "-e", - "--email", - dest="email", - default=np.nan, - help="email address of the master account" -) - -parser.add_argument( - "-p", - "--password", - dest="password", - default=np.nan, - help="password of the master account" -) - -parser.add_argument( - "-o", - "--output-data-path", - dest="data_path", - default=os.path.abspath( - os.path.join( - os.path.dirname(__file__), "..", "data" - ) - ), - help="the output path where the data is stored" -) - -args = parser.parse_args() - - # %% FUNCTIONS -def make_folder_if_doesnt_exist(folder_paths): - ''' function requires a single path or a list of paths''' - if not isinstance(folder_paths, list): - folder_paths = [folder_paths] - for folder_path in folder_paths: - if not os.path.exists(folder_path): - os.makedirs(folder_path) - return - - -def create_output_folder( - data_path=args.data_path, - date_stamp=args.date_stamp, - folder_name="not-specified", - phi=True -): - if phi: - date_stamp = "PHI-" + date_stamp - donor_folder = os.path.join(data_path, date_stamp + "-donor-data") - dataset_path = os.path.join( - donor_folder, - date_stamp + "-" + folder_name - ) - make_folder_if_doesnt_exist(dataset_path) - - return dataset_path - - -def save_df( - df, - userid=args.userid, - data_path=args.data_path, - date_stamp=args.date_stamp, - folder_name="not-specified", - phi=True -): - - output_folder = create_output_folder( - data_path=data_path, - date_stamp=date_stamp, - folder_name=folder_name, - phi=phi - ) - - # if the data contains phi, add prefix to the file - if phi: - phi_prefix = 'PHI-' - else: - phi_prefix = '' - output_path = os.path.join( - output_folder, - phi_prefix + userid + "-dataSummary.csv.gz" - ) - - df.to_csv(output_path) - - return output_path - - def get_data_api(userid, startDate, endDate, headers): startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" @@ -295,14 +160,15 @@ def get_data( # %% START OF CODE def get_and_save_dataset( - date_stamp=args.date_stamp, - data_path=args.data_path, - weeks_of_data=args.weeks_of_data, - donor_group=args.donor_group, - userid=args.userid, - auth=args.auth, - email=args.email, - password=args.password + date_stamp, + data_path, + weeks_of_data, + donor_group, + userid, + auth, + email, + password, + expand_dataset ): # get dataset @@ -315,18 +181,136 @@ def get_and_save_dataset( password=password ) - # save data - _ = save_df( - data, - userid=userid, - data_path=data_path, - date_stamp=date_stamp, - folder_name="csvData", - phi=True - ) + # if the there is data + if len(data) > 1: + # save data + print("saving csv data...") + _ = save_df( + data, + userid=userid, + data_path=data_path, + 
date_stamp=date_stamp, + folder_name="csvData", + phi=True + ) + + # get dataset info + if expand_dataset: + summary_df, expanded_df = expand_data(data) + print("saving summary data...") + _ = save_df( + summary_df, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="datasetSummary", + phi=True, + name_suffix="-datasetSummary" + ) + + # save expanded data + print("saving expanded data...") + _ = save_df( + expanded_df, + userid=userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="expandedData", + phi=True, + name_suffix="-expandedData" + ) + else: + print("{} has no data".format(userid)) if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor metadata" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-w", + "--weeks-of-data", + dest="weeks_of_data", + default=52*10, + help="enter the number of weeks of data you want to download" + ) + + parser.add_argument( + "-dg", + "--donor-group", + dest="donor_group", + default=np.nan, + help="name of the donor group in the tidepool .env file" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid of account shared with the donor group or master account" + ) + + parser.add_argument( + "-a", + "--auth", + dest="auth", + default=np.nan, + help="tuple that contains (email, password)" + ) + + parser.add_argument( + "-e", + "--email", + dest="email", + default=np.nan, + help="email address of the master account" + ) + + parser.add_argument( + "-p", + "--password", + dest="password", + default=np.nan, + help="password of the master account" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-ex", + "--expand-dataset", + dest="expand_dataset", + default=True, + help=( + "specify if you want to get/save the expanded datafram (True/False)" + + "NOTE: this process is time consuming" + ) + ) + + args = parser.parse_args() + + # the main function get_and_save_dataset( date_stamp=args.date_stamp, data_path=args.data_path, @@ -335,5 +319,6 @@ def get_and_save_dataset( userid=args.userid, auth=args.auth, email=args.email, - password=args.password + password=args.password, + expand_dataset=args.expand_dataset ) From c80ca20c0e9205578d54bde7b1a92d87e9755ab3 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 17 Jul 2019 09:45:44 -0500 Subject: [PATCH 04/46] sort by userid instead of donor group --- .../get-donor-data/accept_new_donors_and_get_donor_list.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py b/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py index 0d8c4a41..b17f5c9e 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py +++ b/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py @@ -16,6 +16,7 @@ import requests import json import argparse +import pdb envPath = 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if envPath not in sys.path: sys.path.insert(0, envPath) @@ -247,7 +248,7 @@ def accept_and_get_list(args): ) # polish up the final donor list - final_donor_list.sort_values(by="donorGroup", inplace=True) + final_donor_list.sort_values(by="userID", inplace=True) final_donor_list.reset_index(drop=True, inplace=True) if args.save_donor_list: From 7340a5c3abc474984958cd0e967cf92ec12b046b Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 17 Jul 2019 09:46:44 -0500 Subject: [PATCH 05/46] just get json file --- .../get_all_donor_data_batch_process_json.py | 138 ++++++++ .../get_single_tidepool_dataset_json.py | 299 ++++++++++++++++++ 2 files changed, 437 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py new file mode 100644 index 00000000..d43b8e9a --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that accepts all bigdata donation project donors, +and then pulls of their datasets for further processing. +""" + +# %% REQUIRED LIBRARIES +from accept_new_donors_and_get_donor_list import accept_and_get_list +import datetime as dt +import pandas as pd +import subprocess as sub +import os +import glob +import time +import argparse +from multiprocessing import Pool + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "accepts new donors (shares) and grab their data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +parser.add_argument( + "-s", + "--save-donor-list", + dest="save_donor_list", + default=True, + help="specify if you want to save the donor list (True/False)" +) + +args = parser.parse_args() + + +# %% FUNCTIONS +def run_process(func_name, userid, donor_group): + func_path = os.path.join(".", func_name) + + p = sub.Popen( + [ + "python", func_path, + "-d", args.date_stamp, + "-dg", donor_group, + "-u", userid, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") + + if errors == '': + print(output) + else: + print(errors) + + return + + +def get_all_data(userid, donor_group): + + run_process("get_single_donor_metadata.py", userid, donor_group) + run_process("get_single_tidepool_dataset_json.py", userid, donor_group) + + return + + +# %% GET LATEST DONOR LIST +final_donor_list = accept_and_get_list(args) + + +# %% GET DONOR META DATA AND DATASETS +# use multiple cores to process +startTime = time.time() +print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) +pool = 
Pool(os.cpu_count()) +pool.starmap(get_all_data, zip( + final_donor_list["userID"], + final_donor_list["donorGroup"] +)) +pool.close() +endTime = time.time() +print( + "finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +) +total_duration = round((endTime - startTime) / 60, 1) +print("total duration was %s minutes" % total_duration) + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-metadata" +) + +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +all_metadata = pd.DataFrame() +for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "userid"}, inplace=True) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv") +) +print("saving metadata...code complete") diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py new file mode 100644 index 00000000..5a17d6d8 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py @@ -0,0 +1,299 @@ +# -*- coding: utf-8 -*- +"""get_donor_data_and_metadata.py +In the context of the big data donation +project, this code grabs donor data and metadata. + +This code calls accept_new_donors_and_get_donor_list.py +to get the most recent donor list +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import datetime as dt +import numpy as np +import os +import sys +import time +import getpass +import requests +import json +import argparse +import pdb +envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if envPath not in sys.path: + sys.path.insert(0, envPath) +import environmentalVariables + + +# %% FUNCTIONS +def make_folder_if_doesnt_exist(folder_paths): + ''' function requires a single path or a list of paths''' + if not isinstance(folder_paths, list): + folder_paths = [folder_paths] + for folder_path in folder_paths: + if not os.path.exists(folder_path): + os.makedirs(folder_path) + return + + +def get_data_api(userid, startDate, endDate, headers): + + startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" + endDate = endDate.strftime("%Y-%m-%d") + "T23:59:59.999Z" + + api_call = ( + "https://api.tidepool.org/data/" + userid + "?" 
+ + "endDate=" + endDate + "&" + + "startDate=" + startDate + "&" + + "dexcom=true" + "&" + + "medtronic=true" + "&" + + "carelink=true" + ) + + api_response = requests.get(api_call, headers=headers) + if(api_response.ok): + print("getting data between %s and %s" % (startDate, endDate)) + json_data = json.loads(api_response.content.decode()) + + else: + sys.exit( + "ERROR in getting data between %s and %s" % (startDate, endDate), + api_response.status_code + ) + + endDate = pd.to_datetime(startDate) - pd.Timedelta(1, unit="d") + + return json_data, endDate + + +def get_data( + weeks_of_data=10*52, + save_data_path=os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "data" + ) + ), + overwrite_hours=24, + donor_group=np.nan, + userid=np.nan, + auth=np.nan, + email=np.nan, + password=np.nan, +): + # login + if pd.notnull(donor_group): + if donor_group == "bigdata": + dg = "" + else: + dg = donor_group + + auth = environmentalVariables.get_environmental_variables(dg) + + if pd.isnull(auth): + if pd.isnull(email): + email = input("Enter Tidepool email address:\n") + + if pd.isnull(password): + password = getpass.getpass("Enter password:\n") + + auth = (email, password) + + api_call = "https://api.tidepool.org/auth/login" + api_response = requests.post(api_call, auth=auth) + if(api_response.ok): + xtoken = api_response.headers["x-tidepool-session-token"] + userid_master = json.loads(api_response.content.decode())["userid"] + headers = { + "x-tidepool-session-token": xtoken, + "Content-Type": "application/json" + } + else: + sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code)) + + if pd.isnull(userid): + userid = userid_master + print( + "getting data for the master account since no shared " + + "user account was given" + ) + + print("logging into", auth[0], "...") + + # download user data + print("downloading data for {} ...".format(userid)) + endDate = pd.datetime.now() + pd.Timedelta(1, unit="d") + + output_folder = os.path.join( + save_data_path, + "dremio", + userid, + ) + + output_file_path = os.path.join( + output_folder, + "PHI-{}.json".format(userid) + ) + + download_ = True + for f in [output_folder, output_file_path]: + path_exist = os.path.exists(f) + if path_exist: + last_save = os.path.getmtime(f) + time_threshold = time.time() - (overwrite_hours * 3600) + within_time_threshold = last_save > time_threshold + if within_time_threshold: + download_ = False + + if download_: + make_folder_if_doesnt_exist(output_folder) + big_json_file = [] + + if weeks_of_data > 52: + years_of_data = int(np.floor(weeks_of_data/52)) + + for years in range(0, years_of_data + 1): + startDate = pd.datetime( + endDate.year - 1, + endDate.month, + endDate.day + 1 + ) + json_data, endDate = get_data_api( + userid, + startDate, + endDate, + headers + ) + + big_json_file = big_json_file + json_data + + else: + startDate = ( + pd.to_datetime(endDate) - pd.Timedelta(weeks_of_data*7, "d") + ) + + json_data, _ = get_data_api( + userid, + startDate, + endDate, + headers + ) + + big_json_file = big_json_file + json_data + + # save data + if len(big_json_file) > 1: + print("saving data for {}".format(userid)) + with open(output_file_path, 'w') as outfile: + json.dump(big_json_file, outfile) + else: + print("{} has no data".format(userid)) + + # logout + api_call = "https://api.tidepool.org/auth/logout" + api_response = requests.post(api_call, auth=auth) + + if(api_response.ok): + print("successfully logged out of", auth[0]) + + else: + sys.exit( + "Error with logging out for " + 
+ auth[0] + ":" + str(api_response.status_code) + ) + else: + print("skipping bc {}'s data was downloaded (attempted)".format(userid) + + " within the last {} hours".format(overwrite_hours) + ) + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor json file" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-w", + "--weeks-of-data", + dest="weeks_of_data", + default=52*10, # go back the last 10 years as default + help="enter the number of weeks of data you want to download" + ) + + parser.add_argument( + "-ow", + "--over-write", + dest="overwrite_hours", + default=24, + help="if data was downloaded in the last <24> hours, skip download" + ) + + parser.add_argument( + "-dg", + "--donor-group", + dest="donor_group", + default=np.nan, + help="name of the donor group in the tidepool .env file" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid of account shared with the donor group or master account" + ) + + parser.add_argument( + "-a", + "--auth", + dest="auth", + default=np.nan, + help="tuple that contains (email, password)" + ) + + parser.add_argument( + "-e", + "--email", + dest="email", + default=np.nan, + help="email address of the master account" + ) + + parser.add_argument( + "-p", + "--password", + dest="password", + default=np.nan, + help="password of the master account" + ) + + args = parser.parse_args() + + # the main function + get_data( + save_data_path=args.data_path, + weeks_of_data=args.weeks_of_data, + overwrite_hours=args.overwrite_hours, + donor_group=args.donor_group, + userid=args.userid, + auth=args.auth, + email=args.email, + password=args.password, + ) From 0d08b821ac1d16f76799162523c00515c8f3e259 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sun, 21 Jul 2019 11:44:33 -0500 Subject: [PATCH 06/46] get interim data summaries --- .../get_interim_dataset_summaries.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py b/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py new file mode 100644 index 00000000..0fa04201 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +# %% REQUIRED LIBRARIES +import datetime as dt +import pandas as pd +import os +import glob +import argparse + + +# %% FUNCTIONS +def get_dataset_summaries( + save_data_path=os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "data" + ) + ), + date_stamp=dt.datetime.now().strftime("%Y-%m-%d"), +): + + + + phi_date_stamp = "PHI-" + args.date_stamp + donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + + print("combining all dataset metadata") + + metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-datasetSummary" + ) + + all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) + dataset_metadata = pd.DataFrame() + n_files = len(all_files) + 
print("there are {} files".format(n_files)) + f_counter = 1 + for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True) + userid = f[-32:-22] + temp_meta["userid"] = userid + dataset_metadata = pd.concat( + [dataset_metadata, temp_meta], + ignore_index=True, + sort=False + ) + + if f_counter % 10 == 0: + print("completed file {} of {}".format(f_counter, n_files)) + f_counter = f_counter + 1 + dataset_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv.gz") + ) + print("saving all-dataset-info-metadata...code complete") + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor json file" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + args = parser.parse_args() + + # the main function + get_dataset_summaries( + save_data_path=args.data_path, + date_stamp=args.date_stamp + ) From 7b28164a4d40db60ff21f00b1ae59702911ad82d Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 07:38:28 -0500 Subject: [PATCH 07/46] update env to include latest spyder --- projects/bigdata-processing-pipeline/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/bigdata-processing-pipeline/environment.yml b/projects/bigdata-processing-pipeline/environment.yml index 4c945436..64ef3601 100644 --- a/projects/bigdata-processing-pipeline/environment.yml +++ b/projects/bigdata-processing-pipeline/environment.yml @@ -3,9 +3,8 @@ channels: - defaults dependencies: - python=3.7.3 - - numpy=1.16.4 - pandas=0.24.2 + - spyder=3.3.6 - pip=19.1.1 - - spyder=3.3.5 - pip: - python-dotenv==0.10.3 From 09eaca08429db8407f7e9f5e54a79f46ec7e15cd Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 08:33:52 -0500 Subject: [PATCH 08/46] add __init__.py and rename folders --- projects/bigdata-processing-pipeline/__init__.py | 0 .../README.md | 0 .../anonymize-and-export.py | 0 .../example-data/dataFieldExportList.csv | 0 .../example-data/jill-jellyfish-lite.csv | 0 .../example-data/jill-jellyfish-lite.json | 0 .../example-data/jill-jellyfish-lite.xlsx | Bin .../.gitignore | 0 .../README.md | 0 .../estimate-local-time.py | 0 .../estimateLocalTime-batchProcess.py | 0 .../example-csv.csv | 0 .../example-json.json | 0 .../example-xlsx.xlsx | Bin .../wikipedia-timezone-aliases-2018-04-28.csv | 0 .../{get-donor-data => get_donor_data}/README.md | 0 .../get_donor_data/__init__.py | 0 .../accept_new_donors_and_get_donor_list.py | 0 .../deprecated/accept-new-donors.py | 0 .../deprecated/get-all-col-headings.py | 0 .../deprecated/get-donor-json-files.py | 0 .../deprecated/get-donor-list.py | 0 .../deprecated/get_all_donor_data.py | 0 .../example_get_all_data_for_single_user.py | 2 +- .../get_all_donor_data_batch_process.py | 0 .../get_all_donor_data_batch_process_json.py | 0 .../get_interim_dataset_summaries.py | 0 .../get_single_dataset_info.py | 0 .../get_single_donor_metadata.py | 0 .../get_single_tidepool_dataset.py | 5 ++++- 
.../get_single_tidepool_dataset_json.py | 0 .../get_stats/__init__.py | 0 .../{qualify-data => qualify_data}/README.md | 0 .../deprecated/qualify-data.py | 0 .../qualify_all_donor_data_batch_process.py | 0 .../qualify_single_dataset.py | 0 .../tidepool-qualification-criteria.json | 0 37 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/__init__.py rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/README.md (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/anonymize-and-export.py (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/dataFieldExportList.csv (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/jill-jellyfish-lite.csv (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/jill-jellyfish-lite.json (100%) rename projects/bigdata-processing-pipeline/{anonymize-and-export-data => anonymize_and_export_data}/example-data/jill-jellyfish-lite.xlsx (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/.gitignore (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/README.md (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/estimate-local-time.py (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/estimateLocalTime-batchProcess.py (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/example-csv.csv (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/example-json.json (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/example-xlsx.xlsx (100%) rename projects/bigdata-processing-pipeline/{estimate-local-time => estimate_local_time}/wikipedia-timezone-aliases-2018-04-28.csv (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/README.md (100%) create mode 100644 projects/bigdata-processing-pipeline/get_donor_data/__init__.py rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/accept_new_donors_and_get_donor_list.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/accept-new-donors.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get-all-col-headings.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get-donor-json-files.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get-donor-list.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/deprecated/get_all_donor_data.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/example_get_all_data_for_single_user.py (95%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_all_donor_data_batch_process.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_all_donor_data_batch_process_json.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => 
get_donor_data}/get_interim_dataset_summaries.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_dataset_info.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_donor_metadata.py (100%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_tidepool_dataset.py (98%) rename projects/bigdata-processing-pipeline/{get-donor-data => get_donor_data}/get_single_tidepool_dataset_json.py (100%) create mode 100644 projects/bigdata-processing-pipeline/get_stats/__init__.py rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/README.md (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/deprecated/qualify-data.py (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/qualify_all_donor_data_batch_process.py (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/qualify_single_dataset.py (100%) rename projects/bigdata-processing-pipeline/{qualify-data => qualify_data}/tidepool-qualification-criteria.json (100%) diff --git a/projects/bigdata-processing-pipeline/__init__.py b/projects/bigdata-processing-pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md b/projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py b/projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json diff --git 
a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/.gitignore b/projects/bigdata-processing-pipeline/estimate_local_time/.gitignore similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/.gitignore rename to projects/bigdata-processing-pipeline/estimate_local_time/.gitignore diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/README.md b/projects/bigdata-processing-pipeline/estimate_local_time/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/README.md rename to projects/bigdata-processing-pipeline/estimate_local_time/README.md diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py rename to projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py rename to projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv b/projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv rename to projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-json.json b/projects/bigdata-processing-pipeline/estimate_local_time/example-json.json similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-json.json rename to projects/bigdata-processing-pipeline/estimate_local_time/example-json.json diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx b/projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx rename to projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv rename to projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv diff --git a/projects/bigdata-processing-pipeline/get-donor-data/README.md 
b/projects/bigdata-processing-pipeline/get_donor_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/README.md rename to projects/bigdata-processing-pipeline/get_donor_data/README.md diff --git a/projects/bigdata-processing-pipeline/get_donor_data/__init__.py b/projects/bigdata-processing-pipeline/get_donor_data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py b/projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py rename to projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py similarity index 95% rename from projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py rename to projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py index 14767119..3a0966d9 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py @@ -25,6 +25,6 @@ ) data, _ = get_data( 
donor_group="bigdata", - userid_of_shared_user="0d4524bc11", + userid="0d4524bc11", weeks_of_data=4 ) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process_json.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py b/projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_interim_dataset_summaries.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_dataset_info.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py similarity index 98% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py index 7526fe77..84563d9c 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py @@ -8,7 +8,10 @@ """ # %% REQUIRED LIBRARIES -from get_single_dataset_info import expand_data, save_df +try: + from get_single_dataset_info import expand_data, save_df +except: + from get_donor_data.get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt import numpy as np diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset_json.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py diff --git a/projects/bigdata-processing-pipeline/get_stats/__init__.py 
b/projects/bigdata-processing-pipeline/get_stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/qualify-data/README.md b/projects/bigdata-processing-pipeline/qualify_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/README.md rename to projects/bigdata-processing-pipeline/qualify_data/README.md diff --git a/projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py b/projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py rename to projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py rename to projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py rename to projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json b/projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json rename to projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json From a1ce9911897d72e4786eec0578c31b8edeac1cc3 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 08:34:33 -0500 Subject: [PATCH 09/46] get data from api --- .../get_stats/get_cgm_stats.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py new file mode 100644 index 00000000..32f6e824 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm statsistics for a single tidepool (donor) dataset +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +# TODO: figure out how to get rid of these path dependcies +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +from get_donor_data.get_single_donor_metadata import get_shared_metadata +from get_donor_data.get_single_tidepool_dataset import get_data + + +# %% GET DATA FROM API +''' +get metadata and data for a donor that has shared with bigdata +NOTE: functions assume you have an .env with bigdata account credentials +''' + +userid = "0d4524bc11" +donor_group = "bigdata" + +metadata, _ = get_shared_metadata( + donor_group=donor_group, + userid_of_shared_user=userid +) +data, _ = get_data( + donor_group=donor_group, + userid=userid, + weeks_of_data=52 + 
) + + From 5629152d446a73de213d9ce0efa2088f1cf5602d Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 08:37:16 -0500 Subject: [PATCH 10/46] add path if needed --- .../get_donor_data/get_single_tidepool_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py index 84563d9c..0b3e384f 100644 --- a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py @@ -10,7 +10,7 @@ # %% REQUIRED LIBRARIES try: from get_single_dataset_info import expand_data, save_df -except: +except: # TODO: there has to be a better way to do this from get_donor_data.get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt From 8fa5f2c465f87f2bc2c91de6962f35bac3911958 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 24 Jul 2019 19:39:19 -0500 Subject: [PATCH 11/46] initial commit WIP --- .../get_stats/get_cgm_stats.py | 268 +++++++++++++++++- 1 file changed, 266 insertions(+), 2 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 32f6e824..2ede6fda 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -8,15 +8,201 @@ # %% REQUIRED LIBRARIES import os import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt + + # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") ) if get_donor_data_path not in sys.path: sys.path.insert(0, get_donor_data_path) +import environmentalVariables from get_donor_data.get_single_donor_metadata import get_shared_metadata from get_donor_data.get_single_tidepool_dataset import get_data +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + ''' + if "duration" in list(df): + type_ = df["duration"].apply(get_type) + valid_index = ((type_ == "int") & (df["duration"].notnull())) + n_negative_durations = sum(df.loc[valid_index, "duration"] < 0) + if n_negative_durations > 0: + df = df[~(df.loc[valid_index, "duration"] < 0)] + else: + n_negative_durations = np.nan + + return df, n_negative_durations + + +def expand_embedded_dict(df, field, key_): + ''' + this is new, should be refactored for speed as the current process + 
creates a dataframe of all of keys instead of just the key of interest + ''' + if field in list(df): + notnull_idx = df[field].notnull() + temp_df = pd.DataFrame(df.loc[notnull_idx, field].tolist()) # TODO: this can be sped up by only getting the field key of interest + if key_ in list(temp_df): + df[field + "." + key_] = temp_df[key_] + return df + + +def tslim_calibration_fix(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored to only expand one field + ''' + + # expand payload field one level + df = expand_embedded_dict(df, "payload", "calibration_reading") + + if "payload.calibration_reading" in list(df): + + search_for = ['tan'] + tandem_data_index = ( + (df["deviceId"].str.contains('|'.join(search_for))) + & (df["type"] == "deviceEvent") + ) + + cal_index = df["payload.calibration_reading"].notnull() + valid_index = tandem_data_index & cal_index + + n_cal_readings = sum(valid_index) + + if n_cal_readings > 0: + # if reading is > 30 then it is in the wrong units + if df["payload.calibration_reading"].min() > 30: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + / MGDL_PER_MMOLL + ) + else: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + ) + else: + n_cal_readings = 0 + return df, n_cal_readings + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = expand_embedded_dict(df, "payload", "HKTimeZone") + if "timezone" not in list(df): + if "payload.HKTimeZone" in list(df): + df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + else: + df["timezone"] = np.nan + else: + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "timezone"] = ( + df.loc[hk_tz_idx, "payload.HKTimeZone"] + ) + + df["timezone"].fillna(method='ffill', inplace=True) + df["timezone"].fillna(method='bfill', inplace=True) + + return df["timezone"] + + +def make_tz_unaware(date_time): + return date_time.replace(tzinfo=None) + + +def to_utc_datetime(df): + ''' + this is new to deal with perfomance issue with the previous method + of converting to string to datetime with pd.to_datetime() + ''' + utc_time_tz_aware = pd.to_datetime( + df["time"], + format="%Y-%m-%dT%H:%M:%S", + utc=True + ) + utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) + + return utc_tz_unaware + + +def get_timezone_offset(currentDate, currentTimezone): + + # edge case for 'US/Pacific-New' + if currentTimezone == 'US/Pacific-New': + currentTimezone = 'US/Pacific' + + tz = pytz.timezone(currentTimezone) + + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def get_local_time(df): + + tzo = df[['utcTime', 'inferredTimezone']].apply( + lambda x: get_timezone_offset(*x), axis=1 + ) + local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") + + return local_time + # %% GET DATA FROM API ''' @@ -29,12 +215,90 @@ metadata, _ = get_shared_metadata( donor_group=donor_group, - userid_of_shared_user=userid + 
userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid ) data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=52 + weeks_of_data=4 + ) + + +# %% CREATE META DATAFRAME (metadata) +metadata = pd.DataFrame(index=[userid]) + + +# %% HASH USER ID +hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) +data["userid"] = userid +data["hashid"] = hashid + + +# %% CLEAN DATA +data_fields = list(data) +# remove negative durations +if "duration" in data_fields: + data["duration"], n_negative_durations = ( + remove_negative_durations(data[["duration"]].copy()) ) +else: + n_negative_durations = np.nan +metadata["nNegativeDurations"] = n_negative_durations + +# Tslim calibration bug fix +data, n_cal_readings = tslim_calibration_fix(data) +metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + +# %% TIME RELATED ITEMS +data["utcTime"] = to_utc_datetime(data[["time"]].copy()) +if "timezone" not in list(data): + data["timezone"] = np.nan +data["inferredTimezone"] = get_and_fill_timezone( + data[["timezone", "payload"]].copy() +) +# estimate local time (simple method) +# TODO: this really needs to be sped up +data["localTime"] = get_local_time( + data[['utcTime', 'inferredTimezone']].copy() +) + + + + + +#data["day"] = pd.DatetimeIndex(data["localTime"]).date +# +## round to the nearest 5 minutes +## TODO: once roundTime is pushed to tidals repository then this line can be replaced +## with td.clean.round_time +#data = round_time(data, timeIntervalMinutes=5, timeField="time", +# roundedTimeFieldName="roundedTime", startWithFirstRecord=True, +# verbose=False) +# +#data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m") +#data.sort_values("uploadTime", ascending=False, inplace=True) +# +## AGE, & YLW +#data["age"] = np.floor((data["localTime"] - bDate).dt.days/365.25).astype(int) +#data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int) + + +# %% CGM DATA + +#def removeInvalidCgmValues(df): +# +# nBefore = len(df) +# # remove values < 38 and > 402 mg/dL +# df = df.drop(df[((df.type == "cbg") & +# (df.value < 2.109284236597303))].index) +# df = df.drop(df[((df.type == "cbg") & +# (df.value > 22.314006924003046))].index) +# nRemoved = nBefore - len(df) +# +# return df, nRemoved +# get rid of cgm values too low/high (< 38 & > 402 mg/dL) +#data, nInvalidCgmValues = removeInvalidCgmValues(data) +#metadata["nInvalidCgmValues"] = nInvalidCgmValues From fae47883954898e322031b4e266d4ff726723f83 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 09:48:53 -0500 Subject: [PATCH 12/46] distinguish donor metadata from data metadata --- projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 2ede6fda..ad9fe110 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -213,7 +213,7 @@ def get_local_time(df): userid = "0d4524bc11" donor_group = "bigdata" -metadata, _ = get_shared_metadata( +donor_metadata, _ = get_shared_metadata( donor_group=donor_group, userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid ) From 9ef98f1e79cd658c495397cb912e3f1ef63d64be Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 11:51:27 -0500 Subject: [PATCH 
13/46] refactor round_time --- .../get_stats/get_cgm_stats.py | 126 +++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index ad9fe110..b831e2d5 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -8,12 +8,13 @@ # %% REQUIRED LIBRARIES import os import sys +import sys import hashlib import pytz import numpy as np import pandas as pd import datetime as dt - +import pdb # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( @@ -204,6 +205,117 @@ def get_local_time(df): return local_time +def round_time( + df, + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +): + ''' + A general purpose round time function that rounds the "time" + field to nearest minutes + INPUTS: + * a dataframe (df) or time series that contains only one time field + that you want to round + * time_interval_minutes (defaults to 5 minutes given that most cgms + output every 5 minutes) + * start_with_first_record starts the rounding with the first record + if True, and the last record if False (defaults to True) + * return_calculation_columns specifies whether the extra columns + used to make calculations are returned + refactored name(s) to meet style guide + ''' + # if a time series is passed in, convert to dataframe + if "Series" in get_type(df): + df = pd.DataFrame(df) + columns_ = list(df) + if len(columns_) > 1: + sys.exit( + "Error: df should only have one time column" + ) + else: + df.rename(columns={columns_[0]: "t"}, inplace=True) + + df.sort_values( + by="t", + ascending=start_with_first_record, + inplace=True + ) + + df.reset_index(drop=False, inplace=True) + df.rename(columns={"index": "originalIndex"}, inplace=True) + + # calculate the time between consecutive records + df["t_shift"] = df["t"].shift(1) + df["timeBetweenRecords"] = round( + (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) + + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) + ) * time_interval_minutes + + # separate the data into chunks if timeBetweenRecords is greater than + # 2 times the minutes so the rounding process + # starts over + big_gaps = list( + df.query("abs(timeBetweenRecords) > " + + str(time_interval_minutes * 2)).index + ) + big_gaps.insert(0, 0) + big_gaps.append(len(df)) + + for gap_index in range(0, len(big_gaps) - 1): + chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] + first_chunk = df["t"][big_gaps[gap_index]] + + # calculate the time difference between + # each time record and the first record + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] = ( + (chunk - first_chunk).dt.days*(86400/60) + + (chunk - first_chunk).dt.seconds/60 + ) + + # then round to the nearest X Minutes + # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up. 
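# Worked example of why the epsilon is needed (Python 3's round() uses
# banker's rounding): a half-interval offset of 12.5 minutes gives
# round(12.5 / 5) == round(2.5) == 2, i.e. it would snap down to 10 minutes,
# whereas round(12.5 / 5 + 0.000001) == 3 snaps it up to 15 minutes,
# which is the behavior the NOTE above intends.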
+ df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ] = round( + (df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] / time_interval_minutes) + 0.000001 + ) * (time_interval_minutes) + + rounded_first_record = ( + first_chunk + pd.Timedelta("1microseconds") + ).round(str(time_interval_minutes) + "min") + + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedTime" + ] = rounded_first_record + pd.to_timedelta( + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ], unit="m" + ) + + if return_calculation_columns is False: + df.drop( + columns=[ + "timeBetweenRecords", + "minutesFromFirstRecord", + "roundedMinutesFromFirstRecord" + ], inplace=True + ) + # sort back to the original index + df.sort_values(by="originalIndex", inplace=True) + + return df["roundedTime"].values + + # %% GET DATA FROM API ''' get metadata and data for a donor that has shared with bigdata @@ -263,6 +375,14 @@ def get_local_time(df): data[['utcTime', 'inferredTimezone']].copy() ) +# round all data to the nearest 5 minutes +data["roundedTime"] = round_time( + data["localTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +) + @@ -272,8 +392,8 @@ def get_local_time(df): ## round to the nearest 5 minutes ## TODO: once roundTime is pushed to tidals repository then this line can be replaced ## with td.clean.round_time -#data = round_time(data, timeIntervalMinutes=5, timeField="time", -# roundedTimeFieldName="roundedTime", startWithFirstRecord=True, +#data = round_time(data, time_interval_minutes=5, time_field="time", +# rounded_field_name="roundedTime", start_with_first_record=True, # verbose=False) # #data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m") From 302e45b7edbee9dd86bacd06044d7b699828aded Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 15:16:15 -0500 Subject: [PATCH 14/46] add upload time to data --- .../get_stats/get_cgm_stats.py | 65 +++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index b831e2d5..45bc7079 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -316,6 +316,43 @@ def round_time( return df["roundedTime"].values +def add_upload_time(df): + ''' + this is taken from a colab notebook that is not in our github + given that it has been refactored to account for bug where there are + no upload records + NOTE: this is a new fix introduced with healthkit data...we now have + data that does not have an upload record + + ''' + + if "upload" in df.type.unique(): + upload_times = pd.DataFrame( + df[df.type == "upload"].groupby("uploadId")["utcTime"].max() + ) + else: + upload_times = pd.DataFrame(columns=["utcTime"]) + + unique_uploadIds = set(df["uploadId"].unique()) + unique_uploadRecords = set( + df.loc[df["type"] == "upload", "uploadId"].unique() + ) + uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords + + for upId in uploadIds_missing_uploadRecords: + last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() + upload_times.loc[upId, "utcTime"] = last_upload_time + + upload_times.reset_index(inplace=True) + upload_times.rename( + columns={"utcTime": "uploadTime"}, + inplace=True + ) + df = 
pd.merge(df, upload_times, how='left', on='uploadId') + + return df["uploadTime"].values + + # %% GET DATA FROM API ''' get metadata and data for a donor that has shared with bigdata @@ -362,6 +399,7 @@ def round_time( metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + # %% TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) if "timezone" not in list(data): @@ -383,20 +421,17 @@ def round_time( return_calculation_columns=False ) +# add upload time to the data, which is needed to get rid of duplicates +data["uploadTime"] = add_upload_time(data[ + ["type", "uploadId", "utcTime"] +].copy()) +# %% TIME CATEGORIES - -#data["day"] = pd.DatetimeIndex(data["localTime"]).date -# -## round to the nearest 5 minutes -## TODO: once roundTime is pushed to tidals repository then this line can be replaced -## with td.clean.round_time -#data = round_time(data, time_interval_minutes=5, time_field="time", -# rounded_field_name="roundedTime", start_with_first_record=True, -# verbose=False) -# -#data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m") +# add the day of the localTime that starts at 12am +#data["day12AM"] = pd.DatetimeIndex(data["localTime"]).date +#data["day6AM"] = data["localTime"] - pd.Timedelta(6, unit="hours") #data.sort_values("uploadTime", ascending=False, inplace=True) # ## AGE, & YLW @@ -404,6 +439,14 @@ def round_time( #data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int) +## group data by type +#if "uploadId" not in data: +# sys.exit( +# "Error: expected that uploadId is in data" +# ) +# +#type_groups = data.groupby("type") + # %% CGM DATA #def removeInvalidCgmValues(df): From 75bb4ff4217b0b941e9d01e618bc8d4da0fed340 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 15:45:48 -0500 Subject: [PATCH 15/46] handle edge case where uploadId is not given --- .../bigdata-processing-pipeline/get_stats/get_cgm_stats.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 45bc7079..deb2a107 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -345,9 +345,11 @@ def add_upload_time(df): upload_times.reset_index(inplace=True) upload_times.rename( - columns={"utcTime": "uploadTime"}, + columns={"utcTime": "uploadTime", + "index": "uploadId"}, inplace=True ) + df = pd.merge(df, upload_times, how='left', on='uploadId') return df["uploadTime"].values From 61ae41c6075c29478713f1708839bfd408921e5f Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 15:46:25 -0500 Subject: [PATCH 16/46] apply timezoneOffset correction --- .../get_stats/get_cgm_stats.py | 46 +++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index deb2a107..00015b61 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -176,8 +176,39 @@ def to_utc_datetime(df): return utc_tz_unaware -def get_timezone_offset(currentDate, currentTimezone): +# apply the large timezone offset correction (AKA Darin's fix) +def timezone_offset_bug_fix(df): + ''' + this is taken from estimate-local-time.py + TODO: add in unit testing where there is no 
TZP that is > 840 or < -720 + ''' + + if "timezoneOffset" in list(df): + + while ((df.timezoneOffset > 840).sum() > 0): + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] + - (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 + ) + + while ((df.timezoneOffset < -720).sum() > 0): + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] + + (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 + ) + + return df + +def get_timezone_offset(currentDate, currentTimezone): # edge case for 'US/Pacific-New' if currentTimezone == 'US/Pacific-New': currentTimezone = 'US/Pacific' @@ -371,7 +402,7 @@ def add_upload_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=4 + weeks_of_data=52 ) @@ -400,17 +431,23 @@ def add_upload_time(df): data, n_cal_readings = tslim_calibration_fix(data) metadata["nTandemAndPayloadCalReadings"] = n_cal_readings +# fix large timzoneOffset bug +data = timezone_offset_bug_fix(data) # %% TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) if "timezone" not in list(data): data["timezone"] = np.nan + + + + +# estimate local time (simple method) data["inferredTimezone"] = get_and_fill_timezone( data[["timezone", "payload"]].copy() ) -# estimate local time (simple method) -# TODO: this really needs to be sped up +# TODO: this really needs to be sped up AND/OR use complex version data["localTime"] = get_local_time( data[['utcTime', 'inferredTimezone']].copy() ) @@ -430,6 +467,7 @@ def add_upload_time(df): # %% TIME CATEGORIES +contiguousDays = createContiguousDaySeries(data) # add the day of the localTime that starts at 12am #data["day12AM"] = pd.DatetimeIndex(data["localTime"]).date From 4b86a07a331bb7ab6ac6844aaf6cd084ae08ab24 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 21:29:49 -0500 Subject: [PATCH 17/46] refactor of estimate-local-time to handle healthkit data --- .../get_stats/get_cgm_stats.py | 829 ++++++++++++++++-- 1 file changed, 742 insertions(+), 87 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 00015b61..be3dcecf 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -8,13 +8,11 @@ # %% REQUIRED LIBRARIES import os import sys -import sys import hashlib import pytz import numpy as np import pandas as pd import datetime as dt -import pdb # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( @@ -26,6 +24,7 @@ from get_donor_data.get_single_donor_metadata import get_shared_metadata from get_donor_data.get_single_tidepool_dataset import get_data + # %% CONSTANTS MGDL_PER_MMOLL = 18.01559 @@ -133,23 +132,37 @@ def tslim_calibration_fix(df): return df, n_cal_readings -def get_and_fill_timezone(df): - ''' - this is new to deal with healthkit data - requires that a data frame that contains payload and HKTimeZone is passed - ''' +def get_healthkit_timezone(df): df = expand_embedded_dict(df, "payload", "HKTimeZone") if "timezone" not in list(df): if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + 
df.loc[hk_tz_idx, "deviceType"] = "healthkit" df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + else: df["timezone"] = np.nan + df["deviceType"] = np.nan else: if "payload.HKTimeZone" in list(df): hk_tz_idx = df["payload.HKTimeZone"].notnull() df.loc[hk_tz_idx, "timezone"] = ( df.loc[hk_tz_idx, "payload.HKTimeZone"] ) + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + return df[["timezone", "deviceType"]] + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = get_healthkit_timezone(df) df["timezone"].fillna(method='ffill', inplace=True) df["timezone"].fillna(method='bfill', inplace=True) @@ -208,24 +221,6 @@ def timezone_offset_bug_fix(df): return df -def get_timezone_offset(currentDate, currentTimezone): - # edge case for 'US/Pacific-New' - if currentTimezone == 'US/Pacific-New': - currentTimezone = 'US/Pacific' - - tz = pytz.timezone(currentTimezone) - - tzoNum = int( - tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") - ) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - - return tzo - - def get_local_time(df): tzo = df[['utcTime', 'inferredTimezone']].apply( @@ -386,6 +381,706 @@ def add_upload_time(df): return df["uploadTime"].values +# %% ESTIMATE LOCAL TIME FUNCTIONS +def create_contiguous_day_series(df): + first_day = df["date"].min() + last_day = df["date"].max() + rng = pd.date_range(first_day, last_day).date + contiguousDaySeries = \ + pd.DataFrame(rng, columns=["date"]).sort_values( + "date", ascending=False).reset_index(drop=True) + + return contiguousDaySeries + + +def add_device_type(df): + col_headings = list(df) + if "deviceType" not in col_headings: + df["deviceType"] = np.nan + if "deviceTags" in col_headings: + # first make sure deviceTag is in string format + df["deviceTags"] = df.deviceTags.astype(str) + # filter by type not null device tags + ud = df[df["deviceTags"].notnull()].copy() + # define a device type (e.g., pump, cgm, or healthkit) + ud.loc[ + ((ud["deviceTags"].str.contains("pump")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "pump" + + # define a device type (e.g., cgm) + ud.loc[ + ((ud["deviceTags"].str.contains("cgm")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "cgm" + + return ud["deviceType"] + else: + return np.nan + + +def get_timezone_offset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def add_device_day_series(df, dfContDays, deviceTypeName): + if len(df) > 0: + dfDayGroups = df.groupby("date") + if "timezoneOffset" in df: + dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) + else: + dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) + + if "upload" in deviceTypeName: + if "timezone" in df: +# if dfDayGroups.timezone.count().values[0] > 0: # NOT SURE WHY THIS IS HERE + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + 
for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan + + dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \ + rename(columns={deviceTypeName + ".date": "date"}) + + dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), + on="date", how="left") + + else: + dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan + + return dfContDays + + +def impute_upload_records(df, contDays, deviceTypeName): + daySeries = \ + add_device_day_series(df, contDays, deviceTypeName) + + if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): + for i in daySeries.index[1:]: + if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): + daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( + daySeries.loc[i-1, deviceTypeName + ".timezone"] + ) + if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): + tz = daySeries.loc[i, deviceTypeName + ".timezone"] + tzo = get_timezone_offset( + pd.to_datetime(daySeries.loc[i, "date"]), + tz + ) + daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo + + if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): + daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ + daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] + + else: + daySeries[deviceTypeName + ".timezone"] = np.nan + daySeries[deviceTypeName + ".timeProcessing"] = np.nan + + return daySeries + + +def add_home_timezone(df, contDays): + + if "timezone" in df: + homeTimezone = df["timezone"].describe()["top"] + tzo = contDays.date.apply( + lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) + + contDays["home.imputed.timezoneOffset"] = tzo + contDays["home.imputed.timezone"] = homeTimezone + + else: + contDays["home.imputed.timezoneOffset"] = np.nan + contDays["home.imputed.timezone"] = np.nan + contDays["home.imputed.timeProcessing"] = np.nan + + return contDays + + +def estimateTzAndTzoWithUploadRecords(cDF): + + cDF["est.type"] = np.nan + cDF["est.gapSize"] = np.nan + cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] + cDF["est.annotations"] = np.nan + + if "upload.timezone" in cDF: + cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" + cDF["est.timezone"] = cDF["upload.timezone"] + cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] + else: + cDF["est.timezone"] = np.nan + cDF["est.timeProcessing"] = np.nan + + cDF.loc[((cDF["est.timezoneOffset"] != + cDF["home.imputed.timezoneOffset"]) & + (pd.notnull(cDF["est.timezoneOffset"]))), + "est.annotations"] = "travel" + + return cDF + + +def assignTzoFromImputedSeries(df, i, imputedSeries): + df.loc[i, ["est.type"]] = "DEVICE" + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, imputedSeries + ".timezoneOffset"] + + df.loc[i, ["est.timezone"]] = \ + df.loc[i, imputedSeries + ".timezone"] + + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, imputedSeries + ".timeProcessing"] + + return df + + +def compareDeviceTzoToImputedSeries(df, sIdx, device): + for i in sIdx: + # if the device tzo = imputed tzo, then chose the imputed tz and tzo + # note, dst is accounted for in the imputed tzo + for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", + "healthkit.upload.imputed", "home.imputed"]: + # if the estimate has not already been made + if 
pd.isnull(df.loc[i, "est.timezone"]): + + if df.loc[i, device + ".timezoneOffset"] == \ + df.loc[i, imputedSeries + ".timezoneOffset"]: + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, + "tz-inferred-from-" + imputedSeries) + + # if the imputed series has a timezone estimate, then see if + # the current day is a dst change day + elif (pd.notnull(df.loc[i, imputedSeries + ".timezone"])): + imputedTimezone = df.loc[i, imputedSeries + ".timezone"] + if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): + + dstRange = getRangeOfTZOsForTimezone(imputedTimezone) + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, "dst-change-day") + df = addAnnotation( + df, i, "tz-inferred-from-" + imputedSeries) + + return df + + +def estimateTzAndTzoWithDeviceRecords(cDF): + + # 2A. use the TZO of the pump or cgm device if it exists on a given day. In + # addition, compare the TZO to one of the imputed day series (i.e., the + # upload and home series to see if the TZ can be inferred) + for deviceType in ["pump", "cgm"]: + # find the indices of days where a TZO estimate has not been made AND + # where the device (e.g., pump or cgm) TZO has data + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + # compare the device TZO to the imputed series to infer time zone + cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) + + # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be + # inferred from the previous day's TZO. If the device TZO is equal to the + # previous day's TZO, AND if the previous day has a TZ estimate, use the + # previous day's TZ estimate for the current day's TZ estimate + for deviceType in ["pump", "cgm"]: + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + + cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) + + # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the + # pump and cgm tzo do not differ by more than 60 minutes. If they differ + # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we + # allow the estimates to be off by 60 minutes as there are a lot of cases + # where the devices are off because the user changes the time for DST, + # at different times + sIndices = cDF[((cDF["est.type"] == "DEVICE") & + (cDF["pump.timezoneOffset"].notnull()) & + (cDF["cgm.timezoneOffset"].notnull()) & + (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) + )].index + + tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - + cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 + + idx = tzoDiffGT60.index[tzoDiffGT60] + + cDF.loc[idx, ["est.type"]] = "UNCERTAIN" + for i in idx: + cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") + + return cDF + + +def imputeTzAndTzo(cDF): + + sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + if len(hasTzoIndices) > 0: + if len(sIndices) > 0: + lastDay = max(sIndices) + + while ((sIndices.min() < max(hasTzoIndices)) & + (len(sIndices) > 0)): + + currentDay, prevDayWithDay, nextDayIdx = \ + getImputIndices(cDF, sIndices, hasTzoIndices) + + cDF = imputeByTimezone(cDF, currentDay, + prevDayWithDay, nextDayIdx) + + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (~cDF["est.annotations"].str.contains( + "unable-to-impute-tzo").fillna(False)))].index + + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + + # try to impute to the last day (earliest day) in the dataset + # if the last record has a timezone that is the home record, then + # impute using the home timezone + if len(sIndices) > 0: + currentDay = min(sIndices) + prevDayWithDay = currentDay - 1 + gapSize = lastDay - currentDay + + for i in range(currentDay, lastDay + 1): + if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ + cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = localTime + df.loc[dIdx, ["est.timezoneOffset"]] = tzo + return df + + +def applyLocalTimeEstimates(df, cDF): + df = pd.merge(df, cDF, how="left", on="date") + df["est.localTime"] = \ + df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") + + df = correctEstimatesAroundDst(df, cDF) + + return df["est.localTime"].values + + +def isDSTChangeDay(currentDate, currentTimezone): + tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), + currentTimezone) + tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + + dt.timedelta(days=-1), currentTimezone) + + return (tzoCurrentDay != tzoPreviousDay) + + +def tzoRangeWithComparisonTz(df, i, comparisonTz): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + if pd.notnull(comparisonTz): + rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) + else: + comparisonTz = np.nan + rangeTzos = np.array([]) + + return rangeTzos + + +def tzAndTzoRangePreviousDay(df, i): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + comparisonTz = df.loc[i-1, "est.timezone"] + + rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) + + return comparisonTz, rangeTzos + + +def assignTzoFromPreviousDay(df, i, previousDayTz): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezone"]] = previousDayTz + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) + + df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] + df = addAnnotation(df, i, "tz-inferred-from-prev-day") + + return df + + +def assignTzoFromDeviceTzo(df, i, device): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + df = addAnnotation(df, i, "likely-travel") + df = addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. + if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
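# Illustrative note on the check below: for a home timezone such as
# 'US/Pacific', getRangeOfTZOsForTimezone() (defined earlier in this file)
# returns [-480, -465, -450, -435, -420], i.e. PST to PDT in 15-minute
# steps. A device offset of -450 next to a home-imputed offset of -480
# falls inside that range, but the two values are not both at the range
# endpoints, so the day is marked UNCERTAIN with "likely-15-min-dst-error";
# only when both offsets sit at the endpoints is it annotated
# "likely-dst-error-OR-travel".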
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i, "home.imputed.timezoneOffset"] == + min(dstRange)) | + (df.loc[i, "home.imputed.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + + else: + df = assignTzoFromDeviceTzo(df, i, device) + + return df + + +def getImputIndices(df, sIdx, hIdx): + + lastDayIdx = len(df) - 1 + + currentDayIdx = sIdx.min() + tempList = pd.Series(hIdx) - currentDayIdx + prevDayIdx = currentDayIdx - 1 + nextDayIdx = \ + min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) + + return currentDayIdx, prevDayIdx, nextDayIdx + + +def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): + + gapSize = (nextDaywData - currentDay) + + if prevDaywData >= 0: + + if df.loc[prevDaywData, "est.timezone"] == \ + df.loc[nextDaywData, "est.timezone"]: + + tz = df.loc[prevDaywData, "est.timezone"] + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezone"]] = tz + + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + # TODO: this logic should be updated to handle the edge case + # where the day before and after the gap have differing TZ, but + # the same TZO. 
In that case the gap should be marked as UNCERTAIN + elif df.loc[prevDaywData, "est.timezoneOffset"] == \ + df.loc[nextDaywData, "est.timezoneOffset"]: + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[prevDaywData, "est.timezoneOffset"] + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + return df + + +def addAnnotation(df, idx, annotationMessage): + if pd.notnull(df.loc[idx, "est.annotations"]): + df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ + ", " + annotationMessage + else: + df.loc[idx, ["est.annotations"]] = annotationMessage + + return df + + +def getTimezoneOffset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def estimate_local_time(df): + df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later + contiguous_days = create_contiguous_day_series(df) + + df["deviceType"] = add_device_type(df) + cDays = add_device_day_series(df, contiguous_days, "upload") + + # create day series for cgm df + if "timezoneOffset" not in list(df): + df["timezoneOffset"] = np.nan + + cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + # %% GET DATA FROM API ''' get metadata and data for a donor that has shared with bigdata @@ -402,11 +1097,14 @@ def add_upload_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=52 - ) + weeks_of_data=52*10 +) # %% CREATE META DATAFRAME (metadata) +''' +this is useful for keeping track of the type and amount of cleaning done +''' metadata = pd.DataFrame(index=[userid]) @@ -428,32 +1126,30 @@ def add_upload_time(df): metadata["nNegativeDurations"] = n_negative_durations # Tslim calibration bug fix -data, n_cal_readings = tslim_calibration_fix(data) +data, n_cal_readings = tslim_calibration_fix(data.copy()) metadata["nTandemAndPayloadCalReadings"] = n_cal_readings -# fix large timzoneOffset bug -data = timezone_offset_bug_fix(data) +# fix large timzoneOffset bug in utcbootstrapping +data = timezone_offset_bug_fix(data.copy()) + +# add healthkit timezome information +data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) # %% TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) -if "timezone" not in list(data): - data["timezone"] = np.nan - - +# add upload time to the data, which is needed for: +# getting rid of duplicates and useful for getting local time +data["uploadTime"] = add_upload_time(data[ + ["type", "uploadId", "utcTime"] +].copy()) -# estimate local time (simple method) -data["inferredTimezone"] = get_and_fill_timezone( - data[["timezone", "payload"]].copy() -) -# TODO: this really needs to be sped up AND/OR use complex version -data["localTime"] = get_local_time( - data[['utcTime', 'inferredTimezone']].copy() -) +# estimate local time (refactor of estimate-local-time.py) +data["localTime"], local_time_metadata = estimate_local_time(data.copy()) # round all data to the nearest 5 minutes -data["roundedTime"] = round_time( +data["roundedLocalTime"] = round_time( data["localTime"].copy(), time_interval_minutes=5, start_with_first_record=True, @@ -461,47 +1157,6 @@ def add_upload_time(df): ) # add upload time to the data, which is needed to get rid of duplicates -data["uploadTime"] = add_upload_time(data[ - ["type", "uploadId", "utcTime"] -].copy()) - - -# %% TIME CATEGORIES -contiguousDays = createContiguousDaySeries(data) - -# add the day of the localTime that starts at 12am -#data["day12AM"] = pd.DatetimeIndex(data["localTime"]).date -#data["day6AM"] = data["localTime"] - pd.Timedelta(6, unit="hours") -#data.sort_values("uploadTime", ascending=False, inplace=True) -# -## AGE, & YLW -#data["age"] = np.floor((data["localTime"] - bDate).dt.days/365.25).astype(int) -#data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int) - - -## group data by type -#if "uploadId" not in data: -# sys.exit( -# "Error: expected that uploadId is in data" -# ) -# -#type_groups = data.groupby("type") - -# %% CGM DATA - -#def removeInvalidCgmValues(df): -# -# nBefore = len(df) -# # remove values < 38 and > 402 mg/dL -# df = df.drop(df[((df.type == "cbg") & -# (df.value < 2.109284236597303))].index) -# df = df.drop(df[((df.type == "cbg") & -# (df.value > 22.314006924003046))].index) -# nRemoved = nBefore - len(df) -# -# return df, nRemoved - -# get rid of cgm values too low/high (< 38 & > 402 mg/dL) -#data, nInvalidCgmValues = removeInvalidCgmValues(data) -#metadata["nInvalidCgmValues"] = nInvalidCgmValues - +data["uploadTime"] = add_upload_time( + data[["type", "uploadId", "utcTime"]].copy() +) 
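A minimal usage sketch of two of the time helpers introduced above, assuming pandas and pytz are installed and that round_time() and get_timezone_offset() are defined as in get_cgm_stats.py; the timestamps, timezone, and expected values are illustrative only.

import pandas as pd

# 5-minute rounding: the grid is anchored on the rounded first record
toy_times = pd.Series(pd.to_datetime([
    "2019-07-29 12:01:10",
    "2019-07-29 12:06:40",
    "2019-07-29 12:12:30",
]))
rounded = round_time(
    toy_times.copy(),
    time_interval_minutes=5,
    start_with_first_record=True,
    return_calculation_columns=False,
)
# expected values: 2019-07-29 12:00:00, 12:05:00, 12:10:00

# timezone offset in minutes, on either side of the US daylight-saving switch
print(get_timezone_offset(pd.to_datetime("2019-01-15"), "US/Pacific"))  # -480 (PST)
print(get_timezone_offset(pd.to_datetime("2019-07-15"), "US/Pacific"))  # -420 (PDT)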
From 21ff818ae6c2b89bb5577db2e861524ddee6eabf Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 29 Jul 2019 22:36:30 -0500 Subject: [PATCH 18/46] clean cgm data --- .../get_stats/get_cgm_stats.py | 100 +++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index be3dcecf..329e6b75 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -381,6 +381,48 @@ def add_upload_time(df): return df["uploadTime"].values +def remove_invalid_cgm_values(df): + + nBefore = len(df) + # remove values < 38 and > 402 mg/dL + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] > 402))].index) + nRemoved = nBefore - len(df) + + return df, nRemoved + + +def removeDuplicates(df, criteriaDF): + nBefore = len(df) + df = df.loc[~(df[criteriaDF].duplicated())] + df = df.reset_index(drop=True) + nDuplicatesRemoved = nBefore - len(df) + + return df, nDuplicatesRemoved + + +def removeCgmDuplicates(df, timeCriterion): + if timeCriterion in df: + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + dfIsNull = df[df[timeCriterion].isnull()] + dfNotNull = df[df[timeCriterion].notnull()] + dfNotNull, nDuplicatesRemoved = ( + removeDuplicates(dfNotNull, [timeCriterion, "value"]) + ) + df = pd.concat([dfIsNull, dfNotNull]) + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + else: + nDuplicatesRemoved = 0 + + return df, nDuplicatesRemoved + + # %% ESTIMATE LOCAL TIME FUNCTIONS def create_contiguous_day_series(df): first_day = df["date"].min() @@ -1097,7 +1139,7 @@ def estimate_local_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=52*10 + weeks_of_data=4 # 52*10 ) @@ -1160,3 +1202,59 @@ def estimate_local_time(df): data["uploadTime"] = add_upload_time( data[["type", "uploadId", "utcTime"]].copy() ) + +# %% TIME CATEGORIES +# AGE, & YLW +bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) +dDate = pd.to_datetime(donor_metadata["diagnosisDate"].values[0][0:7]) +data["age"] = np.floor((data["roundedLocalTime"] - bDate).dt.days/365.25) +data["ylw"] = np.floor((data["roundedLocalTime"] - dDate).dt.days/365.25) + +# hour of the day +data["hour"] = data["roundedLocalTime"].dt.hour + +# add the day of the localTime that starts at 12am +data["day12AM"] = data["roundedLocalTime"].dt.date +# NOTE: for day of week Monday = 0 and Sunday = 6 +data["dayofweek12AM"] = data["roundedLocalTime"].dt.dayofweek +data["weekend12AM"] = data["dayofweek12AM"] > 4 + +# day that starts at 6am +data["6amTime"] = data["roundedLocalTime"] - pd.Timedelta(6, unit="hours") +data["day6AM"] = data["6amTime"].dt.date +data["dayofweek6AM"] = data["6amTime"].dt.dayofweek +data["weekend6AM"] = data["dayofweek6AM"] > 4 + + +# %% GROUP DATA BY TYPE +# first sort by upload time (used when removing dumplicates) +data.sort_values("uploadTime", ascending=False, inplace=True) +groups = data.groupby(by="type") + + +# %% CGM DATA +# filter by cgm +cgm = groups.get_group("cbg").dropna(axis=1, how="all") + +# calculate cgm in mg/dL +cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL).astype(int) + +# get rid of cgm values too low/high (< 38 & > 402 mg/dL) +cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) 
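# Unit sanity check (illustrative): the 38 / 402 mg/dL bounds used by
# remove_invalid_cgm_values() above correspond to the mmol/L thresholds in
# the commented-out removeInvalidCgmValues() from an earlier commit:
#   38 / MGDL_PER_MMOLL  = 38 / 18.01559  ≈ 2.1093 mmol/L
#   402 / MGDL_PER_MMOLL = 402 / 18.01559 ≈ 22.3140 mmol/L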
+metadata["nInvalidCgmValues"] = nInvalidCgmValues + +# get rid of duplicates that have the same ["deviceTime", "value"] +cgm, n_cgm_dups_removed = (removeCgmDuplicates(cgm, "deviceTime")) +metadata["nCgmDuplicatesRemovedDeviceTime"] = n_cgm_dups_removed + +# get rid of duplicates that have the same ["time", "value"] +cgm, n_cgm_dups_removed = removeCgmDuplicates(cgm, "time") +metadata["nCgmDuplicatesRemovedUtcTime"] = n_cgm_dups_removed + +# get rid of duplicates that have the same "roundedTime" +cgm, n_cgm_dups_removed = removeDuplicates(cgm, "roundedLocalTime") +metadata["nCgmDuplicatesRemovedRoundedTime"] = n_cgm_dups_removed + + +# %% GET CGM STATS + From 8dd746518c0c4de508febc500603396c33ae8951 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 30 Jul 2019 11:41:43 -0500 Subject: [PATCH 19/46] get cgm 5 minute time series --- .../get_stats/get_cgm_stats.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 329e6b75..1f5f61b6 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -1139,7 +1139,7 @@ def estimate_local_time(df): data, _ = get_data( donor_group=donor_group, userid=userid, - weeks_of_data=4 # 52*10 + weeks_of_data=52*10 ) @@ -1198,10 +1198,6 @@ def estimate_local_time(df): return_calculation_columns=False ) -# add upload time to the data, which is needed to get rid of duplicates -data["uploadTime"] = add_upload_time( - data[["type", "uploadId", "utcTime"]].copy() -) # %% TIME CATEGORIES # AGE, & YLW @@ -1237,7 +1233,7 @@ def estimate_local_time(df): cgm = groups.get_group("cbg").dropna(axis=1, how="all") # calculate cgm in mg/dL -cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL).astype(int) +cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) # get rid of cgm values too low/high (< 38 & > 402 mg/dL) cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) @@ -1257,4 +1253,22 @@ def estimate_local_time(df): # %% GET CGM STATS +# create a contiguous 5 minute time series +first_day = cgm["roundedLocalTime"].min() +last_day = cgm["roundedLocalTime"].max() +rng = pd.date_range(first_day, last_day, freq="5min") +contiguous_data = ( + pd.DataFrame(rng, columns=["roundedLocalTime"]).sort_values( + "roundedLocalTime", ascending=False + ).reset_index(drop=True) +) + +# merge with cgm data +cgm_series = pd.merge( + contiguous_data, + cgm, + on="roundedLocalTime", + how="left" +) +#cgm_series["hourly.mean"] = cgm_series["mg/dL"].rolling(12).mean() From ef40d18538bf3c7922beba51c9d9191a5790f8f6 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 7 Aug 2019 14:08:17 -0500 Subject: [PATCH 20/46] add new functions that get embedded json data also refactor existing functions that used old get embedded data functions --- .../get_stats/get_cgm_stats.py | 81 ++++++++++++++++--- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 1f5f61b6..0fcc1c3c 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -76,17 +76,67 @@ def remove_negative_durations(df): return df, n_negative_durations -def expand_embedded_dict(df, field, key_): +def expand_embedded_dict(ts, key_): + '''Expanded a single field 
that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process ''' - this is new, should be refactored for speed as the current process - creates a dataframe of all of keys instead of just the key of interest + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? + # though, the current method is fairly quick and compact + temp_df = pd.DataFrame(ts[notnull_idx].tolist()) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." notation is used to reference nested json + ''' - if field in list(df): - notnull_idx = df[field].notnull() - temp_df = pd.DataFrame(df.loc[notnull_idx, field].tolist()) # TODO: this can be sped up by only getting the field key of interest - if key_ in list(temp_df): - df[field + "." 
+ key_] = temp_df[key_] - return df + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts def tslim_calibration_fix(df): @@ -101,9 +151,11 @@ def tslim_calibration_fix(df): ''' # expand payload field one level - df = expand_embedded_dict(df, "payload", "calibration_reading") + df["payload.calibration_reading"] = ( + expand_embedded_dict(df["payload"], "calibration_reading") + ) - if "payload.calibration_reading" in list(df): + if df["payload.calibration_reading"].notnull().sum() > 0: search_for = ['tan'] tandem_data_index = ( @@ -133,7 +185,12 @@ def tslim_calibration_fix(df): def get_healthkit_timezone(df): - df = expand_embedded_dict(df, "payload", "HKTimeZone") + ''' + TODO: refactor to account for more efficient way to get embedded json + ''' + df["payload.HKTimeZone"] = ( + expand_embedded_dict(df["payload"], "HKTimeZone") + ) if "timezone" not in list(df): if "payload.HKTimeZone" in list(df): hk_tz_idx = df["payload.HKTimeZone"].notnull() From dc9f19e19016d8946c51927b2dc3915278c1b3a3 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 7 Aug 2019 20:17:57 -0500 Subject: [PATCH 21/46] remove spike data --- .../get_stats/get_cgm_stats.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 0fcc1c3c..91bf2d9b 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -480,6 +480,27 @@ def removeCgmDuplicates(df, timeCriterion): return df, nDuplicatesRemoved +# get rid of spike data +def remove_spike_data(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + + df[spike_loc] = get_embedded_field(df["origin"], spike_loc) + spike_idx = df.loc[ + df[spike_loc].notnull(), + spike_loc + ].str.lower().str.contains("spike") + df.drop(df.iloc[np.where(spike_idx)[0]].index, inplace=True) + nRemoved = nBefore - len(df) + + return df, nRemoved + + # %% ESTIMATE LOCAL TIME FUNCTIONS def create_contiguous_day_series(df): first_day = df["date"].min() @@ -1292,6 +1313,10 @@ def estimate_local_time(df): # calculate cgm in mg/dL cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) +# get rid of spike data +cgm, nSpike = remove_spike_data(cgm) +metadata["nSpike"] = nSpike + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) metadata["nInvalidCgmValues"] = nInvalidCgmValues From be7177ef3e911284f64787489e25cc1d40af4e37 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 12 Aug 2019 10:20:14 -0500 Subject: [PATCH 22/46] make sure there is timezone information --- .../get_stats/get_cgm_stats.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index 91bf2d9b..a36e70bc 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -566,22 +566,22 @@ def add_device_day_series(df, 
dfContDays, deviceTypeName): if "upload" in deviceTypeName: if "timezone" in df: -# if dfDayGroups.timezone.count().values[0] > 0: # NOT SURE WHY THIS IS HERE - dfDaySeries["timezone"] = ( - dfDayGroups.timezone.describe()["top"] - ) - # get the timezone offset for the timezone - for i in dfDaySeries.index: - if pd.notnull(dfDaySeries.loc[i, "timezone"]): - tzo = get_timezone_offset( - pd.to_datetime(i), - dfDaySeries.loc[i, "timezone"]) - dfDaySeries.loc[i, ["timezoneOffset"]] = tzo - if "timeProcessing" in dfDaySeries: - dfDaySeries["timeProcessing"] = \ - dfDayGroups.timeProcessing.describe()["top"] - else: - dfDaySeries["timeProcessing"] = np.nan + if dfDayGroups.timezone.count().max() > 0: + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \ rename(columns={deviceTypeName + ".date": "date"}) From 07b211e77f1578fe65029a2d75c794158ffc8c26 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 12 Aug 2019 19:06:18 -0500 Subject: [PATCH 23/46] refactor remove spike data --- .../get_stats/get_cgm_stats.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py index a36e70bc..172f4784 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py @@ -482,21 +482,26 @@ def removeCgmDuplicates(df, timeCriterion): # get rid of spike data def remove_spike_data(df): - nBefore = len(df) - spike_locations = [ - "origin.payload.device.name", - "origin.payload.device.manufacturer", - "origin.payload.sourceRevision.source.name", - ] - for spike_loc in spike_locations: - - df[spike_loc] = get_embedded_field(df["origin"], spike_loc) - spike_idx = df.loc[ - df[spike_loc].notnull(), - spike_loc - ].str.lower().str.contains("spike") - df.drop(df.iloc[np.where(spike_idx)[0]].index, inplace=True) - nRemoved = nBefore - len(df) + if "origin" in list(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + + df[spike_loc] = get_embedded_field(df["origin"], spike_loc) + spike_idx = df.loc[ + df[spike_loc].notnull(), + spike_loc + ].astype(str).str.lower().str.contains("spike") + + df.drop((spike_idx == True).index, inplace=True) + nRemoved = nBefore - len(df) + + else: + nRemoved = np.nan return df, nRemoved From a240bd7625ae43d8c394a7c5990ff5e7863ba341 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Fri, 16 Aug 2019 12:18:24 -0500 Subject: [PATCH 24/46] wip cgm distributions --- .gitignore | 2 + .../get_stats/get_cgm_distributions_v3.py | 1777 +++++++++++++++++ 2 files changed, 1779 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py diff --git a/.gitignore b/.gitignore index 0c1ca188..f4cf204c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ 
projects/loop-algorithm/figures/ projects/parsers/output/ projects/get-donors-pump-settings/temp-plot\.html + +projects/bigdata-processing-pipeline/get_stats/debug/ diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py new file mode 100644 index 00000000..5e670608 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -0,0 +1,1777 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm statsistics for a single tidepool (donor) dataset +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt +import ast +import pdb +# TODO: figure out how to get rid of these path dependcies +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +import environmentalVariables +from get_donor_data.get_single_donor_metadata import get_shared_metadata +from get_donor_data.get_single_tidepool_dataset import get_data + + +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def get_slope(y): + if "array" not in type(y).__name__: + raise TypeError('Expecting a numpy array') + + count_ = len(y) + + x = np.arange(start=0, stop=count_*5, step=5) + + sum_x = x.sum() + sum_y = y.sum() + sum_xy = (x * y).sum() + sum_x_squared = (x * x).sum() + + slope = ( + ((count_ * sum_xy) - (sum_x * sum_y)) + / ((count_ * sum_x_squared) - (sum_x * sum_x)) + ) + + return slope + + +def expand_entire_dict(ts): + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + notnull_idx = ts.index[ts.notnull()] + temp_df = pd.DataFrame( + ts[notnull_idx].tolist(), + index=notnull_idx + ) + + return temp_df + + +def expand_embedded_dict(ts, key_): + '''Expanded a single field that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process + ''' + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? + # though, the current method is fairly quick and compact + temp_df = expand_entire_dict(ts) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." 
notation is used to reference nested json + + ''' + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts + + +def add_upload_info_to_cgm_records(groups, df): + upload_locations = [ + "uploadId", + "deviceManufacturers", + "deviceModel", + "deviceSerialNumber", + "deviceTags" + ] + + if "upload" in groups["type"].unique(): + upload = groups.get_group("upload").dropna(axis=1, how="all") + df = pd.merge( + left=df, + right=upload[list(set(upload_locations) & set(list(upload)))], + on="uploadId", + how="left" + ) + + return df + + +def expand_heathkit_cgm_fields(df): + healthkit_locations = [ + "origin", + "origin.payload", + "origin.payload.device", + "origin.payload.sourceRevision", + "origin.payload.sourceRevision.source", + "payload", + ] + + for hk_loc in healthkit_locations: + if hk_loc in list(df): + temp_df = ( + expand_entire_dict(df[hk_loc].copy()).add_prefix(hk_loc + ".") + ) + df = pd.concat([df, temp_df], axis=1) + + return df + + +def get_dexcom_cgm_model(df): + # add cgm model + # put this list in order of precedence when choosing sensor version + # NOTE: there is an edge case where "origin.payload.device.model" = G5/G6, + # which can be eliminated by getting model from HKMetadataKeySyncIdentifier + dexcom_model_locations = [ + "deviceId", + "deviceManufacturers", + "deviceModel", + "deviceSerialNumber", + "payload.HKMetadataKeySyncIdentifier", # do this before "origin.payload.device.model" bc there is an edge case + "origin.payload.device.model", + "origin.payload.sourceRevision.source.name", + "payload.transmitterGeneration", + "payload.transmitterId", + ] + + for model_location in dexcom_model_locations: + if model_location in list(df): + # only consider cells where the model location is not null + notnull_idx = df[model_location].notnull() + if notnull_idx.sum() > 0: + for dex_model in ["G4", "G5", "G6"]: + # define a pandas stringMethod + str_list = df[model_location].astype(str).str + # if model has already been determined, then skip + missing_model_idx = df["cgmModel"].isnull() + # get index that matches model + model_idx = str_list.upper().str.contains(dex_model) + + m_idx = ( + missing_model_idx & notnull_idx & model_idx + ) + df.loc[m_idx, "cgmModel"] = dex_model + + # case of "payload.transmitterId" + if ( + ("payload.transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # get string length (need 5 digits for G4 and 6 for G5, G6) + if "G4" in dex_model: + model_idx = str_list.len() == 5 + elif "G5" in dex_model: + model_idx = str_list.startswith("4") + elif "G6" in dex_model: + model_idx = ( + (str_list.startswith("8")) + | (str_list.startswith("2")) + ) + m_idx = ( + missing_model_idx & notnull_idx & model_idx + ) + df.loc[m_idx, "cgmModel"] = dex_model + + return df["cgmModel"] + + +def get_non_dexcom_cgm_model(df): + # non-dexcom cgm model query + model_locations = ["deviceId"] + models_670G = "MMT-158|MMT-178" + models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" + models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" + models_530G = ( + "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754" + ) + models_523_723 = "MedT-523|MedT-723|Revel - 523|Revel - 723" # 523/723 + models_libre = "AbbottFreeStyleLibre" + models_animas = "IR1295" + # NOTE: the tandem G4 will first be written as 
G5_G6, + # but the logic should overwrite back to G4 + models_tandem_G5_G6 = "tandem" + models_tandem_G4 = "4628003|5448003" + + non_dex_models = [ + models_670G, models_640G, models_630G, models_530G, models_523_723, + models_libre, models_animas, models_tandem_G5_G6, models_tandem_G4 + ] + + non_dex_model_names = [ + "670G", "640G", "630G", "530G", "523_723", + "LIBRE", "G4", "G5_G6", "G4" + ] + + for model_loc in model_locations: + if model_loc in list(df): + # only consider cells where the model location is not null + # and we are missing a cgm model + notnull_idx = df[model_loc].notnull() + if notnull_idx.sum() > 0: + missing_model_idx = df["cgmModel"].isnull() + if missing_model_idx.sum() > 0: + # define a pandas stringMethod + str_list = df[model_loc].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): + model_idx = str_list.contains(non_dex_model) + m_idx = (missing_model_idx & notnull_idx & model_idx) + df.loc[m_idx, "cgmModel"] = model_name + + return df["cgmModel"] + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + TODO: I think that durations are coming in as floats too, so we need + to refactor to account for that. 
+ ''' + if "duration" in list(df): + type_ = df["duration"].apply(get_type) + valid_index = ((type_ == "int") & (df["duration"].notnull())) + n_negative_durations = sum(df.loc[valid_index, "duration"] < 0) + if n_negative_durations > 0: + df = df[~(df.loc[valid_index, "duration"] < 0)] + else: + n_negative_durations = np.nan + + return df, n_negative_durations + + + + + +def tslim_calibration_fix(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored to only expand one field + ''' + + # expand payload field one level + if "payload" in list(df): + df["payload.calibration_reading"] = ( + expand_embedded_dict(df["payload"], "calibration_reading") + ) + + if df["payload.calibration_reading"].notnull().sum() > 0: + + search_for = ['tan'] + tandem_data_index = ( + (df["deviceId"].str.contains('|'.join(search_for))) + & (df["type"] == "deviceEvent") + ) + + cal_index = df["payload.calibration_reading"].notnull() + valid_index = tandem_data_index & cal_index + + n_cal_readings = sum(valid_index) + + if n_cal_readings > 0: + # if reading is > 30 then it is in the wrong units + if df["payload.calibration_reading"].min() > 30: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + / MGDL_PER_MMOLL + ) + else: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + ) + else: + n_cal_readings = 0 + else: + n_cal_readings = 0 + return df, n_cal_readings + + +def replace_smoothed_cgm_values(df): + + if 'payload.realTimeValue' in list(df): + raw_val_idx = df['payload.realTimeValue'].notnull() + n_replaced = raw_val_idx.sum() + df.loc[raw_val_idx, "mg/dL"] = ( + df.loc[raw_val_idx, "payload.realTimeValue"] + ) + else: + n_replaced = np.nan + + raw_values = df["mg/dL"] + + return raw_values, n_replaced + + +def get_healthkit_timezone(df): + ''' + TODO: refactor to account for more efficient way to get embedded json + ''' + if "payload" in list(df): + df["payload.HKTimeZone"] = ( + expand_embedded_dict(df["payload"], "HKTimeZone") + ) + if "timezone" not in list(df): + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + else: + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "timezone"] = ( + df.loc[hk_tz_idx, "payload.HKTimeZone"] + ) + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + return df[["timezone", "deviceType"]] + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = get_healthkit_timezone(df) + + df["timezone"].fillna(method='ffill', inplace=True) + df["timezone"].fillna(method='bfill', inplace=True) + + return df["timezone"] + + +def make_tz_unaware(date_time): + return date_time.replace(tzinfo=None) + + +def to_utc_datetime(df): + ''' + this is new to deal with perfomance issue with the previous method + of converting to string to datetime with pd.to_datetime() + ''' + 
utc_time_tz_aware = pd.to_datetime( + df["time"], + format="%Y-%m-%dT%H:%M:%S", + utc=True + ) + utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) + + return utc_tz_unaware + + +# apply the large timezone offset correction (AKA Darin's fix) +def timezone_offset_bug_fix(df): + ''' + this is taken from estimate-local-time.py + TODO: add in unit testing where there is no TZP that is > 840 or < -720 + ''' + + if "timezoneOffset" in list(df): + + while ((df.timezoneOffset > 840).sum() > 0): + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] + - (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 + ) + + while ((df.timezoneOffset < -720).sum() > 0): + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] + + (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 + ) + + return df + + +def get_local_time(df): + + tzo = df[['utcTime', 'inferredTimezone']].apply( + lambda x: get_timezone_offset(*x), axis=1 + ) + local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") + + return local_time + + +def round_time( + df, + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +): + ''' + A general purpose round time function that rounds the "time" + field to nearest minutes + INPUTS: + * a dataframe (df) or time series that contains only one time field + that you want to round + * time_interval_minutes (defaults to 5 minutes given that most cgms + output every 5 minutes) + * start_with_first_record starts the rounding with the first record + if True, and the last record if False (defaults to True) + * return_calculation_columns specifies whether the extra columns + used to make calculations are returned + refactored name(s) to meet style guide + ''' + # if a time series is passed in, convert to dataframe + if "Series" in get_type(df): + df = pd.DataFrame(df) + columns_ = list(df) + if len(columns_) > 1: + sys.exit( + "Error: df should only have one time column" + ) + else: + df.rename(columns={columns_[0]: "t"}, inplace=True) + + df.sort_values( + by="t", + ascending=start_with_first_record, + inplace=True + ) + + df.reset_index(drop=False, inplace=True) + df.rename(columns={"index": "originalIndex"}, inplace=True) + + # calculate the time between consecutive records + df["t_shift"] = df["t"].shift(1) + df["timeBetweenRecords"] = round( + (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) + + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) + ) * time_interval_minutes + + # separate the data into chunks if timeBetweenRecords is greater than + # 2 times the minutes so the rounding process + # starts over + big_gaps = list( + df.query("abs(timeBetweenRecords) > " + + str(time_interval_minutes * 2)).index + ) + big_gaps.insert(0, 0) + big_gaps.append(len(df)) + + for gap_index in range(0, len(big_gaps) - 1): + chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] + first_chunk = df["t"][big_gaps[gap_index]] + + # calculate the time difference between + # each time record and the first record + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] = ( + (chunk - first_chunk).dt.days*(86400/60) + + (chunk - first_chunk).dt.seconds/60 + ) + + # then round to the nearest X Minutes + # 
NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up. + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ] = round( + (df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] / time_interval_minutes) + 0.000001 + ) * (time_interval_minutes) + + rounded_first_record = ( + first_chunk + pd.Timedelta("1microseconds") + ).round(str(time_interval_minutes) + "min") + + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedTime" + ] = rounded_first_record + pd.to_timedelta( + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ], unit="m" + ) + + if return_calculation_columns is False: + df.drop( + columns=[ + "timeBetweenRecords", + "minutesFromFirstRecord", + "roundedMinutesFromFirstRecord" + ], inplace=True + ) + # sort back to the original index + df.sort_values(by="originalIndex", inplace=True) + + return df["roundedTime"].values + + +def add_upload_time(df): + ''' + this is taken from a colab notebook that is not in our github + given that it has been refactored to account for bug where there are + no upload records + NOTE: this is a new fix introduced with healthkit data...we now have + data that does not have an upload record + + ''' + + if "upload" in df.type.unique(): + upload_times = pd.DataFrame( + df[df.type == "upload"].groupby("uploadId")["utcTime"].max() + ) + else: + upload_times = pd.DataFrame(columns=["utcTime"]) + + unique_uploadIds = set(df["uploadId"].unique()) + unique_uploadRecords = set( + df.loc[df["type"] == "upload", "uploadId"].unique() + ) + uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords + + for upId in uploadIds_missing_uploadRecords: + last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() + upload_times.loc[upId, "utcTime"] = last_upload_time + + upload_times.reset_index(inplace=True) + upload_times.rename( + columns={"utcTime": "uploadTime", + "index": "uploadId"}, + inplace=True + ) + + df = pd.merge(df, upload_times, how='left', on='uploadId') + + return df["uploadTime"].values + + +def remove_invalid_cgm_values(df): + + nBefore = len(df) + # remove values < 38 and > 402 mg/dL + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & + (df["mg/dL"] > 402))].index) + nRemoved = nBefore - len(df) + + return df, nRemoved + + +def removeDuplicates(df, criteriaDF): + nBefore = len(df) + df = df.loc[~(df[criteriaDF].duplicated())] + df = df.reset_index(drop=True) + nDuplicatesRemoved = nBefore - len(df) + + return df, nDuplicatesRemoved + + +def removeCgmDuplicates(df, timeCriterion, valueCriterion="value"): + if timeCriterion in df: + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + dfIsNull = df[df[timeCriterion].isnull()] + dfNotNull = df[df[timeCriterion].notnull()] + dfNotNull, nDuplicatesRemoved = ( + removeDuplicates(dfNotNull, [timeCriterion, valueCriterion]) + ) + df = pd.concat([dfIsNull, dfNotNull]) + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + else: + nDuplicatesRemoved = 0 + + return df, nDuplicatesRemoved + + +# get rid of spike data +def remove_spike_data(df): + if "origin" in list(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + df[spike_loc] = get_embedded_field(df["origin"], 
spike_loc) + + spike_idx = df.loc[ + df[spike_loc].notnull(), + spike_loc + ].astype(str).str.lower().str.contains("spike") + + df.drop((spike_idx == True).index, inplace=True) + + nRemoved = nBefore - len(df) + + else: + nRemoved = np.nan + + return df, nRemoved + + +# %% ESTIMATE LOCAL TIME FUNCTIONS +def convert_deprecated_timezone_to_alias(df, tzAlias): + if "timezone" in df: + uniqueTimezones = df.timezone.unique() + uniqueTimezones = uniqueTimezones[pd.notnull(df.timezone.unique())] + + for uniqueTimezone in uniqueTimezones: + alias = tzAlias.loc[tzAlias.tz.str.endswith(uniqueTimezone), + ["alias"]].values + if len(alias) == 1: + df.loc[df.timezone == uniqueTimezone, ["timezone"]] = alias + + return df + + +def create_contiguous_day_series(df): + first_day = df["date"].min() + last_day = df["date"].max() + rng = pd.date_range(first_day, last_day).date + contiguousDaySeries = \ + pd.DataFrame(rng, columns=["date"]).sort_values( + "date", ascending=False).reset_index(drop=True) + + return contiguousDaySeries + + +def add_device_type(df): + col_headings = list(df) + if "deviceType" not in col_headings: + df["deviceType"] = np.nan + if "deviceTags" in col_headings: + # first make sure deviceTag is in string format + df["deviceTags"] = df.deviceTags.astype(str) + # filter by type not null device tags + ud = df[df["deviceTags"].notnull()].copy() + # define a device type (e.g., pump, cgm, or healthkit) + ud.loc[ + ((ud["deviceTags"].str.contains("pump")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "pump" + + # define a device type (e.g., cgm) + ud.loc[ + ((ud["deviceTags"].str.contains("cgm")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "cgm" + + return ud["deviceType"] + else: + return np.nan + + +def get_timezone_offset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def add_device_day_series(df, dfContDays, deviceTypeName): + if len(df) > 0: + dfDayGroups = df.groupby("date") + if "timezoneOffset" in df: + dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) + else: + dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) + dfDaySeries.index.name = "date" + + if "upload" in deviceTypeName: + if (("timezone" in df) & (df["timezone"].notnull().sum() > 0)): + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan + + + dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). 
\ + rename(columns={deviceTypeName + ".date": "date"}) + + dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), + on="date", how="left") + + else: + dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan + + return dfContDays + + +def impute_upload_records(df, contDays, deviceTypeName): + daySeries = \ + add_device_day_series(df, contDays, deviceTypeName) + + if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): + for i in daySeries.index[1:]: + if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): + daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( + daySeries.loc[i-1, deviceTypeName + ".timezone"] + ) + if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): + tz = daySeries.loc[i, deviceTypeName + ".timezone"] + tzo = get_timezone_offset( + pd.to_datetime(daySeries.loc[i, "date"]), + tz + ) + daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo + + if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): + daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ + daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] + + else: + daySeries[deviceTypeName + ".timezone"] = np.nan + daySeries[deviceTypeName + ".timeProcessing"] = np.nan + + return daySeries + + +def add_home_timezone(df, contDays): + + if (("timezone" in df) & (df["timezone"].notnull().sum()> 0)): + homeTimezone = df["timezone"].describe()["top"] + tzo = contDays.date.apply( + lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) + + contDays["home.imputed.timezoneOffset"] = tzo + contDays["home.imputed.timezone"] = homeTimezone + + else: + contDays["home.imputed.timezoneOffset"] = np.nan + contDays["home.imputed.timezone"] = np.nan + contDays["home.imputed.timeProcessing"] = np.nan + + return contDays + + +def estimateTzAndTzoWithUploadRecords(cDF): + + cDF["est.type"] = np.nan + cDF["est.gapSize"] = np.nan + cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] + cDF["est.annotations"] = np.nan + + if "upload.timezone" in cDF: + cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" + cDF["est.timezone"] = cDF["upload.timezone"] + cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] + else: + cDF["est.timezone"] = np.nan + cDF["est.timeProcessing"] = np.nan + + cDF.loc[((cDF["est.timezoneOffset"] != + cDF["home.imputed.timezoneOffset"]) & + (pd.notnull(cDF["est.timezoneOffset"]))), + "est.annotations"] = "travel" + + return cDF + + +def assignTzoFromImputedSeries(df, i, imputedSeries): + df.loc[i, ["est.type"]] = "DEVICE" + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, imputedSeries + ".timezoneOffset"] + + df.loc[i, ["est.timezone"]] = \ + df.loc[i, imputedSeries + ".timezone"] + + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, imputedSeries + ".timeProcessing"] + + return df + + +def compareDeviceTzoToImputedSeries(df, sIdx, device): + for i in sIdx: + # if the device tzo = imputed tzo, then chose the imputed tz and tzo + # note, dst is accounted for in the imputed tzo + for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", + "healthkit.upload.imputed", "home.imputed"]: + # if the estimate has not already been made + if pd.isnull(df.loc[i, "est.timezone"]): + + if df.loc[i, device + ".timezoneOffset"] == \ + df.loc[i, imputedSeries + ".timezoneOffset"]: + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, + "tz-inferred-from-" + imputedSeries) + + # if the imputed series has a timezone estimate, then see if + # the current day is a dst change day + elif (pd.notnull(df.loc[i, imputedSeries + 
".timezone"])): + imputedTimezone = df.loc[i, imputedSeries + ".timezone"] + if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): + + dstRange = getRangeOfTZOsForTimezone(imputedTimezone) + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, "dst-change-day") + df = addAnnotation( + df, i, "tz-inferred-from-" + imputedSeries) + + return df + + +def estimateTzAndTzoWithDeviceRecords(cDF): + + # 2A. use the TZO of the pump or cgm device if it exists on a given day. In + # addition, compare the TZO to one of the imputed day series (i.e., the + # upload and home series to see if the TZ can be inferred) + for deviceType in ["pump", "cgm"]: + # find the indices of days where a TZO estimate has not been made AND + # where the device (e.g., pump or cgm) TZO has data + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + # compare the device TZO to the imputed series to infer time zone + cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) + + # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be + # inferred from the previous day's TZO. If the device TZO is equal to the + # previous day's TZO, AND if the previous day has a TZ estimate, use the + # previous day's TZ estimate for the current day's TZ estimate + for deviceType in ["pump", "cgm"]: + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + + cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) + + # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the + # pump and cgm tzo do not differ by more than 60 minutes. If they differ + # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we + # allow the estimates to be off by 60 minutes as there are a lot of cases + # where the devices are off because the user changes the time for DST, + # at different times + sIndices = cDF[((cDF["est.type"] == "DEVICE") & + (cDF["pump.timezoneOffset"].notnull()) & + (cDF["cgm.timezoneOffset"].notnull()) & + (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) + )].index + + tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - + cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 + + idx = tzoDiffGT60.index[tzoDiffGT60] + + cDF.loc[idx, ["est.type"]] = "UNCERTAIN" + for i in idx: + cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") + + return cDF + + +def imputeTzAndTzo(cDF): + + sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + if len(hasTzoIndices) > 0: + if len(sIndices) > 0: + lastDay = max(sIndices) + + while ((sIndices.min() < max(hasTzoIndices)) & + (len(sIndices) > 0)): + + currentDay, prevDayWithDay, nextDayIdx = \ + getImputIndices(cDF, sIndices, hasTzoIndices) + + cDF = imputeByTimezone(cDF, currentDay, + prevDayWithDay, nextDayIdx) + + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (~cDF["est.annotations"].str.contains( + "unable-to-impute-tzo").fillna(False)))].index + + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + + # try to impute to the last day (earliest day) in the dataset + # if the last record has a timezone that is the home record, then + # impute using the home timezone + if len(sIndices) > 0: + currentDay = min(sIndices) + prevDayWithDay = currentDay - 1 + gapSize = lastDay - currentDay + + for i in range(currentDay, lastDay + 1): + if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ + cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = localTime + df.loc[dIdx, ["est.timezoneOffset"]] = tzo + return df + + +def applyLocalTimeEstimates(df, cDF): + df = pd.merge(df, cDF, how="left", on="date") + df["est.localTime"] = \ + df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") + + df = correctEstimatesAroundDst(df, cDF) + + return df["est.localTime"].values + + +def isDSTChangeDay(currentDate, currentTimezone): + tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), + currentTimezone) + tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + + dt.timedelta(days=-1), currentTimezone) + + return (tzoCurrentDay != tzoPreviousDay) + + +def tzoRangeWithComparisonTz(df, i, comparisonTz): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + if pd.notnull(comparisonTz): + rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) + else: + comparisonTz = np.nan + rangeTzos = np.array([]) + + return rangeTzos + + +def tzAndTzoRangePreviousDay(df, i): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + comparisonTz = df.loc[i-1, "est.timezone"] + + rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) + + return comparisonTz, rangeTzos + + +def assignTzoFromPreviousDay(df, i, previousDayTz): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezone"]] = previousDayTz + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) + + df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] + df = addAnnotation(df, i, "tz-inferred-from-prev-day") + + return df + + +def assignTzoFromDeviceTzo(df, i, device): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + df = addAnnotation(df, i, "likely-travel") + df = addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. + if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i, "home.imputed.timezoneOffset"] == + min(dstRange)) | + (df.loc[i, "home.imputed.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + + else: + df = assignTzoFromDeviceTzo(df, i, device) + + return df + + +def getImputIndices(df, sIdx, hIdx): + + lastDayIdx = len(df) - 1 + + currentDayIdx = sIdx.min() + tempList = pd.Series(hIdx) - currentDayIdx + prevDayIdx = currentDayIdx - 1 + nextDayIdx = \ + min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) + + return currentDayIdx, prevDayIdx, nextDayIdx + + +def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): + + gapSize = (nextDaywData - currentDay) + + if prevDaywData >= 0: + + if df.loc[prevDaywData, "est.timezone"] == \ + df.loc[nextDaywData, "est.timezone"]: + + tz = df.loc[prevDaywData, "est.timezone"] + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezone"]] = tz + + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + # TODO: this logic should be updated to handle the edge case + # where the day before and after the gap have differing TZ, but + # the same TZO. 
In that case the gap should be marked as UNCERTAIN + elif df.loc[prevDaywData, "est.timezoneOffset"] == \ + df.loc[nextDaywData, "est.timezoneOffset"]: + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[prevDaywData, "est.timezoneOffset"] + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + return df + + +def addAnnotation(df, idx, annotationMessage): + if pd.notnull(df.loc[idx, "est.annotations"]): + df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ + ", " + annotationMessage + else: + df.loc[idx, ["est.annotations"]] = annotationMessage + + return df + + +def getTimezoneOffset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def estimate_local_time(df): + df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later + contiguous_days = create_contiguous_day_series(df) + + df["deviceType"] = add_device_type(df) + cDays = add_device_day_series(df, contiguous_days, "upload") + + # create day series for cgm df + if "timezoneOffset" not in list(df): + df["timezoneOffset"] = np.nan + + cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + +# %% START OF CODE +all_metadata = pd.DataFrame() + +timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False +) + + +# %% GET DATA FROM API +#''' +#get metadata and data for a donor that has shared with bigdata +#NOTE: functions assume you have an .env with bigdata account credentials +#''' +# +#userid = "" +#donor_group = "" +# +#donor_metadata, _ = get_shared_metadata( +# donor_group=donor_group, +# userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid +#) +#data, _ = get_data( +# donor_group=donor_group, +# userid=userid, +# weeks_of_data=52*10 +#) +# +## this is a dummy loop +#for i in [0]: + + +# %% GET DATA FROM JSON FILE + +data_path = os.path.join("..", "data") +all_donor_metadata = pd.read_csv( + os.path.join( + data_path, + "PHI-2019-07-17-donor-metadata.csv"), + low_memory=False +) + +# glob through the json files that are available +import glob +all_files = glob.glob(os.path.join( + data_path, + "PHI-2019-07-17-json-data", + "*.json" +)) + +# %% +for d_idx in range(0, len(all_files)): + data = pd.read_json(all_files[d_idx]) + userid = all_files[d_idx][-15:-5] + donor_metadata = all_donor_metadata[ + all_donor_metadata["userid"] == userid + ] + print("\n", "starting", userid) + + # CREATE META DATAFRAME (metadata) + ''' + this is useful for keeping track of the type and amount of cleaning done + ''' + metadata = pd.DataFrame(index=[userid]) + + # HASH USER ID + hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) + data["userid"] = userid + data["hashid"] = hashid + + + # CLEAN DATA + data_fields = list(data) + + # NOTE: moving remove negative durations to type specific cleaning + # TODO: ask backend to change "duration" field to only include one object type + + # Tslim calibration bug fix + data, n_cal_readings = tslim_calibration_fix(data.copy()) + metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + # fix large timzoneOffset bug in utcbootstrapping + data = timezone_offset_bug_fix(data.copy()) + + # add healthkit timezome information + # TODO: refactor this function to only require fields that might have hk tz + data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) + + # convert deprecated timezones to their aliases + data = convert_deprecated_timezone_to_alias(data, timezone_aliases) + + + # TIME RELATED ITEMS + data["utcTime"] = to_utc_datetime(data[["time"]].copy()) + + # add upload time to the data, which is needed for: + # getting rid of duplicates and useful for getting local time + + data["uploadTime"] = ( + add_upload_time(data[["type", "uploadId", "utcTime"]].copy()) + ) + +# # estimate local time (refactor of estimate-local-time.py) +# data["localTime"], local_time_metadata = estimate_local_time(data.copy()) + + # TODO: fix this issue with estimate local time + ''' + //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649: + FutureWarning: elementwise comparison failed; returning scalar instead, + but in the future will perform elementwise comparison result = method(y) + ''' + + # round all data to the nearest 5 minutes + data["roundedUtcTime"] = round_time( + data["utcTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False + ) + + + # TIME CATEGORIES + # AGE, & YLW + bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) + if 
data["roundedUtcTime"].notnull().sum() == 0: + data["age"] = np.nan + else: + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + + + # GROUP DATA BY TYPE + # first sort by upload time (used when removing dumplicates) + data.sort_values("uploadTime", ascending=False, inplace=True) + groups = data.groupby(by="type") + + + # CGM DATA + if "cbg" in data["type"].unique(): + # filter by cgm + cgm = groups.get_group("cbg").dropna(axis=1, how="all") + + # calculate cgm in mg/dL + cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) + + # get rid of spike data + cgm, nSpike = remove_spike_data(cgm.copy()) + metadata["nSpike"] = nSpike + + # TODO: refactor (above) so you don't need to drop columns + drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(cgm): + cgm.drop(columns=[drop_col], inplace=True) + + # assign upload cgm device info to cgm records in that upload + cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) + + # check to see if cgm info exists in healthkit locations + cgm = expand_heathkit_cgm_fields(cgm.copy()) + + # replace smoothed cgm values with raw values (if they exist) + # this must run after expand_heathkit_cgm_fields _ + cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = ( + replace_smoothed_cgm_values(cgm.copy()) + ) + + # get cgm models + cgm["cgmModel"] = np.nan + + # dexcom cgm models (G4, G5, G6) + cgm["cgmModel"] = get_dexcom_cgm_model(cgm.copy()) + + # for non dexcom cgms + # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem + cgm["cgmModel"] = get_non_dexcom_cgm_model(cgm.copy()) + + # get metadata on cgm models and devices + metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() + metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique()) + if "deviceId" in list(cgm): + metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) + + # %% clean distributions + # break up all traces by cgm model + all_cgm_series = pd.DataFrame() + cgm_models = cgm.groupby(by="cgmModel") +# for cgm_model in cgm["cgmModel"].unique(): + for cgm_model in cgm_models.groups.keys(): + print(cgm_model) + temp_cgm = cgm_models.get_group(cgm_model) + + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) + temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm) + metadata["nInvalidCgmValues." + cgm_model] = nInvalidCgmValues + + # sort by upload time before getting rid of duplicates + temp_cgm.sort_values("uploadTime", ascending=False, inplace=True) + + # get rid of duplicates that have the same ["deviceTime", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same ["time", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same roundedTime + temp_cgm, n_cgm_dups_removed = ( + removeDuplicates(temp_cgm, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime." 
+ cgm_model] = ( + n_cgm_dups_removed + ) + + # create a contiguous 5 minute time series + first_day = temp_cgm["roundedUtcTime"].min() + last_day = temp_cgm["roundedUtcTime"].max() + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=False + ).reset_index(drop=True) + + # merge with cgm data + cgm_series = pd.merge( + contiguous_data, + temp_cgm[ + ["roundedUtcTime", "hashid", "cgmModel", "age", "mg/dL"] + ], + on="roundedUtcTime", + how="left" + ) + # + cgm_series.sort_values( + "roundedUtcTime", ascending=True, inplace=True + ) + cgm_series.reset_index(drop=True, inplace=True) + + # get dexcom icgm bins + value_bins = np.array( + [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403] + ) + value_bin_names = ( + "< 40", "40-60", "61-80", "81-120", "121-160", "161-200", + "201-250", "251-300", "301-350", "351-400", "> 400" + ) + cgm_series["valueBin"] = pd.cut( + cgm_series["mg/dL"], value_bins, labels=value_bin_names + ) + + + # get the previous val + cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) + + # get difference between current and previous val + cgm_series["diffFromPrevVal"] = ( + cgm_series["mg/dL"] - cgm_series["previousVal"] + ) + + # calculate the rate from previous value (mg/dL/min) + cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5 + + # get dexcom icgm rate bins + rate_bins = np.array( + [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100] + ) + # NOTE: bracket means include, parentheses means exclude + rate_bin_names = ( + "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2", + ) + cgm_series["rateBin"] = pd.cut( + cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names + ) + + # through in the join category + cgm_series["valAndRateBin"] = ( + cgm_series["valueBin"].astype(str) + + " & " + + cgm_series["rateBin"].astype(str) + ) + + # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes + cgm_series["slope15"] = ( + cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True) + ) + + cgm_series["slope30"] = ( + cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True) + ) + + cgm_series["slope60"] = ( + cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True) + ) + + # add in the next value + cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1) + + # get difference or relative increase/decrease of next value + cgm_series["relativeNextValue"] = ( + cgm_series["nextVal"] - cgm_series["mg/dL"] + ) + + # rate of next value + cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5 + + # drop rows where there is no information + cgm_series.dropna(subset=['hashid'], inplace=True) + metadata["nCgmDataPoints." 
+ cgm_model] = len(cgm_series) + + # append cgm model to a larger table + all_cgm_series = pd.concat( + [all_cgm_series, cgm_series], + ignore_index=True + ) + + + # %% END OF CODE + print(metadata.T) + + else: + print(d_idx, "no cgm data") + + # combine metadata + all_metadata = pd.concat([all_metadata, metadata], sort=False) + print("finished", d_idx, userid) + + + From 890bac8d3a796d5038f8e6c868cbde7408a7e0e6 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 07:42:34 -0500 Subject: [PATCH 25/46] save output --- .../get_stats/get_cgm_distributions_v3.py | 61 ++++-- .../wikipedia-timezone-aliases-2018-04-28.csv | 206 ++++++++++++++++++ 2 files changed, 248 insertions(+), 19 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 5e670608..9f2bee79 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -13,7 +13,7 @@ import numpy as np import pandas as pd import datetime as dt -import ast +import glob import pdb # TODO: figure out how to get rid of these path dependcies get_donor_data_path = os.path.abspath( @@ -24,7 +24,7 @@ import environmentalVariables from get_donor_data.get_single_donor_metadata import get_shared_metadata from get_donor_data.get_single_tidepool_dataset import get_data - +from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist # %% CONSTANTS MGDL_PER_MMOLL = 18.01559 @@ -1465,22 +1465,39 @@ def estimate_local_time(df): # %% GET DATA FROM JSON FILE - data_path = os.path.join("..", "data") all_donor_metadata = pd.read_csv( os.path.join( data_path, + "PHI-2019-07-17-donor-data", "PHI-2019-07-17-donor-metadata.csv"), low_memory=False ) # glob through the json files that are available -import glob -all_files = glob.glob(os.path.join( +all_files = glob.glob( + os.path.join( + data_path, + "dremio", + "**", + "*.json" + ), + recursive=True +) + +output_metadata = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-metadata" +) +output_distribution = os.path.join( data_path, - "PHI-2019-07-17-json-data", - "*.json" -)) + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-distributions" +) + +make_folder_if_doesnt_exist([output_metadata, output_distribution]) + # %% for d_idx in range(0, len(all_files)): @@ -1502,7 +1519,6 @@ def estimate_local_time(df): data["userid"] = userid data["hashid"] = hashid - # CLEAN DATA data_fields = list(data) @@ -1523,7 +1539,6 @@ def estimate_local_time(df): # convert deprecated timezones to their aliases data = convert_deprecated_timezone_to_alias(data, timezone_aliases) - # TIME RELATED ITEMS data["utcTime"] = to_utc_datetime(data[["time"]].copy()) @@ -1552,7 +1567,6 @@ def estimate_local_time(df): return_calculation_columns=False ) - # TIME CATEGORIES # AGE, & YLW bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) @@ -1561,15 +1575,15 @@ def estimate_local_time(df): else: data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) - # GROUP DATA BY TYPE # first sort by upload time (used when removing dumplicates) data.sort_values("uploadTime", ascending=False, inplace=True) groups = data.groupby(by="type") - # CGM DATA if "cbg" in data["type"].unique(): + metadata["cgmData"] = True + # filter by 
cgm cgm = groups.get_group("cbg").dropna(axis=1, how="all") @@ -1622,7 +1636,7 @@ def estimate_local_time(df): # break up all traces by cgm model all_cgm_series = pd.DataFrame() cgm_models = cgm.groupby(by="cgmModel") -# for cgm_model in cgm["cgmModel"].unique(): + for cgm_model in cgm_models.groups.keys(): print(cgm_model) temp_cgm = cgm_models.get_group(cgm_model) @@ -1697,13 +1711,12 @@ def estimate_local_time(df): cgm_series["mg/dL"], value_bins, labels=value_bin_names ) - # get the previous val cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) # get difference between current and previous val cgm_series["diffFromPrevVal"] = ( - cgm_series["mg/dL"] - cgm_series["previousVal"] + cgm_series["mg/dL"] - cgm_series["previousVal"] ) # calculate the rate from previous value (mg/dL/min) @@ -1746,7 +1759,7 @@ def estimate_local_time(df): # get difference or relative increase/decrease of next value cgm_series["relativeNextValue"] = ( - cgm_series["nextVal"] - cgm_series["mg/dL"] + cgm_series["nextVal"] - cgm_series["mg/dL"] ) # rate of next value @@ -1762,15 +1775,25 @@ def estimate_local_time(df): ignore_index=True ) - - # %% END OF CODE + # save distribution data + all_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv" + )) print(metadata.T) else: + metadata["cgmData"] = False print(d_idx, "no cgm data") # combine metadata all_metadata = pd.concat([all_metadata, metadata], sort=False) + + # save metadata + all_metadata.to_csv(os.path.join( + output_metadata, + "PHI-" + userid + "-cgm-metadata.csv" + )) print("finished", d_idx, userid) diff --git a/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv new file mode 100644 index 00000000..01370b69 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv @@ -0,0 +1,206 @@ +tz,alias +Africa/Addis_Ababa,Africa/Nairobi +Africa/Asmara,Africa/Nairobi +Africa/Bamako,Africa/Abidjan +Africa/Bangui,Africa/Lagos +Africa/Banjul,Africa/Abidjan +Africa/Blantyre,Africa/Maputo +Africa/Brazzaville,Africa/Lagos +Africa/Bujumbura,Africa/Maputo +Africa/Conakry,Africa/Abidjan +Africa/Dakar,Africa/Abidjan +Africa/Dar_es_Salaam,Africa/Nairobi +Africa/Djibouti,Africa/Nairobi +Africa/Douala,Africa/Lagos +Africa/Freetown,Africa/Abidjan +Africa/Gaborone,Africa/Maputo +Africa/Harare,Africa/Maputo +Africa/Kampala,Africa/Nairobi +Africa/Kigali,Africa/Maputo +Africa/Kinshasa,Africa/Lagos +Africa/Libreville,Africa/Lagos +Africa/Lome,Africa/Abidjan +Africa/Luanda,Africa/Lagos +Africa/Lubumbashi,Africa/Maputo +Africa/Lusaka,Africa/Maputo +Africa/Malabo,Africa/Lagos +Africa/Maseru,Africa/Johannesburg +Africa/Mbabane,Africa/Johannesburg +Africa/Mogadishu,Africa/Nairobi +Africa/Niamey,Africa/Lagos +Africa/Nouakchott,Africa/Abidjan +Africa/Ouagadougou,Africa/Abidjan +Africa/Porto-Novo,Africa/Lagos +Africa/Sao_Tome,Africa/Lagos +Africa/Timbuktu,Africa/Abidjan +America/Anguilla,America/Port_of_Spain +America/Antigua,America/Port_of_Spain +America/Argentina/ComodRivadavia,America/Argentina/Catamarca +America/Aruba,America/Curacao +America/Atka,America/Adak +America/Buenos_Aires,America/Argentina/Buenos_Aires +America/Catamarca,America/Argentina/Catamarca +America/Cayman,America/Panama +America/Coral_Harbour,America/Atikokan +America/Cordoba,America/Argentina/Cordoba +America/Dominica,America/Port_of_Spain +America/Ensenada,America/Tijuana 
+America/Fort_Wayne,America/Indiana/Indianapolis +America/Grenada,America/Port_of_Spain +America/Guadeloupe,America/Port_of_Spain +America/Indianapolis,America/Indiana/Indianapolis +America/Jujuy,America/Argentina/Jujuy +America/Knox_IN,America/Indiana/Knox +America/Kralendijk,America/Curacao +America/Louisville,America/Kentucky/Louisville +America/Lower_Princes,America/Curacao +America/Marigot,America/Port_of_Spain +America/Mendoza,America/Argentina/Mendoza +America/Montreal,America/Toronto +America/Montserrat,America/Port_of_Spain +America/Porto_Acre,America/Rio_Branco +America/Rosario,America/Argentina/Cordoba +America/Santa_Isabel,America/Tijuana +America/Shiprock,America/Denver +America/St_Barthelemy,America/Port_of_Spain +America/St_Kitts,America/Port_of_Spain +America/St_Lucia,America/Port_of_Spain +America/St_Thomas,America/Port_of_Spain +America/St_Vincent,America/Port_of_Spain +America/Tortola,America/Port_of_Spain +America/Virgin,America/Port_of_Spain +Antarctica/McMurdo,Pacific/Auckland +Antarctica/South_Pole,Pacific/Auckland +Arctic/Longyearbyen,Europe/Oslo +Asia/Aden,Asia/Riyadh +Asia/Ashkhabad,Asia/Ashgabat +Asia/Bahrain,Asia/Qatar +Asia/Calcutta,Asia/Kolkata +Asia/Chongqing,Asia/Shanghai +Asia/Chungking,Asia/Shanghai +Asia/Dacca,Asia/Dhaka +Asia/Harbin,Asia/Shanghai +Asia/Istanbul,Europe/Istanbul +Asia/Kashgar,Asia/Urumqi[note1] +Asia/Katmandu,Asia/Kathmandu +Asia/Kuwait,Asia/Riyadh +Asia/Macao,Asia/Macau +Asia/Muscat,Asia/Dubai +Asia/Phnom_Penh,Asia/Bangkok +Asia/Rangoon,Asia/Yangon +Asia/Saigon,Asia/Ho_Chi_Minh +Asia/Tel_Aviv,Asia/Jerusalem +Asia/Thimbu,Asia/Thimphu +Asia/Ujung_Pandang,Asia/Makassar +Asia/Ulan_Bator,Asia/Ulaanbaatar +Asia/Vientiane,Asia/Bangkok +Atlantic/Faeroe,Atlantic/Faroe +Atlantic/Jan_Mayen,Europe/Oslo +Atlantic/St_Helena,Africa/Abidjan +Australia/ACT,Australia/Sydney +Australia/Canberra,Australia/Sydney +Australia/LHI,Australia/Lord_Howe +Australia/North,Australia/Darwin +Australia/NSW,Australia/Sydney +Australia/Queensland,Australia/Brisbane +Australia/South,Australia/Adelaide +Australia/Tasmania,Australia/Hobart +Australia/Victoria,Australia/Melbourne +Australia/West,Australia/Perth +Australia/Yancowinna,Australia/Broken_Hill +Brazil/Acre,America/Rio_Branco +Brazil/DeNoronha,America/Noronha +Brazil/East,America/Sao_Paulo +Brazil/West,America/Manaus +Canada/Atlantic,America/Halifax +Canada/Central,America/Winnipeg +Canada/Eastern,America/Toronto +Canada/Mountain,America/Edmonton +Canada/Newfoundland,America/St_Johns +Canada/Pacific,America/Vancouver +Canada/Saskatchewan,America/Regina +Canada/Yukon,America/Whitehorse +Chile/Continental,America/Santiago +Chile/EasterIsland,Pacific/Easter +Cuba,America/Havana +Egypt,Africa/Cairo +Eire,Europe/Dublin +Etc/GMT+0,Etc/GMT +Etc/GMT-0,Etc/GMT +Etc/GMT0,Etc/GMT +Etc/Greenwich,Etc/GMT +Etc/Universal,Etc/UTC +Etc/Zulu,Etc/UTC +Europe/Belfast,Europe/London +Europe/Bratislava,Europe/Prague +Europe/Busingen,Europe/Zurich +Europe/Guernsey,Europe/London +Europe/Isle_of_Man,Europe/London +Europe/Jersey,Europe/London +Europe/Ljubljana,Europe/Belgrade +Europe/Mariehamn,Europe/Helsinki +Europe/Nicosia,Asia/Nicosia +Europe/Podgorica,Europe/Belgrade +Europe/San_Marino,Europe/Rome +Europe/Sarajevo,Europe/Belgrade +Europe/Skopje,Europe/Belgrade +Europe/Tiraspol,Europe/Chisinau +Europe/Vaduz,Europe/Zurich +Europe/Vatican,Europe/Rome +Europe/Zagreb,Europe/Belgrade +GB,Europe/London +GB-Eire,Europe/London +GMT,Etc/GMT +GMT+0,Etc/GMT +GMT0,Etc/GMT +GMT−0,Etc/GMT +Greenwich,Etc/GMT +Hongkong,Asia/Hong_Kong 
+Iceland,Atlantic/Reykjavik +Indian/Antananarivo,Africa/Nairobi +Indian/Comoro,Africa/Nairobi +Indian/Mayotte,Africa/Nairobi +Iran,Asia/Tehran +Israel,Asia/Jerusalem +Jamaica,America/Jamaica +Japan,Asia/Tokyo +Kwajalein,Pacific/Kwajalein +Libya,Africa/Tripoli +Mexico/BajaNorte,America/Tijuana +Mexico/BajaSur,America/Mazatlan +Mexico/General,America/Mexico_City +Navajo,America/Denver +NZ,Pacific/Auckland +NZ-CHAT,Pacific/Chatham +Pacific/Johnston,Pacific/Honolulu +Pacific/Midway,Pacific/Pago_Pago +Pacific/Ponape,Pacific/Pohnpei +Pacific/Saipan,Pacific/Guam +Pacific/Samoa,Pacific/Pago_Pago +Pacific/Truk,Pacific/Chuuk +Pacific/Yap,Pacific/Chuuk +Poland,Europe/Warsaw +Portugal,Europe/Lisbon +PRC,Asia/Shanghai +ROC,Asia/Taipei +ROK,Asia/Seoul +Singapore,Asia/Singapore +Turkey,Europe/Istanbul +UCT,Etc/UCT +Universal,Etc/UTC +US/Alaska,America/Anchorage +US/Aleutian,America/Adak +US/Arizona,America/Phoenix +US/Central,America/Chicago +US/East-Indiana,America/Indiana/Indianapolis +US/Eastern,America/New_York +US/Hawaii,Pacific/Honolulu +US/Indiana-Starke,America/Indiana/Knox +US/Michigan,America/Detroit +US/Mountain,America/Denver +US/Pacific,America/Los_Angeles +US/Pacific-New,America/Los_Angeles +US/Samoa,Pacific/Pago_Pago +UTC,Etc/UTC +W-SU,Europe/Moscow +Zulu,Etc/UTC \ No newline at end of file From 1d4ef81fef542549c9eda72ec12de3f2e32eced7 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 08:49:26 -0500 Subject: [PATCH 26/46] remove collecting all metadata instead collect metadata for single user --- .../get_stats/get_cgm_distributions_v3.py | 59 +++++++++++-------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 9f2bee79..0d12c57c 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -1432,15 +1432,6 @@ def estimate_local_time(df): return local_time, cDays -# %% START OF CODE -all_metadata = pd.DataFrame() - -timezone_aliases = pd.read_csv( - "wikipedia-timezone-aliases-2018-04-28.csv", - low_memory=False -) - - # %% GET DATA FROM API #''' #get metadata and data for a donor that has shared with bigdata @@ -1499,25 +1490,38 @@ def estimate_local_time(df): make_folder_if_doesnt_exist([output_metadata, output_distribution]) +# %% START OF CODE +timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False +) + +donor_metadata_columns = [ + 'userid', + 'diagnosisType', + 'diagnosisDate', + 'biologicalSex', + 'birthday', + 'targetTimezone', + 'targetDevices', + 'isOtherPerson', +] + # %% for d_idx in range(0, len(all_files)): data = pd.read_json(all_files[d_idx]) userid = all_files[d_idx][-15:-5] - donor_metadata = all_donor_metadata[ - all_donor_metadata["userid"] == userid + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns ] print("\n", "starting", userid) - # CREATE META DATAFRAME (metadata) - ''' - this is useful for keeping track of the type and amount of cleaning done - ''' - metadata = pd.DataFrame(index=[userid]) - # HASH USER ID hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) data["userid"] = userid data["hashid"] = hashid + metadata["hashid"] = hashid # CLEAN DATA data_fields = list(data) @@ -1569,11 +1573,18 @@ def estimate_local_time(df): # TIME CATEGORIES # AGE, & YLW - bDate = 
pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) - if data["roundedUtcTime"].notnull().sum() == 0: + # TODO: make this a function + if metadata["birthday"].values is not None: + bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + else: data["age"] = np.nan + + if metadata["diagnosisDate"].values is not None: + dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) + data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) else: - data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + data["ylw"] = np.nan # GROUP DATA BY TYPE # first sort by upload time (used when removing dumplicates) @@ -1786,15 +1797,11 @@ def estimate_local_time(df): metadata["cgmData"] = False print(d_idx, "no cgm data") - # combine metadata - all_metadata = pd.concat([all_metadata, metadata], sort=False) - # save metadata - all_metadata.to_csv(os.path.join( + metadata.to_csv(os.path.join( output_metadata, "PHI-" + userid + "-cgm-metadata.csv" )) - print("finished", d_idx, userid) - + print("finished", d_idx, userid) From 264a79edda08fe9124ffce651003f45fb27b3ac8 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 09:18:01 -0500 Subject: [PATCH 27/46] add additional metadata to output --- .../get_stats/get_cgm_distributions_v3.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 0d12c57c..25006b98 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -1698,13 +1698,15 @@ def estimate_local_time(df): # merge with cgm data cgm_series = pd.merge( contiguous_data, - temp_cgm[ - ["roundedUtcTime", "hashid", "cgmModel", "age", "mg/dL"] - ], + temp_cgm[[ + "roundedUtcTime", "hashid", + "cgmModel", "age", "ylw", "mg/dL" + ]], on="roundedUtcTime", how="left" ) - # + + # sort so that the oldest data point is on top cgm_series.sort_values( "roundedUtcTime", ascending=True, inplace=True ) @@ -1786,6 +1788,24 @@ def estimate_local_time(df): ignore_index=True ) + # sort so that the oldest data point is on top + all_cgm_series.sort_values( + "roundedUtcTime", ascending=False, inplace=True + ) + all_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(all_cgm_series["roundedUtcTime"].unique()) + metadata["duplicateCgmDataIssue"] = ( + nUnique_cgm_times != len(all_cgm_series) + ) + + # get metadata for cgm stats + metadata["lastCgm.date"] = all_cgm_series.loc[0, "roundedUtcTime"] + metadata["lastCgm.age"] = all_cgm_series.loc[0, "age"] + metadata["lastCgm.ylw"] = all_cgm_series.loc[0, "ylw"] + + pdb.set_trace() # save distribution data all_cgm_series.to_csv(os.path.join( output_distribution, From 20bb215bb3b6f0a3f27abef0f863598d7dbd2977 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Sat, 17 Aug 2019 11:29:15 -0500 Subject: [PATCH 28/46] fix spike data drop bug the wrong index was getting deleted --- .../get_stats/get_cgm_distributions_v3.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 
25006b98..2966fa5f 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -652,10 +652,8 @@ def remove_invalid_cgm_values(df): nBefore = len(df) # remove values < 38 and > 402 mg/dL - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] < 38))].index) - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] > 402))].index) + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] > 402))].index) nRemoved = nBefore - len(df) return df, nRemoved @@ -701,13 +699,11 @@ def remove_spike_data(df): ] for spike_loc in spike_locations: df[spike_loc] = get_embedded_field(df["origin"], spike_loc) - - spike_idx = df.loc[ - df[spike_loc].notnull(), - spike_loc - ].astype(str).str.lower().str.contains("spike") - - df.drop((spike_idx == True).index, inplace=True) + notnull_idx = df[spike_loc].notnull() + df_notnull = df[notnull_idx] + is_spike = df_notnull[spike_loc].astype(str).str.lower().str.contains("spike") + spike_idx = df_notnull[is_spike].index + df.drop(spike_idx, inplace=True) nRemoved = nBefore - len(df) From 241987bf5bc5bc8f382ac208fa26a6666bd548d4 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 08:56:28 -0500 Subject: [PATCH 29/46] refactor sensing cgmModel --- .../get_stats/get_cgm_distributions_v3.py | 268 ++++++++++-------- 1 file changed, 152 insertions(+), 116 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 2966fa5f..e3fdf2d4 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -136,19 +136,20 @@ def get_embedded_field(ts, embedded_field): def add_upload_info_to_cgm_records(groups, df): upload_locations = [ - "uploadId", - "deviceManufacturers", - "deviceModel", - "deviceSerialNumber", - "deviceTags" + "upload.uploadId", + "upload.deviceManufacturers", + "upload.deviceModel", + "upload.deviceSerialNumber", + "upload.deviceTags" ] if "upload" in groups["type"].unique(): - upload = groups.get_group("upload").dropna(axis=1, how="all") + upload = groups.get_group("upload").dropna(axis=1, how="all").add_prefix("upload.") df = pd.merge( left=df, right=upload[list(set(upload_locations) & set(list(upload)))], - on="uploadId", + left_on="uploadId", + right_on="upload.uploadId", how="left" ) @@ -156,6 +157,18 @@ def add_upload_info_to_cgm_records(groups, df): def expand_heathkit_cgm_fields(df): + # TODO: refactor the code/function that originally grabs + # these fields, so we are only doing it once, and so + # we don't have to drop the columns for the code below to work. 
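As an aside to the TODO above: if the only goal is to keep the re-derived HealthKit columns from colliding, pandas' errors="ignore" option on DataFrame.drop covers the membership-check loop that follows. A minimal, self-contained sketch; the toy frame and its values are invented for illustration and are not pipeline data.

import pandas as pd

# stand-in for the cgm frame passed to expand_heathkit_cgm_fields
df = pd.DataFrame({"origin.payload.device.name": ["CGM"], "value": [5.5]})

drop_columns = [
    'origin.payload.device.name',
    'origin.payload.device.manufacturer',         # absent, silently skipped
    'origin.payload.sourceRevision.source.name',  # absent, silently skipped
]
df = df.drop(columns=drop_columns, errors="ignore")
# only the "value" column remains; missing names raise no KeyError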
+ drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(df): + df.drop(columns=[drop_col], inplace=True) + healthkit_locations = [ "origin", "origin.payload", @@ -177,65 +190,85 @@ def expand_heathkit_cgm_fields(df): def get_dexcom_cgm_model(df): # add cgm model - # put this list in order of precedence when choosing sensor version - # NOTE: there is an edge case where "origin.payload.device.model" = G5/G6, - # which can be eliminated by getting model from HKMetadataKeySyncIdentifier + dexcom_model_locations = [ "deviceId", "deviceManufacturers", + "upload.deviceManufacturers", "deviceModel", + "upload.deviceModel", "deviceSerialNumber", - "payload.HKMetadataKeySyncIdentifier", # do this before "origin.payload.device.model" bc there is an edge case - "origin.payload.device.model", + "upload.deviceSerialNumber", "origin.payload.sourceRevision.source.name", "payload.transmitterGeneration", + "payload.HKMetadataKeySyncIdentifier", "payload.transmitterId", ] for model_location in dexcom_model_locations: - if model_location in list(df): - # only consider cells where the model location is not null - notnull_idx = df[model_location].notnull() - if notnull_idx.sum() > 0: - for dex_model in ["G4", "G5", "G6"]: - # define a pandas stringMethod - str_list = df[model_location].astype(str).str - # if model has already been determined, then skip - missing_model_idx = df["cgmModel"].isnull() - # get index that matches model - model_idx = str_list.upper().str.contains(dex_model) - - m_idx = ( - missing_model_idx & notnull_idx & model_idx - ) - df.loc[m_idx, "cgmModel"] = dex_model - - # case of "payload.transmitterId" - if ( - ("payload.transmitterId" in model_location) - | ("payload.HKMetadataKeySyncIdentifier" in model_location) - ): - # get string length (need 5 digits for G4 and 6 for G5, G6) - if "G4" in dex_model: - model_idx = str_list.len() == 5 - elif "G5" in dex_model: - model_idx = str_list.startswith("4") - elif "G6" in dex_model: - model_idx = ( - (str_list.startswith("8")) - | (str_list.startswith("2")) - ) - m_idx = ( - missing_model_idx & notnull_idx & model_idx - ) - df.loc[m_idx, "cgmModel"] = dex_model - - return df["cgmModel"] + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + # G4 + g4_idx = str_list.contains("G4", case=False, na=False) + df.loc[g4_idx, "cgmModel"] = "G4" + df.loc[g4_idx, "cgmModelSensedFrom"] = model_location + + # G5 + g5_idx = str_list.contains("G5", case=False, na=False) + df.loc[g5_idx, "cgmModel"] = "G5" + df.loc[g5_idx, "cgmModelSensedFrom"] = model_location + + # G6 + g6_idx = str_list.contains("G6", case=False, na=False) + df.loc[g6_idx, "cgmModel"] = "G6" + df.loc[g6_idx, "cgmModelSensedFrom"] = model_location + + # edge case of g5 and g6 + g5_g6_idx = (g5_idx & g6_idx) + df.loc[g5_g6_idx, "cgmModel"] = "G5_G6" + df.loc[g5_g6_idx, "cgmModelSensedFrom"] = model_location + + # case of "transmitterId" + if ( + ("transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # if length of string is 5, then it is likely a G4 sensor + length5_idx = str_list.len() == 5 + df.loc[length5_idx, "cgmModel"] = "G4" + df.loc[length5_idx, 
"cgmModelSensedFrom"] = model_location + + # if length of string > 5 then might be G5 or G6 + length_gt5_idx = str_list.len() > 5 + + # if sensor stats with 4 then likely G5 + starts4_idx = str_list.startswith("4") + df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5" + df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location + + # if sensor stats with 2 or 8 then likely G6 + starts2_6_idx = ( + (str_list.startswith("2")) | (str_list.startswith("8")) + ) + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6" + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] def get_non_dexcom_cgm_model(df): # non-dexcom cgm model query model_locations = ["deviceId"] + + # model types (NOTE: for medtronic getting pump type not cgm) models_670G = "MMT-158|MMT-178" models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" @@ -260,25 +293,26 @@ def get_non_dexcom_cgm_model(df): "LIBRE", "G4", "G5_G6", "G4" ] - for model_loc in model_locations: - if model_loc in list(df): - # only consider cells where the model location is not null - # and we are missing a cgm model - notnull_idx = df[model_loc].notnull() - if notnull_idx.sum() > 0: - missing_model_idx = df["cgmModel"].isnull() - if missing_model_idx.sum() > 0: - # define a pandas stringMethod - str_list = df[model_loc].astype(str).str + for model_location in model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): - for non_dex_model, model_name in zip( - non_dex_models, non_dex_model_names - ): - model_idx = str_list.contains(non_dex_model) - m_idx = (missing_model_idx & notnull_idx & model_idx) - df.loc[m_idx, "cgmModel"] = model_name + model_idx = str_list.contains(non_dex_model, na=False) + df.loc[model_idx, "cgmModel"] = model_name + df.loc[model_idx, "cgmModelSensedFrom"] = model_location - return df["cgmModel"] + return df[["cgmModel", "cgmModelSensedFrom"]] def hash_userid(userid, salt): @@ -322,9 +356,6 @@ def remove_negative_durations(df): return df, n_negative_durations - - - def tslim_calibration_fix(df): ''' taken from https://github.com/tidepool-org/data-analytics/blob/ @@ -1428,29 +1459,6 @@ def estimate_local_time(df): return local_time, cDays -# %% GET DATA FROM API -#''' -#get metadata and data for a donor that has shared with bigdata -#NOTE: functions assume you have an .env with bigdata account credentials -#''' -# -#userid = "" -#donor_group = "" -# -#donor_metadata, _ = get_shared_metadata( -# donor_group=donor_group, -# userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid -#) -#data, _ = get_data( -# donor_group=donor_group, -# userid=userid, -# weeks_of_data=52*10 -#) -# -## this is a dummy loop -#for i in [0]: - - # %% GET DATA FROM JSON FILE data_path = os.path.join("..", "data") all_donor_metadata = pd.read_csv( @@ -1523,7 +1531,7 @@ def estimate_local_time(df): data_fields = list(data) # NOTE: moving remove negative durations to type specific cleaning - # TODO: ask backend to change "duration" field to only include one object type + # TODO: ask backend to change "duration" to only include 
one object type # Tslim calibration bug fix data, n_cal_readings = tslim_calibration_fix(data.copy()) @@ -1551,13 +1559,13 @@ def estimate_local_time(df): # # estimate local time (refactor of estimate-local-time.py) # data["localTime"], local_time_metadata = estimate_local_time(data.copy()) - - # TODO: fix this issue with estimate local time - ''' - //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649: - FutureWarning: elementwise comparison failed; returning scalar instead, - but in the future will perform elementwise comparison result = method(y) - ''' +# +# TODO: fix this issue with estimate local time +# ''' +# //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649 +# FutureWarning: elementwise comparison failed; returning scalar instead, +# but in the future will perform elementwise comparison result = method(y) +# ''' # round all data to the nearest 5 minutes data["roundedUtcTime"] = round_time( @@ -1587,12 +1595,46 @@ def estimate_local_time(df): data.sort_values("uploadTime", ascending=False, inplace=True) groups = data.groupby(by="type") - # CGM DATA + # check to see if person is looping + if "basal" in data["type"].unique(): + basal = groups.get_group("basal").dropna(axis=1, how="all") + if "deliveryType" in list(basal): + bd = basal.loc[ + basal["deliveryType"] == "temp", + ["date", "deliveryType"] + ] + temp_basal_counts = ( + pd.DataFrame( + bd.groupby("date").deliveryType.count() + ).reset_index() + ) + temp_basal_counts.rename( + {"deliveryType": "tempBasalCounts"}, + axis=1, + inplace=True + ) + data = pd.merge(data, temp_basal_counts, on="date", how="left") + # >= 25 temp basals per day is likely looping + data["isLoopDay"] = data["tempBasalCounts"] >= 25 + # redefine groups with the new data + groups = data.groupby(by="type") + + else: + data["isLoopDay"] = np.nan + else: + data["isLoopDay"] = np.nan + + # %% CGM DATA if "cbg" in data["type"].unique(): + # sort data with metadata["cgmData"] = True # filter by cgm - cgm = groups.get_group("cbg").dropna(axis=1, how="all") + cgm = groups.get_group("cbg").copy() + + # sort data + cgm.sort_values("roundedUtcTime", ascending=False, inplace=True) + cgm.reset_index(drop=False, inplace=True) # calculate cgm in mg/dL cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) @@ -1601,16 +1643,6 @@ def estimate_local_time(df): cgm, nSpike = remove_spike_data(cgm.copy()) metadata["nSpike"] = nSpike - # TODO: refactor (above) so you don't need to drop columns - drop_columns = [ - 'origin.payload.device.name', - 'origin.payload.device.manufacturer', - 'origin.payload.sourceRevision.source.name' - ] - for drop_col in drop_columns: - if drop_col in list(cgm): - cgm.drop(columns=[drop_col], inplace=True) - # assign upload cgm device info to cgm records in that upload cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) @@ -1624,14 +1656,18 @@ def estimate_local_time(df): ) # get cgm models - cgm["cgmModel"] = np.nan + cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan # dexcom cgm models (G4, G5, G6) - cgm["cgmModel"] = get_dexcom_cgm_model(cgm.copy()) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_dexcom_cgm_model(cgm.copy()) + ) # for non dexcom cgms # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem - cgm["cgmModel"] = get_non_dexcom_cgm_model(cgm.copy()) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_non_dexcom_cgm_model(cgm.copy()) + ) # get metadata on cgm models and devices metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() From 
e324f779450479625a4afcd1be43f76a06c06cba Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 08:57:04 -0500 Subject: [PATCH 30/46] first commit of cgm stats --- .../get_stats/get_cgm_distributions_v3.py | 478 ++++++++++++++++-- 1 file changed, 442 insertions(+), 36 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index e3fdf2d4..90e93cc4 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -1490,8 +1490,21 @@ def estimate_local_time(df): "PHI-2019-07-17-donor-data", "PHI-2019-07-17-cgm-distributions" ) +debug_duplicates = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-debug-cgm-duplicates" +) +output_stats = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-stats" +) + -make_folder_if_doesnt_exist([output_metadata, output_distribution]) +make_folder_if_doesnt_exist( + [output_metadata, output_distribution, debug_duplicates, output_stats] +) # %% START OF CODE @@ -1511,10 +1524,21 @@ def estimate_local_time(df): 'isOtherPerson', ] -# %% -for d_idx in range(0, len(all_files)): - data = pd.read_json(all_files[d_idx]) - userid = all_files[d_idx][-15:-5] + +## %% load test data on my computer +## TODO: if data comes in as a .csv, the embedded json fields +## get saved as a string and need to be unwrapped before those fields +## can be expanded. IN OTHER WORDS: this code only works with .json data +for d_idx in [0]: + userid = "0d4524bc11" + data = pd.read_json(os.path.join( + "..", "data", "dremio", userid, "PHI-{}.json".format(userid) + )) + +## %% +#for d_idx in range(0, len(all_files)): +# data = pd.read_json(all_files[d_idx]) +# userid = all_files[d_idx][-15:-5] metadata = all_donor_metadata.loc[ all_donor_metadata["userid"] == userid, donor_metadata_columns @@ -1576,15 +1600,17 @@ def estimate_local_time(df): ) # TIME CATEGORIES + data["date"] = data["roundedUtcTime"].dt.date + # AGE, & YLW # TODO: make this a function - if metadata["birthday"].values is not None: + if metadata["birthday"].values[0] is not np.nan: bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) else: data["age"] = np.nan - if metadata["diagnosisDate"].values is not None: + if metadata["diagnosisDate"].values[0] is not np.nan: dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) else: @@ -1675,13 +1701,13 @@ def estimate_local_time(df): if "deviceId" in list(cgm): metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) - # %% clean distributions + # clean distributions # break up all traces by cgm model - all_cgm_series = pd.DataFrame() + combined_cgm_series = pd.DataFrame() cgm_models = cgm.groupby(by="cgmModel") for cgm_model in cgm_models.groups.keys(): - print(cgm_model) + print("working on", cgm_model) temp_cgm = cgm_models.get_group(cgm_model) # get rid of cgm values too low/high (< 38 & > 402 mg/dL) @@ -1717,7 +1743,11 @@ def estimate_local_time(df): # create a contiguous 5 minute time series first_day = temp_cgm["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + last_day = temp_cgm["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + rng = pd.date_range(first_day, last_day, freq="5min") contiguous_data = pd.DataFrame( rng, @@ -1731,7 +1761,7 @@ def estimate_local_time(df): cgm_series = pd.merge( contiguous_data, temp_cgm[[ - "roundedUtcTime", "hashid", + "roundedUtcTime", "hashid", "isLoopDay", "cgmModel", "age", "ylw", "mg/dL" ]], on="roundedUtcTime", @@ -1815,34 +1845,411 @@ def estimate_local_time(df): metadata["nCgmDataPoints." + cgm_model] = len(cgm_series) # append cgm model to a larger table - all_cgm_series = pd.concat( - [all_cgm_series, cgm_series], + combined_cgm_series = pd.concat( + [combined_cgm_series, cgm_series], ignore_index=True ) + if len(combined_cgm_series) > 0: + # sort so that the oldest data point is on top + # and that the G5_G6 get deleted if they are apart of a duplicate + combined_cgm_series["cgmModel_G5_and_G6"] = ( + combined_cgm_series["cgmModel"] == "G5_G6" + ) + combined_cgm_series.sort_values( + by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"], + ascending=[False, True, False], + inplace=True + ) + combined_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique()) + cgm_len = len(combined_cgm_series) + metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len + + nDuplicate_cgm = cgm_len - nUnique_cgm_times + metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm + + # if there are still duplicates, get rid of them + if nDuplicate_cgm > 0: + # save the duplicates for further examination + combined_cgm_series.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz" + )) + + cgm.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz" + )) + + # get rid of duplicates + combined_cgm_series, n_cgm_dups_removed = ( + removeDuplicates(combined_cgm_series, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = ( + n_cgm_dups_removed + ) + metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series) - # sort so that the oldest data point is on top - all_cgm_series.sort_values( - "roundedUtcTime", ascending=False, inplace=True - ) - all_cgm_series.reset_index(drop=True, inplace=True) + # add whether data is dexcom cgm or not + combined_cgm_series["dexcomCgm"] = ( + combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6") + ) - # add in check to see if there are duplicates between cgm devices - nUnique_cgm_times = len(all_cgm_series["roundedUtcTime"].unique()) - metadata["duplicateCgmDataIssue"] = ( - nUnique_cgm_times != len(all_cgm_series) - ) + # save distribution data + combined_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv.gz" + )) + + + # %% get cgm stats + # create a contiguous 5 minute time series of ALL cgm data + first_day = combined_cgm_series["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = combined_cgm_series["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=True + ).reset_index(drop=True) + + # merge with combined_cgm_series data + all_cgm = pd.merge( + contiguous_data, + combined_cgm_series[[ + 'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm', + 'age', 'ylw', 'isLoopDay', 'mg/dL', + ]], + on="roundedUtcTime", + how="left" + ) + + # get cgm stats + # get a binary (T/F) of whether we have a cgm value + all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull() + + # fill isLoopDay nan with False + all_cgm["isLoopDay"].fillna(False, inplace=True) + + # has loop and cgm + all_cgm["hasLoopAndCgm"] = ( + (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + all_cgm["hasCgmWithoutLoop"] = ( + (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + # make this a function and round ascendingly + ts39_401 = all_cgm["mg/dL"].copy() + + # for all the less than (<) criteria + for cgm_threshold in [40, 54, 70]: + all_cgm["cgm < " + str(cgm_threshold)] = ( + ts39_401.lt(cgm_threshold) + ) + # for all the greter than or equal to (>=) criteria + all_cgm["cgm >= " + str(cgm_threshold)] = ( + ts39_401.ge(cgm_threshold) + ) + + # for all the the less than or equal to (<=) criteria + for cgm_threshold in [140, 180, 250, 300, 400]: + all_cgm["cgm <= " + str(cgm_threshold)] = ( + ts39_401.le(cgm_threshold) + ) + # for all the the greter than (>) criteria + all_cgm["cgm > " + str(cgm_threshold)] = ( + ts39_401.gt(cgm_threshold) + ) + + # get all of the cgm ranges + # (cgm >= 40) & (cgm < 54) + all_cgm["40 <= cgm < 54"] = ( + (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"]) + ) + + # (cgm >= 54) & (cgm < 70) + all_cgm["54 <= cgm < 70"] = ( + (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"]) + ) + + # (cgm >= 70) & (cgm <= 140) + all_cgm["70 <= cgm <= 140"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"]) + ) + + # (cgm >= 70) & (cgm <= 180) + all_cgm["70 <= cgm <= 180"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"]) + ) + + # (cgm > 180) & (cgm <= 250) + all_cgm["180 < cgm <= 250"] = ( + (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"]) + ) + + # (cgm > 250) & (cgm <= 400) + all_cgm["250 < cgm <= 400"] = ( + (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"]) + ) + + # derfine the windows to calculate the stats over + window_names = ["hour", "day", "week", "month", "quarter", "year"] + window_lengths = [12, 288, 288*7, 288*7*4, 288*90, 288*365] + + for w_name, w_len in zip(window_names, window_lengths): + # require lenth of window for percent calculations + w_min = w_len + + # get the start and end times for each window + all_cgm[w_name + ".startTime"] = ( + all_cgm["roundedUtcTime"].shift(w_len - 1) + ) + all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"] + + # add majority age for the time period + all_cgm[w_name + ".age"] = np.round( + all_cgm["age"].rolling( + min_periods=1, + window=w_len + ).mean() + ) + + # add majority ylw for the time period + all_cgm[w_name + ".ylw"] = np.round( + all_cgm["ylw"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get percent time cgm used + all_cgm[w_name + ".cgmPercent"] = ( + all_cgm["hasCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".missingCgmPercent"] = ( + 1 - all_cgm[w_name + ".cgmPercent"] + ) + + # create (T/F) 70 and 80 percent available thresholds + # which will be useful for processing later 
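The rolling "percent of window" pattern used throughout this section (cgm coverage, time in range, percent of time looping) is easier to see on a toy contiguous series. Everything below is illustrative only: the frame, the column names, and the numbers are made up and do not come from the pipeline.

import pandas as pd

rng = pd.date_range("2019-01-01", periods=12, freq="5min")  # one hour of 5-minute slots
toy = pd.DataFrame({"roundedUtcTime": rng})
toy["hasCgm"] = [True] * 9 + [False] * 3                    # 45 of 60 minutes have data
w_len = 12                                                  # the "hour" window
toy["hour.cgmPercent"] = (
    toy["hasCgm"].rolling(window=w_len, min_periods=w_len).sum() / w_len
)
# only the last row has a complete window; it reports 9/12 = 0.75 coverage,
# earlier rows are NaN because min_periods equals the window length

Requiring min_periods equal to w_len (w_min = w_len above) is what makes incomplete windows report NaN rather than an optimistic percentage.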
+ all_cgm[w_name + ".ge70Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.7 + ) + + all_cgm[w_name + ".ge80Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.8 + ) + + # get percent time Loop was used NOTE: this is + # approximate because we use > 24 temp basals per day + # ALSO: this is percent time Loop was used while cgm in use + all_cgm[w_name + ".loopingAndCgmPercent"] = ( + all_cgm["hasLoopAndCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent of time cgm without loop + all_cgm[w_name + ".cgmWithoutLoopPercent"] = ( + all_cgm["hasCgmWithoutLoop"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get percent time in different ranges + # % Time < 54 + all_cgm[w_name + ".lt54Percent"] = ( + all_cgm["cgm < 54"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 54-70 (cgm >= 54) & (cgm < 70) + all_cgm[w_name + ".bt54_70Percent"] = ( + all_cgm["54 <= cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 180) + all_cgm[w_name + ".bt70_180Percent"] = ( + all_cgm["70 <= cgm <= 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 180-250 (cgm > 180) & (cgm <= 250) + all_cgm[w_name + ".bt180_250Percent"] = ( + all_cgm["180 < cgm <= 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time > 250 + all_cgm[w_name + ".gt250Percent"] = ( + all_cgm["cgm > 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # check that all of the percentages add of to 1 or 100% + all_cgm[w_name + ".percentCheck"] = ( + all_cgm[w_name + ".missingCgmPercent"] + + all_cgm[w_name + ".lt54Percent"] + + all_cgm[w_name + ".bt54_70Percent"] + + all_cgm[w_name + ".bt70_180Percent"] + + all_cgm[w_name + ".bt180_250Percent"] + + all_cgm[w_name + ".gt250Percent"] + ) + + # here are some other less common percent time in ranges + # % Time < 70 + all_cgm[w_name + ".lt70Percent"] = ( + all_cgm["cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 140) + all_cgm[w_name + ".tir70to140Percent"] = ( + all_cgm["70 <= cgm <= 140"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent time above a threshold + # % Time > 180 + all_cgm[w_name + ".gt180Percent"] = ( + all_cgm["cgm > 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # points that are 39 or 401 should NOT be used most + # calculations because the actual number is <= 39 or >= 401 + # (cgm < 40) OR (cgm > 400) + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + # redefine the time series (ts) for the following stats + ts40_400 = all_cgm["mg/dL.40to400"].copy() + # require at least 3 points to make a stats calculation + w_min = 3 + + # recalcuate percent of measurements available + all_cgm[w_name + ".40to400availablePercent"] = ( + ts40_400.rolling(min_periods=w_min, window=w_len).count() + ) / w_len + + # get the total number of non-null values over this time period + all_cgm[w_name + ".40to400missingPercent"] = ( + 1 - all_cgm[w_name + ".40to400availablePercent"] + ) + + all_cgm[w_name + ".40to400ge70Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.7 + ) + + all_cgm[w_name + ".40to400ge80Available"] = ( + all_cgm[w_name + 
".40to400availablePercent"] >= 0.8 + ) + + # create a rolling object + roll40_400 = ts40_400.rolling(min_periods=w_min, window=w_len) + + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + + # min + all_cgm[w_name + ".min"] = roll40_400.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll40_400.quantile(0.10) + all_cgm[w_name + ".25th"] = roll40_400.quantile(0.25) + all_cgm[w_name + ".75th"] = roll40_400.quantile(0.75) + all_cgm[w_name + ".90th"] = roll40_400.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll40_400.max() + + # median + all_cgm[w_name + ".median"] = roll40_400.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + + # mean + all_cgm[w_name + ".mean"] = roll40_400.mean() + + # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL] + all_cgm[w_name + ".gmi"] = ( + 3.31 + (0.02392 * all_cgm[w_name + ".mean"]) + ) + + # standard deviation (std) + all_cgm[w_name + ".std"] = roll40_400.std() + + # coefficient of variation (cov) = std / mean + all_cgm[w_name + ".cov"] = ( + all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] + ) + + # %% make an episodes dataframe, and then get stats + all_cgm["notnull"] = all_cgm["mg/dL"].notnull() + all_cgm["hypoEpisodeStart"] = ( + (all_cgm["cgm < 54"]) & (all_cgm["cgm >= 54"].shift(1)) + & (all_cgm["notnull"]) & (all_cgm["notnull"].shift(1)) + ) +# ts["startCrossPoint"] = ((df.mg_dL.shift(1) >= episodeThreshold) & +# (df.mg_dL < episodeThreshold)) +# +# df["endCrossPoint"] = ((df.mg_dL.shift(1) < episodeThreshold) & +# (df.mg_dL >= episodeThreshold)) + + + # save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) - # get metadata for cgm stats - metadata["lastCgm.date"] = all_cgm_series.loc[0, "roundedUtcTime"] - metadata["lastCgm.age"] = all_cgm_series.loc[0, "age"] - metadata["lastCgm.ylw"] = all_cgm_series.loc[0, "ylw"] - - pdb.set_trace() - # save distribution data - all_cgm_series.to_csv(os.path.join( - output_distribution, - "PHI-" + userid + "-cgm-distribution.csv" - )) print(metadata.T) else: @@ -1852,8 +2259,7 @@ def estimate_local_time(df): # save metadata metadata.to_csv(os.path.join( output_metadata, - "PHI-" + userid + "-cgm-metadata.csv" + "PHI-" + userid + "-cgm-metadata.csv.gz" )) print("finished", d_idx, userid) - From ddabb815d5d95832d4985c1a779c1fbcee13c662 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 11:13:17 -0500 Subject: [PATCH 31/46] initial commit of episodes --- .../get_stats/get_cgm_distributions_v3.py | 52 +++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 90e93cc4..0090557e 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2232,19 +2232,51 @@ def estimate_local_time(df): ) # %% make an episodes dataframe, and then get stats - all_cgm["notnull"] = all_cgm["mg/dL"].notnull() - all_cgm["hypoEpisodeStart"] = ( - (all_cgm["cgm < 54"]) & (all_cgm["cgm >= 54"].shift(1)) - & (all_cgm["notnull"]) & (all_cgm["notnull"].shift(1)) + episode_ts = all_cgm[[ + "roundedUtcTime", "mg/dL", "hasCgm", + "cgm < 54", "cgm >= 
54" + ]].copy() + + # put consecutive data that matches in groups + episode_ts["tempGroups"] = ( + (episode_ts["cgm < 54"] != episode_ts["cgm < 54"].shift()).cumsum() + ) + episode_ts["episodeGroup"] = ( + episode_ts["tempGroups"] * episode_ts["cgm < 54"] + ) + episode_groups = episode_ts.groupby("episodeGroup") + episodes = ( + episode_groups["roundedUtcTime"].count().reset_index() + ) + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename( + columns={"roundedUtcTime": "episodeCounts"}, inplace=True + ) + + episode_ts = pd.merge( + episode_ts, + episodes, + on="episodeGroup", + how="left" + ) + episode_ts["episodeDuration"] = ( + episode_ts["duration"] * episode_ts["cgm < 54"] + ) + + # merge episodes back into all_cgm + all_cgm = pd.merge( + all_cgm, + episode_ts[[ + 'roundedUtcTime', + 'episodeGroup', + 'episodeDuration' + ]], + on="roundedUtcTime", + how="left" ) -# ts["startCrossPoint"] = ((df.mg_dL.shift(1) >= episodeThreshold) & -# (df.mg_dL < episodeThreshold)) -# -# df["endCrossPoint"] = ((df.mg_dL.shift(1) < episodeThreshold) & -# (df.mg_dL >= episodeThreshold)) - # save cgm stats data + # %% save cgm stats data all_cgm.to_csv(os.path.join( output_stats, "PHI-" + userid + "-cgm-stats.csv.gz" From cded1ccc7ff50e380de9a00a0281b7c3c74da87e Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 14:21:07 -0500 Subject: [PATCH 32/46] next increment of episodes --- .../get_stats/get_cgm_distributions_v3.py | 101 ++++++++++-------- 1 file changed, 59 insertions(+), 42 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 0090557e..295cc3f6 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -38,6 +38,49 @@ ''' +def get_episodes(df, episode_criterion, min_duration): + + # put consecutive data that matches in groups + df["tempGroups"] = (( + df[episode_criterion] != df[episode_criterion].shift() + ).cumsum()) + + df["episodeId"] = ( + df["tempGroups"] * df[episode_criterion] + ) + + # group by the episode groups + episode_groups = df.groupby("episodeId") + episodes = episode_groups["roundedUtcTime"].count().reset_index() + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True) + + df = pd.merge(df, episodes, on="episodeId", how="left") + df["episodeDuration"] = ( + df["duration"] * df[episode_criterion] + ) + + # get rolling stats on episodes + df["isEpisode"] = ( + df["episodeDuration"] >= min_duration + ) + + # get the hypo episode starts so we only count each episode once + df["episodeStart"] = ( + (df[episode_criterion]) + & (~df[episode_criterion].shift(1).fillna(False)) + & (df["hasCgm"]) + & (df["hasCgm"].shift(1)) + ) + + df = df[[ + "isEpisode", "episodeStart", + "episodeId", "episodeDuration" + ]].add_prefix("episode." + episode_criterion + ".") + + return df + + def get_slope(y): if "array" not in type(y).__name__: raise TypeError('Expecting a numpy array') @@ -1903,8 +1946,7 @@ def estimate_local_time(df): "PHI-" + userid + "-cgm-distribution.csv.gz" )) - - # %% get cgm stats + # get cgm stats # create a contiguous 5 minute time series of ALL cgm data first_day = combined_cgm_series["roundedUtcTime"].min() metadata["firstCgm." 
+ cgm_model] = first_day @@ -2231,50 +2273,25 @@ def estimate_local_time(df): all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] ) - # %% make an episodes dataframe, and then get stats - episode_ts = all_cgm[[ - "roundedUtcTime", "mg/dL", "hasCgm", - "cgm < 54", "cgm >= 54" - ]].copy() - - # put consecutive data that matches in groups - episode_ts["tempGroups"] = ( - (episode_ts["cgm < 54"] != episode_ts["cgm < 54"].shift()).cumsum() - ) - episode_ts["episodeGroup"] = ( - episode_ts["tempGroups"] * episode_ts["cgm < 54"] - ) - episode_groups = episode_ts.groupby("episodeGroup") - episodes = ( - episode_groups["roundedUtcTime"].count().reset_index() - ) - episodes["duration"] = episodes["roundedUtcTime"] * 5 - episodes.rename( - columns={"roundedUtcTime": "episodeCounts"}, inplace=True - ) - - episode_ts = pd.merge( - episode_ts, - episodes, - on="episodeGroup", - how="left" - ) - episode_ts["episodeDuration"] = ( - episode_ts["duration"] * episode_ts["cgm < 54"] + # make an episodes dataframe, and then get stats + # get episodes < 54 + episode_ts = get_episodes( + all_cgm[["roundedUtcTime", "hasCgm", "cgm < 54"]].copy(), + "cgm < 54", + 15 ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) - # merge episodes back into all_cgm - all_cgm = pd.merge( - all_cgm, - episode_ts[[ - 'roundedUtcTime', - 'episodeGroup', - 'episodeDuration' - ]], - on="roundedUtcTime", - how="left" + # get episodes < 70 + episode_ts = get_episodes( + all_cgm[["roundedUtcTime", "hasCgm", "cgm < 70"]].copy(), + "cgm < 70", + 15 ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + # get rolling stats on episodes + pdb.set_trace() # %% save cgm stats data all_cgm.to_csv(os.path.join( From 0b9a3b429d8b03663f4d7e9e7fb1daa881933358 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Mon, 19 Aug 2019 14:21:35 -0500 Subject: [PATCH 33/46] move percentile calculations to full range of data section --- .../get_stats/get_cgm_distributions_v3.py | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 295cc3f6..caf73951 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2192,6 +2192,33 @@ def estimate_local_time(df): ).sum() / w_len ) + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + # create a rolling object + roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + + # min + all_cgm[w_name + ".min"] = roll39_401.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10) + all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25) + all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75) + all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll39_401.max() + + # median + all_cgm[w_name + ".median"] = roll39_401.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + # points that are 39 or 401 should NOT be used most # calculations because the actual number is <= 39 or >= 401 # (cgm < 40) OR (cgm > 400) @@ -2232,31 +2259,6 @@ def estimate_local_time(df): # create a rolling object roll40_400 = ts40_400.rolling(min_periods=w_min, 
window=w_len) - # quantiles - # NOTE: this will increase run time, so only run if you need - # 3-4X the processing time since it has to sort the data - # TODO: make this an option to the function, once it is made - - # min - all_cgm[w_name + ".min"] = roll40_400.min() - - # 10, 25, 75, and 90th percentiles - all_cgm[w_name + ".10th"] = roll40_400.quantile(0.10) - all_cgm[w_name + ".25th"] = roll40_400.quantile(0.25) - all_cgm[w_name + ".75th"] = roll40_400.quantile(0.75) - all_cgm[w_name + ".90th"] = roll40_400.quantile(0.90) - - # max - all_cgm[w_name + ".max"] = roll40_400.max() - - # median - all_cgm[w_name + ".median"] = roll40_400.median() - - # iqr - all_cgm[w_name + ".iqr"] = ( - all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] - ) - # mean all_cgm[w_name + ".mean"] = roll40_400.mean() From 4c9714aec96f0712265d0fecd67e3330b324589d Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 05:52:58 -0500 Subject: [PATCH 34/46] get episode stats --- .../get_stats/get_cgm_distributions_v3.py | 139 +++++++++++++----- 1 file changed, 100 insertions(+), 39 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index caf73951..70d32771 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -38,7 +38,14 @@ ''' -def get_episodes(df, episode_criterion, min_duration): +def get_episodes( + df, + episode_criterion="cgm < 54", + min_duration=5, +): + # TODO: deal with case where there are nan's in the middle of an episode + # it probably makes sense to interpolate between values iff the gap is + # <= 1 to 6 points (5 to 30 minutes) # put consecutive data that matches in groups df["tempGroups"] = (( @@ -60,7 +67,7 @@ def get_episodes(df, episode_criterion, min_duration): df["duration"] * df[episode_criterion] ) - # get rolling stats on episodes + # mark record as belonging to an episode df["isEpisode"] = ( df["episodeDuration"] >= min_duration ) @@ -69,14 +76,25 @@ def get_episodes(df, episode_criterion, min_duration): df["episodeStart"] = ( (df[episode_criterion]) & (~df[episode_criterion].shift(1).fillna(False)) - & (df["hasCgm"]) - & (df["hasCgm"].shift(1)) +# & (df["hasCgm"]) +# & (df["hasCgm"].shift(1)) + ) + + # calculate the total duration and attach to start record + # which is needed to get the average duration per episode + df["episodeTotalDuration"] = ( + df["episodeStart"] * df["episodeDuration"] + ) + df["episodeTotalDuration"].replace(0, np.nan, inplace=True) + + episode_prefix = ( + "episode." + episode_criterion + + ".durationThreshold=" + str(min_duration) + "." ) df = df[[ - "isEpisode", "episodeStart", - "episodeId", "episodeDuration" - ]].add_prefix("episode." + episode_criterion + ".") + "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration" + ]].add_prefix(episode_prefix) return df @@ -1572,16 +1590,16 @@ def estimate_local_time(df): ## TODO: if data comes in as a .csv, the embedded json fields ## get saved as a string and need to be unwrapped before those fields ## can be expanded. 
IN OTHER WORDS: this code only works with .json data -for d_idx in [0]: - userid = "0d4524bc11" - data = pd.read_json(os.path.join( - "..", "data", "dremio", userid, "PHI-{}.json".format(userid) - )) - -## %% -#for d_idx in range(0, len(all_files)): -# data = pd.read_json(all_files[d_idx]) -# userid = all_files[d_idx][-15:-5] +#for d_idx in [0]: +# userid = "0d4524bc11" +# data = pd.read_json(os.path.join( +# "..", "data", "dremio", userid, "PHI-{}.json".format(userid) +# )) + +# %% +for d_idx in range(0, len(all_files)): + data = pd.read_json(all_files[d_idx]) + userid = all_files[d_idx][-15:-5] metadata = all_donor_metadata.loc[ all_donor_metadata["userid"] == userid, donor_metadata_columns @@ -1946,7 +1964,7 @@ def estimate_local_time(df): "PHI-" + userid + "-cgm-distribution.csv.gz" )) - # get cgm stats + # %% get cgm stats # create a contiguous 5 minute time series of ALL cgm data first_day = combined_cgm_series["roundedUtcTime"].min() metadata["firstCgm." + cgm_model] = first_day @@ -1990,7 +2008,7 @@ def estimate_local_time(df): (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) ) - # make this a function and round ascendingly + # work with all of the non-null data, even 39 = LOW and 401 = HIGH ts39_401 = all_cgm["mg/dL"].copy() # for all the less than (<) criteria @@ -1998,6 +2016,19 @@ def estimate_local_time(df): all_cgm["cgm < " + str(cgm_threshold)] = ( ts39_401.lt(cgm_threshold) ) + # get episodes below these thresholds + for min_duration in [5, 15]: + episode_ts = get_episodes( + all_cgm[[ + "roundedUtcTime", + "hasCgm", + "cgm < " + str(cgm_threshold) + ]].copy(), + episode_criterion="cgm < " + str(cgm_threshold), + min_duration=min_duration + ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + # for all the greter than or equal to (>=) criteria all_cgm["cgm >= " + str(cgm_threshold)] = ( ts39_401.ge(cgm_threshold) @@ -2115,6 +2146,56 @@ def estimate_local_time(df): ).sum() / w_len ) + # get episode stats + # TODO: add in hyper events + # get episodes below these thresholds + for cgm_threshold in [40, 54, 70]: + # get number of episodes per time window + for min_duration in [5, 15]: + "cgm < " + str(cgm_threshold) + episode_name = ( + "episode.cgm < " + str(cgm_threshold) + + ".durationThreshold=" + str(min_duration) + ) + all_cgm[w_name + ".count." + episode_name] = ( + all_cgm[episode_name + ".episodeStart"].rolling( + min_periods=1, + window=w_len + ).sum() + ) + + # get avg. duration of each episode per time window + all_cgm[w_name + ".avgDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).sum() / all_cgm[w_name + ".count." + episode_name] + ) + + # get min duration of each episode per time window + all_cgm[w_name + ".minDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).min() + ) + + # get median duration of each episode per time window + all_cgm[w_name + ".medianDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get max duration of each episode per time window + all_cgm[w_name + ".maxDuration." 
+ episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).max() + ) + # get percent time in different ranges # % Time < 54 all_cgm[w_name + ".lt54Percent"] = ( @@ -2275,26 +2356,6 @@ def estimate_local_time(df): all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] ) - # make an episodes dataframe, and then get stats - # get episodes < 54 - episode_ts = get_episodes( - all_cgm[["roundedUtcTime", "hasCgm", "cgm < 54"]].copy(), - "cgm < 54", - 15 - ) - all_cgm = pd.concat([all_cgm, episode_ts], axis=1) - - # get episodes < 70 - episode_ts = get_episodes( - all_cgm[["roundedUtcTime", "hasCgm", "cgm < 70"]].copy(), - "cgm < 70", - 15 - ) - all_cgm = pd.concat([all_cgm, episode_ts], axis=1) - - # get rolling stats on episodes - pdb.set_trace() - # %% save cgm stats data all_cgm.to_csv(os.path.join( output_stats, From 2fe6065b68f3ebd1df0b0150147cbe5fb2f1f2f9 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 07:27:19 -0500 Subject: [PATCH 35/46] minor refactor --- .../get_stats/get_cgm_distributions_v3.py | 70 +++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 70d32771..740128f7 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2011,6 +2011,21 @@ def estimate_local_time(df): # work with all of the non-null data, even 39 = LOW and 401 = HIGH ts39_401 = all_cgm["mg/dL"].copy() + # some stats should NOT include 39 or 401 + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + ts40_400 = all_cgm["mg/dL.40to400"].copy() + + # for all the less than (<) criteria for cgm_threshold in [40, 54, 70]: all_cgm["cgm < " + str(cgm_threshold)] = ( @@ -2278,7 +2293,10 @@ def estimate_local_time(df): # 3-4X the processing time since it has to sort the data # TODO: make this an option to the function, once it is made # create a rolling object + + # NOTE: these calculations only require 3 points to make roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + roll40_400 = ts40_400.rolling(min_periods=3, window=w_len) # min all_cgm[w_name + ".min"] = roll39_401.min() @@ -2300,29 +2318,10 @@ def estimate_local_time(df): all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] ) - # points that are 39 or 401 should NOT be used most - # calculations because the actual number is <= 39 or >= 401 - # (cgm < 40) OR (cgm > 400) - all_cgm["mg/dL.40to400"] = ( - ts39_401.replace(to_replace=39, value=np.nan) - ) - - all_cgm["mg/dL.40to400"] = ( - all_cgm["mg/dL.40to400"].replace( - to_replace=401, - value=np.nan - ) - ) - - # redefine the time series (ts) for the following stats - ts40_400 = all_cgm["mg/dL.40to400"].copy() - # require at least 3 points to make a stats calculation - w_min = 3 - # recalcuate percent of measurements available all_cgm[w_name + ".40to400availablePercent"] = ( - ts40_400.rolling(min_periods=w_min, window=w_len).count() - ) / w_len + roll40_400.count() / w_len + ) # get the total number of non-null values over this time period all_cgm[w_name + ".40to400missingPercent"] = ( @@ -2337,9 +2336,6 @@ def estimate_local_time(df): all_cgm[w_name + ".40to400availablePercent"] >= 0.8 ) - # create 
a rolling object - roll40_400 = ts40_400.rolling(min_periods=w_min, window=w_len) - # mean all_cgm[w_name + ".mean"] = roll40_400.mean() @@ -2356,11 +2352,27 @@ def estimate_local_time(df): all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] ) - # %% save cgm stats data - all_cgm.to_csv(os.path.join( - output_stats, - "PHI-" + userid + "-cgm-stats.csv.gz" - )) + # %% save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) + # write the most recent example of the 90 day stats + # to the metadata + quarter_ge80Available_idx = ( + all_cgm[all_cgm["quarter.ge80Available"]] + ).index.max() + most_recent_quarter = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + + metadata = pd.merge( + metadata, + most_recent_quarter, + on="hashid", + how="left" + ) print(metadata.T) From 665598d49c90e889b81f17e7990378347bf4d468 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 07:49:45 -0500 Subject: [PATCH 36/46] resolve edge case of not having quarterly stats --- .../get_stats/get_cgm_distributions_v3.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py index 740128f7..f691f506 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py @@ -2362,14 +2362,22 @@ def estimate_local_time(df): quarter_ge80Available_idx = ( all_cgm[all_cgm["quarter.ge80Available"]] ).index.max() - most_recent_quarter = all_cgm.loc[ - [quarter_ge80Available_idx], - all_cgm.columns - ] + + if pd.notnull(quarter_ge80Available_idx): + # get the most recent quarter + most_recent = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + else: + most_recent = all_cgm.loc[ + [all_cgm.index.max()], + all_cgm.columns + ] metadata = pd.merge( metadata, - most_recent_quarter, + most_recent, on="hashid", how="left" ) From 5432b3332705951b50f97b00a7c1ff9f0b51cadd Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 10:16:09 -0500 Subject: [PATCH 37/46] initial commit of batch process all cgm distribution and stats --- .../batch_get_cgm_distributions_and_stats.py | 160 ++ .../get_cgm_distributions_and_stats.py | 2437 +++++++++++++++++ 2 files changed, 2597 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py create mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py new file mode 100644 index 00000000..2830fe03 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import datetime as dt +import pandas as pd +import subprocess as sub +import os +import glob +import time +import argparse +from multiprocessing import Pool + + +# %% USER INPUTS (choices to be made in order to run the code) 
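# A minimal sketch of the file-naming assumption behind the defaults in this
# section: the --input-json-data-path default (defined just below) is a
# recursive glob, and run_process() further down recovers the userid with the
# slice json_data_path[-15:-5]. That slice only works for paths ending in a
# 10-character userid followed by ".json" (e.g. "PHI-0d4524bc11.json"). A
# hypothetical helper that makes the same assumption explicit:
#
#     def userid_from_path(json_path):
#         # hypothetical helper: ".../PHI-0d4524bc11.json" -> "0d4524bc11"
#         filename = os.path.basename(json_path)
#         assert filename.endswith(".json") and len(filename) >= 15
#         return filename[-15:-5]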
+codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-i", + "--input-json-data-path", + dest="json_data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data", "dremio", "**", "*.json" + ), + ), + help="the path where json data is located" +) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% FUNCTIONS +def run_process(json_data_path): + userid = json_data_path[-15:-5] + + p = sub.Popen( + [ + "python", "get_cgm_distributions_and_stats.py", + "-i", json_data_path, + "-u", userid, + "-d", args.date_stamp, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") + + if errors == '': + print(output) + else: + print(errors) + + return + + +# %% GET A LIST OF DONOR JSON FILE LOCATIONS +all_files = glob.glob(args.json_data_path, recursive=True) + +# this is a good test to make sure run process is working before running +#run_process(all_files[0]) +#pdb.set_trace() + +# use multiple cores to process +startTime = time.time() +print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) +pool = Pool(os.cpu_count()) +pool.map(run_process, all_files) +pool.close() +endTime = time.time() +print( + "finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +) +total_duration = round((endTime - startTime) / 60, 1) +print("total duration was %s minutes" % total_duration) + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +all_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") +) +print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA +print("combining all distribution data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +distribution_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f, index_col=[0]) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +distribution_metadata.to_csv( + os.path.join( + donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" + ) +) +print("saving all-dataset-info-metadata...code complete") diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py 
b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py new file mode 100644 index 00000000..af0d0d50 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -0,0 +1,2437 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm distributions and stats for a single tidepool (donor) dataset +from a data that comes from a json file (does NOT work with data save as csv) +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt +import argparse +import pdb + +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist + +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def get_episodes( + df, + episode_criterion="cgm < 54", + min_duration=5, +): + # TODO: deal with case where there are nan's in the middle of an episode + # it probably makes sense to interpolate between values iff the gap is + # <= 1 to 6 points (5 to 30 minutes) + + # put consecutive data that matches in groups + df["tempGroups"] = (( + df[episode_criterion] != df[episode_criterion].shift() + ).cumsum()) + + df["episodeId"] = ( + df["tempGroups"] * df[episode_criterion] + ) + + # group by the episode groups + episode_groups = df.groupby("episodeId") + episodes = episode_groups["roundedUtcTime"].count().reset_index() + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True) + + df = pd.merge(df, episodes, on="episodeId", how="left") + df["episodeDuration"] = ( + df["duration"] * df[episode_criterion] + ) + + # mark record as belonging to an episode + df["isEpisode"] = ( + df["episodeDuration"] >= min_duration + ) + + # get the hypo episode starts so we only count each episode once + df["episodeStart"] = ( + (df[episode_criterion]) + & (~df[episode_criterion].shift(1).fillna(False)) + ) + + # calculate the total duration and attach to start record + # which is needed to get the average duration per episode + df["episodeTotalDuration"] = ( + df["episodeStart"] * df["episodeDuration"] + ) + df["episodeTotalDuration"].replace(0, np.nan, inplace=True) + + episode_prefix = ( + "episode." + episode_criterion + + ".durationThreshold=" + str(min_duration) + "." 
+ ) + + df = df[[ + "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration" + ]].add_prefix(episode_prefix) + + return df + + +def get_slope(y): + if "array" not in type(y).__name__: + raise TypeError('Expecting a numpy array') + + count_ = len(y) + + x = np.arange(start=0, stop=count_*5, step=5) + + sum_x = x.sum() + sum_y = y.sum() + sum_xy = (x * y).sum() + sum_x_squared = (x * x).sum() + + slope = ( + ((count_ * sum_xy) - (sum_x * sum_y)) + / ((count_ * sum_x_squared) - (sum_x * sum_x)) + ) + + return slope + + +def expand_entire_dict(ts): + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + notnull_idx = ts.index[ts.notnull()] + temp_df = pd.DataFrame( + ts[notnull_idx].tolist(), + index=notnull_idx + ) + + return temp_df + + +def expand_embedded_dict(ts, key_): + '''Expanded a single field that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process + ''' + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? + # though, the current method is fairly quick and compact + temp_df = expand_entire_dict(ts) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." notation is used to reference nested json + + ''' + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts + + +def add_upload_info_to_cgm_records(groups, df): + upload_locations = [ + "upload.uploadId", + "upload.deviceManufacturers", + "upload.deviceModel", + "upload.deviceSerialNumber", + "upload.deviceTags" + ] + + if "upload" in groups["type"].unique(): + upload = groups.get_group("upload").dropna(axis=1, how="all").add_prefix("upload.") + df = pd.merge( + left=df, + right=upload[list(set(upload_locations) & set(list(upload)))], + left_on="uploadId", + right_on="upload.uploadId", + how="left" + ) + + return df + + +def expand_heathkit_cgm_fields(df): + # TODO: refactor the code/function that originally grabs + # these fields, so we are only doing it once, and so + # we don't have to drop the columns for the code below to work. 
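    # Illustrative sketch (hypothetical values) of what the expansions below
    # produce: expand_entire_dict() turns a column of dicts into one column
    # per key, and add_prefix() namespaces those keys so that deeper levels
    # can be expanded on a later pass, e.g.
    #
    #     toy = pd.Series([{"name": "CGM", "payload": {"model": "G6"}}])
    #     flat = pd.DataFrame(toy.tolist()).add_prefix("origin.")
    #     # flat now has columns "origin.name" and "origin.payload"; the
    #     # embedded "origin.payload" dict can then be expanded the same way.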
+ drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(df): + df.drop(columns=[drop_col], inplace=True) + + healthkit_locations = [ + "origin", + "origin.payload", + "origin.payload.device", + "origin.payload.sourceRevision", + "origin.payload.sourceRevision.source", + "payload", + ] + + for hk_loc in healthkit_locations: + if hk_loc in list(df): + temp_df = ( + expand_entire_dict(df[hk_loc].copy()).add_prefix(hk_loc + ".") + ) + df = pd.concat([df, temp_df], axis=1) + + return df + + +def get_dexcom_cgm_model(df): + # add cgm model + + dexcom_model_locations = [ + "deviceId", + "deviceManufacturers", + "upload.deviceManufacturers", + "deviceModel", + "upload.deviceModel", + "deviceSerialNumber", + "upload.deviceSerialNumber", + "origin.payload.sourceRevision.source.name", + "payload.transmitterGeneration", + "payload.HKMetadataKeySyncIdentifier", + "payload.transmitterId", + ] + + for model_location in dexcom_model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + # G4 + g4_idx = str_list.contains("G4", case=False, na=False) + df.loc[g4_idx, "cgmModel"] = "G4" + df.loc[g4_idx, "cgmModelSensedFrom"] = model_location + + # G5 + g5_idx = str_list.contains("G5", case=False, na=False) + df.loc[g5_idx, "cgmModel"] = "G5" + df.loc[g5_idx, "cgmModelSensedFrom"] = model_location + + # G6 + g6_idx = str_list.contains("G6", case=False, na=False) + df.loc[g6_idx, "cgmModel"] = "G6" + df.loc[g6_idx, "cgmModelSensedFrom"] = model_location + + # edge case of g5 and g6 + g5_g6_idx = (g5_idx & g6_idx) + df.loc[g5_g6_idx, "cgmModel"] = "G5_G6" + df.loc[g5_g6_idx, "cgmModelSensedFrom"] = model_location + + # case of "transmitterId" + if ( + ("transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # if length of string is 5, then it is likely a G4 sensor + length5_idx = str_list.len() == 5 + df.loc[length5_idx, "cgmModel"] = "G4" + df.loc[length5_idx, "cgmModelSensedFrom"] = model_location + + # if length of string > 5 then might be G5 or G6 + length_gt5_idx = str_list.len() > 5 + + # if sensor stats with 4 then likely G5 + starts4_idx = str_list.startswith("4") + df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5" + df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location + + # if sensor stats with 2 or 8 then likely G6 + starts2_6_idx = ( + (str_list.startswith("2")) | (str_list.startswith("8")) + ) + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6" + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] + + +def get_non_dexcom_cgm_model(df): + # non-dexcom cgm model query + model_locations = ["deviceId"] + + # model types (NOTE: for medtronic getting pump type not cgm) + models_670G = "MMT-158|MMT-178" + models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" + models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" + models_530G = ( + "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754" + ) + models_523_723 = "MedT-523|MedT-723|Revel - 523|Revel - 723" # 523/723 + models_libre = "AbbottFreeStyleLibre" + models_animas = "IR1295" + # NOTE: 
the tandem G4 will first be written as G5_G6, + # but the logic should overwrite back to G4 + models_tandem_G5_G6 = "tandem" + models_tandem_G4 = "4628003|5448003" + + non_dex_models = [ + models_670G, models_640G, models_630G, models_530G, models_523_723, + models_libre, models_animas, models_tandem_G5_G6, models_tandem_G4 + ] + + non_dex_model_names = [ + "670G", "640G", "630G", "530G", "523_723", + "LIBRE", "G4", "G5_G6", "G4" + ] + + for model_location in model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): + + model_idx = str_list.contains(non_dex_model, na=False) + df.loc[model_idx, "cgmModel"] = model_name + df.loc[model_idx, "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + TODO: I think that durations are coming in as floats too, so we need + to refactor to account for that. 
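    Illustrative example (hypothetical values): a physical activity record
    can carry an embedded dict rather than a number in "duration", e.g.
        {"units": "minutes", "value": 30}
    which is why the comparison below is restricted to rows whose duration
    has type "int"; dict-valued (and null) durations are never compared
    against zero.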
+ ''' + if "duration" in list(df): + type_ = df["duration"].apply(get_type) + valid_index = ((type_ == "int") & (df["duration"].notnull())) + n_negative_durations = sum(df.loc[valid_index, "duration"] < 0) + if n_negative_durations > 0: + df = df[~(df.loc[valid_index, "duration"] < 0)] + else: + n_negative_durations = np.nan + + return df, n_negative_durations + + +def tslim_calibration_fix(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored to only expand one field + ''' + + # expand payload field one level + if "payload" in list(df): + df["payload.calibration_reading"] = ( + expand_embedded_dict(df["payload"], "calibration_reading") + ) + + if df["payload.calibration_reading"].notnull().sum() > 0: + + search_for = ['tan'] + tandem_data_index = ( + (df["deviceId"].str.contains('|'.join(search_for))) + & (df["type"] == "deviceEvent") + ) + + cal_index = df["payload.calibration_reading"].notnull() + valid_index = tandem_data_index & cal_index + + n_cal_readings = sum(valid_index) + + if n_cal_readings > 0: + # if reading is > 30 then it is in the wrong units + if df["payload.calibration_reading"].min() > 30: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + / MGDL_PER_MMOLL + ) + else: + df.loc[cal_index, "value"] = ( + df.loc[valid_index, "payload.calibration_reading"] + ) + else: + n_cal_readings = 0 + else: + n_cal_readings = 0 + return df, n_cal_readings + + +def replace_smoothed_cgm_values(df): + + if 'payload.realTimeValue' in list(df): + raw_val_idx = df['payload.realTimeValue'].notnull() + n_replaced = raw_val_idx.sum() + df.loc[raw_val_idx, "mg/dL"] = ( + df.loc[raw_val_idx, "payload.realTimeValue"] + ) + else: + n_replaced = np.nan + + raw_values = df["mg/dL"] + + return raw_values, n_replaced + + +def get_healthkit_timezone(df): + ''' + TODO: refactor to account for more efficient way to get embedded json + ''' + if "payload" in list(df): + df["payload.HKTimeZone"] = ( + expand_embedded_dict(df["payload"], "HKTimeZone") + ) + if "timezone" not in list(df): + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + else: + if "payload.HKTimeZone" in list(df): + hk_tz_idx = df["payload.HKTimeZone"].notnull() + df.loc[hk_tz_idx, "timezone"] = ( + df.loc[hk_tz_idx, "payload.HKTimeZone"] + ) + df.loc[hk_tz_idx, "deviceType"] = "healthkit" + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + else: + df["timezone"] = np.nan + df["deviceType"] = np.nan + + return df[["timezone", "deviceType"]] + + +def get_and_fill_timezone(df): + ''' + this is new to deal with healthkit data + requires that a data frame that contains payload and HKTimeZone is passed + ''' + df = get_healthkit_timezone(df) + + df["timezone"].fillna(method='ffill', inplace=True) + df["timezone"].fillna(method='bfill', inplace=True) + + return df["timezone"] + + +def make_tz_unaware(date_time): + return date_time.replace(tzinfo=None) + + +def to_utc_datetime(df): + ''' + this is new to deal with perfomance issue with the previous method + of converting to string to datetime with pd.to_datetime() + ''' + utc_time_tz_aware 
= pd.to_datetime( + df["time"], + format="%Y-%m-%dT%H:%M:%S", + utc=True + ) + utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) + + return utc_tz_unaware + + +# apply the large timezone offset correction (AKA Darin's fix) +def timezone_offset_bug_fix(df): + ''' + this is taken from estimate-local-time.py + TODO: add in unit testing where there is no TZP that is > 840 or < -720 + ''' + + if "timezoneOffset" in list(df): + + while ((df.timezoneOffset > 840).sum() > 0): + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["conversionOffset"]] + - (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 + ) + + while ((df.timezoneOffset < -720).sum() > 0): + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["conversionOffset"]] + + (1440 * 60 * 1000) + ) + + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( + df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 + ) + + return df + + +def get_local_time(df): + + tzo = df[['utcTime', 'inferredTimezone']].apply( + lambda x: get_timezone_offset(*x), axis=1 + ) + local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") + + return local_time + + +def round_time( + df, + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False +): + ''' + A general purpose round time function that rounds the "time" + field to nearest minutes + INPUTS: + * a dataframe (df) or time series that contains only one time field + that you want to round + * time_interval_minutes (defaults to 5 minutes given that most cgms + output every 5 minutes) + * start_with_first_record starts the rounding with the first record + if True, and the last record if False (defaults to True) + * return_calculation_columns specifies whether the extra columns + used to make calculations are returned + refactored name(s) to meet style guide + ''' + # if a time series is passed in, convert to dataframe + if "Series" in get_type(df): + df = pd.DataFrame(df) + columns_ = list(df) + if len(columns_) > 1: + sys.exit( + "Error: df should only have one time column" + ) + else: + df.rename(columns={columns_[0]: "t"}, inplace=True) + + df.sort_values( + by="t", + ascending=start_with_first_record, + inplace=True + ) + + df.reset_index(drop=False, inplace=True) + df.rename(columns={"index": "originalIndex"}, inplace=True) + + # calculate the time between consecutive records + df["t_shift"] = df["t"].shift(1) + df["timeBetweenRecords"] = round( + (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) + + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) + ) * time_interval_minutes + + # separate the data into chunks if timeBetweenRecords is greater than + # 2 times the minutes so the rounding process + # starts over + big_gaps = list( + df.query("abs(timeBetweenRecords) > " + + str(time_interval_minutes * 2)).index + ) + big_gaps.insert(0, 0) + big_gaps.append(len(df)) + + for gap_index in range(0, len(big_gaps) - 1): + chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] + first_chunk = df["t"][big_gaps[gap_index]] + + # calculate the time difference between + # each time record and the first record + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] = ( + (chunk - first_chunk).dt.days*(86400/60) + + (chunk - first_chunk).dt.seconds/60 + ) + + # then round to the nearest X Minutes + # NOTE: the ".000001" 
ensures that mulitples of 2:30 always rounds up. + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ] = round( + (df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "minutesFromFirstRecord" + ] / time_interval_minutes) + 0.000001 + ) * (time_interval_minutes) + + rounded_first_record = ( + first_chunk + pd.Timedelta("1microseconds") + ).round(str(time_interval_minutes) + "min") + + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedTime" + ] = rounded_first_record + pd.to_timedelta( + df.loc[ + big_gaps[gap_index]:big_gaps[gap_index+1], + "roundedMinutesFromFirstRecord" + ], unit="m" + ) + + if return_calculation_columns is False: + df.drop( + columns=[ + "timeBetweenRecords", + "minutesFromFirstRecord", + "roundedMinutesFromFirstRecord" + ], inplace=True + ) + # sort back to the original index + df.sort_values(by="originalIndex", inplace=True) + + return df["roundedTime"].values + + +def add_upload_time(df): + ''' + this is taken from a colab notebook that is not in our github + given that it has been refactored to account for bug where there are + no upload records + NOTE: this is a new fix introduced with healthkit data...we now have + data that does not have an upload record + + ''' + + if "upload" in df.type.unique(): + upload_times = pd.DataFrame( + df[df.type == "upload"].groupby("uploadId")["utcTime"].max() + ) + else: + upload_times = pd.DataFrame(columns=["utcTime"]) + + unique_uploadIds = set(df["uploadId"].unique()) + unique_uploadRecords = set( + df.loc[df["type"] == "upload", "uploadId"].unique() + ) + uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords + + for upId in uploadIds_missing_uploadRecords: + last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() + upload_times.loc[upId, "utcTime"] = last_upload_time + + upload_times.reset_index(inplace=True) + upload_times.rename( + columns={"utcTime": "uploadTime", + "index": "uploadId"}, + inplace=True + ) + + df = pd.merge(df, upload_times, how='left', on='uploadId') + + return df["uploadTime"].values + + +def remove_invalid_cgm_values(df): + + nBefore = len(df) + # remove values < 38 and > 402 mg/dL + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] < 38))].index) + df = df.drop(df[((df.type == "cbg") & (df["mg/dL"] > 402))].index) + nRemoved = nBefore - len(df) + + return df, nRemoved + + +def removeDuplicates(df, criteriaDF): + nBefore = len(df) + df = df.loc[~(df[criteriaDF].duplicated())] + df = df.reset_index(drop=True) + nDuplicatesRemoved = nBefore - len(df) + + return df, nDuplicatesRemoved + + +def removeCgmDuplicates(df, timeCriterion, valueCriterion="value"): + if timeCriterion in df: + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + dfIsNull = df[df[timeCriterion].isnull()] + dfNotNull = df[df[timeCriterion].notnull()] + dfNotNull, nDuplicatesRemoved = ( + removeDuplicates(dfNotNull, [timeCriterion, valueCriterion]) + ) + df = pd.concat([dfIsNull, dfNotNull]) + df.sort_values(by=[timeCriterion, "uploadTime"], + ascending=[False, False], + inplace=True) + else: + nDuplicatesRemoved = 0 + + return df, nDuplicatesRemoved + + +# get rid of spike data +def remove_spike_data(df): + if "origin" in list(df): + nBefore = len(df) + spike_locations = [ + "origin.payload.device.name", + "origin.payload.device.manufacturer", + "origin.payload.sourceRevision.source.name", + ] + for spike_loc in spike_locations: + df[spike_loc] = get_embedded_field(df["origin"], spike_loc) + notnull_idx = 
df[spike_loc].notnull() + df_notnull = df[notnull_idx] + is_spike = df_notnull[spike_loc].astype(str).str.lower().str.contains("spike") + spike_idx = df_notnull[is_spike].index + df.drop(spike_idx, inplace=True) + + nRemoved = nBefore - len(df) + + else: + nRemoved = np.nan + + return df, nRemoved + + +# %% ESTIMATE LOCAL TIME FUNCTIONS +def convert_deprecated_timezone_to_alias(df, tzAlias): + if "timezone" in df: + uniqueTimezones = df.timezone.unique() + uniqueTimezones = uniqueTimezones[pd.notnull(df.timezone.unique())] + + for uniqueTimezone in uniqueTimezones: + alias = tzAlias.loc[tzAlias.tz.str.endswith(uniqueTimezone), + ["alias"]].values + if len(alias) == 1: + df.loc[df.timezone == uniqueTimezone, ["timezone"]] = alias + + return df + + +def create_contiguous_day_series(df): + first_day = df["date"].min() + last_day = df["date"].max() + rng = pd.date_range(first_day, last_day).date + contiguousDaySeries = \ + pd.DataFrame(rng, columns=["date"]).sort_values( + "date", ascending=False).reset_index(drop=True) + + return contiguousDaySeries + + +def add_device_type(df): + col_headings = list(df) + if "deviceType" not in col_headings: + df["deviceType"] = np.nan + if "deviceTags" in col_headings: + # first make sure deviceTag is in string format + df["deviceTags"] = df.deviceTags.astype(str) + # filter by type not null device tags + ud = df[df["deviceTags"].notnull()].copy() + # define a device type (e.g., pump, cgm, or healthkit) + ud.loc[ + ((ud["deviceTags"].str.contains("pump")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "pump" + + # define a device type (e.g., cgm) + ud.loc[ + ((ud["deviceTags"].str.contains("cgm")) + & (ud["deviceType"].isnull())), + ["deviceType"] + ] = "cgm" + + return ud["deviceType"] + else: + return np.nan + + +def get_timezone_offset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int( + tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") + ) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def add_device_day_series(df, dfContDays, deviceTypeName): + if len(df) > 0: + dfDayGroups = df.groupby("date") + if "timezoneOffset" in df: + dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) + else: + dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) + dfDaySeries.index.name = "date" + + if "upload" in deviceTypeName: + if (("timezone" in df) & (df["timezone"].notnull().sum() > 0)): + dfDaySeries["timezone"] = ( + dfDayGroups.timezone.describe()["top"] + ) + # get the timezone offset for the timezone + for i in dfDaySeries.index: + if pd.notnull(dfDaySeries.loc[i, "timezone"]): + tzo = get_timezone_offset( + pd.to_datetime(i), + dfDaySeries.loc[i, "timezone"]) + dfDaySeries.loc[i, ["timezoneOffset"]] = tzo + if "timeProcessing" in dfDaySeries: + dfDaySeries["timeProcessing"] = \ + dfDayGroups.timeProcessing.describe()["top"] + else: + dfDaySeries["timeProcessing"] = np.nan + + + dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). 
\ + rename(columns={deviceTypeName + ".date": "date"}) + + dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), + on="date", how="left") + + else: + dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan + + return dfContDays + + +def impute_upload_records(df, contDays, deviceTypeName): + daySeries = \ + add_device_day_series(df, contDays, deviceTypeName) + + if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): + for i in daySeries.index[1:]: + if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): + daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( + daySeries.loc[i-1, deviceTypeName + ".timezone"] + ) + if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): + tz = daySeries.loc[i, deviceTypeName + ".timezone"] + tzo = get_timezone_offset( + pd.to_datetime(daySeries.loc[i, "date"]), + tz + ) + daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo + + if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): + daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ + daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] + + else: + daySeries[deviceTypeName + ".timezone"] = np.nan + daySeries[deviceTypeName + ".timeProcessing"] = np.nan + + return daySeries + + +def add_home_timezone(df, contDays): + + if (("timezone" in df) & (df["timezone"].notnull().sum()> 0)): + homeTimezone = df["timezone"].describe()["top"] + tzo = contDays.date.apply( + lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) + + contDays["home.imputed.timezoneOffset"] = tzo + contDays["home.imputed.timezone"] = homeTimezone + + else: + contDays["home.imputed.timezoneOffset"] = np.nan + contDays["home.imputed.timezone"] = np.nan + contDays["home.imputed.timeProcessing"] = np.nan + + return contDays + + +def estimateTzAndTzoWithUploadRecords(cDF): + + cDF["est.type"] = np.nan + cDF["est.gapSize"] = np.nan + cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] + cDF["est.annotations"] = np.nan + + if "upload.timezone" in cDF: + cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" + cDF["est.timezone"] = cDF["upload.timezone"] + cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] + else: + cDF["est.timezone"] = np.nan + cDF["est.timeProcessing"] = np.nan + + cDF.loc[((cDF["est.timezoneOffset"] != + cDF["home.imputed.timezoneOffset"]) & + (pd.notnull(cDF["est.timezoneOffset"]))), + "est.annotations"] = "travel" + + return cDF + + +def assignTzoFromImputedSeries(df, i, imputedSeries): + df.loc[i, ["est.type"]] = "DEVICE" + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, imputedSeries + ".timezoneOffset"] + + df.loc[i, ["est.timezone"]] = \ + df.loc[i, imputedSeries + ".timezone"] + + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, imputedSeries + ".timeProcessing"] + + return df + + +def compareDeviceTzoToImputedSeries(df, sIdx, device): + for i in sIdx: + # if the device tzo = imputed tzo, then chose the imputed tz and tzo + # note, dst is accounted for in the imputed tzo + for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", + "healthkit.upload.imputed", "home.imputed"]: + # if the estimate has not already been made + if pd.isnull(df.loc[i, "est.timezone"]): + + if df.loc[i, device + ".timezoneOffset"] == \ + df.loc[i, imputedSeries + ".timezoneOffset"]: + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, + "tz-inferred-from-" + imputedSeries) + + # if the imputed series has a timezone estimate, then see if + # the current day is a dst change day + elif (pd.notnull(df.loc[i, imputedSeries + 
".timezone"])): + imputedTimezone = df.loc[i, imputedSeries + ".timezone"] + if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): + + dstRange = getRangeOfTZOsForTimezone(imputedTimezone) + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): + + assignTzoFromImputedSeries(df, i, imputedSeries) + + df = addAnnotation(df, i, "dst-change-day") + df = addAnnotation( + df, i, "tz-inferred-from-" + imputedSeries) + + return df + + +def estimateTzAndTzoWithDeviceRecords(cDF): + + # 2A. use the TZO of the pump or cgm device if it exists on a given day. In + # addition, compare the TZO to one of the imputed day series (i.e., the + # upload and home series to see if the TZ can be inferred) + for deviceType in ["pump", "cgm"]: + # find the indices of days where a TZO estimate has not been made AND + # where the device (e.g., pump or cgm) TZO has data + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + # compare the device TZO to the imputed series to infer time zone + cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) + + # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be + # inferred from the previous day's TZO. If the device TZO is equal to the + # previous day's TZO, AND if the previous day has a TZ estimate, use the + # previous day's TZ estimate for the current day's TZ estimate + for deviceType in ["pump", "cgm"]: + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (cDF[deviceType + ".timezoneOffset"].notnull()))].index + + cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) + + # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the + # pump and cgm tzo do not differ by more than 60 minutes. If they differ + # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we + # allow the estimates to be off by 60 minutes as there are a lot of cases + # where the devices are off because the user changes the time for DST, + # at different times + sIndices = cDF[((cDF["est.type"] == "DEVICE") & + (cDF["pump.timezoneOffset"].notnull()) & + (cDF["cgm.timezoneOffset"].notnull()) & + (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) + )].index + + tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - + cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 + + idx = tzoDiffGT60.index[tzoDiffGT60] + + cDF.loc[idx, ["est.type"]] = "UNCERTAIN" + for i in idx: + cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") + + return cDF + + +def imputeTzAndTzo(cDF): + + sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + if len(hasTzoIndices) > 0: + if len(sIndices) > 0: + lastDay = max(sIndices) + + while ((sIndices.min() < max(hasTzoIndices)) & + (len(sIndices) > 0)): + + currentDay, prevDayWithDay, nextDayIdx = \ + getImputIndices(cDF, sIndices, hasTzoIndices) + + cDF = imputeByTimezone(cDF, currentDay, + prevDayWithDay, nextDayIdx) + + sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & + (~cDF["est.annotations"].str.contains( + "unable-to-impute-tzo").fillna(False)))].index + + hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index + + # try to impute to the last day (earliest day) in the dataset + # if the last record has a timezone that is the home record, then + # impute using the home timezone + if len(sIndices) > 0: + currentDay = min(sIndices) + prevDayWithDay = currentDay - 1 + gapSize = lastDay - currentDay + + for i in range(currentDay, lastDay + 1): + if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ + cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = localTime + df.loc[dIdx, ["est.timezoneOffset"]] = tzo + return df + + +def applyLocalTimeEstimates(df, cDF): + df = pd.merge(df, cDF, how="left", on="date") + df["est.localTime"] = \ + df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") + + df = correctEstimatesAroundDst(df, cDF) + + return df["est.localTime"].values + + +def isDSTChangeDay(currentDate, currentTimezone): + tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), + currentTimezone) + tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + + dt.timedelta(days=-1), currentTimezone) + + return (tzoCurrentDay != tzoPreviousDay) + + +def tzoRangeWithComparisonTz(df, i, comparisonTz): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + if pd.notnull(comparisonTz): + rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) + else: + comparisonTz = np.nan + rangeTzos = np.array([]) + + return rangeTzos + + +def tzAndTzoRangePreviousDay(df, i): + # if we have a previous timezone estimate, then calcuate the range of + # timezone offset values for that time zone + comparisonTz = df.loc[i-1, "est.timezone"] + + rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) + + return comparisonTz, rangeTzos + + +def assignTzoFromPreviousDay(df, i, previousDayTz): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezone"]] = previousDayTz + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) + + df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] + df = addAnnotation(df, i, "tz-inferred-from-prev-day") + + return df + + +def assignTzoFromDeviceTzo(df, i, device): + + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + df = addAnnotation(df, i, "likely-travel") + df = addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. + if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
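+                    # (illustrative note added for clarity, not part of the
+                    # original logic: for a home timezone such as
+                    # "US/Pacific", the dstRange returned by
+                    # getRangeOfTZOsForTimezone spans -480 (PST) to -420
+                    # (PDT) in 15-minute steps, so a device tzo of -480
+                    # paired with a home-imputed tzo of -420 on a day that
+                    # is not a DST change day falls into the first branch
+                    # below and is annotated "likely-dst-error-OR-travel")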
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i, "home.imputed.timezoneOffset"] == + min(dstRange)) | + (df.loc[i, "home.imputed.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + + else: + df = assignTzoFromDeviceTzo(df, i, device) + + return df + + +def getImputIndices(df, sIdx, hIdx): + + lastDayIdx = len(df) - 1 + + currentDayIdx = sIdx.min() + tempList = pd.Series(hIdx) - currentDayIdx + prevDayIdx = currentDayIdx - 1 + nextDayIdx = \ + min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) + + return currentDayIdx, prevDayIdx, nextDayIdx + + +def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): + + gapSize = (nextDaywData - currentDay) + + if prevDaywData >= 0: + + if df.loc[prevDaywData, "est.timezone"] == \ + df.loc[nextDaywData, "est.timezone"]: + + tz = df.loc[prevDaywData, "est.timezone"] + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezone"]] = tz + + df.loc[i, ["est.timezoneOffset"]] = \ + getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + # TODO: this logic should be updated to handle the edge case + # where the day before and after the gap have differing TZ, but + # the same TZO. 
In that case the gap should be marked as UNCERTAIN + elif df.loc[prevDaywData, "est.timezoneOffset"] == \ + df.loc[nextDaywData, "est.timezoneOffset"]: + + for i in range(currentDay, nextDaywData): + + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[prevDaywData, "est.timezoneOffset"] + + df.loc[i, ["est.type"]] = "IMPUTE" + + df = addAnnotation(df, i, "gap=" + str(gapSize)) + df.loc[i, ["est.gapSize"]] = gapSize + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + else: + for i in range(currentDay, nextDaywData): + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "unable-to-impute-tzo") + + return df + + +def addAnnotation(df, idx, annotationMessage): + if pd.notnull(df.loc[idx, "est.annotations"]): + df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ + ", " + annotationMessage + else: + df.loc[idx, ["est.annotations"]] = annotationMessage + + return df + + +def getTimezoneOffset(currentDate, currentTimezone): + + tz = pytz.timezone(currentTimezone) + # here we add 1 day to the current date to account for changes to/from DST + tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + + return tzo + + +def estimate_local_time(df): + df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later + contiguous_days = create_contiguous_day_series(df) + + df["deviceType"] = add_device_type(df) + cDays = add_device_day_series(df, contiguous_days, "upload") + + # create day series for cgm df + if "timezoneOffset" not in list(df): + df["timezoneOffset"] = np.nan + + cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + +# %% MAIN FUNCTION +def get_distribution_and_stats( + json_data_path, + userid, + date_stamp, + save_data_path +): + + phi_date = "PHI-" + date_stamp + + output_metadata = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-metadata" + ) + + output_distribution = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-distributions" + ) + debug_duplicates = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-debug-cgm-duplicates" + ) + output_stats = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-stats" + ) + + make_folder_if_doesnt_exist( + [output_metadata, output_distribution, debug_duplicates, output_stats] + ) + + timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False + ) + + donor_metadata_columns = [ + 'userid', + 'diagnosisType', + 'diagnosisDate', + 'biologicalSex', + 'birthday', + 'targetTimezone', + 'targetDevices', + 'isOtherPerson', + ] + + # load in data + data = pd.read_json(json_data_path) + + # load in donor metadata + all_donor_metadata = pd.read_csv( + os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-donor-metadata.csv"), + low_memory=False + ) + + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns + ] + + print("starting", userid) + + # HASH USER ID + hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) + data["userid"] = userid + data["hashid"] = hashid + metadata["hashid"] = hashid + + # CLEAN DATA + + # NOTE: moving remove negative durations to type specific cleaning + # TODO: ask backend to change "duration" to only include one object type + + # Tslim calibration bug fix + data, n_cal_readings = tslim_calibration_fix(data.copy()) + metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + # fix large timzoneOffset bug in utcbootstrapping + data = timezone_offset_bug_fix(data.copy()) + + # add healthkit timezome information + # TODO: refactor this function to only require fields that might have hk tz + data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) + + # convert deprecated timezones to their aliases + data = convert_deprecated_timezone_to_alias(data, timezone_aliases) + + # TIME RELATED ITEMS + data["utcTime"] = to_utc_datetime(data[["time"]].copy()) + + # add upload time to the data, which is needed for: + # getting rid of duplicates and useful for getting local time + + data["uploadTime"] = ( + add_upload_time(data[["type", "uploadId", "utcTime"]].copy()) + ) + +# # estimate local time (refactor of estimate-local-time.py) +# data["localTime"], local_time_metadata = estimate_local_time(data.copy()) +# +# TODO: fix this issue with estimate local time +# ''' +# //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649 +# FutureWarning: elementwise comparison failed; returning scalar instead, +# but in the future will perform elementwise comparison result = method(y) +# ''' + + # round all data to the nearest 5 minutes + data["roundedUtcTime"] = round_time( + data["utcTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False + ) + + # TIME CATEGORIES + data["date"] = data["roundedUtcTime"].dt.date + + # AGE, & YLW + # TODO: make this a function + if metadata["birthday"].values[0] is not np.nan: + bDate = 
pd.to_datetime(metadata["birthday"].values[0][0:7]) + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + else: + data["age"] = np.nan + + if metadata["diagnosisDate"].values[0] is not np.nan: + dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) + data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) + else: + data["ylw"] = np.nan + + # GROUP DATA BY TYPE + # first sort by upload time (used when removing dumplicates) + data.sort_values("uploadTime", ascending=False, inplace=True) + groups = data.groupby(by="type") + + # check to see if person is looping + if "basal" in data["type"].unique(): + basal = groups.get_group("basal").dropna(axis=1, how="all") + if "deliveryType" in list(basal): + bd = basal.loc[ + basal["deliveryType"] == "temp", + ["date", "deliveryType"] + ] + temp_basal_counts = ( + pd.DataFrame( + bd.groupby("date").deliveryType.count() + ).reset_index() + ) + temp_basal_counts.rename( + {"deliveryType": "tempBasalCounts"}, + axis=1, + inplace=True + ) + data = pd.merge(data, temp_basal_counts, on="date", how="left") + # >= 25 temp basals per day is likely looping + data["isLoopDay"] = data["tempBasalCounts"] >= 25 + # redefine groups with the new data + groups = data.groupby(by="type") + + else: + data["isLoopDay"] = np.nan + else: + data["isLoopDay"] = np.nan + + # %% CGM DATA + if "cbg" in data["type"].unique(): + # sort data with + metadata["cgmData"] = True + + # filter by cgm + cgm = groups.get_group("cbg").copy() + + # sort data + cgm.sort_values("roundedUtcTime", ascending=False, inplace=True) + cgm.reset_index(drop=False, inplace=True) + + # calculate cgm in mg/dL + cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) + + # get rid of spike data + cgm, nSpike = remove_spike_data(cgm.copy()) + metadata["nSpike"] = nSpike + + # assign upload cgm device info to cgm records in that upload + cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) + + # check to see if cgm info exists in healthkit locations + cgm = expand_heathkit_cgm_fields(cgm.copy()) + + # replace smoothed cgm values with raw values (if they exist) + # this must run after expand_heathkit_cgm_fields _ + cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = ( + replace_smoothed_cgm_values(cgm.copy()) + ) + + # get cgm models + cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan + + # dexcom cgm models (G4, G5, G6) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_dexcom_cgm_model(cgm.copy()) + ) + + # for non dexcom cgms + # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_non_dexcom_cgm_model(cgm.copy()) + ) + + # get metadata on cgm models and devices + metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() + metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique()) + if "deviceId" in list(cgm): + metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) + + # clean distributions + # break up all traces by cgm model + combined_cgm_series = pd.DataFrame() + cgm_models = cgm.groupby(by="cgmModel") + + for cgm_model in cgm_models.groups.keys(): + print("working on", cgm_model) + temp_cgm = cgm_models.get_group(cgm_model) + + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) + temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm) + metadata["nInvalidCgmValues." 
+ cgm_model] = nInvalidCgmValues + + # sort by upload time before getting rid of duplicates + temp_cgm.sort_values("uploadTime", ascending=False, inplace=True) + + # get rid of duplicates that have the same ["deviceTime", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same ["time", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same roundedTime + temp_cgm, n_cgm_dups_removed = ( + removeDuplicates(temp_cgm, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # create a contiguous 5 minute time series + first_day = temp_cgm["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = temp_cgm["roundedUtcTime"].max() + metadata["lastCgm." + cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=False + ).reset_index(drop=True) + + # merge with cgm data + cgm_series = pd.merge( + contiguous_data, + temp_cgm[[ + "roundedUtcTime", "hashid", "isLoopDay", + "cgmModel", "age", "ylw", "mg/dL" + ]], + on="roundedUtcTime", + how="left" + ) + + # sort so that the oldest data point is on top + cgm_series.sort_values( + "roundedUtcTime", ascending=True, inplace=True + ) + cgm_series.reset_index(drop=True, inplace=True) + + # get dexcom icgm bins + value_bins = np.array( + [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403] + ) + value_bin_names = ( + "< 40", "40-60", "61-80", "81-120", "121-160", "161-200", + "201-250", "251-300", "301-350", "351-400", "> 400" + ) + cgm_series["valueBin"] = pd.cut( + cgm_series["mg/dL"], value_bins, labels=value_bin_names + ) + + # get the previous val + cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) + + # get difference between current and previous val + cgm_series["diffFromPrevVal"] = ( + cgm_series["mg/dL"] - cgm_series["previousVal"] + ) + + # calculate the rate from previous value (mg/dL/min) + cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5 + + # get dexcom icgm rate bins + rate_bins = np.array( + [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100] + ) + # NOTE: bracket means include, parentheses means exclude + rate_bin_names = ( + "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2", + ) + cgm_series["rateBin"] = pd.cut( + cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names + ) + + # through in the join category + cgm_series["valAndRateBin"] = ( + cgm_series["valueBin"].astype(str) + + " & " + + cgm_series["rateBin"].astype(str) + ) + + # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes + cgm_series["slope15"] = ( + cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True) + ) + + cgm_series["slope30"] = ( + cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True) + ) + + cgm_series["slope60"] = ( + cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True) + ) + + # add in the next value + cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1) + + # get difference or relative increase/decrease of next value + cgm_series["relativeNextValue"] = ( + cgm_series["nextVal"] - cgm_series["mg/dL"] + ) + + # rate of next 
value + cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5 + + # drop rows where there is no information + cgm_series.dropna(subset=['hashid'], inplace=True) + metadata["nCgmDataPoints." + cgm_model] = len(cgm_series) + + # append cgm model to a larger table + combined_cgm_series = pd.concat( + [combined_cgm_series, cgm_series], + ignore_index=True + ) + if len(combined_cgm_series) > 0: + # sort so that the oldest data point is on top + # and that the G5_G6 get deleted if they are apart of a duplicate + combined_cgm_series["cgmModel_G5_and_G6"] = ( + combined_cgm_series["cgmModel"] == "G5_G6" + ) + combined_cgm_series.sort_values( + by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"], + ascending=[False, True, False], + inplace=True + ) + combined_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique()) + cgm_len = len(combined_cgm_series) + metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len + + nDuplicate_cgm = cgm_len - nUnique_cgm_times + metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm + + # if there are still duplicates, get rid of them + if nDuplicate_cgm > 0: + # save the duplicates for further examination + combined_cgm_series.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz" + )) + + cgm.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz" + )) + + # get rid of duplicates + combined_cgm_series, n_cgm_dups_removed = ( + removeDuplicates(combined_cgm_series, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = ( + n_cgm_dups_removed + ) + metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series) + + # add whether data is dexcom cgm or not + combined_cgm_series["dexcomCgm"] = ( + combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6") + ) + + # save distribution data + combined_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv.gz" + )) + + # %% get cgm stats + # create a contiguous 5 minute time series of ALL cgm data + first_day = combined_cgm_series["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = combined_cgm_series["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=True + ).reset_index(drop=True) + + # merge with combined_cgm_series data + all_cgm = pd.merge( + contiguous_data, + combined_cgm_series[[ + 'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm', + 'age', 'ylw', 'isLoopDay', 'mg/dL', + ]], + on="roundedUtcTime", + how="left" + ) + + # get cgm stats + # get a binary (T/F) of whether we have a cgm value + all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull() + + # fill isLoopDay nan with False + all_cgm["isLoopDay"].fillna(False, inplace=True) + + # has loop and cgm + all_cgm["hasLoopAndCgm"] = ( + (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + all_cgm["hasCgmWithoutLoop"] = ( + (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + # work with all of the non-null data, even 39 = LOW and 401 = HIGH + ts39_401 = all_cgm["mg/dL"].copy() + + # some stats should NOT include 39 or 401 + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + ts40_400 = all_cgm["mg/dL.40to400"].copy() + + + # for all the less than (<) criteria + for cgm_threshold in [40, 54, 70]: + all_cgm["cgm < " + str(cgm_threshold)] = ( + ts39_401.lt(cgm_threshold) + ) + # get episodes below these thresholds + for min_duration in [5, 15]: + episode_ts = get_episodes( + all_cgm[[ + "roundedUtcTime", + "hasCgm", + "cgm < " + str(cgm_threshold) + ]].copy(), + episode_criterion="cgm < " + str(cgm_threshold), + min_duration=min_duration + ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + + # for all the greter than or equal to (>=) criteria + all_cgm["cgm >= " + str(cgm_threshold)] = ( + ts39_401.ge(cgm_threshold) + ) + + # for all the the less than or equal to (<=) criteria + for cgm_threshold in [140, 180, 250, 300, 400]: + all_cgm["cgm <= " + str(cgm_threshold)] = ( + ts39_401.le(cgm_threshold) + ) + # for all the the greter than (>) criteria + all_cgm["cgm > " + str(cgm_threshold)] = ( + ts39_401.gt(cgm_threshold) + ) + + # get all of the cgm ranges + # (cgm >= 40) & (cgm < 54) + all_cgm["40 <= cgm < 54"] = ( + (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"]) + ) + + # (cgm >= 54) & (cgm < 70) + all_cgm["54 <= cgm < 70"] = ( + (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"]) + ) + + # (cgm >= 70) & (cgm <= 140) + all_cgm["70 <= cgm <= 140"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"]) + ) + + # (cgm >= 70) & (cgm <= 180) + all_cgm["70 <= cgm <= 180"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"]) + ) + + # (cgm > 180) & (cgm <= 250) + all_cgm["180 < cgm <= 250"] = ( + (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"]) + ) + + # (cgm > 250) & (cgm <= 400) + all_cgm["250 < cgm <= 400"] = ( + (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"]) + ) + + # derfine the windows to calculate the stats over + window_names = ["hour", "day", "week", "month", "quarter", "year"] + window_lengths = [12, 288, 288*7, 288*7*4, 288*90, 288*365] + + for w_name, w_len in zip(window_names, window_lengths): + # require lenth of window for percent calculations + w_min = w_len + + # get the start and end times for each window + all_cgm[w_name + ".startTime"] = ( + all_cgm["roundedUtcTime"].shift(w_len - 1) + ) + all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"] + + # add majority age for the time period + all_cgm[w_name + ".age"] = np.round( + 
all_cgm["age"].rolling( + min_periods=1, + window=w_len + ).mean() + ) + + # add majority ylw for the time period + all_cgm[w_name + ".ylw"] = np.round( + all_cgm["ylw"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get percent time cgm used + all_cgm[w_name + ".cgmPercent"] = ( + all_cgm["hasCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".missingCgmPercent"] = ( + 1 - all_cgm[w_name + ".cgmPercent"] + ) + + # create (T/F) 70 and 80 percent available thresholds + # which will be useful for processing later + all_cgm[w_name + ".ge70Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.7 + ) + + all_cgm[w_name + ".ge80Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.8 + ) + + # get percent time Loop was used NOTE: this is + # approximate because we use > 24 temp basals per day + # ALSO: this is percent time Loop was used while cgm in use + all_cgm[w_name + ".loopingAndCgmPercent"] = ( + all_cgm["hasLoopAndCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent of time cgm without loop + all_cgm[w_name + ".cgmWithoutLoopPercent"] = ( + all_cgm["hasCgmWithoutLoop"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get episode stats + # TODO: add in hyper events + # get episodes below these thresholds + for cgm_threshold in [40, 54, 70]: + # get number of episodes per time window + for min_duration in [5, 15]: + "cgm < " + str(cgm_threshold) + episode_name = ( + "episode.cgm < " + str(cgm_threshold) + + ".durationThreshold=" + str(min_duration) + ) + all_cgm[w_name + ".count." + episode_name] = ( + all_cgm[episode_name + ".episodeStart"].rolling( + min_periods=1, + window=w_len + ).sum() + ) + + # get avg. duration of each episode per time window + all_cgm[w_name + ".avgDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).sum() / all_cgm[w_name + ".count." + episode_name] + ) + + # get min duration of each episode per time window + all_cgm[w_name + ".minDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).min() + ) + + # get median duration of each episode per time window + all_cgm[w_name + ".medianDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get max duration of each episode per time window + all_cgm[w_name + ".maxDuration." 
+ episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).max() + ) + + # get percent time in different ranges + # % Time < 54 + all_cgm[w_name + ".lt54Percent"] = ( + all_cgm["cgm < 54"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 54-70 (cgm >= 54) & (cgm < 70) + all_cgm[w_name + ".bt54_70Percent"] = ( + all_cgm["54 <= cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 180) + all_cgm[w_name + ".bt70_180Percent"] = ( + all_cgm["70 <= cgm <= 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 180-250 (cgm > 180) & (cgm <= 250) + all_cgm[w_name + ".bt180_250Percent"] = ( + all_cgm["180 < cgm <= 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time > 250 + all_cgm[w_name + ".gt250Percent"] = ( + all_cgm["cgm > 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # check that all of the percentages add of to 1 or 100% + all_cgm[w_name + ".percentCheck"] = ( + all_cgm[w_name + ".missingCgmPercent"] + + all_cgm[w_name + ".lt54Percent"] + + all_cgm[w_name + ".bt54_70Percent"] + + all_cgm[w_name + ".bt70_180Percent"] + + all_cgm[w_name + ".bt180_250Percent"] + + all_cgm[w_name + ".gt250Percent"] + ) + + # here are some other less common percent time in ranges + # % Time < 70 + all_cgm[w_name + ".lt70Percent"] = ( + all_cgm["cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 140) + all_cgm[w_name + ".tir70to140Percent"] = ( + all_cgm["70 <= cgm <= 140"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent time above a threshold + # % Time > 180 + all_cgm[w_name + ".gt180Percent"] = ( + all_cgm["cgm > 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + # create a rolling object + + # NOTE: these calculations only require 3 points to make + roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + roll40_400 = ts40_400.rolling(min_periods=3, window=w_len) + + # min + all_cgm[w_name + ".min"] = roll39_401.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10) + all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25) + all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75) + all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll39_401.max() + + # median + all_cgm[w_name + ".median"] = roll39_401.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + + # recalcuate percent of measurements available + all_cgm[w_name + ".40to400availablePercent"] = ( + roll40_400.count() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".40to400missingPercent"] = ( + 1 - all_cgm[w_name + ".40to400availablePercent"] + ) + + all_cgm[w_name + ".40to400ge70Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.7 + ) + + all_cgm[w_name + ".40to400ge80Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.8 + ) + + # mean + all_cgm[w_name + ".mean"] = roll40_400.mean() + + # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL] 
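+            # (worked example added for clarity: a rolling mean glucose of
+            # 150 mg/dL gives 3.31 + (0.02392 * 150) = 6.898, i.e., a GMI
+            # of roughly 6.9%)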
+ all_cgm[w_name + ".gmi"] = ( + 3.31 + (0.02392 * all_cgm[w_name + ".mean"]) + ) + + # standard deviation (std) + all_cgm[w_name + ".std"] = roll40_400.std() + + # coefficient of variation (cov) = std / mean + all_cgm[w_name + ".cov"] = ( + all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] + ) + + # %% save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) + # write the most recent example of the 90 day stats + # to the metadata + quarter_ge80Available_idx = ( + all_cgm[all_cgm["quarter.ge80Available"]] + ).index.max() + + if pd.notnull(quarter_ge80Available_idx): + # get the most recent quarter + most_recent = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + else: + most_recent = all_cgm.loc[ + [all_cgm.index.max()], + all_cgm.columns + ] + + metadata = pd.merge( + metadata, + most_recent, + on="hashid", + how="left" + ) + + print(metadata.T) + + else: + metadata["cgmData"] = False + print(userid, " has no cgm data") + + # save metadata + metadata.to_csv(os.path.join( + output_metadata, + "PHI-" + userid + "-cgm-metadata.csv.gz" + )) + + print("finished with", userid, "\n") + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get distribution and stats for donor json data" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-i", + "--input-json-data-path", + dest="json_data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the path where the json data is located" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid and filename" + ) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + args = parser.parse_args() + + # the main function + get_distribution_and_stats( + json_data_path=args.json_data_path, + userid=args.userid, + date_stamp=args.date_stamp, + save_data_path=args.data_path, + ) From 4b44457fe251605e87175f99ed3393c8c6368938 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Tue, 20 Aug 2019 17:16:50 -0500 Subject: [PATCH 38/46] skip already processed and use 1/2 of processors --- .../batch_get_cgm_distributions_and_stats.py | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index 2830fe03..502a4b47 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -61,26 +61,40 @@ def run_process(json_data_path): userid = json_data_path[-15:-5] - p = sub.Popen( - [ - "python", "get_cgm_distributions_and_stats.py", - "-i", json_data_path, - "-u", userid, - "-d", args.date_stamp, - "-o", args.data_path - ], - stdout=sub.PIPE, - stderr=sub.PIPE + # check to see if the file was already processed + phi_date_stamp = "PHI-" + args.date_stamp + + metadata_path = 
os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" ) - output, errors = p.communicate() - output = output.decode("utf-8") - errors = errors.decode("utf-8") + all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) + if userid not in str(all_metadata_files): + + p = sub.Popen( + [ + "python", "get_cgm_distributions_and_stats.py", + "-i", json_data_path, + "-u", userid, + "-d", args.date_stamp, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") - if errors == '': - print(output) + if errors == '': + print(output) + else: + print(errors) else: - print(errors) + print(userid, "was already processed") return @@ -89,13 +103,16 @@ def run_process(json_data_path): all_files = glob.glob(args.json_data_path, recursive=True) # this is a good test to make sure run process is working before running +#import pdb +#args.date_stamp = "2019-07-17" #run_process(all_files[0]) #pdb.set_trace() + # use multiple cores to process startTime = time.time() print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) -pool = Pool(os.cpu_count()) +pool = Pool(int(os.cpu_count()/2)) pool.map(run_process, all_files) pool.close() endTime = time.time() From 57dc7fd622ba759206943d1d10e834a52e52a79f Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 07:24:53 -0500 Subject: [PATCH 39/46] get results script --- ...e_cgm_distribution_and_metadata_results.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py new file mode 100644 index 00000000..e548115b --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import os +import glob +import argparse + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default="2019-07-17", + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +all_metadata = pd.DataFrame() +for f in 
all_metadata_files: + temp_meta = pd.read_csv(f) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") +) +print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA +print("combining all distribution data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +distribution_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f, index_col=[0]) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +distribution_metadata.to_csv( + os.path.join( + donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" + ) +) +print("saving all-dataset-info-metadata...code complete") From 60e6c228d81ae385e43562a405de61b66ef35f87 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 08:32:30 -0500 Subject: [PATCH 40/46] pull files into pandas with low_memory flag to make sure that the column datatypes are sensed correctly --- .../combine_cgm_distribution_and_metadata_results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py index e548115b..1ed4ad00 100644 --- a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -55,7 +55,7 @@ all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) all_metadata = pd.DataFrame() for f in all_metadata_files: - temp_meta = pd.read_csv(f) + temp_meta = pd.read_csv(f, low_memory=False) all_metadata = pd.concat( [all_metadata, temp_meta], ignore_index=True, @@ -80,7 +80,7 @@ all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) distribution_metadata = pd.DataFrame() for f in all_metadata_files: - temp_meta = pd.read_csv(f, index_col=[0]) + temp_meta = pd.read_csv(f, index_col=[0], low_memory=False) distribution_metadata = pd.concat( [distribution_metadata, temp_meta], ignore_index=True, From a984e128fe2f52dc3b9964058fdb4c1328175716 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 09:51:36 -0500 Subject: [PATCH 41/46] save results in chunks --- ...e_cgm_distribution_and_metadata_results.py | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py index 1ed4ad00..a14fd57d 100644 --- a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -8,6 +8,7 @@ # %% REQUIRED LIBRARIES import pandas as pd +import numpy as np import os import glob import argparse @@ -38,6 +39,15 @@ help="the output path where the data is stored" ) + +parser.add_argument( + "-c", + "--chunk-size", + dest="chunk_size", + default=50, + help="the output path where the data is stored" +) + args = parser.parse_args() @@ -63,9 +73,13 
@@ ) all_metadata.to_csv( - os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") + os.path.join( + donor_folder, + phi_date_stamp + + "-cgm-metadata-0-{}.csv.gz".format(str(len(all_metadata_files))) + ) ) -print("saving metadata...code complete") +print("finished saving metadata...starting distribution data...") # %% COMBINE AND SAVE ALL DISTRIBUTION DATA @@ -78,18 +92,30 @@ ) all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -distribution_metadata = pd.DataFrame() -for f in all_metadata_files: - temp_meta = pd.read_csv(f, index_col=[0], low_memory=False) - distribution_metadata = pd.concat( - [distribution_metadata, temp_meta], - ignore_index=True, - sort=False - ) - -distribution_metadata.to_csv( - os.path.join( - donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" +chunks = np.arange(0, len(all_metadata_files), args.chunk_size) +chunks = np.append(chunks, len(all_metadata_files)) +for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): + print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata = pd.DataFrame() + for c_idx in np.arange(chunk_start, chunk_end): + temp_meta = pd.read_csv( + all_metadata_files[c_idx], + index_col=[0], + low_memory=False + ) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + # save chunk + print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format( + str(chunk_start), + str(chunk_end)) + ) ) -) -print("saving all-dataset-info-metadata...code complete") +print("finished saving all-dataset-distribution-data...code complete") From 858372a9e5029aa352e9d3ac0d74fe2876df0a29 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 11:20:45 -0500 Subject: [PATCH 42/46] move files and modify print statements --- .../batch_get_cgm_distributions_and_stats.py | 1 - ...e_cgm_distribution_and_metadata_results.py | 6 +- .../combine_cgm_distribution_results.py | 92 ++ .../get_cgm_distributions_v3.py | 0 .../get_cgm_distributions_and_stats.py | 1 + .../get_stats/get_cgm_stats.py | 1361 ----------------- 6 files changed, 96 insertions(+), 1365 deletions(-) create mode 100644 projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py rename projects/bigdata-processing-pipeline/get_stats/{ => development-versions}/get_cgm_distributions_v3.py (100%) delete mode 100644 projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index 502a4b47..61894b9c 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -108,7 +108,6 @@ def run_process(json_data_path): #run_process(all_files[0]) #pdb.set_trace() - # use multiple cores to process startTime = time.time() print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py index a14fd57d..b8bac502 100644 --- 
a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -63,6 +63,7 @@ ) all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} metaata files".format(len(all_metadata_files))) all_metadata = pd.DataFrame() for f in all_metadata_files: temp_meta = pd.read_csv(f, low_memory=False) @@ -83,8 +84,6 @@ # %% COMBINE AND SAVE ALL DISTRIBUTION DATA -print("combining all distribution data") - metadata_path = os.path.join( args.data_path, phi_date_stamp + "-donor-data", @@ -92,7 +91,8 @@ ) all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -chunks = np.arange(0, len(all_metadata_files), args.chunk_size) +print("combining {} distribution data files".format(len(all_metadata_files))) +chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size)) chunks = np.append(chunks, len(all_metadata_files)) for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py new file mode 100644 index 00000000..12abb350 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import numpy as np +import os +import glob +import argparse + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default="2019-07-17", + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +parser.add_argument( + "-c", + "--chunk-size", + dest="chunk_size", + default=50, + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA + +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} distribution data files".format(len(all_metadata_files))) +chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size)) +chunks = np.append(chunks, len(all_metadata_files)) +for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): + print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata = pd.DataFrame() + for c_idx in np.arange(chunk_start, chunk_end): + temp_meta = pd.read_csv( + all_metadata_files[c_idx], + index_col=[0], + low_memory=False + ) + distribution_metadata = 
pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + # save chunk + print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format( + str(chunk_start), + str(chunk_end)) + ) + ) +print("finished saving all-dataset-distribution-data...code complete") diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py similarity index 100% rename from projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_v3.py rename to projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py index af0d0d50..8a6cf7d5 100644 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -1901,6 +1901,7 @@ def get_distribution_and_stats( ascending=[False, True, False], inplace=True ) + combined_cgm_series.reset_index(drop=True, inplace=True) # add in check to see if there are duplicates between cgm devices diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py deleted file mode 100644 index 172f4784..00000000 --- a/projects/bigdata-processing-pipeline/get_stats/get_cgm_stats.py +++ /dev/null @@ -1,1361 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -calculate cgm statsistics for a single tidepool (donor) dataset -''' - - -# %% REQUIRED LIBRARIES -import os -import sys -import hashlib -import pytz -import numpy as np -import pandas as pd -import datetime as dt - -# TODO: figure out how to get rid of these path dependcies -get_donor_data_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..") -) -if get_donor_data_path not in sys.path: - sys.path.insert(0, get_donor_data_path) -import environmentalVariables -from get_donor_data.get_single_donor_metadata import get_shared_metadata -from get_donor_data.get_single_tidepool_dataset import get_data - - -# %% CONSTANTS -MGDL_PER_MMOLL = 18.01559 - - -# %% FUNCTIONS -''' -the functions that are called in this script, -which includes notes of where the functions came from, -and whether they were refactored -''' - - -def hash_userid(userid, salt): - ''' - taken from anonymize-and-export.py - refactored name(s) to meet style guide - ''' - usr_string = userid + salt - hash_user = hashlib.sha256(usr_string.encode()) - hashid = hash_user.hexdigest() - - return hashid - - -def get_type(val): - return type(val).__name__ - - -def remove_negative_durations(df): - ''' - taken from https://github.com/tidepool-org/data-analytics/blob/ - etn/get-settings-and-events/projects/get-donors-pump-settings/ - get-users-settings-and-events.py - - refactored name(s) to meet style guide - refactored pandas field call to df["field"] instead of df.field - refactored because physical activity includes embedded json, whereas - the other fields in the data model require a integer - ''' - if "duration" in list(df): - type_ = df["duration"].apply(get_type) - valid_index = ((type_ == "int") & (df["duration"].notnull())) - n_negative_durations = 
sum(df.loc[valid_index, "duration"] < 0) - if n_negative_durations > 0: - df = df[~(df.loc[valid_index, "duration"] < 0)] - else: - n_negative_durations = np.nan - - return df, n_negative_durations - - -def expand_embedded_dict(ts, key_): - '''Expanded a single field that has embedded json - - Args: - ts: a pandas time series of the field that has embedded json - key_: the key that you want to expand - - Raise: - TypeError: if you don't pass in a pandas time series - - Returns: - key_ts: a new time series of the key of interest - - NOTE: - this is new function - TODO: - could be refactored to allow multiple keys or all keys to be returned - could be refactored for speed as the current process - ''' - - if "Series" not in type(ts).__name__: - raise TypeError('Expecting a pandas time series object') - key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) - notnull_idx = ts.notnull() - # TODO: maybe sped up by only getting the one field of interest? - # though, the current method is fairly quick and compact - temp_df = pd.DataFrame(ts[notnull_idx].tolist()) - if key_ in list(temp_df): - key_ts[notnull_idx] = temp_df[key_].values - - return key_ts - - -def get_embedded_field(ts, embedded_field): - '''get a field that is nested in more than 1 embedded dictionary (json) - - Args: - ts: a pandas time series of the field that has embedded json - embedded_field (str): the location of the field that is deeply nested - (e.g., "origin.payload.device.model") - - Raise: - ValueError: if you don't pass in a pandas time series - - Returns: - new_ts: a new time series of the key of interest - - NOTE: - this is new function - the "." notation is used to reference nested json - - ''' - field_list = embedded_field.split(".") - if len(field_list) < 2: - raise ValueError('Expecting at least 1 embedded field') - - new_ts = expand_embedded_dict(ts, field_list[1]) - for i in range(2, len(field_list)): - new_ts = expand_embedded_dict(new_ts, field_list[i]) - - return new_ts - - -def tslim_calibration_fix(df): - ''' - taken from https://github.com/tidepool-org/data-analytics/blob/ - etn/get-settings-and-events/projects/get-donors-pump-settings/ - get-users-settings-and-events.py - - refactored name(s) to meet style guide - refactored pandas field call to df["field"] instead of df.field - refactored to only expand one field - ''' - - # expand payload field one level - df["payload.calibration_reading"] = ( - expand_embedded_dict(df["payload"], "calibration_reading") - ) - - if df["payload.calibration_reading"].notnull().sum() > 0: - - search_for = ['tan'] - tandem_data_index = ( - (df["deviceId"].str.contains('|'.join(search_for))) - & (df["type"] == "deviceEvent") - ) - - cal_index = df["payload.calibration_reading"].notnull() - valid_index = tandem_data_index & cal_index - - n_cal_readings = sum(valid_index) - - if n_cal_readings > 0: - # if reading is > 30 then it is in the wrong units - if df["payload.calibration_reading"].min() > 30: - df.loc[cal_index, "value"] = ( - df.loc[valid_index, "payload.calibration_reading"] - / MGDL_PER_MMOLL - ) - else: - df.loc[cal_index, "value"] = ( - df.loc[valid_index, "payload.calibration_reading"] - ) - else: - n_cal_readings = 0 - return df, n_cal_readings - - -def get_healthkit_timezone(df): - ''' - TODO: refactor to account for more efficient way to get embedded json - ''' - df["payload.HKTimeZone"] = ( - expand_embedded_dict(df["payload"], "HKTimeZone") - ) - if "timezone" not in list(df): - if "payload.HKTimeZone" in list(df): - hk_tz_idx = 
df["payload.HKTimeZone"].notnull() - df.loc[hk_tz_idx, "deviceType"] = "healthkit" - df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True) - - else: - df["timezone"] = np.nan - df["deviceType"] = np.nan - else: - if "payload.HKTimeZone" in list(df): - hk_tz_idx = df["payload.HKTimeZone"].notnull() - df.loc[hk_tz_idx, "timezone"] = ( - df.loc[hk_tz_idx, "payload.HKTimeZone"] - ) - df.loc[hk_tz_idx, "deviceType"] = "healthkit" - else: - df["timezone"] = np.nan - df["deviceType"] = np.nan - - return df[["timezone", "deviceType"]] - - -def get_and_fill_timezone(df): - ''' - this is new to deal with healthkit data - requires that a data frame that contains payload and HKTimeZone is passed - ''' - df = get_healthkit_timezone(df) - - df["timezone"].fillna(method='ffill', inplace=True) - df["timezone"].fillna(method='bfill', inplace=True) - - return df["timezone"] - - -def make_tz_unaware(date_time): - return date_time.replace(tzinfo=None) - - -def to_utc_datetime(df): - ''' - this is new to deal with perfomance issue with the previous method - of converting to string to datetime with pd.to_datetime() - ''' - utc_time_tz_aware = pd.to_datetime( - df["time"], - format="%Y-%m-%dT%H:%M:%S", - utc=True - ) - utc_tz_unaware = utc_time_tz_aware.apply(make_tz_unaware) - - return utc_tz_unaware - - -# apply the large timezone offset correction (AKA Darin's fix) -def timezone_offset_bug_fix(df): - ''' - this is taken from estimate-local-time.py - TODO: add in unit testing where there is no TZP that is > 840 or < -720 - ''' - - if "timezoneOffset" in list(df): - - while ((df.timezoneOffset > 840).sum() > 0): - df.loc[df.timezoneOffset > 840, ["conversionOffset"]] = ( - df.loc[df.timezoneOffset > 840, ["conversionOffset"]] - - (1440 * 60 * 1000) - ) - - df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] = ( - df.loc[df.timezoneOffset > 840, ["timezoneOffset"]] - 1440 - ) - - while ((df.timezoneOffset < -720).sum() > 0): - df.loc[df.timezoneOffset < -720, ["conversionOffset"]] = ( - df.loc[df.timezoneOffset < -720, ["conversionOffset"]] - + (1440 * 60 * 1000) - ) - - df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] = ( - df.loc[df.timezoneOffset < -720, ["timezoneOffset"]] + 1440 - ) - - return df - - -def get_local_time(df): - - tzo = df[['utcTime', 'inferredTimezone']].apply( - lambda x: get_timezone_offset(*x), axis=1 - ) - local_time = df['utcTime'] + pd.to_timedelta(tzo, unit="m") - - return local_time - - -def round_time( - df, - time_interval_minutes=5, - start_with_first_record=True, - return_calculation_columns=False -): - ''' - A general purpose round time function that rounds the "time" - field to nearest minutes - INPUTS: - * a dataframe (df) or time series that contains only one time field - that you want to round - * time_interval_minutes (defaults to 5 minutes given that most cgms - output every 5 minutes) - * start_with_first_record starts the rounding with the first record - if True, and the last record if False (defaults to True) - * return_calculation_columns specifies whether the extra columns - used to make calculations are returned - refactored name(s) to meet style guide - ''' - # if a time series is passed in, convert to dataframe - if "Series" in get_type(df): - df = pd.DataFrame(df) - columns_ = list(df) - if len(columns_) > 1: - sys.exit( - "Error: df should only have one time column" - ) - else: - df.rename(columns={columns_[0]: "t"}, inplace=True) - - df.sort_values( - by="t", - ascending=start_with_first_record, - inplace=True - ) - - df.reset_index(drop=False, 
inplace=True) - df.rename(columns={"index": "originalIndex"}, inplace=True) - - # calculate the time between consecutive records - df["t_shift"] = df["t"].shift(1) - df["timeBetweenRecords"] = round( - (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes)) - + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes) - ) * time_interval_minutes - - # separate the data into chunks if timeBetweenRecords is greater than - # 2 times the minutes so the rounding process - # starts over - big_gaps = list( - df.query("abs(timeBetweenRecords) > " - + str(time_interval_minutes * 2)).index - ) - big_gaps.insert(0, 0) - big_gaps.append(len(df)) - - for gap_index in range(0, len(big_gaps) - 1): - chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]] - first_chunk = df["t"][big_gaps[gap_index]] - - # calculate the time difference between - # each time record and the first record - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "minutesFromFirstRecord" - ] = ( - (chunk - first_chunk).dt.days*(86400/60) - + (chunk - first_chunk).dt.seconds/60 - ) - - # then round to the nearest X Minutes - # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up. - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "roundedMinutesFromFirstRecord" - ] = round( - (df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "minutesFromFirstRecord" - ] / time_interval_minutes) + 0.000001 - ) * (time_interval_minutes) - - rounded_first_record = ( - first_chunk + pd.Timedelta("1microseconds") - ).round(str(time_interval_minutes) + "min") - - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "roundedTime" - ] = rounded_first_record + pd.to_timedelta( - df.loc[ - big_gaps[gap_index]:big_gaps[gap_index+1], - "roundedMinutesFromFirstRecord" - ], unit="m" - ) - - if return_calculation_columns is False: - df.drop( - columns=[ - "timeBetweenRecords", - "minutesFromFirstRecord", - "roundedMinutesFromFirstRecord" - ], inplace=True - ) - # sort back to the original index - df.sort_values(by="originalIndex", inplace=True) - - return df["roundedTime"].values - - -def add_upload_time(df): - ''' - this is taken from a colab notebook that is not in our github - given that it has been refactored to account for bug where there are - no upload records - NOTE: this is a new fix introduced with healthkit data...we now have - data that does not have an upload record - - ''' - - if "upload" in df.type.unique(): - upload_times = pd.DataFrame( - df[df.type == "upload"].groupby("uploadId")["utcTime"].max() - ) - else: - upload_times = pd.DataFrame(columns=["utcTime"]) - - unique_uploadIds = set(df["uploadId"].unique()) - unique_uploadRecords = set( - df.loc[df["type"] == "upload", "uploadId"].unique() - ) - uploadIds_missing_uploadRecords = unique_uploadIds - unique_uploadRecords - - for upId in uploadIds_missing_uploadRecords: - last_upload_time = df.loc[df["uploadId"] == upId, "utcTime"].max() - upload_times.loc[upId, "utcTime"] = last_upload_time - - upload_times.reset_index(inplace=True) - upload_times.rename( - columns={"utcTime": "uploadTime", - "index": "uploadId"}, - inplace=True - ) - - df = pd.merge(df, upload_times, how='left', on='uploadId') - - return df["uploadTime"].values - - -def remove_invalid_cgm_values(df): - - nBefore = len(df) - # remove values < 38 and > 402 mg/dL - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] < 38))].index) - df = df.drop(df[((df.type == "cbg") & - (df["mg/dL"] > 402))].index) - nRemoved = nBefore - len(df) - - return df, nRemoved - - -def 
removeDuplicates(df, criteriaDF): - nBefore = len(df) - df = df.loc[~(df[criteriaDF].duplicated())] - df = df.reset_index(drop=True) - nDuplicatesRemoved = nBefore - len(df) - - return df, nDuplicatesRemoved - - -def removeCgmDuplicates(df, timeCriterion): - if timeCriterion in df: - df.sort_values(by=[timeCriterion, "uploadTime"], - ascending=[False, False], - inplace=True) - dfIsNull = df[df[timeCriterion].isnull()] - dfNotNull = df[df[timeCriterion].notnull()] - dfNotNull, nDuplicatesRemoved = ( - removeDuplicates(dfNotNull, [timeCriterion, "value"]) - ) - df = pd.concat([dfIsNull, dfNotNull]) - df.sort_values(by=[timeCriterion, "uploadTime"], - ascending=[False, False], - inplace=True) - else: - nDuplicatesRemoved = 0 - - return df, nDuplicatesRemoved - - -# get rid of spike data -def remove_spike_data(df): - if "origin" in list(df): - nBefore = len(df) - spike_locations = [ - "origin.payload.device.name", - "origin.payload.device.manufacturer", - "origin.payload.sourceRevision.source.name", - ] - for spike_loc in spike_locations: - - df[spike_loc] = get_embedded_field(df["origin"], spike_loc) - spike_idx = df.loc[ - df[spike_loc].notnull(), - spike_loc - ].astype(str).str.lower().str.contains("spike") - - df.drop((spike_idx == True).index, inplace=True) - nRemoved = nBefore - len(df) - - else: - nRemoved = np.nan - - return df, nRemoved - - -# %% ESTIMATE LOCAL TIME FUNCTIONS -def create_contiguous_day_series(df): - first_day = df["date"].min() - last_day = df["date"].max() - rng = pd.date_range(first_day, last_day).date - contiguousDaySeries = \ - pd.DataFrame(rng, columns=["date"]).sort_values( - "date", ascending=False).reset_index(drop=True) - - return contiguousDaySeries - - -def add_device_type(df): - col_headings = list(df) - if "deviceType" not in col_headings: - df["deviceType"] = np.nan - if "deviceTags" in col_headings: - # first make sure deviceTag is in string format - df["deviceTags"] = df.deviceTags.astype(str) - # filter by type not null device tags - ud = df[df["deviceTags"].notnull()].copy() - # define a device type (e.g., pump, cgm, or healthkit) - ud.loc[ - ((ud["deviceTags"].str.contains("pump")) - & (ud["deviceType"].isnull())), - ["deviceType"] - ] = "pump" - - # define a device type (e.g., cgm) - ud.loc[ - ((ud["deviceTags"].str.contains("cgm")) - & (ud["deviceType"].isnull())), - ["deviceType"] - ] = "cgm" - - return ud["deviceType"] - else: - return np.nan - - -def get_timezone_offset(currentDate, currentTimezone): - - tz = pytz.timezone(currentTimezone) - # here we add 1 day to the current date to account for changes to/from DST - tzoNum = int( - tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z") - ) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - - return tzo - - -def add_device_day_series(df, dfContDays, deviceTypeName): - if len(df) > 0: - dfDayGroups = df.groupby("date") - if "timezoneOffset" in df: - dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median()) - else: - dfDaySeries = pd.DataFrame(columns=["timezoneOffset"]) - - if "upload" in deviceTypeName: - if "timezone" in df: - if dfDayGroups.timezone.count().max() > 0: - dfDaySeries["timezone"] = ( - dfDayGroups.timezone.describe()["top"] - ) - # get the timezone offset for the timezone - for i in dfDaySeries.index: - if pd.notnull(dfDaySeries.loc[i, "timezone"]): - tzo = get_timezone_offset( - pd.to_datetime(i), - dfDaySeries.loc[i, "timezone"]) - 
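# A note on the tzo value computed here: get_timezone_offset() (defined just
# above) localizes the day *after* the given date, so the offset reflects that
# day's DST state, and it returns whole minutes, so half-hour zones survive.
# Two illustrative calls, with results that assume the standard IANA rules for
# these zones:
#     get_timezone_offset(pd.to_datetime("2019-07-01"), "America/New_York")
#     # -> -240  (UTC-4:00, DST in effect)
#     get_timezone_offset(pd.to_datetime("2019-07-01"), "Asia/Kolkata")
#     # -> 330   (UTC+5:30)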
dfDaySeries.loc[i, ["timezoneOffset"]] = tzo - if "timeProcessing" in dfDaySeries: - dfDaySeries["timeProcessing"] = \ - dfDayGroups.timeProcessing.describe()["top"] - else: - dfDaySeries["timeProcessing"] = np.nan - - dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \ - rename(columns={deviceTypeName + ".date": "date"}) - - dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(), - on="date", how="left") - - else: - dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan - - return dfContDays - - -def impute_upload_records(df, contDays, deviceTypeName): - daySeries = \ - add_device_day_series(df, contDays, deviceTypeName) - - if ((len(df) > 0) & (deviceTypeName + ".timezone" in daySeries)): - for i in daySeries.index[1:]: - if pd.isnull(daySeries[deviceTypeName + ".timezone"][i]): - daySeries.loc[i, [deviceTypeName + ".timezone"]] = ( - daySeries.loc[i-1, deviceTypeName + ".timezone"] - ) - if pd.notnull(daySeries[deviceTypeName + ".timezone"][i]): - tz = daySeries.loc[i, deviceTypeName + ".timezone"] - tzo = get_timezone_offset( - pd.to_datetime(daySeries.loc[i, "date"]), - tz - ) - daySeries.loc[i, deviceTypeName + ".timezoneOffset"] = tzo - - if pd.notnull(daySeries[deviceTypeName + ".timeProcessing"][i-1]): - daySeries.loc[i, deviceTypeName + ".timeProcessing"] = \ - daySeries.loc[i-1, deviceTypeName + ".timeProcessing"] - - else: - daySeries[deviceTypeName + ".timezone"] = np.nan - daySeries[deviceTypeName + ".timeProcessing"] = np.nan - - return daySeries - - -def add_home_timezone(df, contDays): - - if "timezone" in df: - homeTimezone = df["timezone"].describe()["top"] - tzo = contDays.date.apply( - lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone)) - - contDays["home.imputed.timezoneOffset"] = tzo - contDays["home.imputed.timezone"] = homeTimezone - - else: - contDays["home.imputed.timezoneOffset"] = np.nan - contDays["home.imputed.timezone"] = np.nan - contDays["home.imputed.timeProcessing"] = np.nan - - return contDays - - -def estimateTzAndTzoWithUploadRecords(cDF): - - cDF["est.type"] = np.nan - cDF["est.gapSize"] = np.nan - cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"] - cDF["est.annotations"] = np.nan - - if "upload.timezone" in cDF: - cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD" - cDF["est.timezone"] = cDF["upload.timezone"] - cDF["est.timeProcessing"] = cDF["upload.timeProcessing"] - else: - cDF["est.timezone"] = np.nan - cDF["est.timeProcessing"] = np.nan - - cDF.loc[((cDF["est.timezoneOffset"] != - cDF["home.imputed.timezoneOffset"]) & - (pd.notnull(cDF["est.timezoneOffset"]))), - "est.annotations"] = "travel" - - return cDF - - -def assignTzoFromImputedSeries(df, i, imputedSeries): - df.loc[i, ["est.type"]] = "DEVICE" - - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[i, imputedSeries + ".timezoneOffset"] - - df.loc[i, ["est.timezone"]] = \ - df.loc[i, imputedSeries + ".timezone"] - - df.loc[i, ["est.timeProcessing"]] = \ - df.loc[i, imputedSeries + ".timeProcessing"] - - return df - - -def compareDeviceTzoToImputedSeries(df, sIdx, device): - for i in sIdx: - # if the device tzo = imputed tzo, then chose the imputed tz and tzo - # note, dst is accounted for in the imputed tzo - for imputedSeries in ["pump.upload.imputed", "cgm.upload.imputed", - "healthkit.upload.imputed", "home.imputed"]: - # if the estimate has not already been made - if pd.isnull(df.loc[i, "est.timezone"]): - - if df.loc[i, device + ".timezoneOffset"] == \ - df.loc[i, imputedSeries + ".timezoneOffset"]: - - assignTzoFromImputedSeries(df, i, 
imputedSeries) - - df = addAnnotation(df, i, - "tz-inferred-from-" + imputedSeries) - - # if the imputed series has a timezone estimate, then see if - # the current day is a dst change day - elif (pd.notnull(df.loc[i, imputedSeries + ".timezone"])): - imputedTimezone = df.loc[i, imputedSeries + ".timezone"] - if isDSTChangeDay(df.loc[i, "date"], imputedTimezone): - - dstRange = getRangeOfTZOsForTimezone(imputedTimezone) - if ((df.loc[i, device + ".timezoneOffset"] in dstRange) - & (df.loc[i, imputedSeries + ".timezoneOffset"] in dstRange)): - - assignTzoFromImputedSeries(df, i, imputedSeries) - - df = addAnnotation(df, i, "dst-change-day") - df = addAnnotation( - df, i, "tz-inferred-from-" + imputedSeries) - - return df - - -def estimateTzAndTzoWithDeviceRecords(cDF): - - # 2A. use the TZO of the pump or cgm device if it exists on a given day. In - # addition, compare the TZO to one of the imputed day series (i.e., the - # upload and home series to see if the TZ can be inferred) - for deviceType in ["pump", "cgm"]: - # find the indices of days where a TZO estimate has not been made AND - # where the device (e.g., pump or cgm) TZO has data - sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & - (cDF[deviceType + ".timezoneOffset"].notnull()))].index - # compare the device TZO to the imputed series to infer time zone - cDF = compareDeviceTzoToImputedSeries(cDF, sIndices, deviceType) - - # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be - # inferred from the previous day's TZO. If the device TZO is equal to the - # previous day's TZO, AND if the previous day has a TZ estimate, use the - # previous day's TZ estimate for the current day's TZ estimate - for deviceType in ["pump", "cgm"]: - sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & - (cDF[deviceType + ".timezoneOffset"].notnull()))].index - - cDF = compareDeviceTzoToPrevDayTzo(cDF, sIndices, deviceType) - - # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the - # pump and cgm tzo do not differ by more than 60 minutes. If they differ - # by more that 60 minutes, then mark the estimate as UNCERTAIN. 
Also, we - # allow the estimates to be off by 60 minutes as there are a lot of cases - # where the devices are off because the user changes the time for DST, - # at different times - sIndices = cDF[((cDF["est.type"] == "DEVICE") & - (cDF["pump.timezoneOffset"].notnull()) & - (cDF["cgm.timezoneOffset"].notnull()) & - (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]) - )].index - - tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] - - cDF.loc[sIndices, "pump.timezoneOffset"]) > 60 - - idx = tzoDiffGT60.index[tzoDiffGT60] - - cDF.loc[idx, ["est.type"]] = "UNCERTAIN" - for i in idx: - cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch") - - return cDF - - -def imputeTzAndTzo(cDF): - - sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index - hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index - if len(hasTzoIndices) > 0: - if len(sIndices) > 0: - lastDay = max(sIndices) - - while ((sIndices.min() < max(hasTzoIndices)) & - (len(sIndices) > 0)): - - currentDay, prevDayWithDay, nextDayIdx = \ - getImputIndices(cDF, sIndices, hasTzoIndices) - - cDF = imputeByTimezone(cDF, currentDay, - prevDayWithDay, nextDayIdx) - - sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) & - (~cDF["est.annotations"].str.contains( - "unable-to-impute-tzo").fillna(False)))].index - - hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index - - # try to impute to the last day (earliest day) in the dataset - # if the last record has a timezone that is the home record, then - # impute using the home timezone - if len(sIndices) > 0: - currentDay = min(sIndices) - prevDayWithDay = currentDay - 1 - gapSize = lastDay - currentDay - - for i in range(currentDay, lastDay + 1): - if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \ - cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]: - - cDF.loc[i, ["est.type"]] = "IMPUTE" - - cDF.loc[i, ["est.timezoneOffset"]] = \ - cDF.loc[i, "home.imputed.timezoneOffset"] - - cDF.loc[i, ["est.timezone"]] = \ - cDF.loc[i, "home.imputed.timezone"] - - cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) - cDF.loc[i, ["est.gapSize"]] = gapSize - - else: - cDF.loc[i, ["est.type"]] = "UNCERTAIN" - cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") - else: - cDF["est.type"] = "UNCERTAIN" - cDF["est.annotations"] = "unable-to-impute-tzo" - - return cDF - - -def getRangeOfTZOsForTimezone(tz): - minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), - getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] - - rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) - - return rangeOfTzo - - -def getListOfDSTChangeDays(cDF): - - # get a list of DST change days for the home time zone - dstChangeDays = \ - cDF[abs(cDF["home.imputed.timezoneOffset"] - - cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date - - return dstChangeDays - - -def correctEstimatesAroundDst(df, cDF): - - # get a list of DST change days for the home time zone - dstChangeDays = getListOfDSTChangeDays(cDF) - - # loop through the df within 2 days of a daylight savings time change - for d in dstChangeDays: - dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & - (df.date < (d + dt.timedelta(days=2)))].index - for dIdx in dstIndex: - if pd.notnull(df.loc[dIdx, "est.timezone"]): - tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) - tzRange = getRangeOfTZOsForTimezone(str(tz)) - minHoursToLocal = min(tzRange)/60 - tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + - dt.timedelta(hours=minHoursToLocal)).strftime("%z")) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = 
round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - localTime = \ - df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") - df.loc[dIdx, ["est.localTime"]] = localTime - df.loc[dIdx, ["est.timezoneOffset"]] = tzo - return df - - -def applyLocalTimeEstimates(df, cDF): - df = pd.merge(df, cDF, how="left", on="date") - df["est.localTime"] = \ - df["utcTime"] + pd.to_timedelta(df["est.timezoneOffset"], unit="m") - - df = correctEstimatesAroundDst(df, cDF) - - return df["est.localTime"].values - - -def isDSTChangeDay(currentDate, currentTimezone): - tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate), - currentTimezone) - tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) + - dt.timedelta(days=-1), currentTimezone) - - return (tzoCurrentDay != tzoPreviousDay) - - -def tzoRangeWithComparisonTz(df, i, comparisonTz): - # if we have a previous timezone estimate, then calcuate the range of - # timezone offset values for that time zone - if pd.notnull(comparisonTz): - rangeTzos = getRangeOfTZOsForTimezone(comparisonTz) - else: - comparisonTz = np.nan - rangeTzos = np.array([]) - - return rangeTzos - - -def tzAndTzoRangePreviousDay(df, i): - # if we have a previous timezone estimate, then calcuate the range of - # timezone offset values for that time zone - comparisonTz = df.loc[i-1, "est.timezone"] - - rangeTzos = tzoRangeWithComparisonTz(df, i, comparisonTz) - - return comparisonTz, rangeTzos - - -def assignTzoFromPreviousDay(df, i, previousDayTz): - - df.loc[i, ["est.type"]] = "DEVICE" - df.loc[i, ["est.timezone"]] = previousDayTz - df.loc[i, ["est.timezoneOffset"]] = \ - getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), previousDayTz) - - df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"] - df = addAnnotation(df, i, "tz-inferred-from-prev-day") - - return df - - -def assignTzoFromDeviceTzo(df, i, device): - - df.loc[i, ["est.type"]] = "DEVICE" - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[i, device + ".timezoneOffset"] - df.loc[i, ["est.timeProcessing"]] = \ - df.loc[i, device + ".upload.imputed.timeProcessing"] - - df = addAnnotation(df, i, "likely-travel") - df = addAnnotation(df, i, "tzo-from-" + device) - - return df - - -def compareDeviceTzoToPrevDayTzo(df, sIdx, device): - - for i in sIdx[sIdx > 0]: - - # first see if the previous record has a tzo - if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): - - previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) - timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - - df.loc[i-1, "est.timezoneOffset"]) - - # next see if the previous record has a tz - if (pd.notnull(df.loc[i-1, "est.timezone"])): - - if timeDiff == 0: - assignTzoFromPreviousDay(df, i, previousDayTz) - - # see if the previous day's tzo and device tzo are within the - # dst range (as that is a common problem with this data) - elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) - & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): - - # then see if it is DST change day - if isDSTChangeDay(df.loc[i, "date"], previousDayTz): - - df = addAnnotation(df, i, "dst-change-day") - assignTzoFromPreviousDay(df, i, previousDayTz) - - # if it is not DST change day, then mark this as uncertain - else: - # also, check to see if the difference between device. - # tzo and prev.tzo is less than the expected dst - # difference. 
There is a known issue where the BtUTC - # procedure puts clock drift into the device.tzo, - # and as a result the tzo can be off by 15, 30, - # or 45 minutes. - if (((df.loc[i, device + ".timezoneOffset"] == - min(dstRange)) | - (df.loc[i, device + ".timezoneOffset"] == - max(dstRange))) & - ((df.loc[i-1, "est.timezoneOffset"] == - min(dstRange)) | - (df.loc[i-1, "est.timezoneOffset"] == - max(dstRange)))): - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, - "likely-dst-error-OR-travel") - - else: - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, - "likely-15-min-dst-error") - - # next see if time difference between device.tzo and prev.tzo - # is off by 720 minutes, which is indicative of a common - # user AM/PM error - elif timeDiff == 720: - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-AM-PM-error") - - # if it doesn't fall into any of these cases, then the - # tzo difference is likely due to travel - else: - df = assignTzoFromDeviceTzo(df, i, device) - - elif timeDiff == 0: - df = assignTzoFromDeviceTzo(df, i, device) - - # if there is no previous record to compare with check for dst errors, - # and if there are no errors, it is likely a travel day - else: - - comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) - timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - - df.loc[i, "home.imputed.timezoneOffset"]) - - if ((df.loc[i, device + ".timezoneOffset"] in dstRange) - & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): - - # see if it is DST change day - if isDSTChangeDay(df.loc[i, "date"], comparisonTz): - - df = addAnnotation(df, i, "dst-change-day") - df.loc[i, ["est.type"]] = "DEVICE" - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[i, device + ".timezoneOffset"] - df.loc[i, ["est.timezone"]] = \ - df.loc[i, "home.imputed.timezone"] - df.loc[i, ["est.timeProcessing"]] = \ - df.loc[i, device + ".upload.imputed.timeProcessing"] - - # if it is not DST change day, then mark this as uncertain - else: - # also, check to see if the difference between device. - # tzo and prev.tzo is less than the expected dst - # difference. There is a known issue where the BtUTC - # procedure puts clock drift into the device.tzo, - # and as a result the tzo can be off by 15, 30, - # or 45 minutes. 
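# To make the check below concrete: for America/New_York the dst range
# computed above works out to [-300, -285, -270, -255, -240] minutes, so a
# device tzo of -240 paired with a home-imputed tzo of -300 (both range
# endpoints) is read as a plausible DST error or travel, while an in-between
# value such as -255 gets flagged as a likely 15/30/45-minute clock-drift
# error from the BtUTC procedure.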
- if (((df.loc[i, device + ".timezoneOffset"] == - min(dstRange)) | - (df.loc[i, device + ".timezoneOffset"] == - max(dstRange))) & - ((df.loc[i, "home.imputed.timezoneOffset"] == - min(dstRange)) | - (df.loc[i, "home.imputed.timezoneOffset"] == - max(dstRange)))): - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-dst-error-OR-travel") - - else: - - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-15-min-dst-error") - - # next see if time difference between device.tzo and prev.tzo - # is off by 720 minutes, which is indicative of a common - # user AM/PM error - elif timeDiff == 720: - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "likely-AM-PM-error") - - # if it doesn't fall into any of these cases, then the - # tzo difference is likely due to travel - - else: - df = assignTzoFromDeviceTzo(df, i, device) - - return df - - -def getImputIndices(df, sIdx, hIdx): - - lastDayIdx = len(df) - 1 - - currentDayIdx = sIdx.min() - tempList = pd.Series(hIdx) - currentDayIdx - prevDayIdx = currentDayIdx - 1 - nextDayIdx = \ - min(currentDayIdx + min(tempList[tempList >= 0]), lastDayIdx) - - return currentDayIdx, prevDayIdx, nextDayIdx - - -def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData): - - gapSize = (nextDaywData - currentDay) - - if prevDaywData >= 0: - - if df.loc[prevDaywData, "est.timezone"] == \ - df.loc[nextDaywData, "est.timezone"]: - - tz = df.loc[prevDaywData, "est.timezone"] - - for i in range(currentDay, nextDaywData): - - df.loc[i, ["est.timezone"]] = tz - - df.loc[i, ["est.timezoneOffset"]] = \ - getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]), tz) - - df.loc[i, ["est.type"]] = "IMPUTE" - - df = addAnnotation(df, i, "gap=" + str(gapSize)) - df.loc[i, ["est.gapSize"]] = gapSize - - # TODO: this logic should be updated to handle the edge case - # where the day before and after the gap have differing TZ, but - # the same TZO. 
In that case the gap should be marked as UNCERTAIN - elif df.loc[prevDaywData, "est.timezoneOffset"] == \ - df.loc[nextDaywData, "est.timezoneOffset"]: - - for i in range(currentDay, nextDaywData): - - df.loc[i, ["est.timezoneOffset"]] = \ - df.loc[prevDaywData, "est.timezoneOffset"] - - df.loc[i, ["est.type"]] = "IMPUTE" - - df = addAnnotation(df, i, "gap=" + str(gapSize)) - df.loc[i, ["est.gapSize"]] = gapSize - - else: - for i in range(currentDay, nextDaywData): - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "unable-to-impute-tzo") - - else: - for i in range(currentDay, nextDaywData): - df.loc[i, ["est.type"]] = "UNCERTAIN" - df = addAnnotation(df, i, "unable-to-impute-tzo") - - return df - - -def addAnnotation(df, idx, annotationMessage): - if pd.notnull(df.loc[idx, "est.annotations"]): - df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \ - ", " + annotationMessage - else: - df.loc[idx, ["est.annotations"]] = annotationMessage - - return df - - -def getTimezoneOffset(currentDate, currentTimezone): - - tz = pytz.timezone(currentTimezone) - # here we add 1 day to the current date to account for changes to/from DST - tzoNum = int(tz.localize(currentDate + dt.timedelta(days=1)).strftime("%z")) - tzoHours = np.floor(tzoNum / 100) - tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) - tzoSign = np.sign(tzoHours) - tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) - - return tzo - - -def estimate_local_time(df): - df["date"] = df["utcTime"].dt.date # TODO: change this to utcDate later - contiguous_days = create_contiguous_day_series(df) - - df["deviceType"] = add_device_type(df) - cDays = add_device_day_series(df, contiguous_days, "upload") - - # create day series for cgm df - if "timezoneOffset" not in list(df): - df["timezoneOffset"] = np.nan - - cgmdf = df[(df["type"] == "cbg") & (df["timezoneOffset"].notnull())].copy() - cDays = add_device_day_series(cgmdf, cDays, "cgm") - - # create day series for pump df - pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() - cDays = add_device_day_series(pumpdf, cDays, "pump") - - # interpolate between upload records of the same deviceType, and create a - # day series for interpolated pump, non-hk-cgm, and healthkit uploads - for deviceType in ["pump", "cgm", "healthkit"]: - tempUploaddf = df[df["deviceType"] == deviceType].copy() - cDays = impute_upload_records( - tempUploaddf, cDays, deviceType + ".upload.imputed" - ) - - # add a home timezone that also accounts for daylight savings time changes - cDays = add_home_timezone(df, cDays) - - # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO - cDays = estimateTzAndTzoWithUploadRecords(cDays) - - # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) - # estimates can be made from pump and cgm df that have a TZO - # NOTE: the healthkit and dexcom-api cgm df are excluded - cDays = estimateTzAndTzoWithDeviceRecords(cDays) - - # 3. impute, infer, or interpolate gaps in the estimated tzo and tz - cDays = imputeTzAndTzo(cDays) - - # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df - local_time = applyLocalTimeEstimates(df, cDays) - - return local_time, cDays - - -# %% GET DATA FROM API -''' -get metadata and data for a donor that has shared with bigdata -NOTE: functions assume you have an .env with bigdata account credentials -''' - -userid = "0d4524bc11" -donor_group = "bigdata" - -donor_metadata, _ = get_shared_metadata( - donor_group=donor_group, - userid_of_shared_user=userid # TODO: this should be refactored in several places to be userid -) -data, _ = get_data( - donor_group=donor_group, - userid=userid, - weeks_of_data=52*10 -) - - -# %% CREATE META DATAFRAME (metadata) -''' -this is useful for keeping track of the type and amount of cleaning done -''' -metadata = pd.DataFrame(index=[userid]) - - -# %% HASH USER ID -hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) -data["userid"] = userid -data["hashid"] = hashid - - -# %% CLEAN DATA -data_fields = list(data) -# remove negative durations -if "duration" in data_fields: - data["duration"], n_negative_durations = ( - remove_negative_durations(data[["duration"]].copy()) - ) -else: - n_negative_durations = np.nan -metadata["nNegativeDurations"] = n_negative_durations - -# Tslim calibration bug fix -data, n_cal_readings = tslim_calibration_fix(data.copy()) -metadata["nTandemAndPayloadCalReadings"] = n_cal_readings - -# fix large timzoneOffset bug in utcbootstrapping -data = timezone_offset_bug_fix(data.copy()) - -# add healthkit timezome information -data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) - - -# %% TIME RELATED ITEMS -data["utcTime"] = to_utc_datetime(data[["time"]].copy()) - -# add upload time to the data, which is needed for: -# getting rid of duplicates and useful for getting local time -data["uploadTime"] = add_upload_time(data[ - ["type", "uploadId", "utcTime"] -].copy()) - -# estimate local time (refactor of estimate-local-time.py) -data["localTime"], local_time_metadata = estimate_local_time(data.copy()) - -# round all data to the nearest 5 minutes -data["roundedLocalTime"] = round_time( - data["localTime"].copy(), - time_interval_minutes=5, - start_with_first_record=True, - return_calculation_columns=False -) - - -# %% TIME CATEGORIES -# AGE, & YLW -bDate = pd.to_datetime(donor_metadata["birthday"].values[0][0:7]) -dDate = pd.to_datetime(donor_metadata["diagnosisDate"].values[0][0:7]) -data["age"] = np.floor((data["roundedLocalTime"] - bDate).dt.days/365.25) -data["ylw"] = np.floor((data["roundedLocalTime"] - dDate).dt.days/365.25) - -# hour of the day -data["hour"] = data["roundedLocalTime"].dt.hour - -# add the day of the localTime that starts at 12am -data["day12AM"] = data["roundedLocalTime"].dt.date -# NOTE: for day of week Monday = 0 and Sunday = 6 -data["dayofweek12AM"] = data["roundedLocalTime"].dt.dayofweek -data["weekend12AM"] = data["dayofweek12AM"] > 4 - -# day that starts at 6am -data["6amTime"] = data["roundedLocalTime"] - pd.Timedelta(6, unit="hours") -data["day6AM"] = data["6amTime"].dt.date -data["dayofweek6AM"] = data["6amTime"].dt.dayofweek -data["weekend6AM"] = data["dayofweek6AM"] > 4 - - -# %% GROUP DATA BY TYPE -# first sort by upload time (used when removing dumplicates) -data.sort_values("uploadTime", ascending=False, inplace=True) -groups = data.groupby(by="type") - - -# %% CGM DATA -# filter by cgm -cgm = groups.get_group("cbg").dropna(axis=1, how="all") - -# calculate cgm in mg/dL -cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) - -# get rid of spike data -cgm, nSpike = remove_spike_data(cgm) 
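# At this step cgm["mg/dL"] holds glucose converted from mmol/L (for example,
# 5.5 mmol/L * 18.01559 rounds to 99 mg/dL), and nSpike counts the rows
# dropped by remove_spike_data (NaN when the dataset has no "origin" column).
# One caveat worth a second look: inside remove_spike_data,
# df.drop((spike_idx == True).index, inplace=True) appears to drop every row
# where the inspected origin.payload field is non-null, not only the rows
# whose device/source name contains "spike"; selecting spike_idx[spike_idx].index
# (or keeping rows with a boolean mask) is likely what was intended, e.g.:
#     keep = ~df[spike_loc].astype(str).str.lower().str.contains("spike", na=False)
#     df = df[keep]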
-metadata["nSpike"] = nSpike - -# get rid of cgm values too low/high (< 38 & > 402 mg/dL) -cgm, nInvalidCgmValues = remove_invalid_cgm_values(cgm) -metadata["nInvalidCgmValues"] = nInvalidCgmValues - -# get rid of duplicates that have the same ["deviceTime", "value"] -cgm, n_cgm_dups_removed = (removeCgmDuplicates(cgm, "deviceTime")) -metadata["nCgmDuplicatesRemovedDeviceTime"] = n_cgm_dups_removed - -# get rid of duplicates that have the same ["time", "value"] -cgm, n_cgm_dups_removed = removeCgmDuplicates(cgm, "time") -metadata["nCgmDuplicatesRemovedUtcTime"] = n_cgm_dups_removed - -# get rid of duplicates that have the same "roundedTime" -cgm, n_cgm_dups_removed = removeDuplicates(cgm, "roundedLocalTime") -metadata["nCgmDuplicatesRemovedRoundedTime"] = n_cgm_dups_removed - - -# %% GET CGM STATS -# create a contiguous 5 minute time series -first_day = cgm["roundedLocalTime"].min() -last_day = cgm["roundedLocalTime"].max() -rng = pd.date_range(first_day, last_day, freq="5min") -contiguous_data = ( - pd.DataFrame(rng, columns=["roundedLocalTime"]).sort_values( - "roundedLocalTime", ascending=False - ).reset_index(drop=True) -) - -# merge with cgm data -cgm_series = pd.merge( - contiguous_data, - cgm, - on="roundedLocalTime", - how="left" -) - -#cgm_series["hourly.mean"] = cgm_series["mg/dL"].rolling(12).mean() From 4206d722c3dbf55803e189715c75052ea02c4aa2 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 14:55:48 -0500 Subject: [PATCH 43/46] remove combine files --- .../batch_get_cgm_distributions_and_stats.py | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index 61894b9c..c8cbb5dc 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -120,57 +120,3 @@ def run_process(json_data_path): ) total_duration = round((endTime - startTime) / 60, 1) print("total duration was %s minutes" % total_duration) - - -# %% COMBINE AND SAVE ALL DONOR METADATA -print("combining all metadata") -phi_date_stamp = "PHI-" + args.date_stamp -donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") - -metadata_path = os.path.join( - args.data_path, - phi_date_stamp + "-donor-data", - phi_date_stamp + "-cgm-metadata" -) - -all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -all_metadata = pd.DataFrame() -for f in all_metadata_files: - temp_meta = pd.read_csv(f) - all_metadata = pd.concat( - [all_metadata, temp_meta], - ignore_index=True, - sort=False - ) - -all_metadata.to_csv( - os.path.join(donor_folder, phi_date_stamp + "-cgm-metadata.csv.gz") -) -print("saving metadata...code complete") - - -# %% COMBINE AND SAVE ALL DISTRIBUTION DATA -print("combining all distribution data") - -metadata_path = os.path.join( - args.data_path, - phi_date_stamp + "-donor-data", - phi_date_stamp + "-cgm-distributions" -) - -all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) -distribution_metadata = pd.DataFrame() -for f in all_metadata_files: - temp_meta = pd.read_csv(f, index_col=[0]) - distribution_metadata = pd.concat( - [distribution_metadata, temp_meta], - ignore_index=True, - sort=False - ) - -distribution_metadata.to_csv( - os.path.join( - donor_folder, phi_date_stamp + "-all-cgm-distributions.csv.gz" - ) -) 
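# The combine-and-save step being removed in this patch follows a pattern used
# elsewhere in the pipeline: glob the per-user "*.csv.gz" outputs, concatenate
# them, and write one gzipped summary (pandas infers gzip compression from the
# ".gz" suffix on both read and write). A minimal sketch of that pattern, with
# the folder names made up for illustration:
#     import glob, os
#     import pandas as pd
#     in_dir = "per-user-outputs"   # illustrative folder of per-user csv.gz files
#     files = glob.glob(os.path.join(in_dir, "*.csv.gz"))
#     combined = pd.concat(
#         (pd.read_csv(f) for f in files), ignore_index=True, sort=False
#     )
#     combined.to_csv(os.path.join(in_dir, "combined.csv.gz"))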
-print("saving all-dataset-info-metadata...code complete") From 3f1ea02b6cd15e2e05f25547942711bab3f7b5bb Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Wed, 21 Aug 2019 15:13:58 -0500 Subject: [PATCH 44/46] use all processors --- .../get_stats/batch_get_cgm_distributions_and_stats.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py index c8cbb5dc..3fe2fef9 100644 --- a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -102,16 +102,10 @@ def run_process(json_data_path): # %% GET A LIST OF DONOR JSON FILE LOCATIONS all_files = glob.glob(args.json_data_path, recursive=True) -# this is a good test to make sure run process is working before running -#import pdb -#args.date_stamp = "2019-07-17" -#run_process(all_files[0]) -#pdb.set_trace() - # use multiple cores to process startTime = time.time() print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) -pool = Pool(int(os.cpu_count()/2)) +pool = Pool(int(os.cpu_count())) pool.map(run_process, all_files) pool.close() endTime = time.time() From 8a7f86effaeb11409ea5fd4041e4a66e39b752ab Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Thu, 22 Aug 2019 09:24:56 -0500 Subject: [PATCH 45/46] save json data in folder that is compatible with old pipeline --- .../get_single_tidepool_dataset_json.py | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py index 5a17d6d8..34c0abe3 100644 --- a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py @@ -81,6 +81,7 @@ def get_data( auth=np.nan, email=np.nan, password=np.nan, + save_file="True", ): # login if pd.notnull(donor_group): @@ -127,8 +128,7 @@ def get_data( output_folder = os.path.join( save_data_path, - "dremio", - userid, + "PHI-" + userid, ) output_file_path = os.path.join( @@ -147,7 +147,7 @@ def get_data( download_ = False if download_: - make_folder_if_doesnt_exist(output_folder) + big_json_file = [] if weeks_of_data > 52: @@ -184,9 +184,13 @@ def get_data( # save data if len(big_json_file) > 1: - print("saving data for {}".format(userid)) - with open(output_file_path, 'w') as outfile: - json.dump(big_json_file, outfile) + if "T" in str(save_file).upper(): + make_folder_if_doesnt_exist(output_folder) + print("saving data for {}".format(userid)) + with open(output_file_path, 'w') as outfile: + json.dump(big_json_file, outfile) + else: + print("{} has data, but will not be saved".format(userid)) else: print("{} has no data".format(userid)) @@ -203,8 +207,9 @@ def get_data( auth[0] + ":" + str(api_response.status_code) ) else: - print("skipping bc {}'s data was downloaded (attempted)".format(userid) - + " within the last {} hours".format(overwrite_hours) + print( + "skipping bc {}'s data was downloaded (attempted)".format(userid) + + " within the last {} hours".format(overwrite_hours) ) return @@ -215,6 +220,7 @@ def get_data( # USER INPUTS (choices to be made in order to run the code) codeDescription = "get donor json file" 
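# For the argparse block that follows (as revised in this patch), a likely
# invocation using only the flags defined below, with a made-up output path,
# would be:
#     python get_single_tidepool_dataset_json.py -w 4 -s false -o /tmp/donor-json
# Two caveats: save_file is treated as true whenever the string contains a
# "t"/"T" (not only for the literal "true"), and -w arrives as a string because
# no type=int is given, so the weeks_of_data > 52 check inside get_data may
# need an explicit int() cast unless one is added upstream.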
parser = argparse.ArgumentParser(description=codeDescription) + current_date = dt.datetime.now().strftime("%Y-%m-%d") parser.add_argument( "-o", @@ -222,7 +228,11 @@ def get_data( dest="data_path", default=os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "data" + os.path.dirname(__file__), + "..", + "data", + "PHI-" + current_date + "-donor-data", + "PHI-" + current_date + "-jsonData", ) ), help="the output path where the data is stored" @@ -232,7 +242,7 @@ def get_data( "-w", "--weeks-of-data", dest="weeks_of_data", - default=52*10, # go back the last 10 years as default + default=2, # 52*10, # go back the last 10 years as default help="enter the number of weeks of data you want to download" ) @@ -284,6 +294,14 @@ def get_data( help="password of the master account" ) + parser.add_argument( + "-s", + "--save_file", + dest="save_file", + default="true", + help="specify whether to save the downloaded donor data" + ) + args = parser.parse_args() # the main function @@ -296,4 +314,5 @@ def get_data( auth=args.auth, email=args.email, password=args.password, + save_file=args.save_file, ) From 01d6dc15010501d07fef5c8994bcb414eb5e20f2 Mon Sep 17 00:00:00 2001 From: Ed Nykaza Date: Thu, 22 Aug 2019 10:44:57 -0500 Subject: [PATCH 46/46] refactor to download data if json data path is not provided also removing hardcoding to dremio folder --- .../get_single_tidepool_dataset_json.py | 17 ++++-- .../get_cgm_distributions_and_stats.py | 61 ++++++++++++------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py index 34c0abe3..d8496891 100644 --- a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py @@ -24,6 +24,8 @@ sys.path.insert(0, envPath) import environmentalVariables +# %% GLOBAL VARIABLES +current_date = dt.datetime.now().strftime("%Y-%m-%d") # %% FUNCTIONS def make_folder_if_doesnt_exist(folder_paths): @@ -72,7 +74,9 @@ def get_data( os.path.join( os.path.dirname(__file__), "..", - "data" + "data", + "PHI-" + current_date + "-donor-data", + "PHI-" + current_date + "-jsonData", ) ), overwrite_hours=24, @@ -81,7 +85,7 @@ def get_data( auth=np.nan, email=np.nan, password=np.nan, - save_file="True", + save_file="False", ): # login if pd.notnull(donor_group): @@ -212,7 +216,11 @@ def get_data( + " within the last {} hours".format(overwrite_hours) ) - return + if "T" in str(save_file).upper(): + return np.nan, userid + else: + df = pd.DataFrame(big_json_file) + return df, userid # %% MAIN @@ -220,7 +228,6 @@ def get_data( # USER INPUTS (choices to be made in order to run the code) codeDescription = "get donor json file" parser = argparse.ArgumentParser(description=codeDescription) - current_date = dt.datetime.now().strftime("%Y-%m-%d") parser.add_argument( "-o", @@ -305,7 +312,7 @@ def get_data( args = parser.parse_args() # the main function - get_data( + data, userid = get_data( save_data_path=args.data_path, weeks_of_data=args.weeks_of_data, overwrite_hours=args.overwrite_hours, diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py index 8a6cf7d5..4da725b1 100644 --- 
a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -22,7 +22,10 @@ ) if get_donor_data_path not in sys.path: sys.path.insert(0, get_donor_data_path) -from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist +from get_donor_data.get_single_tidepool_dataset_json import ( + make_folder_if_doesnt_exist, get_data +) +from get_donor_data.get_single_donor_metadata import get_shared_metadata # %% CONSTANTS MGDL_PER_MMOLL = 18.01559 @@ -35,7 +38,6 @@ and whether they were refactored ''' - def get_episodes( df, episode_criterion="cgm < 54", @@ -1568,22 +1570,37 @@ def get_distribution_and_stats( 'isOtherPerson', ] - # load in data - data = pd.read_json(json_data_path) + # load in data or pull in data + if pd.notnull(json_data_path): + data = pd.read_json(json_data_path) + + else: + data, userid = get_data( + save_file="false" + ) # load in donor metadata - all_donor_metadata = pd.read_csv( - os.path.join( - save_data_path, - phi_date + "-donor-data", - phi_date + "-donor-metadata.csv"), - low_memory=False + donor_meta_path = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-donor-metadata.csv" ) + if os.path.exists(donor_meta_path): - metadata = all_donor_metadata.loc[ - all_donor_metadata["userid"] == userid, - donor_metadata_columns - ] + all_donor_metadata = pd.read_csv( + donor_meta_path, + low_memory=False + ) + + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns + ] + else: + metadata, _ = get_shared_metadata( + donor_group="bigdata", + userid_of_shared_user=userid + ) print("starting", userid) @@ -1645,13 +1662,13 @@ def get_distribution_and_stats( # AGE, & YLW # TODO: make this a function - if metadata["birthday"].values[0] is not np.nan: + if pd.notnull(metadata["birthday"].values[0]): bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) else: data["age"] = np.nan - if metadata["diagnosisDate"].values[0] is not np.nan: + if pd.notnull(metadata["diagnosisDate"].values[0]): dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) else: @@ -2006,7 +2023,6 @@ def get_distribution_and_stats( ts40_400 = all_cgm["mg/dL.40to400"].copy() - # for all the less than (<) criteria for cgm_threshold in [40, 54, 70]: all_cgm["cgm < " + str(cgm_threshold)] = ( @@ -2390,12 +2406,11 @@ def get_distribution_and_stats( "-i", "--input-json-data-path", dest="json_data_path", - default=os.path.abspath( - os.path.join( - os.path.dirname(__file__), "..", "data" - ) - ), - help="the path where the json data is located" + default=np.nan, + help=( + "the path where the json data is located, defaults to none and" + + " will download your data using your Tidepool credentials" + ) ) parser.add_argument(