Etn/get cgm stats #57

Open
Wants to merge 46 commits into master
Commits (46)
865f4ea
create folder and save functions
ed-nykaza Jul 15, 2019
10116db
save as gzipped csv
ed-nykaza Jul 16, 2019
fe9a04a
capture dataset info
ed-nykaza Jul 16, 2019
c80ca20
sort by userid instead of donor group
ed-nykaza Jul 17, 2019
7340a5c
just get json file
ed-nykaza Jul 17, 2019
0d08b82
get interim data summaries
ed-nykaza Jul 21, 2019
7b28164
update env to include latest spyder
ed-nykaza Jul 24, 2019
09eaca0
add __init__.py and rename folders
ed-nykaza Jul 24, 2019
a1ce991
get data from api
ed-nykaza Jul 24, 2019
5629152
add path if needed
ed-nykaza Jul 24, 2019
8fa5f2c
initial commit WIP
ed-nykaza Jul 25, 2019
fae4788
distinguish donor metadata from data metadata
ed-nykaza Jul 29, 2019
9ef98f1
refactor round_time
ed-nykaza Jul 29, 2019
302e45b
add upload time to data
ed-nykaza Jul 29, 2019
75bb4ff
handle edge case where uploadId is not given
ed-nykaza Jul 29, 2019
61ae41c
apply timezoneOffset correction
ed-nykaza Jul 29, 2019
4b86a07
refactor of estimate-local-time to handle healthkit data
ed-nykaza Jul 30, 2019
21ff818
clean cgm data
ed-nykaza Jul 30, 2019
8dd7465
get cgm 5 minute time series
ed-nykaza Jul 30, 2019
ef40d18
add new functions that get embedded json data
ed-nykaza Aug 7, 2019
dc9f19e
remove spike data
ed-nykaza Aug 8, 2019
be7177e
make sure there is timezone information
ed-nykaza Aug 12, 2019
07b211e
refactor remove spike data
ed-nykaza Aug 13, 2019
a240bd7
wip cgm distributions
ed-nykaza Aug 16, 2019
890bac8
save output
ed-nykaza Aug 17, 2019
1d4ef81
remove collecting all metadata
ed-nykaza Aug 17, 2019
264a79e
add additional metadata to output
ed-nykaza Aug 17, 2019
20bb215
fix spike data drop bug
ed-nykaza Aug 17, 2019
241987b
refactor sensing cgmModel
ed-nykaza Aug 19, 2019
e324f77
first commit of cgm stats
ed-nykaza Aug 19, 2019
ddabb81
initial commit of episodes
ed-nykaza Aug 19, 2019
cded1cc
next increment of episodes
ed-nykaza Aug 19, 2019
0b9a3b4
move percentile calculations to full range of data section
ed-nykaza Aug 19, 2019
4c9714a
get episode stats
ed-nykaza Aug 20, 2019
2fe6065
minor refactor
ed-nykaza Aug 20, 2019
665598d
resolve edge case of not having quarterly stats
ed-nykaza Aug 20, 2019
5432b33
initial commit of batch process all cgm distribution and stats
ed-nykaza Aug 20, 2019
4b44457
skip already processed and use 1/2 of processors
ed-nykaza Aug 20, 2019
57dc7fd
get results script
ed-nykaza Aug 21, 2019
60e6c22
pull files into pandas with low_memory flag
ed-nykaza Aug 21, 2019
a984e12
save results in chunks
ed-nykaza Aug 21, 2019
858372a
move files and modify print statements
ed-nykaza Aug 21, 2019
4206d72
remove combine files
ed-nykaza Aug 21, 2019
3f1ea02
use all processors
ed-nykaza Aug 21, 2019
8a7f86e
save json data in folder that is compatible with old pipeline
ed-nykaza Aug 22, 2019
01d6dc1
refactor to download data if json data path is not provided
ed-nykaza Aug 22, 2019
2 changes: 2 additions & 0 deletions .gitignore
@@ -35,3 +35,5 @@ projects/loop-algorithm/figures/
projects/parsers/output/

projects/get-donors-pump-settings/temp-plot\.html

projects/bigdata-processing-pipeline/get_stats/debug/
Empty file.
3 changes: 1 addition & 2 deletions projects/bigdata-processing-pipeline/environment.yml
@@ -3,9 +3,8 @@ channels:
- defaults
dependencies:
- python=3.7.3
- numpy=1.16.4
- pandas=0.24.2
- spyder=3.3.6
- pip=19.1.1
- spyder=3.3.5
- pip:
  - python-dotenv==0.10.3
Empty file.
@@ -16,6 +16,7 @@
import requests
import json
import argparse
import pdb
envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if envPath not in sys.path:
    sys.path.insert(0, envPath)
@@ -247,7 +248,7 @@ def accept_and_get_list(args):
    )

    # polish up the final donor list
    final_donor_list.sort_values(by="donorGroup", inplace=True)
    final_donor_list.sort_values(by="userID", inplace=True)
    final_donor_list.reset_index(drop=True, inplace=True)

    if args.save_donor_list:
@@ -25,6 +25,6 @@
)
data, _ = get_data(
    donor_group="bigdata",
    userid_of_shared_user="0d4524bc11",
    userid="0d4524bc11",
    weeks_of_data=4
)
@@ -117,12 +117,11 @@ def get_all_data(userid, donor_group):

metadata_path = os.path.join(
    args.data_path,
    "PHI-" + "2019-07-13" + "-donor-data",
    "PHI-" + "2019-07-13" + "-metadata"

    phi_date_stamp + "-donor-data",
    phi_date_stamp + "-metadata"
)

all_files = glob.glob(os.path.join(metadata_path, "*.csv"))
all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
all_metadata = pd.DataFrame()
for f in all_files:
    temp_meta = pd.read_csv(f)
@@ -137,3 +136,32 @@ def get_all_data(userid, donor_group):
    os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv")
)
print("saving metadata...code complete")


# %% COMBINE AND SAVE ALL DATASET INFO (METADATA)
print("combining all dataset metadata")

metadata_path = os.path.join(
    args.data_path,
    phi_date_stamp + "-donor-data",
    phi_date_stamp + "-datasetSummary"
)

all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
dataset_metadata = pd.DataFrame()
for f in all_files:
    temp_meta = pd.read_csv(f)
    temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)
    userid = f[-32:-22]
    temp_meta["userid"] = userid
    dataset_metadata = pd.concat(
        [dataset_metadata, temp_meta],
        ignore_index=True,
        sort=False
    )

dataset_metadata.to_csv(
    os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv")
)
print("saving all-dataset-info-metadata...code complete")

@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
"""accept_donors_and_pull_data.py
This is a wrapper script that accepts all bigdata donation project donors,
and then pulls all of their datasets for further processing.
"""

# %% REQUIRED LIBRARIES
from accept_new_donors_and_get_donor_list import accept_and_get_list
import datetime as dt
import pandas as pd
import subprocess as sub
import os
import glob
import time
import argparse
from multiprocessing import Pool


# %% USER INPUTS (choices to be made in order to run the code)
codeDescription = "accepts new donors (shares) and grab their data"
parser = argparse.ArgumentParser(description=codeDescription)

parser.add_argument(
"-d",
"--date-stamp",
dest="date_stamp",
default=dt.datetime.now().strftime("%Y-%m-%d"),
help="date, in '%Y-%m-%d' format, of the date when " +
"donors were accepted"
)

parser.add_argument(
"-o",
"--output-data-path",
dest="data_path",
default=os.path.abspath(
os.path.join(
os.path.dirname(__file__), "..", "data"
)
),
help="the output path where the data is stored"
)

parser.add_argument(
"-s",
"--save-donor-list",
dest="save_donor_list",
default=True,
help="specify if you want to save the donor list (True/False)"
)

args = parser.parse_args()


# %% FUNCTIONS
def run_process(func_name, userid, donor_group):
    func_path = os.path.join(".", func_name)

    p = sub.Popen(
        [
            "python", func_path,
            "-d", args.date_stamp,
            "-dg", donor_group,
            "-u", userid,
            "-o", args.data_path
        ],
        stdout=sub.PIPE,
        stderr=sub.PIPE
    )

    output, errors = p.communicate()
    output = output.decode("utf-8")
    errors = errors.decode("utf-8")

    if errors == '':
        print(output)
    else:
        print(errors)

    return


def get_all_data(userid, donor_group):

run_process("get_single_donor_metadata.py", userid, donor_group)
run_process("get_single_tidepool_dataset_json.py", userid, donor_group)

return


# %% GET LATEST DONOR LIST
final_donor_list = accept_and_get_list(args)


# %% GET DONOR META DATA AND DATASETS
# use multiple cores to process
startTime = time.time()
print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
pool = Pool(os.cpu_count())
pool.starmap(get_all_data, zip(
final_donor_list["userID"],
final_donor_list["donorGroup"]
))
pool.close()
endTime = time.time()
print(
"finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
)
total_duration = round((endTime - startTime) / 60, 1)
print("total duration was %s minutes" % total_duration)


# %% COMBINE AND SAVE ALL DONOR METADATA
print("combining all metadata")
phi_date_stamp = "PHI-" + args.date_stamp
donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")

metadata_path = os.path.join(
    args.data_path,
    phi_date_stamp + "-donor-data",
    phi_date_stamp + "-metadata"
)

all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
all_metadata = pd.DataFrame()
for f in all_files:
    temp_meta = pd.read_csv(f)
    temp_meta.rename(columns={"Unnamed: 0": "userid"}, inplace=True)
    all_metadata = pd.concat(
        [all_metadata, temp_meta],
        ignore_index=True,
        sort=False
    )

all_metadata.to_csv(
    os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv")
)
print("saving metadata...code complete")
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


# %% REQUIRED LIBRARIES
import datetime as dt
import pandas as pd
import os
import glob
import argparse


# %% FUNCTIONS
def get_dataset_summaries(
    save_data_path=os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "data"
        )
    ),
    date_stamp=dt.datetime.now().strftime("%Y-%m-%d"),
):



phi_date_stamp = "PHI-" + args.date_stamp
donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")

print("combining all dataset metadata")

    metadata_path = os.path.join(
        save_data_path,
        phi_date_stamp + "-donor-data",
        phi_date_stamp + "-datasetSummary"
    )

    all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
    dataset_metadata = pd.DataFrame()
    n_files = len(all_files)
    print("there are {} files".format(n_files))
    f_counter = 1
    for f in all_files:
        temp_meta = pd.read_csv(f)
        temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)
        userid = f[-32:-22]
        temp_meta["userid"] = userid
        dataset_metadata = pd.concat(
            [dataset_metadata, temp_meta],
            ignore_index=True,
            sort=False
        )

        if f_counter % 10 == 0:
            print("completed file {} of {}".format(f_counter, n_files))
        f_counter = f_counter + 1
    dataset_metadata.to_csv(
        os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv.gz")
    )
    print("saving all-dataset-info-metadata...code complete")

    return


# %% MAIN
if __name__ == "__main__":
    # USER INPUTS (choices to be made in order to run the code)
    codeDescription = "combine all dataset summaries (metadata)"
    parser = argparse.ArgumentParser(description=codeDescription)

    parser.add_argument(
        "-d",
        "--date-stamp",
        dest="date_stamp",
        default=dt.datetime.now().strftime("%Y-%m-%d"),
        help="date, in '%Y-%m-%d' format, of the date when " +
        "donors were accepted"
    )

    parser.add_argument(
        "-o",
        "--output-data-path",
        dest="data_path",
        default=os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "data"
            )
        ),
        help="the output path where the data is stored"
    )

    args = parser.parse_args()

    # the main function
    get_dataset_summaries(
        save_data_path=args.data_path,
        date_stamp=args.date_stamp
    )
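For context, get_dataset_summaries can also be imported and called directly instead of going through the command line. A minimal sketch, assuming the new file is named get_dataset_summaries.py (file names are not shown in this view) and using placeholder arguments:

# assumes the module name matches the function name; adjust the import if the file is named differently
from get_dataset_summaries import get_dataset_summaries

get_dataset_summaries(
    save_data_path="../data",
    date_stamp="2019-08-22"
)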