Etn/get cgm stats #57

Open
Wants to merge 46 commits into master
Commits (46)
865f4ea
create folder and save functions
ed-nykaza Jul 15, 2019
10116db
save as gzipped csv
ed-nykaza Jul 16, 2019
fe9a04a
capture dataset info
ed-nykaza Jul 16, 2019
c80ca20
sort by userid instead of donor group
ed-nykaza Jul 17, 2019
7340a5c
just get json file
ed-nykaza Jul 17, 2019
0d08b82
get interim data summaries
ed-nykaza Jul 21, 2019
7b28164
update env to include latest spyder
ed-nykaza Jul 24, 2019
09eaca0
add __init__.py and rename folders
ed-nykaza Jul 24, 2019
a1ce991
get data from api
ed-nykaza Jul 24, 2019
5629152
add path if needed
ed-nykaza Jul 24, 2019
8fa5f2c
initial commit WIP
ed-nykaza Jul 25, 2019
fae4788
distinguish donor metadata from data metadata
ed-nykaza Jul 29, 2019
9ef98f1
refactor round_time
ed-nykaza Jul 29, 2019
302e45b
add upload time to data
ed-nykaza Jul 29, 2019
75bb4ff
handle edge case where uploadId is not given
ed-nykaza Jul 29, 2019
61ae41c
apply timezoneOffset correction
ed-nykaza Jul 29, 2019
4b86a07
refactor of estimate-local-time to handle healthkit data
ed-nykaza Jul 30, 2019
21ff818
clean cgm data
ed-nykaza Jul 30, 2019
8dd7465
get cgm 5 minute time series
ed-nykaza Jul 30, 2019
ef40d18
add new functions that get embedded json data
ed-nykaza Aug 7, 2019
dc9f19e
remove spike data
ed-nykaza Aug 8, 2019
be7177e
make sure there is timezone information
ed-nykaza Aug 12, 2019
07b211e
refactor remove spike data
ed-nykaza Aug 13, 2019
a240bd7
wip cgm distributions
ed-nykaza Aug 16, 2019
890bac8
save output
ed-nykaza Aug 17, 2019
1d4ef81
remove collecting all metadata
ed-nykaza Aug 17, 2019
264a79e
add additional metadata to output
ed-nykaza Aug 17, 2019
20bb215
fix spike data drop bug
ed-nykaza Aug 17, 2019
241987b
refactor sensing cgmModel
ed-nykaza Aug 19, 2019
e324f77
first commit of cgm stats
ed-nykaza Aug 19, 2019
ddabb81
initial commit of episodes
ed-nykaza Aug 19, 2019
cded1cc
next increment of episodes
ed-nykaza Aug 19, 2019
0b9a3b4
move percentile calculations to full range of data section
ed-nykaza Aug 19, 2019
4c9714a
get episode stats
ed-nykaza Aug 20, 2019
2fe6065
minor refactor
ed-nykaza Aug 20, 2019
665598d
resolve edge case of not having quarterly stats
ed-nykaza Aug 20, 2019
5432b33
initial commit of batch process all cgm distribution and stats
ed-nykaza Aug 20, 2019
4b44457
skip already processed and use 1/2 of processors
ed-nykaza Aug 20, 2019
57dc7fd
get results script
ed-nykaza Aug 21, 2019
60e6c22
pull files into pandas with low_memory flag
ed-nykaza Aug 21, 2019
a984e12
save results in chunks
ed-nykaza Aug 21, 2019
858372a
move files and modify print statements
ed-nykaza Aug 21, 2019
4206d72
remove combine files
ed-nykaza Aug 21, 2019
3f1ea02
use all processors
ed-nykaza Aug 21, 2019
8a7f86e
save json data in folder that is compatible with old pipeline
ed-nykaza Aug 22, 2019
01d6dc1
refactor to download data if json data path is not provided
ed-nykaza Aug 22, 2019
2 changes: 2 additions & 0 deletions .gitignore
@@ -35,3 +35,5 @@ projects/loop-algorithm/figures/
projects/parsers/output/

projects/get-donors-pump-settings/temp-plot\.html

projects/bigdata-processing-pipeline/get_stats/debug/
Empty file.
3 changes: 1 addition & 2 deletions projects/bigdata-processing-pipeline/environment.yml
@@ -3,9 +3,8 @@ channels:
- defaults
dependencies:
- python=3.7.3
- numpy=1.16.4
- pandas=0.24.2
- spyder=3.3.6
- pip=19.1.1
- spyder=3.3.5
- pip:
  - python-dotenv==0.10.3
Empty file.
@@ -16,6 +16,7 @@
import requests
import json
import argparse
import pdb
envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if envPath not in sys.path:
    sys.path.insert(0, envPath)
@@ -247,7 +248,7 @@ def accept_and_get_list(args):
    )

    # polish up the final donor list
    final_donor_list.sort_values(by="donorGroup", inplace=True)
    final_donor_list.sort_values(by="userID", inplace=True)
    final_donor_list.reset_index(drop=True, inplace=True)

    if args.save_donor_list:
@@ -25,6 +25,6 @@
)
data, _ = get_data(
    donor_group="bigdata",
    userid_of_shared_user="0d4524bc11",
    userid="0d4524bc11",
    weeks_of_data=4
)
@@ -117,12 +117,11 @@ def get_all_data(userid, donor_group):

metadata_path = os.path.join(
    args.data_path,
    "PHI-" + "2019-07-13" + "-donor-data",
    "PHI-" + "2019-07-13" + "-metadata"

    phi_date_stamp + "-donor-data",
    phi_date_stamp + "-metadata"
)

all_files = glob.glob(os.path.join(metadata_path, "*.csv"))
all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
all_metadata = pd.DataFrame()
for f in all_files:
    temp_meta = pd.read_csv(f)
@@ -137,3 +136,32 @@ def get_all_data(userid, donor_group):
    os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv")
)
print("saving metadata...code complete")


# %% COMBINE AND SAVE ALL DATASET INFO (METADATA)
print("combining all dataset metadata")

metadata_path = os.path.join(
    args.data_path,
    phi_date_stamp + "-donor-data",
    phi_date_stamp + "-datasetSummary"
)

all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
dataset_metadata = pd.DataFrame()
for f in all_files:
    temp_meta = pd.read_csv(f)
    temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)
    userid = f[-32:-22]
    temp_meta["userid"] = userid
    dataset_metadata = pd.concat(
        [dataset_metadata, temp_meta],
        ignore_index=True,
        sort=False
    )

dataset_metadata.to_csv(
    os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv")
)
print("saving all-dataset-info-metadata...code complete")

@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
"""accept_donors_and_pull_data.py
This is a wrapper script that accepts all bigdata donation project donors,
and then pulls all of their datasets for further processing.
"""

# %% REQUIRED LIBRARIES
from accept_new_donors_and_get_donor_list import accept_and_get_list
import datetime as dt
import pandas as pd
import subprocess as sub
import os
import glob
import time
import argparse
from multiprocessing import Pool


# %% USER INPUTS (choices to be made in order to run the code)
codeDescription = "accepts new donors (shares) and grab their data"
parser = argparse.ArgumentParser(description=codeDescription)

parser.add_argument(
"-d",
"--date-stamp",
dest="date_stamp",
default=dt.datetime.now().strftime("%Y-%m-%d"),
help="date, in '%Y-%m-%d' format, of the date when " +
"donors were accepted"
)

parser.add_argument(
"-o",
"--output-data-path",
dest="data_path",
default=os.path.abspath(
os.path.join(
os.path.dirname(__file__), "..", "data"
)
),
help="the output path where the data is stored"
)

parser.add_argument(
"-s",
"--save-donor-list",
dest="save_donor_list",
default=True,
help="specify if you want to save the donor list (True/False)"
)

args = parser.parse_args()


# %% FUNCTIONS
def run_process(func_name, userid, donor_group):
    func_path = os.path.join(".", func_name)

    p = sub.Popen(
        [
            "python", func_path,
            "-d", args.date_stamp,
            "-dg", donor_group,
            "-u", userid,
            "-o", args.data_path
        ],
        stdout=sub.PIPE,
        stderr=sub.PIPE
    )

    output, errors = p.communicate()
    output = output.decode("utf-8")
    errors = errors.decode("utf-8")

    if errors == '':
        print(output)
    else:
        print(errors)

    return


def get_all_data(userid, donor_group):

run_process("get_single_donor_metadata.py", userid, donor_group)
run_process("get_single_tidepool_dataset_json.py", userid, donor_group)

return


# %% GET LATEST DONOR LIST
final_donor_list = accept_and_get_list(args)


# %% GET DONOR META DATA AND DATASETS
# use multiple cores to process
startTime = time.time()
print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
pool = Pool(os.cpu_count())
pool.starmap(get_all_data, zip(
final_donor_list["userID"],
final_donor_list["donorGroup"]
))
pool.close()
endTime = time.time()
print(
"finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
)
total_duration = round((endTime - startTime) / 60, 1)
print("total duration was %s minutes" % total_duration)


# %% COMBINE AND SAVE ALL DONOR METADATA
print("combining all metadata")
phi_date_stamp = "PHI-" + args.date_stamp
donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")

metadata_path = os.path.join(
    args.data_path,
    phi_date_stamp + "-donor-data",
    phi_date_stamp + "-metadata"
)

all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
all_metadata = pd.DataFrame()
for f in all_files:
    temp_meta = pd.read_csv(f)
    temp_meta.rename(columns={"Unnamed: 0": "userid"}, inplace=True)
    all_metadata = pd.concat(
        [all_metadata, temp_meta],
        ignore_index=True,
        sort=False
    )

all_metadata.to_csv(
    os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv")
)
print("saving metadata...code complete")
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


# %% REQUIRED LIBRARIES
import datetime as dt
import pandas as pd
import os
import glob
import argparse


# %% FUNCTIONS
def get_dataset_summaries(
    save_data_path=os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "data"
        )
    ),
    date_stamp=dt.datetime.now().strftime("%Y-%m-%d"),
):



phi_date_stamp = "PHI-" + args.date_stamp
donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")

print("combining all dataset metadata")

    metadata_path = os.path.join(
        save_data_path,
        phi_date_stamp + "-donor-data",
        phi_date_stamp + "-datasetSummary"
    )

    all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
    dataset_metadata = pd.DataFrame()
    n_files = len(all_files)
    print("there are {} files".format(n_files))
    f_counter = 1
    for f in all_files:
        temp_meta = pd.read_csv(f)
        temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)
        userid = f[-32:-22]
        temp_meta["userid"] = userid
        dataset_metadata = pd.concat(
            [dataset_metadata, temp_meta],
            ignore_index=True,
            sort=False
        )

        if f_counter % 10 == 0:
            print("completed file {} of {}".format(f_counter, n_files))
        f_counter = f_counter + 1
    dataset_metadata.to_csv(
        os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv.gz")
    )
    print("saving all-dataset-info-metadata...code complete")

    return


# %% MAIN
if __name__ == "__main__":
    # USER INPUTS (choices to be made in order to run the code)
    codeDescription = "combine all dataset summaries (metadata)"
    parser = argparse.ArgumentParser(description=codeDescription)

    parser.add_argument(
        "-d",
        "--date-stamp",
        dest="date_stamp",
        default=dt.datetime.now().strftime("%Y-%m-%d"),
        help="date, in '%Y-%m-%d' format, of the date when " +
        "donors were accepted"
    )

    parser.add_argument(
        "-o",
        "--output-data-path",
        dest="data_path",
        default=os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "data"
            )
        ),
        help="the output path where the data is stored"
    )

    args = parser.parse_args()

    # the main function
    get_dataset_summaries(
        save_data_path=args.data_path,
        date_stamp=args.date_stamp
    )
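For context, get_dataset_summaries can also be imported and called directly instead of going through the command line. A minimal sketch, assuming the new file is named get_dataset_summaries.py (file names are not shown in this view) and using placeholder arguments:

# assumes the module name matches the function name; adjust the import if the file is named differently
from get_dataset_summaries import get_dataset_summaries

get_dataset_summaries(
    save_data_path="../data",
    date_stamp="2019-08-22"
)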