From 3bcbb2fcaa17b5ec69c4de7def93923eda42b2e3 Mon Sep 17 00:00:00 2001
From: Evan Sarmiento <evansarm@gmail.com>
Date: Wed, 26 Jun 2024 10:34:15 -0400
Subject: [PATCH] Fix CGA Events metric?

---
 cga.py | 203 ++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 158 insertions(+), 45 deletions(-)

diff --git a/cga.py b/cga.py
index 29c8725..2d8cd90 100644
--- a/cga.py
+++ b/cga.py
@@ -5,17 +5,51 @@
 import os
 
 sheets = [
-    ["cgaContact", 279615175, os.getenv("SHEET_URL_CGA_CONTACT"), "A:H", [0, 5, 6, 7]],  # OK
-    ["cgaWorkshopEvaluation", 1803423154, os.getenv("SHEET_URL_CGA_WORKSHOP_EVALUATIONS"), "A:P",
-     [0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
+    [
+        "cgaContact",
+        279615175,
+        os.getenv("SHEET_URL_CGA_CONTACT"),
+        "A:H",
+        [0, 5, 6, 7],
+    ],  # OK
+    [
+        "cgaWorkshopEvaluation",
+        1803423154,
+        os.getenv("SHEET_URL_CGA_WORKSHOP_EVALUATIONS"),
+        "A:P",
+        [0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+    ],
     # OK
-    ["cgaGISApplication", 1021617292, os.getenv("SHEET_URL_CGA_GIS_APPLICATION"), "A:N", [0, 1, 5, 6, 7]],
+    [
+        "cgaGISApplication",
+        1021617292,
+        os.getenv("SHEET_URL_CGA_GIS_APPLICATION"),
+        "A:N",
+        [0, 1, 5, 6, 7],
+    ],
     # OK
-    ["cgaEventRegistration", 340045856, os.getenv("SHEET_URL_CGA_EVENT_REGISTRATION"), "A:L", [0, 1, 5, 6, 7]],
+    [
+        "cgaEventRegistration",
+        340045856,
+        os.getenv("SHEET_URL_CGA_EVENT_REGISTRATION"),
+        "A:L",
+        [0, 1, 5, 6, 7],
+    ],
     # OK
-    ["cgaTrainingRegistration", 2068274999, os.getenv("SHEET_URL_CGA_TRAINING_REGISTRATION"), "A:M",
-     [0, 1, 2, 6, 7, 8, 11, 12]],
-    ["cgaLicenseRequest", 842362239, os.getenv("SHEET_URL_CGA_LICENSE_REQUEST"), "A:L", [0, 5, 6, 7, 10, 11]]
+    [
+        "cgaTrainingRegistration",
+        2068274999,
+        os.getenv("SHEET_URL_CGA_TRAINING_REGISTRATION"),
+        "A:M",
+        [0, 1, 2, 6, 7, 8, 11, 12],
+    ],
+    [
+        "cgaLicenseRequest",
+        842362239,
+        os.getenv("SHEET_URL_CGA_LICENSE_REQUEST"),
+        "A:L",
+        [0, 5, 6, 7, 10, 11],
+    ],
 ]
 
 
@@ -33,7 +67,9 @@ def harvest_cga(path):
         sheet_url = s[2]
         range_name = s[3]
         columns = s[4]
-        harvest_sheet_tsv_http(path, collection, sheet_url, range_name, columns, gid=gid)
+        harvest_sheet_tsv_http(
+            path, collection, sheet_url, range_name, columns, gid=gid
+        )
 
     return
 
@@ -52,6 +88,7 @@ def aggregate_cga(path):
     gis_institute(path)
     cga_lic_req_top10(path)
     cga_lic_req_status(path)
+    cga_event_registration_aggr(path)
 
 
 def cga_contact_school(path):
@@ -61,14 +98,20 @@ def cga_contact_school(path):
     @return: nothing
     """
     #  ----------------------------------------------
-    df = pd.read_csv(path + 'cgaContact.tsv', delimiter="\t")
-    df_contact_12mo = filter_last_12_months(df, 'Timestamp')
+    df = pd.read_csv(path + "cgaContact.tsv", delimiter="\t")
+    df_contact_12mo = filter_last_12_months(df, "Timestamp")
     c = "Your primary affiliated school at Harvard"
     df_contact_12mo_aggr = get_counts(df_contact_12mo, column=c)
-    df_contact_12mo_aggr = df_contact_12mo_aggr[df_contact_12mo_aggr[c] != "Non-Harvard"]
-    df_contact_12mo_aggr = create_percentage(df_contact_12mo_aggr, 'count')
-    df_contact_12mo_aggr.to_csv(path + "cga_contact_last_12_months_by_school.tsv", sep='\t', index=True,
-                                index_label="id")
+    df_contact_12mo_aggr = df_contact_12mo_aggr[
+        df_contact_12mo_aggr[c] != "Non-Harvard"
+    ]
+    df_contact_12mo_aggr = create_percentage(df_contact_12mo_aggr, "count")
+    df_contact_12mo_aggr.to_csv(
+        path + "cga_contact_last_12_months_by_school.tsv",
+        sep="\t",
+        index=True,
+        index_label="id",
+    )
 
 
 def cga_lic_req_status(path):
@@ -77,14 +120,18 @@ def cga_lic_req_status(path):
     @param path: path where to write the TSV
     @return: nothing
     """
-    df = pd.read_csv(path + 'cgaLicenseRequest.tsv', delimiter="\t")
-    df_lic_12mo = filter_last_12_months(df, 'Timestamp')
+    df = pd.read_csv(path + "cgaLicenseRequest.tsv", delimiter="\t")
+    df_lic_12mo = filter_last_12_months(df, "Timestamp")
     c = "Your primary affiliated school at Harvard"
     df_aggr_status = get_counts(df_lic_12mo, c)
     df_aggr_status = df_aggr_status[df_aggr_status[c] != "Non-Harvard"]
-    df_aggr_status = create_percentage(df_aggr_status, 'count')
-    df_aggr_status.to_csv(path + "cga_license_request_last_12_months_by_status.tsv", sep='\t', index=True,
-                          index_label="id")
+    df_aggr_status = create_percentage(df_aggr_status, "count")
+    df_aggr_status.to_csv(
+        path + "cga_license_request_last_12_months_by_status.tsv",
+        sep="\t",
+        index=True,
+        index_label="id",
+    )
 
 
 def cga_lic_req_top10(path):
@@ -93,7 +140,7 @@ def cga_lic_req_top10(path):
     @param path: path where to write the TSV
     @return: nothing
     """
-    df = pd.read_csv(path + 'cgaLicenseRequest.tsv', delimiter="\t")
+    df = pd.read_csv(path + "cgaLicenseRequest.tsv", delimiter="\t")
     # smaller df for last 12 months
     df2 = df[["Software product which you need a license for", "Timestamp"]]
     df3 = filter_last_12_months(df2, "Timestamp", drop_datetime=True)
@@ -103,8 +150,17 @@ def cga_lic_req_top10(path):
     # clean up of output
     df3 = df3.reset_index()
     df3 = df3.rename(
-        columns={'Software product which you need a license for': 'Software product', 'Timestamp': 'count'})
-    df3.to_csv(path + "cga_license_req_last_12_months.tsv", sep='\t', index=True, index_label="id")
+        columns={
+            "Software product which you need a license for": "Software product",
+            "Timestamp": "count",
+        }
+    )
+    df3.to_csv(
+        path + "cga_license_req_last_12_months.tsv",
+        sep="\t",
+        index=True,
+        index_label="id",
+    )
 
 
 def gis_institute(path):
@@ -114,13 +170,19 @@ def gis_institute(path):
     @return: nothing
     """
     #  ---------------------------
-    df = pd.read_csv(path + 'cgaGISApplication.tsv', delimiter="\t")
+    df = pd.read_csv(path + "cgaGISApplication.tsv", delimiter="\t")
     applications_ytd = len(get_records_YTD(df, drop_datetime=True))
-    write_metric(path=path, group="CGA", metric="GIS Institute Applications",
-                 title="GIS Institute",
-                 value=applications_ytd, unit="Number of applications " + get_current_year_str() + " YTD",
-                 icon="fa fa-university", color="blue",
-                 url="")
+    write_metric(
+        path=path,
+        group="CGA",
+        metric="GIS Institute Applications",
+        title="GIS Institute",
+        value=applications_ytd,
+        unit="Number of applications " + get_current_year_str() + " YTD",
+        icon="fa fa-university",
+        color="blue",
+        url="",
+    )
 
 
 def cga_training_evaluations(path):
@@ -129,10 +191,55 @@ def cga_training_evaluations(path):
     @param path: path where to write the TSV
     @return: nothing
     """
-    df = pd.read_csv(path + 'cgaWorkshopEvaluation.tsv', delimiter="\t")
+    df = pd.read_csv(path + "cgaWorkshopEvaluation.tsv", delimiter="\t")
     df_aggr = df.describe()[1:2].transpose()
     df_aggr = df_aggr.transform(lambda x: round(x, 2))
-    df_aggr.to_csv(path + 'cga_workshop_evaluations.tsv', sep="\t", index=True, index_label="metric")
+    df_aggr.to_csv(
+        path + "cga_workshop_evaluations.tsv",
+        sep="\t",
+        index=True,
+        index_label="metric",
+    )
+
+
+def cga_event_registration_aggr(path):
+    """
+    Aggregate number of registration per workshop. Selects last 12 months and only workshops with more than
+    5 registrations
+    @param path: path where to write the TSV
+    @return: nothing
+    """
+    # Training (C) --------------------------------
+    df = pd.read_csv(path + "cgaEventRegistration.tsv", delimiter="\t")
+    df = filter_last_12_months(df, "Timestamp")
+    df["name"] = df["The event name"]
+    df = df.sort_values("datetime")
+
+    # create a list with unique courses in time order
+    df2 = df[["name"]].drop_duplicates()
+    df2.reset_index(drop=True)  # save the order
+
+    # count the number or registrations and save ones  with more than 5
+    df3 = df[["name", "datetime"]].groupby(["name"]).count()
+    df3 = df3[df3["datetime"] > 5]
+
+    # join with the table with the correct order and rename columns
+    df3 = df2.merge(df3, how="inner", on="name").drop_duplicates()[["name", "datetime"]]
+    df_aggr = df3.rename(columns={"name": "course", "datetime": "registration_count"})
+    # save
+    registrations_ytd = int(df_aggr.tail(1)['registration_count'][0])
+
+    write_metric(
+        path=path,
+        group="CGA",
+        metric="Number of Registrations for CGA Conference",
+        title="CGA Events",
+        value=registrations_ytd,
+        unit="Registrations for CGA Conference " + get_current_year_str() + " YTD",
+        icon="fa fa-university",
+        color="blue",
+        url="",
+    )
 
 
 def cga_training_aggr(path):
@@ -143,10 +250,12 @@ def cga_training_aggr(path):
     @return: nothing
     """
     # Training (C) --------------------------------
-    df = pd.read_csv(path + 'cgaTrainingRegistration.tsv', delimiter="\t")
-    df = filter_last_12_months(df, 'Date of the training workshop')
+    df = pd.read_csv(path + "cgaTrainingRegistration.tsv", delimiter="\t")
+    df = filter_last_12_months(df, "Date of the training workshop")
     df["month"] = df.datetime.transform(lambda x: x.strftime("%b") + " " + str(x.year))
-    df["name"] = df["Name of the training workshop"] + "#(" + df["month"] + ")"  # we name the column 'count'
+    df["name"] = (
+        df["Name of the training workshop"] + "#(" + df["month"] + ")"
+    )  # we name the column 'count'
     df = df.sort_values("datetime")
 
     # create a list with unique courses in time order
@@ -154,14 +263,14 @@ def cga_training_aggr(path):
     df2.reset_index(drop=True)  # save the order
 
     # count the number or registrations and save ones  with more than 5
-    df3 = df[["name", "datetime"]].groupby(['name']).count()
+    df3 = df[["name", "datetime"]].groupby(["name"]).count()
     df3 = df3[df3["datetime"] > 5]
 
     # join with the table with the correct order and rename columns
     df3 = df2.merge(df3, how="inner", on="name").drop_duplicates()[["name", "datetime"]]
-    df_aggr = df3.rename(columns={'name': 'course', 'datetime': 'registration_count'})
+    df_aggr = df3.rename(columns={"name": "course", "datetime": "registration_count"})
     # save
-    df_aggr.to_csv(path + 'cga_training.tsv', sep="\t", index=True, index_label="id")
+    df_aggr.to_csv(path + "cga_training.tsv", sep="\t", index=True, index_label="id")
 
 
 def cga_contact_time(path):
@@ -170,10 +279,12 @@ def cga_contact_time(path):
     @param path: path where to write the TSV
     @return: nothing
     """
-    df = pd.read_csv(path + 'cgaContact.tsv', delimiter="\t")
+    df = pd.read_csv(path + "cgaContact.tsv", delimiter="\t")
     df["date"] = df.Timestamp.transform(lambda x: convert_timestamp_str(x)[:7])
-    df_aggr = pd.DataFrame({'count': df["date"].value_counts()}).sort_index()
-    df_previous_12_months(df_aggr).to_csv(path + 'cga_contact.tsv', sep="\t", index=True, index_label="date")
+    df_aggr = pd.DataFrame({"count": df["date"].value_counts()}).sort_index()
+    df_previous_12_months(df_aggr).to_csv(
+        path + "cga_contact.tsv", sep="\t", index=True, index_label="date"
+    )
 
 
 def cga_contact_status(path):
@@ -182,10 +293,12 @@ def cga_contact_status(path):
     @param path: path where to write the TSV
     @return: nothing
     """
-    df = pd.read_csv(path + 'cgaContact.tsv', delimiter="\t")
-    c = 'Your Harvard status/appointment'
-    df_aggr = filter_last_12_months(df, 'Timestamp')
+    df = pd.read_csv(path + "cgaContact.tsv", delimiter="\t")
+    c = "Your Harvard status/appointment"
+    df_aggr = filter_last_12_months(df, "Timestamp")
     df_aggr2 = get_counts(df_aggr, c)
     df_aggr2 = df_aggr2[df_aggr2["Your Harvard status/appointment"] != "Non-Harvard"]
-    df_aggr2 = create_percentage(df_aggr2, 'count')
-    df_aggr2.to_csv(path + 'cga_contact_status.tsv', sep="\t", index=True, index_label="id")
+    df_aggr2 = create_percentage(df_aggr2, "count")
+    df_aggr2.to_csv(
+        path + "cga_contact_status.tsv", sep="\t", index=True, index_label="id"
+    )