V11 - Added V11: Non-clinical visits issues have been fixed (#15)

* V11 - Added V11: Non-clinical visits issues have been fixed * Updated changes for V11 --------- Co-authored-by: Ian <[email protected]>
AMPATH · Dec 6, 2024 · cbc38eb · cbc38eb
1 parent 62d0159
commit cbc38eb
Show file tree

Hide file tree

Showing 49 changed files with 221,408 additions and 24 deletions.
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
@@ -22,4 +22,4 @@ jobs:
         uses: docker/build-push-action@v4
         with:
           push: true
-          tags: ampathke/ampath-iit-prediction-model-v9:latest
+          tags: ampathke/ampath-iit-prediction-model-v11:latest
diff --git a/Dockerfile b/Dockerfile
@@ -48,8 +48,7 @@ RUN chmod 0644 /etc/cron.d/iit-crontab
 RUN crontab -u root /etc/cron.d/iit-crontab
 
 # Add the prediction models to the app
-COPY IIT-Prediction/model/V9B /app/model
-COPY IIT-Prediction/model/V10 /app/model
+COPY IIT-Prediction/model/V11 /app/model
 # Add the production extraction query to the app
 COPY SQL/iit_prod_data_extract.sql /app/iit_prod_data_extract.sql
 # Add the production threshold queries to the app

diff --git a/..._AutoML_17_20241028_160939_model_3_auc_0.719/GBM_grid_1_AutoML_17_20241028_160939_model_3 b/..._AutoML_17_20241028_160939_model_3_auc_0.719/GBM_grid_1_AutoML_17_20241028_160939_model_3
diff --git a/...oML_17_20241028_160939_auc_0.722/StackedEnsemble_BestOfFamily_1_AutoML_17_20241028_160939 b/...oML_17_20241028_160939_auc_0.722/StackedEnsemble_BestOfFamily_1_AutoML_17_20241028_160939
diff --git a/...day_adult_IIT/2_GBM_3_AutoML_17_20241028_160939_auc_0.721/GBM_3_AutoML_17_20241028_160939 b/...day_adult_IIT/2_GBM_3_AutoML_17_20241028_160939_auc_0.721/GBM_3_AutoML_17_20241028_160939
diff --git a/...day_adult_IIT/3_GBM_1_AutoML_17_20241028_160939_auc_0.721/GBM_1_AutoML_17_20241028_160939 b/...day_adult_IIT/3_GBM_1_AutoML_17_20241028_160939_auc_0.721/GBM_1_AutoML_17_20241028_160939
diff --git a/...day_adult_IIT/4_GBM_4_AutoML_17_20241028_160939_auc_0.721/GBM_4_AutoML_17_20241028_160939 b/...day_adult_IIT/4_GBM_4_AutoML_17_20241028_160939_auc_0.721/GBM_4_AutoML_17_20241028_160939
diff --git a/...day_adult_IIT/5_GBM_2_AutoML_17_20241028_160939_auc_0.721/GBM_2_AutoML_17_20241028_160939 b/...day_adult_IIT/5_GBM_2_AutoML_17_20241028_160939_auc_0.721/GBM_2_AutoML_17_20241028_160939
diff --git a/...1day_adult_IIT/6_GBM_5_AutoML_17_20241028_160939_auc_0.72/GBM_5_AutoML_17_20241028_160939 b/...1day_adult_IIT/6_GBM_5_AutoML_17_20241028_160939_auc_0.72/GBM_5_AutoML_17_20241028_160939
diff --git a/...lt_IIT/7_XGBoost_3_AutoML_17_20241028_160939_auc_0.72/XGBoost_3_AutoML_17_20241028_160939 b/...lt_IIT/7_XGBoost_3_AutoML_17_20241028_160939_auc_0.72/XGBoost_3_AutoML_17_20241028_160939
diff --git a/...toML_17_20241028_160939_model_3_auc_0.72/XGBoost_grid_1_AutoML_17_20241028_160939_model_3 b/...toML_17_20241028_160939_model_3_auc_0.72/XGBoost_grid_1_AutoML_17_20241028_160939_model_3
diff --git a/...ML_17_20241028_160939_model_11_auc_0.72/XGBoost_grid_1_AutoML_17_20241028_160939_model_11 b/...ML_17_20241028_160939_model_11_auc_0.72/XGBoost_grid_1_AutoML_17_20241028_160939_model_11
diff --git a/..._AutoML_18_20241028_193321_model_3_auc_0.705/GBM_grid_1_AutoML_18_20241028_193321_model_3 b/..._AutoML_18_20241028_193321_model_3_auc_0.705/GBM_grid_1_AutoML_18_20241028_193321_model_3
diff --git a/...oML_18_20241028_193321_auc_0.711/StackedEnsemble_BestOfFamily_1_AutoML_18_20241028_193321 b/...oML_18_20241028_193321_auc_0.711/StackedEnsemble_BestOfFamily_1_AutoML_18_20241028_193321
diff --git a/...1day_minor_IIT/2_GBM_5_AutoML_18_20241028_193321_auc_0.71/GBM_5_AutoML_18_20241028_193321 b/...1day_minor_IIT/2_GBM_5_AutoML_18_20241028_193321_auc_0.71/GBM_5_AutoML_18_20241028_193321
diff --git a/...day_minor_IIT/3_GBM_3_AutoML_18_20241028_193321_auc_0.709/GBM_3_AutoML_18_20241028_193321 b/...day_minor_IIT/3_GBM_3_AutoML_18_20241028_193321_auc_0.709/GBM_3_AutoML_18_20241028_193321
diff --git a/..._AutoML_18_20241028_193321_model_2_auc_0.708/GBM_grid_1_AutoML_18_20241028_193321_model_2 b/..._AutoML_18_20241028_193321_model_2_auc_0.708/GBM_grid_1_AutoML_18_20241028_193321_model_2
diff --git a/..._AutoML_18_20241028_193321_model_7_auc_0.707/GBM_grid_1_AutoML_18_20241028_193321_model_7 b/..._AutoML_18_20241028_193321_model_7_auc_0.707/GBM_grid_1_AutoML_18_20241028_193321_model_7
diff --git a/...day_minor_IIT/6_GBM_2_AutoML_18_20241028_193321_auc_0.707/GBM_2_AutoML_18_20241028_193321 b/...day_minor_IIT/6_GBM_2_AutoML_18_20241028_193321_auc_0.707/GBM_2_AutoML_18_20241028_193321
diff --git a/...r_IIT/7_XGBoost_3_AutoML_18_20241028_193321_auc_0.707/XGBoost_3_AutoML_18_20241028_193321 b/...r_IIT/7_XGBoost_3_AutoML_18_20241028_193321_auc_0.707/XGBoost_3_AutoML_18_20241028_193321
diff --git a/..._AutoML_18_20241028_193321_model_6_auc_0.706/GBM_grid_1_AutoML_18_20241028_193321_model_6 b/..._AutoML_18_20241028_193321_model_6_auc_0.706/GBM_grid_1_AutoML_18_20241028_193321_model_6
diff --git a/...day_minor_IIT/9_GBM_1_AutoML_18_20241028_193321_auc_0.706/GBM_1_AutoML_18_20241028_193321 b/...day_minor_IIT/9_GBM_1_AutoML_18_20241028_193321_auc_0.706/GBM_1_AutoML_18_20241028_193321
diff --git a/..._AutoML_15_20241028_105945_model_3_auc_0.753/GBM_grid_1_AutoML_15_20241028_105945_model_3 b/..._AutoML_15_20241028_105945_model_3_auc_0.753/GBM_grid_1_AutoML_15_20241028_105945_model_3
diff --git a/...oML_15_20241028_105945_auc_0.756/StackedEnsemble_BestOfFamily_1_AutoML_15_20241028_105945 b/...oML_15_20241028_105945_auc_0.756/StackedEnsemble_BestOfFamily_1_AutoML_15_20241028_105945
diff --git a/...ays_adult_IIT/2_GBM_3_AutoML_15_20241028_105945_auc_0.755/GBM_3_AutoML_15_20241028_105945 b/...ays_adult_IIT/2_GBM_3_AutoML_15_20241028_105945_auc_0.755/GBM_3_AutoML_15_20241028_105945
diff --git a/...ays_adult_IIT/3_GBM_2_AutoML_15_20241028_105945_auc_0.755/GBM_2_AutoML_15_20241028_105945 b/...ays_adult_IIT/3_GBM_2_AutoML_15_20241028_105945_auc_0.755/GBM_2_AutoML_15_20241028_105945
diff --git a/...ays_adult_IIT/4_GBM_4_AutoML_15_20241028_105945_auc_0.755/GBM_4_AutoML_15_20241028_105945 b/...ays_adult_IIT/4_GBM_4_AutoML_15_20241028_105945_auc_0.755/GBM_4_AutoML_15_20241028_105945
diff --git a/...ays_adult_IIT/5_GBM_1_AutoML_15_20241028_105945_auc_0.754/GBM_1_AutoML_15_20241028_105945 b/...ays_adult_IIT/5_GBM_1_AutoML_15_20241028_105945_auc_0.754/GBM_1_AutoML_15_20241028_105945
diff --git a/...t_IIT/6_XGBoost_3_AutoML_15_20241028_105945_auc_0.754/XGBoost_3_AutoML_15_20241028_105945 b/...t_IIT/6_XGBoost_3_AutoML_15_20241028_105945_auc_0.754/XGBoost_3_AutoML_15_20241028_105945
diff --git a/...ays_adult_IIT/7_GBM_5_AutoML_15_20241028_105945_auc_0.754/GBM_5_AutoML_15_20241028_105945 b/...ays_adult_IIT/7_GBM_5_AutoML_15_20241028_105945_auc_0.754/GBM_5_AutoML_15_20241028_105945
diff --git a/...oML_15_20241028_105945_model_3_auc_0.754/XGBoost_grid_1_AutoML_15_20241028_105945_model_3 b/...oML_15_20241028_105945_model_3_auc_0.754/XGBoost_grid_1_AutoML_15_20241028_105945_model_3
diff --git a/...L_15_20241028_105945_model_11_auc_0.753/XGBoost_grid_1_AutoML_15_20241028_105945_model_11 b/...L_15_20241028_105945_model_11_auc_0.753/XGBoost_grid_1_AutoML_15_20241028_105945_model_11
diff --git a/..._AutoML_16_20241028_150402_model_3_auc_0.742/GBM_grid_1_AutoML_16_20241028_150402_model_3 b/..._AutoML_16_20241028_150402_model_3_auc_0.742/GBM_grid_1_AutoML_16_20241028_150402_model_3
diff --git a/...AutoML_16_20241028_150402_auc_0.749/StackedEnsemble_AllModels_1_AutoML_16_20241028_150402 b/...AutoML_16_20241028_150402_auc_0.749/StackedEnsemble_AllModels_1_AutoML_16_20241028_150402
diff --git a/...ays_minor_IIT/2_GBM_1_AutoML_16_20241028_150402_auc_0.747/GBM_1_AutoML_16_20241028_150402 b/...ays_minor_IIT/2_GBM_1_AutoML_16_20241028_150402_auc_0.747/GBM_1_AutoML_16_20241028_150402
diff --git a/...oML_16_20241028_150402_auc_0.747/StackedEnsemble_BestOfFamily_1_AutoML_16_20241028_150402 b/...oML_16_20241028_150402_auc_0.747/StackedEnsemble_BestOfFamily_1_AutoML_16_20241028_150402
diff --git a/...ays_minor_IIT/4_GBM_3_AutoML_16_20241028_150402_auc_0.746/GBM_3_AutoML_16_20241028_150402 b/...ays_minor_IIT/4_GBM_3_AutoML_16_20241028_150402_auc_0.746/GBM_3_AutoML_16_20241028_150402
diff --git a/...ays_minor_IIT/5_GBM_2_AutoML_16_20241028_150402_auc_0.745/GBM_2_AutoML_16_20241028_150402 b/...ays_minor_IIT/5_GBM_2_AutoML_16_20241028_150402_auc_0.745/GBM_2_AutoML_16_20241028_150402
diff --git a/..._AutoML_16_20241028_150402_model_6_auc_0.745/GBM_grid_1_AutoML_16_20241028_150402_model_6 b/..._AutoML_16_20241028_150402_model_6_auc_0.745/GBM_grid_1_AutoML_16_20241028_150402_model_6
diff --git a/..._AutoML_16_20241028_150402_model_2_auc_0.744/GBM_grid_1_AutoML_16_20241028_150402_model_2 b/..._AutoML_16_20241028_150402_model_2_auc_0.744/GBM_grid_1_AutoML_16_20241028_150402_model_2
diff --git a/..._AutoML_16_20241028_150402_model_7_auc_0.743/GBM_grid_1_AutoML_16_20241028_150402_model_7 b/..._AutoML_16_20241028_150402_model_7_auc_0.743/GBM_grid_1_AutoML_16_20241028_150402_model_7
diff --git a/...ays_minor_IIT/9_GBM_5_AutoML_16_20241028_150402_auc_0.742/GBM_5_AutoML_16_20241028_150402 b/...ays_minor_IIT/9_GBM_5_AutoML_16_20241028_150402_auc_0.742/GBM_5_AutoML_16_20241028_150402
diff --git a/IIT-Prediction/training scripts/V11/main.R b/IIT-Prediction/training scripts/V11/main.R
diff --git a/IIT-Prediction/training scripts/V11/utils.R b/IIT-Prediction/training scripts/V11/utils.R
@@ -0,0 +1,302 @@
+
+# Custom infix operator: the logical complement of `%in%`.
+# `x %nin% table` is TRUE for every element of `x` that is absent from `table`.
+`%nin%` <- function(x, table) {
+  !(x %in% table)
+}
+
+# Assign every patient to one of `k_folds` cross-validation folds.
+#
+# Folds are drawn per unique `person_id`, so all rows that belong to the same
+# patient receive the same `fold_id`.
+#
+# Args:
+#   df:      data frame with a `person_id` column.
+#   k_folds: number of folds; a single number >= 1.
+#
+# Returns:
+#   `df` with an additional `fold_id` column (values 1..k_folds).
+#
+# Side effect: calls set.seed(123), which resets the global RNG state so the
+# patient-to-fold mapping is reproducible.
+assign_folds <- function(df, k_folds) {
+  # Guard against `k_folds < 1`: a sequence built as 1:k_folds would count
+  # down (1, 0, ...) and silently produce an invalid fold 0.
+  stopifnot(
+    is.numeric(k_folds),
+    length(k_folds) == 1,
+    !is.na(k_folds),
+    k_folds >= 1
+  )
+
+  # Set seed for reproducibility
+  set.seed(123)
+
+  # Randomly shuffle the unique patient ids. Index with sample.int() rather
+  # than calling sample(ids): sample() treats a single numeric id `n` as the
+  # range 1:n, which would invent patients when only one id is present.
+  patient_ids <- unique(df$person_id)
+  shuffled_patient_ids <- patient_ids[sample.int(length(patient_ids))]
+
+  # Deal the fold numbers out round-robin over the shuffled ids
+  fold_ids <- rep(seq_len(k_folds), length.out = length(shuffled_patient_ids))
+
+  # Lookup table person_id -> fold_id
+  fold_lookup <- data.frame(person_id = shuffled_patient_ids,
+                            fold_id = fold_ids)
+
+  # Attach the fold to every row of the original data
+  df %>%
+    left_join(fold_lookup, by = "person_id")
+}
+
+# Blank out extreme values of a numeric vector.
+#
+# Values strictly below the `lower_quantile` quantile or strictly above the
+# `upper_quantile` quantile of `x` (both computed ignoring NAs) are replaced
+# by NA; everything else is returned unchanged.
+set_quantile_bounds_to_na <- function(x, lower_quantile = 0.01, upper_quantile = 0.99) {
+  cutoff_low <- quantile(x, probs = lower_quantile, na.rm = TRUE)
+  cutoff_high <- quantile(x, probs = upper_quantile, na.rm = TRUE)
+  is_outlier <- x > cutoff_high | x < cutoff_low
+  replace(x, is_outlier, NA)
+}
+
+# Clean the longitudinal encounter data, derive the disengagement outcomes and
+# visit-history features, and impute time-varying covariates.
+#
+# Args:
+#   df: one row per encounter. The code reads (among others) person_id,
+#       encounter_id, encounter_datetime, encounter_type, rtc_date,
+#       next_clinical_datetime_hiv, Viral_Load, CD4, VL_Order_Date,
+#       CD4_Order_Date, Pregnancy, and every column listed in `vars_to_impute`
+#       that is not created inside this function (e.g. Age, Gender, BMI).
+#
+# Returns:
+#   An ungrouped data frame holding the derived columns, a `<var>_orig` and a
+#   `<var>_baseline` copy of every imputed variable, the outcome factors
+#   y0..y3 and `Month`.
+#
+# NOTE(review): `sum_ignore_na` is not defined in this part of the file, and
+# na.locf()/rollapplyr() (presumably zoo) and runner() (presumably the runner
+# package) must be attached by the caller - confirm.
+clean_longitudinal_data = function(df){
+  time_varrying.df=df%>%
+    #filter( encounter_type!= 186 & encounter_type!= 158) %>% # Removed in V11
+    mutate(
+
+      # Pregnancy: treat a missing value as 0
+      Pregnancy = if_else(is.na(Pregnancy),0,Pregnancy),
+
+      # Change date time to date
+      Encounter_Date=as.Date(encounter_datetime),
+      VL_Order_Date =as.Date(VL_Order_Date),
+      CD4_Order_Date =as.Date(CD4_Order_Date),
+      #Next_Encounter_Datetime =as.Date(next_clinical_datetime_hiv), # V9B
+      RTC_Date = as.Date(rtc_date),#pmin(as.Date(rtc_date), as.Date(med_pickup_rtc_date), na.rm = TRUE),
+      # Flags return-to-clinic dates on/after a hard-coded cut-off date
+      # NOTE(review): presumably the database closure date of the extract - confirm
+      RTC_GT_DB_Closure = factor(if_else(RTC_Date >= as.Date("2024-04-04"), 'Yes', 'No')),
+
+      Encounter_ID=encounter_id,
+      ART_Adherence = cur_arv_adherence,
+      # Missing disclosure status / program name get an explicit label
+      HIV_disclosure_stage = if_else(is.na(hiv_disclosure_status_value),"Not Done",hiv_disclosure_status_value),
+      Clinic_County=Clinic_County,
+      Clinic_Name =Clinic_Name,
+      Program_Name = if_else(is.na(Program_Name),"Unknown",Program_Name),     
+      # New Vars
+      TB_screening = tb_screen,
+      TB_Test_Result =factor(tb_test_result), 
+      On_TB_TX = on_tb_tx,
+      On_IPT = on_ipt,
+      CA_CX_Screening =if_else(is.na(ca_cx_screening),0,ca_cx_screening),
+      # Missing screening results are replaced by the code 1118
+      # NOTE(review): meaning of 1118 is not visible here - confirm
+      CA_CX_Screening_Result = factor(if_else(is.na(ca_cx_screening_result),1118,ca_cx_screening_result))
+
+
+    )  %>%
+    group_by(person_id, Encounter_Date) %>%
+    filter(row_number()==1)%>%ungroup() %>% # Keeps only the first row per patient and day (collapses multiple encounters per day)
+    group_by(person_id) %>%
+    arrange( person_id, Encounter_Date)%>% # Sort each patient's encounters chronologically (ascending date)
+    mutate(
+
+      # Type of the patient's following encounter; lead()/lag() below are
+      # ordered by Encounter_ID, not by Encounter_Date.
+      # NOTE(review): assumes Encounter_ID increases with encounter date - confirm
+      Next_Encounter_Type = lead(encounter_type, order_by =Encounter_ID), # Added V11
+      Next_Encounter_Datetime =as.Date(next_clinical_datetime_hiv), # Modified V11
+      # Set Null dates to today
+      Next_Encounter_Datetime = if_else( is.na(Next_Encounter_Datetime), as.Date(Sys.time()),Next_Encounter_Datetime ),
+      # Sequential visit counter within the patient
+      Visit_Number = row_number(),
+
+      # Lab Values
+      # VL processing: 0 when Viral_Load >= 1000 or missing, otherwise 1
+      VL_suppression = if_else(Viral_Load >= 1000 | is.na(Viral_Load), 0, 1),
+      Viral_Load_log10 = log10(Viral_Load+1), # to avoid log10(0) = -Inf
+
+      # NOTE _order_Date are highly missing so we estimate potential vl/cd4 order date
+      VL_Order_Date_Estimated = if_else( !is.na(Viral_Load) & is.na(VL_Order_Date),  
+                                         lag(Encounter_Date, order_by =Encounter_ID), # use last Enc as VL_Order_Date
+                                         VL_Order_Date # otherwise use VL_Order_Date 
+      ),
+      VL_Order_Date_Estimated =  na.locf( VL_Order_Date_Estimated, na.rm = FALSE), # LOCF
+
+      # NOTE _order_Date are highly missing so we estimate potential vl/cd4 order date
+      CD4_Order_Date_Estimated = if_else( !is.na(CD4) & is.na(CD4_Order_Date),  
+                                          lag(Encounter_Date, order_by =Encounter_ID), # use last Enc as CD4_Order_Date
+                                          CD4_Order_Date # otherwise use CD4_Order_Date 
+      ),
+      CD4_Order_Date_Estimated =  na.locf(CD4_Order_Date_Estimated, na.rm = FALSE), # LOCF
+
+      # Calculate Days_Since_Last
+      Days_Since_Last_VL = as.numeric(difftime(Encounter_Date, VL_Order_Date_Estimated, units = "days")),
+      Days_Since_Last_CD4 = as.numeric(difftime(Encounter_Date, CD4_Order_Date_Estimated, units = "days")),
+
+       # Calculate Days defaulted: days from the return-to-clinic date to the next encounter date
+      `Days defaulted` =   as.numeric(difftime(Next_Encounter_Datetime , RTC_Date , units = c("days"))),
+
+       # Labeled response variables (one per default threshold in days)
+      `disengagement-1day` = if_else(`Days defaulted` >= 1, "Disengaged",  "Active In Care"),
+      `disengagement-2wks` = if_else(`Days defaulted` >= 14, "Disengaged",  "Active In Care"),
+      `disengagement-1month` =  if_else(`Days defaulted` >= 28, "Disengaged",  "Active In Care"),
+      `disengagement-3month` =  if_else(`Days defaulted` >= 90, "Disengaged",  "Active In Care"),
+      `disengagement-7days` = if_else(`Days defaulted` >= 7, "Disengaged",  "Active In Care"),
+
+      ######################## Modified in V11 ###############################################
+
+       # If Next Encounter type is 186, 153 or 158, set the response variables to NA ==>  Modified in V11
+      `disengagement-1day` = factor(if_else(  Next_Encounter_Type %in% c(186,153,158), NA, `disengagement-1day`)),
+      `disengagement-2wks` = factor(if_else(  Next_Encounter_Type %in% c(186,153,158), NA, `disengagement-2wks`)),
+      `disengagement-1month` = factor(if_else(  Next_Encounter_Type %in% c(186,153,158), NA, `disengagement-1month`)),
+      `disengagement-3month` = factor(if_else(  Next_Encounter_Type %in% c(186,153,158), NA, `disengagement-3month`)),
+      `disengagement-7days` = factor(if_else(  Next_Encounter_Type %in% c(186,153,158), NA, `disengagement-7days`)),
+
+      # Binary version of the response variable (NA outcomes stay NA)
+      `disengagement-1day_bin`  = ifelse(`disengagement-1day` == "Disengaged", TRUE, FALSE),
+      `disengagement-2wks_bin`  = ifelse(`disengagement-2wks` == "Disengaged", TRUE, FALSE),
+      `disengagement-1month_bin`  = ifelse(`disengagement-1month` == "Disengaged", TRUE, FALSE),
+      `disengagement-3month_bin`  = ifelse(`disengagement-3month` == "Disengaged", TRUE, FALSE),
+      `disengagement-7days_bin`  = ifelse(`disengagement-7days` == "Disengaged", TRUE, FALSE),
+
+      # Number of non-clinical encounters in the last 3 visits ==> Added in V11
+      # Implemented as a lagged rolling aggregate (window 3) over "1-day outcome is NA".
+      # NOTE(review): the outcome is also NA when `Days defaulted` is NA
+      # (e.g. missing RTC_Date), so such rows are counted as well - confirm intended
+      num_of_med_pickups_in_last_3visits = as.double(lag(rollapplyr(is.na(`disengagement-1day`), 3, 
+                                                                    sum_ignore_na, partial =T),order_by = Encounter_ID)) %>% replace_na(0),
+
+      # Number of days defaulted in the previous CLINICAL encounters ==> Added V11
+      # is_non_med_pickup_visit: 1 when encounter_type is NOT one of 186/153/158
+      is_non_med_pickup_visit = if_else(encounter_type %nin% c(186,153,158), 1, 0),
+      # cummax(row_number() * flag) is the within-patient row index of the most
+      # recent flagged row so far; lag() shifts it to the previous row and
+      # na_if(., 0) turns "none yet" into NA
+      index_of_last_clinical_visit =  na_if(lag(cummax(row_number() * is_non_med_pickup_visit),order_by = Encounter_ID), 0),
+      Days_defaulted_in_prev_enc = `Days defaulted`[index_of_last_clinical_visit], # Modified in V11
+
+      # Number of missed-visit out of the last three for each endpoint Y ==> V11
+      # The rolling window is widened per row by the pickup count computed above
+      num_2wks_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-2wks_bin`, 3+num_of_med_pickups_in_last_3visits,
+                                                                sum_ignore_na, partial =T),order_by = Encounter_ID)) %>% replace_na(0),
+      num_1day_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-1day_bin`, 3+num_of_med_pickups_in_last_3visits, 
+                                                                sum_ignore_na, partial =T),order_by = Encounter_ID)) %>% replace_na(0),
+      num_7days_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-7days_bin`, 3+num_of_med_pickups_in_last_3visits, 
+                                                                 sum_ignore_na, partial =T),order_by = Encounter_ID)) %>% replace_na(0),
+      num_1month_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-1month_bin`, 3+num_of_med_pickups_in_last_3visits, 
+                                                                  sum_ignore_na, partial =T),order_by = Encounter_ID)) %>% replace_na(0),
+
+      #################################################################################################
+
+      # Have you disengagements (by 1 month) in the last year
+      # head(x, -1) drops the current row, so only earlier encounters in the window count
+      ever_defaulted_by_1m_in_last_1year = as.numeric(runner(
+        x = `disengagement-1month_bin`,
+        k = "1 years",
+        idx = Encounter_Date,
+        f = function(x) {
+          any(head(x,-1))
+        }
+      )),
+
+      # Have you disengagements (by 1 month) in the last 2 years
+      ever_defaulted_by_1m_in_last_2year = as.numeric(runner(
+        x = `disengagement-1month_bin`,
+        k = "2 years",
+        idx = Encounter_Date,
+        f = function(x) {
+          any(head(x,-1))
+        }
+      ))
+
+
+    ) %>%
+    ungroup() 
+
+  ## IMPUTATION (Lagging)
+  vars_to_impute =  c(    'Age',
+                          'Gender' ,  
+                          'BMI',
+                          'WHO_staging',
+                          'VL_suppression', 
+                          'Viral_Load_log10',
+                          'HIV_disclosure',
+                          'Regimen_Line', 
+                          'Pregnancy',
+                          'CD4',
+                          "Clinic_Name", 
+                          'ART_regimen',
+                          'ART_Adherence',
+                          'HIV_disclosure_stage',
+                          'TB_Test_Result',
+                          'On_TB_TX',
+                          'On_IPT',
+                          'CA_CX_Screening_Result'
+
+  )
+  # NOTE(review): the result of this select() is neither assigned nor returned,
+  # so it does not contribute to clean.df
+  time_varrying.df%>% select(vars_to_impute)
+
+  clean.df = time_varrying.df%>% 
+    # Keep an untouched copy of every variable before imputation
+    mutate(across(all_of(vars_to_impute), ~.x, .names = "{.col}_orig")) %>%
+    group_by(person_id) %>%
+    # Add baseline vars: the value at the patient's first visit, NA elsewhere (filled by the LOCF below)
+    mutate(across(all_of(vars_to_impute),  ~ if_else(Visit_Number == 1, ., NA),.names = "{.col}_baseline")) %>% 
+    #LOCF stands for “Last Observation Carried Forward.” 
+    # NOTE(review): base ifelse() drops attributes, so the factor columns in
+    # vars_to_impute (TB_Test_Result, CA_CX_Screening_Result) presumably come
+    # out as their integer codes here - confirm this is intended
+    mutate(across(all_of(c(vars_to_impute, paste0(vars_to_impute, "_baseline"))), ~ifelse(is.na(.), na.locf(., na.rm = FALSE), .))) %>% 
+    ungroup()%>%
+    ## Factor the response variable
+    mutate(
+      y0=as.factor(`disengagement-1day`),
+      y1=as.factor(`disengagement-7days`),
+      y2=as.factor(`disengagement-1month`), # This is the main response that was used to train the models
+      y3=as.factor(`disengagement-3month`),
+      # Calendar month (1-12) of the return-to-clinic date, as a factor
+      Month = as.factor(as.numeric(format(as.Date(RTC_Date), "%m")))
+    )
+
+  return(clean.df)
+}
+
+
+
+# Format a model's AUC on `newdata` together with its confidence interval.
+#
+# Args:
+#   model:   H2O model passed to h2o.performance() and predict().
+#   index:   multiplier applied to the AUC and to both interval bounds before
+#            rounding to two decimals.
+#   y:       name of the outcome column in `newdata`.
+#   newdata: data the model is scored on (presumably an H2OFrame - confirm).
+#
+# Returns:
+#   list(AUC = "<auc> (<LL>-<UL>)", LL = <lower bound>, UL = <upper bound>),
+#   all formatted as strings with two decimals.
+confIntAUC <- function(model, index, y, newdata) {
+  two_decimals <- function(value) format(round(value, 2), nsmall = 2)
+
+  AUC <- two_decimals(h2o.performance(model = model, newdata = newdata)@metrics$AUC * index)
+
+  # Score the data and build the ROC object once; the lower and the upper
+  # bound are both read from the same confidence interval, so predicting and
+  # fitting the ROC a second time is unnecessary.
+  observed <- as.numeric(as.data.frame(newdata[, y])[, y]) - 1
+  predicted <- as.data.frame(predict(model, newdata = newdata))[, 'Yes']
+  ci_bounds <- c(pROC::roc(observed, predicted, ci = TRUE, plot = FALSE)$ci * index)
+
+  # pROC stores the interval as (lower, AUC, upper)
+  LL <- two_decimals(ci_bounds[1])
+  UL <- two_decimals(ci_bounds[3])
+
+  list(AUC = paste0(AUC, ' (', LL, '-', UL, ')'), LL = LL, UL = UL)
+}
+
+
+# Prediction wrapper: converts `newdata` to an H2O frame, scores it with
+# `model`, and returns the third column of the prediction table rounded to
+# whole numbers.
+# NOTE(review): the third column is presumably the probability of the second
+# class level - confirm against the model's prediction frame.
+custom_predict <- function(model, newdata) {
+  predictions <- as.data.frame(h2o.predict(model, as.h2o(newdata)))
+  rounded <- round(predictions[, 3])  # round the probabilities
+  return(rounded)
+}
+
+
+
+
+
+# Collect the per-fold cross-validation metrics of an H2O model.
+#
+# Takes the model's cross_validation_metrics_summary, drops the `mean` and
+# `sd` columns, transposes it so that each row is a fold and each column a
+# metric, converts every value to numeric, and keeps six metrics under
+# presentation names.
+#
+# NOTE: a second function with this same name is defined further down in this
+# file; when the file is sourced top to bottom that later definition replaces
+# this one.
+results_cross_validation <- function(h2o_model) {
+  cv_summary <- as.data.frame(h2o_model@model$cross_validation_metrics_summary)
+  per_fold <- select(cv_summary, -mean, -sd)
+
+  # Transpose: folds become rows, metrics become columns
+  folds_by_metric <- as.data.frame(t(per_fold))
+
+  # Go through character first, then numeric
+  as_text <- mutate_all(folds_by_metric, as.character)
+  as_number <- mutate_all(as_text, as.numeric)
+
+  select(as_number,
+         Accuracy = accuracy,
+         AUC = auc,
+         Precision = precision,
+         Specificity = specificity,
+         Recall = recall,
+         Logloss = logloss)
+}
+
+
+
+# Draw one box plot per metric from a wide table of metric values
+# (one column per metric), each in its own free-scaled facet.
+plot_results <- function(df_results) {
+  # Long format: one row per (Metrics, Values) pair
+  long_results <- gather(df_results, Metrics, Values)
+
+  canvas <- ggplot(long_results, aes(Metrics, Values, fill = Metrics, color = Metrics))
+
+  canvas +
+    geom_boxplot(alpha = 0.3, show.legend = FALSE) +
+    theme(plot.margin = unit(c(1, 1, 1, 1), "cm")) +
+    scale_y_continuous(labels = scales::percent) +
+    facet_wrap(~ Metrics, scales = "free") +
+    labs(title = "Model Performance by Some Criteria Selected", y = NULL)
+}
+
+# AUC on held-out data for the i-th model of an H2O AutoML leaderboard.
+#
+# Args:
+#   i:         row of the leaderboard whose model is evaluated.
+#   automl:    AutoML result holding the leaderboard. Defaults to the object
+#              `autoML` found in the calling environment chain, which is what
+#              this function previously read implicitly.
+#   test_data: data to evaluate on. Defaults to the object `test`, likewise
+#              previously read implicitly.
+#
+# Returns:
+#   One-row data.frame with columns AUC_Test and model_id.
+getAUC_onTestData <- function(i, automl = autoML, test_data = test) {
+
+  # Extract i-th model:
+  best_ith <- h2o.getModel(automl@leaderboard[i, 1])
+
+  # Model performance by ith model by AUC on Test data:
+  metrics_ith <- h2o.performance(model = best_ith, newdata = test_data)
+
+  # Return output:
+  data.frame(AUC_Test = metrics_ith@metrics$AUC, model_id = best_ith@model_id)
+
+}
+
+# Per-fold cross-validation metrics of an H2O model, one row per fold.
+#
+# The summary table is sorted by its `mean` column (descending), the `mean`
+# and `sd` columns are dropped, the table is transposed, all values are
+# converted to numeric, and six metrics are kept (recall is reported as
+# Sensitivity).
+#
+# NOTE: this definition shares its name with an earlier function in this file
+# and replaces it when the file is sourced top to bottom.
+results_cross_validation <- function(h2o_model) {
+  summary_tbl <- h2o_model@model$cross_validation_metrics_summary %>%
+    as.data.frame() %>%
+    arrange(desc(mean))
+
+  fold_tbl <- summary_tbl %>%
+    select(-mean, -sd) %>%
+    t() %>%
+    as.data.frame()
+
+  numeric_tbl <- fold_tbl %>%
+    mutate_all(as.character) %>%
+    mutate_all(as.numeric)
+
+  numeric_tbl %>%
+    select(Accuracy = accuracy,
+           AUC = auc,
+           Precision = precision,
+           Specificity = specificity,
+           Sensitivity = recall,
+           Logloss = logloss)
+}