From 29ca97ccac328853824c7fe1a795a161244471f2 Mon Sep 17 00:00:00 2001 From: gpitt71 <93520106+gpitt71@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:23:09 +0100 Subject: [PATCH] Updated Vignettes --- README.md | 2 +- vignettes/Manuscript_replication_material.Rmd | 1319 +++++++++-------- vignettes/cas_call.Rmd | 35 +- vignettes/hp_tuning.Rmd | 265 ++-- vignettes/simulate_individual_data.Rmd | 281 ++-- vignettes/variables_importance.Rmd | 209 +-- 6 files changed, 1165 insertions(+), 946 deletions(-) diff --git a/README.md b/README.md index 51599a7..caf61fb 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ We then suggest to refresh the R session and to import the `ReSurv` package in ` ``` library(ReSurv) -reticulate::use_virtualenv('pyresurv') +reticulate::use_virtualenv("pyresurv") ``` #### Managing Multiple Package Dependencies diff --git a/vignettes/Manuscript_replication_material.Rmd b/vignettes/Manuscript_replication_material.Rmd index f2ddb57..ebc639b 100644 --- a/vignettes/Manuscript_replication_material.Rmd +++ b/vignettes/Manuscript_replication_material.Rmd @@ -1,10 +1,10 @@ --- -title: "A machine learning approach based on survival analysis for IBNR frequencies in non-life reserving" +title: "A Machine Learning Approach Based On Survival Analysis For IBNR Frequencies In Non-Life Reserving" author: "Gabriele Pittarello, Emil Hofman" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{A machine learning approach based on survival analysis for IBNR frequencies in non-life reserving} + %\VignetteIndexEntry{A Machine Learning Approach Based On Survival Analysis For IBNR Frequencies In Non-Life Reserving} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -17,9 +17,14 @@ We remark that the real data that we used to obtain the results that we included With reference to the code related to the simulation case study, we wanted to make this vignette quickly reproducible and drop extensive 
computations. The full code that we implemented can be found at the GitHub repository [resurv-replication-code](https://github.com/gpitt71/resurv-replication-code). +For optimising the hyperparameters included in this vignette, we used the procedure shown in the [Hyperparameters Tuning Vignette](https://github.com/gpitt71/ReSurv/blob/main/vignettes/hp_tuning.Rmd). + +## Load the package + ```{r eval=FALSE, echo=TRUE} library(ReSurv) +reticulate::use_virtualenv("pyresurv") library(data.table) library(ggplot2) @@ -28,329 +33,333 @@ library(ggplot2) # Figure 1 +We first simulate and pre-process some data from Scenario Delta. + ```{r eval=FALSE, echo=TRUE} input_data <- data_generator(random_seed = 7, - scenario=3, - time_unit = 1/360, + scenario = 3, + time_unit = 1 / 360, years = 4, period_exposure = 200) individual_data <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) - -hparameters = list(params=list(booster="gbtree", - eta=0.2234094, - subsample=0.8916594, - alpha=12.44775, - lambda=5.714286, - min_child_weight=4.211996, - max_depth = 2), - print_every_n = 0, - nrounds=3000, - verbose=F, - early_stopping_rounds = 500) + id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = FALSE) + ``` -```{r eval=FALSE, echo=TRUE} +We fit an XGB model with optimal hyperparameters. 
-resurv.fit <- ReSurv(individual_data, +```{r eval=FALSE, echo=TRUE} +hparameters <- list(params = list(booster = "gbtree", + eta = 0.2234094, + subsample = 0.8916594, + alpha = 12.44775, + lambda = 5.714286, + min_child_weight = 4.211996, + max_depth = 2), + print_every_n = 0, + nrounds = 3000, + verbose = FALSE, + early_stopping_rounds = 500) + + +resurv_fit <- ReSurv(individual_data, hazard_model = "XGB", hparameters = hparameters) +``` +We predict for different time granularities. + +```{r eval=FALSE, echo=TRUE} -resurv.fit.predict.Q <- predict(resurv.fit, +resurv_fit_predict_q <- predict(resurv_fit, grouping_method = "probability") -individual_dataY <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "years", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) - -resurv.fit.predict.Y <- predict(resurv.fit, - newdata=individual_dataY, +individual_data_y <- IndividualDataPP(input_data, + id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "years", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = FALSE) + +resurv_fit_predict_y <- predict(resurv_fit, + newdata = individual_data_y, grouping_method = "probability") -individual_dataM <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "months", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) - -resurv.fit.predict.M <- predict(resurv.fit, - newdata=individual_dataM, +individual_data_m <- IndividualDataPP(input_data, + 
id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "months", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = FALSE) + +resurv_fit_predict_m <- predict(resurv_fit, + newdata = individual_data_m, grouping_method = "probability") -dtb_2_plot_M <- resurv.fit.predict.M$hazard_frame_output -dtb_2_plot_M=dtb_2_plot_M %>% - mutate(DP_o=48-DP_rev_o+1) +``` + + +We also plot the Chain-Ladder (CL) development factors for Monthly and Quarterly granularities -dtb_2_plot_Q <- resurv.fit.predict.Q$hazard_frame_output +## Chain-Ladder (Monthly) - Figure 1a + +```{r eval=FALSE, echo=TRUE} +ticks_at <- seq(1, 48, 4) +labels_as <- as.character(ticks_at) -dtb_2_plot_Q=dtb_2_plot_Q %>% - mutate(DP_o=16-DP_rev_o+1) -dtb_2_plot_Y <- resurv.fit.predict.Y$hazard_frame_output +cl_months <- individual_data_m$training.data %>% + mutate(DP_o = 48 - DP_rev_o + 1) %>% + group_by(AP_o, DP_o) %>% + summarize(I = sum(I), .groups = "drop") %>% + group_by(AP_o) %>% + arrange(DP_o) %>% + mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>% + ungroup() %>% + group_by(DP_o) %>% + reframe(df_o = sum(I_cum * ( + AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1 + )) / + sum(I_cum_lag * ( + AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + 1 + )), + I = sum(I * ( + AP_o <= max(individual_data_m$training.data$AP_o) - DP_o + ))) %>% + mutate(DP_o_join = DP_o - 1) %>% + as.data.frame() + + +cl_months %>% + filter(DP_o > 1) %>% + ggplot(aes(x = DP_o, + y = df_o)) + + geom_line(linewidth = 2.5, color = "#454555") + + labs(title = "Chain ladder", + x = "Development month", + y = "Development factor") + + ylim(1, 2.5 + .01) + + scale_x_continuous(breaks = ticks_at, + labels = labels_as) + + theme_bw(base_size = rel(5)) + + theme(plot.title = element_text(size = 20)) 
-dtb_2_plot_Y=dtb_2_plot_Y %>% - mutate(DP_o=4-DP_rev_o+1) ``` -## Figure 1b +## Chain-Ladder (Quarterly) - Figure 1b ```{r eval=FALSE, echo=TRUE} -CL = resurv.fit$IndividualDataPP$training.data %>% - mutate(DP_o = max(resurv.fit.predict.Q$hazard_frame_output$DP_rev_o)-DP_rev_o + 1) %>% +cl_years <- resurv_fit$IndividualDataPP$training.data %>% + mutate(DP_o = 16 - DP_rev_o + 1) %>% group_by(AP_o, DP_o) %>% - summarize(I=sum(I), .groups="drop") %>% + summarize(I = sum(I), .groups = "drop") %>% group_by(AP_o) %>% arrange(DP_o) %>% - mutate(I_cum = cumsum(I), - I_cum_lag = lag(I_cum, default=0)) %>% + mutate(I_cum = cumsum(I), I_cum_lag = lag(I_cum, default = 0)) %>% ungroup() %>% group_by(DP_o) %>% - reframe(df_o = sum(I_cum*(AP_o<=max(resurv.fit$IndividualDataPP$training.data$AP_o)-DP_o+1)) / - sum(I_cum_lag*(AP_o<=max(resurv.fit$IndividualDataPP$training.data$AP_o)-DP_o+1)), - I=sum(I*(AP_o<=max(resurv.fit$IndividualDataPP$training.data$AP_o)-DP_o))) %>% - mutate(DP_o_join = DP_o-1) %>%as.data.frame() - -# Figure 1a ---- - -CL %>% - filter(DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="#454555") + - labs(title="Chain ladder", + reframe(df_o = sum(I_cum * ( + AP_o <= max(resurv_fit$IndividualDataPP$training.data$AP_o) - DP_o + 1 + )) / + sum(I_cum_lag * ( + AP_o <= max(resurv_fit$IndividualDataPP$training.data$AP_o) - DP_o + 1 + )), + I = sum(I * ( + AP_o <= max(resurv_fit$IndividualDataPP$training.data$AP_o) - DP_o + ))) %>% + mutate(DP_o_join = DP_o - 1) %>% + as.data.frame() + +cl_years %>% + filter(DP_o > 1) %>% + ggplot(aes(x = DP_o, y = df_o)) + + geom_line(linewidth = 2.5, color = "#454555") + + labs(title = "Chain ladder", x = "Development quarter", y = "Development factor") + - ylim(1,max(dtb_2_plot_Q$df_o)+.01)+ - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) + ylim(1, 4 + .01) + + theme_bw(base_size = rel(5)) + + theme(plot.title = element_text(size = 20)) + +ticks_at <- seq(1, 16, by = 2) 
+labels_as <- as.character(ticks_at) -ticks.at <- seq(1,16,by=2) -labels.as <- as.character(ticks.at) ``` + + +For different granularities and feature combinations we plot the XGB feature dependent development factors. ` -## Figure 1f +## XGB (Quarterly) - Figure 1f ```{r eval=FALSE, echo=TRUE} -# Figure 1c ---- - -ap=15 -ct=1 -dtb_2_plot_Q %>% - filter(claim_type==ct, - AP_o==ap, - DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="royalblue") + - ylim(1,max(dtb_2_plot_Q$df_o)+.01)+ - labs(title=paste("XGB: Accident Quarter", ap, "Claim Type", ct), - x = "Development quarter", - y = "Development factor") + - scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) + +ap <- 15 +ct <- 1 +resurv_fit_predict_q$long_triangle_format_out$output_granularity %>% + filter(AP_o == 15 & claim_type == 1) %>% + filter(row_number() == 1) %>% + select(group_o) + + +plot( + resurv_fit_predict_q, + granularity = "output", + title_par = "XGB: Accident Quarter 15 Claim Type 1", + x_text_par = "Development Quarter", + group_code = 30 +) ``` -## Figure 1g +## XGB (Quarterly) - Figure 1g ```{r eval=FALSE, echo=TRUE} -# Figure 1d ---- - -ap=12 -ct=0 -dtb_2_plot_Q %>% - filter(claim_type==ct, - AP_o==ap, - DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="royalblue") + - ylim(1,max(dtb_2_plot_Q$df_o)+.01)+ - labs(title=paste("XGB: Accident Quarter", ap, "Claim Type", ct), - x = "Development quarter", - y = "Development factor") + - scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) +ct <- 0 +ap <- 15 + +resurv_fit_predict_q$long_triangle_format_out$output_granularity %>% + filter(AP_o == ap & claim_type == ct) %>% + filter(row_number() == 1) %>% + select(group_o) + +plot( + resurv_fit_predict_q, + granularity = "output", + title_par = "XGB: Accident Quarter 15 Claim Type 
0", + x_text_par = "Development Quarter", + ylim_par = 4, + group_code = 29 +) ``` -## Figure 1h +## XGB (Quarterly) - Figure 1h ```{r eval=FALSE, echo=TRUE} -ap=16 -ct=0 -dtb_2_plot_Q %>% - filter(claim_type==ct, - AP_o==ap, - DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="royalblue") + - ylim(1,max(dtb_2_plot_Q$df_o)+.01)+ - labs(title=paste("XGB: Accident Quarter", ap, "Claim Type", ct), - x = "Development quarter", - y = "Development factor") + - scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) -ggsave("C:\\Users\\gpitt\\Pictures\\ReSurv\\XGBdfoquarters2c.eps", - width = 5.5, - height= 5, - device=cairo_ps) +ct <- 0 +ap <- 16 + +resurv_fit_predict_q$long_triangle_format_out$output_granularity %>% + filter(AP_o == ap & claim_type == ct) %>% + filter(row_number() == 1) %>% + select(group_o) +plot( + resurv_fit_predict_q, + granularity = "output", + title_par = "XGB: Accident Quarter 16 Claim Type 0", + x_text_par = "Development Quarter", + ylim_par = 4, + group_code = 31 +) ``` -## Figure 1a -```{r eval=FALSE, echo=TRUE} -ticks.at <- seq(1,48,4) -labels.as <- as.character(ticks.at) +## XGB (Monthly) - Figure 1c +```{r eval=FALSE, echo=TRUE} -CL_months = individual_dataM$training.data %>% - mutate(DP_o = max(resurv.fit.predict.M$hazard_frame_output$DP_rev_o)-DP_rev_o + 1) %>% - group_by(AP_o, DP_o) %>% - summarize(I=sum(I), .groups="drop") %>% - group_by(AP_o) %>% - arrange(DP_o) %>% - mutate(I_cum = cumsum(I), - I_cum_lag = lag(I_cum, default=0)) %>% - ungroup() %>% - group_by(DP_o) %>% - reframe(df_o = sum(I_cum*(AP_o<=max(individual_dataM$training.data$AP_o)-DP_o+1)) / - sum(I_cum_lag*(AP_o<=max(individual_dataM$training.data$AP_o)-DP_o+1)), - I=sum(I*(AP_o<=max(individual_dataM$training.data$AP_o)-DP_o))) %>% - mutate(DP_o_join = DP_o-1) %>%as.data.frame() - - -CL_months %>% - filter(DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - 
geom_line(linewidth=2.5,color="#454555") + - labs(title="Chain ladder", - x = "Development month", - y = "Development factor") + - ylim(1, 2.5+.01)+ - scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) +ct <- 1 +ap <- 7 +resurv_fit_predict_m$long_triangle_format_out$output_granularity %>% + filter(AP_o == ap & claim_type == ct) %>% + filter(row_number() == 1) %>% + select(group_o) + +plot( + resurv_fit_predict_m, + granularity = "output", + title_par = "XGB: Accident Month 7 Claim Type 1", + x_text_par = "Development Month", + color_par = "#a71429", + ylim_par = 10, + group_code = 14 +) ``` -## Figure 1c +## XGB (Monthly) - Figure 1d + ```{r eval=FALSE, echo=TRUE} -ct=1 -ap=7 -dtb_2_plot_M %>% - filter(claim_type==ct, - AP_o==ap, - DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="#a71429") + - ylim(1,max(dtb_2_plot_M$df_o)+.01)+ - labs(title=paste("XGB: Accident Month", ap, "Claim Type", ct), - x = "Development month", - y = "Development factor") + - scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) +ct <- 0 +ap <- 9 + +resurv_fit_predict_m$long_triangle_format_out$output_granularity %>% + filter(AP_o == ap & claim_type == ct) %>% + filter(row_number() == 1) %>% + select(group_o) + + +plot( + resurv_fit_predict_m, + granularity = "output", + title_par = "XGB: Accident Month 9 Claim Type 0", + x_text_par = "Development Month", + color_par = "#a71429", + ylim_par = 2.5, + group_code = 17 +) ``` -## Figure 1d +## XGB (Monthly) - Figure 1e + ```{r eval=FALSE, echo=TRUE} -ct=0 -ap=9 -dtb_2_plot_M %>% - filter(claim_type==ct, - AP_o==ap, - DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="#a71429") + - ylim(1,2.5+.01)+ - labs(title=paste("XGB: Accident Month", ap, "Claim Type", ct), - x = "Development month", - y = "Development factor") + - 
scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) -``` +ct <- 0 +ap <- 36 -## Figure 1e +resurv_fit_predict_m$long_triangle_format_out$output_granularity %>% + filter(AP_o == ap & claim_type == ct) %>% + filter(row_number() == 1) %>% + select(group_o) -```{r eval=FALSE, echo=TRUE} -ct=0 -ap=36 - -dtb_2_plot_M %>% - filter(claim_type==ct, - AP_o==ap, - DP_o>1) %>% - ggplot(aes(x=DP_o, - y=df_o))+ - geom_line(linewidth=2.5,color="#a71429") + - ylim(1,2.5+.01)+ - labs(title=paste("XGB: Accident Month", ap, "Claim Type", ct), - x = "Development month", - y = "Development factor") + - scale_x_continuous(breaks = ticks.at, - labels = labels.as) + - theme_bw(base_size=rel(5))+ - theme(plot.title = element_text(size=20)) +plot( + resurv_fit_predict_m, + granularity = "output", + title_par = "XGB: Accident Month 36 Claim Type 0", + color_par = "#a71429", + x_text_par = "Development Month", + ylim_par = 2.5, + group_code = 71 +) ``` @@ -360,169 +369,217 @@ dtb_2_plot_M %>% The models likelihood can be extracted using the following commands. Table 2 shows the average negative log-likelihood for each scenario over the different simulations, here we show the results for XGB, scenario Delta and simulation number 7. ```{r eval=FALSE, echo=TRUE} -resurv.fit$is_lkh -# $metric -# [1] "log-partial likelihood" -# -# $value -# [1] 8.598515 - -resurv.fit$os_lkh -# $metric -# [1] "log-partial likelihood" -# -# $value -# [1] 7.220672 +resurv_fit$is_lkh + +resurv_fit$os_lkh + ``` -# Figure 4 and Figure 5 +# Fitted Survival Curve - Figure 4 and Figure 5 + +We show how to plot the fitted survival function for scenario Alpha and scenario Delta against the true survival function. ## Figure 4 +We find the true Survival Function for scenario Alpha, claim type 1. 
+ ```{r eval=FALSE, echo=TRUE} -beta=2*30 -lambda=0.1 #1 -k=1 -b=1440 -alpha=0.5 -beta0 = 1.15129 -beta1 = 1.95601 - -F_correct_s0 <- function(t, alpha, beta, lambda, k,b,beta_coef){ - #nu/xi*(t/xi)^(nu-1) - #exp(-(t/xi)^nu)/(1-exp(-(t/xi)^nu)* - #nu/xi*(t/xi)^(nu-1)*exp(beta1) - exp(-beta^alpha *(lambda*exp(beta_coef)^(1/(alpha*k)) )^(alpha*k)*(t^(-alpha*k)-b^(-alpha*k)) ) -} +beta <- 2 * 30 +lambda <- 0.1 +k <- 1 +b <- 1440 +alpha <- 0.5 +beta0 <- 1.15129 +beta1 <- 1.95601 -c_correct_grouped<-c() -for (i in 0:(b-1)){ - t<-seq(i,i+1, by=0.001) - n_t<-length(t) - c_correct_grouped[i+1] <- sum(1-F_correct_s0(t, alpha,beta,lambda,k,b, beta0) )/n_t -} -c_correct_grouped<-c(1,c_correct_grouped[1:(b-1)]) +f_correct_s0 <- function(t, alpha, beta, lambda, k, b, beta_coef) { -c_correct_grouped1<-c() -for (i in 0:(b-1)){ - t<-seq(i,i+1, by=0.001) - n_t<-length(t) - c_correct_grouped1[i+1] <- sum(1-F_correct_s0(t, alpha,beta,lambda,k,b, beta1) )/n_t + inner <- lambda * exp(beta_coef) ^ (1 / (alpha * k)) + element_one <- -beta ^ alpha * (inner) ^ (alpha * k) + element_two <- (t ^ (-alpha * k) - b ^ (-alpha * k)) + exp(element_one * element_two) } -c_correct_grouped1<-c(1,c_correct_grouped1[1:(b-1)]) -true_curve <- data.table('DP_rev_i'=(b-1)-seq(0,(b-1),by=1), - 'S_i'=1-(c_correct_grouped1), - 'model_label'='TRUE') +c_correct_grouped <- c() +for (i in 0:(b - 1)) { + t <- seq(i, i + 1, by = 0.001) + n_t <- length(t) + calculation <- f_correct_s0(t, alpha, beta, lambda, k, b, beta0) + c_correct_grouped[i + 1] <- sum(1 - calculation) / + n_t +} +c_correct_grouped <- c(1, c_correct_grouped[1:(b - 1)]) + +c_correct_grouped1 <- c() +for (i in 0:(b - 1)) { + t <- seq(i, i + 1, by = 0.001) + n_t <- length(t) + calculation <- f_correct_s0(t, alpha, beta, lambda, k, b, beta1) + c_correct_grouped1[i + 1] <- sum(1 - calculation) / + n_t +} +c_correct_grouped1 <- c(1, c_correct_grouped1[1:(b - 1)]) +true_curve <- data.table( + "DP_rev_i" = (b - 1) - seq(0, (b - 1), by = 1), + "S_i" = 1 - 
(c_correct_grouped1), + "model_label" = "TRUE" +) ``` + +We generate the data for Scenario Alpha. + ```{r eval=FALSE, echo=TRUE} -input_data <- data_generator(random_seed = 1, - scenario=0, - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data <- data_generator( + random_seed = 1, + scenario = 0, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) + + +individual_data <- IndividualDataPP( + input_data, + id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = FALSE +) + +``` +Here the optimal hyperparameters for XGB and NN for the generated data set. -individual_data <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) - - -hparameters.xgb.01 = list(params=list(booster="gbtree", - eta=0.9887265, - subsample=0.7924135, - alpha=10.85342, - lambda=6.213317, - min_child_weight=3.042204, - max_depth = 1), - print_every_n = 0, - nrounds=3000, - verbose=F, - early_stopping_rounds = 500) - - -hparameters.nn.01 = list(num_layers=2, - early_stopping = TRUE, - patience = 350, - verbose=F, - network_structure = NULL, - num_nodes = 10, - activation = "SELU", - optim= "SGD", - lr = 0.2741031, - xi= 0.3829451, - epsilon = 0, - batch_size=as.integer(5000), - epochs=as.integer(5500), - num_workers=0, - tie='Efron') +```{r eval=FALSE, echo=TRUE} + +hparameters_xgb_01 <- list( + params = list( + booster = "gbtree", + eta = 0.9887265, + subsample = 0.7924135, + alpha = 10.85342, + lambda = 6.213317, + min_child_weight = 3.042204, + max_depth = 1 + 
), + print_every_n = 0, + nrounds = 3000, + verbose = FALSE, + early_stopping_rounds = 500 +) + + +hparameters_nn_01 <- list( + num_layers = 2, + early_stopping = TRUE, + patience = 350, + verbose = FALSE, + network_structure = NULL, + num_nodes = 10, + activation = "SELU", + optim = "SGD", + lr = 0.2741031, + xi = 0.3829451, + epsilon = 0, + batch_size = as.integer(5000), + epochs = as.integer(5500), + num_workers = 0, + tie = "Efron" +) ``` +We fit COX, NN and XGB on the simulated data. + ```{r eval=FALSE, echo=TRUE} -resurv.fit.cox.01 <- ReSurv(individual_data, +resurv_fit_cox_01 <- ReSurv(individual_data, hazard_model = "COX") -resurv.fit.nn.01 <- ReSurv(individual_data, +resurv_fit_nn_01 <- ReSurv(individual_data, hazard_model = "NN", - hparameters = hparameters.nn.01) + hparameters = hparameters_nn_01) -resurv.fit.xgb.01 <- ReSurv(individual_data, +resurv_fit_xgb_01 <- ReSurv(individual_data, hazard_model = "XGB", - hparameters = hparameters.xgb.01) + hparameters = hparameters_xgb_01) ``` +Extract the fitted survival curve. 
+ ```{r eval=FALSE, echo=TRUE} -hazard_frame_updated_cox <- resurv.fit.cox.01$hazard_frame +hazard_frame_updated_cox <- resurv_fit_cox_01$hazard_frame + +hazard_frame_updated_nn <- resurv_fit_nn_01$hazard_frame + +hazard_frame_updated_xgb <- resurv_fit_xgb_01$hazard_frame + -hazard_frame_updated_nn <- resurv.fit.nn.01$hazard_frame +cond_1 <- hazard_frame_updated_cox$AP_i == 13 & + hazard_frame_updated_cox$claim_type == 1 +estimated_cox <- hazard_frame_updated_cox[cond_1, c("S_i", "DP_rev_i")] +estimated_cox <- as.data.table(estimated_cox)[, model_label := "COX"] -hazard_frame_updated_xgb <- resurv.fit.xgb.01$hazard_frame +cond_2 <- hazard_frame_updated_nn$AP_i == 13 & + hazard_frame_updated_nn$claim_type == 1 +estimated_nn <- hazard_frame_updated_nn[cond_2, c("S_i", "DP_rev_i")] +estimated_nn <- as.data.table(estimated_nn)[, model_label := "NN"] -estimated_cox <- hazard_frame_updated_cox[hazard_frame_updated_cox$AP_i==13& hazard_frame_updated_cox$claim_type==1,c("S_i", "DP_rev_i")] -estimated_cox<- as.data.table(estimated_cox)[,model_label:='COX'] -estimated_nn <- hazard_frame_updated_nn[hazard_frame_updated_nn$AP_i==13& hazard_frame_updated_nn$claim_type==1,c("S_i", "DP_rev_i")] -estimated_nn <- as.data.table(estimated_nn)[,model_label:='NN'] -estimated_xgb <- hazard_frame_updated_xgb[hazard_frame_updated_xgb$AP_i==13& hazard_frame_updated_xgb$claim_type==1,c("S_i", "DP_rev_i")] -estimated_xgb <- as.data.table(estimated_xgb)[,model_label:='XGB'] +cond_3 <- hazard_frame_updated_xgb$AP_i == 13 & + hazard_frame_updated_xgb$claim_type == 1 +estimated_xgb <- hazard_frame_updated_xgb[cond_3, c("S_i", "DP_rev_i")] +estimated_xgb <- as.data.table(estimated_xgb)[, model_label := "XGB"] -dt <- rbind(estimated_cox,estimated_nn,estimated_xgb) +dt <- rbind(estimated_cox, estimated_nn, estimated_xgb) ``` +Plot the fitted survival curve for the three models. 
+ ```{r eval=FALSE, echo=TRUE} -ggplot(data=dt,aes(x=DP_rev_i,y=S_i, color=model_label))+ - geom_line(linewidth=1)+ - facet_grid(~model_label)+ - annotate(geom='line', x=true_curve$DP_rev_i,y=true_curve$S_i, lty=2, linewidth=1)+ - scale_x_continuous(expand = c(0, 0), - breaks=c(0,440,940), - labels = c("1440","1000","500")) + - scale_y_continuous(expand=c(0, .001))+ - xlab(latex2exp::TeX("Development time"))+ - ylab("Survival function")+ +ggplot(data = dt, aes(x = DP_rev_i, y = S_i, color = model_label)) + + geom_line(linewidth = 1) + + facet_grid(~ model_label) + + annotate( + geom = "line", + x = true_curve$DP_rev_i, + y = true_curve$S_i, + lty = 2, + linewidth = 1 + ) + + scale_x_continuous( + expand = c(0, 0), + breaks = c(0, 440, 940), + labels = c("1440", "1000", "500") + ) + + scale_y_continuous(expand = c(0, .001)) + + xlab("Development time") + + ylab("Survival function") + scale_color_manual(name = "Model", - values = c("#AAAABC", "#a71429", "#4169E1"))+ - theme_bw()+ - theme(legend.position="none", - text = element_text(size = 20), - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + values = c("#AAAABC", "#a71429", "#4169E1")) + + theme_bw() + + theme( + legend.position = "none", + text = element_text(size = 20), + axis.text.x = element_text( + angle = 90, + vjust = 0.5, + hjust = 1 + ) + ) ``` @@ -530,363 +587,447 @@ ggplot(data=dt,aes(x=DP_rev_i,y=S_i, color=model_label))+ ## Figure 5 -```{r eval=FALSE, echo=TRUE} +We find the true Survival Function for scenario Delta, accident day 691 and claim type 1. -my_ap=691 -period_function <-function(x){ +```{r eval=FALSE, echo=TRUE} +my_ap <- 691 +period_function <- function(x) { " Add monthly seasonal effect starting from daily input. 
" - - tmp <- floor((x-1)/30) - - if((tmp%%12) %in% (c(2,3,4))){ + + tmp <- floor((x - 1) / 30) + + if ((tmp %% 12) %in% (c(2, 3, 4))) { return(-0.3) } - if((tmp%%12) %in% (c(5,6,7))){ + if ((tmp %% 12) %in% (c(5, 6, 7))) { return(0.4) } - if((tmp%%12) %in% (c(8,9,10))){ + if ((tmp %% 12) %in% (c(8, 9, 10))) { return(-0.7) } - if((tmp%%12) %in% (c(11,0,1))){ #0 instead of 12 + if ((tmp %% 12) %in% (c(11, 0, 1))) { + #0 instead of 12 return(0.1) } } -beta=2*30 -lambda=0.1 #1 -k=1 -b=1440 -alpha=0.5 -beta0 = 1.15129 -beta1 = 1.95601 +period_function(my_ap) - -F_correct_s0 <- function(t, alpha, beta, lambda, k,b,beta_coef){ - #nu/xi*(t/xi)^(nu-1) - #exp(-(t/xi)^nu)/(1-exp(-(t/xi)^nu)* - #nu/xi*(t/xi)^(nu-1)*exp(beta1) - exp(-beta^alpha *(lambda*exp(beta_coef)^(1/(alpha*k)) )^(alpha*k)*(t^(-alpha*k)-b^(-alpha*k)) ) +beta <- 2 * 30 +lambda <- 0.1 +k <- 1 +b <- 1440 +alpha <- 0.5 +beta0 <- 1.15129 +beta1 <- 1.95601 + period_function(my_ap) + +f_correct_s0 <- function(t, alpha, beta, lambda, k, b, beta_coef) { + element_1 <- -beta ^ alpha + element_2 <- lambda * exp(beta_coef) ^ (1 / (alpha * k)) + element_3 <- t ^ (-alpha * k) - b ^ (-alpha * k) + exp(element_1 * (element_2) ^ (alpha * k) * element_3) } -c_correct_grouped<-c() -for (i in 0:(b-1)){ - t<-seq(i,i+1, by=0.001) - n_t<-length(t) - c_correct_grouped[i+1] <- sum(1-F_correct_s0(t, alpha,beta,lambda,k,b, beta0) )/n_t +c_correct_grouped <- c() +for (i in 0:(b - 1)) { + t <- seq(i, i + 1, by = 0.001) + n_t <- length(t) + calculation <- f_correct_s0(t, alpha, beta, lambda, k, b, beta0) + c_correct_grouped[i + 1] <- sum(1 - calculation) / + n_t } -c_correct_grouped<-c(1,c_correct_grouped[1:(b-1)]) - -c_correct_grouped1<-c() -for (i in 0:(b-1)){ - t<-seq(i,i+1, by=0.001) - n_t<-length(t) - c_correct_grouped1[i+1] <- sum(1-F_correct_s0(t, alpha,beta,lambda,k,b, beta1) )/n_t +c_correct_grouped <- c(1, c_correct_grouped[1:(b - 1)]) + +c_correct_grouped1 <- c() +for (i in 0:(b - 1)) { + t <- seq(i, i + 1, by = 0.001) + n_t <- 
length(t) + calculation <- f_correct_s0(t, alpha, beta, lambda, k, b, beta1) + c_correct_grouped1[i + 1] <- sum(1 - calculation) / n_t } -c_correct_grouped1<-c(1,c_correct_grouped1[1:(b-1)]) - -true_curve <- data.table('DP_rev_i'=(b-1)-seq(0,(b-1),by=1), - 'S_i'=1-(c_correct_grouped1), - 'model_label'='TRUE') - - - - +c_correct_grouped1 <- c(1, c_correct_grouped1[1:(b - 1)]) +true_curve <- data.table( + "DP_rev_i" = (b - 1) - seq(0, (b - 1), by = 1), + "S_i" = 1 - (c_correct_grouped1), + "model_label" = "TRUE" +) ``` +We generate a data set from scenario Delta. ```{r eval=FALSE, echo=TRUE} -input_data <- data_generator(random_seed = 1, - scenario=3, - time_unit = 1/360, - years = 4, - period_exposure = 200) - - -individual_data <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) +input_data <- data_generator( + random_seed = 1, + scenario = 3, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) + + +individual_data <- IndividualDataPP( + input_data, + id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = FALSE +) ``` +Here are the optimal hyperparameters for the data set above for XGB and NN. 
+ ```{r eval=FALSE, echo=TRUE} -hparameters.xgb.31=list(params=list(booster="gbtree", - eta=0.1801517, - subsample=0.8768306, - alpha=0.6620562, - lambda=1.379897, - min_child_weight=15.61339, - max_depth = 2), - print_every_n = 0, - nrounds=3000, - verbose=F, - early_stopping_rounds = 500) - -hparameters.nn.31 = list(num_layers=2, - early_stopping = TRUE, - patience = 350, - verbose=F, - network_structure = NULL, - num_nodes = 2, - activation = "LeakyReLU", - optim= "Adam", - lr = 0.3542422, - xi= 0.1803953, - epsilon = 0, - batch_size=as.integer(5000), - epochs=as.integer(5500), - num_workers=0, - tie='Efron') +hparameters_xgb_31 <- list( + params = list( + booster = "gbtree", + eta = 0.1801517, + subsample = 0.8768306, + alpha = 0.6620562, + lambda = 1.379897, + min_child_weight = 15.61339, + max_depth = 2 + ), + print_every_n = 0, + nrounds = 3000, + verbose = FALSE, + early_stopping_rounds = 500 +) + +hparameters_nn_31 <- list( + num_layers = 2, + early_stopping = TRUE, + patience = 350, + verbose = FALSE, + network_structure = NULL, + num_nodes = 2, + activation = "LeakyReLU", + optim = "Adam", + lr = 0.3542422, + xi = 0.1803953, + epsilon = 0, + batch_size = as.integer(5000), + epochs = as.integer(5500), + num_workers = 0, + tie = "Efron" +) ``` +We fit COX, NN and XGB. + ```{r eval=FALSE, echo=TRUE} -resurv.fit.cox.31 <- ReSurv(individual_data, - hazard_model = "COX") +resurv_fit_cox_31 <- ReSurv(individual_data, hazard_model = "COX") -resurv.fit.nn.31 <- ReSurv(individual_data, - hazard_model = "NN", - hparameters = hparameters.nn.31) +resurv_fit_nn_31 <- ReSurv(individual_data, + hazard_model = "NN", + hparameters = hparameters_nn_31) -resurv.fit.xgb.31 <- ReSurv(individual_data, - hazard_model = "XGB", - hparameters = hparameters.xgb.31) +resurv_fit_xgb_31 <- ReSurv(individual_data, + hazard_model = "XGB", + hparameters = hparameters_xgb_31) ``` +Extract the fitted survival curve. 
+ ```{r eval=FALSE, echo=TRUE} -hazard_frame_updated_cox <- resurv.fit.cox.31$hazard_frame +hazard_frame_updated_cox <- resurv_fit_cox_31$hazard_frame -hazard_frame_updated_nn <- resurv.fit.nn.31$hazard_frame +hazard_frame_updated_nn <- resurv_fit_nn_31$hazard_frame -hazard_frame_updated_xgb <- resurv.fit.xgb.31$hazard_frame +hazard_frame_updated_xgb <- resurv_fit_xgb_31$hazard_frame -estimated_cox <- hazard_frame_updated_cox[hazard_frame_updated_cox$AP_i==my_ap& hazard_frame_updated_cox$claim_type==1,c("S_i", "DP_rev_i")] -estimated_cox<- as.data.table(estimated_cox)[,model_label:='COX'] -estimated_nn <- hazard_frame_updated_nn[hazard_frame_updated_nn$AP_i==my_ap& hazard_frame_updated_nn$claim_type==1,c("S_i", "DP_rev_i")] -estimated_nn <- as.data.table(estimated_nn)[,model_label:='NN'] -estimated_xgb <- hazard_frame_updated_xgb[hazard_frame_updated_xgb$AP_i==my_ap& hazard_frame_updated_xgb$claim_type==1,c("S_i", "DP_rev_i")] -estimated_xgb <- as.data.table(estimated_xgb)[,model_label:='XGB'] +cond_1 <- hazard_frame_updated_cox$AP_i == my_ap & + hazard_frame_updated_cox$claim_type == 1 -dt <- rbind(estimated_cox,estimated_nn,estimated_xgb) +estimated_cox <- hazard_frame_updated_cox[cond_1, c("S_i", "DP_rev_i")] +estimated_cox <- as.data.table(estimated_cox)[, model_label := 'COX'] + +cond_2 <- hazard_frame_updated_nn$AP_i == my_ap & + hazard_frame_updated_nn$claim_type == 1 +estimated_nn <- hazard_frame_updated_nn[cond_2, c("S_i", "DP_rev_i")] +estimated_nn <- as.data.table(estimated_nn)[, model_label := 'NN'] + +cond_3 <- hazard_frame_updated_xgb$AP_i == my_ap & + hazard_frame_updated_xgb$claim_type == 1 +estimated_xgb <- hazard_frame_updated_xgb[cond_3, c("S_i", "DP_rev_i")] +estimated_xgb <- as.data.table(estimated_xgb)[, model_label := 'XGB'] + +dt <- rbind(estimated_cox, estimated_nn, estimated_xgb) ``` +Plot the fitted survival curve for the three models. 
+ ```{r eval=FALSE, echo=TRUE} -ggplot(data=dt,aes(x=DP_rev_i,y=S_i, color=model_label))+ - geom_line(linewidth=1)+ - facet_grid(~model_label)+ - annotate(geom='line', x=true_curve$DP_rev_i,y=true_curve$S_i, lty=2, linewidth=1)+ - scale_x_continuous(expand = c(0, 0), - breaks=c(0,440,940), - labels = c("1440","1000","500")) + - scale_y_continuous(expand=c(0, .001))+ - xlab("Development time")+ - ylab("Survival function")+ +ggplot(data = dt, aes(x = DP_rev_i, y = S_i, color = model_label)) + + geom_line(linewidth = 1) + + facet_grid( ~ model_label) + + annotate( + geom = 'line', + x = true_curve$DP_rev_i, + y = true_curve$S_i, + lty = 2, + linewidth = 1 + ) + + scale_x_continuous( + expand = c(0, 0), + breaks = c(0, 440, 940), + labels = c("1440", "1000", "500") + ) + + scale_y_continuous(expand = c(0, .001)) + + xlab("Development time") + + ylab("Survival function") + scale_color_manual(name = "Model", - values = c("#AAAABC", "#a71429", "#4169E1"))+ - theme_bw()+ - theme(legend.position="none", - text = element_text(size = 20), - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + values = c("#AAAABC", "#a71429", "#4169E1")) + + theme_bw() + + theme( + legend.position = "none", + text = element_text(size = 20), + axis.text.x = element_text( + angle = 90, + vjust = 0.5, + hjust = 1 + ) + ) ``` # Fitting and scoring -## Fitting + +In this Section we show how we fitted and scored our models. While we only show an example for COX, the other models were fitted and scored in a similar fashion. 
```{r eval=FALSE, echo=TRUE} seed = 1 scenario = 0 -input_data <- data_generator(random_seed = seed, - scenario=scenario, - time_unit = 1/360, - years = 4, - period_exposure = 200) - -individual_data <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) +input_data <- data_generator( + random_seed = seed, + scenario = scenario, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) + +individual_data <- IndividualDataPP( + input_data, + id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = FALSE +) ``` -## Scoring + + +## Fitting + +### Time to fit and predict + +Measuring computation time. ```{r eval=FALSE, echo=TRUE} -start<-Sys.time() +start <- Sys.time() -resurv.fit <- ReSurv(individual_data, - hazard_model = "COX") +resurv_fit <- ReSurv(individual_data, hazard_model = "COX") -resurv.fit.predict <- predict(resurv.fit, - grouping_method = "probability") +resurv_fit_predict <- predict(resurv_fit, grouping_method = "probability") time <- Sys.time() - start +``` + +## Scoring +### Scoring on a Quarterly grid. -max_dp_i <-1440 +Total absolute relative error and total absolute error by calendar time on a quarterly grid. 
+ +```{r eval=FALSE, echo=TRUE} +conversion_factor <- individual_data$conversion_factor + +max_dp_i <- 1440 # Compute the continuously Ranked Probability Score (CRPS) ---- -crps_dt <- ReSurv::survival_crps(resurv.fit) +crps_dt <- ReSurv::survival_crps(resurv_fit) crps_result <- mean(crps_dt$crps) # Compute the ARE tot ---- -conversion_factor <- resurv.fit$IndividualDataPP$conversion_factor - -true_output <- resurv.fit$IndividualDataPP$full.data %>% +true_output <- resurv_fit$IndividualDataPP$full.data %>% mutate( - DP_rev_o = floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1, - AP_o = ceiling(AP_i*conversion_factor), - TR_o= AP_o-1 + DP_rev_o = floor(max_dp_i * conversion_factor) - + ceiling(DP_i * conversion_factor + + ((AP_i - 1) %% ( + 1 / conversion_factor + )) * conversion_factor) + 1, + AP_o = ceiling(AP_i * conversion_factor), + TR_o = AP_o - 1 ) %>% - filter(DP_rev_o <=TR_o) %>% + filter(DP_rev_o <= TR_o) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% - summarize(I=sum(I), .groups = "drop") %>% - filter(DP_rev_o>0) #we cant have =0, because corresponds to half a parallelogram. 
+ summarize(I = sum(I), .groups = "drop") %>% + filter(DP_rev_o > 0) + + +out_list <- resurv_fit_predict$long_triangle_format_out +out <- out_list$output_granularity #Total output -score_total<- resurv.fit.predict$hazard_frame_output[,c("claim_type","AP_o", "DP_rev_o", "I_expected")] %>% - inner_join(true_output, by =c("claim_type","AP_o", "DP_rev_o")) %>% - mutate(ave = I-I_expected, - abs_ave = abs(ave)) %>% +score_total <- out[, c("claim_type", "AP_o", "DP_o", "expected_counts")] %>% + mutate(DP_rev_o = 16 - DP_o + 1) %>% + inner_join(true_output, by = c("claim_type", "AP_o", "DP_rev_o")) %>% + mutate(ave = I - expected_counts, abs_ave = abs(ave)) %>% + # from here it is reformulated for the are tot + ungroup() %>% group_by(AP_o, DP_rev_o) %>% - reframe(abs_ave=abs(sum(ave)), - I=sum(I)) %>% - ungroup() - -are_tot=sum(score_total$abs_ave)/sum(score_total$I) + reframe(abs_ave = abs(sum(ave)), I = sum(I)) -# Compute the ARE cal ---- +are_tot <- sum(score_total$abs_ave) / sum(score_total$I) -# Quarterly ---- -dfs_output <- resurv.fit.predict$hazard_frame_output %>% - select(AP_o, claim_type, DP_rev_o, df_o) %>% +dfs_output <- out %>% + mutate(DP_rev_o = 16 - DP_o + 1) %>% + select(AP_o, claim_type, DP_rev_o, f_o) %>% mutate(DP_rev_o = DP_rev_o) %>% distinct() - -score_diagonal <- resurv.fit$IndividualDataPP$full.data %>% +#Cashflow on output scale.Etc quarterly cashflow development +score_diagonal <- resurv_fit$IndividualDataPP$full.data %>% mutate( - DP_rev_o = floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1, - AP_o = ceiling(AP_i*conversion_factor) + DP_rev_o = floor(max_dp_i * conversion_factor) - + ceiling(DP_i * conversion_factor + + ((AP_i - 1) %% ( + 1 / conversion_factor + )) * conversion_factor) + 1, + AP_o = ceiling(AP_i * conversion_factor) ) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% - summarize(I=sum(I), .groups = "drop") %>% + 
summarize(I = sum(I), .groups = "drop") %>% group_by(claim_type, AP_o) %>% arrange(desc(DP_rev_o)) %>% - mutate(I_cum=cumsum(I)) %>% + mutate(I_cum = cumsum(I)) %>% mutate(I_cum_lag = lag(I_cum, default = 0)) %>% left_join(dfs_output, by = c("AP_o", "claim_type", "DP_rev_o")) %>% - mutate(I_cum_hat = I_cum_lag * df_o, - RP_o = max(DP_rev_o)-DP_rev_o + AP_o) %>% - inner_join(true_output[,c("AP_o", "DP_rev_o")] %>% distinct() - , by =c("AP_o", "DP_rev_o")) %>% - group_by(AP_o,DP_rev_o) %>% - reframe(abs_ave2_diag = abs(sum(I_cum_hat)-sum(I_cum)), - I=sum(I)) - -are_cal_q=sum(score_diagonal$abs_ave2_diag)/sum(score_diagonal$I) - -# Yearly ---- - -individual_data2 <- IndividualDataPP(input_data, - id="claim_number", - continuous_features="AP_i", - categorical_features="claim_type", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "years", - years=4, - continuous_features_spline=NULL, - calendar_period_extrapolation=F) - -resurv.predict <- predict(resurv.fit, - newdata=individual_data2, - grouping_method = "probability") + mutate(I_cum_hat = I_cum_lag * f_o, + RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>% + inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct() + , by = c("AP_o", "DP_rev_o")) %>% + group_by(AP_o, DP_rev_o) %>% + reframe(abs_ave2_diag = abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I)) -conversion_factor <- individual_data2$conversion_factor +are_cal_q <- sum(score_diagonal$abs_ave2_diag) / sum(score_diagonal$I) + +``` +Scoring on a yearly grid.
-max_dp_i <-1440 +```{r eval=FALSE, echo=TRUE} +individual_data2 <- IndividualDataPP( + input_data, + id = "claim_number", + continuous_features = "AP_i", + categorical_features = "claim_type", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "years", + years = 4, + continuous_features_spline = NULL, + calendar_period_extrapolation = F +) + +resurv_predict_yearly <- predict(resurv_fit, + newdata = individual_data2, + grouping_method = "probability") + +conversion_factor <- individual_data2$conversion_factor + + +max_dp_i <- 1440 true_output <- individual_data2$full.data %>% mutate( - DP_rev_o = floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1, - AP_o = ceiling(AP_i*conversion_factor), - TR_o= AP_o-1 + DP_rev_o = floor(max_dp_i * conversion_factor) - + ceiling(DP_i * conversion_factor + + ((AP_i - 1) %% ( + 1 / conversion_factor + )) * conversion_factor) + 1, + AP_o = ceiling(AP_i * conversion_factor), + TR_o = AP_o - 1 ) %>% - filter(DP_rev_o <=TR_o) %>% + filter(DP_rev_o <= TR_o) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% - summarize(I=sum(I), .groups = "drop") %>% - filter(DP_rev_o>0) #we cant have =0, because corresponds to half a parallelogram. 
+ summarize(I = sum(I), .groups = "drop") %>% + filter(DP_rev_o > 0) -dfs_output <- resurv.predict$hazard_frame_output %>% - select(AP_o, claim_type, DP_rev_o, df_o) %>% +out_list_yearly <- resurv_predict_yearly$long_triangle_format_out +out_yearly <- out_list_yearly$output_granularity + +dfs_output <- out_yearly %>% + mutate(DP_rev_o = 4 - DP_o + 1) %>% + select(AP_o, claim_type, DP_rev_o, f_o) %>% mutate(DP_rev_o = DP_rev_o) %>% distinct() -#Cashflow on output scale.Etc quarterly cashflow development score_diagonal_yearly <- individual_data2$full.data %>% mutate( - DP_rev_o = floor(max_dp_i*conversion_factor)-ceiling(DP_i*conversion_factor+((AP_i-1)%%(1/conversion_factor))*conversion_factor) +1, - AP_o = ceiling(AP_i*conversion_factor) + DP_rev_o = floor(max_dp_i * conversion_factor) - + ceiling(DP_i * conversion_factor + + ((AP_i - 1) %% ( + 1 / conversion_factor + )) * conversion_factor) + 1, + AP_o = ceiling(AP_i * conversion_factor) ) %>% group_by(claim_type, AP_o, DP_rev_o) %>% mutate(claim_type = as.character(claim_type)) %>% - summarize(I=sum(I), .groups = "drop") %>% + summarize(I = sum(I), .groups = "drop") %>% group_by(claim_type, AP_o) %>% arrange(desc(DP_rev_o)) %>% - mutate(I_cum=cumsum(I)) %>% + mutate(I_cum = cumsum(I)) %>% mutate(I_cum_lag = lag(I_cum, default = 0)) %>% left_join(dfs_output, by = c("AP_o", "claim_type", "DP_rev_o")) %>% - mutate(I_cum_hat = I_cum_lag * df_o, - RP_o = max(DP_rev_o)-DP_rev_o + AP_o) %>% - inner_join(true_output[,c("AP_o", "DP_rev_o")] %>% distinct() - , by =c("AP_o", "DP_rev_o")) %>% - group_by(AP_o,DP_rev_o) %>% - reframe(abs_ave2_diag = abs(sum(I_cum_hat)-sum(I_cum)), - I=sum(I)) - -are_cal_y=sum(score_diagonal_yearly$abs_ave2_diag)/sum(score_diagonal_yearly$I) + mutate(I_cum_hat = I_cum_lag * f_o, + RP_o = max(DP_rev_o) - DP_rev_o + AP_o) %>% + inner_join(true_output[, c("AP_o", "DP_rev_o")] %>% distinct() + , by = c("AP_o", "DP_rev_o")) %>% + group_by(AP_o, DP_rev_o) %>% + reframe(abs_ave2_diag = 
abs(sum(I_cum_hat) - sum(I_cum)), I = sum(I)) + +are_cal_y = sum(score_diagonal_yearly$abs_ave2_diag) / + sum(score_diagonal_yearly$I) ``` diff --git a/vignettes/cas_call.Rmd b/vignettes/cas_call.Rmd index 9011f19..e8d2e69 100644 --- a/vignettes/cas_call.Rmd +++ b/vignettes/cas_call.Rmd @@ -1,10 +1,10 @@ --- -title: "2024 Reserving Call Paper Program on Technology and the Reserving Actuary" +title: "Claim Counts Prediction Using Individual Data" author: "Gabriele Pittarello" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{2024 Reserving Call Paper Program on Technology and the Reserving Actuary} + %\VignetteIndexEntry{Claim Counts Prediction Using Individual Data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -23,7 +23,9 @@ library(ggplot2) ``` -In this vignette we show a pipeline for predicting individual claim counts using the `ReSurv` pacakge. This vignette contains the code to replicate our +In this vignette we show a pipeline for predicting individual claim counts using the `ReSurv` package. + +This vignette was created for the *2024 Reserving Call Paper Program on Technology and the Reserving Actuary* of the Casualty Actuarial Society (CAS) and the CAS Reserves Working Group. We would like to thank the team of reviewers that provided feedback that improved our package. # Installation @@ -37,6 +39,33 @@ packageVersion("ReSurv") ``` + +To handle the Python dependencies for *the first package installation*, we suggest creating a dedicated virtual environment. The remainder of this section can be disregarded by users who are not interested in using our models based on Neural Networks. + +```{r eval=FALSE, include=TRUE} +ReSurv::install_pyresurv() +``` + +The default name of the virtual environment is `"pyresurv"`.
+ +We then suggest to refresh the R session and to import the `ReSurv` package in R using + +```{r eval=FALSE, include=TRUE} +library(ReSurv) +reticulate::use_virtualenv("pyresurv") + +``` + +## Managing Multiple Package Dependencies + +For the case of multiple packages using isolated-package-environments we suggest the following procedure. Below is an example for `pysparklyr`. + +```{r eval=FALSE, include=TRUE} +envname <- "./venv" +ReSurv::install_pyresurv(envname = envname) +pysparklyr::install_pyspark(envname = envname) +``` + # Data Simulation We simulate a synthetic data set from scenario Alpha. diff --git a/vignettes/hp_tuning.Rmd b/vignettes/hp_tuning.Rmd index 8043951..2d45dbd 100644 --- a/vignettes/hp_tuning.Rmd +++ b/vignettes/hp_tuning.Rmd @@ -1,11 +1,11 @@ --- -title: "Hyperparameters tuning" +title: "Hyperparameters Tuning" author: "Gabriele Pittarello" date: "`r Sys.Date()`" bibliography: '`r system.file("references.bib", package="ReSurv")`' output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Hyperparameters tuning} + %\VignetteIndexEntry{Hyperparameters Tuning} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} @@ -16,35 +16,40 @@ knitr::opts_chunk$set(echo = TRUE) library(ReSurv) library(reticulate) -use_virtualenv('pyresurv') +use_virtualenv("pyresurv") ``` + # Introduction This vignette shows how to tune the hyperparameters of the machine learning algorithms implemented in `ReSurv` using the approach in @snoek12 implemented in the R package `ParBayesianOptimization` (@ParBayesianOptimization). For illustrative purposes, we will simulate daily claim notifications from one of the scenarios introduced in @hiabu23 (scenario Alpha).
```{r eval=FALSE, include=TRUE} -input_data0 <- data_generator(random_seed = 1964, - scenario=0, - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data_0 <- data_generator( + random_seed = 1964, + scenario = 0, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) ``` The counts data are then pre-processed using the `IndividualDataPP` function. ```{r eval=FALSE, include=TRUE} -individual_data <- IndividualDataPP(data = input_data0, - id=NULL, - categorical_features = c("claim_type"), - continuous_features = "AP", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4) +individual_data <- IndividualDataPP( + data = input_data_0, + id = NULL, + categorical_features = "claim_type", + continuous_features = "AP", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4 +) ``` @@ -77,24 +82,28 @@ In `ReSurv`, we have our own implementation of a standard K-Fold cross-validatio ```{r eval=FALSE, include=TRUE} -resurv.cv.xgboost <- ReSurvCV(IndividualDataPP=individual_data, - model="XGB", - hparameters_grid=list(booster="gbtree", - eta=c(.001,.01,.2,.3), - max_depth=c(3,6,8), - subsample=c(1), - alpha=c(0,.2,1), - lambda=c(0,.2,1), - min_child_weight=c(.5,1)), - print_every_n = 1L, - nrounds=500, - verbose=F, - verbose.cv=T, - early_stopping_rounds = 100, - folds=5, - parallel=T, - ncores=2, - random_seed=1) +resurv_cv_xgboost <- ReSurvCV( + IndividualDataPP = individual_data, + model = "XGB", + hparameters_grid = list( + booster = "gbtree", + eta = c(.001, .01, .2, .3), + max_depth = c(3, 6, 8), + subsample = c(1), + alpha = c(0, .2, 1), + lambda = c(0, .2, 1), + min_child_weight = c(.5, 1) + ), + print_every_n = 1L, + nrounds = 500, + verbose = FALSE, + verbose.cv = TRUE, + early_stopping_rounds = 100, + folds = 5, + parallel = T, + ncores = 2, + random_seed = 1 +) ``` @@ -107,14 +116,15 @@ 
In order to use the `ParBayesianOptimization` package, we first need to specify ```{r eval=FALSE, include=TRUE} -bounds <- list(num_layers = c(2L,10L), - num_nodes = c(2L,10L), - optim=c(1L,2L), - activation = c(1L,2L), - lr=c(.005,0.5), - xi=c(0,0.5), - eps = c(0,0.5) - ) +bounds <- list( + num_layers = c(2L, 10L), + num_nodes = c(2L, 10L), + optim = c(1L, 2L), + activation = c(1L, 2L), + lr = c(.005, 0.5), + xi = c(0, 0.5), + eps = c(0, 0.5) +) ``` @@ -130,47 +140,46 @@ The score metric we inspect is the negative (partial) likelihood. The likelihood ```{r eval=FALSE, include=TRUE} -obj_func <- function(num_layers, - num_nodes, - optim, +obj_func <- function(num_layers, + num_nodes, + optim, activation, - lr, - xi, + lr, + xi, eps) { - - optim = switch(optim, - "Adam", - "SGD") - activation = switch(activation, "LeakyReLU","SELU") - batch_size=as.integer(5000) - number_layers=as.integer(num_layers) - num_nodes=as.integer(num_nodes) + optim = switch(optim, "Adam", "SGD") + activation = switch(activation, "LeakyReLU", "SELU") + batch_size = as.integer(5000) + number_layers = as.integer(num_layers) + num_nodes = as.integer(num_nodes) + + deepsurv_cv <- ReSurvCV( + IndividualDataPP = individual_data, + model = "NN", + hparameters_grid = list( + num_layers = num_layers, + num_nodes = num_nodes, + optim = optim, + activation = activation, + lr = lr, + xi = xi, + eps = eps, + tie = "Efron", + batch_size = batch_size, + early_stopping = 'TRUE', + patience = 20 + ), + epochs = as.integer(300), + num_workers = 0, + verbose = FALSE, + verbose.cv = TRUE, + folds = 3, + parallel = FALSE, + random_seed = as.integer(Sys.time()) + ) - deepsurv_cv <- ReSurvCV(IndividualDataPP=individual_data, - model="NN", - hparameters_grid=list(num_layers = num_layers, - num_nodes = num_nodes, - optim=optim, - activation = activation, - lr=lr, - xi=xi, - eps = eps, - tie = "Efron", - batch_size = batch_size, - early_stopping = 'TRUE', - patience = 20 - ), - epochs=as.integer(300), - num_workers 
= 0, - verbose=F, - verbose.cv=T, - folds=3, - parallel=F, - random_seed = as.integer(Sys.time())) - lst <- list( - Score = -deepsurv_cv$out.cv.best.oos$test.lkh, train.lkh = deepsurv_cv$out.cv.best.oos$train.lkh @@ -187,12 +196,12 @@ As a last step, we use the `bayesOpt` function to perform the optimization. ```{r eval=FALSE, include=TRUE} bayes_out <- bayesOpt( - FUN = obj_func - , bounds = bounds - , initPoints = 50 - , iters.n = 1000 - , iters.k = 50 - , otherHalting = list(timeLimit = 18000) + FUN = obj_func, + bounds = bounds, + initPoints = 50, + iters.n = 1000, + iters.k = 50, + otherHalting = list(timeLimit = 18000) ) @@ -229,12 +238,14 @@ In a similar fashion, we can optimize the gradient boosting parameters. We first ```{r eval=FALSE, include=TRUE} -bounds <- list(eta = c(0, 1), - max_depth = c(1L, 25L), - min_child_weight = c(0, 50), - subsample = c(0.51, 1), - lambda = c(0, 15), - alpha = c(0, 15)) +bounds <- list( + eta = c(0, 1), + max_depth = c(1L, 25L), + min_child_weight = c(0, 50), + subsample = c(0.51, 1), + lambda = c(0, 15), + alpha = c(0, 15) +) ``` @@ -242,29 +253,35 @@ bounds <- list(eta = c(0, 1), Secondly, we define an objective function. 
```{r eval=FALSE, include=TRUE} -# Function must take the hyper-parameters as inputs -obj_func <- function(eta, max_depth, min_child_weight, subsample, lambda, alpha) { - - xgbcv <- ReSurvCV(IndividualDataPP=individual_data, - model="XGB", - hparameters_grid=list(booster="gbtree", - eta=eta, - max_depth=max_depth, - subsample=subsample, - alpha=lambda, - lambda=alpha, - min_child_weight=min_child_weight), - print_every_n = 1L, - nrounds=500, - verbose=F, - verbose.cv=T, - early_stopping_rounds = 30, - folds=3, - parallel=F, - random_seed = as.integer(Sys.time())) +obj_func <- function(eta, + max_depth, + min_child_weight, + subsample, + lambda, + alpha) { + xgbcv <- ReSurvCV( + IndividualDataPP = individual_data, + model = "XGB", + hparameters_grid = list( + booster = "gbtree", + eta = eta, + max_depth = max_depth, + subsample = subsample, + alpha = lambda, + lambda = alpha, + min_child_weight = min_child_weight + ), + print_every_n = 1L, + nrounds = 500, + verbose = FALSE, + verbose.cv = TRUE, + early_stopping_rounds = 30, + folds = 3, + parallel = FALSE, + random_seed = as.integer(Sys.time()) + ) lst <- list( - Score = -xgbcv$out.cv.best.oos$test.lkh, train.lkh = xgbcv$out.cv.best.oos$train.lkh ) @@ -281,19 +298,27 @@ Lastly, we perform the optimization in a parallel setting using the `DoParallel` library(DoParallel) -cl <- makeCluster( parallel::detectCores() ) +cl <- makeCluster(parallel::detectCores()) registerDoParallel(cl) -clusterEvalQ(cl, {library("ReSurv")} ) +clusterEvalQ(cl, { + library("ReSurv") +}) bayes_out <- bayesOpt( FUN = obj_func - , bounds = bounds - , initPoints = length(bounds) + 20 - , iters.n = 1000 - , iters.k = 50 - , otherHalting = list(timeLimit = 18000) - , parallel = TRUE + , + bounds = bounds + , + initPoints = length(bounds) + 20 + , + iters.n = 1000 + , + iters.k = 50 + , + otherHalting = list(timeLimit = 18000) + , + parallel = TRUE ) ``` diff --git a/vignettes/simulate_individual_data.Rmd b/vignettes/simulate_individual_data.Rmd 
index 100a687..4e2fd6e 100644 --- a/vignettes/simulate_individual_data.Rmd +++ b/vignettes/simulate_individual_data.Rmd @@ -1,10 +1,10 @@ --- -title: "Simulate individual data" +title: "Simulate Individual Data" author: "Gabriele Pittarello" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Simulate individual data} + %\VignetteIndexEntry{Simulate Individual Data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: '`r system.file("references.bib", package="ReSurv")`' @@ -25,9 +25,8 @@ In the manuscript, we named the $5$ scenarios Alpha, Beta, Gamma, Delta, Epsilon |--------------------------------------------------|--------------------| | `claim_number` | Policy identifier. | | `claim_type` $\in \left\{0, 1 \right\}$ | Type of claim. | -| `AM` | Accident month. | -| `RM` | Reporting month. | -| `DM` | Development month. | +| `AP` | Accident month. | +| `RP` | Reporting month. | For each scenario we will show if they satisfy the chain ladder assumptions (CL), the proportionality assumption in @cox72 (PROP) and if interactions are present (INT). Details on the simulation mechanism and the simulation parameters can be found in the manuscript. 
@@ -44,38 +43,42 @@ This scenario is a mix of `claim_type 0` and `claim_type 1` with same number of ```{r eval=FALSE, include=TRUE} # Input data -input_data0 <- data_generator(random_seed = 1964, - scenario='alpha', - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data_0 <- data_generator( + random_seed = 1964, + scenario = "alpha", + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) ``` ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/scenario0ecdf.png", width=600, height=480, res=72) -input_data0 %>% +input_data_0 %>% as.data.frame() %>% - mutate(claim_type=as.factor(claim_type))%>% - ggplot(aes(x=RT-AT, color=claim_type)) + - stat_ecdf(size=1) + - labs(title="Empirical distribution of simulated notification delays", - x="Notification delay (in days)", - y="Cumulative Density") + - xlim(0,1500)+ - scale_color_manual(values=c("royalblue", "#a71429"), - labels=c("Claim type 0","Claim type 1")) + - scale_linetype_manual(values=c(1,3), - labels=c("Claim type 0","Claim type 1"))+ - guides(color = guide_legend(title="Claim type", - override.aes = list(color = c("royalblue", "#a71429"), - size = 2)), - linetype = guide_legend(title="Claim type", - override.aes = list(linetype = c(1,3), - size = 0.7))) + + mutate(claim_type = as.factor(claim_type)) %>% + ggplot(aes(x = RT - AT, color = claim_type)) + + stat_ecdf(size = 1) + + labs(title = "Empirical distribution of simulated notification delays", x = + "Notification delay (in days)", y = "Cumulative Density") + + xlim(0, 1500) + + scale_color_manual( + values = c("royalblue", "#a71429"), + labels = c("Claim type 0", "Claim type 1") + ) + + scale_linetype_manual(values = c(1, 3), + labels = c("Claim type 0", "Claim type 1")) + + guides( + color = guide_legend(title = "Claim type", override.aes = list( + color = c("royalblue", "#a71429"), size = 2 + )), + linetype = guide_legend( + title = "Claim type", + override.aes = list(linetype = c(1, 3), size = 0.7) + ) + ) + 
theme_bw() -dev.off() ``` @@ -84,39 +87,42 @@ dev.off() This scenario is similar to simulation `Alpha` but the volume of `claim_type 1` is decreasing in the most recent accident dates. When the longer tailed bodily injuries have a decreasing claim volume, aggregated chain ladder methods will overestimate reserves, see @ajne94. ```{r include=TRUE, eval =FALSE} -# Input data -input_data1 <- data_generator(random_seed = 1964, - scenario=1, - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data_1 <- data_generator( + random_seed = 1964, + scenario = 1, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) ``` ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/scenario1ecdf.png", width=600, height=480, res=72) -input_data1 %>% +input_data_1 %>% as.data.frame() %>% - mutate(claim_type=as.factor(claim_type))%>% - ggplot(aes(x=RT-AT, color=claim_type)) + - stat_ecdf(size=1) + - labs(title="Empirical distribution of simulated notification delays", - x="Notification delay (in days)", - y="Cumulative Density") + - xlim(0,1500)+ - scale_color_manual(values=c("royalblue", "#a71429"), - labels=c("Claim type 0","Claim type 1")) + - scale_linetype_manual(values=c(1,3), - labels=c("Claim type 0","Claim type 1"))+ - guides(color = guide_legend(title="Claim type", - override.aes = list(color = c("royalblue", "#a71429"), - size = 2)), - linetype = guide_legend(title="Claim type", - override.aes = list(linetype = c(1,3), - size = 0.7))) + + mutate(claim_type = as.factor(claim_type)) %>% + ggplot(aes(x = RT - AT, color = claim_type)) + + stat_ecdf(size = 1) + + labs(title = "Empirical distribution of simulated notification delays", x = + "Notification delay (in days)", y = "Cumulative Density") + + xlim(0, 1500) + + scale_color_manual( + values = c("royalblue", "#a71429"), + labels = c("Claim type 0", "Claim type 1") + ) + + scale_linetype_manual(values = c(1, 3), + labels = c("Claim type 0", "Claim type 1")) + + guides( + color = 
guide_legend(title = "Claim type", override.aes = list( + color = c("royalblue", "#a71429"), size = 2 + )), + linetype = guide_legend( + title = "Claim type", + override.aes = list(linetype = c(1, 3), size = 0.7) + ) + ) + theme_bw() -dev.off() ``` # Scenario Gamma @@ -126,37 +132,41 @@ An interaction between `claim_type 1` and accident period affects the claims occ ```{r} # Input data -input_data2 <- data_generator(random_seed = 1964, - scenario=2, - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data_2 <- data_generator( + random_seed = 1964, + scenario = 2, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) ``` ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/scenario2ecdf.png", width=600, height=480, res=72) -input_data2 %>% +input_data_2 %>% as.data.frame() %>% - mutate(claim_type=as.factor(claim_type))%>% - ggplot(aes(x=RT-AT, color=claim_type)) + - stat_ecdf(size=1) + - labs(title="Empirical distribution of simulated notification delays", - x="Notification delay (in days)", - y="Cumulative Density") + - xlim(0,1500)+ - scale_color_manual(values=c("royalblue", "#a71429"), - labels=c("Claim type 0","Claim type 1")) + - scale_linetype_manual(values=c(1,3), - labels=c("Claim type 0","Claim type 1"))+ - guides(color = guide_legend(title="Claim type", - override.aes = list(color = c("royalblue", "#a71429"), - size = 2)), - linetype = guide_legend(title="Claim type", - override.aes = list(linetype = c(1,3), - size = 0.7))) + + mutate(claim_type = as.factor(claim_type)) %>% + ggplot(aes(x = RT - AT, color = claim_type)) + + stat_ecdf(size = 1) + + labs(title = "Empirical distribution of simulated notification delays", x = + "Notification delay (in days)", y = "Cumulative Density") + + xlim(0, 1500) + + scale_color_manual( + values = c("royalblue", "#a71429"), + labels = c("Claim type 0", "Claim type 1") + ) + + scale_linetype_manual(values = c(1, 3), + labels = c("Claim type 0", "Claim type 1")) + + guides( + 
color = guide_legend(title = "Claim type", override.aes = list( + color = c("royalblue", "#a71429"), size = 2 + )), + linetype = guide_legend( + title = "Claim type", + override.aes = list(linetype = c(1, 3), size = 0.7) + ) + ) + theme_bw() -dev.off() ``` @@ -165,39 +175,42 @@ dev.off() A seasonality effect dependent on the accident months for `claim_type 0` and `claim_type 1` is present. This could occur in a real world setting with increased work load during winter for certain claim types, or a decreased workforce during the summer holidays. ```{r} -# Input data -input_data3 <- data_generator(random_seed = 1964, - scenario=3, - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data_3 <- data_generator( + random_seed = 1964, + scenario = 3, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) ``` ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/scenario3ecdf.png", width=600, height=480, res=72) -input_data3 %>% +input_data_3 %>% as.data.frame() %>% - mutate(claim_type=as.factor(claim_type))%>% - ggplot(aes(x=RT-AT, color=claim_type)) + - stat_ecdf(size=1) + - labs(title="Empirical distribution of simulated notification delays", - x="Notification delay (in days)", - y="Cumulative Density") + - xlim(0,1500)+ - scale_color_manual(values=c("royalblue", "#a71429"), - labels=c("Claim type 0","Claim type 1")) + - scale_linetype_manual(values=c(1,3), - labels=c("Claim type 0","Claim type 1"))+ - guides(color = guide_legend(title="Claim type", - override.aes = list(color = c("royalblue", "#a71429"), - size = 2)), - linetype = guide_legend(title="Claim type", - override.aes = list(linetype = c(1,3), - size = 0.7))) + + mutate(claim_type = as.factor(claim_type)) %>% + ggplot(aes(x = RT - AT, color = claim_type)) + + stat_ecdf(size = 1) + + labs(title = "Empirical distribution of simulated notification delays", x = + "Notification delay (in days)", y = "Cumulative Density") + + xlim(0, 1500) + + scale_color_manual( + values = 
c("royalblue", "#a71429"), + labels = c("Claim type 0", "Claim type 1") + ) + + scale_linetype_manual(values = c(1, 3), + labels = c("Claim type 0", "Claim type 1")) + + guides( + color = guide_legend(title = "Claim type", override.aes = list( + color = c("royalblue", "#a71429"), size = 2 + )), + linetype = guide_legend( + title = "Claim type", + override.aes = list(linetype = c(1, 3), size = 0.7) + ) + ) + theme_bw() -dev.off() ``` # Scenario Epsilon @@ -207,39 +220,43 @@ The data generating process violates the proportional likelihood in @cox72. We g ```{r} # Input data -input_data4 <- data_generator(random_seed = 1964, - scenario=4, - time_unit = 1/360, - years = 4, - period_exposure = 200) +input_data_4 <- data_generator( + random_seed = 1964, + scenario = 4, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) ``` ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/scenario4ecdf.png", width=600, height=480, res=72) -input_data4 %>% +input_data_4 %>% as.data.frame() %>% - mutate(claim_type=as.factor(claim_type))%>% - ggplot(aes(x=RT-AT, color=claim_type)) + - stat_ecdf(size=1) + - labs(title="Empirical distribution of simulated notification delays", - x="Notification delay (in days)", - y="Cumulative Density") + - xlim(0,1500)+ - scale_color_manual(values=c("royalblue", "#a71429"), - labels=c("Claim type 0","Claim type 1")) + - scale_linetype_manual(values=c(1,3), - labels=c("Claim type 0","Claim type 1"))+ - guides(color = guide_legend(title="Claim type", - override.aes = list(color = c("royalblue", "#a71429"), - size = 2)), - linetype = guide_legend(title="Claim type", - override.aes = list(linetype = c(1,3), - size = 0.7))) + + mutate(claim_type = as.factor(claim_type)) %>% + ggplot(aes(x = RT - AT, color = claim_type)) + + stat_ecdf(size = 1) + + labs(title = "Empirical distribution of simulated notification delays", x = + "Notification delay (in days)", y = "Cumulative Density") + + xlim(0, 1500) + + scale_color_manual( + 
values = c("royalblue", "#a71429"), + labels = c("Claim type 0", "Claim type 1") + ) + + scale_linetype_manual(values = c(1, 3), + labels = c("Claim type 0", "Claim type 1")) + + guides( + color = guide_legend(title = "Claim type", override.aes = list( + color = c("royalblue", "#a71429"), size = 2 + )), + linetype = guide_legend( + title = "Claim type", + override.aes = list(linetype = c(1, 3), size = 0.7) + ) + ) + theme_bw() -dev.off() ``` # Bibliography diff --git a/vignettes/variables_importance.Rmd b/vignettes/variables_importance.Rmd index 6dd2168..017df89 100644 --- a/vignettes/variables_importance.Rmd +++ b/vignettes/variables_importance.Rmd @@ -1,10 +1,10 @@ --- -title: "Exploring the variables importance" +title: "Exploring The Variables Importance" author: "Gabriele Pittarello" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Exploring the variables importance} + %\VignetteIndexEntry{Exploring The Variables Importance} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -24,23 +24,26 @@ library(ReSurv) Machine learning models catch interactions between covariates. Often they are a black-box but they can be interpreted with SHAP values. We generate two data sets, one from scenario Alpha and one from scenario Delta the plotting functionalities of the ReSurv package. 
```{r eval=FALSE, include=TRUE} -# Input data scenario Alpha - -input_data0 <- data_generator(random_seed = 1, - scenario=0, - time_unit = 1/360, - years = 4, - yearly_exposure = 200) - -individual_data0 <- IndividualDataPP(data = input_data0, - id=NULL, - categorical_features = c("claim_type"), - continuous_features = "AP", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4) + +input_data_0 <- data_generator( + random_seed = 1, + scenario = 0, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) + +individual_data_0 <- IndividualDataPP( + data = input_data_0, + id = NULL, + categorical_features = "claim_type", + continuous_features = "AP", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4 +) ``` @@ -48,21 +51,25 @@ individual_data0 <- IndividualDataPP(data = input_data0, ```{r eval=FALSE, include=TRUE} # Input data scenario Delta -input_data3 <- data_generator(random_seed = 1, - scenario=3, - time_unit = 1/360, - years = 4, - yearly_exposure = 200) - -individual_data3 <- IndividualDataPP(data = input_data3, - id=NULL, - categorical_features = c("claim_type"), - continuous_features = "AP", - accident_period="AP", - calendar_period="RP", - input_time_granularity = "days", - output_time_granularity = "quarters", - years=4) +input_data_3 <- data_generator( + random_seed = 1, + scenario = 3, + time_unit = 1 / 360, + years = 4, + period_exposure = 200 +) + +individual_data_3 <- IndividualDataPP( + data = input_data_3, + id = NULL, + categorical_features = "claim_type", + continuous_features = "AP", + accident_period = "AP", + calendar_period = "RP", + input_time_granularity = "days", + output_time_granularity = "quarters", + years = 4 +) ``` @@ -71,34 +78,40 @@ Here we fit Neural Networks and XGB.
In order to simplify this vignette, we prov ```{r eval=FALSE, include=TRUE} -hp_scenario_alpha_xgb <- list(params=list(booster="gbtree", - eta=0.9887265, - subsample=0.7924135 , - alpha=10.85342, - lambda=6.213317, - min_child_weight=3.042204, - max_depth = 1), - print_every_n = 0, - nrounds=3000, - verbose=F, - early_stopping_rounds = 500) - -hp_scenario_alpha_nn <- list(batch_size=as.integer(5000), - epochs=as.integer(5500), - num_workers=0, - tie='Efron', - num_layers=2, - num_nodes=10, - optim="SGD", - batch_size=as.integer(5000), - lr=0.3023043, - xi=0.426443, - eps=0, - activation="SELU", - early_stopping = TRUE, - patience = 350, - verbose=F, - network_structure = NULL) +hp_scenario_alpha_xgb <- list( + params = list( + booster = "gbtree", + eta = 0.9887265, + subsample = 0.7924135, + alpha = 10.85342, + lambda = 6.213317, + min_child_weight = 3.042204, + max_depth = 1 + ), + print_every_n = 0, + nrounds = 3000, + verbose = FALSE, + early_stopping_rounds = 500 +) + +hp_scenario_alpha_nn <- list( + batch_size = as.integer(5000), + epochs = as.integer(5500), + num_workers = 0, + tie = "Efron", + num_layers = 2, + num_nodes = 10, + optim = "SGD", + batch_size = as.integer(5000), + lr = 0.3023043, + xi = 0.426443, + eps = 0, + activation = "SELU", + early_stopping = TRUE, + patience = 350, + verbose = FALSE, + network_structure = NULL +) hp_scenario_delta_xgb <- list(params=list(booster="gbtree", eta=0.2717736, @@ -109,45 +122,47 @@ hp_scenario_delta_xgb <- list(params=list(booster="gbtree", max_depth = 4), print_every_n = 0, nrounds=3000, - verbose=F, + verbose = FALSE, early_stopping_rounds = 500) -hp_scenario_delta_nn <- list(batch_size=as.integer(5000), - epochs=as.integer(5500), - num_workers=0, - tie='Efron', - num_layers=2, - num_nodes=2, - optim="Adam", - batch_size=as.integer(5000), - lr=0.3542422, - xi= 0.1803953, - eps=0, - activation="LeakyReLU", - early_stopping = TRUE, - patience = 350, - verbose=F, - network_structure = NULL) +hp_scenario_delta_nn
<- list( + batch_size = as.integer(5000), + epochs = as.integer(5500), + num_workers = 0, + tie = "Efron", + num_layers = 2, + num_nodes = 2, + optim = "Adam", + batch_size = as.integer(5000), + lr = 0.3542422, + xi = 0.1803953, + eps = 0, + activation = "LeakyReLU", + early_stopping = TRUE, + patience = 350, + verbose = FALSE, + network_structure = NULL +) ``` ```{r eval=FALSE, include=TRUE} -resurv.model.xgb.A <- ReSurv(individual_data0, +resurv_model_xgb_A <- ReSurv(individual_data_0, hazard_model = "XGB", - hparameters=hp_scenario_alpha_xgb) + hparameters = hp_scenario_alpha_xgb) -resurv.model.nn.A <- ReSurv(individual_data0, - hazard_model = "NN", - hparameters=hp_scenario_alpha_nn) +resurv_model_nn_A <- ReSurv(individual_data_0, + hazard_model = "NN", + hparameters = hp_scenario_alpha_nn) -resurv.model.xgb.D <- ReSurv(individual_data3, +resurv_model_xgb_D <- ReSurv(individual_data_3, hazard_model = "XGB", - hparameters=hp_scenario_delta_xgb) + hparameters = hp_scenario_delta_xgb) -resurv.model.nn.D <- ReSurv(individual_data3, - hazard_model = "NN", - hparameters=hp_scenario_delta_nn) +resurv_model_nn_D <- ReSurv(individual_data_3, + hazard_model = "NN", + hparameters = hp_scenario_delta_nn) ``` @@ -156,29 +171,21 @@ resurv.model.nn.D <- ReSurv(individual_data3, ## Shap values (XGB) ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/alpha_shap_xgb.png", width=600, height=480, res=72) -plot(resurv.model.xgb.A) -dev.off() +plot(resurv_model_xgb_A) ``` ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/delta_shap_xgb.png", width=600, height=480, res=72) -plot(resurv.model.xgb.D) -dev.off() +plot(resurv_model_xgb_D) ``` ## Shap values (NN) ```{r eval=FALSE, include=TRUE} -png(filename = "~/GitHub/ReSurv/vignettes/alpha_shap_nn.png", width=600, height=480, res=72) -plot(resurv.model.nn.A,nsamples=10000) -dev.off() +plot(resurv_model_nn_A, nsamples = 10000) ``` ```{r eval=FALSE, include=TRUE} -png(filename =
"~/GitHub/ReSurv/vignettes/delta_shap_nn.png", width=600, height=480, res=72) -plot(resurv.model.nn.D,nsamples=10000) -dev.off() +plot(resurv_model_nn_D, nsamples = 10000) ```