project_work.Rmd

---
title: "BDA - Project Work"
author: "Jacopo Losi, Nicola Saljoughi"
output:
  pdf_document:
    toc: yes
    toc_depth: 3
  word_document:
    toc: yes
    toc_depth: '1'
  html_document:
    df_print: paged
    toc: yes
    toc_depth: '1'
---
```{r setup, include=FALSE}
# This chunk just sets echo = TRUE as default (i.e. print all code)
knitr::opts_chunk$set(echo = TRUE, tidy = FALSE)

library(magrittr)
library(arules)
library(rstan)
library(nnet)
library(epitools)
library(aaltobda)
library(tinytex)
library(MASS)
library(mvtnorm)
library(dplyr)
library(ggplot2)
library(easyGgplot2)
library(rstan)
library(devtools)
library(brms)
library(loo)
library(KernSmooth)
library(tableone)
library(phonTools)
library(rcompanion)

# Load the raw records. Text columns stay character (not factor) so that the
# label comparisons in the recoding chunk work on plain strings.
data <- read.csv(
  file = "./suicide attempt data_2.csv",
  stringsAsFactors = FALSE
)

```

```{r}

# ---- Numeric recoding of the raw table --------------------------------------
# `mydata` is a coded copy of `data`. Every index vector except the Year ones
# is looked up on the untouched raw `data`; the Year indices are looked up on
# `mydata`, which still holds the raw years at that point. Labels that receive
# no code (e.g. 'unknown') keep their original value. Writing a number into a
# character column stores it as text ("1", "0", ...), which is why later
# chunks wrap these columns in as.numeric().

mydata <- data
mydata[["Season"]] <- data[["Month"]]   # overwritten with season codes below
mydata[["Month"]] <- NULL


# Hospitalised: 1 = yes, 0 = no
indexHosp   <- which(data[["Hospitalised"]] == "yes")
indexNoHosp <- which(data[["Hospitalised"]] == "no")

mydata[["Hospitalised"]] <- replace(mydata[["Hospitalised"]], indexHosp, 1)
mydata[["Hospitalised"]] <- replace(mydata[["Hospitalised"]], indexNoHosp, 0)


# Died: 1 = yes, 0 = no
indexDied   <- which(data[["Died"]] == "yes")
indexNoDied <- which(data[["Died"]] == "no")

mydata[["Died"]] <- replace(mydata[["Died"]], indexDied, 1)
mydata[["Died"]] <- replace(mydata[["Died"]], indexNoDied, 0)


# Urban: 1 = yes, 0 = no
indexUrban   <- which(data[["Urban"]] == "yes")
indexNoUrban <- which(data[["Urban"]] == "no")

mydata[["Urban"]] <- replace(mydata[["Urban"]], indexUrban, 1)
mydata[["Urban"]] <- replace(mydata[["Urban"]], indexNoUrban, 0)


# Year: 1 = 2009, 2 = 2010, 3 = 2011
# All three look-ups happen before the first write.
indexYear2009 <- which(mydata[["Year"]] == 2009)
indexYear2010 <- which(mydata[["Year"]] == 2010)
indexYear2011 <- which(mydata[["Year"]] == 2011)

mydata[["Year"]] <- replace(mydata[["Year"]], indexYear2009, 1)
mydata[["Year"]] <- replace(mydata[["Year"]], indexYear2010, 2)
mydata[["Year"]] <- replace(mydata[["Year"]], indexYear2011, 3)


# Season (from the raw month): 1 = Spring (3-5), 2 = Summer (6-8),
#                              3 = Autumn (9-11), 4 = Winter (12, 1, 2)
indexSpring <- which(data[["Month"]] >= 3 & data[["Month"]] <= 5)
indexSummer <- which(data[["Month"]] >= 6 & data[["Month"]] <= 8)
indexAutumn <- which(data[["Month"]] >= 9 & data[["Month"]] <= 11)
indexWinter <- which(data[["Month"]] == 12 | data[["Month"]] <= 2)

mydata[["Season"]] <- replace(mydata[["Season"]], indexSpring, 1)
mydata[["Season"]] <- replace(mydata[["Season"]], indexSummer, 2)
mydata[["Season"]] <- replace(mydata[["Season"]], indexAutumn, 3)
mydata[["Season"]] <- replace(mydata[["Season"]], indexWinter, 4)


# Sex: 1 = male, 0 = female
indexMale   <- which(data[["Sex"]] == "male")
indexFemale <- which(data[["Sex"]] == "female")

mydata[["Sex"]] <- replace(mydata[["Sex"]], indexMale, 1)
mydata[["Sex"]] <- replace(mydata[["Sex"]], indexFemale, 0)


# Age group (from the raw age in years):
#   1 = 34 or younger, 2 = 35-49, 3 = 50-64, 4 = 65 or older
indexAgeOne   <- which(data[["Age"]] <= 34)
indexAgeTwo   <- which(data[["Age"]] >= 35 & data[["Age"]] <= 49)
indexAgeThree <- which(data[["Age"]] >= 50 & data[["Age"]] <= 64)
indexAgeFour  <- which(data[["Age"]] >= 65)

mydata[["Age"]] <- replace(mydata[["Age"]], indexAgeOne, 1)
mydata[["Age"]] <- replace(mydata[["Age"]], indexAgeTwo, 2)
mydata[["Age"]] <- replace(mydata[["Age"]], indexAgeThree, 3)
mydata[["Age"]] <- replace(mydata[["Age"]], indexAgeFour, 4)


# Education: 0 = iliterate, 1 = primary, 2 = Secondary, 3 = Tertiary
# (labels are spelled exactly as they appear in the data file)
indexEduZero  <- which(data[["Education"]] == "iliterate")
indexEduOne   <- which(data[["Education"]] == "primary")
indexEduTwo   <- which(data[["Education"]] == "Secondary")
indexEduThree <- which(data[["Education"]] == "Tertiary")

mydata[["Education"]] <- replace(mydata[["Education"]], indexEduZero, 0)
mydata[["Education"]] <- replace(mydata[["Education"]], indexEduOne, 1)
mydata[["Education"]] <- replace(mydata[["Education"]], indexEduTwo, 2)
mydata[["Education"]] <- replace(mydata[["Education"]], indexEduThree, 3)


# Occupation: 1 = farming, 0 = everything else
indexFarm   <- which(data[["Occupation"]] == "farming")
indexNoFarm <- which(!(data[["Occupation"]] == "farming"))

mydata[["Occupation"]] <- replace(mydata[["Occupation"]], indexFarm, 1)
mydata[["Occupation"]] <- replace(mydata[["Occupation"]], indexNoFarm, 0)


# Method: 1 = Pesticide, 2 = Other poison, 3 = hanging, 4 = all other methods
indexPesticide <- which(data[["method"]] == "Pesticide")
indexPoison    <- which(data[["method"]] == "Other poison")
indexHanging   <- which(data[["method"]] == "hanging")
indexOthers    <- which(!(data[["method"]] == "Pesticide" |
                            data[["method"]] == "Other poison" |
                            data[["method"]] == "hanging"))

mydata[["method"]] <- replace(mydata[["method"]], indexPesticide, 1)
mydata[["method"]] <- replace(mydata[["method"]], indexPoison, 2)
mydata[["method"]] <- replace(mydata[["method"]], indexHanging, 3)
mydata[["method"]] <- replace(mydata[["method"]], indexOthers, 4)

```


# Introduction


## Objective

The objective of the study is to estimate the incidence of serious suicide attempts (SSAs), defined as suicide attempts resulting in either death or hospitalisation, and to analyse the factors associated with fatality among the attempters. 

## Data 
The data set consists of 2571 observations of 11 variables:
\begin{itemize}
  \item \texttt{Person\_ID}: ID number, $1,...,2571$
  \item \texttt{Hospitalised}: \textit{yes} or \textit{no}
  \item \texttt{Died}: \textit{yes} or \textit{no}
  \item \texttt{Urban}: \textit{yes}, \textit{no} or \textit{unknown}
  \item \texttt{Year}: $2009$, $2010$ or $2011$
  \item \texttt{Month}: $1,...,12$
  \item \texttt{Sex}: \textit{female} or \textit{male}
  \item \texttt{Age}: years
  \item \texttt{Education}: \textit{iliterate}, \textit{primary}, \textit{Secondary}, \textit{Tertiary} or \textit{unknown}
  \item \texttt{Occupation}: one of ten categories
  \item \texttt{method}: one of nine methods
\end{itemize}

## Source

Sun J, Guo X, Zhang J, Wang M, Jia C, Xu A (2015) "Incidence and fatality of serious suicide attempts in a predominantly rural population in Shandong, China: a public health surveillance study," BMJ Open 5(2): e006762. https://doi.org/10.1136/bmjopen-2014-006762

Data downloaded via Dryad Digital Repository. https://doi.org/10.5061/dryad.r0v35


# Analysis

```{r}

# Split the raw records into the four sex-by-residence strata.
# Rows whose Urban value is neither "yes" nor "no" fall into none of them.
rural_men   <- subset(data, Sex == "male"   & Urban == "no")
rural_women <- subset(data, Sex == "female" & Urban == "no")
urban_men   <- subset(data, Sex == "male"   & Urban == "yes")
urban_women <- subset(data, Sex == "female" & Urban == "yes")

```
```{r}

# Print the structure (column names, types, first values) of the recoded table
# as a quick sanity check of the preprocessing.
str(mydata)

```


\begin{tabular}{p{13mm} p{10mm} p{25mm} p{20mm} p{25mm} p{35mm} p{20mm} }
\hline
  & \textbf{All SSAs}  &  \textbf{Hospitalised} \newline \textbf{and survived}  &  \textbf{Hospitalised} \newline \textbf{but died}  &  \textbf{Total SSA} \newline \textbf{hospitalisations} & \textbf{SSA deaths without} \newline \textbf{hospitalisation} & \textbf{Total SSA} \newline \textbf{deaths} \\
  \hline
  Urban      &      &      &     &      &      &      \\
  \;\;Female & 149  & 99   & 18  & 117  & 32   & 50   \\
  \;\;Male   & 128  & 65   & 17  & 82   & 46   & 63   \\
  \;\;Both   & 277  & 164  & 35  & 199  & 78   & 113  \\
  Rural      &      &      &     &      &      &      \\
  \;\;Female & 1134 & 598  & 100 & 698  & 436  & 536  \\
  \;\;Male   & 1079 & 474  & 103 & 577  & 502  & 605  \\
  \;\;Both   & 2213 & 1072 & 203 & 1275 & 938  & 1141 \\
  Total      &      &      &     &      &      &      \\
  \;\;Female & 1328 & 741  & 118 & 859  & 469  & 587  \\
  \;\;Male   & 1243 & 574  & 120 & 694  & 549  & 669  \\
  \;\;Both   & 2571 & 1315 & 238 & 1553 & 1018 & 1256 \\
  \hline
  
\end{tabular}


```{r}

## Remove unknown labels
# Drop every row whose Education or Urban value is 'unknown'.
# A logical keep-mask is used instead of `mydata[-which(...), ]`: when
# `which()` finds nothing, a negative empty index selects ZERO rows and would
# silently wipe the whole table. `%in%` also treats NA as "not unknown"
# instead of propagating NA into the row index.
is_unknown <- mydata$Education %in% "unknown" | mydata$Urban %in% "unknown"
mydata <- mydata[!is_unknown, ]

```

In this phase, in which we are testing different models, it is worth working with only a random sample of the data. The dataset is large, so fitting every candidate model on the whole dataset would take a lot of time.

Therefore, we will proceed as follows:

* we will draw a random sample of 50 observations from our dataset;
* we will test the models on this sample, which is large enough for this purpose without losing generality;
* we will run the final model on the whole dataset.

```{r}

# Draw 50 rows at random for the model-testing phase.
# The sample is taken over ROW POSITIONS, not over Person_ID values: rows were
# removed from `mydata` earlier, so an ID no longer equals its row position.
# Indexing by ID picked the wrong rows and, for IDs larger than nrow(mydata),
# produced all-NA rows that na.omit() then silently discarded.
# A fixed seed keeps the sub-sample, and the fit built on it, reproducible.
# NOTE(review): replace = TRUE is kept from the original, so a row can appear
# more than once; switch to FALSE if distinct observations are wanted.
set.seed(12345)
random_index <- sample(seq_len(nrow(mydata)), size = 50, replace = TRUE)

data_reduced <- mydata[random_index, ]
data_reduced <- na.omit(data_reduced)
```


```{r}

## Create Stan data
# Named list handed to Stan: the number of observations, the coefficient count
# (number of columns minus two) and one numeric vector per variable taken from
# the reduced sample.
dat <- with(data_reduced, list(
  N      = nrow(data_reduced),
  p      = ncol(data_reduced) - 2,
  died   = as.numeric(Died),
  urban  = as.numeric(Urban),
  year   = as.numeric(Year),
  season = as.numeric(Season),
  sex    = as.numeric(Sex),
  age    = as.numeric(Age),
  edu    = as.numeric(Education),
  job    = as.numeric(Occupation),
  method = as.numeric(method)
))

## Load Stan file
# Read the whole model file into a single string and echo it into the report.
fileName <- "./logistic_regression_model.stan"
stan_code <- readChar(fileName, nchars = file.size(fileName))
cat(stan_code)
```

```{r echo=FALSE}

# Run Stan
# Fit the model held in `stan_code` to the reduced sample `dat`.
# With iter = 2000, warmup = 800 and thin = 10, roughly (2000 - 800) / 10 = 120
# post-warmup draws are kept per chain.
resStan <- stan(model_code = stan_code,
                data = dat,
                chains = 5, 
                iter = 2000, 
                warmup = 800,
                thin = 10,
                refresh = 0,        # suppress sampling progress output
                seed = 12345,       # fixed seed for reproducible sampling
                control = list(adapt_delta = 0.95))
# Posterior summary of the regression coefficients only
print(resStan, pars = c('beta'))

```

```{r}
# Show traceplot (including warm-up) for beta[3] ... beta[9]
traceplot(resStan, pars = paste0("beta[", 3:9, "]"), inc_warmup = TRUE)

# Extract the posterior draws ONCE (one row per draw, one column per
# coefficient) instead of re-running extract() for every plot.
# rstan:: is spelled out because other attached packages also export a
# function called extract().
beta_draws <- rstan::extract(resStan)$beta

# Generate some scatter plots in order to see the correlations between parameters
plot(beta_draws[, 3], beta_draws[, 7],
     main = "Correlation between location and education",
     xlab = "Urban", ylab = "Education")
# qplot(extract(resStan)$beta[,3], bins = 30, geom = 'histogram', main='Histogram for Urban',
#                         xlab = 'weigths for the coefficient', ylab = 'Occurrences',
#                         col = I('red'))

# Overlay histogram and density, and mark the posterior mean.
# The plots of the most interesting parameters are presented.
# The mean values can help in designing weakly informative priors.

# Build one posterior plot: density-scaled histogram (red outline), kernel
# density line (blue) and a dashed vertical line at the posterior mean (yellow).
#   draws: numeric vector of posterior draws of one coefficient
#   title: plot title
# Returns a ggplot object.
plot_posterior_density <- function(draws, title) {
  ggplot(data.frame(value = draws), aes(x = value)) +
    geom_histogram(aes(y = ..density..), colour = "red", bins = 50) +
    # the original call ended in a stray trailing comma that passed an empty
    # extra argument to geom_line(); it is removed here
    geom_line(aes(y = ..density..), size = 1, colour = "blue", stat = "density") +
    geom_vline(xintercept = mean(draws), colour = "yellow",
               linetype = "dashed", size = 1) +
    labs(title = title, x = "Values of weigth", y = "Occurences")
}

# The index-to-variable mapping follows the original plot titles;
# NOTE(review): confirm it against the coefficient order in the Stan model.
plot_1 <- plot_posterior_density(beta_draws[, 3], "Urbans")
plot_2 <- plot_posterior_density(beta_draws[, 5], "Sex")
plot_3 <- plot_posterior_density(beta_draws[, 6], "Age")
plot_4 <- plot_posterior_density(beta_draws[, 7], "Education")
plot_5 <- plot_posterior_density(beta_draws[, 8], "Occupation")

ggplot2.multiplot(plot_1, plot_2, plot_3, plot_4, plot_5, cols = 3)

```

From the analysis above, and especially from the histograms, it is clear that the most important parameters in our analysis are whether people come from urban or rural areas, their education and their occupation. As a matter of fact, the mean and the maximum values of the coefficients related to those parameters have the largest magnitude. This means that those parameters are weighted more in the multiple regression function of the model.

```{r}
# Collect the posterior draws of the first (ncol(data_reduced) - 2)
# coefficients in a data frame with columns V1, V2, ... for plotting.
#
# The previous loop header `1:ncol(data_reduced) - 2` parsed as
# `(1:ncol(data_reduced)) - 2`, i.e. i = -1, 0, 1, ...: the i = -1 pass added
# every column except the first in one go, and the later passes added them
# again, so all columns but V1 ended up doubled. Copying the matrix directly
# removes both the precedence bug and the loop.
n_coef <- ncol(data_reduced) - 2
beta_matrix <- rstan::extract(resStan)$beta[, seq_len(n_coef), drop = FALSE]
rows <- nrow(beta_matrix)

beta_df <- as.data.frame(beta_matrix)
# Name the columns explicitly so that V3, V5, ... exist whatever dimnames
# the extracted array carries.
names(beta_df) <- paste0("V", seq_len(ncol(beta_df)))


```


```{r}
# Show traceplot (including warm-up) for beta[3] ... beta[9]
traceplot(resStan, pars = paste0("beta[", 3:9, "]"), inc_warmup = TRUE)

# Generate some scatter plots in order to see the correlations between parameters
# (columns V3, V5, ... of `beta_df` hold the draws of beta[3], beta[5], ...)
scatter_1 <- ggplot(beta_df, aes(x = V3, y = V7)) +
  ggtitle("Correlation between location and education") +
  xlab("Urban") + ylab("Education") +
  geom_point(size = 1, shape = 23) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "blue")

scatter_2 <- ggplot(beta_df, aes(x = V3, y = V8)) +
  ggtitle("Correlation between location and occuption") +
  xlab("Urban") + ylab("Occupation") +
  geom_point(size = 1, shape = 23) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "blue")

scatter_3 <- ggplot(beta_df, aes(x = V5, y = V6)) +
  ggtitle("Correlation between gender and age") +
  xlab("Gender") + ylab("Age") +
  geom_point(size = 1, shape = 23) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "blue")


ggplot2.multiplot(scatter_1, scatter_2, scatter_3, cols = 1)


# Overlay histogram and density, and mark the posterior mean.
# The plots of the most interesting parameters are presented.
# The mean values can help in designing weakly informative priors.

# Extract the posterior draws ONCE instead of re-running extract() for every
# plot. rstan:: is spelled out because other attached packages also export a
# function called extract().
beta_draws <- rstan::extract(resStan)$beta

# Build one posterior plot: density-scaled histogram (red outline), kernel
# density line (blue) and a dashed vertical line at the posterior mean (yellow).
#   draws: numeric vector of posterior draws of one coefficient
#   title: plot title
# Returns a ggplot object.
plot_posterior_density <- function(draws, title) {
  ggplot(data.frame(value = draws), aes(x = value)) +
    geom_histogram(aes(y = ..density..), colour = "red", bins = 50) +
    # the original call ended in a stray trailing comma that passed an empty
    # extra argument to geom_line(); it is removed here
    geom_line(aes(y = ..density..), size = 1, colour = "blue", stat = "density") +
    geom_vline(xintercept = mean(draws), colour = "yellow",
               linetype = "dashed", size = 1) +
    labs(title = title, x = "Values of weigth", y = "Occurences")
}

# The index-to-variable mapping follows the original plot titles;
# NOTE(review): confirm it against the coefficient order in the Stan model.
plot_1 <- plot_posterior_density(beta_draws[, 3], "Urbans")
plot_2 <- plot_posterior_density(beta_draws[, 5], "Sex")
plot_3 <- plot_posterior_density(beta_draws[, 6], "Age")
plot_4 <- plot_posterior_density(beta_draws[, 7], "Education")
plot_5 <- plot_posterior_density(beta_draws[, 8], "Occupation")

ggplot2.multiplot(plot_1, plot_2, plot_3, plot_4, plot_5, cols = 3)

```

From the analysis above, and especially from the histograms, it is clear that the most important parameters in our analysis are whether people come from urban or rural areas, their education, their occupation and, partially, whether they are men or women. As a matter of fact, the mean and the maximum values of the coefficients related to those parameters have the largest magnitude. This means that those parameters are weighted more in the multiple regression function of the model.

Therefore, for further analysis, it will be useful to build models that use only these parameters, in order to obtain a more precise evaluation based on the most relevant parameters alone. 

## Frequentist approach 

```{r}

# Frequentist counterpart of the Bayesian model: logistic regression of death
# on the recoded covariates, each entered as a single numeric term.
# NOTE(review): this is fitted on the full `mydata`, whereas `resStan` was
# fitted on the 50-row `data_reduced`; keep that in mind when comparing them.
# NOTE(review): Season and method are nominal codes but enter as linear
# numeric terms here - confirm this is intended.
outcomeModel <- glm(as.numeric(Died) ~ as.numeric(Urban) + 
                                       as.numeric(Year) + 
                                       as.numeric(Season) + 
                                       as.numeric(Sex) + 
                                       as.numeric(Age) + 
                                       as.numeric(Education) + 
                                       as.numeric(Occupation) + 
                                       as.numeric(method), data = mydata,
                    family = binomial(link = "logit"))
# Coefficient table with standard errors and Wald tests
summary(outcomeModel)

```


## Comparison

```{r}
## Bayesian
# Posterior summary of the regression coefficients from the Stan fit
print(resStan, pars = c("beta"))

## Frequentist
# Coefficient table of the glm fit; exp = FALSE keeps the estimates on the
# coefficient (log-odds) scale instead of exponentiating them
tableone::ShowRegTable(outcomeModel, exp = FALSE)

```

## Some clustering on the data

Let us try to cluster the data using the specific year in order to do a prediction on the following year

```{r}

# Split the recoded table by calendar year.
# `mydata$Year` no longer holds calendar years: the preprocessing chunk
# recoded it to 1 = 2009, 2 = 2010, 3 = 2011. Comparing against 2009/2010/2011
# therefore matched nothing and produced empty subsets; the codes are used
# here instead.
indexYear2009 <- which(mydata$Year == 1)
data_year_2009 <- mydata[indexYear2009, ]

indexYear2010 <- which(mydata$Year == 2)
data_year_2010 <- mydata[indexYear2010, ]

# The 2011 index was computed but its subset was never built.
indexYear2011 <- which(mydata$Year == 3)
data_year_2011 <- mydata[indexYear2011, ]
```

# Conclusions

```{r}

# ---- Rebuild a coded data set from the raw file for the final models --------
# Unlike `mydata`, this table also drops 'others/unknown' occupations, uses a
# four-level occupation code and leaves Age untouched (no age-group recoding
# is applied here).
datFile <- "suicide attempt data_2.csv"
datCsv <- read.csv(datFile, stringsAsFactors = FALSE)
datSet <- as.data.frame(datCsv)

# Month is replaced by a season code further down
datSet$Season <- datSet$Month
datSet$Month <- NULL

## Remove unknown labels
# A logical keep-mask is used instead of `datSet[-c(...), ]`: a negative EMPTY
# index selects zero rows, so the table would be wiped if no label matched.
is_unknown <- datSet$Education %in% "unknown" |
  datSet$Urban %in% "unknown" |
  datSet$Occupation %in% "others/unknown"
datSet <- datSet[!is_unknown, ]

# Hospitalised
indexHosp   <- which(datSet$Hospitalised == "yes")
indexNoHosp <- which(datSet$Hospitalised == "no")
datSet$Hospitalised[indexHosp]   <- 1    # 1 --> yes
datSet$Hospitalised[indexNoHosp] <- 0    # 0 --> no


# Died
indexDied   <- which(datSet$Died == "yes")
indexNoDied <- which(datSet$Died == "no")
datSet$Died[indexDied]   <- 1    # 1 --> yes
datSet$Died[indexNoDied] <- 0    # 0 --> no


# Urban
indexUrban   <- which(datSet$Urban == "yes")
indexNoUrban <- which(datSet$Urban == "no")
datSet$Urban[indexUrban]   <- 1    # 1 --> yes
datSet$Urban[indexNoUrban] <- 0    # 0 --> no

# Year (all indices are computed before the first write)
indexYear2009 <- which(datSet$Year == 2009)
indexYear2010 <- which(datSet$Year == 2010)
indexYear2011 <- which(datSet$Year == 2011)
datSet$Year[indexYear2009] <- 1    # 1 --> 2009
datSet$Year[indexYear2010] <- 2    # 2 --> 2010
datSet$Year[indexYear2011] <- 3    # 3 --> 2011

# Sex
indexMale   <- which(datSet$Sex == "male")
indexFemale <- which(datSet$Sex == "female")
datSet$Sex[indexMale]   <- 1    # 1 --> male
datSet$Sex[indexFemale] <- 0    # 0 --> female

# Education (labels are spelled exactly as in the data file)
indexEduZero  <- which(datSet$Education == "iliterate")
indexEduOne   <- which(datSet$Education == "primary")
indexEduTwo   <- which(datSet$Education == "Secondary")
indexEduThree <- which(datSet$Education == "Tertiary")

datSet$Education[indexEduZero]   <- 0   # 0 --> iliterate
datSet$Education[indexEduOne]    <- 1   # 1 --> primary
datSet$Education[indexEduTwo]    <- 2   # 2 --> Secondary
datSet$Education[indexEduThree]  <- 3   # 3 --> Tertiary


# Occupation
indexUnEmpl <- which(datSet$Occupation == "unemployed")
indexFarm   <- which(datSet$Occupation == "farming")
indexProf   <- which(datSet$Occupation == "business/service" |
                       datSet$Occupation == "professional")
# Every remaining row. setdiff() on the row positions stays correct even when
# none of the three groups above matched; `x[-integer(0)]` would select nothing.
indexOtherJob <- setdiff(seq_len(nrow(datSet)),
                         c(indexUnEmpl, indexFarm, indexProf))

datSet$Occupation[indexUnEmpl]   <- 0    # 0 --> unemployed
datSet$Occupation[indexFarm]     <- 1    # 1 --> farming
datSet$Occupation[indexProf]     <- 2    # 2 --> business/service or professional
datSet$Occupation[indexOtherJob] <- 3    # 3 --> all remaining occupations

# Method
indexPesticide <- which(datSet$method == "Pesticide")
indexPoison    <- which(datSet$method == "Other poison")
indexHanging   <- which(datSet$method == "hanging")
indexOthers    <- which(datSet$method != "Pesticide" &
                        datSet$method != "Other poison" &
                        datSet$method != "hanging")

datSet$method[indexPesticide] <- 1 # 1 --> Pesticide
datSet$method[indexPoison]    <- 2 # 2 --> Other poison
datSet$method[indexHanging]   <- 3 # 3 --> hanging
datSet$method[indexOthers]    <- 4 # 4 --> All others

# Season
# `Season` still holds the month number here. All four index vectors are
# computed BEFORE the first write, because the season codes 1-4 would
# otherwise be mistaken for months.
indexSpring <- which(datSet$Season >= 3 & datSet$Season <= 5)
indexSummer <- which(datSet$Season >= 6 & datSet$Season <= 8)
indexAutumn <- which(datSet$Season >= 9 & datSet$Season <= 11)
indexWinter <- which(datSet$Season == 12 | datSet$Season <= 2)

datSet$Season[indexSpring] <- 1  # 1 --> Spring
datSet$Season[indexSummer] <- 2  # 2 --> Summer
datSet$Season[indexAutumn] <- 3  # 3 --> Autumn
datSet$Season[indexWinter] <- 4  # 4 --> Winter


# qplot(datSet$Age, geom = 'blank', xlab = 'Values of weigth', ylab = 'Occurences', main='Urbans') +   
#   geom_histogram(aes(y = ..density..),col = I('red'), bins = 50) + 
#   geom_line(aes(y = ..density..), size = 1, col = I('blue'), stat = 'density', ) +
#   geom_vline(aes(xintercept=mean(datSet$Age)), col=I('yellow'), linetype="dashed", size=1) 
```


```{r}

# Data list for the reduced model with priors: the outcome (died) plus five
# covariates taken from the full coded table `datSet`. Age is passed as stored
# in `datSet`, which the chunk above does not recode.
data_4_fit <- list(N = nrow(datSet),
                   p = 6,  # hard-coded coefficient count; presumably must match the Stan model - TODO confirm
                   age = as.numeric(datSet$Age),
                   died = as.numeric(datSet$Died),
                   sex = as.numeric(datSet$Sex),
                   job = as.numeric(datSet$Occupation),
                   urban = as.numeric(datSet$Urban),
                   edu = as.numeric(datSet$Education))


# Read the model file into one string and echo it into the report.
# Note: this overwrites the `stan_code` used for the first model.
fileName <- "./stan_model_with_prior.stan"
stan_code <- readChar(fileName, file.info(fileName)$size)
cat(stan_code)

# Run Stan
# Same sampler settings as the first fit, now on the whole `datSet`.
fitStan <- stan(model_code = stan_code,
                data = data_4_fit,
                chains = 5, 
                iter = 2000, 
                warmup = 800,
                thin = 10,
                refresh = 0,
                seed = 12345,
                control = list(adapt_delta = 0.95))
# Posterior summary of the parameters named 'beta' and 'sigma'
print(fitStan, pars = c('beta', 'sigma'))


```

```{r}

# PSIS-LOO cross-validation for the reduced model.
# extract_log_lik() reads the pointwise log-likelihood from the fit (by
# default a quantity named `log_lik` - presumably defined in the Stan file,
# TODO confirm). merge_chains = FALSE keeps the chain dimension, which
# relative_eff() needs to compute the relative effective sample sizes.
log_like_model <- extract_log_lik(fitStan, merge_chains = FALSE)
r_eff <- relative_eff(exp(log_like_model))
loo_mod <- loo(log_like_model, r_eff = r_eff)
print(loo_mod)
plot(loo_mod)   # Pareto-k diagnostic plot
# Pick the elpd_loo point estimate by name rather than by position, so the
# result does not depend on the layout of the estimates matrix.
elpd_loo_mod <- loo_mod$estimates["elpd_loo", "Estimate"]
elpd_loo_mod

```


```{r}

# Data list for the complete model with priors: the outcome (died) plus every
# coded covariate in `datSet`, including hospitalisation, year, method and
# season.
data_4_fit_complete <- list(N = nrow(datSet),
                   p = 10,  # hard-coded coefficient count; presumably must match the Stan model - TODO confirm
                   age = as.numeric(datSet$Age),
                   died = as.numeric(datSet$Died),
                   hosp = as.numeric(datSet$Hospitalised),
                   year = as.numeric(datSet$Year),
                   sex = as.numeric(datSet$Sex),
                   job = as.numeric(datSet$Occupation),
                   urban = as.numeric(datSet$Urban),
                   edu = as.numeric(datSet$Education),
                   method = as.numeric(datSet$method),
                   season = as.numeric(datSet$Season))


# Read the model file into one string and echo it into the report
fileName <- "./stan_model_prior_all_params.stan"
stan_code_complete <- readChar(fileName, file.info(fileName)$size)
cat(stan_code_complete)

# Run Stan
# Same sampler settings as the other fits, on the whole `datSet`.
fitStan_complete <- stan(model_code = stan_code_complete,
                data = data_4_fit_complete,
                chains = 5, 
                iter = 2000, 
                warmup = 800,
                thin = 10,
                refresh = 0,
                seed = 12345,
                control = list(adapt_delta = 0.95))
# Posterior summary of the parameters named 'beta' and 'sigma'
print(fitStan_complete, pars = c('beta', 'sigma'))


```


```{r}

# PSIS-LOO cross-validation for the complete model.
# extract_log_lik() reads the pointwise log-likelihood from the fit (by
# default a quantity named `log_lik` - presumably defined in the Stan file,
# TODO confirm). merge_chains = FALSE keeps the chain dimension, which
# relative_eff() needs to compute the relative effective sample sizes.
log_like_model_compl <- extract_log_lik(fitStan_complete, merge_chains = FALSE)
# Note: this overwrites the `r_eff` computed for the reduced model.
r_eff <- relative_eff(exp(log_like_model_compl))
loo_mod_comp <- loo(log_like_model_compl, r_eff = r_eff)
print(loo_mod_comp)
plot(loo_mod_comp)   # Pareto-k diagnostic plot
# Pick the elpd_loo point estimate by name rather than by position, so the
# result does not depend on the layout of the estimates matrix.
elpd_loo_mod_comp <- loo_mod_comp$estimates["elpd_loo", "Estimate"]
elpd_loo_mod_comp

```