Covid_XG3_hGHyz71E.rmd

---
title: "Covid spread"
author: "Dr Fad"
date: "Apr 7, 2020"
output:
  html_document:
    toc: yes 
    toc_depth: 4
  word_document:
    toc: yes 
    toc_depth: 4
  pdf_document: 
    toc: yes
    toc_depth: 4
---

```{r setup, include=FALSE}
# Default for every later chunk: show the chunk's source code in the knitted
# output (echo = TRUE).
knitr::opts_chunk$set(echo = TRUE)
```
## Load libraries

```{r cars}
# Remove every object that ls() lists in the current environment before the
# analysis starts.
# NOTE(review): when the chunk is run interactively this also deletes unrelated
# objects in the user's workspace; it does not detach packages or reset options.
rm(list = ls())
## Import packages
library(ggplot2)
library(gridExtra)
library(repr)
library(dplyr)
library(caret)
library(e1071)
library(MLmetrics)
library(klaR)
library(sqldf) #Running sql type of query
library(Matrix) # For matrix conversion used for xgboost
library(beepr) #For output sound
library(stringi) #For text manipulation
library(stringr) #For text manipulation
library(lubridate) #For manipulating dates
library(tictoc) #To calculate running time
library(catboost) #For modelling
library(Boruta)#For feature selection
library(xgboost)
library(Matrix)
library(purrr)
library(ROCR) #Visualising performance of classifiers
library(plyr) #For mapping and renaming
library(tidyr)#To reshape data



# Session-wide display settings: 4 x 4 plots for repr-based front ends, and a
# very large `scipen` penalty so numbers print in fixed rather than scientific
# notation.
options(
  repr.plot.width = 4,
  repr.plot.height = 4,
  scipen = 99999999
)
```

## Load data
```{r}
# Read the sample submission file. Its `Territory.X.Date` column is split into
# territory and date in the next chunk. Strings are kept as character
# (spelled out as FALSE because the shorthand `F` is an ordinary variable that
# can be reassigned).
Covid_sub <- read.csv("Zindi/Covid 19/SampleSubmission.csv", stringsAsFactors = FALSE)
# Preview the first rows.
head(Covid_sub)
```
## Clean Submit
```{r}
# Split every "Territory X Date" key on "X" and transpose the result so that
# each key becomes one row with the territory in V1 and the date in V2.
test <- Covid_sub$Territory.X.Date %>%
  strsplit("X") %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  as.matrix() %>%
  t() %>%
  as.data.frame(stringsAsFactors = FALSE, row.names = FALSE)

# Strip the blanks that surrounded the "X" separator.
test$V1 <- str_trim(test$V1, side = "both")
test$V2 <- test$V2 %>%
  as.character() %>%
  str_trim(side = "both")

# Distinct territories requested by the submission file.
print("Number of countries:")
Countries <- unique(test$V1)
length(Countries)

# Normalise the date text: keep characters 2-8 (drops the first character),
# then turn the first "/0" into "/" to remove the day's leading zero.
test$V2 <- str_replace(str_sub(test$V2, 2, 8), "/0", "/")

# Rebuild the key, then keep only column 3 (the new key) followed by column 2.
Covid_sub$Territory_Date <- paste(test$V1, "X", test$V2)
Covid_sub <- Covid_sub[, c(3, 2)]
Covid_sub
```

## Load live data
```{r}
# Load the JHU confirmed-cases and deaths time series (as at 18 April
# inclusive), rename countries to the spelling used in the submission file,
# and aggregate to one row per country.
#
# The cases and deaths files go through exactly the same steps, so the steps
# live in two small helpers instead of being written out twice.

# Directory that holds the JHU global time-series CSV files.
jhu_dir <- file.path(
  "Zindi", "Covid 19", "COVID-19_for_submission",
  "csse_covid_19_data", "csse_covid_19_time_series"
)

# Province/State values that the submission file lists as territories in their
# own right.
territories <- c('St Martin', 'Faroe Islands', 'Saint Barthelemy', 'French Polynesia', 'French Guiana', 'Mayotte',
          'Guadeloupe', 'Curacao', 'Cayman Islands', 'Reunion', 'Aruba', 'Montserrat', 'Greenland', 'New Caledonia', 'Bermuda')

# JHU country name (vector names) -> name written into Country.Region
# (vector values).
# NOTE(review): "Korea, South" is mapped to "Democratic People's Republic of
# Korea (the)" and "Sudan" to "South Sudan"; these look like different
# countries - confirm against the names in the submission file.
# NOTE(review): "CÃ´te d'Ivoire" is kept byte-for-byte as in the original
# mapping; presumably it matches how the submission file is read - confirm.
country_name_map <- c(
  "United Arab Emirates" = "United Arab Emirates (the)",
  "Philippines" = "Philippines (the)",
  "Dominican Republic" = "Dominican Republic (the)",
  "Brunei" = "Brunei Darussalam",
  "US" = "United States of America (the)",
  "Iran" = "Iran (Islamic Republic of)",
  "Korea, South" = "Democratic People's Republic of Korea (the)",
  "Vietnam" = "Viet Nam",
  "Russia" = "Russian Federation (the)",
  "Moldova" = "Republic of Moldova (the)",
  "Bolivia" = "Bolivia (Plurinational State of)",
  "United Kingdom" = "United Kingdom of Great Britain and Northern Ireland (the)",
  "Congo (Kinshasa)" = "Democratic Republic of the Congo (the)",
  "Cote d'Ivoire" = "CÃ´te d'Ivoire",
  "Sudan" = "South Sudan",
  "Venezuela" = "Venezuela (Bolivarian Republic of)",
  "Central African Republic" = "Central African Republic (the)",
  "Congo (Brazzaville)" = "Congo (the)",
  "Netherlands" = "Netherlands (the)",
  "Tanzania" = "United Republic of Tanzania (the)",
  "Gambia" = "Gambia (the)",
  "Bahamas" = "Bahamas (the)",
  "Niger" = "Niger (the)",
  "Syria" = "Syrian Arab Republic (the)",
  "Laos" = "Lao People's Democratic Republic (the)",
  "Taiwan*" = "Taiwan"
)

# Read one JHU time-series CSV and standardise its Country.Region column:
# rows whose Province.State is one of `territories` take the province name as
# their country, then names are recoded through `country_name_map`.
load_jhu_series <- function(path) {
  series <- read.csv(path, stringsAsFactors = FALSE)
  series$Country.Region <- ifelse(
    series$Province.State %in% territories,
    series$Province.State,
    series$Country.Region
  )
  series$Country.Region <- plyr::mapvalues(
    series$Country.Region,
    from = names(country_name_map),
    to = unname(country_name_map)
  )
  series
}

# Keep only the countries requested by the submission file, drop columns 1, 3
# and 4 (everything but Country.Region and the per-date columns), and sum the
# remaining columns within each country.
collapse_to_country <- function(series) {
  series <- series[series$Country.Region %in% Countries, ]
  series <- series[, -c(1, 3, 4)]
  # summarise_all(sum) replaces the deprecated summarise_all(funs(sum)).
  series %>%
    dplyr::group_by(Country.Region) %>%
    dplyr::summarise_all(sum)
}

#Load data from JHU as at 18 April inclusive
Covid_cases <- load_jhu_series(
  file.path(jhu_dir, "time_series_covid19_confirmed_global.csv")
)
Covid_deaths <- load_jhu_series(
  file.path(jhu_dir, "time_series_covid19_deaths_global.csv")
)
head(Covid_deaths)
unique(Covid_deaths$Country.Region)

# `Check` holds every submission country; after the right join, `Lat` is NA
# for countries that have no row in the JHU cases file.
Check <- data.frame(Country.Region = Countries, type = 1)
Check2 <- Covid_cases[, c(2, 3)]
Check2 <- Check2[!duplicated(Check2$Country.Region), ]
Check <- merge(Check2, Check, by = "Country.Region", all.y = TRUE)

# One row per submission country, one column per date.
Covid_cases <- collapse_to_country(Covid_cases)
Covid_deaths <- collapse_to_country(Covid_deaths)
length(unique(Covid_cases$Country.Region))
Covid_sub
```

## Assemble train from source and clean

Update the date-column range (first and last date) manually whenever the source data changes.
```{r}
# Reshape the per-country deaths table from one column per date to one row per
# (country, date), then rewrite Country.Region as the "Country X m/d/yy" key.
library(tidyr)#To reshape data
# NOTE(review): a constant `dummy` column is created here and `dummy` is also
# the key name passed to gather() on the next line; confirm which column
# `$dummy` refers to afterwards.
Covid_deaths$dummy <- 1
# Stack the date columns X1.22.20 .. X4.18.20 into key `dummy` / value `deaths`.
# The range is hard-coded and must be edited when the data is refreshed.
# (gather() is superseded by pivot_longer() in current tidyr.)
Covid_deaths <- Covid_deaths %>% gather(dummy, deaths,X1.22.20:X4.18.20)#Put start and last date here
Covid_deaths
#Remove X in dummy column
Covid_deaths$dummy <- str_replace(Covid_deaths$dummy, "X","")
#Convert dummy to dates
Covid_deaths$dummy <- as.Date(Covid_deaths$dummy, format = "%m.%d.%y")
#Sort by country, then date
Covid_deaths <- Covid_deaths[with(Covid_deaths,order(Country.Region,dummy)),]
#Format dummy as zero-padded mm/dd/yy text
Covid_deaths$dummy <- format(Covid_deaths$dummy, format = "%m/%d/%y")
#Format dummy in preparation for output
# Keeps characters 2-8, i.e. drops the first character. For months 01-09 that
# is the month's leading zero; for months 10-12 it would drop the "1".
Covid_deaths$dummy <- str_sub(Covid_deaths$dummy,2,8)
Covid_deaths$dummy <- str_replace(Covid_deaths$dummy,"/0","/")#Format the days removing leading zero's
# Overwrite the country with the "Country X date" key.
Covid_deaths$Country.Region <- paste(Covid_deaths$Country.Region,"X",Covid_deaths$dummy)

Covid_deaths
```
## Build for countries without JHU data
```{r}
# Build zero-death rows for the submission territories that have no JHU data.
# After the right join that produced `Check`, those territories have NA `Lat`.
Countries_no_data <- Check[is.na(Check$Lat), ]

# Every calendar day in the submission range, 6 March to 7 June 2020.
# The days are generated as Dates with by = "day". The previous version stepped
# a POSIXct in the local time zone by 86400 seconds, which drifts by an hour
# when the zone changes to or from daylight-saving time inside the range and
# can then drop the last day or shift dates.
submission_days <- seq(as.Date("2020-03-06"), as.Date("2020-06-07"), by = "day")
no_data <- expand.grid(
  Country.Region = Countries_no_data$Country.Region,
  date = submission_days
) # Range in submit file

# Sort by territory, then date.
no_data <- no_data[with(no_data, order(Country.Region, date)), ]
no_data$Country.Region <- as.character(no_data$Country.Region)

# Format the date the same way as the JHU deaths table: zero-padded mm/dd/yy,
# drop the first character (the month's leading zero for March-June), then
# remove the day's leading zero.
no_data$date <- format(no_data$date, format = "%m/%d/%y")
no_data$date <- str_sub(no_data$date, 2, 8)
no_data$date <- str_replace(no_data$date, "/0", "/")

# Match the column layout of Covid_deaths: key text in `dummy`, zero deaths,
# and Country.Region rewritten as the "Country X date" key.
colnames(no_data)[2] <- "dummy"
no_data$deaths <- 0
no_data$Country.Region <- paste(no_data$Country.Region, "X", no_data$dummy)

```
## Combine available data with no data
```{r}
# Stack the JHU-based rows on top of the zero-filled rows and keep only the
# key and the death count.
Hold <- rbind(Covid_deaths, no_data)[, c("Country.Region", "deaths")]

# The key column is named like the submission file's key so the two can be
# joined on it.
names(Hold)[1] <- "Territory_Date"

# Right join: every submission row is kept; keys without a death count get 0.
Covid_sub <- merge(Hold, Covid_sub, by = "Territory_Date", all.y = TRUE)
Covid_sub$deaths[is.na(Covid_sub$deaths)] <- 0

```

## Manipulate output further

```{r}
# Split each "Country X date" key on "X" and transpose, giving one row per key
# with the country in V1 and the date text in V2.
country_date <- Covid_sub$Territory_Date %>%
  strsplit("X") %>%
  as.data.frame() %>%
  as.matrix() %>%
  t() %>%
  as.data.frame(stringsAsFactors = FALSE, row.names = FALSE)

# Strip the blanks left around the separator, then give the columns real names.
country_date$V1 <- str_trim(country_date$V1, side = "both")
country_date$V2 <- str_trim(country_date$V2, side = "both")
colnames(country_date)[1:2] <- c("country", "date")

# Attach both columns to the submission table and parse the date text.
Covid_sub <- cbind(Covid_sub, country_date)
Covid_sub$date <- as.Date(Covid_sub$date, format = "%m/%d/%y")

# Order chronologically within each country.
Covid_sub <- Covid_sub[order(Covid_sub$country, Covid_sub$date), ]

# Spot check one country.
Covid_sub[Covid_sub$country == "CÃ´te d'Ivoire", ]

```
## Create death growth rate

```{r}
# Derive a day-over-day death growth rate, pick a per-country rate from the
# quantiles of the most recent observed rates, and build the multiplier series
# `death_growth_rate2` that is used to project deaths after 2020-04-18.

# Growth rate = (deaths - previous deaths) / previous deaths.
# NOTE(review): plyr is attached after dplyr in the library chunk, so the
# unqualified mutate() here presumably resolves to plyr::mutate, which does not
# apply group_by() groups; if so, lag() runs across country boundaries - confirm.
Covid_sub <- Covid_sub %>%   group_by(country) %>% mutate(death_growth_rate = ((deaths-lag(deaths))/lag(deaths)))
summary(Covid_sub$death_growth_rate)
# Replace every NA in the table with 0, then zero the remaining non-finite
# rates (division by a zero count gives Inf).
Covid_sub[is.na(Covid_sub)] <- 0
Covid_sub$death_growth_rate[!is.finite(Covid_sub$death_growth_rate)] <- 0
# Back-cast: previous deaths scaled by (1 + growth rate).
Covid_sub <- Covid_sub %>%   group_by(country) %>% mutate(death_new = (1+death_growth_rate)* lag(deaths))

#Subset to get rate for last 7 days, use min and 10th percentile for each country
# NOTE(review): the filter below keeps 2020-04-11 through 2020-04-18 inclusive,
# which is 8 calendar dates, and the quantiles computed are 10/20/30/40%.
rate_df <- Covid_sub[Covid_sub$date >= "2020-04-11" & Covid_sub$date <= "2020-04-18",]
#Get maximum deaths by country
max_death <- Covid_sub %>% dplyr::group_by(country) %>% dplyr::summarise(max_death = max(deaths))

#Prepare quantile calculations
# One quantile function per probability, named q10g_rate .. q40g_rate.
p <- c(0.1, 0.2,0.3,0.4)
p_names <- map_chr(p, ~paste0("q",.x*100, "g_rate"))
p_funs <- map(p, ~partial(quantile, probs = .x, na.rm = TRUE)) %>% 
  set_names(nm = p_names)

# Per-country quantiles of the growth rate over the window above.
# (funs() is deprecated in current dplyr.)
rate_df <- rate_df %>% 
                              group_by(country) %>% 
                              summarize_at( vars(death_growth_rate), funs(!!!p_funs))
#Merge to covid_sub
Covid_sub <- merge(rate_df,Covid_sub , by="country", all.y= T)
Covid_sub <- merge(max_death,Covid_sub , by="country", all.y= T)


# Choose the rate for dates after 2020-04-18 by the country's maximum deaths:
#   max_death > 240        -> q30g_rate
#   max_death < 40         -> q30g_rate
#   60 < max_death < 240   -> q40g_rate
#   max_death > 40 & > 60  -> q20g_rate
#   otherwise              -> the observed death_growth_rate
# As written, the fourth condition is only reached when max_death is exactly
# 240, and countries with 40 <= max_death <= 60 fall through to the observed
# rate. The author's trailing comment explains this is kept on purpose to
# reproduce the submitted predictions.
Covid_sub$death_growth_rate2 <- ifelse(Covid_sub$date > "2020-04-18" & Covid_sub$max_death >240,
                                       Covid_sub$q30g_rate,ifelse(Covid_sub$date > "2020-04-18" &
                                                                    Covid_sub$max_death < 40,Covid_sub$q30g_rate,
                                                                  ifelse(Covid_sub$date > "2020-04-18" & Covid_sub$max_death >60 & Covid_sub$max_death < 240,Covid_sub$q40g_rate,ifelse(Covid_sub$date > "2020-04-18" & Covid_sub$max_death >40 & Covid_sub$max_death >60,Covid_sub$q20g_rate, Covid_sub$death_growth_rate))))#Mistake here, it should be < 60. But the submitted version was a mistake resulting in affecting predictions for 8 countries. South Africa inclusive



# Turn the rate into a multiplier (1 + rate).
Covid_sub$death_growth_rate2 <- 1 + Covid_sub$death_growth_rate2


#Sort
Covid_sub <- Covid_sub[with(Covid_sub, order(country,date)),]
#########################################################################################
#Create flag
# Flag is 0 only on 2020-04-19 and 1 on every other date.
Covid_sub$Flag <- ifelse(Covid_sub$date == "2020-04-19",0,1)#From date you want to predict from i.e Apr19

#Make rates NAs in preparation of sequential rates
Covid_sub$death_growth_rate2 <- ifelse(Covid_sub$date > "2020-04-19",NA,
                                       Covid_sub$death_growth_rate2)#From pred date
# A multiplier of exactly 1 (zero growth) is raised by 0.17 to 1.17; NA stays NA.
Covid_sub$death_growth_rate2 <- ifelse(Covid_sub$death_growth_rate2 == 1,
                                       0.17 +Covid_sub$death_growth_rate2,
                                       Covid_sub$death_growth_rate2)
# The multiplier on the final date is pinned to 1.
Covid_sub$death_growth_rate2 <- ifelse(Covid_sub$date == "2020-06-07",1,Covid_sub$death_growth_rate2)#Last date

#Fill rates based on decline to 1
library(data.table)
setDT(Covid_sub)
# For the Flag == 1 rows of each country, overwrite the multiplier with an
# evenly spaced sequence from that country's Flag == 0 value to the last value
# in the group (seq() of length .N + 1 with the first element dropped). The
# start value is looked up by group number (.GRP), so it relies on both
# groupings listing countries in the same order.
# NOTE(review): Flag == 1 covers every date other than 2020-04-19, so the
# sequence is also spread over rows before that date - confirm this is intended.
Covid_sub[Flag == 1, death_growth_rate2 := seq(Covid_sub[Flag == 0, last(death_growth_rate2), by = .(Flag, country)][,V1][.GRP], last(death_growth_rate2), length.out = .N + 1)[-1], by = .(Flag, country)]

Covid_sub <- as.data.frame(Covid_sub)

```

```{r}
# Project `death_new` forward for dates after 2020-04-18 by compounding the
# daily multipliers in `death_growth_rate2`.
# Covid_sub$flag <- ifelse(Covid_sub$date > "2020-04-06",1,0)
# Blank out death_new after the last observed date so it can be refilled.
Covid_sub$death_new <- ifelse(Covid_sub$date > "2020-04-18", NA,Covid_sub$death_new)

library(data.table)
# setDT() converts Covid_sub to a data.table in place; `ix` holds the row
# numbers whose death_new is NA.
ix <- setDT(Covid_sub)[is.na(death_new), which=TRUE]
# Step 1: carry the last non-NA death_new forward over the whole table (no
# grouping is given to nafill here).
# Step 2: on the rows in `ix` only, multiply the carried value by the running
# product of the multipliers, computed separately per country.
Covid_sub[, death_new := as.double(nafill(death_new, "locf"))][
    ix, death_new := death_new * cumprod(death_growth_rate2), country]
# Spot check one country.
Covid_sub[Covid_sub$country == "Netherlands (the)",c("Territory_Date","deaths","death_new","death_growth_rate2")]
```

Notes per country:

- United Kingdom of Great Britain and Northern Ireland (the): reduce rate more
- Netherlands (the)
- United States of America (the): increase rate less
- Italy: close
- Spain: close
- France: reduce rate more


## Metrics

`MAE(pred, true)`
```{r}
# Mean absolute error of the back-cast `death_new` against the reported
# `deaths` over 7-18 April 2020, overall and per death-count band.
Sub_7_13 <- Covid_sub[Covid_sub$date >= "2020-04-07" & Covid_sub$date <= "2020-04-18", ] # Test period

# Print `label` followed by the MAE over the rows of Sub_7_13 selected by the
# logical vector `in_band` (all rows by default). Returns NULL invisibly.
report_band_mae <- function(label, in_band = rep(TRUE, nrow(Sub_7_13))) {
  rows <- which(in_band)
  cat(label, MAE(Sub_7_13$death_new[rows], Sub_7_13$deaths[rows]))
  invisible(NULL)
}

deaths_7_18 <- Sub_7_13$deaths
report_band_mae("Overall 7-18 April is")
report_band_mae("\n Above 10k is ", deaths_7_18 > 10000)
report_band_mae("\n Between 10k and 240 is", deaths_7_18 > 240 & deaths_7_18 < 10000)
report_band_mae("\n Between 240 and 60 is", deaths_7_18 > 60 & deaths_7_18 < 240)
# The lower bound was 24, which contradicted the "60 and 40" label and
# overlapped the "Less than 40" band below; it is 40 now.
report_band_mae("\n Btw 60 and 40 is ", deaths_7_18 < 60 & deaths_7_18 > 40)
report_band_mae("\n Less than 40 is ", deaths_7_18 < 40)
```

## Split into train and test
```{r}
# Add a day-of-year feature and store country as a factor.
Covid_sub$yday <- yday(Covid_sub$date)
Covid_sub$country <- factor(Covid_sub$country)

# Rows up to and including 2020-04-18 are training data; later rows are the
# dates to predict. Both halves get the same column order: columns 7 and 8
# first, then 1-6, then 9-15.
covid_train <- Covid_sub[Covid_sub$date <= "2020-04-18", ][, c(7, 8, 1:6, 9:15)]
covid_test <- Covid_sub[Covid_sub$date > "2020-04-18", ][, c(7, 8, 1:6, 9:15)]
 
```
 


Notes per country:

- United Kingdom of Great Britain and Northern Ireland (the): more
- Netherlands (the)
- United States of America (the): less
- Italy: close
- Spain: close
- France: more


## Cross validation for xgboost

```{r}
# Prepare the xgboost inputs: training/test feature tables, DMatrix objects,
# cross-validation folds and the booster parameters.

# Training rows with more than 4000 deaths. They are appended to the training
# set a second time below, so they appear twice.
large_deaths <- covid_train[covid_train$deaths > 4000,]

tic()
library(ggplot2) # Data visualization
library(data.table)
library(xgboost)
library(caret)
library(Matrix)

# Training table: columns 2, 4 and 12 of covid_train (the target `deaths` plus
# two predictors - presumably max_death and death_new; confirm against the
# column order set in the split chunk), with the large-death rows repeated.
Train_XG <- rbind(covid_train[,c(2,4,12)],large_deaths[,c(2,4,12)])#Use without SMOTE

# Missing values are encoded as -999.
Train_XG[is.na(Train_XG)] <- -999

# Test table: the same two predictor columns, without the target.
Test_XG <- covid_test[,c(4,12)]
Test_XG[is.na(Test_XG)] <- -999

train <- Train_XG #training partition

# Design matrix without an intercept column.
dtrain <- sparse.model.matrix(deaths ~ . -1, data = train)
# Feature names are the matrix's column names. names() on a sparse matrix
# returns NULL, so the previous code never captured them.
feature_names <- colnames(dtrain)
target <- train$deaths

dtrain <- xgb.DMatrix( data = as.matrix(dtrain), label = target, missing= NA)

###################
#XG Boost setup
###################

dtest_F <- xgb.DMatrix(data=as.matrix( Test_XG))

###################
#Cross Validation
###################
# Set up cross-validation scheme (5-fold, held-out indices per fold)
foldsCV <- createFolds(target, k=5, list=TRUE, returnTrain=FALSE)

# Booster parameters.
# NOTE(review): booster is "gblinear"; subsample, max_depth, colsample_bytree,
# gamma and min_child_weight are tree-booster settings - confirm they are meant
# to be here.
param <- list(booster = "gblinear"
              # , objective = "multi:softprob"
              , subsample = 0.9 #Try 0.8 and 0.75 , else stick with 0.85
              , max_depth = 4
              , colsample_bytree = 0.7 #0.95,1
              , eta = 0.032
              #, lambda = 0.08
              , eval_metric = 'mae'
              # , num_class = 7
              , gamma = 0.3
              #, base_score = 0.012 #average
              , min_child_weight = 17#2,16
                )

toc()
```

Tuning notes:

- Try 1600 or 1700 rounds; results were bad at 2000.
- 3k rounds run for about 6 minutes.
- 15k+ rounds at 0.005 run for about 26 minutes.
```{r}
tic()

# Booster settings, identical to the ones defined in the setup chunk.
# Alternatives noted by the author: subsample 0.8 / 0.75 / 0.85,
# colsample_bytree 0.95 / 1, min_child_weight 2 / 16; objective, lambda,
# num_class and base_score are left unset.
param <- list(
  booster = "gblinear",
  subsample = 0.9,
  max_depth = 4,
  colsample_bytree = 0.7,
  eta = 0.032,
  eval_metric = "mae",
  gamma = 0.3,
  min_child_weight = 17
)

################
# Final model
################
# Fixed seed, then fit on the training DMatrix (3000 and 15776 rounds were
# also tried), logging every second round.
set.seed(987654321)
xgb <- xgboost::xgboost(
  params = param,
  data = dtrain,
  nrounds = 1500,
  verbose = 1,
  print_every_n = 2
)

###############
# Results
###############
# Feature importance table and plot.
imp <- xgb.importance(feature_names, model = xgb)
imp
xgb.plot.importance(imp)

# Predict on the test features and attach the predictions to the test rows.
test_new <- as.matrix(Test_XG)
Check_XG <- predict(xgb, newdata = test_new)
Covid_submit <- cbind(covid_test, Check_XG)
Covid_submit[Covid_submit$country == "Nigeria", c("Territory_Date", "deaths", "death_new", "Check_XG")]

# Observed deaths for the training dates and predicted deaths for the test
# dates, stacked under the same column names.
df_1 <- covid_train[, c("Territory_Date", "deaths", "country", "date")]
df_2 <- Covid_submit[, c("Territory_Date", "Check_XG", "country", "date")]
names(df_2) <- names(df_1)

df <- rbind(df_1, df_2)
df <- df[with(df, order(country, date)), ]
df$deaths <- round(df$deaths, 2)
df[df$country == "Greenland", ]

# Write the combined table.
write.csv(df, file = "Zindi/Covid 19/Covid_XG3_hGHyz71E.csv", row.names = FALSE)

toc()
```
'St Martin', 'Faroe Islands', 'Saint Barthelemy', 'French Polynesia', 'French Guiana', 'Mayotte',
          'Guadeloupe', 'Curacao', 'Cayman Islands', 'Reunion', 'Aruba', 'Montserrat', 'Greenland', 'New Caledonia', 'Bermuda'

```{r}

```