-
Notifications
You must be signed in to change notification settings - Fork 0
/
TitanicSurvival_R.R
130 lines (92 loc) · 3.87 KB
/
TitanicSurvival_R.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#UNCOMMENT AND RUN THE LINES BELOW IF YOU DON'T HAVE THESE PACKAGES
#install.packages('caret')
#install.packages('rpart')
#install.packages('rpart.plot')
#install.packages('rattle')
#install.packages('RColorBrewer')
#install.packages('e1071')
#install.packages('class')
#Import the necessary libraries
library(caret)
library(rpart)
library(rpart.plot)
library(rattle)
library(RColorBrewer)
library(e1071)
library(class)
# Read the three input tables from the working directory: the training set,
# the test set, and the reference labels that the sections below compare
# their predictions against.
train_data <- read.csv(file = "train.csv")
test_data <- read.csv(file = "test.csv")
validation_data <- read.csv(file = "gender_submission.csv")
print("*****USING DECISION TREES*****")
# Working copies of the raw tables; `train` and `test` are also read by the
# sections that follow.
train <- train_data
test <- test_data

# Fit a classification tree for Survived from passenger class, sex, the two
# family-size counts and age.
fit <- rpart(
  Survived ~ Pclass + Sex + SibSp + Parch + Age,
  data = train,
  method = "class"
)
fancyRpartPlot(fit)

# Predicted class label for every row of the test set.
Prediction <- predict(fit, test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)

# write.csv() does not create missing directories, so make sure the output
# folder exists before writing into it.
dir.create("Results", showWarnings = FALSE)
write.csv(submit, file = "Results/dtreeResults.csv", row.names = FALSE)

print("*****CONFUSION MATRIX USING DECISION TREES IS GIVEN BELOW*****")
# table() pairs the two vectors element by element, so this assumes that
# test.csv and gender_submission.csv list the passengers in the same order.
results1 <- table(Prediction, validation_data$Survived)
print(results1)

print("*****ACCURACY USING DECISION TREES IS GIVEN BELOW*****")
# Explicit print() so the statistics are shown even when the script is run
# through source(), where top-level values are not auto-printed.
print(confusionMatrix(results1))
print("*****USING NAIVE BAYES*****")
# NOTE(review): `~ .` uses every other column of `train` as a predictor,
# which presumably includes identifier-like fields such as PassengerId --
# confirm that this is intended.
BayesTitanicModel <- naiveBayes(as.factor(Survived) ~ ., data = train)
BayesPrediction <- predict(BayesTitanicModel, test)

# Count of predictions per class; explicit print() so it is shown even when
# the script is run through source().
print(summary(BayesPrediction))

# Build the result table with its final column names directly.
output <- data.frame(
  PassengerId = test$PassengerId,
  Survived = BayesPrediction
)

# write.csv() does not create missing directories, so make sure the output
# folder exists before writing into it.
dir.create("Results", showWarnings = FALSE)
write.csv(output, file = "Results/NaiveBayesResults.csv", row.names = FALSE)

print("*****CONFUSION MATRIX USING NAIVE BAYES IS GIVEN BELOW*****")
# table() pairs the two vectors element by element, so this assumes that
# test.csv and gender_submission.csv list the passengers in the same order.
results2 <- table(BayesPrediction, validation_data$Survived)
print(results2)

print("*****ACCURACY USING NAIVE BAYES IS GIVEN BELOW*****")
print(confusionMatrix(results2))
print("*****USING K NEAREST NEIGHBOUR*****")
# Keep only the columns the KNN section works with, selected by name so the
# code does not depend on column positions in the CSV files. Identifier and
# label columns come first, followed by the candidate features.
# Embarked is not used as a KNN feature, so it is left out here instead of
# being recoded.
# NOTE(review): assumes the CSVs use the standard Titanic column names --
# confirm against the actual files.
train <- train_data[, c("PassengerId", "Survived", "Pclass", "Sex", "Age",
                        "SibSp", "Parch", "Fare")]
test <- test_data[, c("PassengerId", "Pclass", "Sex", "Age",
                      "SibSp", "Parch", "Fare")]

# Change Sex to 0 = male, 1 = female. A lookup vector keeps the column
# numeric: any unexpected value becomes NA instead of turning the column
# into a list.
sex_codes <- c(male = 0, female = 1)
train$Sex <- unname(sex_codes[as.character(train$Sex)])
test$Sex <- unname(sex_codes[as.character(test$Sex)])

# Replace the missing entries of a numeric vector with the mean of its
# observed entries.
impute_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}

# Removing NA values: each set is filled in with its own column mean.
train$Age <- impute_mean(train$Age)
test$Age <- impute_mean(test$Age)
test$Fare <- impute_mean(test$Fare)

# Change Age to 0 = Adult(>=18), 1 = Child(<18)
train$Age <- ifelse(train$Age < 18, 1, 0)
test$Age <- ifelse(test$Age < 18, 1, 0)
# Min-max rescale a numeric vector to the range [0, 1].
#
# x: numeric vector. NA entries are ignored when finding the minimum and
#    maximum and stay NA in the result.
#
# Returns a numeric vector of the same length as x. A vector whose
# non-missing values are all equal maps to 0 instead of NaN (0 / 0), and an
# empty or all-NA input is returned unchanged (as numeric).
normalize <- function(x) {
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  bounds <- range(x, na.rm = TRUE)
  span <- bounds[2] - bounds[1]
  shifted <- as.numeric(x - bounds[1])
  # Exact comparison is intended: the span is exactly 0 only for constant
  # input. isTRUE() guards against an NA/NaN span (e.g. infinite values).
  if (isTRUE(span == 0)) {
    return(shifted)
  }
  shifted / span
}
# Rescale Pclass and Fare to [0, 1]. Each feature is normalized over the
# training and test rows together so that both sets share a single scale;
# the combined vector is then split back by row position.
train_idx <- seq_len(nrow(train))
test_idx <- nrow(train) + seq_len(nrow(test))

pclass <- normalize(c(train$Pclass, test$Pclass))
train$Pclass <- pclass[train_idx]
test$Pclass <- pclass[test_idx]

fare <- normalize(c(train$Fare, test$Fare))
train$Fare <- fare[train_idx]
test$Fare <- fare[test_idx]

# Set the labels and passenger ids aside before reducing both tables to the
# feature columns.
survived <- train$Survived
passengers <- test$PassengerId

# Select the features by name so train and test are guaranteed to hold the
# same columns in the same order, which knn() requires.
# NOTE(review): assumes these four columns are the intended KNN features --
# confirm against the CSV layout.
knn_features <- c("Pclass", "Sex", "Age", "Fare")
train <- train[, knn_features]
test <- test[, knn_features]

# 5-nearest-neighbour vote for each test row, using the training labels.
knn_titanic <- knn(train, test, cl = survived, k = 5, l = 0, prob = FALSE,
                   use.all = TRUE)
submission <- data.frame(PassengerId = passengers, Survived = knn_titanic)

# write.csv() does not create missing directories, so make sure the output
# folder exists. row.names = FALSE keeps the file to the two result columns,
# matching the other result files written by this script.
dir.create("Results", showWarnings = FALSE)
write.csv(submission, file = "Results/KnnResults.csv", row.names = FALSE)

print("*****CONFUSION MATRIX USING K NEAREST NEIGHBOUR IS GIVEN BELOW*****")
# table() pairs the two vectors element by element, so this assumes that
# test.csv and gender_submission.csv list the passengers in the same order.
results3 <- table(submission$Survived, validation_data$Survived)
print(results3)

print("*****ACCURACY USING K NEAREST NEIGHBOUR IS GIVEN BELOW*****")
# Explicit print() so the statistics are shown even when the script is run
# through source(), where top-level values are not auto-printed.
print(confusionMatrix(results3))