Skip to content

Commit 5ecd03d

Browse files
committed
tried some ideas, implemented Campbell's lm model (and got 0.79995, which I already got using logit)
1 parent 471f4c5 commit 5ecd03d

File tree

3 files changed

+185
-70
lines changed

3 files changed

+185
-70
lines changed

forests.r

+17-9
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,20 @@ cabin_to_deck <- function(data) {
3131
# Cabin
3232
train$Cabin = cabin_to_deck(train$Cabin)
3333
train$Cabin = factor(train$Cabin, levels=c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'))
34-
train$Cabin = impute(train$Cabin, median)
34+
train$Cabin = impute(train$Cabin, max)
3535

3636

3737
test$Cabin = cabin_to_deck(test$Cabin)
3838
test$Cabin = factor(test$Cabin, levels=c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'))
39-
test$Cabin = impute(test$Cabin, median)
39+
test$Cabin = impute(test$Cabin, max)
4040

4141
# Age
4242
train$Age <- impute(train$Age, mean)
4343
test$Age <- impute(test$Age, mean)
4444

4545
# Embarked
46-
train$Embarked <- impute(factor(train$Embarked), median)
47-
test$Embarked <- impute(factor(test$Embarked), median)
46+
train$Embarked <- impute(factor(train$Embarked), max)
47+
test$Embarked <- impute(factor(test$Embarked), max)
4848

4949
# Sex
5050
train$Sex <- factor(train$Sex)
@@ -58,13 +58,21 @@ str(train)
5858
str(test)
5959

6060
model <- randomForest(
61-
Survived ~ (Pclass + Sex + Age + SibSp + Parch + Embarked)^6,
61+
Survived ~ Pclass + Sex + Age + SibSp + Parch + Embarked + Cabin ,
6262
data=train,
63-
ntree=5000,
64-
mtry=3
63+
ntree=2002,
64+
mtry=2,
65+
replace=FALSE,
66+
importance=TRUE,
67+
proximity=TRUE,
68+
# we should have 0 na's so die loudly if we find any
69+
na.action=na.fail
6570
)
6671
print(model)
72+
importance(model)
6773

68-
test$Survived <- predict(model, newdata=test, type="response")
74+
#print(model$importance)
75+
76+
#test$Survived <- predict(model, newdata=test, type="response")
6977

70-
write.csv(test[,c("PassengerId", "Survived")], file="predictions.csv", row.names=FALSE, quote=FALSE)
78+
#write.csv(test[,c("PassengerId", "Survived")], file="predictions.csv", row.names=FALSE, quote=FALSE)

lm.campbell.r

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
setwd('D:/Development/RScripts/Titanic/')
2+
3+
data = read.csv('data/train.csv', sep=',', na.strings=c(''))
4+
5+
data$Survived <- factor(data$Survived)
6+
#data$Sex <- factor(data$Sex)
7+
#data$Embarked <- factor(data$Embarked)
8+
#data$Pclass <- factor(data$Pclass)
9+
10+
# extract deck name from Cabin number
11+
# Extract the deck letter from cabin identifiers.
#
# Args:
#   data: vector of cabin identifiers (character or factor), e.g. "C85";
#         may contain NA.
#
# Returns:
#   A character vector the same length as `data` holding the first character
#   of every entry. NA entries stay NA.
cabin_to_deck <- function(data) {
  # substr() is vectorised and propagates NA, so no element-wise loop or
  # explicit NA check is needed.
  substr(as.character(data), 1, 1)
}
20+
21+
data$Cabin <- cabin_to_deck(data$Cabin)
22+
data$Cabin <- factor(data$Cabin, levels=c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'))
23+
24+
# extract Title from Name
25+
# Extract the honorific title from passenger names of the form
# "Surname, Title. Given names".
#
# Args:
#   data: vector of names (character or factor); may contain NA.
#
# Returns:
#   A character vector the same length as `data`. Each element is the text
#   between the first ", " and the following ". ". Elements that are NA, or
#   that contain no ", ", yield NA.
extract_title <- function(data) {
  data <- as.character(data)

  # Everything after the surname, i.e. the second piece of a split on ", ".
  after_surname <- vapply(
    strsplit(data, ", ", fixed = TRUE),
    function(parts) parts[2],
    character(1),
    USE.NAMES = FALSE
  )

  # fixed = TRUE matters: as a regular expression ". " means "any character
  # followed by a space", which cuts multi-word titles short
  # ("the Countess. of ..." became "th").
  vapply(
    strsplit(after_surname, ". ", fixed = TRUE),
    function(parts) parts[1],
    character(1),
    USE.NAMES = FALSE
  )
}
35+
36+
data$Title <- extract_title(as.character(data$Name))
37+
data$Title <- factor(data$Title)
38+
39+
# impute age
40+
models.age <- lm(Age ~ Fare + Title + SibSp + Parch, data=data)
41+
# Fill in missing ages in the training data with predictions from the linear
# model `models.age`. All rows with a missing Age are predicted in one call
# rather than one predict() call per row. The guard skips the call when
# nothing is missing, which also keeps an empty data frame untouched
# (`1:nrow(data)` would iterate over c(1, 0) in that case).
age_missing <- is.na(data$Age)
if (any(age_missing)) {
  data$Age[age_missing] <- predict(models.age, newdata = data[age_missing, ])
}
46+
47+
48+
models.glm = glm(Survived ~ Pclass + Fare + SibSp + Parch + Sex + Age + Pclass:Age + Age:Sex + SibSp:Sex, family=binomial(link='logit'), data=data)
49+
50+
p = predict(models.glm, newdata=data, type='response')
51+
survived = round(p)
52+
53+
library(caret)
54+
confusionMatrix(factor(survived), data$Survived)
55+
56+
# make prediction
57+
58+
test = read.csv('data/test.csv', sep=',', na.strings=c(''))
59+
60+
# extract deck name from Cabin number
61+
# Extract the deck letter from cabin identifiers.
#
# Args:
#   data: vector of cabin identifiers (character or factor), e.g. "C85";
#         may contain NA.
#
# Returns:
#   A character vector the same length as `data` holding the first character
#   of every entry. NA entries stay NA.
cabin_to_deck <- function(data) {
  # substr() is vectorised and propagates NA, so no element-wise loop or
  # explicit NA check is needed.
  substr(as.character(data), 1, 1)
}
70+
71+
test$Cabin <- cabin_to_deck(test$Cabin)
72+
test$Cabin <- factor(test$Cabin, levels=c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'))
73+
74+
# extract Title from Name
75+
# Extract the honorific title from passenger names of the form
# "Surname, Title. Given names".
#
# Args:
#   data: vector of names (character or factor); may contain NA.
#
# Returns:
#   A character vector the same length as `data`. Each element is the text
#   between the first ", " and the following ". ". Elements that are NA, or
#   that contain no ", ", yield NA.
extract_title <- function(data) {
  data <- as.character(data)

  # Everything after the surname, i.e. the second piece of a split on ", ".
  after_surname <- vapply(
    strsplit(data, ", ", fixed = TRUE),
    function(parts) parts[2],
    character(1),
    USE.NAMES = FALSE
  )

  # fixed = TRUE matters: as a regular expression ". " means "any character
  # followed by a space", which cuts multi-word titles short
  # ("the Countess. of ..." became "th").
  vapply(
    strsplit(after_surname, ". ", fixed = TRUE),
    function(parts) parts[1],
    character(1),
    USE.NAMES = FALSE
  )
}
85+
86+
test$Title <- extract_title(as.character(test$Name))
87+
test$Title <- factor(test$Title)
88+
89+
# impute age
90+
models.age <- lm(Age ~ Fare + Title + SibSp + Parch, data=data)
91+
# Fill in missing ages in the test data with predictions from the linear
# model `models.age`. All rows with a missing Age are predicted in one call
# rather than one predict() call per row. The guard skips the call when
# nothing is missing, which also keeps an empty data frame untouched
# (`1:nrow(test)` would iterate over c(1, 0) in that case).
test_age_missing <- is.na(test$Age)
if (any(test_age_missing)) {
  test$Age[test_age_missing] <- predict(
    models.age,
    newdata = test[test_age_missing, ]
  )
}
96+
97+
test$Fare[153] <- mean(
98+
with(test, subset(Fare, Pclass == 3)),
99+
na.rm=TRUE
100+
)
101+
102+
summary(test)
103+
104+
p = predict(models.glm, newdata=test, type='response')
105+
106+
data = data.frame(PassengerId = test$PassengerId, survived = round(p))
107+
write.csv(data, 'predictions.csv', row.names = FALSE)

0 commit comments

Comments
 (0)