#Titanic Survival Model with Kaggle Dataset
#Data Processing
#Import Required Library
import pandas as pd #pandas is python library for data processing
import numpy as np #numpy is python library for scientific calculation
#import CSV Dataset
train_df = pd.read_csv('titanic_data.csv') #We read csv with pandas and assigned it to train_df
#Now lets look at it
train_df.head() #Head only displays some top rows and tail displays last rows
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
</style>
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
#Now we look at data Info and analyze it
train_df.info() #info gives information about overall data structure
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
#now time to process the data
#Total Values are 891
#cabin, embarked, age has some null values
#Cabin, Embarked, Ticket, Sex, Name are object.
#what we need to do
#Drop cabin because it has huge null values and has no relation to survival
#drop name and ticket because it has no relation to survival
#fill missing age values with average
#convert sex to numerical values
#Fill Embarked with highest possible values
train_df.drop('Cabin', axis =1, inplace=True) # drop the cabin
train_df.drop(['Name','Ticket'], axis=1, inplace=True) #drop name and ticket. this is how we pass multiple var with list
#now lets look at data. There will be no cabin, name and ticket
train_df.head()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
</style>
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
1 | 2 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
2 | 3 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
3 | 4 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S |
4 | 5 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S |
#Lets Analyze The age
age_null = train_df['Age'].isnull() #This store train_df age null status on boolen on age_null
train_df[age_null] #this display all null ages datas
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
</style>
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
5 | 6 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q |
17 | 18 | 1 | 2 | male | NaN | 0 | 0 | 13.0000 | S |
19 | 20 | 1 | 3 | female | NaN | 0 | 0 | 7.2250 | C |
26 | 27 | 0 | 3 | male | NaN | 0 | 0 | 7.2250 | C |
28 | 29 | 1 | 3 | female | NaN | 0 | 0 | 7.8792 | Q |
29 | 30 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S |
31 | 32 | 1 | 1 | female | NaN | 1 | 0 | 146.5208 | C |
32 | 33 | 1 | 3 | female | NaN | 0 | 0 | 7.7500 | Q |
36 | 37 | 1 | 3 | male | NaN | 0 | 0 | 7.2292 | C |
42 | 43 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | C |
45 | 46 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
46 | 47 | 0 | 3 | male | NaN | 1 | 0 | 15.5000 | Q |
47 | 48 | 1 | 3 | female | NaN | 0 | 0 | 7.7500 | Q |
48 | 49 | 0 | 3 | male | NaN | 2 | 0 | 21.6792 | C |
55 | 56 | 1 | 1 | male | NaN | 0 | 0 | 35.5000 | S |
64 | 65 | 0 | 1 | male | NaN | 0 | 0 | 27.7208 | C |
65 | 66 | 1 | 3 | male | NaN | 1 | 1 | 15.2458 | C |
76 | 77 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S |
77 | 78 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
82 | 83 | 1 | 3 | female | NaN | 0 | 0 | 7.7875 | Q |
87 | 88 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
95 | 96 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
101 | 102 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S |
107 | 108 | 1 | 3 | male | NaN | 0 | 0 | 7.7750 | S |
109 | 110 | 1 | 3 | female | NaN | 1 | 0 | 24.1500 | Q |
121 | 122 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
126 | 127 | 0 | 3 | male | NaN | 0 | 0 | 7.7500 | Q |
128 | 129 | 1 | 3 | female | NaN | 1 | 1 | 22.3583 | C |
140 | 141 | 0 | 3 | female | NaN | 0 | 2 | 15.2458 | C |
154 | 155 | 0 | 3 | male | NaN | 0 | 0 | 7.3125 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
718 | 719 | 0 | 3 | male | NaN | 0 | 0 | 15.5000 | Q |
727 | 728 | 1 | 3 | female | NaN | 0 | 0 | 7.7375 | Q |
732 | 733 | 0 | 2 | male | NaN | 0 | 0 | 0.0000 | S |
738 | 739 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S |
739 | 740 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S |
740 | 741 | 1 | 1 | male | NaN | 0 | 0 | 30.0000 | S |
760 | 761 | 0 | 3 | male | NaN | 0 | 0 | 14.5000 | S |
766 | 767 | 0 | 1 | male | NaN | 0 | 0 | 39.6000 | C |
768 | 769 | 0 | 3 | male | NaN | 1 | 0 | 24.1500 | Q |
773 | 774 | 0 | 3 | male | NaN | 0 | 0 | 7.2250 | C |
776 | 777 | 0 | 3 | male | NaN | 0 | 0 | 7.7500 | Q |
778 | 779 | 0 | 3 | male | NaN | 0 | 0 | 7.7375 | Q |
783 | 784 | 0 | 3 | male | NaN | 1 | 2 | 23.4500 | S |
790 | 791 | 0 | 3 | male | NaN | 0 | 0 | 7.7500 | Q |
792 | 793 | 0 | 3 | female | NaN | 8 | 2 | 69.5500 | S |
793 | 794 | 0 | 1 | male | NaN | 0 | 0 | 30.6958 | C |
815 | 816 | 0 | 1 | male | NaN | 0 | 0 | 0.0000 | S |
825 | 826 | 0 | 3 | male | NaN | 0 | 0 | 6.9500 | Q |
826 | 827 | 0 | 3 | male | NaN | 0 | 0 | 56.4958 | S |
828 | 829 | 1 | 3 | male | NaN | 0 | 0 | 7.7500 | Q |
832 | 833 | 0 | 3 | male | NaN | 0 | 0 | 7.2292 | C |
837 | 838 | 0 | 3 | male | NaN | 0 | 0 | 8.0500 | S |
839 | 840 | 1 | 1 | male | NaN | 0 | 0 | 29.7000 | C |
846 | 847 | 0 | 3 | male | NaN | 8 | 2 | 69.5500 | S |
849 | 850 | 1 | 1 | female | NaN | 1 | 0 | 89.1042 | C |
859 | 860 | 0 | 3 | male | NaN | 0 | 0 | 7.2292 | C |
863 | 864 | 0 | 3 | female | NaN | 8 | 2 | 69.5500 | S |
868 | 869 | 0 | 3 | male | NaN | 0 | 0 | 9.5000 | S |
878 | 879 | 0 | 3 | male | NaN | 0 | 0 | 7.8958 | S |
888 | 889 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S |
177 rows × 9 columns
#Now lets fill them with mean
age_mean =train_df['Age'].mean() # we calculate mean of age and stored it on age_mean
age_mean # this is calculated value
29.69911764705882
#Filling the age
age_null = train_df['Age'].fillna(age_mean, inplace =True) #we fill all the null ages with mean
train_df['Age'].head()
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
Name: Age, dtype: float64
train_df.info() #Now there is no null age
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB
#Time for embarked
null_embarked = train_df["Embarked"].isnull() #we assigned null embark value on null_embarked variable
train_df[null_embarked] #retrive null contains embark
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
</style>
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
61 | 62 | 1 | 1 | female | 38.0 | 0 | 0 | 80.0 | NaN |
829 | 830 | 1 | 1 | female | 62.0 | 0 | 0 | 80.0 | NaN |
#Now we look at statistics of embarked
train_df['Embarked'].describe()
count 889
unique 3
top S
freq 644
Name: Embarked, dtype: object
#Because S has highest frequency we gonna replace null with S
train_df['Embarked'].fillna('S', inplace=True)
new_null_embarked = train_df['Embarked'].isnull() #we gonna check if there is any null value
train_df[new_null_embarked] #There is no null value on embarked
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
</style>
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
---|
train_df.info() #Now there is no null embarked
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 891 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB
#Now we need to convert object to numerical, as there are two object
#Lets deal with gender
#we gonna convert male, frmale to integer with simple dictionary
gender_maps ={
'male':1,
'female':2
}
train_df['Sex'] = train_df['Sex'].map(gender_maps) #we replace Sex with gender_maps dictionary
#Now lets look at the info
train_df.info() #sex must be integer by now
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 891 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB
#Lets Deal with Embarked
#now we gonna convert embarked value to integer with dictionary
embarked_map ={
'S':1,
'Q':2,
'C':3
}
train_df['Embarked'] = train_df['Embarked'].map(embarked_map) #we replace embarked with embarked_maps dictionary
#Lets look at data status
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB
#The Data is perfect for accurate analysis now
#Now we export processed data to CSV
train_df.to_csv("titanic.csv", index=False) #import to csv
#The data is cleaned and managed now we can make model via machine learning
#Machine Learning
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB
#Lets Import sklearn dicission tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train_df, test_df,test_size=.20) #split training and test data
X_test.info() #testing data information
<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 339 to 98
Data columns (total 9 columns):
PassengerId 179 non-null int64
Survived 179 non-null int64
Pclass 179 non-null int64
Sex 179 non-null int64
Age 179 non-null float64
SibSp 179 non-null int64
Parch 179 non-null int64
Fare 179 non-null float64
Embarked 179 non-null int64
dtypes: float64(2), int64(7)
memory usage: 14.0 KB
X_train.info() #training data info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 7 to 681
Data columns (total 9 columns):
PassengerId 712 non-null int64
Survived 712 non-null int64
Pclass 712 non-null int64
Sex 712 non-null int64
Age 712 non-null float64
SibSp 712 non-null int64
Parch 712 non-null int64
Fare 712 non-null float64
Embarked 712 non-null int64
dtypes: float64(2), int64(7)
memory usage: 55.6 KB
#Desession tree
decision_tree = DecisionTreeClassifier(random_state=4) #We Define classifier moudle with desission tree algorithm
decision_tree.fit(X_train,Y_train) #We fit Training data and testing data on desision tree algorithm and trained the model
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=4,
splitter='best')
decision_tree.predict(X_test[0:1]) #time for prediction
array([0], dtype=int64)
decision_tree.predict(X_test[0:5]) #predicting for multiple values 0-5
array([0, 0, 1, 0, 0], dtype=int64)
#Time to check accuracy for model
decision_tree.score(X_test, Y_test) #wonder how this is possibele
1.0
#please suggest me where i am wrong