GitHub - dipeshkumardahal/titanic-survival-model

#Titanic Survival Model with Kaggle Dataset

#Data Processing

#Import Required Library

import pandas as pd #pandas is python library for data processing
import numpy as np #numpy is python library for scientific calculation

#import CSV Dataset

train_df = pd.read_csv('titanic_data.csv') #read the CSV file with pandas and store the resulting DataFrame in train_df

#Now lets look at it

train_df.head() #Head only displays some top rows and tail displays last rows

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

#Now we look at data Info and analyze it

train_df.info() #info gives information about overall data structure

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

#now time to process the data
#Total Values are 891 
#cabin, embarked, age has some null values
#Cabin, Embarked, Ticket, Sex, Name are object.

#what we need to do
#Drop cabin because it has huge null values and has no relation to survival
#drop name and ticket because it has no relation to survival
#fill missing age values with average
#convert sex to numerical values
#Fill Embarked with highest possible values

#drop cabin, name and ticket in one call; several columns are passed together as a list
train_df.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)

#now lets look at data. There will be no cabin, name and ticket

train_df.head()

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked
0	1	0	3	male	22.0	1	7.2500	S
1	2	1	1	female	38.0	1	71.2833	C
2	3	1	3	female	26.0	0	7.9250	S
3	4	1	1	female	35.0	1	53.1000	S
4	5	0	3	male	35.0	0	8.0500	S

#Lets Analyze The age

age_null = train_df['Age'].isnull() #boolean mask stored in age_null: True for every row whose Age is null

train_df[age_null] #display all rows whose Age is null

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
5	6	0	3	male	NaN	0	0	8.4583	Q
17	18	1	2	male	NaN	0	0	13.0000	S
19	20	1	3	female	NaN	0	0	7.2250	C
26	27	0	3	male	NaN	0	0	7.2250	C
28	29	1	3	female	NaN	0	0	7.8792	Q
29	30	0	3	male	NaN	0	0	7.8958	S
31	32	1	1	female	NaN	1	0	146.5208	C
32	33	1	3	female	NaN	0	0	7.7500	Q
36	37	1	3	male	NaN	0	0	7.2292	C
42	43	0	3	male	NaN	0	0	7.8958	C
45	46	0	3	male	NaN	0	0	8.0500	S
46	47	0	3	male	NaN	1	0	15.5000	Q
47	48	1	3	female	NaN	0	0	7.7500	Q
48	49	0	3	male	NaN	2	0	21.6792	C
55	56	1	1	male	NaN	0	0	35.5000	S
64	65	0	1	male	NaN	0	0	27.7208	C
65	66	1	3	male	NaN	1	1	15.2458	C
76	77	0	3	male	NaN	0	0	7.8958	S
77	78	0	3	male	NaN	0	0	8.0500	S
82	83	1	3	female	NaN	0	0	7.7875	Q
87	88	0	3	male	NaN	0	0	8.0500	S
95	96	0	3	male	NaN	0	0	8.0500	S
101	102	0	3	male	NaN	0	0	7.8958	S
107	108	1	3	male	NaN	0	0	7.7750	S
109	110	1	3	female	NaN	1	0	24.1500	Q
121	122	0	3	male	NaN	0	0	8.0500	S
126	127	0	3	male	NaN	0	0	7.7500	Q
128	129	1	3	female	NaN	1	1	22.3583	C
140	141	0	3	female	NaN	0	2	15.2458	C
154	155	0	3	male	NaN	0	0	7.3125	S
...	...	...	...	...	...	...	...	...	...
718	719	0	3	male	NaN	0	0	15.5000	Q
727	728	1	3	female	NaN	0	0	7.7375	Q
732	733	0	2	male	NaN	0	0	0.0000	S
738	739	0	3	male	NaN	0	0	7.8958	S
739	740	0	3	male	NaN	0	0	7.8958	S
740	741	1	1	male	NaN	0	0	30.0000	S
760	761	0	3	male	NaN	0	0	14.5000	S
766	767	0	1	male	NaN	0	0	39.6000	C
768	769	0	3	male	NaN	1	0	24.1500	Q
773	774	0	3	male	NaN	0	0	7.2250	C
776	777	0	3	male	NaN	0	0	7.7500	Q
778	779	0	3	male	NaN	0	0	7.7375	Q
783	784	0	3	male	NaN	1	2	23.4500	S
790	791	0	3	male	NaN	0	0	7.7500	Q
792	793	0	3	female	NaN	8	2	69.5500	S
793	794	0	1	male	NaN	0	0	30.6958	C
815	816	0	1	male	NaN	0	0	0.0000	S
825	826	0	3	male	NaN	0	0	6.9500	Q
826	827	0	3	male	NaN	0	0	56.4958	S
828	829	1	3	male	NaN	0	0	7.7500	Q
832	833	0	3	male	NaN	0	0	7.2292	C
837	838	0	3	male	NaN	0	0	8.0500	S
839	840	1	1	male	NaN	0	0	29.7000	C
846	847	0	3	male	NaN	8	2	69.5500	S
849	850	1	1	female	NaN	1	0	89.1042	C
859	860	0	3	male	NaN	0	0	7.2292	C
863	864	0	3	female	NaN	8	2	69.5500	S
868	869	0	3	male	NaN	0	0	9.5000	S
878	879	0	3	male	NaN	0	0	7.8958	S
888	889	0	3	female	NaN	1	2	23.4500	S

177 rows × 9 columns

#Now lets fill them with mean

age_mean =train_df['Age'].mean() # calculate the mean of the Age column and store it in age_mean

age_mean # display the calculated value

29.69911764705882

#Filling the age

#Fill all the null ages with the mean and assign the result back to the column.
#fillna(..., inplace=True) returns None, so storing its result in age_null only
#overwrote the mask with None; an in-place fill on a column selection is also
#not guaranteed to update train_df under pandas copy-on-write.
train_df['Age'] = train_df['Age'].fillna(age_mean)

train_df['Age'].head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

train_df.info() #Now there is no null age

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB

#Time for embarked

null_embarked = train_df["Embarked"].isnull() #boolean mask stored in null_embarked: True where Embarked is null

train_df[null_embarked] #retrieve the rows whose Embarked is null

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
61	62	1	1	female	38.0	0	0	80.0	NaN
829	830	1	1	female	62.0	0	0	80.0	NaN

#Now we look at statistics of embarked

train_df['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

#Because S has highest frequency we gonna replace null with S

#Replace the null Embarked values with 'S' and assign the result back to the column;
#an in-place fillna on a column selection is not guaranteed to update train_df
#under pandas copy-on-write.
train_df['Embarked'] = train_df['Embarked'].fillna('S')

new_null_embarked = train_df['Embarked'].isnull() #we gonna check if there is any null value

train_df[new_null_embarked] #There is no null value on embarked

.dataframe thead th {
    text-align: left;
}

.dataframe tbody tr th {
    vertical-align: top;
}

</style>

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked

train_df.info() #Now there is no null embarked

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB

#Now we need to convert the two remaining object columns (Sex and Embarked) to numerical values

#Lets deal with gender

#We convert male/female to integers with a simple dictionary
#lookup table from the Sex label to its integer code
gender_maps = dict(male=1, female=2)

train_df['Sex'] = train_df['Sex'].map(gender_maps) #replace each Sex value with its integer code from the gender_maps dictionary

#Now lets look at the info

train_df.info() #sex must be integer by now

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB

#Lets Deal with Embarked

#now we gonna convert embarked value to integer with dictionary

#lookup table from the Embarked port letter to its integer code
embarked_map = dict(S=1, Q=2, C=3)

train_df['Embarked'] = train_df['Embarked'].map(embarked_map) #replace each Embarked value with its integer code from the embarked_map dictionary

#Lets look at data status

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB

#The Data is perfect for accurate analysis now

#Now we export the processed data to CSV
train_df.to_csv("titanic.csv", index=False) #export to CSV; index=False leaves the row index out of the file

#The data is cleaned and managed now we can make model via machine learning

#Machine Learning

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB

#Let's import the sklearn decision tree classifier and the train/test split helper
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

#Separate the features (X) from the target (Y) before splitting.
#'Survived' is the label we want to predict, so it must not stay among the
#features: a model that can read the answer from its input scores a misleading
#100% accuracy. (The previous call also passed test_df, which is never defined.)
features = train_df.drop('Survived', axis=1)
target = train_df['Survived']

X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=.20) #hold out 20% of the rows as test data

X_test.info() #testing data information

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 339 to 98
Data columns (total 9 columns):
PassengerId    179 non-null int64
Survived       179 non-null int64
Pclass         179 non-null int64
Sex            179 non-null int64
Age            179 non-null float64
SibSp          179 non-null int64
Parch          179 non-null int64
Fare           179 non-null float64
Embarked       179 non-null int64
dtypes: float64(2), int64(7)
memory usage: 14.0 KB

X_train.info() #training data info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 7 to 681
Data columns (total 9 columns):
PassengerId    712 non-null int64
Survived       712 non-null int64
Pclass         712 non-null int64
Sex            712 non-null int64
Age            712 non-null float64
SibSp          712 non-null int64
Parch          712 non-null int64
Fare           712 non-null float64
Embarked       712 non-null int64
dtypes: float64(2), int64(7)
memory usage: 55.6 KB

#Decision tree
decision_tree = DecisionTreeClassifier(random_state=4) #define the classifier model using the decision tree algorithm

decision_tree.fit(X_train,Y_train) #train the model on the training split only: X_train is the input, Y_train the expected output

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=4,
            splitter='best')

decision_tree.predict(X_test[0:1]) #time for prediction: predict for the first row of the test data

array([0], dtype=int64)

decision_tree.predict(X_test[0:5]) #predict for multiple rows: the first five test rows (positions 0-4)

array([0, 0, 1, 0, 0], dtype=int64)

#Time to check the accuracy of the model
decision_tree.score(X_test, Y_test) #mean accuracy on the test data; a perfect 1.0 is a warning sign that the label is leaking into the features

1.0

#please suggest me where i am wrong

Name		Name	Last commit message	Last commit date
Latest commit History 4 Commits
README.md		README.md
Titanic Survival Model.ipynb		Titanic Survival Model.ipynb
TitanicSurvivalModel.py		TitanicSurvivalModel.py
titanic.csv		titanic.csv
titanic_data.csv		titanic_data.csv

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

About

Releases

Packages

Languages

dipeshkumardahal/titanic-survival-model

Folders and files

Latest commit

History

Repository files navigation

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages