spyder -w kaggle-titanic (-w 参数指定工作目录)
本实验基本在控制台下进行,可关闭 spyder 中的其余窗口,只保留控制台。如需要调出窗口,可以通过 view->windows and toolbar 调出。比如希望在py文件中编写代码,可以 view->windows and toolbar->Editor 调出编辑器窗口。
- Python科学计算知识
- 数据挖掘知识
- Matplotlib,numpy,pandas, statsmodels等包的使用
import matplotlib.pyplot as plt
%matplotlib qt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import DataFrame
from patsy import dmatrices
用 pandas 读取数据:
# Load the training split into a DataFrame (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data/train.csv")
#### 先看看能获取哪些信息

上述数据被存储在 Pandas 的 DataFrame 中。可以把 DataFrame 想像成强化版的 Excel 表格工作流。通过 DataFrame 的打印输出我们知道了这份数据一共包含 891 位船上人员的资料:
Int64Index: 891 entries, 0 to 890
表格中姓名,性别,年龄等列名称在这里被称作数据集的特征(feature),一些特征信息是全员都具备的,比如 survived :
survived 891 non-null values
而另一些特征则存在缺失,比如 age :
age 714 non-null values
缺失的部分都由 NaN 表示。
Ticket 与 Cabin 缺失了很多信息,所以不适合作为分析的材料,让我们把这两列从 DataFrame 中去除:
# Discard the Ticket and Cabin columns, then keep only the rows that have
# no missing value in any remaining column.
df = df.drop(columns=['Ticket', 'Cabin']).dropna()
# Overview figure: five panels laid out on a 2x3 grid.
fig = plt.figure(figsize=(18,6), dpi=100)
# `alpha` is bound to the same value as `alpha_scatterplot`, as in the
# original chained assignment.
alpha = alpha_scatterplot = 0.2
alpha_bar_chart = 0.55

# Panel (0,0): how many passengers survived vs. did not.
ax1 = plt.subplot2grid((2,3),(0,0))
df.Survived.value_counts().plot(kind='bar', alpha=alpha_bar_chart)
# Widen the x range so the bars are not flush against the frame (the
# original author added this as a workaround for matplotlib 1.3.1).
ax1.set_xlim(-1, 2)
plt.title("Distribution of Survival, (1 = Survived)")

# Panel (0,1): age against survival outcome.
# The scatter needs its own axes; without this call it is drawn on top of
# the bar chart in ax1.
plt.subplot2grid((2,3),(0,1))
plt.scatter(df.Survived, df.Age, alpha=alpha_scatterplot)
plt.ylabel("Age")
# Grid visibility is passed positionally because the keyword for it was
# renamed across matplotlib versions (`b` -> `visible`).
plt.grid(True, which='major', axis='y')
plt.title("Survial by Age, (1 = Survived)")

# Panel (0,2): passenger count per ticket class.
ax3 = plt.subplot2grid((2,3),(0,2))
df.Pclass.value_counts().plot(kind="barh", alpha=alpha_bar_chart)
ax3.set_ylim(-1, len(df.Pclass.value_counts()))
plt.title("Class Distribution")

# Panel (1,0)-(1,1): kernel density estimate of age, one curve per class.
plt.subplot2grid((2,3),(1,0), colspan=2)
df.Age[df.Pclass == 1].plot(kind='kde')
df.Age[df.Pclass == 2].plot(kind='kde')
df.Age[df.Pclass == 3].plot(kind='kde')
plt.xlabel("Age")
plt.title("Age Distribution within classes")
# Legend entries are listed in the order the curves were drawn above.
plt.legend(('1st Class', '2nd Class','3rd Class'),loc='best')

# Panel (1,2): passenger count per boarding location.
ax5 = plt.subplot2grid((2,3),(1,2))
df.Embarked.value_counts().plot(kind='bar', alpha=alpha_bar_chart)
ax5.set_xlim(-1, len(df.Embarked.value_counts()))
plt.title("Passengers per boarding location")
- 乘客阶级(pclass)
- 性别(Sex)
- 年龄(Age)
- 船票价格(Fare Price)
# Single horizontal bar chart of survived vs. died.
fig, ax = plt.subplots()
survival_counts = df.Survived.value_counts()
survival_counts.plot(kind='barh', color="blue", alpha=.65)
# Leave one bar-width of padding below the first and above the last bar.
ax.set_ylim(-1, len(survival_counts))
plt.title("Survival Breakdown (1 = Survived, 0 = Died)")
# Survival split by gender: raw counts on the left, proportions on the right.
fig = plt.figure(figsize=(18,6))

# Row selectors and per-gender survival counts shared by both panels.
is_male = df.Sex == 'male'
is_female = df.Sex == 'female'
male_counts = df.Survived[is_male].value_counts(sort=False)
female_counts = df.Survived[is_female].value_counts(sort=False)

# Left panel: raw number of survivors / non-survivors per gender.
ax1 = fig.add_subplot(121)
male_counts.plot(kind='barh', label='Male')
female_counts.plot(kind='barh', color='#FA2379', label='Female')
ax1.set_ylim(-1, 2)
plt.title("Who Survived? with respect to Gender, (raw value counts) ")
plt.legend(loc='best')

# Right panel: the same counts divided by the number of passengers of
# that gender, so the two groups are comparable.
ax2 = fig.add_subplot(122)
(male_counts / float(is_male.sum())).plot(kind='barh', label='Male')
(female_counts / float(is_female.sum())).plot(kind='barh', color='#FA2379', label='Female')
ax2.set_ylim(-1, 2)
plt.title("Who Survived proportionally? with respect to Gender")
plt.legend(loc='best')
可以从Pclass入手,挖掘出更多内容来,将 class 1-2 归类为高阶级,class 3 归类为低阶级:
# Survival by gender and class: four side-by-side bar charts sharing a y axis.
# Classes 1-2 are grouped as "high class", class 3 as "low class".
# NOTE(review): dpi=1600 on an 18x4 inch figure is a very large canvas;
# lower it if rendering is slow.
fig = plt.figure(figsize=(18,4), dpi=1600)
alpha_level = 0.65

# Each panel narrows Survived first by gender, then by class, and counts
# the outcomes so they can be drawn as bars.
# NOTE(review): the tick labels below are hard-coded per panel and assume
# the order in which value_counts() returns the outcomes for that group --
# confirm against the data.

# Female, high class.
# This figure needs its own first axes: the other three panels share their
# y axis with it, and the tick labels / limits below are set on it.
ax1 = fig.add_subplot(141)
female_highclass = df.Survived[df.Sex == 'female'][df.Pclass != 3].value_counts()
female_highclass.plot(kind='bar', label='female highclass', color='#FA2479', alpha=alpha_level)
ax1.set_xticklabels(["Survived", "Died"], rotation=0)
ax1.set_xlim(-1, len(female_highclass))
plt.title("Who Survived? with respect to Gender and Class"); plt.legend(loc='best')

# Female, low class.
ax2=fig.add_subplot(142, sharey=ax1)
female_lowclass = df.Survived[df.Sex == 'female'][df.Pclass == 3].value_counts()
female_lowclass.plot(kind='bar', label='female, low class', color='pink', alpha=alpha_level)
ax2.set_xticklabels(["Died","Survived"], rotation=0)
ax2.set_xlim(-1, len(female_lowclass))

# Male, low class.
ax3=fig.add_subplot(143, sharey=ax1)
male_lowclass = df.Survived[df.Sex == 'male'][df.Pclass == 3].value_counts()
male_lowclass.plot(kind='bar', label='male, low class',color='lightblue', alpha=alpha_level)
ax3.set_xticklabels(["Died","Survived"], rotation=0)
ax3.set_xlim(-1, len(male_lowclass))

# Male, high class.
ax4=fig.add_subplot(144, sharey=ax1)
male_highclass = df.Survived[df.Sex == 'male'][df.Pclass != 3].value_counts()
male_highclass.plot(kind='bar', label='male highclass', alpha=alpha_level, color='steelblue')
ax4.set_xticklabels(["Died","Survived"], rotation=0)
ax4.set_xlim(-1, len(male_highclass))
# Recap figure on a 3x4 grid: one row per analysis step.
fig = plt.figure(figsize=(18,12), dpi=1600)
a = 0.65

# Row selectors reused by the panels below.
males = df.Sex == 'male'
females = df.Sex == 'female'
third_class = df.Pclass == 3

# Step 1: overall survival counts.
ax1 = fig.add_subplot(3, 4, 1)
overall_counts = df.Survived.value_counts()
overall_counts.plot(kind='bar', color="blue", alpha=a)
ax1.set_xlim(-1, len(overall_counts))
plt.title("Step. 1")

# Step 2 (left): survival counts per gender.
ax2 = fig.add_subplot(3, 4, 5)
male_survival = df.Survived[males].value_counts(sort=False)
female_survival = df.Survived[females].value_counts(sort=False)
male_survival.plot(kind='bar', label='Male')
female_survival.plot(kind='bar', color='#FA2379', label='Female')
ax2.set_xlim(-1, 2)
plt.title("Step. 2 \nWho Survied? with respect to Gender.")
plt.legend(loc='best')

# Step 2 (right): the same counts as a share of each gender's passengers.
ax3 = fig.add_subplot(3, 4, 6)
(male_survival / float(males.sum())).plot(kind='bar', label='Male')
(female_survival / float(females.sum())).plot(kind='bar', color='#FA2379', label='Female')
plt.title("Who Survied proportionally?")
plt.legend(loc='best')

# Step 3: survival counts per gender/class pair, one panel each.
# Female, classes 1-2.
ax4 = fig.add_subplot(3, 4, 9)
female_highclass = df.Survived[females & ~third_class].value_counts()
female_highclass.plot(kind='bar', label='female highclass', color='#FA2479', alpha=a)
ax4.set_xticklabels(["Survived", "Died"], rotation=0)
ax4.set_xlim(-1, len(female_highclass))
plt.title("Who Survived? with respect to Gender and Class")
plt.legend(loc='best')

# Female, class 3.
ax5 = fig.add_subplot(3, 4, 10, sharey=ax1)
female_lowclass = df.Survived[females & third_class].value_counts()
female_lowclass.plot(kind='bar', label='female, low class', color='pink', alpha=a)
ax5.set_xticklabels(["Died","Survived"], rotation=0)
ax5.set_xlim(-1, len(female_lowclass))

# Male, class 3.
ax6 = fig.add_subplot(3, 4, 11, sharey=ax1)
male_lowclass = df.Survived[males & third_class].value_counts()
male_lowclass.plot(kind='bar', label='male, low class', color='lightblue', alpha=a)
ax6.set_xticklabels(["Died","Survived"], rotation=0)
ax6.set_xlim(-1, len(male_lowclass))

# Male, classes 1-2.
ax7 = fig.add_subplot(3, 4, 12, sharey=ax1)
male_highclass = df.Survived[males & ~third_class].value_counts()
male_highclass.plot(kind='bar', label='male highclass', alpha=a, color='steelblue')
ax7.set_xticklabels(["Died","Survived"], rotation=0)
ax7.set_xlim(-1, len(male_highclass))
作图代码已经尽量可读和直观了,关于如何使用 matplotlib 作图,可以先学习我们的 Python 科学计算课程呀。
在统计学中,logistic 回归(又称 logit 回归)是回归分析的一种类型,目的在于根据一个或多个预测变量预测分类因变量(分类因变量指的是只有有限个取值的因变量,其数值大小没有意义,但取值的排列顺序可能有意义)。它使用 logistic 函数,以预测变量为输入,对因变量取某个值的概率进行建模。通常情况下,用到 logistic 回归的问题,其因变量都是二分类的,正适合我们的题目。
显然,这张图并没有告诉我们某位乘客是生是死,它只告诉了我们乘客生死的概率。我们需要自己把生死的概率翻译成生或死的结果。但要怎么做呢?我们可以说幸存率 > 50% 的人会幸存下来。
下面给出创建 logistic 回归模型的代码,训练数据然后检验性能。
# Formula syntax: the response goes left of "~", the predictors right of it.
# Wrapping a name in C() marks that variable as categorical.
formula = 'Survived ~ C(Pclass) + C(Sex) + Age + SibSp + C(Embarked)'
# patsy's dmatrices turns the formula into two DataFrames:
# the response (y) and the design matrix (x).
y, x = dmatrices(formula, data=df, return_type='dataframe')
# Build the logit model and fit it to the training data in one step;
# `model` ends up bound to what .fit() returns.
model = sm.Logit(y, x).fit()
Optimization terminated successfully.
Current function value: 0.444388
Iterations 6
# Predicted vs. actual values (left) and deviance residuals (right).
# Open a dedicated figure so these panels are not drawn on the previous one.
plt.figure(figsize=(18,4))

# Left panel: fitted probabilities and observed outcomes per row.
# `facecolor` replaces the `axisbg` keyword, which matplotlib removed.
plt.subplot(121, facecolor="#DBDBDB")
# Predictions of the fitted model on the training design matrix.
ypred = model.predict(x)
# 'bo' draws the predictions as blue dots, 'mo' the observed values as magenta dots.
plt.plot(x.index, ypred, 'bo', x.index, y, 'mo', alpha=.25)
plt.grid(color='white', linestyle='dashed')
plt.title('Logit predictions, Blue: \nFitted/predicted values: Red')

# Right panel: deviance residuals of the fit.
ax2 = plt.subplot(122, facecolor="#DBDBDB")
plt.plot(model.resid_dev, 'r-')
plt.grid(color='white', linestyle='dashed')
# The fit is bound to `model`; no `res` name exists at this point.
ax2.set_xlim(-1, len(model.resid_dev))
plt.title('Logit Residuals')
# Diagnostics figure (2x2): how the predicted survival probability is
# distributed and how it relates to gender, class and age.
fig = plt.figure(figsize=(18,9), dpi=1600)
a = .2
# Predictions of the fitted model on its own training data, computed once
# and reused by all four panels. The fit is bound to `model`, not `res`.
predicted = model.predict()

# Top-left: kernel density estimate of the predicted probabilities.
# `facecolor` replaces the `axisbg` keyword, which matplotlib removed.
fig.add_subplot(221, facecolor="#DBDBDB")
kde_res = KDEUnivariate(predicted)
# The estimator has to be fitted before `support` and `density` can be read.
kde_res.fit()
plt.fill_between(kde_res.support, kde_res.density, alpha=a)
plt.title("Distribution of our Predictions")

# Top-right: predicted probability against the male indicator column.
fig.add_subplot(222, facecolor="#DBDBDB")
plt.scatter(predicted, x['C(Sex)[T.male]'], alpha=a)
# Grid visibility is passed positionally because the keyword for it was
# renamed across matplotlib versions (`b` -> `visible`).
plt.grid(True, which='major', axis='x')
plt.xlabel("Predicted chance of survival")
plt.ylabel("Gender Bool")
plt.title("The Change of Survival Probability by Gender (1 = Male)")

# Bottom-left: predicted probability against the 3rd-class indicator column.
fig.add_subplot(223, facecolor="#DBDBDB")
plt.scatter(predicted, x['C(Pclass)[T.3]'], alpha=a)
plt.xlabel("Predicted chance of survival")
plt.ylabel("Class Bool")
plt.grid(True, which='major', axis='x')
plt.title("The Change of Survival Probability by Lower Class (1 = 3rd Class)")

# Bottom-right: predicted probability against age.
fig.add_subplot(224, facecolor="#DBDBDB")
plt.scatter(predicted, x.Age, alpha=a)
plt.grid(True, linewidth=0.15)
plt.title("The Change of Survival Probability by Age")
plt.xlabel("Predicted chance of survival")
现在让我们使用模型预测测试数据集的结果吧,你可以将结果保存下来提交到 Kaggle 上查看数据的正确率。
# Load the test split (path is relative to the working directory).
test_data = pd.read_csv(filepath_or_buffer="data/test.csv")
统计每一列中 NaN 的个数(例如执行 `test_data.isnull().sum()`):
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
注意到公式关系:Survived ~ C(Pclass) + C(Sex) + Age + SibSp + C(Embarked)
由于我们需要得到所有测试数据的幸存率,这里用训练集 Age 的中值填补测试集 Age 中的 NaN:
# Replace missing ages in the test set with the median age of the training data.
test_data['Age'] = test_data['Age'].fillna(df['Age'].median())
####将因变量加入到测试数据集中。(在 Kaggle 中这一列都是留空的)
# The formula names Survived on its left-hand side, so the column must exist
# for dmatrices to run; the value is only a placeholder.
test_data['Survived'] = 1.23
# dmatrices returns (response, design matrix), in that order.
test_y, test_x = dmatrices(formula, data=test_data, return_type='dataframe')
# Predicted survival probability for each test row.
survival_prob = model.predict(test_x)
# Turn probabilities into labels: above 0.5 counts as survived.
result = [1 if p > 0.5 else 0 for p in survival_prob]
# NOTE(review): this assumes dmatrices kept every test row (no rows dropped
# for missing values), so `result` lines up with PassengerId -- confirm.
output = pd.DataFrame(
    {'PassengerId': test_data['PassengerId'], 'Survived': result},
    columns=['PassengerId', 'Survived'],
)
output.to_csv('output.csv', index=False)
#### Kaggle 得分(准确率)= 0.74641,怎么把分数提上去就看你咯。
