# -*- coding: utf-8 -*-
"""Multiple Linear Regression.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1wmec5pSwgp_waP7Oqs96VqENWzNQRM4R
"""
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
ipl_auction_df = pd.read_csv('IPL IMB381IPL2013.csv')
ipl_auction_df.info()
ipl_auction_df.iloc[0:5, 0:10]
# Candidate predictor columns (an explicit subset of ipl_auction_df.columns)
X_features = ['AGE', 'PLAYING ROLE', 'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B',
              'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE',
              'SR-B', 'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL']
ipl_auction_df['PLAYING ROLE'].unique()
pd.get_dummies(ipl_auction_df['PLAYING ROLE'])[0:5]
categorical_features = ['AGE','PLAYING ROLE', 'CAPTAINCY EXP']
ipl_auction_encoded_df = pd.get_dummies(ipl_auction_df[X_features],
                                        columns=categorical_features,
                                        drop_first=True)
ipl_auction_encoded_df.columns
X_features = ipl_auction_encoded_df.columns
X = sm.add_constant(ipl_auction_encoded_df)
Y = ipl_auction_df['SOLD PRICE']
train_X, test_X, train_y, test_y = train_test_split(X,
                                                     Y,
                                                     train_size=0.3,
                                                     random_state=42)
ipl_model_1 = sm.OLS(train_y, train_X).fit()
ipl_model_1.summary2()
"""**Variance Inflation Factor** - measure used for identifying the existence of multi-collinearity"""
from statsmodels.stats.outliers_influence import variance_inflation_factor
def get_vif_factor(X):
    # .as_matrix() was removed in pandas 1.0; use .values (or .to_numpy()) instead
    X_matrix = X.values
    vif = [variance_inflation_factor(X_matrix, i) for i in range(X_matrix.shape[1])]
    vif_factors = pd.DataFrame()
    vif_factors['column'] = X.columns
    vif_factors['VIF'] = vif
    return vif_factors
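"""For intuition, `variance_inflation_factor(X_matrix, i)` regresses column i on the
remaining columns and returns 1 / (1 - R^2). A minimal hand-rolled sketch of the same
computation (illustrative only; `vif_by_hand` is not part of the original notebook):
"""
def vif_by_hand(X, column):
    # Regress the chosen column on all remaining columns (no extra intercept,
    # mirroring statsmodels' variance_inflation_factor) and take 1 / (1 - R^2).
    others = X.drop(columns=[column])
    r_squared = sm.OLS(X[column], others).fit().rsquared
    return 1.0 / (1.0 - r_squared)
# e.g. vif_by_hand(X[X_features], 'SIXERS') should match the SIXERS row of the table below.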
vif_factors = get_vif_factor(X[X_features])
vif_factors
columns_to_be_removed = ['T-RUNS', 'T-WKTS', 'HS', 'AVE', 'RUNS-C', 'SR-B', 'AVE-BL', 'ECON', 'ODI-SR-B', 'ODI-RUNS-S', 'AGE_2', 'SR-BL']
X_new_features = list( set(X_features) - set(columns_to_be_removed))
get_vif_factor( X[X_new_features])
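"""As an optional visual check on the remaining collinearity, a correlation heatmap of the
retained features can be drawn with seaborn (already imported as `sn`). This is a sketch
added for illustration, not part of the original notebook:
"""
plt.figure(figsize=(10, 8))
sn.heatmap(X[X_new_features].corr(), annot=True)
plt.title("Figure - Correlation heatmap of the retained features")
plt.show()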
"""*Building a New model after removing Multi-collinearity*"""
train_X = train_X[X_new_features]
ipl_model_2 = sm.OLS(train_y, train_X).fit()
ipl_model_2.summary2()
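# Keep only the predictors that are statistically significant in ipl_model_2's summary
# above (presumably those with p-value < 0.05, i.e. SIXERS and CAPTAINCY EXP_1).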
significant_vars = [ 'SIXERS', 'CAPTAINCY EXP_1']
train_X = train_X[significant_vars]
ipl_model_3 = sm.OLS(train_y, train_X).fit()
ipl_model_3.summary2()
"""*Residual Analysis in Multiple Regression*
**P-P plot**
"""
def draw_pp_plot(model, title):
    probplot = sm.ProbPlot(model.resid)
    plt.figure(figsize=(8, 6))
    probplot.ppplot(line='45')
    plt.title(title)
    plt.show()
draw_pp_plot(ipl_model_3,
             "Figure - Normal P-P Plot of Regression Standardized Residuals")
k = train_X.shape[1]
n = train_X.shape[0]
print("Number of Variables: ",k," and number of observations: ",n)
leverage_cutoff = 3* ((k+1)/n)
print("cutoff for leverage value: ", round(leverage_cutoff, 3) )
from statsmodels.graphics.regressionplots import influence_plot
fig, ax = plt.subplots(figsize=(8,6) )
influence_plot(ipl_model_3, ax=ax)
plt.title("Fig - Leverage Value vs Residuals" )
plt.show()
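"""The train/test split above reserves a test set that the script never uses. A minimal
sketch of out-of-sample evaluation for ipl_model_3 (an added illustration using sklearn's
metrics; not part of the original notebook):
"""
from sklearn.metrics import mean_squared_error, r2_score
pred_y = ipl_model_3.predict(test_X[significant_vars])
print("RMSE on test set: ", np.sqrt(mean_squared_error(test_y, pred_y)))
print("R-squared on test set: ", r2_score(test_y, pred_y))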