-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmachine learning .py
226 lines (181 loc) · 7.71 KB
/
machine learning .py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python
# coding: utf-8
# In[4]:
## we will try to genarate a random set of numbers
import numpy as np
import matplotlib.pyplot as plt
x = np.random.uniform(0,5,250) ## we wil generate 250 numbers between 0 and 5 and then plot the result
plt.hist(x,5) # we split into 5 groups
plt.show()
print(np.percentile(x,75))
# In[13]:
## we will create a normal distribution gaussian distribution
## by def the values are concentrated around a given value and not betwenn two values as we did before
import numpy as np
import matplotlib.pyplot as plt
x=np.random.normal(5.0,1.0,100000) ## 5 is the mean and 1 is the standard deviation
plt.hist(x,100)
plt.show()
# In[14]:
## we will try a linear regression model as well
## it's when we try to find the relationship between two variables and the importance exist when we try to predict future income
## we will create our date
import matplotlib.pyplot as plt
x = [5,7,8,8,17,11,12,9,6]
y=[99,66,111,77,85,86,77,88,55]
plt.scatter(x,y)
plt.xlabel("the age of the car")
plt.ylabel("the speed of the car ")
plt.title("linear regression on the car dataset")
## we will import our library
from scipy import stats
slope,intercept,r,p,std_err = stats.linregress(x,y)
# we will define our function
def myfunction(x):
return x*slope + intercept ##our function will return a new values of y
my_model = list(map(myfunction,x))
plt.scatter(x,y) ##we plot the original x and y values
plt.plot(x,my_model) ## we plot our linear regression model
plt.show() ## it the realation between our variables is nonlinear then a regression model will not be useful to us at all
## look if there is a linear correlation between x and y values using the r parameter
print("the coefficient of correlatio of your values is :",r)
print("unfort the realtion between your values is not linear at all soo a regression model can't be applied ")
## but we will althought a linear one is soo bad for our data try it in order to predict the future values instead
speed = myfunction(10)
print("the spped of a 10 years old car is: ",speed)
# In[39]:
## we will try a polynomial regression model now
## we use it when there is no relation ship between our variables
import matplotlib.pyplot as plt
x = [5,7,8,8,17,11,12,9,6]
y=[99,66,111,77,85,86,77,88,55] # we will use the same data as before
import numpy as np
mymodel =np.poly1d(np.polyfit(x,y,3)) ## i think 3 mean that we have a polynome of the third degree
## and it's one dimentional polynome
myline = np.linspace(1,22,2000) ## the last parameter make a smooth line
plt.scatter(x,y)
plt.plot(myline,mymodel(myline))
plt.show()
## look if there is a polynomial relationship between our values
from sklearn.metrics import r2_score
print("the coefficient of correlation between x and y values is:",r2_score(y,mymodel(x)))
print("unfortunately there no polynomial relation ship between x and y values ")
## and then we can use it to make predictions
speed = mymodel(17)
print("the speed of a 17 yeras old car is :",speed)
# In[ ]:
## we will try to use the multiple regression now
## it's when the input is not a single value as the simple one but instead a multiple values
## the values that we make our prediction based on
# yuppy we will use a real data set one
# we will predict the emission of the co2 based on multiple features we have
# In[61]:
import pandas as pd
car_data =pd.read_csv("C:/Users/asus/Downloads/cars.csv")
print(car_data)
## we will split the independant variables from the dependent ones
print(car_data.columns)
x = car_data[['Volume',"Weight"]]
y = car_data['CO2']
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(x,y)
## our model is ready now ## we will use it to make predictions
print("what the CO2 emission of your car if its volume is 2300 and its weigt is equal 1600")
print("the answer is :",model.predict([[2300,1600]]),"of co2")
## we will look then at the coefficient that determine the relation between two independant variables
print(model.coef_)
# In[73]:
## we will look at the scaling of our features that a lot of times can be very helpful for us when the data have diffenrent units and different size
## to scale our data we will use the standardization the value - mean/ standard deviation
import numpy as numpy
import pandas as pd
cars_data = pd.read_csv("C:/Users/asus/Downloads/cars.csv")
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
x = cars_data[["Weight","Volume"]]
y = cars_data["CO2"]
scalex = scale.fit_transform(x)
print(scalex)
## we define our model
model =linear_model.LinearRegression()
model.fit(scalex,y) ## when we scale our data we have to applied it as well when predicting values as well
scaled = scale.transform([[2300,1.3]])
predictedco2= model.predict([scaled[0]])
print("the predicted value of co2 is :",predictedco2)
# In[96]:
## then we will take a look to the train and test part
# it's a method to evaluate the accuracy of our model
## we will start with our data set
import numpy as np
import matplotlib.pyplot as plt
numpy.random.seed(2)
x =np.random.normal(3,1,100)
y = np.random.normal(150,40,100)/x
plt.xlabel("the time spent in the market")
plt.ylabel("the value of the purchase ")
plt.title("the shopping habits")
plt.scatter(x,y)
plt.show()
## we will split our data into a train and a test dataset
train_x=x[:80]
train_y = y[:80]
test_x = x[80:]
test_y =y[80:]
print("the plot of the training set")
plt.scatter(train_x,train_y)
plt.show()
print("the plot of the testing set")
plt.scatter(test_x,test_y)
plt.show()
# In[97]:
## i think the best model that fit our data well is a polynomial one
#let's create one
model = np.poly1d(np.polyfit(train_x,train_y,4)) #the degree of the polynome is 4
myline = np.linspace(0,6,100)
plt.scatter(train_x,train_y)
plt.plot(myline,model(myline))
plt.show()
## we see that our model is overfitting the data
## wa can look at the r squared coefficient to see if the model fit our data
from sklearn.metrics import r2_score
r2=r2_score(train_y,model(train_x))
print("the squred coefficient is equal to:",r2)
print("the model suits the data well on the training data ")
## we will use our model to the test data
coeff2 = r2_score(test_y,model(test_y))
print("now the coefficient on the testing data is :",coeff2)
print("the model fit the testing data as well but not as in the training set")
## then we will move to making predictions using our model
print("the purchase of a custumer that spent 5 minutes is about:",model(5))
# In[98]:
## we will try the decision tree as well
# the decision tree is made in order to make future decison based on our past aexperiences
# in this examples we will use the data of persons (attend to the show or not) based on the age the nationality the rank and theexperience
import pandas
from sklearn import tree
import pydotplus # i don't know how to download isn't already installed
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
## the decision tree handle just the numerical values only
d={"UK":0,"USA":1,'N':2}
df["nationality"] =df["nationality"].map(d)
a = {"yes":1,"no":2}
df["Go"] = df["Go"].map(a)
## we will seperate the target columns from the features ones
features = ["age","Experience","Rank","Natinality"]
x = df[features]
y = df["GO"]
## we will create our decision tree now and save it as a image
dtree = DecisionTreeClassifier()
dtree = dtree.fit(x,y)
data = tree.export_graphviz(dtree,out_file=None,feature_names=features)
graph= pydotplus.graph_from_dot_data(data)
graph.write_png('mydecisiontree.png')
img= pltimg.imread("mydecisiontree.png')
imgplot = plt.imshow(img)
plt.show()
## we will use our model to make prediction
# print(dtree([[the number of the features]])