-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathML_grad_desc.py
executable file
·99 lines (68 loc) · 2.42 KB
/
ML_grad_desc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 15:29:14 2018
@author: bh387886
"""
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
def Grad_desc(X, y, m=0
,iters=10000, alpha=0.001):
N = len(y)
print('Iterations: %s, Learning Rate: %s' % (iters,alpha))
c = (float(random.randint(0,30))/100)
for i in range(iters):
m_grad = 0
c_grad = 0
for j in range(N):
m_grad += -(y[j]-(m*X[j]+c))*X[j]/N
c_grad += -(y[j]-(m*X[j]+c))/N
m = m - alpha*m_grad
c = c - alpha*c_grad
cost = 0
for j in range(N):
cost += (y[j]-(m*X[j]+c))**2
return [m, c, cost/(2*N)]
headers = ['cylinders','displacement','horsepower','weight'
,'acceleration','model year','origin']
raw = open('auto-mpg.data.txt')
mpg = [] #Dependent variable : mpg
features = [] #Independent variables: rest of them except car name
#Preparing the data
for line in raw:
i=0
var = []
for y in line.split():
if (i==0):
mpg.append([float(y)])
if (i<8 and i>0):
if(y=='?'): # Handaling missing values for horsepower
y = 0 # (Setting them to 0)
if(i==4):
var.append(float(y.replace('.','')))
else:
var.append(float(y))
i+=1
features.append(var)
df_x = pd.DataFrame.from_records(features, columns = headers)
df_y = pd.DataFrame.from_records(mpg, columns = ['mpg'])
#replacing 0(missing) horsepower with average horsepower
avg_bhp = (np.average(df_x['horsepower'])*len(df_x))/(len(df_x)-6)
df_x['horsepower'] = df_x['horsepower'].replace(0,avg_bhp)
x_org = df_x['horsepower'] #change this if you want another dependent var
y_org = df_y['mpg']
#Feature Scaling (Reducing the )
df_new1 = (x_org-x_org.min())/(x_org.max()-x_org.min())
df_y1 = (y_org - y_org.min())/(y_org.max() - y_org.min())
ans = Grad_desc(df_new1.tolist(),df_y1.tolist())
y_ans = ans[0]*df_new1 + ans[1]
#Restoring the scaled features
y_ans = y_ans*(y_org.max() - y_org.min()) + y_org.min()
ans[1] = ans[1]*(y_org.max() - y_org.min()) + y_org.min()
ans[0] = (y_ans[1]-y_ans[0])/(x_org[1]-x_org[0])
print('ERROR = %s' % ans[2])
print('Equation of best-fit-line: y=%sx + %s' % (ans[0],ans[1]))
plt.scatter(x_org, df_y, color='black')
plt.plot(x_org, y_ans, color='blue', linewidth=3)