-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLinear_Regression_code.py
More file actions
148 lines (112 loc) · 4.72 KB
/
Linear_Regression_code.py
File metadata and controls
148 lines (112 loc) · 4.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-
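"""Untitled13.ipynb

Sales forecasting with linear regression: builds lag and rolling-window
features for the store 1 / item 1 sales series, fits a LinearRegression on
data up to 2017-09-30 and plots forecasts for 2017-10-01 onward.

Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1mVogyEwZ56lzTCW9NevzCxC-ewNG_jSP
"""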
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as rmse
from sklearn.metrics import mean_absolute_percentage_error as mape
# --- Load, clean and filter the raw sales data -------------------------------
df = pd.read_csv('/content/data_sales.csv')
df.head(10)  # notebook preview; has no visible effect when run as a script
df.info()

# Columns 'f0' and 'f1' are discarded before any further processing.
df = df.drop(columns=['f0', 'f1'])
df.sample(5)

# Dates in the CSV are written day-first (dd-mm-YYYY).
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
df.describe()

# Fill missing values with pandas' default interpolation.
# NOTE(review): this runs over the whole frame in file order, before the
# store/item filter below -- confirm neighbouring rows belong to the same
# series, otherwise gaps are filled from a different store/item.
df = df.set_index('date').interpolate()
df = df.reset_index(drop=False)
df.sample(5)

df = df.sort_values(by='date', ascending=True)

# Keep only the store 1 / item 1 series. The explicit copy() makes df an
# independent frame, so the column assignments that follow are not chained
# assignments on a slice of the full data set.
# 'date' is already datetime64 from the parse above, so no second
# pd.to_datetime() call is needed.
df = df[(df['store'] == 1) & (df['item'] == 1)].copy()
# Calendar features derived from 'date', used for EDA and as regression inputs.
date_parts = df['date'].dt
for part in ('year', 'month', 'day', 'weekday'):
    df[part] = getattr(date_parts, part)

# dt.weekday numbers Monday as 0; recode that 0 as 7 and leave other days as-is.
is_zero_day = df['weekday'] == 0
df['weekday'] = np.where(is_zero_day, 7, df['weekday'])
# Hold out the last three months of 2017 as the test period.
# Label-based slicing on the date index is inclusive at both ends.
temp_df = df.set_index('date')
train_window = slice(None, '2017-09-30')
test_window = slice('2017-10-01', None)
train_df = temp_df.loc[train_window].reset_index()
test_df = temp_df.loc[test_window].reset_index()
train_df.head()  # notebook preview
# --- Autoregressive features --------------------------------------------------
# lag_1 .. lag_7: sales value of the 1..7 preceding rows.
for i in range(1, 8):
    lag_i = f'lag_{i}'
    df[lag_i] = df['sales'].shift(i)

# Rolling 7-row statistics over PAST sales only.
# 'sales' is the regression target, so a window that includes the current row
# would leak the target into the features. shift(1) moves the window back so
# it covers the 7 rows before the current one. The first complete window
# lines up with the first valid lag_7 value, so dropna() below discards the
# same warm-up rows as before.
past_sales = df['sales'].shift(1).rolling(window=7)
df['rolling_mean'] = past_sales.mean()
df['rolling_max'] = past_sales.max()
df['rolling_min'] = past_sales.min()

# Drop the warm-up rows that have no complete lag/rolling history.
df = df.dropna(how='any')

# 'store' and 'item' are removed so they do not enter the regression inputs.
df = df.drop(['store', 'item'], axis=1)
# Train on everything up to 2017-09-30; test on 2017-10-01 onward.
df = df.set_index('date')
reg_train_df, reg_test_df = df.loc[:'2017-09-30'], df.loc['2017-10-01':]

# Inputs are every column except the target; targets are plain value arrays.
X_train = reg_train_df.drop(columns=['sales'])
X_test = reg_test_df.drop(columns=['sales'])
y_train = reg_train_df['sales'].values
y_test = reg_test_df['sales'].values
# Score every candidate feature against the target with a univariate
# F-test (f_regression) and print the five highest-scoring ones.
top_features = SelectKBest(score_func=f_regression, k=5)
fit = top_features.fit(X_train, y_train)

# One row per feature: its name next to its score.
feature_scores = pd.DataFrame({
    'Feature': X_train.columns,
    'Score': fit.scores_,
})
print(feature_scores.nlargest(5, 'Score'))
# Correlation heatmap of the training columns.
corr = reg_train_df.corr()
fig = plt.figure(figsize=(10, 7))
sns.heatmap(corr, linewidths=.5)

# Scatter plots of sales against four features, two panels per figure.
scatter_pages = (
    (
        ('rolling_mean', 'Linear relationship between sales and rolling_mean of sales'),
        ('rolling_max', 'Linear relationship between sales and rolling_max of sales'),
    ),
    (
        ('rolling_min', 'Linear relationship between sales and rolling_min of sales'),
        ('lag_7', 'Linear relationship between sales and lag_7 of sales'),
    ),
)
for page in scatter_pages:
    fig, axs = plt.subplots(ncols=2, figsize=(14, 7))
    for ax, (feature, title) in zip(axs, page):
        sns.scatterplot(x=reg_train_df[feature], y=reg_train_df['sales'], ax=ax)
        ax.set(title=title)
plt.show()
# Restrict both splits to the same feature subset. The list is defined once
# so the train and test columns cannot drift apart.
selected_features = ['rolling_mean', 'rolling_max', 'rolling_min', 'lag_7', 'lag_1']
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# fit model
model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Collect actuals, predictions and signed errors (prediction minus actual).
# copy() makes errors_df an independent frame, so adding columns below is not
# a chained assignment on a selection of reg_test_df.
errors_df = reg_test_df[['sales']].copy()
errors_df['pred_sales'] = preds
errors_df['errors'] = preds - y_test
errors_df.insert(0, 'model', 'LinearRegression')
# Visual evaluation, figure 1: training history, held-out actuals and forecast.
fig = plt.figure(figsize=(14, 7))
for frame, column, label in (
    (reg_train_df, 'sales', 'Train'),
    (reg_test_df, 'sales', 'Test'),
    (errors_df, 'pred_sales', 'Forecast - Linear Regression'),
):
    plt.plot(frame.index, frame[column], label=label)
plt.legend(loc='best')
plt.xlabel('date')
plt.ylabel('sales')
plt.title('Forecasts using Linear Regression model')
plt.show()

# Figure 2: test period only -- errors next to actual and forecast sales.
fig = plt.figure(figsize=(14, 7))
for column, label in (
    ('errors', 'errors'),
    ('sales', 'actual sales'),
    ('pred_sales', 'forecast'),
):
    plt.plot(errors_df.index, errors_df[column], label=label)
plt.legend(loc='best')
plt.xlabel('date')
plt.ylabel('sales')
plt.title('Linear Regression forecasts with actual sales and errors')
plt.show()