-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlinear_regression.py
123 lines (100 loc) · 5.07 KB
/
linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from __future__ import annotations
import matplotlib.pyplot as plt
from pandas import DataFrame
from generator import Triplet
from gooddata_pandas import DataFrameFactory
from statsmodels.stats.diagnostic import het_white, normal_ad
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm
import seaborn as sns
class MyLinearRegression:
    """Simple (one predictor) OLS regression with checks of its assumptions.

    The data frame is obtained from ``df_factory.not_indexed(...)`` for the
    given ``triplet``. Column 1 of that frame is used as the predictor ``X``,
    column 2 as the response ``Y`` and the name of column 0 is only used in
    the plot title.
    NOTE(review): presumably the frame always has exactly these three
    columns (attribute, metric, metric) - confirm against ``Triplet``.

    After :meth:`check_assumptions` has run successfully:

    * ``model`` holds the fitted statsmodels OLS results,
    * ``assumptions`` maps assumption name -> raw statistic(s),
    * ``assumptions_results`` maps assumption name -> ``bool`` verdict.
    """

    def __init__(self, df_factory: DataFrameFactory, triplet: Triplet):
        self.df_factory = df_factory
        self.triplet = triplet
        self.data_frame = df_factory.not_indexed(triplet.as_computable_dictionary)
        # statsmodels expects 2-D column vectors, hence the reshape.
        self.X = self.data_frame.iloc[:, 1].values.reshape(-1, 1)
        self.Y = self.data_frame.iloc[:, 2].values.reshape(-1, 1)
        # Fitted OLS results; stays None until check_assumptions() fits it.
        self.model = None
        # Assumption name -> bool verdict.
        self.assumptions_results = dict()
        # Assumption name -> raw statistic(s) returned by the tests.
        self.assumptions = dict()

    def check_assumptions(self,
                          sample_size=10,
                          linearity_threshold=0.7,
                          homoscedasticity_threshold=0.05,
                          independence_interval=None,
                          normality_threshold=0.05):
        """
        Fit the OLS model and evaluate the four regression assumptions.

        Nothing is fitted or stored when the data frame contains a missing
        value or does not have more than ``sample_size`` rows.

        :param sample_size: the frame must have strictly more rows than this
        :param linearity_threshold: minimal absolute Pearson correlation
        :param homoscedasticity_threshold: minimal p-value of White's test
        :param independence_interval: open interval ``[low, high]`` for the
            Durbin-Watson statistic; defaults to ``[1.5, 2.5]``
        :param normality_threshold: minimal p-value of the Anderson-Darling test
        :return: None; results are stored in ``assumptions`` and
            ``assumptions_results``
        """
        if independence_interval is None:
            independence_interval = [1.5, 2.5]

        has_missing_values = self.data_frame.isnull().values.any()
        enough_rows = self.data_frame.shape[0] > sample_size
        if has_missing_values or not enough_rows:
            return

        # Add the intercept column so that the model is y = a + b * x.
        design_matrix = sm.add_constant(self.X)
        self.model = sm.OLS(self.Y, design_matrix).fit()

        self.assumptions["linearity_assumption"] = self.linearity_assumption()
        self.assumptions["homoscedasticity_assumption"] = self.homoscedasticity_assumption()
        self.assumptions["independence_assumption"] = self.independence_assumption()
        self.assumptions["normality_assumption"] = self.normality_assumption()

        self._evaluate_assumptions(linearity_threshold,
                                   homoscedasticity_threshold,
                                   independence_interval,
                                   normality_threshold)

    def _evaluate_assumptions(self,
                              linearity_threshold,
                              homoscedasticity_threshold,
                              independence_interval,
                              normality_threshold) -> None:
        """
        Turn the raw statistics in ``assumptions`` into boolean verdicts.

        Verdicts are written to ``assumptions_results`` as plain ``bool``
        values. A NaN statistic always yields ``False``.
        """
        statistics = self.assumptions
        correlation = self._correlation_strength(statistics["linearity_assumption"])
        white_p_value = statistics["homoscedasticity_assumption"]["p-value"]
        durbin_watson_value = statistics["independence_assumption"]
        # normal_ad returns (statistic, p-value); only the p-value is compared.
        normality_p_value = statistics["normality_assumption"][1]

        self.assumptions_results["linearity_assumption"] = bool(
            linearity_threshold < correlation)
        self.assumptions_results["homoscedasticity_assumption"] = bool(
            white_p_value > homoscedasticity_threshold)
        self.assumptions_results["independence_assumption"] = bool(
            independence_interval[0] < durbin_watson_value < independence_interval[1])
        self.assumptions_results["normality_assumption"] = bool(
            normality_p_value > normality_threshold)

    @staticmethod
    def _correlation_strength(correlation_matrix: DataFrame) -> float:
        """
        Return the absolute correlation between the two variables.

        The off-diagonal cell is read directly. Sorting the flattened matrix
        and taking the first element would pick a diagonal 1.0 whenever the
        correlation is NaN (constant column), because NaN is sorted last.
        """
        return abs(float(correlation_matrix.iloc[0, 1]))

    @property
    def valid_assumptions(self) -> bool:
        """True only when the checks have run and every assumption holds."""
        return bool(self.assumptions_results) and all(self.assumptions_results.values())

    @property
    def valid_assumptions_count(self) -> int:
        """Number of assumptions whose verdict is True."""
        return list(self.assumptions_results.values()).count(True)

    def linearity_assumption(self) -> DataFrame:
        """
        Assumption: The existence of a linear relationship between variables.
        _____________________________________________________________________
        Pearson correlation coefficient is used.
        :return: 2x2 correlation matrix of the second and the third column
        """
        # Explicitly select the second and the third columns (X and Y).
        return self.data_frame.iloc[:, 1:3].corr()

    def homoscedasticity_assumption(self) -> dict[str, float]:
        """
        Assumption: The variance of residual is the same for any value of X.
        ________________________________________________________
        White's test is used.
        Null (H0): Homoscedasticity is present (residuals are equally scattered)
        When the p-value is not below the threshold, H0 is not rejected and
        the assumption is considered satisfied.
        :return: the four values returned by ``het_white`` keyed by label
        """
        labels = ['Test Statistic', 'p-value', 'F-Statistic', 'F-Test p-value']
        test_values = het_white(self.model.resid, self.model.model.exog)
        return dict(zip(labels, test_values))

    def independence_assumption(self) -> float:
        """
        Assumption: Residuals are independent of each other.
        ____________________________________________________
        Durbin-Watson statistic is used.
        The value should be inside interval 1.5 < x < 2.5
        :return: Durbin-Watson statistic of the model residuals
        """
        return durbin_watson(self.model.resid)

    def normality_assumption(self) -> tuple[float, float]:
        """
        Assumption: Residuals are normally distributed.
        ____________________________________________________
        Anderson-Darling test is used.
        Residuals are normally distributed if p_value > p_value_threshold. p_value < 0.05 is no go.
        :return: pair (test statistic, p-value) as returned by ``normal_ad``
        """
        return normal_ad(self.model.resid)

    def visualize(self):
        """Show a regression plot of Y on X; does nothing before a model is fitted."""
        if self.model is None:
            return
        columns = self.data_frame.columns
        plot = sns.regplot(x=columns[1], y=columns[2], data=self.data_frame)
        plot.set(title=f"Using attribute {columns[0]}")
        plt.show()

    def __eq__(self, other: object) -> bool:
        """Two regressions are equal when they were built from equal triplets."""
        if not isinstance(other, self.__class__):
            return False
        return self.triplet == other.triplet

    def __hash__(self) -> int:
        # Defining __eq__ alone makes instances unhashable; hash the same
        # field that equality compares so that equal objects hash equally.
        return hash(self.triplet)

    def __str__(self) -> str:
        return self.__repr__()

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}{self.triplet}"