Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/hanbo-e/wimlds_emissions in…
Browse files Browse the repository at this point in the history
…to main
  • Loading branch information
Guli-Y committed Jun 9, 2021
2 parents 3ddd08f + 225629a commit d1a2f2f
Show file tree
Hide file tree
Showing 3 changed files with 680 additions and 1,002 deletions.
145 changes: 92 additions & 53 deletions emissions/impsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,22 @@

class ImpSearch():
"""
this class is built to used to facilitate analysis for answering following question:
this class is built to facilitate analysis for answering following question:
How different {year} could have been with implementation of our solution?
It is a bit similar to GridSearch.
What ImpSearch does:
For each param, it trains the model and implement our solution on test year
and then collects the corresponding pollution quantity. After finishing this for all
possible params, it will select the params that gave the smallest pollution quantity
as best param and plot the implementation outcome for that year
1. For each param,
it trains the model, performs implement analysis on test year
and then collects total pollution quantity in that year caused by vehicles that failed the test.
2. After finishing the above steps for all possible params,
it will select the params that gave the smallest pollution quantity
as best param and plot the implementation outcome for that year
check out notebooks/what_if_2020.ipynb for usage
"""
cols = ['VEHICLE_AGE', 'MILE_YEAR', 'MAKE', 'MODEL_YEAR', 'ENGINE_WEIGHT_RATIO']

cols = ['VEHICLE_AGE', 'MILE_YEAR', 'MAKE',
'MODEL_YEAR', 'ENGINE_WEIGHT_RATIO']
cat_col = ['MAKE']

def __init__(self):
"""
"""
Expand Down Expand Up @@ -52,7 +57,7 @@ def load_data(self):

def train_test_split(self, year):
'''
splits the data into train and test sets using the given year
for a given year, splits data to train (before that year) and test (in that year)
'''
train = self.df[self.df.TEST_SDATE.dt.year < year].sort_values('TEST_SDATE')
test = self.df[self.df.TEST_SDATE.dt.year == year].sort_values('TEST_SDATE')
Expand All @@ -67,7 +72,7 @@ def get_estimator(self, depth):
'''
uses Trainer class from trainer.py to get the fitted estimator
prints the evaluation scores
plots learning curve
if you want to plot learning curve, uncomment the last line
'''
trainer = Trainer(self.X_train[self.cols],
self.y_train,
Expand Down Expand Up @@ -97,38 +102,57 @@ def get_counter_table(self):
df.drop(columns=['count_fail', 'count_test', 'TEST_SDATE'], inplace=True)
return df

def count_fails_captured(self,
                         data,
                         precision,
                         predicted_fails,
                         true_fails):
    """
    Estimate how many failing vehicles have been captured after each test.

    params:
    data: counter table with an n_tests column (cumulative number of tests);
          it is copied, the caller's frame is not modified
    precision: precision score on test set
    predicted_fails: number of fails predicted
    true_fails: number of predicted failed vehicles that actually fail the test
    returns a pandas series: number of fails captured along the year

    Also reads self.total_fails (total number of failed tests in the year)
    whenever some tests remain after the predicted fails.
    """
    df = data.copy()
    # first test all the predicted fails: each test captures `precision` fails on average
    df['fails_captured'] = df.n_tests * precision
    # rows reached only after every predicted fail has been tested
    after_predicted = df.n_tests > predicted_fails
    tests_left = int(after_predicted.sum())
    # when nothing is left after the predicted fails there is nothing to
    # spread, and dividing by tests_left would raise ZeroDivisionError
    if tests_left:
        fails_left = self.total_fails - true_fails
        avg_fail_per_test = fails_left / tests_left
        # remaining fails are assumed to be spread evenly over the remaining tests
        df.loc[after_predicted, 'fails_captured'] = (
            true_fails
            + (df.loc[after_predicted, 'n_tests'] - predicted_fails) * avg_fail_per_test
        )
    return df.fails_captured
def plot_heuristic_curve(self, df):
    '''
    this function is used in plot_simulation_curve():
    it plots the cumulative number of failed vehicles along the year with heuristic decision
    The heuristic decision is that we predict all the vehicles with age > 16 fail the emissions test

    params:
    df: counter table with n_tests, n_fails and dayofyear columns; it must have
        as many rows as self.y_test because the predictions are re-indexed by df.n_tests
        (index is presumably the test date, it is printed as a date below - confirm)
    returns df extended with y_true, y_pred, fails_captured_heuristic and fails_left_heuristic

    side effects: draws on the current matplotlib figure, prints two summaries,
    sets self.total_predicted_fails['heuristic'] and self.pollutions['heuristic']
    '''
    # get heuristic prediction
    y_pred = (self.X_test.VEHICLE_AGE > 16).astype('int')
    predicted_fails = y_pred.sum()
    self.total_predicted_fails['heuristic'] = predicted_fails
    # create df with prediction outcomes, vehicles predicted to fail are tested first
    pred_df = pd.DataFrame.from_dict({'y_true': self.y_test, 'y_pred': y_pred})
    pred_df = pred_df.sort_values('y_pred', ascending=False)
    # merge the prediction with counter table
    pred_df.index = df.n_tests
    df = df.merge(pred_df, how='left', left_on='n_tests', right_index=True)
    # add new columns storing number of failed test captured along the year
    df['fails_captured_heuristic'] = df.y_true.cumsum()
    df['fails_left_heuristic'] = self.total_fails - df['fails_captured_heuristic']
    plt.plot(df.index,
             df['fails_captured_heuristic'],
             label='with heuristic')
    # the time when all the tests predicted to fail by the heuristic are completed
    at_predicted = df[df.n_tests == predicted_fails]
    t = at_predicted.index[0]
    # .iloc[0] instead of [0]: the index holds dates, so an integer key would
    # rely on the deprecated positional fallback of Series.__getitem__
    cap_fails = at_predicted['fails_captured_heuristic'].iloc[0]
    true_fails = at_predicted.n_fails.iloc[0]
    print('\n With heuristic decision:')
    print(f'\nBy the time {str(t)[:10]},\n'
          f'- {round(true_fails)} vehicles were off the road in reality\n'
          f'- {round(cap_fails)} vehicles could have been off the road with heuristic decision')
    # what about dayofyear = 100
    at_day_100 = df[df.dayofyear == 100]
    t2 = at_day_100.index[0]
    at_t2 = df[df.index == t2]
    cap_fails2 = at_t2['fails_captured_heuristic'].iloc[0]
    true_fails2 = at_t2.n_fails.iloc[0]
    print(f'\nBy the time {str(t2)[:10]},\n'
          f'- {round(true_fails2)} vehicles were off the road in reality\n'
          f'- {round(cap_fails2)} vehicles could have been off the road with heuristic decision')
    # store the pollution quantity in pollution
    self.pollutions['heuristic'] = df['fails_left_heuristic'].sum()
    return df

def plot_simulation_curve(self, year, df):
'''
for given year, plots:
1. how many failed tests were detected along that year
2. how many failed tests could have been deen detected along that year if our solution is implemented
1. cumulative number of failed tests in reality along that year
2. cumulative number of failed tests along that year if our solution was implemented
3. cumulative number of failed tests along that year if heuristic decision was made
'''
# find the best max_depth and ploting
plt.figure(figsize=(10, 6))
plt.figure(figsize=(10, 5))
plt.plot(df.index,
df.n_fails,
label='with current policy', c='red')
Expand All @@ -139,16 +163,20 @@ def plot_simulation_curve(self, year, df):
trainer = self.get_estimator([depth])
# get all the numbers
y_pred = trainer.search_result.predict(self.X_test[self.cols])
precision = precision_score(self.y_test, y_pred)
true_fails = sum([i for i, j in zip (y_pred, self.y_test) if i + j == 2])
predicted_fails = y_pred.sum()
self.total_predicted_fails[f'{depth}'] = predicted_fails
# sort testing order by proba
y_proba = trainer.search_result.predict_proba(self.X_test[self.cols])
pred_df = pd.DataFrame.from_dict({'y_true':self.y_test,
'y_pred':y_pred,
'y_proba':y_proba[:,1]})
pred_df = pred_df.sort_values('y_proba', ascending=False)
pred_df.index = df.n_tests
df = df.merge(pred_df, how='left', left_on='n_tests', right_index=True)
# add new columns storing number of failed test captured along the year
df[f'fails_captured_{depth}'] = self.count_fails_captured(df,
precision,
predicted_fails,
true_fails
)
df[f'fails_captured_{depth}'] = df.y_true.cumsum()
df.drop(columns=['y_true', 'y_pred', 'y_proba'], inplace=True)
df[f'fails_left_{depth}'] = self.total_fails - df[f'fails_captured_{depth}']
plt.plot(df.index,
df[f'fails_captured_{depth}'],
Expand All @@ -160,15 +188,17 @@ def plot_simulation_curve(self, year, df):
print(f'''\nBy the time {str(t)[:10]},
- {round(true_fails)} vehicles were off the road in reality
- {round(cap_fails)} vehicles could have been off the road using model max_depth = {depth}''')
# what about dayofyear = 182
t2 = df[df.dayofyear==182].index[0]
# what about dayofyear = 100
t2 = df[df.dayofyear==100].index[0]
cap_fails2 = df[df.index==t2][f'fails_captured_{depth}'][0]
true_fails2 = df[df.index==t2].n_fails[0]
print(f'''\nBy the time {str(t2)[:10]},
- {round(true_fails2)} vehicles were off the road in reality
- {round(cap_fails2)} vehicles could have been off the road using model max_depth = {depth}''')
# store the pollution quantity in pollution
self.pollutions[depth] = df[f'fails_left_{depth}'].sum()
# plot the curve of heuristic decision
df = self.plot_heuristic_curve(df)
# fill in the area corresponding to total pollution
plt.fill_between(df.index,
df.n_fails,
Expand All @@ -185,10 +215,16 @@ def plot_simulation_curve(self, year, df):
return df

def implement(self, year, n_estimators=[1], max_depth=[2, 3]):
self.max_depth = max_depth
self.n_estimators = n_estimators
'''
params:
year: split train (before that year) and test (in that year)
max_depth and n_estimators: hyper params for grid search with random forest classifier
'''
# train set split
self.train_test_split(year)
# get update the class attributes
self.max_depth = max_depth
self.n_estimators = n_estimators
# get counter table
df = self.get_counter_table()
# plotting simulation curve for each max_depth
Expand All @@ -199,11 +235,15 @@ def implement(self, year, n_estimators=[1], max_depth=[2, 3]):
self.best_depth = tmp.index[0]
# save df
self.anaylsis_table = df
df.to_csv(f'../data/implementation_analysis_{year}_best_depth_{self.best_depth}.csv')
print(f'\nSaved implementation_analysis_{year}_best_depth_{self.best_depth}.csv in data folder')
df.to_csv(f'../data/implementation_analysis_{year}_best_{self.best_depth}.csv')
print(f'\nSaved implementation_analysis_{year}_best_{self.best_depth}.csv in data folder')
self.year = year

def plot_clean(self):
'''
similar to plot simulation method,
only that this one plots the curve created by the best max_depth
'''
df = self.anaylsis_table
plt.figure(figsize=(10, 6))
# reality curve
Expand All @@ -215,21 +255,20 @@ def plot_clean(self):
df[f'fails_captured_{self.best_depth}'],
label='with our solution', c='green')

# mark the time when all the predicted fail tests are completed
pf = self.total_predicted_fails[str(self.best_depth)]
col = f'fails_captured_{self.best_depth}'
t = df[df.n_tests==pf].index[0]
# in simulated curve
t = df[df.dayofyear==100].index[0]
# horizontal grey line - simulated curve
cap_fails = df[df.index==t][col].values[0]
plt.plot([t for i in range(100)], np.linspace(0, cap_fails, 100), c='grey')
plt.plot(df[df.index < t].index,
[cap_fails for i in range(df[df.index < t].shape[0])],
c='grey')
# in reality curve
# horizontal grey line - reality curve
true_fails = df[df.index==t].n_fails.values[0]
plt.plot(df[df.index < t].index,
[true_fails for i in range(df[df.index < t].shape[0])],
c='grey')
# vertical grey line
plt.plot([t for i in range(100)], np.linspace(0, cap_fails, 100), c='grey')
# fill in the area corresponding to reduced pollution
plt.fill_between(df.index,
df.n_fails,
Expand Down
Loading

0 comments on commit d1a2f2f

Please sign in to comment.