Skip to content

Commit 29dacc9

Browse files
authored
Add random_state argument to HyperparameterOptimization class to ensure reproducibility of results. - Fixes for #125 (#131)
* set seed for numpy * add a random_state argument to hyperparameter optimization module * add random state for hyperopt fmin function * add rstate for hyperparameter optimization and add a test for it * fixed MR comments * fix attempt for ci cd * version update for a minor fix release --------- Authored by: Panagiotis Papaemmanouil <[email protected]> Review changes authored by: @sayanchk
1 parent 9471eb3 commit 29dacc9

File tree

6 files changed

+48
-11
lines changed

6 files changed

+48
-11
lines changed

luminaire/optimization/hyperparameter_optimization.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from hyperopt import fmin, tpe, hp, STATUS_OK
22
from luminaire.model import LADStructuralModel, LADStructuralHyperParams, LADFilteringModel, LADFilteringHyperParams
33
from luminaire.exploration.data_exploration import DataExploration
4+
from luminaire.utils.random_state_validation import check_random_state
45
import warnings
56
warnings.filterwarnings('ignore')
67

7-
88
class HyperparameterOptimization(object):
99
"""
1010
Hyperparameter optimization for LAD outlier detection configuration for batch data.
@@ -20,6 +20,7 @@ class HyperparameterOptimization(object):
2020
:param int min_ts_length: The minimum required length of the time series for training. The input time series will be
2121
truncated if the length is greater than this value.
2222
:param int scoring_length: Number of innovations to be scored after training window with respect to the frequency.
23+
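:param int random_state: Seed for the random number generation used during optimization (synthetic anomaly sampling, Kalman smoothing and the hyperopt search), so that results can be reproduced. Defaults to None (unseeded).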
2324
2425
.. _Pandas offset: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
2526
"""
@@ -31,6 +32,7 @@ def __init__(self,
3132
max_ts_length=None,
3233
min_ts_length=None,
3334
scoring_length=None,
35+
random_state=None,
3436
**kwargs):
3537
self._target_metric = 'raw'
3638
self.freq = freq
@@ -48,6 +50,8 @@ def __init__(self,
4850
self.scoring_length = scoring_length or (scoring_length_dict.get(freq)
4951
if freq in scoring_length_dict.keys() else 30)
5052

53+
self.random_state = random_state
54+
5155
def _mape(self, actuals, predictions):
5256
"""
5357
This function computes the mean absolute percentage error for the observed vs the predicted values.
@@ -93,7 +97,8 @@ def _synthetic_anomaly_check(self, observation, prediction, std_error):
9397

9498
# Anomaly detection based on synthetic anomalies generated through a given intensity list
9599
for prop in self.anomaly_intensity_list:
96-
trial_prob = np.random.uniform(0, 1, 1)
100+
rnd = check_random_state(self.random_state)
101+
trial_prob = rnd.uniform(0, 1, 1)
97102
if trial_prob < 0.4:
98103
synthetic_value = observation + (prop * observation)
99104
anomaly_flags.append(1)
@@ -227,7 +232,8 @@ def _objective_part(self, data, smoothed_series, args):
227232
anomaly_probabilities_list = []
228233
local_model = copy.deepcopy(stable_model)
229234
for i, row in scoring_data.iterrows():
230-
trial_prob = np.random.uniform(0, 1, 1)
235+
rnd = check_random_state(self.random_state)
236+
trial_prob = rnd.random.uniform(0, 1, 1)
231237
observed_value = row.raw
232238
synthetic_actual = observed_value
233239
if trial_prob < 0.4:
@@ -263,7 +269,7 @@ def _optimize(self, data, objective_part, algo=tpe.suggest, max_evals=50):
263269
:return: Optimal hyperparameters
264270
:rtype: dict
265271
"""
266-
272+
import numpy as np
267273
from functools import partial
268274
from pykalman import KalmanFilter
269275

@@ -288,7 +294,7 @@ def _optimize(self, data, objective_part, algo=tpe.suggest, max_evals=50):
288294

289295
try:
290296
series = data[self._target_metric].values
291-
kf = KalmanFilter()
297+
kf = KalmanFilter(random_state=self.random_state)
292298
smoothed_series, cov_series = kf.em(series).smooth(series)
293299
except:
294300
raise ValueError('Kalman Smoothing requires more than one data point')
@@ -299,7 +305,7 @@ def _optimize(self, data, objective_part, algo=tpe.suggest, max_evals=50):
299305
raise ValueError('Only `detection_type=OutlierDetection` is supported in hyperparameter optimization right now')
300306

301307
# Calling the optimization function
302-
hyper_param = fmin(objective, space=space, algo=algo, max_evals=max_evals, show_progressbar=True)
308+
hyper_param = fmin(objective, space=space, algo=algo, max_evals=max_evals, show_progressbar=True, rstate=np.random.default_rng(self.random_state))
303309
hyper_param['LuminaireModel'] = hyper_param_list[hyper_param['LuminaireModel']]['model']
304310
if 'max_ft_freq' in hyper_param:
305311
hyper_param['max_ft_freq'] = hyper_param['max_ft_freq'] + 2

luminaire/tests/test_hyper.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,16 @@
22

33
class TestHyperparameterOptimization(object):
44

5-
def test_run(self, test_data_with_missing):
6-
5+
def test_run1(self, test_data_with_missing):
6+
"""Test using the default random_state=None"""
77
hyper_obj = HyperparameterOptimization(freq='D', detection_type='OutlierDetection')
88
hyper_parameters = hyper_obj.run(test_data_with_missing, max_evals=5)
99

1010
assert isinstance(hyper_parameters, dict)
11+
12+
def test_run2(self, test_data_with_missing):
13+
"""Test defining a random_state"""
14+
hyper_obj = HyperparameterOptimization(freq='D', detection_type='OutlierDetection', random_state=42)
15+
hyper_parameters = hyper_obj.run(test_data_with_missing, max_evals=5)
16+
17+
assert isinstance(hyper_parameters, dict)

luminaire/utils/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .random_state_validation import check_random_state
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import numpy as np
2+
import numbers
3+
4+
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    :param seed: Seed for the random state.
        If seed is None (or the ``np.random`` module itself), the RandomState singleton used by
        ``np.random`` is returned.
        If seed is an int, a new RandomState instance seeded with seed is returned.
        If seed is already a RandomState instance, it is returned unchanged.
    :type seed: None, int or np.random.RandomState
    :return: The RandomState instance corresponding to ``seed``.
    :rtype: np.random.RandomState
    :raises ValueError: If ``seed`` is none of the supported types.
    """
    if seed is None or seed is np.random:
        # Private numpy attribute: the global RandomState instance behind the np.random.* functions.
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Wrap seed in a 1-tuple so that a tuple-valued seed is formatted as a single value rather than
    # being unpacked as format arguments (which would raise TypeError instead of ValueError).
    raise ValueError(
        "%r cannot be used to seed a numpy.random.RandomState instance" % (seed,)
    )

requirements.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ bayescd>=0.4
22
changepy>=0.3.1
33
hyperopt>=0.1.2
44
numpy>=1.17.5, <=1.22.4
5-
pandas>=0.25.3
5+
pandas>=0.25.3, <=2.0.3
66
pykalman>=0.9.5
77
scipy>=1.6.0
8-
statsmodels>=0.13.0
8+
statsmodels>=0.13.0, <=0.13.5
99
scikit-learn>=0.24.2
1010
decorator>=5.1.0

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
setup(
1616
name='luminaire',
17-
version='0.4.2',
17+
version='0.4.3',
1818

1919
license='Apache License 2.0',
2020

0 commit comments

Comments
 (0)