Add random_state argument to HyperparameterOptimization class to ensure reproducibility of results. - Fixes for #125 (#131)

sayanchk · web-flow · commit 29dacc918648 · 2024-01-31T11:05:51.000-08:00
* set seed for numpy
* add a random_state argument to hyperparameter optimization module
* add random state for hyperopt fmin function
* add rstate for hyperparameter optimization and add a test for it
* fixed MR comments
* fix attempt for CI/CD
* version update for a minor fix release

---------

Authored by: Panagiotis Papaemmanouil <papaemman.pan@gmail.com>
Review changes authored by: @sayanchk
diff --git a/luminaire/optimization/hyperparameter_optimization.py b/luminaire/optimization/hyperparameter_optimization.py
@@ -1,10 +1,10 @@
 from hyperopt import fmin, tpe, hp, STATUS_OK
 from luminaire.model import LADStructuralModel, LADStructuralHyperParams, LADFilteringModel, LADFilteringHyperParams
 from luminaire.exploration.data_exploration import DataExploration
+from luminaire.utils.random_state_validation import check_random_state
 import warnings
 warnings.filterwarnings('ignore')
 
-
 class HyperparameterOptimization(object):
     """
     Hyperparameter optimization for LAD outlier detection configuration for batch data.
@@ -20,6 +20,7 @@ class HyperparameterOptimization(object):
     :param int min_ts_length: The minimum required length of the time series for training. The input time series will be
         truncated if the length is greater than this value.
     :param int scoring_length: Number of innovations to be scored after training window with respect to the frequency.
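+    :param int random_state: Seed used for the synthetic anomaly sampling, the Kalman smoothing and the hyperopt search so that results are reproducible; None (default) leaves them unseeded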
 
     .. _Pandas offset: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
     """
@@ -31,6 +32,7 @@ def __init__(self,
                  max_ts_length=None,
                  min_ts_length=None,
                  scoring_length=None,
+                 random_state=None,
                  **kwargs):
         self._target_metric = 'raw'
         self.freq = freq
@@ -48,6 +50,8 @@ def __init__(self,
         self.scoring_length = scoring_length or (scoring_length_dict.get(freq)
                                                  if freq in scoring_length_dict.keys() else 30)
 
+        self.random_state = random_state
+
     def _mape(self, actuals, predictions):
         """
         This function computes the mean absolute percentage error for the observed vs the predicted values.
@@ -93,7 +97,8 @@ def _synthetic_anomaly_check(self, observation, prediction, std_error):
 
         # Anomaly detection based on synthetic anomalies generated through a given intensity list
+        rnd = check_random_state(self.random_state)  # seed once; re-seeding per iteration would repeat the same draw
         for prop in self.anomaly_intensity_list:
-            trial_prob = np.random.uniform(0, 1, 1)
+            trial_prob = rnd.uniform(0, 1, 1)
             if trial_prob < 0.4:
                 synthetic_value = observation + (prop * observation)
                 anomaly_flags.append(1)
@@ -227,7 +232,8 @@ def _objective_part(self, data, smoothed_series, args):
                     anomaly_probabilities_list = []
                     local_model = copy.deepcopy(stable_model)
+                    rnd = check_random_state(self.random_state)  # seed once; re-seeding per row would repeat the same draw
                     for i, row in scoring_data.iterrows():
-                        trial_prob = np.random.uniform(0, 1, 1)
+                        trial_prob = rnd.uniform(0, 1, 1)
                         observed_value = row.raw
                         synthetic_actual = observed_value
                         if trial_prob < 0.4:
@@ -263,7 +269,7 @@ def _optimize(self, data, objective_part, algo=tpe.suggest, max_evals=50):
         :return: Optimal hyperparameters
         :rtype: dict
         """
-
+        import numpy as np
         from functools import partial
         from pykalman import KalmanFilter
 
@@ -288,7 +294,7 @@ def _optimize(self, data, objective_part, algo=tpe.suggest, max_evals=50):
 
             try:
                 series = data[self._target_metric].values
-                kf = KalmanFilter()
+                kf = KalmanFilter(random_state=self.random_state)
                 smoothed_series, cov_series = kf.em(series).smooth(series)
             except:
                 raise ValueError('Kalman Smoothing requires more than one data point')
@@ -299,7 +305,7 @@ def _optimize(self, data, objective_part, algo=tpe.suggest, max_evals=50):
             raise ValueError('Only `detection_type=OutlierDetection` is supported in hyperparameter optimization right now')
 
         # Calling the optimization function
-        hyper_param = fmin(objective, space=space, algo=algo, max_evals=max_evals, show_progressbar=True)
+        hyper_param = fmin(objective, space=space, algo=algo, max_evals=max_evals, show_progressbar=True, rstate=np.random.default_rng(self.random_state))
         hyper_param['LuminaireModel'] = hyper_param_list[hyper_param['LuminaireModel']]['model']
         if 'max_ft_freq' in hyper_param:
             hyper_param['max_ft_freq'] = hyper_param['max_ft_freq'] + 2
diff --git a/luminaire/tests/test_hyper.py b/luminaire/tests/test_hyper.py
@@ -2,9 +2,16 @@
 
 class TestHyperparameterOptimization(object):
 
-    def test_run(self, test_data_with_missing):
-
+    def test_run1(self, test_data_with_missing):
+        """Test using the default random_state=None"""
         hyper_obj = HyperparameterOptimization(freq='D', detection_type='OutlierDetection')
         hyper_parameters = hyper_obj.run(test_data_with_missing, max_evals=5)
 
         assert isinstance(hyper_parameters, dict)
+
+    def test_run2(self, test_data_with_missing):
+        """Test defining a random_state"""
+        hyper_obj = HyperparameterOptimization(freq='D', detection_type='OutlierDetection', random_state=42)
+        hyper_parameters = hyper_obj.run(test_data_with_missing, max_evals=5)
+
+        assert isinstance(hyper_parameters, dict)
diff --git a/luminaire/utils/__init__.py b/luminaire/utils/__init__.py
@@ -0,0 +1 @@
+from .random_state_validation import check_random_state
diff --git a/luminaire/utils/random_state_validation.py b/luminaire/utils/random_state_validation.py
@@ -0,0 +1,23 @@
+import numpy as np
+import numbers
+
+def check_random_state(seed):
+    """Turn seed into a np.random.RandomState instance.
+
+    :param seed: None, the ``np.random`` module, an int or a RandomState instance
+    :return: The RandomState to draw from.
+             If seed is None or np.random, return the RandomState singleton used by np.random.
+             If seed is an int, return a new RandomState instance seeded with seed.
+             If seed is already a RandomState instance, return it.
+    :raises ValueError: If seed is none of the above.
+    :rtype: np.random.RandomState
+    """
+    if seed is None or seed is np.random:
+        return np.random.mtrand._rand
+    if isinstance(seed, numbers.Integral):
+        return np.random.RandomState(seed)
+    if isinstance(seed, np.random.RandomState):
+        return seed
+    raise ValueError(
+        "%r cannot be used to seed a numpy.random.RandomState instance" % seed
+    )
diff --git a/requirements.txt b/requirements.txt
@@ -2,9 +2,9 @@ bayescd>=0.4
 changepy>=0.3.1
 hyperopt>=0.1.2
 numpy>=1.17.5, <=1.22.4
-pandas>=0.25.3
+pandas>=0.25.3, <=2.0.3
 pykalman>=0.9.5
 scipy>=1.6.0
-statsmodels>=0.13.0
+statsmodels>=0.13.0, <=0.13.5
 scikit-learn>=0.24.2
 decorator>=5.1.0
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
 
 setup(
     name='luminaire',
-    version='0.4.2',
+    version='0.4.3',
 
     license='Apache License 2.0',
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .random_state_validation import check_random_state`