add: random tie break for disagreement sampling

cosmic-cortex · cosmic-cortex · commit e0af35f35b63 · 2018-12-05T12:18:59.000+01:00
diff --git a/modAL/disagreement.py b/modAL/disagreement.py
@@ -10,7 +10,7 @@
 from sklearn.base import BaseEstimator
 
 from modAL.utils.data import modALinput
-from modAL.utils.selection import multi_argmax
+from modAL.utils.selection import multi_argmax, shuffled_argmax
 from modAL.models.base import BaseCommittee
 
 
@@ -103,80 +103,116 @@ def KL_max_disagreement(committee: BaseCommittee, X: modALinput, **predict_proba
 
 
 def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
-                          n_instances: int = 1,**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                          n_instances: int = 1, random_tie_break=False,
+                          **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
     Vote entropy sampling strategy.
 
     Args:
         committee: The committee for which the labels are to be queried.
         X: The pool of samples to query from.
         n_instances: Number of samples to be queried.
-        **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement measure function.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
+        **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement
+            measure function.
 
     Returns:
-        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+        The indices of the instances from X chosen to be labelled;
+        the instances from X chosen to be labelled.
     """
     disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs)
-    query_idx = multi_argmax(disagreement, n_instances=n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(disagreement, n_instances=n_instances)
+    else:
+        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
 
     return query_idx, X[query_idx]
 
 
 def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
-                               n_instances: int = 1,**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                               n_instances: int = 1, random_tie_break=False,
+                               **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
     Consensus entropy sampling strategy.
 
     Args:
         committee: The committee for which the labels are to be queried.
         X: The pool of samples to query from.
         n_instances: Number of samples to be queried.
-        **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement measure function.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
+        **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement
+            measure function.
 
     Returns:
-        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+        The indices of the instances from X chosen to be labelled;
+        the instances from X chosen to be labelled.
     """
     disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs)
-    query_idx = multi_argmax(disagreement, n_instances=n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(disagreement, n_instances=n_instances)
+    else:
+        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
 
     return query_idx, X[query_idx]
 
 
 def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
-                              n_instances: int = 1,**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                              n_instances: int = 1, random_tie_break=False,
+                              **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
     Maximum disagreement sampling strategy.
 
     Args:
         committee: The committee for which the labels are to be queried.
         X: The pool of samples to query from.
         n_instances: Number of samples to be queried.
-        **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement measure function.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
+        **disagreement_measure_kwargs: Keyword arguments to be passed for the disagreement
+            measure function.
 
     Returns:
-        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+        The indices of the instances from X chosen to be labelled;
+        the instances from X chosen to be labelled.
     """
     disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)
-    query_idx = multi_argmax(disagreement, n_instances=n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(disagreement, n_instances=n_instances)
+    else:
+        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
 
     return query_idx, X[query_idx]
 
 
 def max_std_sampling(regressor: BaseEstimator, X: modALinput,
-                     n_instances: int = 1, **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
+                     n_instances: int = 1,  random_tie_break=False,
+                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
     Regressor standard deviation sampling strategy.
 
     Args:
         regressor: The regressor for which the labels are to be queried.
         X: The pool of samples to query from.
         n_instances: Number of samples to be queried.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
         **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the CommiteeRegressor.
 
     Returns:
-        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+        The indices of the instances from X chosen to be labelled;
+        the instances from X chosen to be labelled.
     """
     _, std = regressor.predict(X, return_std=True, **predict_kwargs)
     std = std.reshape(X.shape[0], )
-    query_idx = multi_argmax(std, n_instances=n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(std, n_instances=n_instances)
+    else:
+        query_idx = shuffled_argmax(std, n_instances=n_instances)
+    
     return query_idx, X[query_idx]