
Commit 586c59e

Added the Bietti benchmark and improved openml timeout handling.
1 parent 5727a6f commit 586c59e

5 files changed (+127 -25 lines)

coba/environments/core.py  (+76 -6)
@@ -525,11 +525,10 @@ def from_feurer(drop_missing: bool = True) -> 'Environments':
             drop_missing: Exclude interactions with missing context features.

         Remarks:
-            The description of the benchmark is provided at https://arxiv.org/abs/2007.04074.
-            For Task ids 232, 3044, 75105, and 211723 every row has a missing feature. These
-            environments will be empty when drop_missing is True. Task id 189866 has been
-            updated to 361282, a new version of the original dataset that fixes api issues
-            with the old dataset.
+            The benchmark is described at https://arxiv.org/abs/2007.04074. For task ids
+            232, 3044, 75105, and 211723 every row has a missing feature. These environments
+            will be empty when drop_missing is True. Task id 189866 has been removed due to an
+            OpenML issue (see https://github.com/openml/OpenML/issues/1036 for more information).

         Returns:
             An Environments object.
@@ -552,13 +551,84 @@ def from_feurer(drop_missing: bool = True) -> 'Environments':
             167152,167161,167168,167181,167184,167185,167190,167200,167201,167202,167203,
             167204,167205,168785,168791,168792,168793,168794,168795,168796,168797,168798,
             189779,189786,189828,189829,189836,189840,189841,189843,189844,189845,189846,
-            189858,189859,189860,189861,189862,189863,189864,189865,361282,189869,189870,
+            189858,189859,189860,189861,189862,189863,189864,189865,189869,189870,
             189871,189872,189873,189874,189875,189878,189880,189881,189882,189883,189884,
             189887,189890,189893,189894,189899,189900,189902,189905,189906,189908,189909,
             190154,190155,190156,190157,190158,190159,211720,211721,211722,211723,211724]

         return Environments.from_openml(task_id=task_ids,drop_missing=drop_missing)

+    @staticmethod
+    def from_bietti(drop_missing: bool = True) -> 'Environments':
+        """Create Environments from the Bietti benchmark.
+
+        Args:
+            drop_missing: Exclude interactions with missing context features.
+
+        Remarks:
+            The benchmark is defined in https://www.jmlr.org/papers/volume22/18-863/18-863.pdf.
+
+            The benchmark has many datasets repeated with small variations such
+            as a multiclass version and a binary version. Some datasets have many
+            more variations than others (e.g., fri_c0_1000_10 has 79 variations).
+            This benchmark also has several synthetically generated datasets such as
+            RandomRBF_0_0, fri_c0_1000_10, and synthetic_control.
+
+            The following changes were made to the original data ids:
+                1. 21 was replaced with a newer version 40975
+                2. 292 was replaced with a newer version 40981
+                3. 478 was replaced with a newer version 40971
+                4. 822 was removed because it is an old version of 823
+                5. 872 was removed because it is an old version of 853
+                6. 948 was removed because it is an old version of 772
+                7. 1036 was replaced with a newer version 40992
+                8. 1043 was replaced with a newer version 40993
+                9. 1454 was removed because it is a duplicate of 1049
+                10. 1470 was replaced with a newer version 23381
+                11. 1217 was removed because it is a subsample of 1216
+                12. 1113 was removed because it is a subsample of 1110
+
+        Returns:
+            An Environments object.
+        """
+
+        data_ids = [3,6,10,11,12,14,16,18,20,22,23,26,28,30,31,32,36,37,39,40,41,43,44,46,
+                    48,50,53,54,59,60,61,62,150,151,153,154,155,156,157,158,159,160,161,162,
+                    180,181,182,183,184,187,273,275,276,277,278,279,285,293,300,307,310,312,313,
+                    329,333,334,335,336,337,338,339,343,346,351,354,357,375,377,383,384,385,386,
+                    387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,444,446,448,450,
+                    457,458,459,461,462,463,464,465,467,468,469,472,475,476,477,479,480,554,
+                    679,682,683,685,694,713,714,715,716,717,718,719,720,721,722,723,724,725,
+                    726,727,728,729,730,731,732,733,734,735,736,737,740,741,742,743,744,745,746,
+                    747,748,749,750,751,752,753,754,755,756,758,759,761,762,763,764,765,766,767,
+                    768,769,770,771,772,773,774,775,776,777,778,779,780,782,783,784,785,787,788,
+                    789,790,791,792,793,794,795,796,797,799,800,801,803,804,805,806,807,808,811,
+                    812,813,814,815,816,817,818,819,820,821,823,824,825,826,827,828,829,830,
+                    832,833,834,835,836,837,838,841,843,845,846,847,848,849,850,851,853,855,857,
+                    859,860,862,863,864,865,866,867,868,869,870,871,873,874,875,876,877,878,
+                    879,880,881,882,884,885,886,888,891,892,893,894,895,896,900,901,902,903,904,
+                    905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,
+                    924,925,926,927,928,929,931,932,933,934,935,936,937,938,941,942,943,945,946,
+                    947,949,950,951,952,953,954,955,956,958,959,962,964,965,969,970,971,973,
+                    974,976,977,978,979,980,983,987,988,991,994,995,996,997,1004,1005,1006,1009,
+                    1011,1012,1013,1014,1015,1016,1019,1020,1021,1022,1025,1026,1038,1040,1041,
+                    1044,1045,1046,1048,1049,1050,1054,1055,1056,1059,1060,1061,1062,1063,1064,
+                    1065,1066,1067,1068,1069,1071,1073,1075,1077,1078,1079,1080,1081,1082,1083,
+                    1084,1085,1086,1087,1088,1100,1104,1106,1107,1110,1113,1115,1116,1117,1120,
+                    1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1135,1136,
+                    1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,
+                    1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,
+                    1169,1216,1217,1218,1233,1235,1236,1237,1238,1241,1242,1412,1413,1441,1442,
+                    1443,1444,1449,1451,1453,1455,1457,1459,1460,1464,1467,1471,1472,1473,1475,
+                    1481,1482,1483,1486,1487,1488,1489,1496,1498,1590,40975,40981,40971,23381]
+
+        env40992 = Environments.from_openml(data_id=40992,target='label',drop_missing=drop_missing)
+        env40993 = Environments.from_openml(data_id=40993,target='label',drop_missing=drop_missing)
+
+        return Environments.from_openml(data_id=data_ids,drop_missing=drop_missing) + env40992 + env40993
+
+
+
     def __init__(self, *environments: Union[Environment, Sequence[Environment]]):
         """Instantiate an Environments class.
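Usage note (not part of the diff): a minimal sketch of how the new from_bietti constructor can be called alongside the existing from_feurer helper, assuming coba is installed and OpenML is reachable. The variable names below are illustrative only.

import coba as cb

# Build the Feurer benchmark; interactions with missing context features are dropped.
feurer_envs = cb.Environments.from_feurer(drop_missing=True)

# Build the Bietti benchmark added by this commit (data ids 40992 and 40993 are
# loaded with target='label' internally, as shown in the diff above).
bietti_envs = cb.Environments.from_bietti(drop_missing=True)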

coba/environments/openml.py  (+7 -9)
@@ -89,7 +89,7 @@ def read(self) -> Iterable[Union[Dense,Sparse]]:
             raise CobaException(f"We were unable to find an appropriate target column for the given openml source.")

         if data_descr.get('status') == 'deactivated':
-            raise CobaException(f"Openml {self._data_id} has been deactivated. This is often due to flags on the data.")
+            raise CobaException(f"Openml {self._data_id} has been deactivated (see https://docs.openml.org/#dataset-status).")

         is_ignore = lambda feat_descr:(
             feat_descr['is_ignore' ] == 'true' or
@@ -139,25 +139,24 @@ def _get_data(self, url:str, key:str, checksum:str=None) -> Iterable[str]:
             self._clear_cache()
             raise

-    def _http_request(self, url: str, tries: int = 0) -> Iterable[str]:
+    def _http_request(self, url: str, tries: int = 1, timeout: int = 5) -> Iterable[str]:
         api_key = CobaContext.api_keys.get('openml')
         semaphore = CobaContext.store.get("openml_semaphore")

-        # In an attempt to be considerate we stagger/limit our hits of the REST API.
-        # Openml doesn't publish any rate-limiting guidelines, so this is just a guess.
         # if semaphore is not None it indictes that we are in a CobaMultiprocessor.
+        # When this is the case we stagger/limit our hits of the REST API to be considerate.
+        # Openml doesn't publish any rate-limiting guidelines, so our staggering is a guess.
         if semaphore: time.sleep(2*random())

         try:
             KB = 1024
             MB = 1024*KB
             if api_key: url = f"{url}?api_key={api_key}"
-            yield from HttpSource(url, timeout=20, chunk_size=10*MB).read()
+            yield from HttpSource(url, timeout=timeout, chunk_size=10*MB).read()

         except TimeoutError:
-            if tries >= 3: raise
-            yield from self._http_request(url, tries+1)
-            return
+            if tries == 3: raise
+            yield from self._http_request(url, timeout=5**(tries+1), tries=tries+1)

         except request.HTTPError as e:
             status, content = e.code, e.fp.read()
@@ -179,7 +178,6 @@ def _http_request(self, url: str, tries: int = 0) -> Iterable[str]:

             raise CobaException(f"An error was returned by openml: {content}")

-
     def _get_data_descr(self, data_id:int) -> Dict[str,Any]:
         descr_txt = " ".join(self._get_data(f'https://openml.org/api/v1/json/data/{data_id}', self._cache_keys['data']))
         descr_obj = json.loads(descr_txt)["data_set_description"]
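For reference (not part of the diff): the new _http_request grows its HTTP timeout geometrically across retries, 5 seconds on the first attempt, then 5**2 = 25 and 5**3 = 125 seconds, and gives up after the third timeout. Below is a standalone sketch of that retry pattern; the fetch callable is hypothetical and stands in for HttpSource.

def fetch_with_growing_timeout(url, fetch, tries=1, timeout=5):
    # 'fetch' is a hypothetical callable taking (url, timeout) and returning the response.
    try:
        return fetch(url, timeout=timeout)
    except TimeoutError:
        if tries == 3: raise  # give up after the third timeout
        # retry with a 5x longer timeout: 5s -> 25s -> 125s
        return fetch_with_growing_timeout(url, fetch, tries=tries+1, timeout=5**(tries+1))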

coba/tests/test_environments_core.py  (+37 -1)
@@ -482,13 +482,49 @@ def test_from_feurer(self):
             167097,167099,167100,167101,167103,167104,167105,167106,167149,167152,167161,167168,167181,
             167184,167185,167190,167200,167201,167202,167203,167204,167205,168785,168791,168792,168793,
             168794,168795,168796,168797,168798,189779,189786,189828,189829,189836,189840,189841,189843,
-            189844,189845,189846,189858,189859,189860,189861,189862,189863,189864,189865,361282,189869,
+            189844,189845,189846,189858,189859,189860,189861,189862,189863,189864,189865,189869,
             189870,189871,189872,189873,189874,189875,189878,189880,189881,189882,189883,189884,189887,
             189890,189893,189894,189899,189900,189902,189905,189906,189908,189909,190154,190155,190156,
             190157,190158,190159,211720,211721,211722,211723,211724}

         self.assertEqual(actual_tasks,expected_tasks)

+    def test_from_bietti(self):
+        actual_tasks = set([e.params['openml_data'] for e in Environments.from_bietti()])
+
+        expected_tasks = {3,6,10,11,12,14,16,18,20,22,23,26,28,30,31,32,36,37,39,40,41,43,44,46,
+                          48,50,53,54,59,60,61,62,150,151,153,154,155,156,157,158,159,160,161,162,
+                          180,181,182,183,184,187,273,275,276,277,278,279,285,293,300,307,310,312,313,
+                          329,333,334,335,336,337,338,339,343,346,351,354,357,375,377,383,384,385,386,
+                          387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,444,446,448,450,
+                          457,458,459,461,462,463,464,465,467,468,469,472,475,476,477,479,480,554,
+                          679,682,683,685,694,713,714,715,716,717,718,719,720,721,722,723,724,725,
+                          726,727,728,729,730,731,732,733,734,735,736,737,740,741,742,743,744,745,746,
+                          747,748,749,750,751,752,753,754,755,756,758,759,761,762,763,764,765,766,767,
+                          768,769,770,771,772,773,774,775,776,777,778,779,780,782,783,784,785,787,788,
+                          789,790,791,792,793,794,795,796,797,799,800,801,803,804,805,806,807,808,811,
+                          812,813,814,815,816,817,818,819,820,821,823,824,825,826,827,828,829,830,
+                          832,833,834,835,836,837,838,841,843,845,846,847,848,849,850,851,853,855,857,
+                          859,860,862,863,864,865,866,867,868,869,870,871,873,874,875,876,877,878,
+                          879,880,881,882,884,885,886,888,891,892,893,894,895,896,900,901,902,903,904,
+                          905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,
+                          924,925,926,927,928,929,931,932,933,934,935,936,937,938,941,942,943,945,946,
+                          947,949,950,951,952,953,954,955,956,958,959,962,964,965,969,970,971,973,
+                          974,976,977,978,979,980,983,987,988,991,994,995,996,997,1004,1005,1006,1009,
+                          1011,1012,1013,1014,1015,1016,1019,1020,1021,1022,1025,1026,1038,1040,1041,
+                          1044,1045,1046,1048,1049,1050,1054,1055,1056,1059,1060,1061,1062,1063,1064,
+                          1065,1066,1067,1068,1069,1071,1073,1075,1077,1078,1079,1080,1081,1082,1083,
+                          1084,1085,1086,1087,1088,1100,1104,1106,1107,1110,1113,1115,1116,1117,1120,
+                          1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1135,1136,
+                          1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,
+                          1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,
+                          1169,1216,1217,1218,1233,1235,1236,1237,1238,1241,1242,1412,1413,1441,1442,
+                          1443,1444,1449,1451,1453,1455,1457,1459,1460,1464,1467,1471,1472,1473,1475,
+                          1481,1482,1483,1486,1487,1488,1489,1496,1498,1590,40975,40981,40971,23381,
+                          40992,40993}
+
+        self.assertEqual(actual_tasks,expected_tasks)
+
     def test_from_lambda(self):
         context = lambda index,rng : [ round(r,2) for r in rng.randoms(5) ]
         actions = lambda index,context,rng : [rng.randoms(5) for _ in range(3)]

coba/tests/test_environments_openml.py  (+2 -4)
@@ -923,7 +923,7 @@ def thread_1():
         self.assertIn('openml_042693_arff', CobaContext.cacher)

     @unittest.mock.patch('coba.environments.openml.HttpSource')
-    def test_three_timeouts(self,mock):
+    def test_two_timeouts(self,mock):

         task = {
             "task":{
@@ -978,7 +978,6 @@ def test_three_timeouts(self,mock):
         """

         responses = [
-            TimeoutError(),
             TimeoutError(),
             TimeoutError(),
             json.dumps(task).splitlines(),
@@ -1013,7 +1012,7 @@ def test_three_timeouts(self,mock):
         self.assertIn('openml_042693_arff', CobaContext.cacher)

     @unittest.mock.patch('coba.environments.openml.HttpSource')
-    def test_four_timeouts(self,mock):
+    def test_three_timeouts(self,mock):

         task = {
             "task":{
@@ -1071,7 +1070,6 @@ def test_four_timeouts(self,mock):
             TimeoutError(),
             TimeoutError(),
             TimeoutError(),
-            TimeoutError(),
             json.dumps(task).splitlines(),
             json.dumps(data).splitlines(),
             json.dumps(feat).splitlines(),

examples/scripts/Getting Started.py  (+5 -5)
@@ -1,18 +1,18 @@
 """
 This is an example script that creates and executes an Experiment.
-This script requires that the matplotlib and vowpalwabbit packages be installed.
+This script depends on the matplotlib and vowpalwabbit packages.
 """

 import coba as cb

-#First, we define the learners that we want to test
+#First, we define the learners that we wish to evaluate
 learners = [ cb.VowpalEpsilonLearner(), cb.RandomLearner() ]

-#Next we create an environment we'd like to evaluate against
+#Next, we create an environment we'd like to evaluate against
 environments = cb.Environments.from_linear_synthetic(1000, n_action_features=0).shuffle([1,2,3])

-#We then create and run our experiment from our environments and learners
+#We then create and run an experiment using our environments and learners
 result = cb.Experiment(environments,learners).run()

-#After evaluating can create a quick summary plot to get a sense of how the learners performed
+#Finally, we can plot the results of our experiment
 result.plot_learners(y='reward',err='se',xlim=(10,None))
