2015_neurips_feurer_example.py
"""
Feurer et al. (2015)
====================
A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al.
Auto-sklearn website: https://automl.github.io/auto-sklearn/
Publication
~~~~~~~~~~~
| Efficient and Robust Automated Machine Learning
| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
| In *Advances in Neural Information Processing Systems 28*, 2015
| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
"""
# License: BSD 3-Clause
import openml
####################################################################################################
# List of dataset IDs given in the supplementary material of Feurer et al.:
# https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
# fmt: off
dataset_ids = [
3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
1134, 1138, 1139, 1142, 1146, 1161, 1166,
]
# fmt: on
####################################################################################################
# The dataset IDs could be used directly to load the dataset and split the data into a training set
# and a test set. However, to be reproducible, we will first obtain the respective tasks from
# OpenML, which define both the target feature and the train/test split.
#
# .. note::
#     Working directly on datasets and only providing dataset IDs in a paper is discouraged,
#     as it does not allow reproducibility (the train/test split is unclear). Please base your
#     paper on the respective tasks instead of the datasets and publish the task IDs. This
#     example is only given to showcase the use of OpenML-Python for a published paper and as
#     a warning on how not to do it. Please check the
#     `OpenML documentation of tasks <https://docs.openml.org/#tasks>`_ if you want to learn
#     more about them.
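####################################################################################################
# As a minimal, optional sketch (an illustration added here, not part of the original setup):
# loading a dataset by its ID only yields the data and a default target attribute; the train/test
# split is left entirely to the user, which is exactly why we rely on tasks below.
dataset = openml.datasets.get_dataset(dataset_ids[0])
print(dataset.name, dataset.default_target_attribute)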
####################################################################################################
# This lists both active and inactive tasks (because of ``status='all'``). Unfortunately, this is
# necessary: some of the datasets were found to contain issues after the publication and were
# deactivated, which also deactivated the tasks defined on them. More information on active or
# inactive datasets can be found in the `online docs <https://docs.openml.org/#dataset-status>`_.
tasks = openml.tasks.list_tasks(
task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
status="all",
)
# Query only those with holdout as the resampling strategy.
tasks = tasks.query('estimation_procedure == "33% Holdout set"')
task_ids = []
for did in dataset_ids:
    tasks_ = list(tasks.query(f"did == {did}").tid)
    if len(tasks_) >= 1:  # If there are multiple tasks, take the one with the lowest ID (the oldest).
        task_id = min(tasks_)
    else:
        raise ValueError(did)

    # Optional - Check that the task has the same target attribute as the
    # dataset default target attribute
    # (disabled for this example as it needs to run fast to be rendered online)
    # task = openml.tasks.get_task(task_id)
    # dataset = task.get_dataset()
    # if task.target_name != dataset.default_target_attribute:
    #     raise ValueError(
    #         (task.target_name, dataset.default_target_attribute)
    #     )

    task_ids.append(task_id)
assert len(task_ids) == 140
task_ids.sort()
# These are the tasks to work with:
print(task_ids)
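####################################################################################################
# A brief sketch of how one of these tasks could then be used (added for illustration, assuming a
# single task download is acceptable here): the task bundles the dataset with its fixed holdout
# split, which is what makes the evaluation setup reproducible.
task = openml.tasks.get_task(task_ids[0])
train_indices, test_indices = task.get_train_test_split_indices()
print(task.dataset_id, len(train_indices), len(test_indices))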