Skip to content

Commit 7a6f845

Browse files
committed
refactor: Remove "array" format
1 parent 26ae499 commit 7a6f845

File tree

19 files changed

+368
-1079
lines changed

19 files changed

+368
-1079
lines changed

examples/20_basic/simple_datasets_tutorial.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
# List datasets
2020
# =============
2121

22-
datasets_df = openml.datasets.list_datasets(output_format="dataframe")
22+
datasets_df = openml.datasets.list_datasets()
2323
print(datasets_df.head(n=10))
2424

2525
############################################################################
@@ -48,7 +48,7 @@
4848
# attribute_names - the names of the features for the examples (X) and
4949
# target feature (y)
5050
X, y, categorical_indicator, attribute_names = dataset.get_data(
51-
dataset_format="dataframe", target=dataset.default_target_attribute
51+
target=dataset.default_target_attribute
5252
)
5353

5454
############################################################################
@@ -63,9 +63,9 @@
6363
# Visualize the dataset
6464
# =====================
6565

66+
import matplotlib.pyplot as plt
6667
import pandas as pd
6768
import seaborn as sns
68-
import matplotlib.pyplot as plt
6969

7070
sns.set_style("darkgrid")
7171

examples/30_extended/task_manual_iteration_tutorial.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
####################################################################################################
6969
# And then split the data based on this:
7070

71-
X, y = task.get_X_and_y(dataset_format="dataframe")
71+
X, y = task.get_X_and_y()
7272
X_train = X.iloc[train_indices]
7373
y_train = y.iloc[train_indices]
7474
X_test = X.iloc[test_indices]
@@ -88,7 +88,7 @@
8888

8989
task_id = 3
9090
task = openml.tasks.get_task(task_id)
91-
X, y = task.get_X_and_y(dataset_format="dataframe")
91+
X, y = task.get_X_and_y()
9292
n_repeats, n_folds, n_samples = task.get_split_dimensions()
9393
print(
9494
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -132,7 +132,7 @@
132132

133133
task_id = 1767
134134
task = openml.tasks.get_task(task_id)
135-
X, y = task.get_X_and_y(dataset_format="dataframe")
135+
X, y = task.get_X_and_y()
136136
n_repeats, n_folds, n_samples = task.get_split_dimensions()
137137
print(
138138
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -176,7 +176,7 @@
176176

177177
task_id = 1702
178178
task = openml.tasks.get_task(task_id)
179-
X, y = task.get_X_and_y(dataset_format="dataframe")
179+
X, y = task.get_X_and_y()
180180
n_repeats, n_folds, n_samples = task.get_split_dimensions()
181181
print(
182182
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(

openml/datasets/functions.py

+22-130
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from collections import OrderedDict
88
from pathlib import Path
99
from pyexpat import ExpatError
10-
from typing import TYPE_CHECKING, Any, overload
10+
from typing import TYPE_CHECKING, Any
1111
from typing_extensions import Literal
1212

1313
import arff
@@ -75,54 +75,16 @@ def list_qualities() -> list[str]:
7575
return qualities["oml:data_qualities_list"]["oml:quality"]
7676

7777

78-
@overload
79-
def list_datasets(
80-
data_id: list[int] | None = ...,
81-
offset: int | None = ...,
82-
size: int | None = ...,
83-
status: str | None = ...,
84-
tag: str | None = ...,
85-
*,
86-
output_format: Literal["dataframe"],
87-
**kwargs: Any,
88-
) -> pd.DataFrame: ...
89-
90-
91-
@overload
92-
def list_datasets(
93-
data_id: list[int] | None,
94-
offset: int | None,
95-
size: int | None,
96-
status: str | None,
97-
tag: str | None,
98-
output_format: Literal["dataframe"],
99-
**kwargs: Any,
100-
) -> pd.DataFrame: ...
101-
102-
103-
@overload
104-
def list_datasets(
105-
data_id: list[int] | None = ...,
106-
offset: int | None = ...,
107-
size: int | None = ...,
108-
status: str | None = ...,
109-
tag: str | None = ...,
110-
output_format: Literal["dict"] = "dict",
111-
**kwargs: Any,
112-
) -> pd.DataFrame: ...
113-
114-
11578
def list_datasets(
11679
data_id: list[int] | None = None,
11780
offset: int | None = None,
11881
size: int | None = None,
11982
status: str | None = None,
12083
tag: str | None = None,
121-
output_format: Literal["dataframe", "dict"] = "dict",
12284
**kwargs: Any,
123-
) -> dict | pd.DataFrame:
124-
"""
125-
Return a list of all dataset which are on OpenML.
85+
) -> pd.DataFrame:
86+
"""Return a dataframe of all dataset which are on OpenML.
87+
12688
Supports large amount of results.
12789
12890
Parameters
@@ -139,87 +101,36 @@ def list_datasets(
139101
default active datasets are returned, but also datasets
140102
from another status can be requested.
141103
tag : str, optional
142-
output_format: str, optional (default='dict')
143-
The parameter decides the format of the output.
144-
- If 'dict' the output is a dict of dict
145-
- If 'dataframe' the output is a pandas DataFrame
146104
kwargs : dict, optional
147105
Legal filter operators (keys in the dict):
148106
data_name, data_version, number_instances,
149107
number_features, number_classes, number_missing_values.
150108
151109
Returns
152110
-------
153-
datasets : dict of dicts, or dataframe
154-
- If output_format='dict'
155-
A mapping from dataset ID to dict.
156-
157-
Every dataset is represented by a dictionary containing
158-
the following information:
159-
- dataset id
160-
- name
161-
- format
162-
- status
163-
If qualities are calculated for the dataset, some of
164-
these are also returned.
165-
166-
- If output_format='dataframe'
167-
Each row maps to a dataset
168-
Each column contains the following information:
169-
- dataset id
170-
- name
171-
- format
172-
- status
173-
If qualities are calculated for the dataset, some of
174-
these are also included as columns.
111+
datasets: dataframe
112+
Each row maps to a dataset
113+
Each column contains the following information:
114+
- dataset id
115+
- name
116+
- format
117+
- status
118+
If qualities are calculated for the dataset, some of
119+
these are also included as columns.
175120
"""
176-
if output_format not in ["dataframe", "dict"]:
177-
raise ValueError(
178-
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.",
179-
)
180-
181-
# TODO: [0.15]
182-
if output_format == "dict":
183-
msg = (
184-
"Support for `output_format` of 'dict' will be removed in 0.15 "
185-
"and pandas dataframes will be returned instead. To ensure your code "
186-
"will continue to work, use `output_format`='dataframe'."
187-
)
188-
warnings.warn(msg, category=FutureWarning, stacklevel=2)
189-
190-
return openml.utils._list_all( # type: ignore
191-
data_id=data_id,
192-
list_output_format=output_format, # type: ignore
121+
batches = openml.utils._list_all(
193122
listing_call=_list_datasets,
123+
data_id=data_id,
194124
offset=offset,
195125
size=size,
196126
status=status,
197127
tag=tag,
198128
**kwargs,
199129
)
130+
return pd.concat(batches, ignore_index=True)
200131

201132

202-
@overload
203-
def _list_datasets(
204-
data_id: list | None = ...,
205-
output_format: Literal["dict"] = "dict",
206-
**kwargs: Any,
207-
) -> dict: ...
208-
209-
210-
@overload
211-
def _list_datasets(
212-
data_id: list | None = ...,
213-
output_format: Literal["dataframe"] = "dataframe",
214-
**kwargs: Any,
215-
) -> pd.DataFrame: ...
216-
217-
218-
def _list_datasets(
219-
data_id: list | None = None,
220-
output_format: Literal["dict", "dataframe"] = "dict",
221-
**kwargs: Any,
222-
) -> dict | pd.DataFrame:
133+
def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFrame:
223134
"""
224135
Perform an API call to return a list of all datasets.
225136
@@ -232,18 +143,14 @@ def _list_datasets(
232143
233144
data_id : list, optional
234145
235-
output_format: str, optional (default='dict')
236-
The parameter decides the format of the output.
237-
- If 'dict' the output is a dict of dict
238-
- If 'dataframe' the output is a pandas DataFrame
239146
kwargs : dict, optional
240147
Legal filter operators (keys in the dict):
241148
tag, status, limit, offset, data_name, data_version, number_instances,
242149
number_features, number_classes, number_missing_values.
243150
244151
Returns
245152
-------
246-
datasets : dict of dicts, or dataframe
153+
datasets : dataframe
247154
"""
248155
api_call = "data/list"
249156

@@ -252,21 +159,10 @@ def _list_datasets(
252159
api_call += f"/{operator}/{value}"
253160
if data_id is not None:
254161
api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id]))
255-
return __list_datasets(api_call=api_call, output_format=output_format)
256-
257-
258-
@overload
259-
def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ...
260-
162+
return __list_datasets(api_call=api_call)
261163

262-
@overload
263-
def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ...
264164

265-
266-
def __list_datasets(
267-
api_call: str,
268-
output_format: Literal["dict", "dataframe"] = "dict",
269-
) -> dict | pd.DataFrame:
165+
def __list_datasets(api_call: str) -> pd.DataFrame:
270166
xml_string = openml._api_calls._perform_api_call(api_call, "get")
271167
datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
272168

@@ -295,10 +191,7 @@ def __list_datasets(
295191
dataset[quality["@name"]] = float(quality["#text"])
296192
datasets[dataset["did"]] = dataset
297193

298-
if output_format == "dataframe":
299-
datasets = pd.DataFrame.from_dict(datasets, orient="index")
300-
301-
return datasets
194+
return pd.DataFrame.from_dict(datasets, orient="index")
302195

303196

304197
def _expand_parameter(parameter: str | list[str] | None) -> list[str]:
@@ -1493,8 +1386,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
14931386

14941387

14951388
def _get_online_dataset_format(dataset_id: int) -> str:
1496-
"""Get the dataset format for a given dataset id
1497-
from the OpenML website.
1389+
"""Get the dataset format for a given dataset id from the OpenML website.
14981390
14991391
Parameters
15001392
----------

openml/evaluations/evaluation.py

+20
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import openml.tasks
99

1010

11+
# TODO(eddiebergman): A lot of this class is automatically
12+
# handled by a dataclass
1113
class OpenMLEvaluation:
1214
"""
1315
Contains all meta-information about a run / evaluation combination,
@@ -78,6 +80,24 @@ def __init__( # noqa: PLR0913
7880
self.values = values
7981
self.array_data = array_data
8082

83+
def _to_dict(self) -> dict:
84+
return {
85+
"run_id": self.run_id,
86+
"task_id": self.task_id,
87+
"setup_id": self.setup_id,
88+
"flow_id": self.flow_id,
89+
"flow_name": self.flow_name,
90+
"data_id": self.data_id,
91+
"data_name": self.data_name,
92+
"function": self.function,
93+
"upload_time": self.upload_time,
94+
"uploader": self.uploader,
95+
"uploader_name": self.uploader_name,
96+
"value": self.value,
97+
"values": self.values,
98+
"array_data": self.array_data,
99+
}
100+
81101
def __repr__(self) -> str:
82102
header = "OpenML Evaluation"
83103
header = "{}\n{}\n".format(header, "=" * len(header))

0 commit comments

Comments
 (0)