7
7
from collections import OrderedDict
8
8
from pathlib import Path
9
9
from pyexpat import ExpatError
10
- from typing import TYPE_CHECKING , Any , overload
10
+ from typing import TYPE_CHECKING , Any
11
11
from typing_extensions import Literal
12
12
13
13
import arff
@@ -75,54 +75,16 @@ def list_qualities() -> list[str]:
75
75
return qualities ["oml:data_qualities_list" ]["oml:quality" ]
76
76
77
77
78
- @overload
79
- def list_datasets (
80
- data_id : list [int ] | None = ...,
81
- offset : int | None = ...,
82
- size : int | None = ...,
83
- status : str | None = ...,
84
- tag : str | None = ...,
85
- * ,
86
- output_format : Literal ["dataframe" ],
87
- ** kwargs : Any ,
88
- ) -> pd .DataFrame : ...
89
-
90
-
91
- @overload
92
- def list_datasets (
93
- data_id : list [int ] | None ,
94
- offset : int | None ,
95
- size : int | None ,
96
- status : str | None ,
97
- tag : str | None ,
98
- output_format : Literal ["dataframe" ],
99
- ** kwargs : Any ,
100
- ) -> pd .DataFrame : ...
101
-
102
-
103
- @overload
104
- def list_datasets (
105
- data_id : list [int ] | None = ...,
106
- offset : int | None = ...,
107
- size : int | None = ...,
108
- status : str | None = ...,
109
- tag : str | None = ...,
110
- output_format : Literal ["dict" ] = "dict" ,
111
- ** kwargs : Any ,
112
- ) -> pd .DataFrame : ...
113
-
114
-
115
78
def list_datasets (
116
79
data_id : list [int ] | None = None ,
117
80
offset : int | None = None ,
118
81
size : int | None = None ,
119
82
status : str | None = None ,
120
83
tag : str | None = None ,
121
- output_format : Literal ["dataframe" , "dict" ] = "dict" ,
122
84
** kwargs : Any ,
123
- ) -> dict | pd .DataFrame :
124
- """
125
- Return a list of all dataset which are on OpenML.
85
+ ) -> pd .DataFrame :
86
+ """Return a dataframe of all datasets which are on OpenML.
87
+
126
88
Supports large amounts of results.
127
89
128
90
Parameters
@@ -139,87 +101,36 @@ def list_datasets(
139
101
default active datasets are returned, but also datasets
140
102
from another status can be requested.
141
103
tag : str, optional
142
- output_format: str, optional (default='dict')
143
- The parameter decides the format of the output.
144
- - If 'dict' the output is a dict of dict
145
- - If 'dataframe' the output is a pandas DataFrame
146
104
kwargs : dict, optional
147
105
Legal filter operators (keys in the dict):
148
106
data_name, data_version, number_instances,
149
107
number_features, number_classes, number_missing_values.
150
108
151
109
Returns
152
110
-------
153
- datasets : dict of dicts, or dataframe
154
- - If output_format='dict'
155
- A mapping from dataset ID to dict.
156
-
157
- Every dataset is represented by a dictionary containing
158
- the following information:
159
- - dataset id
160
- - name
161
- - format
162
- - status
163
- If qualities are calculated for the dataset, some of
164
- these are also returned.
165
-
166
- - If output_format='dataframe'
167
- Each row maps to a dataset
168
- Each column contains the following information:
169
- - dataset id
170
- - name
171
- - format
172
- - status
173
- If qualities are calculated for the dataset, some of
174
- these are also included as columns.
111
+ datasets : dataframe
112
+ Each row maps to a dataset
113
+ Each column contains the following information:
114
+ - dataset id
115
+ - name
116
+ - format
117
+ - status
118
+ If qualities are calculated for the dataset, some of
119
+ these are also included as columns.
175
120
"""
176
- if output_format not in ["dataframe" , "dict" ]:
177
- raise ValueError (
178
- "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ,
179
- )
180
-
181
- # TODO: [0.15]
182
- if output_format == "dict" :
183
- msg = (
184
- "Support for `output_format` of 'dict' will be removed in 0.15 "
185
- "and pandas dataframes will be returned instead. To ensure your code "
186
- "will continue to work, use `output_format`='dataframe'."
187
- )
188
- warnings .warn (msg , category = FutureWarning , stacklevel = 2 )
189
-
190
- return openml .utils ._list_all ( # type: ignore
191
- data_id = data_id ,
192
- list_output_format = output_format , # type: ignore
121
+ batches = openml .utils ._list_all (
193
122
listing_call = _list_datasets ,
123
+ data_id = data_id ,
194
124
offset = offset ,
195
125
size = size ,
196
126
status = status ,
197
127
tag = tag ,
198
128
** kwargs ,
199
129
)
130
+ return pd .concat (batches , ignore_index = True )
200
131
201
132
202
- @overload
203
- def _list_datasets (
204
- data_id : list | None = ...,
205
- output_format : Literal ["dict" ] = "dict" ,
206
- ** kwargs : Any ,
207
- ) -> dict : ...
208
-
209
-
210
- @overload
211
- def _list_datasets (
212
- data_id : list | None = ...,
213
- output_format : Literal ["dataframe" ] = "dataframe" ,
214
- ** kwargs : Any ,
215
- ) -> pd .DataFrame : ...
216
-
217
-
218
- def _list_datasets (
219
- data_id : list | None = None ,
220
- output_format : Literal ["dict" , "dataframe" ] = "dict" ,
221
- ** kwargs : Any ,
222
- ) -> dict | pd .DataFrame :
133
+ def _list_datasets (data_id : list [int ] | None = None , ** kwargs : Any ) -> pd .DataFrame :
223
134
"""
224
135
Perform api call to return a list of all datasets.
225
136
@@ -232,18 +143,14 @@ def _list_datasets(
232
143
233
144
data_id : list, optional
234
145
235
- output_format: str, optional (default='dict')
236
- The parameter decides the format of the output.
237
- - If 'dict' the output is a dict of dict
238
- - If 'dataframe' the output is a pandas DataFrame
239
146
kwargs : dict, optional
240
147
Legal filter operators (keys in the dict):
241
148
tag, status, limit, offset, data_name, data_version, number_instances,
242
149
number_features, number_classes, number_missing_values.
243
150
244
151
Returns
245
152
-------
246
- datasets : dict of dicts, or dataframe
153
+ datasets : dataframe
247
154
"""
248
155
api_call = "data/list"
249
156
@@ -252,21 +159,10 @@ def _list_datasets(
252
159
api_call += f"/{ operator } /{ value } "
253
160
if data_id is not None :
254
161
api_call += "/data_id/{}" .format ("," .join ([str (int (i )) for i in data_id ]))
255
- return __list_datasets (api_call = api_call , output_format = output_format )
256
-
257
-
258
- @overload
259
- def __list_datasets (api_call : str , output_format : Literal ["dict" ] = "dict" ) -> dict : ...
260
-
162
+ return __list_datasets (api_call = api_call )
261
163
262
- @overload
263
- def __list_datasets (api_call : str , output_format : Literal ["dataframe" ]) -> pd .DataFrame : ...
264
164
265
-
266
- def __list_datasets (
267
- api_call : str ,
268
- output_format : Literal ["dict" , "dataframe" ] = "dict" ,
269
- ) -> dict | pd .DataFrame :
165
+ def __list_datasets (api_call : str ) -> pd .DataFrame :
270
166
xml_string = openml ._api_calls ._perform_api_call (api_call , "get" )
271
167
datasets_dict = xmltodict .parse (xml_string , force_list = ("oml:dataset" ,))
272
168
@@ -295,10 +191,7 @@ def __list_datasets(
295
191
dataset [quality ["@name" ]] = float (quality ["#text" ])
296
192
datasets [dataset ["did" ]] = dataset
297
193
298
- if output_format == "dataframe" :
299
- datasets = pd .DataFrame .from_dict (datasets , orient = "index" )
300
-
301
- return datasets
194
+ return pd .DataFrame .from_dict (datasets , orient = "index" )
302
195
303
196
304
197
def _expand_parameter (parameter : str | list [str ] | None ) -> list [str ]:
@@ -1493,8 +1386,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
1493
1386
1494
1387
1495
1388
def _get_online_dataset_format (dataset_id : int ) -> str :
1496
- """Get the dataset format for a given dataset id
1497
- from the OpenML website.
1389
+ """Get the dataset format for a given dataset id from the OpenML website.
1498
1390
1499
1391
Parameters
1500
1392
----------
0 commit comments