Skip to content

Commit 6296f37

Browse files
committed
refactor: Explicitly name parameter to listing functions
1 parent 7a6f845 commit 6296f37

File tree

12 files changed

+204
-133
lines changed

12 files changed

+204
-133
lines changed

examples/40_paper/2018_ida_strang_example.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
# License: BSD 3-Clause
1818

1919
import matplotlib.pyplot as plt
20+
2021
import openml
21-
import pandas as pd
2222

2323
##############################################################################
2424
# A basic step for each data-mining or machine learning task is to determine
@@ -47,13 +47,16 @@
4747

4848
# Downloads all evaluation records related to this study
4949
evaluations = openml.evaluations.list_evaluations(
50-
measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
50+
measure,
51+
size=None,
52+
flows=flow_ids,
53+
study=study_id,
5154
)
5255
# gives us a table with columns data_id, flow1_value, flow2_value
5356
evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
5457
# downloads all data qualities (for scatter plot)
5558
data_qualities = openml.datasets.list_datasets(
56-
data_id=list(evaluations.index.values), output_format="dataframe"
59+
data_id=list(evaluations.index.values),
5760
)
5861
# removes irrelevant data qualities
5962
data_qualities = data_qualities[meta_features]
@@ -86,10 +89,9 @@
8689
def determine_class(val_lin, val_nonlin):
8790
if val_lin < val_nonlin:
8891
return class_values[0]
89-
elif val_nonlin < val_lin:
92+
if val_nonlin < val_lin:
9093
return class_values[1]
91-
else:
92-
return class_values[2]
94+
return class_values[2]
9395

9496

9597
evaluations["class"] = evaluations.apply(

openml/datasets/functions.py

+39-17
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging
66
import warnings
77
from collections import OrderedDict
8+
from functools import partial
89
from pathlib import Path
910
from pyexpat import ExpatError
1011
from typing import TYPE_CHECKING, Any
@@ -81,7 +82,12 @@ def list_datasets(
8182
size: int | None = None,
8283
status: str | None = None,
8384
tag: str | None = None,
84-
**kwargs: Any,
85+
data_name: str | None = None,
86+
data_version: int | None = None,
87+
number_instances: int | None = None,
88+
number_features: int | None = None,
89+
number_classes: int | None = None,
90+
number_missing_values: int | None = None,
8591
) -> pd.DataFrame:
8692
"""Return a dataframe of all dataset which are on OpenML.
8793
@@ -101,10 +107,12 @@ def list_datasets(
101107
default active datasets are returned, but also datasets
102108
from another status can be requested.
103109
tag : str, optional
104-
kwargs : dict, optional
105-
Legal filter operators (keys in the dict):
106-
data_name, data_version, number_instances,
107-
number_features, number_classes, number_missing_values.
110+
data_name : str, optional
111+
data_version : int, optional
112+
number_instances : int, optional
113+
number_features : int, optional
114+
number_classes : int, optional
115+
number_missing_values : int, optional
108116
109117
Returns
110118
-------
@@ -118,19 +126,29 @@ def list_datasets(
118126
If qualities are calculated for the dataset, some of
119127
these are also included as columns.
120128
"""
121-
batches = openml.utils._list_all(
122-
listing_call=_list_datasets,
129+
listing_call = partial(
130+
_list_datasets,
123131
data_id=data_id,
124-
offset=offset,
125-
size=size,
126132
status=status,
127133
tag=tag,
128-
**kwargs,
134+
data_name=data_name,
135+
data_version=data_version,
136+
number_instances=number_instances,
137+
number_features=number_features,
138+
number_classes=number_classes,
139+
number_missing_values=number_missing_values,
129140
)
130-
return pd.concat(batches, ignore_index=True)
141+
batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
142+
return pd.concat(batches)
131143

132144

133-
def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFrame:
145+
def _list_datasets(
146+
limit: int,
147+
offset: int,
148+
*,
149+
data_id: list[int] | None = None,
150+
**kwargs: Any,
151+
) -> pd.DataFrame:
134152
"""
135153
Perform api call to return a list of all datasets.
136154
@@ -141,6 +159,10 @@ def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFr
141159
display_errors is also separated from the kwargs since it has a
142160
default value.
143161
162+
limit : int
163+
The maximum number of datasets to show.
164+
offset : int
165+
The number of datasets to skip, starting from the first.
144166
data_id : list, optional
145167
146168
kwargs : dict, optional
@@ -152,7 +174,7 @@ def _list_datasets(data_id: list[int] | None = None, **kwargs: Any) -> pd.DataFr
152174
-------
153175
datasets : dataframe
154176
"""
155-
api_call = "data/list"
177+
api_call = f"data/list/limit/{limit}/offset/{offset}"
156178

157179
if kwargs is not None:
158180
for operator, value in kwargs.items():
@@ -242,12 +264,13 @@ def check_datasets_active(
242264
dict
243265
A dictionary with items {did: bool}
244266
"""
245-
datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
246-
missing = set(dataset_ids) - set(datasets.get("did", []))
267+
datasets = list_datasets(status="all", data_id=dataset_ids)
268+
missing = set(dataset_ids) - set(datasets.index)
247269
if raise_error_if_not_exist and missing:
248270
missing_str = ", ".join(str(did) for did in missing)
249271
raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.")
250-
return dict(datasets["status"] == "active")
272+
mask = datasets["status"] == "active"
273+
return dict(mask)
251274

252275

253276
def _name_to_id(
@@ -285,7 +308,6 @@ def _name_to_id(
285308
data_name=dataset_name,
286309
status=status,
287310
data_version=version,
288-
output_format="dataframe",
289311
)
290312
if error_if_multiple and len(candidates) > 1:
291313
msg = f"Multiple active datasets exist with name '{dataset_name}'."

openml/evaluations/functions.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from __future__ import annotations
44

55
import json
6+
from functools import partial
67
from itertools import chain
78
from typing import Any
89
from typing_extensions import Literal, overload
@@ -56,7 +57,7 @@ def list_evaluations(
5657
def list_evaluations(
5758
function: str,
5859
offset: int | None = None,
59-
size: int | None = 10000,
60+
size: int | None = None,
6061
tasks: list[str | int] | None = None,
6162
setups: list[str | int] | None = None,
6263
flows: list[str | int] | None = None,
@@ -118,11 +119,9 @@ def list_evaluations(
118119
if per_fold is not None:
119120
per_fold_str = str(per_fold).lower()
120121

121-
eval_collection: list[list[OpenMLEvaluation]] = openml.utils._list_all(
122-
listing_call=_list_evaluations,
122+
listing_call = partial(
123+
_list_evaluations,
123124
function=function,
124-
offset=offset,
125-
size=size,
126125
tasks=tasks,
127126
setups=setups,
128127
flows=flows,
@@ -133,8 +132,9 @@ def list_evaluations(
133132
sort_order=sort_order,
134133
per_fold=per_fold_str,
135134
)
136-
flattened = list(chain.from_iterable(eval_collection))
135+
eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size)
137136

137+
flattened = list(chain.from_iterable(eval_collection))
138138
if output_format == "dataframe":
139139
records = [item._to_dict() for item in flattened]
140140
return pd.DataFrame.from_records(records, index="run_id")
@@ -143,6 +143,9 @@ def list_evaluations(
143143

144144

145145
def _list_evaluations(
146+
limit: int,
147+
offset: int,
148+
*,
146149
function: str,
147150
tasks: list | None = None,
148151
setups: list | None = None,
@@ -161,6 +164,10 @@ def _list_evaluations(
161164
The arguments that are lists are separated from the single value
162165
ones which are put into the kwargs.
163166
167+
limit : int
168+
the number of evaluations to return
169+
offset : int
170+
the number of evaluations to skip, starting from the first
164171
function : str
165172
the evaluation function. e.g., predictive_accuracy
166173
@@ -178,7 +185,7 @@ def _list_evaluations(
178185
study : int, optional
179186
180187
kwargs: dict, optional
181-
Legal filter operators: tag, limit, offset.
188+
Legal filter operators: tag, per_fold
182189
183190
sort_order : str, optional
184191
order of sorting evaluations, ascending ("asc") or descending ("desc")
@@ -187,7 +194,7 @@ def _list_evaluations(
187194
-------
188195
list of OpenMLEvaluation objects
189196
"""
190-
api_call = f"evaluation/list/function/{function}"
197+
api_call = f"evaluation/list/function/{function}/limit/{limit}/offset/{offset}"
191198
if kwargs is not None:
192199
for operator, value in kwargs.items():
193200
api_call += f"/{operator}/{value}"
@@ -202,7 +209,7 @@ def _list_evaluations(
202209
if uploaders is not None:
203210
api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders]))
204211
if study is not None:
205-
api_call += "/study/%d" % study
212+
api_call += f"/study/{study}"
206213
if sort_order is not None:
207214
api_call += f"/sort_order/{sort_order}"
208215

openml/flows/functions.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import re
66
from collections import OrderedDict
7+
from functools import partial
78
from typing import Any, Dict
89

910
import dateutil.parser
@@ -135,7 +136,7 @@ def list_flows(
135136
offset: int | None = None,
136137
size: int | None = None,
137138
tag: str | None = None,
138-
**kwargs: Any,
139+
uploader: str | None = None,
139140
) -> pd.DataFrame:
140141
"""
141142
Return a list of all flows which are on OpenML.
@@ -164,30 +165,29 @@ def list_flows(
164165
- external version
165166
- uploader
166167
"""
167-
batches = openml.utils._list_all(
168-
listing_call=_list_flows,
169-
offset=offset,
170-
size=size,
171-
tag=tag,
172-
**kwargs,
173-
)
168+
listing_call = partial(_list_flows, tag=tag, uploader=uploader)
169+
batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
174170
return pd.concat(batches, ignore_index=True)
175171

176172

177-
def _list_flows(**kwargs: Any) -> pd.DataFrame:
173+
def _list_flows(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame:
178174
"""
179175
Perform the api call that returns a list of all flows.
180176
181177
Parameters
182178
----------
179+
limit : int
180+
the maximum number of flows to return
181+
offset : int
182+
the number of flows to skip, starting from the first
183183
kwargs: dict, optional
184-
Legal filter operators: uploader, tag, limit, offset.
184+
Legal filter operators: uploader, tag
185185
186186
Returns
187187
-------
188188
flows : dataframe
189189
"""
190-
api_call = "flow/list"
190+
api_call = f"flow/list/limit/{limit}/offset/{offset}"
191191

192192
if kwargs is not None:
193193
for operator, value in kwargs.items():

0 commit comments

Comments
 (0)