Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Deprecate array formats and default to dataframe #1372

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<div id="user-content-toc">
<ul align="center" style="list-style: none;">
<summary>
<img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
<img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
<h1>OpenML-Python</h1>
<img src="https://github.com/openml/docs/blob/master/docs/img/python.png" width="50" alt="Python Logo"/>
</summary>
Expand Down
6 changes: 3 additions & 3 deletions examples/20_basic/simple_datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# List datasets
# =============

datasets_df = openml.datasets.list_datasets(output_format="dataframe")
datasets_df = openml.datasets.list_datasets()
print(datasets_df.head(n=10))

############################################################################
Expand Down Expand Up @@ -48,7 +48,7 @@
# attribute_names - the names of the features for the examples (X) and
# target feature (y)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="dataframe", target=dataset.default_target_attribute
target_names=dataset.default_target_attribute
)

############################################################################
Expand All @@ -63,9 +63,9 @@
# Visualize the dataset
# =====================

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("darkgrid")

Expand Down
4 changes: 2 additions & 2 deletions examples/20_basic/simple_flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

# License: BSD 3-Clause

import openml
from sklearn import ensemble, neighbors

import openml

############################################################################
# .. warning::
Expand All @@ -23,7 +23,7 @@
# NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20
dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1)
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
target_names=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)
Expand Down
17 changes: 6 additions & 11 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,24 @@

# License: BSD 3-Clause

import openml
import pandas as pd

import openml
from openml.datasets import edit_dataset, fork_dataset, get_dataset

############################################################################
# Exercise 0
# **********
#
# * List datasets
#
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)
#
# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
datalist = openml.datasets.list_datasets(output_format="dataframe")
# * List datasets and return a dataframe
datalist = openml.datasets.list_datasets()
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]

print(f"First 10 of {len(datalist)} datasets...")
datalist.head(n=10)

# The same can be done with fewer lines of code
openml_df = openml.datasets.list_datasets(output_format="dataframe")
openml_df = openml.datasets.list_datasets()
openml_df.head(n=10)

############################################################################
Expand Down Expand Up @@ -73,7 +68,7 @@
# dataset. In particular, many datasets for supervised problems have a set
# `default_target_attribute` which may help identify the target variable.
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
target_names=dataset.default_target_attribute
)
print(X.head())
print(X.info())
Expand Down
10 changes: 3 additions & 7 deletions examples/30_extended/fetch_evaluations_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@
# Required filters can be applied to retrieve results from runs as required.

# We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
openml.evaluations.list_evaluations(
function="predictive_accuracy", size=10, output_format="dataframe"
)
openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)

# Using other evaluation metrics, 'precision' in this case
evals = openml.evaluations.list_evaluations(
Expand Down Expand Up @@ -62,9 +60,7 @@
# Note that we now filter the evaluations based on another parameter 'task'.

metric = "predictive_accuracy"
evals = openml.evaluations.list_evaluations(
function=metric, tasks=[task_id], output_format="dataframe"
)
evals = openml.evaluations.list_evaluations(function=metric, tasks=[task_id])
# Displaying the first 10 rows
print(evals.head(n=10))
# Sorting the evaluations in decreasing order of the metric chosen
Expand Down Expand Up @@ -94,7 +90,7 @@ def plot_cdf(values, metric="predictive_accuracy"):
plt.minorticks_on()
plt.grid(visible=True, which="minor", linestyle="--")
plt.axvline(max_val, linestyle="--", color="gray")
plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9)
plt.show()


Expand Down
8 changes: 4 additions & 4 deletions examples/30_extended/flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

# License: BSD 3-Clause

import openml
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree

import openml

############################################################################
# We'll use the test server for the rest of this tutorial.
Expand All @@ -27,7 +27,7 @@
# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1)
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
target_names=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
Expand All @@ -38,7 +38,7 @@
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1)
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
target_names=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer(
Expand Down
4 changes: 2 additions & 2 deletions examples/30_extended/plot_svm_hyperparameters_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

# License: BSD 3-Clause

import openml
import numpy as np

import openml

####################################################################################################
# First step - obtaining the data
# ===============================
Expand All @@ -22,7 +23,6 @@
function="predictive_accuracy",
flows=[8353],
tasks=[6],
output_format="dataframe",
# Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
# the dataframe would contain a field ``parameters`` containing an unparsed dictionary.
parameters_in_separate_columns=True,
Expand Down
10 changes: 2 additions & 8 deletions examples/30_extended/study_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,11 @@

import openml


############################################################################
# Listing studies
# ***************
#
# * Use the output_format parameter to select output type
# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
# easier-to-work-with data structure

studies = openml.study.list_studies(output_format="dataframe", status="all")
studies = openml.study.list_studies(status="all")
print(studies.head(n=10))


Expand All @@ -52,7 +47,6 @@
# the evaluations available for the conducted runs:
evaluations = openml.evaluations.list_evaluations(
function="predictive_accuracy",
output_format="dataframe",
study=study.study_id,
)
print(evaluations.head())
Expand Down Expand Up @@ -81,7 +75,7 @@
# To verify
# https://test.openml.org/api/v1/study/1
suite = openml.study.get_suite("OpenML100")
print(all([t_id in suite.tasks for t_id in tasks]))
print(all(t_id in suite.tasks for t_id in tasks))

run_ids = []
for task_id in tasks:
Expand Down
11 changes: 3 additions & 8 deletions examples/30_extended/suites_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,11 @@

import openml


############################################################################
# Listing suites
# **************
#
# * Use the output_format parameter to select output type
# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
# easier-to-work-with data structure

suites = openml.study.list_suites(output_format="dataframe", status="all")
suites = openml.study.list_suites(status="all")
print(suites.head(n=10))

############################################################################
Expand All @@ -51,7 +46,7 @@

############################################################################
# And we can use the task listing functionality to learn more about them:
tasks = openml.tasks.list_tasks(output_format="dataframe")
tasks = openml.tasks.list_tasks()

# Using ``@`` in `pd.DataFrame.query <
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
Expand All @@ -76,7 +71,7 @@

# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
all_tasks = list(openml.tasks.list_tasks()["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))

# The study needs a machine-readable and unique alias. To obtain this,
Expand Down
8 changes: 4 additions & 4 deletions examples/30_extended/task_manual_iteration_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
####################################################################################################
# And then split the data based on this:

X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
Expand All @@ -88,7 +88,7 @@

task_id = 3
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
Expand Down Expand Up @@ -132,7 +132,7 @@

task_id = 1767
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
Expand Down Expand Up @@ -176,7 +176,7 @@

task_id = 1702
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
Expand Down
20 changes: 8 additions & 12 deletions examples/30_extended/tasks_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import openml
from openml.tasks import TaskType
import pandas as pd

############################################################################
#
Expand All @@ -30,14 +29,11 @@
# ^^^^^^^^^^^^^
#
# We will start by simply listing only *supervised classification* tasks.
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
# request a
# **openml.tasks.list_tasks()** returns a
# `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
# instead to have better visualization capabilities and easier access:
# to have good visualization capabilities and easier access:

tasks = openml.tasks.list_tasks(
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
Expand Down Expand Up @@ -71,21 +67,21 @@
#
# Similar to listing tasks by task type, we can list tasks by tags:

tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
tasks = openml.tasks.list_tasks(tag="OpenML100")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

############################################################################
# Furthermore, we can list tasks based on the dataset id:

tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe")
tasks = openml.tasks.list_tasks(data_id=1471)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

############################################################################
# In addition, a size limit and an offset can be applied both separately and simultaneously:

tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
tasks = openml.tasks.list_tasks(size=10, offset=50)
print(tasks)

############################################################################
Expand All @@ -101,7 +97,7 @@
# Finally, it is also possible to list all tasks on OpenML with:

############################################################################
tasks = openml.tasks.list_tasks(output_format="dataframe")
tasks = openml.tasks.list_tasks()
print(len(tasks))

############################################################################
Expand Down Expand Up @@ -195,7 +191,7 @@
# Error code for 'task already exists'
if e.code == 614:
# Lookup task
tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
tasks = openml.tasks.list_tasks(data_id=128)
tasks = tasks.query(
'task_type == "Supervised Classification" '
'and estimation_procedure == "10-fold Crossvalidation" '
Expand Down
7 changes: 2 additions & 5 deletions examples/40_paper/2015_neurips_feurer_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
| In *Advances in Neural Information Processing Systems 28*, 2015
| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
""" # noqa F401
"""

# License: BSD 3-Clause

import pandas as pd

import openml

####################################################################################################
Expand Down Expand Up @@ -60,15 +58,14 @@
tasks = openml.tasks.list_tasks(
task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
status="all",
output_format="dataframe",
)

# Query only those with holdout as the resampling strategy.
tasks = tasks.query('estimation_procedure == "33% Holdout set"')

task_ids = []
for did in dataset_ids:
tasks_ = list(tasks.query("did == {}".format(did)).tid)
tasks_ = list(tasks.query(f"did == {did}").tid)
if len(tasks_) >= 1: # if there are multiple tasks, take the one with lowest ID (oldest).
task_id = min(tasks_)
else:
Expand Down
Loading