openml · eddiebergman · Oct 16, 2024 · Oct 16, 2024 · Oct 17, 2024 · Nov 4, 2024
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 <div id="user-content-toc">
   <ul align="center" style="list-style: none;">
     <summary>
-      <img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/> 
+      <img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
       <h1>OpenML-Python</h1>
       <img src="https://github.com/openml/docs/blob/master/docs/img/python.png" width="50" alt="Python Logo"/>
     </summary>

diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py
@@ -19,7 +19,7 @@
 # List datasets
 # =============
 
-datasets_df = openml.datasets.list_datasets(output_format="dataframe")
+datasets_df = openml.datasets.list_datasets()
 print(datasets_df.head(n=10))
 
 ############################################################################
@@ -48,7 +48,7 @@
 # attribute_names - the names of the features for the examples (X) and
 # target feature (y)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="dataframe", target=dataset.default_target_attribute
+    target_names=dataset.default_target_attribute
 )
 
 ############################################################################
@@ -63,9 +63,9 @@
 # Visualize the dataset
 # =====================
 
+import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
-import matplotlib.pyplot as plt
 
 sns.set_style("darkgrid")
 

diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -7,9 +7,9 @@
 
 # License: BSD 3-Clause
 
-import openml
 from sklearn import ensemble, neighbors
 
+import openml
 
 ############################################################################
 # .. warning::
@@ -23,7 +23,7 @@
 # NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20
 dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute
+    target_names=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=3)
 clf.fit(X, y)

diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -8,29 +8,24 @@
 
 # License: BSD 3-Clauses
 
-import openml
 import pandas as pd
+
+import openml
 from openml.datasets import edit_dataset, fork_dataset, get_dataset
 
 ############################################################################
 # Exercise 0
 # **********
 #
-# * List datasets
-#
-#   * Use the output_format parameter to select output type
-#   * Default gives 'dict' (other option: 'dataframe', see below)
-#
-# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
-# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
-datalist = openml.datasets.list_datasets(output_format="dataframe")
+# * List datasets and return a dataframe
+datalist = openml.datasets.list_datasets()
 datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
 
 print(f"First 10 of {len(datalist)} datasets...")
 datalist.head(n=10)
 
 # The same can be done with lesser lines of code
-openml_df = openml.datasets.list_datasets(output_format="dataframe")
+openml_df = openml.datasets.list_datasets()
 openml_df.head(n=10)
 
 ############################################################################
@@ -73,7 +68,7 @@
 # dataset. In particular, many datasets for supervised problems have a set
 # `default_target_attribute` which may help identify the target variable.
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute
+    target_names=dataset.default_target_attribute
 )
 print(X.head())
 print(X.info())

diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py
@@ -32,9 +32,7 @@
 # Required filters can be applied to retrieve results from runs as required.
 
 # We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
-openml.evaluations.list_evaluations(
-    function="predictive_accuracy", size=10, output_format="dataframe"
-)
+openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)
 
 # Using other evaluation metrics, 'precision' in this case
 evals = openml.evaluations.list_evaluations(
@@ -62,9 +60,7 @@
 # Note that we now filter the evaluations based on another parameter 'task'.
 
 metric = "predictive_accuracy"
-evals = openml.evaluations.list_evaluations(
-    function=metric, tasks=[task_id], output_format="dataframe"
-)
+evals = openml.evaluations.list_evaluations(function=metric, tasks=[task_id])
 # Displaying the first 10 rows
 print(evals.head(n=10))
 # Sorting the evaluations in decreasing order of the metric chosen
@@ -94,7 +90,7 @@ def plot_cdf(values, metric="predictive_accuracy"):
     plt.minorticks_on()
     plt.grid(visible=True, which="minor", linestyle="--")
     plt.axvline(max_val, linestyle="--", color="gray")
-    plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
+    plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9)
     plt.show()
 
 

diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
@@ -7,9 +7,9 @@
 
 # License: BSD 3-Clause
 
-import openml
-from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
+from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
 
+import openml
 
 ############################################################################
 # We'll use the test server for the rest of this tutorial.
@@ -27,7 +27,7 @@
 # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
 dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute
+    target_names=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=1)
 clf.fit(X, y)
@@ -38,7 +38,7 @@
 # * e.g. categorical features -> do feature encoding
 dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute
+    target_names=dataset.default_target_attribute
 )
 print(f"Categorical features: {categorical_indicator}")
 transformer = compose.ColumnTransformer(

diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
@@ -6,9 +6,10 @@
 
 # License: BSD 3-Clause
 
-import openml
 import numpy as np
 
+import openml
+
 ####################################################################################################
 # First step - obtaining the data
 # ===============================
@@ -22,7 +23,6 @@
     function="predictive_accuracy",
     flows=[8353],
     tasks=[6],
-    output_format="dataframe",
     # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
     # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary.
     parameters_in_separate_columns=True,

diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py
@@ -17,16 +17,11 @@
 
 import openml
 
-
 ############################################################################
 # Listing studies
 # ***************
-#
-# * Use the output_format parameter to select output type
-# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
-#   easier-to-work-with data structure
 
-studies = openml.study.list_studies(output_format="dataframe", status="all")
+studies = openml.study.list_studies(status="all")
 print(studies.head(n=10))
 
 
@@ -52,7 +47,6 @@
 # the evaluations available for the conducted runs:
 evaluations = openml.evaluations.list_evaluations(
     function="predictive_accuracy",
-    output_format="dataframe",
     study=study.study_id,
 )
 print(evaluations.head())
@@ -81,7 +75,7 @@
 # To verify
 # https://test.openml.org/api/v1/study/1
 suite = openml.study.get_suite("OpenML100")
-print(all([t_id in suite.tasks for t_id in tasks]))
+print(all(t_id in suite.tasks for t_id in tasks))
 
 run_ids = []
 for task_id in tasks:

diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py
@@ -19,16 +19,11 @@
 
 import openml
 
-
 ############################################################################
 # Listing suites
 # **************
-#
-# * Use the output_format parameter to select output type
-# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
-#   easier-to-work-with data structure
 
-suites = openml.study.list_suites(output_format="dataframe", status="all")
+suites = openml.study.list_suites(status="all")
 print(suites.head(n=10))
 
 ############################################################################
@@ -51,7 +46,7 @@
 
 ############################################################################
 # And we can use the task listing functionality to learn more about them:
-tasks = openml.tasks.list_tasks(output_format="dataframe")
+tasks = openml.tasks.list_tasks()
 
 # Using ``@`` in `pd.DataFrame.query <
 # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
@@ -76,7 +71,7 @@
 
 # We'll take a random subset of at least ten tasks of all available tasks on
 # the test server:
-all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
+all_tasks = list(openml.tasks.list_tasks()["tid"])
 task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
 
 # The study needs a machine-readable and unique alias. To obtain this,

diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py
@@ -68,7 +68,7 @@
 ####################################################################################################
 # And then split the data based on this:
 
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 X_train = X.iloc[train_indices]
 y_train = y.iloc[train_indices]
 X_test = X.iloc[test_indices]
@@ -88,7 +88,7 @@
 
 task_id = 3
 task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -132,7 +132,7 @@
 
 task_id = 1767
 task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -176,7 +176,7 @@
 
 task_id = 1702
 task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(

diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py
@@ -9,7 +9,6 @@
 
 import openml
 from openml.tasks import TaskType
-import pandas as pd
 
 ############################################################################
 #
@@ -30,14 +29,11 @@
 # ^^^^^^^^^^^^^
 #
 # We will start by simply listing only *supervised classification* tasks.
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
-# request a
+# **openml.tasks.list_tasks()** getting a
 # `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
-# instead to have better visualization capabilities and easier access:
+# to have good visualization capabilities and easier access:
 
-tasks = openml.tasks.list_tasks(
-    task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
-)
+tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
 print(tasks.columns)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
@@ -71,21 +67,21 @@
 #
 # Similar to listing tasks by task type, we can list tasks by tags:
 
-tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
+tasks = openml.tasks.list_tasks(tag="OpenML100")
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
 ############################################################################
 # Furthermore, we can list tasks based on the dataset id:
 
-tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe")
+tasks = openml.tasks.list_tasks(data_id=1471)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
 ############################################################################
 # In addition, a size limit and an offset can be applied both separately and simultaneously:
 
-tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
+tasks = openml.tasks.list_tasks(size=10, offset=50)
 print(tasks)
 
 ############################################################################
@@ -101,7 +97,7 @@
 # Finally, it is also possible to list all tasks on OpenML with:
 
 ############################################################################
-tasks = openml.tasks.list_tasks(output_format="dataframe")
+tasks = openml.tasks.list_tasks()
 print(len(tasks))
 
 ############################################################################
@@ -195,7 +191,7 @@
     # Error code for 'task already exists'
     if e.code == 614:
         # Lookup task
-        tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
+        tasks = openml.tasks.list_tasks(data_id=128)
         tasks = tasks.query(
             'task_type == "Supervised Classification" '
             'and estimation_procedure == "10-fold Crossvalidation" '

diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py
@@ -13,12 +13,10 @@
 | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
 | In *Advances in Neural Information Processing Systems 28*, 2015
 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""  # noqa F401
+"""
 
 # License: BSD 3-Clause
 
-import pandas as pd
-
 import openml
 
 ####################################################################################################
@@ -60,15 +58,14 @@
 tasks = openml.tasks.list_tasks(
     task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
     status="all",
-    output_format="dataframe",
 )
 
 # Query only those with holdout as the resampling startegy.
 tasks = tasks.query('estimation_procedure == "33% Holdout set"')
 
 task_ids = []
 for did in dataset_ids:
-    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    tasks_ = list(tasks.query(f"did == {did}").tid)
     if len(tasks_) >= 1:  # if there are multiple task, take the one with lowest ID (oldest).
         task_id = min(tasks_)
     else: