diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 3b13c9908..7bc1bbaeb 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -22,48 +22,39 @@ jobs:
build-and-deploy:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
- - name: Setup Python
- uses: actions/setup-python@v5
- with:
- python-version: 3.8
- - name: Install dependencies
- run: |
+ - uses: actions/checkout@v4
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: 3.8
+ - name: Install dependencies
+ run: |
pip install -e .[docs,examples]
- - name: Make docs
- run: |
- cd doc
- make html
- - name: Check links
- run: |
- cd doc
- make linkcheck
- - name: Pull latest gh-pages
- if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
- run: |
- cd ..
- git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch gh-pages
- - name: Copy new doc into gh-pages
- if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
- run: |
- branch_name=${GITHUB_REF##*/}
- cd ../gh-pages
- rm -rf $branch_name
- cp -r ../openml-python/doc/build/html $branch_name
- - name: Push to gh-pages
- if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
- run: |
- last_commit=$(git log --pretty=format:"%an: %s")
- cd ../gh-pages
- branch_name=${GITHUB_REF##*/}
- git add $branch_name/
- git config --global user.name 'Github Actions'
- git config --global user.email 'not@mail.com'
- git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
- # Only commit and push if there are changes
- if ! git diff --cached --quiet; then
- git commit -m "$last_commit"
- git push
- else
- echo "Branch is up to date with origin/gh-pages, no need to update docs. Skipping."
- fi
+ - name: Make docs
+ run: |
+ mkdocs build
+ - name: Deploy to GitHub Pages
+ env:
+ CI: false
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ PAGES_BRANCH: gh-pages
+ if: (contains(github.ref, 'develop') || contains(github.ref, 'main')) && github.event_name == 'push'
+ run: |
+ # mkdocs gh-deploy --force
+ git config user.name doc-bot
+ git config user.email doc-bot@openml.com
+ current_version=$(git tag | sort --version-sort | tail -n 1)
+          # Reset the titles of previously retitled versions back to the plain version number
+ retitled_versions=$(mike list -j | jq ".[] | select(.title != .version) | .version" | tr -d '"')
+ for version in $retitled_versions; do
+ mike retitle "${version}" "${version}"
+ done
+
+ echo "Deploying docs for ${current_version}"
+ mike deploy \
+ --push \
+ --title "${current_version} (latest)" \
+ --update-aliases \
+ "${current_version}" \
+ "latest"\
+ -b $PAGES_BRANCH origin/$PAGES_BRANCH
diff --git a/.gitignore b/.gitignore
index 5687e41f1..241cf9630 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
doc/generated
examples/.ipynb_checkpoints
venv
+.uv-lock
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100644
index 000000000..c18de3ccc
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,24 @@
+# Contributing
+
+Contributions to the OpenML package are highly appreciated in all forms.
+In particular, here are a few ways to contribute to openml-python:
+
+- A direct contribution to the package, by means of improving the
+ code, documentation or examples. To get started, see [this
+ file](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md)
+ with details on how to set up your environment to develop for
+ openml-python.
+- A contribution to an openml-python extension. An extension package
+ allows OpenML to interface with a machine learning package (such
+ as scikit-learn or keras). These extensions are hosted in separate
+ repositories and may have their own guidelines. For more
+ information, see also [extensions](extensions.md).
+- Bug reports. If something doesn't work for you or is cumbersome,
+ please open a new issue to let us know about the problem. See
+ [this
+ section](https://github.com/openml/openml-python/blob/main/CONTRIBUTING.md).
+- [Cite OpenML](https://www.openml.org/cite) if you use it in a
+ scientific publication.
+- Visit one of our [hackathons](https://www.openml.org/meet).
+- Contribute to another OpenML project, such as [the main OpenML
+ project](https://github.com/openml/OpenML/blob/master/CONTRIBUTING.md).
diff --git a/docs/extensions.md b/docs/extensions.md
new file mode 100644
index 000000000..f2aa230f5
--- /dev/null
+++ b/docs/extensions.md
@@ -0,0 +1,179 @@
+# Extensions
+
+OpenML-Python provides an extension interface to connect machine
+learning libraries other than scikit-learn to OpenML. Please check the
+API documentation of the extension interface and use the scikit-learn
+extension in
+[openml.extensions.sklearn.SklearnExtension][] as a starting point.
+
+## List of extensions
+
+Here is a list of currently maintained OpenML extensions:
+
+- [openml.extensions.sklearn.SklearnExtension][]
+- [openml-keras](https://github.com/openml/openml-keras)
+- [openml-pytorch](https://github.com/openml/openml-pytorch)
+- [openml-tensorflow (for tensorflow
+ 2+)](https://github.com/openml/openml-tensorflow)
+
+## Connecting new machine learning libraries
+
+### Content of the Library
+
+To leverage support from the community and to tap into the potential of
+OpenML, interfacing with popular machine learning libraries is
+essential. The OpenML-Python package is capable of downloading meta-data
+and results (data, flows, runs), regardless of the library that was used
+to upload them. However, in order to simplify the process of uploading
+flows and runs from a specific library, an additional interface can be
+built. The OpenML-Python team does not have the capacity to develop and
+maintain such interfaces on its own. For this reason, we have built an
+extension interface that allows others to contribute back. Building a
+suitable extension therefore requires an understanding of the
+current OpenML-Python support.
+
+The ["Simple Flows and Runs"](../examples/20_basic/simple_flows_and_runs_tutorial)
+tutorial shows how scikit-learn currently works with
+OpenML-Python as an extension. The *sklearn* extension packaged with the
+[openml-python](https://github.com/openml/openml-python) repository can
+be used as a template/benchmark to build the new extension.
+
+#### API
+
+- The extension scripts must import the `openml` package
+  and be able to interface with any function from the OpenML-Python
+  API.
+- The extension has to be defined as a Python class and must inherit
+  from [openml.extensions.Extension][].
+- This class needs to override all the functions of the
+  `Extension` base class as required.
+- The overridden functions should have adequate and appropriate
+  docstrings. The sklearn extension,
+  [openml.extensions.sklearn.SklearnExtension][],
+  is a good example to follow.
+
+#### Interfacing with OpenML-Python
+
+Once the new extension class has been defined,
+[openml.extensions.register_extension][] must be called to allow
+OpenML-Python to interface with the new extension.
+
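+For illustration only, a new extension could start from a skeleton
+roughly like the following sketch. `MyLibraryExtension` and
+`MyLibraryModel` are placeholder names for a hypothetical library;
+only two of the required methods are shown.
+
+```python
+from openml.extensions import Extension, register_extension
+
+
+class MyLibraryModel:
+    """Placeholder base class of the hypothetical library."""
+
+
+class MyLibraryExtension(Extension):
+    """Minimal sketch of an extension for a hypothetical library 'mylibrary'."""
+
+    @classmethod
+    def can_handle_flow(cls, flow) -> bool:
+        # Check the flow dependencies to decide whether this flow was
+        # produced by the library this extension supports.
+        return "mylibrary" in (flow.dependencies or "")
+
+    @classmethod
+    def can_handle_model(cls, model) -> bool:
+        # Typically implemented by checking against the library's base class.
+        return isinstance(model, MyLibraryModel)
+
+    # The remaining methods of openml.extensions.Extension
+    # (flow_to_model, model_to_flow, _run_model_on_fold, ...) must be
+    # implemented as well; see the list below.
+
+
+register_extension(MyLibraryExtension)
+```
+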
+The following methods should be implemented. Although the documentation
+of the `Extension` interface should always be leading, here
+we list some additional information and best practices. The sklearn
+extension, [openml.extensions.sklearn.SklearnExtension][], is
+a good example to follow. Note that most methods are relatively simple
+and can be implemented in a few lines of code.
+
+- General setup (required)
+    - `can_handle_flow`: Takes as
+      argument an OpenML flow, and checks whether this can be handled
+      by the current extension. The OpenML database consists of many
+      flows, from various workbenches (e.g., scikit-learn, Weka, mlr).
+      This method is called before a model is deserialized.
+      Typically, the flow-dependency field is used to check whether
+      the specific library is present, and no unknown libraries are
+      present there.
+    - `can_handle_model`: Similar to
+      `can_handle_flow`, except that in
+      this case a Python object is given. As such, in many cases, this
+      method can be implemented by checking whether the object adheres to a
+      certain base class.
+- Serialization and De-serialization (required)
+    - `flow_to_model`: deserializes the
+      OpenML Flow into a model (if the library can indeed handle the
+      flow). This method has an important interplay with
+      `model_to_flow`. Running these
+      two methods in succession should result in exactly the same
+      model (or flow). This property can be used for unit testing, as
+      illustrated in the sketch after this list
+      (e.g., build a model with hyperparameters, make predictions on a
+      task, serialize it to a flow, deserialize it back, make it
+      predict on the same task, and check whether the predictions are
+      exactly the same). The example in the scikit-learn interface
+      might seem daunting, but note that some complicated design
+      choices were made there that allow for all sorts of interesting
+      research questions. It is probably good practice to start easy.
+    - `model_to_flow`: The inverse of
+      `flow_to_model`. Serializes a
+      model into an OpenML Flow. The flow should preserve the class,
+      the library version, and the tunable hyperparameters.
+    - `get_version_information`: Return
+      a tuple with the version information of the important libraries.
+    - `create_setup_string`: No longer
+      used, and will be deprecated soon.
+- Performing runs (required)
+    - `is_estimator`: Gets as input a
+      class, and checks whether it has the status of estimator in the
+      library (typically, whether it has a train method and a predict
+      method).
+    - `seed_model`: Sets a random seed
+      on the model.
+    - `_run_model_on_fold`: One of the
+      main requirements for a library to generate run objects for the
+      OpenML server. It obtains a train split (with labels) and a test
+      split (without labels), and the goal is to train a model on the
+      train split and return the predictions on the test split. On top
+      of the actual predictions, the class probabilities should also
+      be determined. For classifiers that do not return class
+      probabilities, this can just be the one-hot encoding of the predicted label.
+      The predictions will be evaluated on the OpenML server. Also,
+      additional information can be returned, for example,
+      user-defined measures (such as runtime information, as this
+      cannot be inferred on the server). Additionally, information about
+      a hyperparameter optimization trace can be provided.
+    - `obtain_parameter_values`:
+      Obtains the hyperparameters of a given model and their current
+      values. Please note that in the case of a hyperparameter
+      optimization procedure (e.g., random search), you should only
+      return the hyperparameters of this procedure (e.g., the
+      hyperparameter grid, budget, etc.) and that the chosen model will
+      be inferred from the optimization trace.
+    - `check_if_model_fitted`: Check
+      whether the train method of the model has been called (and as
+      such, whether the predict method can be used).
+- Hyperparameter optimization (optional)
+    - `instantiate_model_from_hpo_class`: If a given run has recorded the hyperparameter
+      optimization trace, then this method can be used to
+      reinstantiate the model with the hyperparameters of a given
+      hyperparameter optimization iteration. It has some similarities
+      with `flow_to_model` (as this
+      method also sets the hyperparameters of a model). Note that
+      although this method is required, it is not necessary to
+      implement any logic if hyperparameter optimization is not
+      implemented. Simply raise a `NotImplementedError`
+      in that case.
+
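+As a loose illustration of the round-trip property described above
+(`MyLibraryExtension` and `MyLibraryModel` are again placeholders for
+your own extension and model class), a unit test could look roughly
+like this:
+
+```python
+def test_flow_model_round_trip():
+    extension = MyLibraryExtension()
+    model = MyLibraryModel()  # a model of your library, ideally with hyperparameters set
+
+    flow = extension.model_to_flow(model)
+    model_from_flow = extension.flow_to_model(flow)
+    flow_from_model = extension.model_to_flow(model_from_flow)
+
+    # Serializing and deserializing should not change the flow (or the model).
+    assert flow.name == flow_from_model.name
+    assert flow.parameters == flow_from_model.parameters
+```
+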
+### Hosting the library
+
+Each extension created should be a stand-alone repository, compatible
+with the [OpenML-Python
+repository](https://github.com/openml/openml-python). The extension
+repository should work off-the-shelf with *OpenML-Python* installed.
+
+Create a [public Github
+repo](https://docs.github.com/en/github/getting-started-with-github/create-a-repo)
+with the following directory structure:
+
+    [repo name]
+    |-- [extension name]
+    |   |-- __init__.py
+    |   |-- extension.py
+    |   |-- config.py (optionally)
+
+### Recommended
+
+- Test cases to keep the extension up to date with
+  `openml-python` upstream changes.
+- Documentation of the extension API, especially if any new
+  functionality is added to OpenML-Python's extension design.
+- Examples to show how the new extension interfaces and works with
+ OpenML-Python.
+- Create a PR to add the new extension to the OpenML-Python API
+ documentation.
+
+Happy contributing!
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..cda5bcb4b
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,89 @@
+# OpenML
+
+**Collaborative Machine Learning in Python**
+
+Welcome to the documentation of the OpenML Python API, a connector to
+the collaborative machine learning platform
+[OpenML.org](https://www.openml.org). The OpenML Python package allows
+you to use datasets and tasks from OpenML together with scikit-learn and
+share the results online.
+
+## Example
+
+```python
+import openml
+from sklearn import impute, tree, pipeline
+
+# Define a scikit-learn classifier or pipeline
+clf = pipeline.Pipeline(
+ steps=[
+ ('imputer', impute.SimpleImputer()),
+ ('estimator', tree.DecisionTreeClassifier())
+ ]
+)
+# Download the OpenML task for the pendigits dataset with 10-fold
+# cross-validation.
+task = openml.tasks.get_task(32)
+# Run the scikit-learn model on the task.
+run = openml.runs.run_model_on_task(clf, task)
+# Publish the experiment on OpenML (optional, requires an API key.
+# You can get your own API key by signing up to OpenML.org)
+run.publish()
+print(f'View the run online: {run.openml_url}')
+```
+
+Find more examples in the sidebar on the left.
+
+## How to get OpenML for python
+
+You can install the OpenML package via `pip` (we recommend using a virtual environment):
+
+```bash
+python -m pip install openml
+```
+
+For more advanced installation information, please see the
+["Introduction"](../examples/20_basic/introduction_tutorial.py) example.
+
+
+## Further information
+
+- [OpenML documentation](https://docs.openml.org/)
+- [OpenML client APIs](https://docs.openml.org/APIs/)
+- [OpenML developer guide](https://docs.openml.org/Contributing/)
+- [Contact information](https://www.openml.org/contact)
+- [Citation request](https://www.openml.org/cite)
+- [OpenML blog](https://medium.com/open-machine-learning)
+- [OpenML twitter account](https://twitter.com/open_ml)
+
+## Contributing
+
+Contribution to the OpenML package is highly appreciated. Please see the
+["Contributing"][contributing] page for more information.
+
+## Citing OpenML-Python
+
+If you use OpenML-Python in a scientific publication, we would
+appreciate a reference to our JMLR-MLOSS paper
+["OpenML-Python: an extensible Python API for OpenML"](https://www.jmlr.org/papers/v22/19-920.html):
+
+=== "Bibtex"
+
+ ```bibtex
+ @article{JMLR:v22:19-920,
+ author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
+ title = {OpenML-Python: an extensible Python API for OpenML},
+ journal = {Journal of Machine Learning Research},
+ year = {2021},
+ volume = {22},
+ number = {100},
+ pages = {1--5},
+ url = {http://jmlr.org/papers/v22/19-920.html}
+ }
+ ```
+
+=== "MLA"
+
+ Feurer, Matthias, et al.
+ "OpenML-Python: an extensible Python API for OpenML."
+ _Journal of Machine Learning Research_ 22.100 (2021):1−5.
diff --git a/docs/progress.md b/docs/progress.md
new file mode 100644
index 000000000..c2923576b
--- /dev/null
+++ b/docs/progress.md
@@ -0,0 +1,489 @@
+# Changelog {#progress}
+
+## next
+
+- MAINT #1340: Add Numpy 2.0 support. Update tests to work with
+  scikit-learn <= 1.5.
+- ADD #1342: Add HTTP header to requests to indicate they are from
+  openml-python.
+
+## 0.14.2
+
+- MAINT #1280: Use the server-provided `parquet_url` instead of
+  `minio_url` to determine the location of the parquet file.
+- ADD #716: add documentation for remaining attributes of classes
+  and functions.
+- ADD #1261: more annotations for type hints.
+- MAINT #1294: update tests to new tag specification.
+- FIX #1314: Update fetching a bucket from MinIO.
+- FIX #1315: Make class label retrieval more lenient.
+- ADD #1316: add support for feature description ontologies.
+- MAINT #1310/#1307: switch to ruff and resolve all mypy errors.
+
+## 0.14.1
+
+- FIX: Fallback on downloading ARFF when failing to download parquet
+  from MinIO due to a ServerError.
+
+## 0.14.0
+
+**IMPORTANT:** This release paves the way towards a breaking update of
+OpenML-Python. From version 0.15, functions that had the option to
+return a pandas DataFrame will return a pandas DataFrame by default.
+This version (0.14) emits a warning if you still use the old access
+functionality. More concretely:
+
+- In 0.15 we will drop the ability to return dictionaries in listing
+ calls and only provide pandas DataFrames. To disable warnings in
+ 0.14 you have to request a pandas DataFrame (using
+ `output_format="dataframe"`).
+- In 0.15 we will drop the ability to return datasets as numpy arrays
+ and only provide pandas DataFrames. To disable warnings in 0.14 you
+ have to request a pandas DataFrame (using
+ `dataset_format="dataframe"`).
+
+Furthermore, from version 0.15, OpenML-Python will no longer download
+datasets and dataset metadata by default. This version (0.14) emits a
+warning if you don't explicitly specify the desired behavior.
+
+Please see the pull requests #1258 and #1260 for further information.
+
+- ADD #1081: New flag that allows disabling downloading dataset
+ features.
+- ADD #1132: New flag that forces a redownload of cached data.
+- FIX #1244: Fixes a rare bug where task listing could fail when the
+ server returned invalid data.
+- DOC #1229: Fixes a comment string for the main example.
+- DOC #1241: Fixes a comment in an example.
+- MAINT #1124: Improve naming of helper functions that govern the
+ cache directories.
+- MAINT #1223, #1250: Update tools used in pre-commit to the latest
+ versions (`black==23.30`, `mypy==1.3.0`, `flake8==6.0.0`).
+- MAINT #1253: Update the citation request to the JMLR paper.
+- MAINT #1246: Add a warning that warns the user that checking for
+ duplicate runs on the server cannot be done without an API key.
+
+## 0.13.1
+
+- ADD #1081 #1132: Add additional options for (not) downloading
+ datasets `openml.datasets.get_dataset` and cache management.
+- ADD #1028: Add functions to delete runs, flows, datasets, and tasks
+ (e.g., `openml.datasets.delete_dataset`).
+- ADD #1144: Add locally computed results to the `OpenMLRun` object's
+ representation if the run was created locally and not downloaded
+ from the server.
+- ADD #1180: Improve the error message when the checksum of a
+ downloaded dataset does not match the checksum provided by the API.
+- ADD #1201: Make `OpenMLTraceIteration` a dataclass.
+- DOC #1069: Add argument documentation for the `OpenMLRun` class.
+- DOC #1241 #1229 #1231: Minor documentation fixes and resolve
+ documentation examples not working.
+- FIX #1197 #559 #1131: Fix the order of ground truth and predictions
+ in the `OpenMLRun` object and in `format_prediction`.
+- FIX #1198: Support numpy 1.24 and higher.
+- FIX #1216: Allow unknown task types on the server. This is only
+ relevant when new task types are added to the test server.
+- FIX #1223: Fix mypy errors for implicit optional typing.
+- MAINT #1155: Add dependabot github action to automatically update
+ other github actions.
+- MAINT #1199: Obtain pre-commit's flake8 from github.com instead of
+ gitlab.com.
+- MAINT #1215: Support latest numpy version.
+- MAINT #1218: Test Python3.6 on Ubuntu 20.04 instead of the latest
+ Ubuntu (which is 22.04).
+- MAINT #1221 #1212 #1206 #1211: Update github actions to the latest
+ versions.
+
+## 0.13.0
+
+- FIX #1030: `pre-commit` hooks should no longer issue a
+  warning.
+- FIX #1058, #1100: Avoid `NoneType` error when printing task
+  without `class_labels` attribute.
+- FIX #1110: Make arguments to `create_study` and `create_suite`
+  that are defined as optional by the OpenML XSD actually optional.
+- FIX #1147: `openml.flow.flow_exists` no longer requires an API
+  key.
+- FIX #1184: Automatically resolve proxies when downloading from
+  minio. Turn this off by setting environment variable
+  `no_proxy="*"`.
+- MAINT #1088: Do CI for Windows on Github Actions instead of
+  Appveyor.
+- MAINT #1104: Fix outdated docstring for `list_task`.
+- MAINT #1146: Update the pre-commit dependencies.
+- ADD #1103: Add a `predictions` property to OpenMLRun for easy
+  accessibility of prediction data.
+- ADD #1188: EXPERIMENTAL. Allow downloading all files from a minio
+  bucket with `download_all_files=True` for `get_dataset`.
+
+## 0.12.2
+
+- ADD #1065: Add a `retry_policy` configuration option that determines
+ the frequency and number of times to attempt to retry server
+ requests.
+- ADD #1075: A docker image is now automatically built on a push to
+ develop. It can be used to build docs or run tests in an isolated
+ environment.
+- ADD: You can now avoid downloading 'qualities' meta-data when
+ downloading a task with the `download_qualities` parameter of
+ `openml.tasks.get_task[s]` functions.
+- DOC: Fixes a few broken links in the documentation.
+- DOC #1061: Improve examples to always show a warning when they
+ switch to the test server.
+- DOC #1067: Improve documentation on the scikit-learn extension
+ interface.
+- DOC #1068: Create dedicated extensions page.
+- FIX #1075: Correctly convert `y` to a pandas series when
+  downloading sparse data.
+- MAINT: Rename the `master` branch to the `main` branch.
+- MAINT/DOC: Automatically check for broken external links when
+ building the documentation.
+- MAINT/DOC: Fail documentation building on warnings. This will make
+ the documentation building fail if a reference cannot be found (i.e.
+ an internal link is broken).
+
+## 0.12.1
+
+- ADD #895/#1038: Measure runtimes of scikit-learn runs also for
+ models which are parallelized via the joblib.
+- DOC #1050: Refer to the webpage instead of the XML file in the main
+ example.
+- DOC #1051: Document existing extensions to OpenML-Python besides the
+ shipped scikit-learn extension.
+- FIX #1035: Render class attributes and methods again.
+- ADD #1049: Add a command line tool for configuration openml-python.
+- FIX #1042: Fixes a rare concurrency issue with OpenML-Python and
+ joblib which caused the joblib worker pool to fail.
+- FIX #1053: Fixes a bug which could prevent importing the package in
+ a docker container.
+
+## 0.12.0
+
+- ADD #964: Validate `ignore_attribute`, `default_target_attribute`,
+ `row_id_attribute` are set to attributes that exist on the dataset
+ when calling `create_dataset`.
+- ADD #979: Dataset features and qualities are now also cached in
+ pickle format.
+- ADD #982: Add helper functions for column transformers.
+- ADD #989: `run_model_on_task` will now warn the user that the model
+ passed has already been fitted.
+- ADD #1009: Add the possibility to not download the dataset qualities.
+  The cached version is used even if the download attribute is false.
+- ADD #1016: Add scikit-learn 0.24 support.
+- ADD #1020: Add option to parallelize evaluation of tasks with
+ joblib.
+- ADD #1022: Allow minimum version of dependencies to be listed for a
+ flow, use more accurate minimum versions for scikit-learn
+ dependencies.
+- ADD #1023: Add admin-only calls for adding topics to datasets.
+- ADD #1029: Add support for fetching dataset from a minio server in
+ parquet format.
+- ADD #1031: Generally improve runtime measurements, add them for some
+ previously unsupported flows (e.g. BaseSearchCV derived flows).
+- DOC #973: Change the task used in the welcome page example so it no
+  longer fails when using a numerical dataset.
+- MAINT #671: Improved the performance of `check_datasets_active` by
+ only querying the given list of datasets in contrast to querying all
+ datasets. Modified the corresponding unit test.
+- MAINT #891: Changed the way that numerical features are stored.
+ Numerical features that range from 0 to 255 are now stored as uint8,
+ which reduces the storage space required as well as storing and
+ loading times.
+- MAINT #975, #988: Add CI through Github Actions.
+- MAINT #977: Allow `short` and `long` scenarios for unit tests.
+ Reduce the workload for some unit tests.
+- MAINT #985, #1000: Improve unit test stability and output
+ readability, and adds load balancing.
+- MAINT #1018: Refactor data loading and storage. Data is now
+  compressed on the first call to `get_data`.
+- MAINT #1024: Remove flaky decorator for study unit test.
+- FIX #883 #884 #906 #972: Various improvements to the caching system.
+- FIX #980: Speed up `check_datasets_active`.
+- FIX #984: Add a retry mechanism when the server encounters a
+ database issue.
+- FIX #1004: Fixed an issue that prevented installation on some
+ systems (e.g. Ubuntu).
+- FIX #1013: Fixes a bug where `OpenMLRun.setup_string` was not
+ uploaded to the server, prepares for `run_details` being sent from
+ the server.
+- FIX #1021: Fixes an issue that could occur when running unit tests
+ and openml-python was not in PATH.
+- FIX #1037: Fixes a bug where a dataset could not be loaded if a
+ categorical value had listed nan-like as a possible category.
+
+## 0.11.0
+
+- ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.
+- ADD #777: Allows running a flow on pandas dataframes (in addition to
+ numpy arrays).
+- ADD #888: Allow passing a `task_id` to
+  `run_model_on_task`.
+- ADD #894: Support caching of datasets using feather format as an
+  option.
+- ADD #929: Add `edit_dataset` and `fork_dataset` to allow editing and
+  forking of uploaded datasets.
+- ADD #866, #943: Add support for scikit-learn's
+  `passthrough` and `drop` when uploading
+  flows to OpenML.
+- ADD #879: Add support for scikit-learn's MLP hyperparameter
+  `layer_sizes`.
+- ADD #945: PEP 561 compliance for distributing Type information.
+- DOC #660: Remove nonexistent argument from docstring.
+- DOC #901: The API reference now documents the config file and its
+ options.
+- DOC #912: API reference now shows `create_task`.
+- DOC #954: Remove TODO text from documentation.
+- DOC #960: document how to upload multiple ignore attributes.
+- FIX #873: Fixes an issue which resulted in incorrect URLs when
+ printing OpenML objects after switching the server.
+- FIX #885: Logger no longer registered by default. Added utility
+ functions to easily register logging to console and file.
+- FIX #890: Correct the scaling of data in the SVM example.
+- MAINT #371: `list_evaluations` default `size` changed from `None` to
+ `10_000`.
+- MAINT #767: Source distribution installation is now unit-tested.
+- MAINT #781: Add pre-commit and automated code formatting with black.
+- MAINT #804: Rename arguments of list_evaluations to indicate they
+ expect lists of ids.
+- MAINT #836: OpenML supports only pandas version 1.0.0 or above.
+- MAINT #865: OpenML no longer bundles test files in the source
+ distribution.
+- MAINT #881: Improve the error message for too-long URIs.
+- MAINT #897: Dropping support for Python 3.5.
+- MAINT #916: Adding support for Python 3.8.
+- MAINT #920: Improve error messages for dataset upload.
+- MAINT #921: Improve handling of the OpenML server URL in the config
+ file.
+- MAINT #925: Improve error handling and error message when loading
+ datasets.
+- MAINT #928: Restructures the contributing documentation.
+- MAINT #936: Adding support for scikit-learn 0.23.X.
+- MAINT #945: Make OpenML-Python PEP562 compliant.
+- MAINT #951: Converts TaskType class to a TaskType enum.
+
+## 0.10.2
+
+- ADD #857: Adds task type ID to list_runs
+- DOC #862: Added license BSD 3-Clause to each of the source files.
+
+## 0.10.1
+
+- ADD #175: Automatically adds the docstring of scikit-learn objects
+ to flow and its parameters.
+- ADD #737: New evaluation listing call that includes the
+ hyperparameter settings.
+- ADD #744: It is now possible to only issue a warning and not raise
+ an exception if the package versions for a flow are not met when
+ deserializing it.
+- ADD #783: The URL to download the predictions for a run is now
+ stored in the run object.
+- ADD #790: Adds the uploader name and id as new filtering options for
+ `list_evaluations`.
+- ADD #792: New convenience function `openml.flow.get_flow_id`.
+- ADD #861: Debug-level log information now being written to a file in
+ the cache directory (at most 2 MB).
+- DOC #778: Introduces instructions on how to publish an extension to
+ support other libraries than scikit-learn.
+- DOC #785: The examples section is completely restructured into
+  simple examples, advanced examples and examples showcasing
+ the use of OpenML-Python to reproduce papers which were done with
+ OpenML-Python.
+- DOC #788: New example on manually iterating through the split of a
+ task.
+- DOC #789: Improve the usage of dataframes in the examples.
+- DOC #791: New example for the paper *Efficient and Robust Automated
+ Machine Learning* by Feurer et al. (2015).
+- DOC #803: New example for the paper *Don't Rule Out Simple Models
+ Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear
+ Classifiers in OpenML* by Benjamin Strang et al. (2018).
+- DOC #808: New example demonstrating basic use cases of a dataset.
+- DOC #810: New example demonstrating the use of benchmarking studies
+ and suites.
+- DOC #832: New example for the paper *Scalable Hyperparameter
+ Transfer Learning* by Valerio Perrone et al. (2019)
+- DOC #834: New example showing how to plot the loss surface for a
+ support vector machine.
+- FIX #305: Do not require the external version in the flow XML when
+ loading an object.
+- FIX #734: Better handling of *"old"* flows.
+- FIX #736: Attach a StreamHandler to the openml logger instead of the
+ root logger.
+- FIX #758: Fixes an error which made the client API crash when
+  loading sparse data with categorical variables.
+- FIX #779: Do not fail on a corrupt pickle.
+- FIX #782: Assign the study id to the correct class attribute.
+- FIX #819: Automatically convert column names to type string when
+ uploading a dataset.
+- FIX #820: Make `__repr__` work for datasets which do not have an id.
+- MAINT #796: Rename an argument to make the function
+ `list_evaluations` more consistent.
+- MAINT #811: Print the full error message given by the server.
+- MAINT #828: Create base class for OpenML entity classes.
+- MAINT #829: Reduce the number of data conversion warnings.
+- MAINT #831: Warn if there's an empty flow description when
+ publishing a flow.
+- MAINT #837: Also print the flow XML if a flow fails to validate.
+- FIX #838: Fix `list_evaluations_setups` to work when the number of
+  evaluations is not a multiple of 100.
+- FIX #847: Fixes an issue where the client API would crash when
+ trying to download a dataset when there are no qualities available
+ on the server.
+- MAINT #849: Move logic of most different `publish` functions into
+ the base class.
+- MAINT #850: Remove outdated test code.
+
+## 0.10.0
+
+- ADD #737: Add list_evaluations_setups to return hyperparameters
+ along with list of evaluations.
+- FIX #261: Test server is cleared of all files uploaded during unit
+ testing.
+- FIX #447: All files created by unit tests no longer persist
+  locally.
+- FIX #608: Fixing dataset_id referenced before assignment error in
+ get_run function.
+- FIX #447: All files created by unit tests are deleted after the
+ completion of all unit tests.
+- FIX #589: Fixing a bug that did not successfully upload the columns
+ to ignore when creating and publishing a dataset.
+- DOC #639: More descriptive documentation for the function to convert
+  array format.
+- DOC #719: Add documentation on uploading tasks.
+- ADD #687: Adds a function to retrieve the list of evaluation
+ measures available.
+- ADD #695: A function to retrieve all the data quality measures
+ available.
+- ADD #412: Add a function to trim flow names for scikit-learn flows.
+- ADD #715: `list_evaluations` now has an option to sort
+  evaluations by score (value).
+- ADD #722: Automatic reinstantiation of flow in
+  `run_model_on_task`. Clearer errors if that's not
+  possible.
+- ADD #412: The scikit-learn extension populates the short name field
+ for flows.
+- MAINT #726: Update examples to remove deprecation warnings from
+ scikit-learn
+- MAINT #752: Update OpenML-Python to be compatible with sklearn 0.21
+- ADD #790: Add user ID and name to list_evaluations
+
+## 0.9.0
+
+- ADD #560: OpenML-Python can now handle regression tasks as well.
+- ADD #620, #628, #632, #649, #682: Full support for studies and
+ distinguishes suites from studies.
+- ADD #607: Tasks can now be created and uploaded.
+- ADD #647, #673: Introduced the extension interface. This provides an
+ easy way to create a hook for machine learning packages to perform
+ e.g. automated runs.
+- ADD #548, #646, #676: Support for Pandas DataFrame and
+ SparseDataFrame
+- ADD #662: Results of listing functions can now be returned as
+ pandas.DataFrame.
+- ADD #59: Datasets can now also be retrieved by name.
+- ADD #672: Add timing measurements for runs, when possible.
+- ADD #661: Upload time and error messages now displayed with
+  `list_runs`.
+- ADD #644: Datasets can now be downloaded 'lazily', retrieving only
+ metadata at first, and the full dataset only when necessary.
+- ADD #659: Lazy loading of task splits.
+- ADD #516: `run_flow_on_task` flow uploading is now
+  optional.
+- ADD #680: Adds
+  `openml.config.start_using_configuration_for_example`
+ (and resp. stop) to easily connect to the test server.
+- ADD #75, #653: Adds a pretty print for objects of the top-level
+ classes.
+- FIX #642: `check_datasets_active` now correctly also
+ returns active status of deactivated datasets.
+- FIX #304, #636: Allow serialization of numpy datatypes and list of
+ lists of more types (e.g. bools, ints) for flows.
+- FIX #651: Fixed a bug that would prevent openml-python from finding
+  the user's config file.
+- FIX #693: OpenML-Python uses liac-arff instead of scipy.io for
+ loading task splits now.
+- DOC #678: Better color scheme for code examples in documentation.
+- DOC #681: Small improvements and removing list of missing functions.
+- DOC #684: Add notice to examples that connect to the test server.
+- DOC #688: Add new example on retrieving evaluations.
+- DOC #691: Update contributing guidelines to use Github draft feature
+ instead of tags in title.
+- DOC #692: All functions are documented now.
+- MAINT #184: Dropping Python2 support.
+- MAINT #596: Fewer dependencies for regular pip install.
+- MAINT #652: Numpy and Scipy are no longer required before
+ installation.
+- MAINT #655: Lazy loading is now preferred in unit tests.
+- MAINT #667: Different tag functions now share code.
+- MAINT #666: More descriptive error message for
+  `TypeError` in `list_runs`.
+- MAINT #668: Fix some type hints.
+- MAINT #677: `dataset.get_data` now has consistent
+  behavior in its return type.
+- MAINT #686: Adds ignore directives for several `mypy`
+ folders.
+- MAINT #629, #630: Code now adheres to single PEP8 standard.
+
+## 0.8.0
+
+- ADD #440: Improved dataset upload.
+- ADD #545, #583: Allow uploading a dataset from a pandas DataFrame.
+- ADD #528: New functions to update the status of a dataset.
+- ADD #523: Support for scikit-learn 0.20's new ColumnTransformer.
+- ADD #459: Enhanced support to store runs on disk prior to uploading
+ them to OpenML.
+- ADD #564: New helpers to access the structure of a flow (and find
+ its subflows).
+- ADD #618: The software will from now on retry to connect to the
+ server if a connection failed. The number of retries can be
+ configured.
+- FIX #538: Support loading clustering tasks.
+- FIX #464: Fixes a bug related to listing functions (returns correct
+ listing size).
+- FIX #580: Listing function now works properly when there are less
+ results than requested.
+- FIX #571: Fixes an issue where tasks could not be downloaded in
+ parallel.
+- FIX #536: Flows can now be printed when the flow name is None.
+- FIX #504: Better support for hierarchical hyperparameters when
+ uploading scikit-learn\'s grid and random search.
+- FIX #569: Less strict checking of flow dependencies when loading
+ flows.
+- FIX #431: Pickle of task splits are no longer cached.
+- DOC #540: More examples for dataset uploading.
+- DOC #554: Remove the doubled progress entry from the docs.
+- MAINT #613: Utilize the latest updates in OpenML evaluation
+ listings.
+- MAINT #482: Cleaner interface for handling search traces.
+- MAINT #557: Continuous integration works for scikit-learn 0.18-0.20.
+- MAINT #542: Continuous integration now runs python3.7 as well.
+- MAINT #535: Continuous integration now enforces PEP8 compliance for
+ new code.
+- MAINT #527: Replace deprecated nose by pytest.
+- MAINT #510: Documentation is now built by travis-ci instead of
+ circle-ci.
+- MAINT: Completely re-designed documentation built on sphinx gallery.
+- MAINT #462: Appveyor CI support.
+- MAINT #477: Improve error handling for issue
+ [#479](https://github.com/openml/openml-python/pull/479): the OpenML
+ connector fails earlier and with a better error message when failing
+ to create a flow from the OpenML description.
+- MAINT #561: Improve documentation on running specific unit tests.
+
+## 0.4-0.7
+
+There is no changelog for these versions.
+
+## 0.3.0
+
+- Add this changelog
+- 2nd example notebook PyOpenML.ipynb
+- Pagination support for list datasets and list tasks
+
+## Prior
+
+There is no changelog for prior versions.
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 000000000..d0c4f79d8
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,3 @@
+.jp-InputArea-prompt, .jp-InputPrompt {
+ display: none !important;
+}
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 000000000..7c733fedc
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,155 @@
+# User Guide
+
+This document will guide you through the most important use cases,
+functions and classes in the OpenML Python API. Throughout this
+document, we will use [pandas](https://pandas.pydata.org/) to format and
+filter tables.
+
+## Installation
+
+The OpenML Python package is a connector to
+[OpenML](https://www.openml.org/). It allows you to use and share
+datasets and tasks, run machine learning algorithms on them and then
+share the results online.
+
+The ["Introduction tutorial and setup"][intro] gives a short introduction on how to install and
+set up the OpenML Python connector, followed by a simple example.
+
+## Configuration
+
+The configuration file resides in the directory `.config/openml` in the
+home directory of the user and is called `config` (more specifically, it
+resides in the [configuration directory specified by the XDG Base
+Directory
+Specification](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html)).
+It consists of `key = value` pairs which are separated by newlines. The
+following keys are defined (a minimal example file is shown after this list):
+
+- apikey: required to access the server. The [introduction tutorial][intro] describes how to obtain an API key.
+- server: the server to connect to (default: `http://www.openml.org`).
+ For connection to the test server, set this to `test.openml.org`.
+- cachedir: the root folder where the cache file directories should be created.
+ If not given, will default to `~/.openml/cache`
+- avoid_duplicate_runs: if set to `True` (default), when `run_flow_on_task` or similar methods
+ are called a lookup is performed to see if there already
+ exists such a run on the server. If so, download those
+ results instead.
+- retry_policy: Defines how to react when the server is unavailable or
+ experiencing high load. It determines both how often to
+ attempt to reconnect and how quickly to do so. Please don't
+  use `human` in an automated script that you run more than
+  one instance of; it might increase the time to complete your
+  jobs and those of others. One of:
+ - human (default): For people running openml in interactive
+ fashion. Try only a few times, but in quick succession.
+ - robot: For people using openml in an automated fashion. Keep
+ trying to reconnect for a longer time, quickly increasing
+ the time between retries.
+
+- connection_n_retries: number of times to retry a request if it fails.
+  Default depends on the retry_policy (5 for `human`, 50 for `robot`).
+- verbosity: the level of output:
+ - 0: normal output
+ - 1: info output
+ - 2: debug output
+
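+For illustration, a minimal configuration file could look as follows;
+the API key is a placeholder and the other values simply mirror the
+defaults described above:
+
+```
+apikey = 0123456789abcdef0123456789abcdef
+cachedir = /home/example/.openml/cache
+avoid_duplicate_runs = True
+retry_policy = human
+connection_n_retries = 5
+verbosity = 0
+```
+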
+This file can easily be edited with the `openml` command line interface.
+To see where the file is stored and what its values are, run
+`openml configure none`.
+
+## Docker
+
+It is also possible to try out the latest development version of
+`openml-python` with docker:
+
+``` bash
+docker run -it openml/openml-python
+```
+
+See the [openml-python docker
+documentation](https://github.com/openml/openml-python/blob/main/docker/readme.md)
+for more information.
+
+## Key concepts
+
+OpenML contains several key concepts which it needs to make machine
+learning research shareable. A machine learning experiment consists of
+one or several **runs**, which describe the performance of an algorithm
+(called a **flow** in OpenML) with its hyperparameter settings (called a
+**setup**) on a **task**. A **task** is the combination of a
+**dataset**, a split and an evaluation metric. In this user guide we
+will go from listing and exploring existing **tasks** to actually
+running machine learning algorithms on them. In a further user guide we
+will examine how to search through **datasets** in order to curate a
+list of **tasks**.
+
+A further explanation is given in the [OpenML user
+guide](https://openml.github.io/OpenML/#concepts).
+
+## Working with tasks
+
+You can think of a task as an experimentation protocol, describing how
+to apply a machine learning model to a dataset in a way that is
+comparable with the results of others (more on how to do that further
+down). Tasks are containers, defining which dataset to use, what kind of
+task we're solving (regression, classification, clustering, etc.)
+and which column to predict. Furthermore, a task also describes how to split
+the dataset into a train and test set, whether to use several disjoint
+train and test splits (cross-validation) and whether this should be
+repeated several times. Also, the task defines a target metric for which
+a flow should be optimized.
+
+If you want to know more about tasks, try the ["Task tutorial"](../examples/30_extended/tasks_tutorial)
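+
+As a small illustration (task 31 is just an arbitrary example of a
+supervised classification task, and the split-related call assumes such
+a task), you can inspect a task and its splits like this:
+
+```python
+import openml
+
+task = openml.tasks.get_task(31)
+print(task.task_type, task.dataset_id, task.target_name)
+
+# Tasks define the splits; e.g., get the train/test indices of the first fold
+train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0)
+print(len(train_indices), len(test_indices))
+```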
+
+## Running machine learning algorithms and uploading results
+
+In order to upload and share results of running a machine learning
+algorithm on a task, we need to create an
+[openml.runs.OpenMLRun][]. A run object can be
+created by running an [openml.flows.OpenMLFlow][] or a scikit-learn compatible model on a task. We will
+focus on the simpler example of running a scikit-learn model.
+
+Flows are descriptions of something runnable which does the machine
+learning. A flow contains all information to set up the necessary
+machine learning library and its dependencies as well as all possible
+parameters.
+
+A run is the outcome of running a flow on a task. It contains all
+parameter settings for the flow, a setup string (most likely a command
+line call) and all predictions of that run. When a run is uploaded to
+the server, the server automatically calculates several metrics which
+can be used to compare the performance of different flows to each other.
+
+So far, the OpenML Python connector works only with estimator objects
+following the [scikit-learn estimator
+API](https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects).
+Those can be directly run on a task, and a flow will automatically be
+created or downloaded from the server if it already exists.
+
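+As a minimal sketch (task 32 is just an example task id, and publishing
+requires an API key), running and sharing a scikit-learn model looks
+like this:
+
+```python
+import openml
+from sklearn import tree
+
+task = openml.tasks.get_task(32)
+clf = tree.DecisionTreeClassifier()
+
+# A flow describing the model is created or reused on the server as needed
+run = openml.runs.run_model_on_task(clf, task)
+run.publish()
+print(run.openml_url)
+```
+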
+See ["Simple Flows and Runs"](../examples/20_basic/simple_flows_and_runs_tutorial) for a tutorial that covers how to train different machine learning models,
+how to run machine learning models on OpenML data and how to share the
+results.
+
+## Datasets
+
+OpenML provides a large collection of datasets and the benchmark
+[OpenML100](https://docs.openml.org/benchmark/) which consists of a
+curated list of datasets.
+
+You can find the dataset that best fits your requirements by making use
+of the available metadata. The ["extended datasets"](../examples/30_extended/datasets_tutorial) tutorial explains how to
+get a list of datasets, how to filter the list to find the dataset that
+suits your requirements and how to download a dataset.
+
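+For instance (a small sketch; quality columns such as
+`NumberOfInstances` are assumed to be present in the returned
+dataframe), you can list datasets and filter them with pandas:
+
+```python
+import openml
+
+datasets = openml.datasets.list_datasets(output_format="dataframe")
+
+# Keep only small, binary classification style datasets
+small = datasets[
+    (datasets["NumberOfInstances"] < 1000) & (datasets["NumberOfClasses"] == 2)
+]
+print(small[["did", "name", "NumberOfInstances"]].head())
+```
+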
+OpenML is about sharing machine learning results and the datasets they
+were obtained on. Learn how to share your datasets in the
+["Upload"](../examples/30_extended/create_upload_tutorial) tutorial.
+
+## Extending OpenML-Python
+
+OpenML-Python provides an extension interface to connect machine
+learning libraries directly to the API and ships a `scikit-learn`
+extension. Read more about them in the ["Extensions"](extensions.md) section.
+
+[intro]: examples/20_basic/introduction_tutorial/
+
diff --git a/examples/20_basic/introduction_tutorial.py b/examples/20_basic/introduction_tutorial.py
index 26d3143dd..a850a0792 100644
--- a/examples/20_basic/introduction_tutorial.py
+++ b/examples/20_basic/introduction_tutorial.py
@@ -1,10 +1,8 @@
-"""
-Introduction tutorial & Setup
-=============================
+# %% [markdown]
+# # Introduction tutorial & Setup
+# An example of how to set up OpenML-Python, followed by a simple example.
-An example how to set up OpenML-Python followed up by a simple example.
-"""
-############################################################################
+# %% [markdown]
# OpenML is an online collaboration platform for machine learning which allows
# you to:
#
@@ -16,22 +14,16 @@
# * Large scale benchmarking, compare to state of the art
#
-############################################################################
-# Installation
-# ^^^^^^^^^^^^
+# %% [markdown]
+# ## Installation
# Installation is done via ``pip``:
#
-# .. code:: bash
-#
-# pip install openml
-#
-# For further information, please check out the installation guide at
-# :ref:`installation`.
-#
+# ```bash
+# pip install openml
+# ```
-############################################################################
-# Authentication
-# ^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Authentication
#
# The OpenML server can only be accessed by users who have signed up on the
# OpenML platform. If you don’t have an account yet, sign up now.
@@ -55,28 +47,38 @@
# you authenticate for the duration of the python process.
-############################################################################
-
-# License: BSD 3-Clause
+# %%
import openml
from sklearn import neighbors
-############################################################################
-# .. warning::
-# .. include:: ../../test_server_usage_warning.txt
-openml.config.start_using_configuration_for_example()
+# %% [markdown]
+#
+# **Warning**
+#
+# This example uploads data. For that reason, this example connects to the
+# test server at test.openml.org.
+# This prevents the main server from becoming overloaded with example datasets, tasks,
+# runs, and other submissions.
+# Using this test server may affect the behavior and performance of the
+# OpenML-Python API.
+#
+#
-############################################################################
+# %%
+openml.config.start_using_configuration_for_example()
+
+# %% [markdown]
# When using the main server instead, make sure your apikey is configured.
# This can be done with the following line of code (uncomment it!).
# Never share your apikey with others.
+# %%
# openml.config.apikey = 'YOURKEY'
-############################################################################
-# Caching
-# ^^^^^^^
+# %% [markdown]
+# ## Caching
# When downloading datasets, tasks, runs and flows, they will be cached to
# retrieve them without calling the server later. As with the API key,
# the cache directory can be either specified through the config file or
@@ -87,23 +89,27 @@
# will use **~/.openml/cache** as the cache directory.
# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
+# %%
# Uncomment and set your OpenML cache directory
# import os
# openml.config.cache_directory = os.path.expanduser('YOURDIR')
+# openml.config.set_root_cache_directory("YOURDIR")
-############################################################################
-# Simple Example
-# ^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Simple Example
# Download the OpenML task for the eeg-eye-state.
+
+# %%
task = openml.tasks.get_task(403)
-data = openml.datasets.get_dataset(task.dataset_id)
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
+
run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
# Publish the experiment on OpenML (optional, requires an API key).
# For this tutorial, our configuration publishes to the test server
# as to not crowd the main server with runs created by examples.
myrun = run.publish()
-print(f"kNN on {data.name}: {myrun.openml_url}")
-############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py
index 9b18aab14..f855184c0 100644
--- a/examples/20_basic/simple_datasets_tutorial.py
+++ b/examples/20_basic/simple_datasets_tutorial.py
@@ -1,33 +1,29 @@
-"""
-========
-Datasets
-========
-
-A basic tutorial on how to list, load and visualize datasets.
-"""
-############################################################################
+# %% [markdown]
+# # Datasets
+# A basic tutorial on how to list, load and visualize datasets.
+#
# In general, we recommend working with tasks, so that the results can
# be easily reproduced. Furthermore, the results can be compared to existing results
# at OpenML. However, for the purposes of this tutorial, we are going to work with
# the datasets directly.
-# License: BSD 3-Clause
+# %%
import openml
-############################################################################
-# List datasets
-# =============
+# %% [markdown]
+# ## List datasets
-datasets_df = openml.datasets.list_datasets()
+# %%
+datasets_df = openml.datasets.list_datasets(output_format="dataframe")
print(datasets_df.head(n=10))
-############################################################################
-# Download a dataset
-# ==================
+# %% [markdown]
+# ## Download a dataset
+# %%
# Iris dataset https://www.openml.org/d/61
-dataset = openml.datasets.get_dataset(dataset_id="iris", version=1)
+dataset = openml.datasets.get_dataset(dataset_id=61, version=1)
# Print a summary
print(
@@ -37,33 +33,31 @@
print(f"URL: {dataset.url}")
print(dataset.description[:500])
-############################################################################
-# Load a dataset
-# ==============
-
+# %% [markdown]
+# ## Load a dataset
# X - An array/dataframe where each row represents one example with
# the corresponding feature values.
+#
# y - the classes for each example
+#
# categorical_indicator - an array that indicates which feature is categorical
+#
# attribute_names - the names of the features for the examples (X) and
# target feature (y)
+
+# %%
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
)
-############################################################################
-# Tip: you can get a progress bar for dataset downloads, simply set it in
-# the configuration. Either in code or in the configuration file
-# (see also the introduction tutorial)
-
-openml.config.show_progress = True
-
-
-############################################################################
+# %% [markdown]
# Visualize the dataset
-# =====================
+# %%
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
@@ -80,3 +74,5 @@ def hide_current_axis(*args, **kwds):
iris_plot = sns.pairplot(combined_data, hue="class")
iris_plot.map_upper(hide_current_axis)
plt.show()
+
+# License: BSD 3-Clause
diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py
index f7d7a49d1..9f35e8bc1 100644
--- a/examples/20_basic/simple_flows_and_runs_tutorial.py
+++ b/examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -1,49 +1,65 @@
-"""
-Flows and Runs
-==============
+# %% [markdown]
+# # Flows and Runs
+# A simple tutorial on how to train/run a model and how to upload the results.
-A simple tutorial on how to train/run a model and how to upload the results.
-"""
+# %%
+import openml
+from sklearn import ensemble, neighbors
-# License: BSD 3-Clause
-from sklearn import ensemble, neighbors
-import openml
+# %% [markdown]
+#
+# **Warning**
+#
+# This example uploads data. For that reason, this example connects to the
+# test server at test.openml.org.
+# This prevents the main server from becoming overloaded with example datasets, tasks,
+# runs, and other submissions.
+# Using this test server may affect the behavior and performance of the
+# OpenML-Python API.
+#
+#
-############################################################################
-# .. warning::
-# .. include:: ../../test_server_usage_warning.txt
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
-# Train a machine learning model
-# ==============================
+# %% [markdown]
+# ## Train a machine learning model
+
+# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
-# NOTE: We are using dataset "diabetes" from the test server: https://test.openml.org/d/20
-dataset = openml.datasets.get_dataset(dataset_id="diabetes", version=1)
+# %%
+dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
- target=dataset.default_target_attribute
+ dataset_format="dataframe", target=dataset.default_target_attribute
)
+if y is None:
+    y = X["class"]
+    X = X.drop(columns=["class"])
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)
-############################################################################
-# Running a model on a task
-# =========================
+# %% [markdown]
+# ## Running a model on a task
+# %%
task = openml.tasks.get_task(119)
+
clf = ensemble.RandomForestClassifier()
run = openml.runs.run_model_on_task(clf, task)
print(run)
-############################################################################
-# Publishing the run
-# ==================
+# %% [markdown]
+# ## Publishing the run
+# %%
myrun = run.publish()
print(f"Run was uploaded to {myrun.openml_url}")
print(f"The flow can be found at {myrun.flow.openml_url}")
-############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/20_basic/simple_suites_tutorial.py b/examples/20_basic/simple_suites_tutorial.py
index 3daf7b992..5a1b429b1 100644
--- a/examples/20_basic/simple_suites_tutorial.py
+++ b/examples/20_basic/simple_suites_tutorial.py
@@ -1,19 +1,14 @@
-"""
-================
-Benchmark suites
-================
-
-This is a brief showcase of OpenML benchmark suites, which were introduced by
-`Bischl et al. (2019) `_. Benchmark suites standardize the
-datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML
-and simplify both the sharing of the setup and the results.
-"""
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Benchmark suites
+# This is a brief showcase of OpenML benchmark suites, which were introduced by
+# [Bischl et al. (2019)](https://arxiv.org/abs/1708.03731v2). Benchmark suites standardize the
+# datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML
+# and simplify both the sharing of the setup and the results.
+# %%
import openml
-####################################################################################################
+# %% [markdown]
# OpenML-CC18
# ===========
#
@@ -30,40 +25,43 @@
# imbalanced datasets which require special treatment for both algorithms and evaluation
# measures).
#
-# A full description can be found in the `OpenML benchmarking docs
-# `_.
+# A full description can be found in the
+# [OpenML benchmarking docs](https://docs.openml.org/benchmark/#openml-cc18).
#
# In this example we'll focus on how to use benchmark suites in practice.
-####################################################################################################
+# %% [markdown]
# Downloading benchmark suites
# ============================
-# OpenML Benchmarking Suites and the OpenML-CC18
-# https://www.openml.org/s/99
-suite = openml.study.get_suite("OpenML-CC18")
+# %%
+suite = openml.study.get_suite(99)
print(suite)
-####################################################################################################
+# %% [markdown]
# The benchmark suite does not download the included tasks and datasets itself, but only contains
# a list of which tasks constitute the study.
#
# Tasks can then be accessed via
+# %%
tasks = suite.tasks
print(tasks)
-####################################################################################################
+# %% [markdown]
# and iterated over for benchmarking. For speed reasons we only iterate over the first three tasks:
+# %%
for task_id in tasks[:3]:
task = openml.tasks.get_task(task_id)
print(task)
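+
+# %% [markdown]
+# As a minimal sketch of what benchmarking on one of these tasks could look like, we run an
+# arbitrary scikit-learn classifier on the first task of the suite (the model choice is purely
+# illustrative and not part of the suite itself; nothing is uploaded here, since a run is only
+# sent to the server when `publish()` is called and `upload_flow` is disabled):
+
+# %%
+from sklearn.tree import DecisionTreeClassifier
+
+task = openml.tasks.get_task(tasks[0])
+run = openml.runs.run_model_on_task(
+    DecisionTreeClassifier(), task, upload_flow=False, avoid_duplicate_runs=False
+)
+print(run)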
-####################################################################################################
+# %% [markdown]
# Further examples
# ================
#
-# * :ref:`sphx_glr_examples_30_extended_suites_tutorial.py`
-# * :ref:`sphx_glr_examples_30_extended_study_tutorial.py`
-# * :ref:`sphx_glr_examples_40_paper_2018_ida_strang_example.py`
+# * [Suites Tutorial](../../30_extended/suites_tutorial)
+# * [Study Tutorial](../../30_extended/study_tutorial)
+# * [Paper example: Strang et al.](../../40_paper/2018_ida_strang_example)
+
+# License: BSD 3-Clause
diff --git a/examples/30_extended/benchmark_with_optunahub.py b/examples/30_extended/benchmark_with_optunahub.py
index 0fd4a63e5..67d106da3 100644
--- a/examples/30_extended/benchmark_with_optunahub.py
+++ b/examples/30_extended/benchmark_with_optunahub.py
@@ -7,28 +7,45 @@
"""
############################################################################
# Please make sure to install the dependencies with:
-# ``pip install openml optunahub hebo`` and ``pip install --upgrade pymoo``
+# ``pip install "openml>=0.15.1" optuna plotly``
# Then we import all the necessary modules.
# License: BSD 3-Clause
+import logging
+
+import optuna
+
import openml
from openml.extensions.sklearn import cat
from openml.extensions.sklearn import cont
-import optuna
-import optunahub
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
-# Set your openml api key if you want to publish the run
+
+logger = logging.Logger(name="Experiment Logger", level=1)
+
+# Set your OpenML API key if you want to upload your results to OpenML (e.g. to
+# https://openml.org/search?type=run&sort=date). To get one, simply make an
+# account (you don't need one for anything else, just to upload your results),
+# open your profile and select the API key,
+# or log in and navigate to https://www.openml.org/auth/api-key
openml.config.apikey = ""
############################################################################
# Prepare for preprocessors and an OpenML task
# ============================================
+# OpenML is built around several key concepts that make machine learning research shareable.
+# A machine learning experiment consists of one or several runs, each describing the performance
+# of an algorithm (called a flow in OpenML) with its hyperparameter settings (called a setup) on a
+# task. A task is the combination of a dataset, a split and an evaluation metric. We choose a
+# dataset from OpenML (https://www.openml.org/d/1464) and a corresponding task
+# (https://www.openml.org/t/10101). To create your own dataset and task, please refer to
+# https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html
+
# https://www.openml.org/search?type=study&study_type=task&id=218
task_id = 10101
seed = 42
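+
+# As a small, optional sketch of how these concepts map onto API objects (``get_task``,
+# ``get_dataset``, ``task_type`` and ``name`` are regular openml-python accessors), the chosen
+# task and its dataset can be inspected before any run is created:
+task = openml.tasks.get_task(task_id)
+dataset = task.get_dataset()
+logger.log(1, f"Task type: {task.task_type}, dataset: {dataset.name}")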
@@ -41,13 +58,19 @@
preproc = ColumnTransformer([categorical_preproc, numerical_preproc])
############################################################################
-# Define a pipeline for the hyperparameter optimization
+# Define a pipeline for the hyperparameter optimization (this is standard for Optuna)
# =====================================================
-# Since we use `OptunaHub `__ for the benchmarking of hyperparameter optimization,
+# Since we use Optuna for the hyperparameter optimization,
# we follow the `Optuna `__ search space design.
-# We can simply pass the parametrized classifier to `run_model_on_task` to obtain the performance of the pipeline
+
+# OpenML runs
+# We can simply pass the parametrized classifier to `run_model_on_task` to obtain the performance
+# of the pipeline
# on the specified OpenML task.
+# If you want to share your results along with an easily reproducible pipeline, you can set an
+# API key and simply upload your results.
+# You can find more examples at https://www.openml.org/
def objective(trial: optuna.Trial) -> Pipeline:
@@ -57,47 +80,37 @@ def objective(trial: optuna.Trial) -> Pipeline:
random_state=seed,
)
pipe = Pipeline(steps=[("preproc", preproc), ("model", clf)])
+ logger.log(1, f"Running pipeline - {pipe}")
run = openml.runs.run_model_on_task(pipe, task=task_id, avoid_duplicate_runs=False)
+
+ logger.log(1, f"Model has been trained - {run}")
if openml.config.apikey != "":
try:
run.publish()
+
+ logger.log(1, f"Run was uploaded to - {run.openml_url}")
except Exception as e:
- print(f"Could not publish run - {e}")
+ logger.log(1, f"Could not publish run - {e}")
else:
- print(
- "If you want to publish your results to OpenML, please set an apikey using `openml.config.apikey = ''`"
+ logger.log(
+ 0,
+ "If you want to publish your results to OpenML, please set an apikey",
)
accuracy = max(run.fold_evaluations["predictive_accuracy"][0].values())
- return accuracy
-
+ logger.log(0, f"Accuracy {accuracy}")
-############################################################################
-# Load a sampler from OptunaHub
-# =============================
-
-# OptunaHub is a feature-sharing plotform for hyperparameter optimization methods.
-# For example, we load a state-of-the-art algorithm (`HEBO `__
-# , the winning solution of `NeurIPS 2020 Black-Box Optimisation Challenge `__)
-# from OptunaHub here.
+ return accuracy
-sampler = optunahub.load_module("samplers/hebo").HEBOSampler(seed=seed)
############################################################################
# Optimize the pipeline
# =====================
-
-# We now run the optimization. For more details about Optuna API,
-# please visit `the API reference `__.
-
-study = optuna.create_study(direction="maximize", sampler=sampler)
+study = optuna.create_study(direction="maximize")
+logger.log(0, f"Study {study}")
study.optimize(objective, n_trials=15)
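+
+# As a brief, optional illustration (``best_value`` and ``best_params`` are standard Optuna
+# study attributes), the best configuration found can be inspected before plotting the
+# optimization history below:
+logger.log(1, f"Best accuracy: {study.best_value}")
+logger.log(1, f"Best hyperparameters: {study.best_params}")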
############################################################################
# Visualize the optimization history
# ==================================
-
-# It is very simple to visualize the result by the Optuna visualization module.
-# For more details, please check `the API reference `__.
-
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
diff --git a/examples/30_extended/configure_logging.py b/examples/30_extended/configure_logging.py
index 3878b0436..0191253e9 100644
--- a/examples/30_extended/configure_logging.py
+++ b/examples/30_extended/configure_logging.py
@@ -1,31 +1,26 @@
-"""
-========
-Logging
-========
-
-Explains openml-python logging, and shows how to configure it.
-"""
-##################################################################################
-# Openml-python uses the `Python logging module `_
+# %% [markdown]
+# # Logging
+# This tutorial explains openml-python logging, and shows how to configure it.
+# Openml-python uses the [Python logging module](https://docs.python.org/3/library/logging.html)
# to provide users with log messages. Each log message is assigned a level of importance, see
# the table in Python's logging tutorial
-# `here `_.
+# [here](https://docs.python.org/3/howto/logging.html#when-to-use-logging).
#
# By default, openml-python will print log messages of level `WARNING` and above to console.
# All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be
# found in your cache directory (see also the
-# :ref:`sphx_glr_examples_20_basic_introduction_tutorial.py`).
+# [introduction tutorial](../20_basic/introduction_tutorial).
# These file logs are automatically deleted if needed, and use at most 2MB of space.
#
# It is possible to configure what log levels to send to console and file.
# When downloading a dataset from OpenML, a `DEBUG`-level message is written:
-# License: BSD 3-Clause
-
+# %%
import openml
openml.datasets.get_dataset("iris", version=1)
+# %% [markdown]
# With default configuration, the above example will show no output to console.
# However, in your cache directory you should find a file named 'openml_python.log',
# which has a DEBUG message written to it. It should be either like
@@ -35,12 +30,14 @@
# , depending on whether or not you had downloaded iris before.
# The processed log levels can be configured programmatically:
+# %%
import logging
openml.config.set_console_log_level(logging.DEBUG)
openml.config.set_file_log_level(logging.WARNING)
openml.datasets.get_dataset("iris", version=1)
+# %% [markdown]
# Now the log level that was previously written to file should also be shown in the console.
# The message is now no longer written to file as the `file_log` was set to level `WARNING`.
#
@@ -52,3 +49,5 @@
# * 0: `logging.WARNING` and up.
# * 1: `logging.INFO` and up.
# * 2: `logging.DEBUG` and up (i.e. all messages).
+#
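+# As a rough sketch of that mapping (the dictionary below is purely illustrative and not part
+# of the openml API; it simply combines the `logging` constants with the
+# `set_console_log_level` helper shown earlier):
+
+# %%
+verbosity_to_level = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
+openml.config.set_console_log_level(verbosity_to_level[1])  # same effect as verbosity level 1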
+# License: BSD 3-Clause
diff --git a/examples/30_extended/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py
index 7825d8cf7..2b010401c 100644
--- a/examples/30_extended/create_upload_tutorial.py
+++ b/examples/30_extended/create_upload_tutorial.py
@@ -1,12 +1,8 @@
-"""
-Dataset upload tutorial
-=======================
-
-A tutorial on how to create and upload a dataset to OpenML.
-"""
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Dataset upload tutorial
+# A tutorial on how to create and upload a dataset to OpenML.
+# %%
import numpy as np
import pandas as pd
import sklearn.datasets
@@ -15,14 +11,14 @@
import openml
from openml.datasets.functions import create_dataset
-############################################################################
+# %% [markdown]
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
-############################################################################
+# %% [markdown]
# Below we will cover the following cases of the dataset object:
#
# * A numpy array
@@ -31,17 +27,17 @@
# * A sparse matrix
# * A pandas sparse dataframe
-############################################################################
+# %% [markdown]
# Dataset is a numpy array
# ========================
# A numpy array can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
-# Prepare dataset
-# ^^^^^^^^^^^^^^^
+# ### Prepare dataset
# Load an example dataset from scikit-learn which we will upload to OpenML.org
# via the API.
+# %%
diabetes = sklearn.datasets.load_diabetes()
name = "Diabetes(scikit-learn)"
X = diabetes.data
@@ -49,13 +45,14 @@
attribute_names = diabetes.feature_names
description = diabetes.DESCR
-############################################################################
+# %% [markdown]
# OpenML does not distinguish between the attributes and targets on the data
# level and stores all data in a single matrix.
#
# The target feature is indicated as meta-data of the dataset (and tasks on
# that data).
+# %%
data = np.concatenate((X, y.reshape((-1, 1))), axis=1)
attribute_names = list(attribute_names)
attributes = [(attribute_name, "REAL") for attribute_name in attribute_names] + [
@@ -68,14 +65,14 @@
)
paper_url = "https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf"
-############################################################################
-# Create the dataset object
-# ^^^^^^^^^^^^^^^^^^^^^^^^^
+# %% [markdown]
+# ### Create the dataset object
# The definition of all fields can be found in the XSD files describing the
# expected format:
#
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
+# %%
diabetes_dataset = create_dataset(
# The name of the dataset (needs to be unique).
# Must not be longer than 128 characters and only contain
@@ -113,20 +110,20 @@
paper_url=paper_url,
)
-############################################################################
+# %%
diabetes_dataset.publish()
print(f"URL for dataset: {diabetes_dataset.openml_url}")
-############################################################################
-# Dataset is a list
-# =================
+# %% [markdown]
+# ## Dataset is a list
# A list can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Weather dataset:
# https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
+# %%
data = [
["sunny", 85, 85, "FALSE", "no"],
["sunny", 80, 90, "TRUE", "no"],
@@ -186,14 +183,13 @@
version_label="example",
)
-############################################################################
+# %%
weather_dataset.publish()
print(f"URL for dataset: {weather_dataset.openml_url}")
-############################################################################
-# Dataset is a pandas DataFrame
-# =============================
+# %% [markdown]
+# ## Dataset is a pandas DataFrame
# It might happen that your dataset is made of heterogeneous data which can usually
# be stored as a Pandas DataFrame. DataFrames offer the advantage of
# storing the type of data for each column as well as the attribute names.
@@ -202,20 +198,23 @@
# function :func:`openml.datasets.create_dataset`. In this regard, you only
# need to pass ``'auto'`` to the ``attributes`` parameter.
+# %%
df = pd.DataFrame(data, columns=[col_name for col_name, _ in attribute_names])
+
# enforce the categorical column to have a categorical dtype
df["outlook"] = df["outlook"].astype("category")
df["windy"] = df["windy"].astype("bool")
df["play"] = df["play"].astype("category")
print(df.info())
-############################################################################
+# %% [markdown]
# We enforce the column 'outlook' and 'play' to be a categorical
# dtype while the column 'windy' is kept as a boolean column. 'temperature'
# and 'humidity' are kept as numeric columns. Then, we can
# call :func:`openml.datasets.create_dataset` by passing the dataframe and
# fixing the parameter ``attributes`` to ``'auto'``.
+# %%
weather_dataset = create_dataset(
name="Weather",
description=description,
@@ -233,15 +232,15 @@
version_label="example",
)
-############################################################################
-
+# %%
weather_dataset.publish()
print(f"URL for dataset: {weather_dataset.openml_url}")
-############################################################################
+# %% [markdown]
# Dataset is a sparse matrix
# ==========================
+# %%
sparse_data = coo_matrix(
([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]))
)
@@ -269,15 +268,14 @@
version_label="example",
)
-############################################################################
+# %%
xor_dataset.publish()
print(f"URL for dataset: {xor_dataset.openml_url}")
-############################################################################
-# Dataset is a pandas dataframe with sparse columns
-# =================================================
+# %% [markdown]
+# ## Dataset is a pandas dataframe with sparse columns
+
+# %%
sparse_data = coo_matrix(
([1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]))
@@ -303,11 +301,11 @@
version_label="example",
)
-############################################################################
+# %%
xor_dataset.publish()
print(f"URL for dataset: {xor_dataset.openml_url}")
-
-############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/custom_flow_.py b/examples/30_extended/custom_flow_.py
index 241f3e6eb..15ec0e1fb 100644
--- a/examples/30_extended/custom_flow_.py
+++ b/examples/30_extended/custom_flow_.py
@@ -1,20 +1,18 @@
-"""
-================================
-Creating and Using a Custom Flow
-================================
+# %% [markdown]
+# # Creating and Using a Custom Flow
-The most convenient way to create a flow for your machine learning workflow is to generate it
-automatically as described in the :ref:`sphx_glr_examples_30_extended_flow_id_tutorial.py` tutorial.
-However, there are scenarios where this is not possible, such
-as when the flow uses a framework without an extension or when the flow is described by a script.
+# The most convenient way to create a flow for your machine learning workflow is to generate it
+# automatically as described in the
+# ["Obtaining Flow IDs"](../../30_extended/flow_id_tutorial) tutorial.
+# However, there are scenarios where this is not possible, such
+# as when the flow uses a framework without an extension or when the flow is described by a script.
-In those cases you can still create a custom flow by following the steps of this tutorial.
-As an example we will use the flows generated for the `AutoML Benchmark `_,
-and also show how to link runs to the custom flow.
-"""
-
-# License: BSD 3-Clause
+# In those cases you can still create a custom flow by following the steps of this tutorial.
+# As an example we will use the flows generated for the
+# [AutoML Benchmark](https://openml.github.io/automlbenchmark/),
+# and also show how to link runs to the custom flow.
+# %%
from collections import OrderedDict
import numpy as np
@@ -22,14 +20,15 @@
from openml import OpenMLClassificationTask
from openml.runs.functions import format_prediction
-####################################################################################################
+# %% [markdown]
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+
+# %%
openml.config.start_using_configuration_for_example()
-####################################################################################################
-# 1. Defining the flow
-# ====================
+# %% [markdown]
+# ## 1. Defining the flow
# The first step is to define all the hyperparameters of your flow.
# The API pages feature a descriptions of each variable of the :class:`openml.flows.OpenMLFlow`.
# Note that `external version` and `name` together uniquely identify a flow.
@@ -43,6 +42,7 @@
# Make sure to leave enough information so others can determine exactly which
# version of the package/script is used. Use tags so users can find your flow easily.
+# %%
general = dict(
name="automlbenchmark_autosklearn",
description=(
@@ -55,12 +55,13 @@
dependencies="amlb==0.9",
)
-####################################################################################################
+# %% [markdown]
# Next we define the flow hyperparameters. We define their name and default value in `parameters`,
# and provide meta-data for each hyperparameter through `parameters_meta_info`.
# Note that even though the argument name is `parameters` they describe the hyperparameters.
# The use of ordered dicts is required.
+# %%
flow_hyperparameters = dict(
parameters=OrderedDict(time="240", memory="32", cores="8"),
parameters_meta_info=OrderedDict(
@@ -70,7 +71,7 @@
),
)
-####################################################################################################
+# %% [markdown]
# It is possible to build a flow which uses other flows.
# For example, the Random Forest Classifier is a flow, but you could also construct a flow
# which uses a Random Forest Classifier in a ML pipeline. When constructing the pipeline flow,
@@ -86,6 +87,7 @@
# Note: flow 9313 is not actually the right flow on the test server,
# but that does not matter for this demonstration.
+# %%
autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1
subflow = dict(
components=OrderedDict(automl_tool=autosklearn_flow),
@@ -93,7 +95,7 @@
# components=OrderedDict(),
)
-####################################################################################################
+# %% [markdown]
# With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish.
# Because we provided all the details already, we do not need to provide a `model` to the flow.
#
@@ -103,6 +105,7 @@
# So whether you have a model with no extension or no model at all, explicitly set
# the model of the flow to `None`.
+# %%
autosklearn_amlb_flow = openml.flows.OpenMLFlow(
**general,
**flow_hyperparameters,
@@ -112,14 +115,14 @@
autosklearn_amlb_flow.publish()
print(f"autosklearn flow created: {autosklearn_amlb_flow.flow_id}")
-####################################################################################################
-# 2. Using the flow
-# ====================
+# %% [markdown]
+# ## 2. Using the flow
# This Section will show how to upload run data for your custom flow.
# Take care to change the values of parameters as well as the task id,
# to reflect the actual run.
# Task and parameter values in the example are fictional.
+# %%
flow_id = autosklearn_amlb_flow.flow_id
parameters = [
@@ -133,7 +136,7 @@
dataset_id = task.get_dataset().dataset_id
-####################################################################################################
+# %% [markdown]
# The last bit of information for the run we need are the predicted values.
# The exact format of the predictions will depend on the task.
#
@@ -158,6 +161,8 @@
# You can ignore this code, or use it to better understand the formatting of the predictions.
#
# Find the repeats/folds for this task:
+
+# %%
n_repeats, n_folds, _ = task.get_split_dimensions()
all_test_indices = [
(repeat, fold, index)
@@ -193,10 +198,11 @@
)
predictions.append(prediction)
-####################################################################################################
+# %% [markdown]
# Finally we can create the OpenMLRun object and upload.
# We use the argument setup_string because the used flow was a script.
+# %%
benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
my_run = openml.runs.OpenMLRun(
task_id=task_id,
@@ -211,4 +217,6 @@
my_run.publish()
print("run created:", my_run.run_id)
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
index 77a46d8b0..d7c74b843 100644
--- a/examples/30_extended/datasets_tutorial.py
+++ b/examples/30_extended/datasets_tutorial.py
@@ -1,21 +1,14 @@
-"""
-========
-Datasets
-========
-
-How to list and download datasets.
-"""
-
-# License: BSD 3-Clauses
+# %% [markdown]
+# # Datasets
+# How to list and download datasets.
+
+# %%
import pandas as pd
import openml
from openml.datasets import edit_dataset, fork_dataset, get_dataset
-############################################################################
-# Exercise 0
-# **********
+# %% [markdown]
+# ## Exercise 0
#
# * List datasets and return a dataframe
datalist = openml.datasets.list_datasets()
@@ -28,23 +21,26 @@
openml_df = openml.datasets.list_datasets()
openml_df.head(n=10)
-############################################################################
-# Exercise 1
-# **********
+# %% [markdown]
+# ## Exercise 1
#
# * Find datasets with more than 10000 examples.
# * Find a dataset called 'eeg_eye_state'.
# * Find all datasets with more than 50 classes.
+
+# %%
datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
-""
+
+# %%
datalist.query('name == "eeg-eye-state"')
-""
+
+# %%
datalist.query("NumberOfClasses > 50")
-############################################################################
-# Download datasets
-# =================
+# %% [markdown]
+# ## Download datasets
+# %%
# This is done based on the dataset ID.
dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1)
@@ -56,24 +52,28 @@
print(f"URL: {dataset.url}")
print(dataset.description[:500])
-############################################################################
+# %% [markdown]
# Get the actual data.
#
# openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
# and also some additional metadata that we don't care about right now.
+
+# %%
eeg, *_ = dataset.get_data()
-############################################################################
+# %% [markdown]
# You can optionally choose to have openml separate out a column from the
# dataset. In particular, many datasets for supervised problems have a set
# `default_target_attribute` which may help identify the target variable.
+
+# %%
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
)
print(X.head())
print(X.info())
-############################################################################
+# %% [markdown]
# Sometimes you only need access to a dataset's metadata.
# In those cases, you can download the dataset without downloading the
# data file. The dataset object can be used as normal.
@@ -82,11 +82,15 @@
# Starting from 0.15, not downloading data will be the default behavior instead.
# The data will be downloading automatically when you try to access it through
# openml objects, e.g., using `dataset.features`.
-dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1, download_data=False)
-############################################################################
-# Exercise 2
-# **********
+
+# %%
+dataset = openml.datasets.get_dataset(1471)
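+
+# %% [markdown]
+# A small sketch of such metadata-only access (the attributes used below are regular
+# `OpenMLDataset` fields; exactly which metadata is available can vary per dataset):
+
+# %%
+print(dataset.name, dataset.default_target_attribute)
+print(f"{len(dataset.features)} features are described in the metadata")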
+
+# %% [markdown]
+# ## Exercise 2
# * Explore the data visually.
+
+# %%
eegs = eeg.sample(n=1000)
_ = pd.plotting.scatter_matrix(
X.iloc[:100, :4],
@@ -99,18 +103,21 @@
)
-############################################################################
-# Edit a created dataset
-# ======================
+# %% [markdown]
+# ## Edit a created dataset
# This example uses the test server, to avoid editing a dataset on the main server.
#
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
+# %% [markdown]
# Edit non-critical fields, allowed for all authorized users:
# description, creator, contributor, collection_date, language, citation,
# original_data_url, paper_url
+
+# %%
desc = (
"This data sets consists of 3 different types of irises' "
"(Setosa, Versicolour, and Virginica) petal and sepal length,"
@@ -129,29 +136,33 @@
print(f"Edited dataset ID: {data_id}")
-############################################################################
+# %% [markdown]
# Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed
# only for the dataset owner. Further, critical fields cannot be edited if the dataset has any
# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
# configure the API key:
# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
# This example here only shows a failure when trying to work on a dataset not owned by you:
+
+# %%
try:
data_id = edit_dataset(1, default_target_attribute="shape")
except openml.exceptions.OpenMLServerException as e:
print(e)
-############################################################################
-# Fork dataset
-# ============
+# %% [markdown]
+# ## Fork dataset
# Used to create a copy of the dataset with you as the owner.
# Use this API only if you are unable to edit the critical fields (default_target_attribute,
# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.
+# %%
data_id = fork_dataset(1)
print(data_id)
data_id = edit_dataset(data_id, default_target_attribute="shape")
print(f"Forked dataset ID: {data_id}")
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py
index 6c8a88ec8..21f36a194 100644
--- a/examples/30_extended/fetch_evaluations_tutorial.py
+++ b/examples/30_extended/fetch_evaluations_tutorial.py
@@ -1,38 +1,35 @@
-"""
-====================
-Fetching Evaluations
-====================
-
-Evaluations contain a concise summary of the results of all runs made. Each evaluation
-provides information on the dataset used, the flow applied, the setup used, the metric
-evaluated, and the result obtained on the metric, for each such run made. These collection
-of results can be used for efficient benchmarking of an algorithm and also allow transparent
-reuse of results from previous experiments on similar parameters.
-
-In this example, we shall do the following:
-
-* Retrieve evaluations based on different metrics
-* Fetch evaluations pertaining to a specific task
-* Sort the obtained results in descending order of the metric
-* Plot a cumulative distribution function for the evaluations
-* Compare the top 10 performing flows based on the evaluation performance
-* Retrieve evaluations with hyperparameter settings
-"""
-
-############################################################################
-
-# License: BSD 3-Clause
-
+# %% [markdown]
+# # Fetching Evaluations
+
+# Evaluations contain a concise summary of the results of all runs made. Each evaluation
+# provides information on the dataset used, the flow applied, the setup used, the metric
+# evaluated, and the result obtained on the metric, for each such run made. This collection
+# of results can be used for efficient benchmarking of an algorithm and also allows transparent
+# reuse of results from previous experiments on similar parameters.
+#
+# In this example, we shall do the following:
+#
+# * Retrieve evaluations based on different metrics
+# * Fetch evaluations pertaining to a specific task
+# * Sort the obtained results in descending order of the metric
+# * Plot a cumulative distribution function for the evaluations
+# * Compare the top 10 performing flows based on the evaluation performance
+# * Retrieve evaluations with hyperparameter settings
+
+# %%
import openml
-############################################################################
-# Listing evaluations
-# *******************
+# %% [markdown]
+# ## Listing evaluations
# Evaluations can be retrieved from the database in the chosen output format.
# Required filters can be applied to retrieve results from runs as required.
# We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
-openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)
+
+# %%
+openml.evaluations.list_evaluations(
+ function="predictive_accuracy", size=10
+)
# Using other evaluation metrics, 'precision' in this case
evals = openml.evaluations.list_evaluations(
@@ -42,23 +39,23 @@
# Querying the returned results for precision above 0.98
print(evals[evals.value > 0.98])
-#############################################################################
-# Viewing a sample task
-# =====================
+# %% [markdown]
+# ## Viewing a sample task
# Over here we shall briefly take a look at the details of the task.
-
# We will start by displaying a simple *supervised classification* task:
+
+# %%
task_id = 167140 # https://www.openml.org/t/167140
task = openml.tasks.get_task(task_id)
print(task)
-#############################################################################
-# Obtaining all the evaluations for the task
-# ==========================================
+# %% [markdown]
+# ## Obtaining all the evaluations for the task
# We'll now obtain all the evaluations that were uploaded for the task
# we displayed previously.
# Note that we now filter the evaluations based on another parameter 'task'.
+# %%
metric = "predictive_accuracy"
evals = openml.evaluations.list_evaluations(
function=metric, tasks=[task_id], output_format="dataframe"
@@ -70,13 +67,13 @@
print("\nDisplaying head of sorted dataframe: ")
print(evals.head())
-#############################################################################
-# Obtaining CDF of metric for chosen task
-# ***************************************
+# %% [markdown]
+# ## Obtaining CDF of metric for chosen task
# We shall now analyse how the performance of various flows have been on this task,
# by seeing the likelihood of the accuracy obtained across all runs.
# We shall now plot a cumulative distributive function (CDF) for the accuracies obtained.
+# %%
from matplotlib import pyplot as plt
@@ -97,16 +94,18 @@ def plot_cdf(values, metric="predictive_accuracy"):
plot_cdf(evals.value, metric)
+
+# %% [markdown]
# This CDF plot shows that for the given task, based on the results of the
# runs uploaded, it is almost certain to achieve an accuracy above 52%, i.e.,
# with non-zero probability. While the maximum accuracy seen till now is 96.5%.
-#############################################################################
-# Comparing top 10 performing flows
-# *********************************
+# %% [markdown]
+# ## Comparing top 10 performing flows
# Let us now try to see which flows generally performed the best for this task.
# For this, we shall compare the top performing flows.
+# %%
import numpy as np
import pandas as pd
@@ -139,6 +138,8 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
plot_flow_compare(evals, metric=metric, top_n=10)
+
+# %% [markdown]
# The boxplots below show how the flows perform across multiple runs on the chosen
# task. The green horizontal lines represent the median accuracy of all the runs for
# that flow (number of runs denoted at the bottom of the boxplots). The higher the
@@ -146,19 +147,22 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
# are in the descending order of the higest accuracy value seen under that flow.
# Printing the corresponding flow names for the top 10 performing flow IDs
+
+# %%
top_n = 10
flow_ids = evals.flow_id.unique()[:top_n]
flow_names = evals.flow_name.unique()[:top_n]
for i in range(top_n):
print((flow_ids[i], flow_names[i]))
-#############################################################################
-# Obtaining evaluations with hyperparameter settings
-# ==================================================
+# %% [markdown]
+# ## Obtaining evaluations with hyperparameter settings
# We'll now obtain the evaluations of a task and a flow with the hyperparameters
# List evaluations in descending order based on predictive_accuracy with
# hyperparameters
+
+# %%
evals_setups = openml.evaluations.list_evaluations_setups(
function="predictive_accuracy",
tasks=[31],
@@ -166,18 +170,18 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
sort_order="desc",
)
-""
print(evals_setups.head())
-""
+# %% [markdown]
# Return evaluations for flow_id in descending order based on predictive_accuracy
# with hyperparameters. parameters_in_separate_columns returns parameters in
# separate columns
+
+# %%
evals_setups = openml.evaluations.list_evaluations_setups(
function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
)
-""
print(evals_setups.head(10))
-""
+# License: BSD 3-Clause
diff --git a/examples/30_extended/fetch_runtimes_tutorial.py b/examples/30_extended/fetch_runtimes_tutorial.py
index 8adf37d31..b2a3f1d2a 100644
--- a/examples/30_extended/fetch_runtimes_tutorial.py
+++ b/examples/30_extended/fetch_runtimes_tutorial.py
@@ -1,51 +1,43 @@
-"""
-
-==========================================
-Measuring runtimes for Scikit-learn models
-==========================================
-
-The runtime of machine learning models on specific datasets can be a deciding
-factor on the choice of algorithms, especially for benchmarking and comparison
-purposes. OpenML's scikit-learn extension provides runtime data from runs of
-model fit and prediction on tasks or datasets, for both the CPU-clock as well
-as the actual wallclock-time incurred. The objective of this example is to
-illustrate how to retrieve such timing measures, and also offer some potential
-means of usage and interpretation of the same.
-
-It should be noted that there are multiple levels at which parallelism can occur.
-
-* At the outermost level, OpenML tasks contain fixed data splits, on which the
- defined model/flow is executed. Thus, a model can be fit on each OpenML dataset fold
- in parallel using the `n_jobs` parameter to `run_model_on_task` or `run_flow_on_task`
- (illustrated under Case 2 & 3 below).
-
-* The model/flow specified can also include scikit-learn models that perform their own
- parallelization. For instance, by specifying `n_jobs` in a Random Forest model definition
- (covered under Case 2 below).
-
-* The sklearn model can further be an HPO estimator and contain it's own parallelization.
- If the base estimator used also supports `parallelization`, then there's at least a 2-level nested
- definition for parallelization possible (covered under Case 3 below).
-
-We shall cover these 5 representative scenarios for:
-
-* (Case 1) Retrieving runtimes for Random Forest training and prediction on each of the
- cross-validation folds
-
-* (Case 2) Testing the above setting in a parallel setup and monitor the difference using
- runtimes retrieved
-
-* (Case 3) Comparing RandomSearchCV and GridSearchCV on the above task based on runtimes
-
-* (Case 4) Running models that don't run in parallel or models which scikit-learn doesn't
- parallelize
-
-* (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL)
-"""
-
-############################################################################
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Measuring runtimes for Scikit-learn models
+#
+# The runtime of machine learning models on specific datasets can be a deciding
+# factor on the choice of algorithms, especially for benchmarking and comparison
+# purposes. OpenML's scikit-learn extension provides runtime data from runs of
+# model fit and prediction on tasks or datasets, for both the CPU-clock as well
+# as the actual wallclock-time incurred. The objective of this example is to
+# illustrate how to retrieve such timing measures, and also offer some potential
+# means of usage and interpretation of the same.
+#
+# It should be noted that there are multiple levels at which parallelism can occur.
+#
+# * At the outermost level, OpenML tasks contain fixed data splits, on which the
+# defined model/flow is executed. Thus, a model can be fit on each OpenML dataset fold
+# in parallel using the `n_jobs` parameter to `run_model_on_task` or `run_flow_on_task`
+# (illustrated under Case 2 & 3 below).
+#
+# * The model/flow specified can also include scikit-learn models that perform their own
+# parallelization. For instance, by specifying `n_jobs` in a Random Forest model definition
+# (covered under Case 2 below).
+#
+# * The sklearn model can further be an HPO estimator and contain its own parallelization.
+# If the base estimator used also supports parallelization, then there are at least two nested
+# levels of parallelization possible (covered under Case 3 below).
+#
+# We shall cover the following 5 representative scenarios:
+#
+# * (Case 1) Retrieving runtimes for Random Forest training and prediction on each of the
+# cross-validation folds
+#
+# * (Case 2) Testing the above setting in a parallel setup and monitoring the difference using
+# runtimes retrieved
+#
+# * (Case 3) Comparing RandomSearchCV and GridSearchCV on the above task based on runtimes
+#
+# * (Case 4) Running models that don't run in parallel or models which scikit-learn doesn't
+# parallelize
+#
+# * (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL)
import openml
import numpy as np
@@ -59,10 +51,10 @@
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-############################################################################
-# Preparing tasks and scikit-learn models
-# ***************************************
+# %% [markdown]
+# ## Preparing tasks and scikit-learn models
+# %%
task_id = 167119
task = openml.tasks.get_task(task_id)
@@ -91,13 +83,13 @@ def print_compare_runtimes(measures):
)
-############################################################################
-# Case 1: Running a Random Forest model on an OpenML task
-# *******************************************************
+# %% [markdown]
+# ## Case 1: Running a Random Forest model on an OpenML task
# We'll run a Random Forest model and obtain an OpenML run object. We can
# see the evaluations recorded per fold for the dataset and the information
# available for this run.
+# %%
clf = RandomForestClassifier(n_estimators=10)
run1 = openml.runs.run_model_on_task(
@@ -122,7 +114,7 @@ def print_compare_runtimes(measures):
print(f"Repeat #{repeat}-Fold #{fold}: {val2:.4f}")
print()
-################################################################################
+# %% [markdown]
# The remaining entries recorded in `measures` are the runtime records
# related as:
#
@@ -138,13 +130,15 @@ def print_compare_runtimes(measures):
# follows the same procedure but for time taken for the `.predict()` procedure.
# Comparing the CPU and wall-clock training times of the Random Forest model
+
+# %%
print_compare_runtimes(measures)
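+
+# %% [markdown]
+# As a small sketch of the nested structure (each measure maps repeat -> fold -> value, the
+# same layout used later in this example), a single entry can also be read out directly:
+
+# %%
+print(measures["wall_clock_time_millis_training"][0][0])  # repeat 0, fold 0, in milliseconds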
-######################################################################
-# Case 2: Running Scikit-learn model on an OpenML task in parallel
-# ****************************************************************
+# %% [markdown]
+# ## Case 2: Running Scikit-learn model on an OpenML task in parallel
# Redefining the model to allow parallelism with `n_jobs=2` (2 cores)
+# %%
clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
run2 = openml.runs.run_model_on_task(
@@ -154,9 +148,10 @@ def print_compare_runtimes(measures):
# The wall-clock time recorded per fold should be lesser than Case 1 above
print_compare_runtimes(measures)
-####################################################################################
+# %% [markdown]
# Running a Random Forest model on an OpenML task in parallel (all cores available):
+# %%
# Redefining the model to use all available cores with `n_jobs=-1`
clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
@@ -164,24 +159,27 @@ def print_compare_runtimes(measures):
model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False
)
measures = run3.fold_evaluations
+
+# %% [markdown]
# The wall-clock time recorded per fold should be lesser than the case above,
# if more than 2 CPU cores are available. The speed-up is more pronounced for
# larger datasets.
print_compare_runtimes(measures)
-####################################################################################
+# %% [markdown]
# We can now observe that the ratio of CPU time to wallclock time is lower
# than in case 1. This happens because joblib by default spawns subprocesses
# for the workloads for which CPU time cannot be tracked. Therefore, interpreting
# the reported CPU and wallclock time requires knowledge of the parallelization
# applied at runtime.
-####################################################################################
+# %% [markdown]
# Running the same task with a different parallel backend. Joblib provides multiple
# backends: {`loky` (default), `multiprocessing`, `dask`, `threading`, `sequential`}.
# The backend can be explicitly set using a joblib context manager. The behaviour of
# the job distribution can change and therefore the scale of runtimes recorded too.
+# %%
with parallel_backend(backend="multiprocessing", n_jobs=-1):
run3_ = openml.runs.run_model_on_task(
model=clf, task=task, upload_flow=False, avoid_duplicate_runs=False
@@ -189,7 +187,7 @@ def print_compare_runtimes(measures):
measures = run3_.fold_evaluations
print_compare_runtimes(measures)
-####################################################################################
+# %% [markdown]
# The CPU time interpretation becomes ambiguous when jobs are distributed over an
# unknown number of cores or when subprocesses are spawned for which the CPU time
# cannot be tracked, as in the examples above. It is impossible for OpenML-Python
@@ -198,9 +196,8 @@ def print_compare_runtimes(measures):
# cases that can arise as demonstrated in the rest of the example. Therefore,
# the final interpretation of the runtimes is left to the `user`.
-#####################################################################
-# Case 3: Running and benchmarking HPO algorithms with their runtimes
-# *******************************************************************
+# %% [markdown]
+# ## Case 3: Running and benchmarking HPO algorithms with their runtimes
# We shall now optimize a similar RandomForest model for the same task using
# scikit-learn's HPO support by using GridSearchCV to optimize our earlier
# RandomForest model's hyperparameter `n_estimators`. Scikit-learn also provides a
@@ -208,9 +205,9 @@ def print_compare_runtimes(measures):
# and evaluating the model on the best found parameter setting. This is
# included in the `wall_clock_time_millis_training` measure recorded.
+# %%
from sklearn.model_selection import GridSearchCV
-
clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
# GridSearchCV model
@@ -228,7 +225,7 @@ def print_compare_runtimes(measures):
measures = run4.fold_evaluations
print_compare_runtimes(measures)
-##################################################################################
+# %% [markdown]
# Like any optimisation problem, scikit-learn's HPO estimators also generate
# a sequence of configurations which are evaluated, using which the best found
# configuration is tracked throughout the trace.
@@ -241,17 +238,19 @@ def print_compare_runtimes(measures):
# is for the entire `fit()` procedure of GridSearchCV thus subsuming the runtimes of
# the 2-fold (inner) CV search performed.
+# %%
# We earlier extracted the number of repeats and folds for this task:
print(f"# repeats: {n_repeats}\n# folds: {n_folds}")
# To extract the training runtime of the first repeat, first fold:
print(run4.fold_evaluations["wall_clock_time_millis_training"][0][0])
-##################################################################################
+# %% [markdown]
# To extract the training runtime of the 1-st repeat, 4-th (outer) fold and also
# to fetch the parameters and performance of the evaluations made during
# the 1-st repeat, 4-th fold evaluation by the Grid Search model.
+# %%
_repeat = 0
_fold = 3
print(
@@ -268,7 +267,7 @@ def print_compare_runtimes(measures):
)
)
-##################################################################################
+# %% [markdown]
# Scikit-learn's HPO estimators also come with an argument `refit=True` as a default.
# In our previous model definition it was set to True by default, which meant that the best
# found hyperparameter configuration was used to refit or retrain the model without any inner
@@ -283,6 +282,8 @@ def print_compare_runtimes(measures):
# This refit time can therefore be explicitly extracted in this manner:
+# %%
+
def extract_refit_time(run, repeat, fold):
refit_time = (
run.fold_evaluations["wall_clock_time_millis"][repeat][fold]
@@ -300,12 +301,13 @@ def extract_refit_time(run, repeat, fold):
)
)
-############################################################################
+# %% [markdown]
# Along with the GridSearchCV already used above, we demonstrate how such
# optimisation traces can be retrieved by showing an application of these
# traces - comparing the speed of finding the best configuration using
# RandomizedSearchCV and GridSearchCV available with scikit-learn.
+# %%
# RandomizedSearchCV model
rs_pipe = RandomizedSearchCV(
estimator=clf,
@@ -320,7 +322,7 @@ def extract_refit_time(run, repeat, fold):
model=rs_pipe, task=task, upload_flow=False, avoid_duplicate_runs=False, n_jobs=2
)
-################################################################################
+# %% [markdown]
# Since for the call to ``openml.runs.run_model_on_task`` the parameter
# ``n_jobs`` is set to its default ``None``, the evaluations across the OpenML folds
# are not parallelized. Hence, the time recorded is agnostic to the ``n_jobs``
@@ -334,6 +336,7 @@ def extract_refit_time(run, repeat, fold):
# the runtimes per fold can be cumulatively added to plot the trace against time.
+# %%
def extract_trace_data(run, n_repeats, n_folds, n_iter, key=None):
key = "wall_clock_time_millis_training" if key is None else key
data = {"score": [], "runtime": []}
@@ -376,9 +379,8 @@ def get_incumbent_trace(trace):
plt.legend()
plt.show()
-################################################################################
-# Case 4: Running models that scikit-learn doesn't parallelize
-# *************************************************************
+# %% [markdown]
+# ## Case 4: Running models that scikit-learn doesn't parallelize
# Both scikit-learn and OpenML depend on parallelism implemented through `joblib`.
# However, there can be cases where either models cannot be parallelized or don't
# depend on joblib for its parallelism. 2 such cases are illustrated below.
@@ -386,6 +388,7 @@ def get_incumbent_trace(trace):
# Running a Decision Tree model that doesn't support parallelism implicitly, but
# using OpenML to parallelize evaluations for the outer-cross validation folds.
+# %%
dt = DecisionTreeClassifier()
run6 = openml.runs.run_model_on_task(
@@ -394,11 +397,12 @@ def get_incumbent_trace(trace):
measures = run6.fold_evaluations
print_compare_runtimes(measures)
-################################################################################
+# %% [markdown]
# Although the decision tree does not run in parallel, it can release the
# `Python GIL `_.
# This can result in surprising runtime measures as demonstrated below:
+# %%
with parallel_backend("threading", n_jobs=-1):
run7 = openml.runs.run_model_on_task(
model=dt, task=task, upload_flow=False, avoid_duplicate_runs=False
@@ -406,11 +410,12 @@ def get_incumbent_trace(trace):
measures = run7.fold_evaluations
print_compare_runtimes(measures)
-################################################################################
+# %% [markdown]
# Running a Neural Network from scikit-learn that uses scikit-learn independent
-# parallelism using libraries such as `MKL, OpenBLAS or BLIS
-# `_.
+# parallelism using libraries such as
+# [MKL, OpenBLAS or BLIS](https://scikit-learn.org/stable/computing/parallelism.html#parallel-numpy-and-scipy-routines-from-numerical-libraries).
+# %%
mlp = MLPClassifier(max_iter=10)
run8 = openml.runs.run_model_on_task(
@@ -419,15 +424,15 @@ def get_incumbent_trace(trace):
measures = run8.fold_evaluations
print_compare_runtimes(measures)
-################################################################################
-# Case 5: Running Scikit-learn models that don't release GIL
-# **********************************************************
-# Certain Scikit-learn models do not release the `Python GIL
-# `_ and
+# %% [markdown]
+# ## Case 5: Running Scikit-learn models that don't release GIL
+# Certain Scikit-learn models do not release the
+# [Python GIL](https://docs.python.org/dev/glossary.html#term-global-interpreter-lock) and
# are also not executed in parallel via a BLAS library. In such cases, the
# CPU times and wallclock times are most likely trustworthy. Note however
# that only very few models such as naive Bayes models are of this kind.
+# %%
clf = GaussianNB()
with parallel_backend("multiprocessing", n_jobs=-1):
@@ -437,9 +442,8 @@ def get_incumbent_trace(trace):
measures = run9.fold_evaluations
print_compare_runtimes(measures)
-################################################################################
-# Summmary
-# *********
+# %% [markdown]
+# ## Summary
# The scikit-learn extension for OpenML-Python records model runtimes for the
# CPU-clock and the wall-clock times. The above examples illustrated how these
# recorded runtimes can be extracted when using a scikit-learn model and under
@@ -484,3 +488,4 @@ def get_incumbent_trace(trace):
#
# Because of all the cases mentioned above it is crucial to understand which case is triggered
# when reporting runtimes for scikit-learn models measured with OpenML-Python!
+# License: BSD 3-Clause
diff --git a/examples/30_extended/flow_id_tutorial.py b/examples/30_extended/flow_id_tutorial.py
index 137f8d14e..e813655fc 100644
--- a/examples/30_extended/flow_id_tutorial.py
+++ b/examples/30_extended/flow_id_tutorial.py
@@ -1,41 +1,36 @@
-"""
-==================
-Obtaining Flow IDs
-==================
+# %% [markdown]
+# # Obtaining Flow IDs
+# This tutorial discusses different ways to obtain the ID of a flow in order to perform further
+# analysis.
-This tutorial discusses different ways to obtain the ID of a flow in order to perform further
-analysis.
-"""
-
-####################################################################################################
-
-# License: BSD 3-Clause
+# %%
import sklearn.tree
import openml
-############################################################################
+# %% [markdown]
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
-openml.config.start_using_configuration_for_example()
+# %%
+openml.config.start_using_configuration_for_example()
-############################################################################
+# %%
# Defining a classifier
clf = sklearn.tree.DecisionTreeClassifier()
-####################################################################################################
-# 1. Obtaining a flow given a classifier
-# ======================================
-#
+# %% [markdown]
+# ## 1. Obtaining a flow given a classifier
+# %%
flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
flow_id = flow.flow_id
print(flow_id)
-####################################################################################################
+# %% [markdown]
# This piece of code is rather involved. First, it retrieves a
# :class:`~openml.extensions.Extension` which is registered and can handle the given model,
# in our case it is :class:`openml.extensions.sklearn.SklearnExtension`. Second, the extension
@@ -46,38 +41,46 @@
#
# To simplify the usage we have created a helper function which automates all these steps:
+# %%
flow_id = openml.flows.get_flow_id(model=clf)
print(flow_id)
-####################################################################################################
-# 2. Obtaining a flow given its name
-# ==================================
-# The schema of a flow is given in XSD (`here
-# `_). # noqa E501
+# %% [markdown]
+# ## 2. Obtaining a flow given its name
+# The schema of a flow is given in XSD (
+# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)). # noqa E501
# Only two fields are required, a unique name, and an external version. While it should be pretty
# obvious why we need a name, the need for the additional external version information might not
# be immediately clear. However, this information is very important as it allows to have multiple
# flows with the same name for different versions of a software. This might be necessary if an
# algorithm or implementation introduces, renames or drop hyperparameters over time.
+# %%
print(flow.name, flow.external_version)
-####################################################################################################
+# %% [markdown]
# The name and external version are automatically added to a flow when constructing it from a
# model. We can then use them to retrieve the flow id as follows:
+# %%
flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version)
print(flow_id)
-####################################################################################################
+# %% [markdown]
# We can also retrieve all flows for a given name:
+
+# %%
flow_ids = openml.flows.get_flow_id(name=flow.name)
print(flow_ids)
-####################################################################################################
+# %% [markdown]
# This also works with the actual model (generalizing the first part of this example):
+
+# %%
flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
print(flow_ids)
-# Deactivating test server
+# %%
+# Deactivating test configuration
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
index afd398feb..2d1bcb864 100644
--- a/examples/30_extended/flows_and_runs_tutorial.py
+++ b/examples/30_extended/flows_and_runs_tutorial.py
@@ -1,29 +1,28 @@
-"""
-Flows and Runs
-==============
+# %% [markdown]
+# # Flows and Runs
+# This tutorial covers how to train/run a model and how to upload the results.
-How to train/run a model and how to upload the results.
-"""
-
-# License: BSD 3-Clause
-
-from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
+# %%
+from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
import openml
-############################################################################
+# %% [markdown]
# We'll use the test server for the rest of this tutorial.
#
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
-# Train machine learning models
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Train machine learning models
#
# Train a scikit-learn model on the data manually.
+# %%
# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
dataset = openml.datasets.get_dataset(dataset_id="eeg-eye-state", version=1)
X, y, categorical_indicator, attribute_names = dataset.get_data(
@@ -32,11 +31,13 @@
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
-############################################################################
+# %% [markdown]
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
-dataset = openml.datasets.get_dataset(dataset_id="credit-g", version=1)
+
+# %%
+dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute
)
@@ -47,11 +48,11 @@
X = transformer.fit_transform(X)
clf.fit(X, y)
-############################################################################
-# Runs: Easily explore models
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Runs: Easily explore models
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.
+# %%
# Get a task
task = openml.tasks.get_task(403)
@@ -63,31 +64,34 @@
print(run)
-############################################################################
+# %% [markdown]
# Share the run on the OpenML server
#
# So far the run is only available locally. By calling the publish function,
# the run is sent to the OpenML server:
+# %%
myrun = run.publish()
# For this tutorial, our configuration publishes to the test server
# as to not pollute the main server.
print(f"Uploaded to {myrun.openml_url}")
-############################################################################
+# %% [markdown]
# We can now also inspect the flow object which was automatically created:
+# %%
flow = openml.flows.get_flow(run.flow_id)
print(flow)
-############################################################################
-# It also works with pipelines
-# ############################
+# %% [markdown]
+# ## It also works with pipelines
#
# When you need to handle 'dirty' data, build pipelines to model then automatically.
# To demonstrate this using the dataset `credit-a `_ via
# `task `_ as it contains both numerical and categorical
# variables and missing values in both.
+
+# %%
task = openml.tasks.get_task(96)
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
@@ -121,10 +125,12 @@
print(f"Uploaded to {myrun.openml_url}")
+# %% [markdown]
# The above pipeline works with the helper functions that internally deal with pandas DataFrame.
# In the case, pandas is not available, or a NumPy based data processing is the requirement, the
# above pipeline is presented below to work with NumPy.
+# %%
# Extracting the indices of the categorical columns
features = task.get_dataset().features
categorical_feature_indices = []
@@ -164,14 +170,15 @@
myrun = run.publish()
print(f"Uploaded to {myrun.openml_url}")
-###############################################################################
-# Running flows on tasks offline for later upload
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Running flows on tasks offline for later upload
# For those scenarios where there is no access to internet, it is possible to run
# a model on a task without uploading results or flows to the server immediately.
# To perform the following line offline, it is required to have been called before
# such that the task is cached on the local openml cache directory:
+
+# %%
task = openml.tasks.get_task(96)
# The following lines can then be executed offline:
@@ -192,9 +199,10 @@
# Publishing the run will automatically upload the related flow if
# it does not yet exist on the server.
-############################################################################
+# %% [markdown]
# Alternatively, one can also directly run flows.
+# %%
# Get a task
task = openml.tasks.get_task(403)
@@ -208,9 +216,8 @@
run = openml.runs.run_flow_on_task(flow, task)
-############################################################################
-# Challenge
-# ^^^^^^^^^
+# %% [markdown]
+# ## Challenge
#
# Try to build the best possible models on several OpenML tasks,
# compare your results with the rest of the class and learn from
@@ -227,6 +234,7 @@
# * Higgs (Physics): data_id:`23512 `_,
# task_id:`52950 `_, 100k instances, missing values.
+# %%
# Easy benchmarking:
for task_id in [115]: # Add further tasks. Disclaimer: they might take some time
task = openml.tasks.get_task(task_id)
@@ -238,5 +246,6 @@
print(f"kNN on {data.name}: {myrun.openml_url}")
-############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
index 491507d16..faced588b 100644
--- a/examples/30_extended/plot_svm_hyperparameters_tutorial.py
+++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
@@ -1,24 +1,20 @@
-"""
-================================
-Plotting hyperparameter surfaces
-================================
-"""
-
-# License: BSD 3-Clause
-
-import numpy as np
+# %% [markdown]
+# # Plotting hyperparameter surfaces
+# %%
import openml
+import numpy as np
-####################################################################################################
-# First step - obtaining the data
-# ===============================
+# %% [markdown]
+# ## First step - obtaining the data
# First, we need to choose an SVM flow, for example 8353, and a task. Finding the IDs of them are
# not part of this tutorial, this could for example be done via the website.
#
# For this we use the function ``list_evaluations_setup`` which can automatically join
# evaluations conducted by the server with the hyperparameter settings extracted from the
# uploaded runs (called *setup*).
+
+# %%
df = openml.evaluations.list_evaluations_setups(
function="predictive_accuracy",
flows=[8353],
@@ -29,21 +25,25 @@
)
print(df.head(n=10))
-####################################################################################################
+# %% [markdown]
# We can see all the hyperparameter names in the columns of the dataframe:
+
+# %%
for name in df.columns:
print(name)
-####################################################################################################
+# %% [markdown]
# Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we
# can nicely plot them.
+
+# %%
hyperparameters = ["sklearn.svm.classes.SVC(16)_C", "sklearn.svm.classes.SVC(16)_gamma"]
df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10)
-####################################################################################################
-# Option 1 - plotting via the pandas helper functions
-# ===================================================
-#
+# %% [markdown]
+# ## Option 1 - plotting via the pandas helper functions
+
+# %%
df.plot.hexbin(
x="sklearn.svm.classes.SVC(16)_C",
y="sklearn.svm.classes.SVC(16)_gamma",
@@ -53,10 +53,10 @@
title="SVM performance landscape",
)
-####################################################################################################
-# Option 2 - plotting via matplotlib
-# ==================================
-#
+# %% [markdown]
+# ## Option 2 - plotting via matplotlib
+
+# %%
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
@@ -79,3 +79,4 @@
ylabel="gamma (log10)",
)
ax.set_title("SVM performance landscape")
+# License: BSD 3-Clause
diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py
index 477e49fa6..55d25d291 100644
--- a/examples/30_extended/run_setup_tutorial.py
+++ b/examples/30_extended/run_setup_tutorial.py
@@ -1,32 +1,26 @@
-"""
-=========
-Run Setup
-=========
-
-By: Jan N. van Rijn
-
-One of the key features of the openml-python library is that is allows to
-reinstantiate flows with hyperparameter settings that were uploaded before.
-This tutorial uses the concept of setups. Although setups are not extensively
-described in the OpenML documentation (because most users will not directly
-use them), they form a important concept within OpenML distinguishing between
-hyperparameter configurations.
-A setup is the combination of a flow with all its hyperparameters set.
-
-A key requirement for reinstantiating a flow is to have the same scikit-learn
-version as the flow that was uploaded. However, this tutorial will upload the
-flow (that will later be reinstantiated) itself, so it can be ran with any
-scikit-learn version that is supported by this library. In this case, the
-requirement of the corresponding scikit-learn versions is automatically met.
-
-In this tutorial we will
- 1) Create a flow and use it to solve a task;
- 2) Download the flow, reinstantiate the model with same hyperparameters,
- and solve the same task again;
- 3) We will verify that the obtained results are exactly the same.
-"""
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Run Setup
+# One of the key features of the openml-python library is that it allows you to
+# reinstantiate flows with hyperparameter settings that were uploaded before.
+# This tutorial uses the concept of setups. Although setups are not extensively
+# described in the OpenML documentation (because most users will not directly
+# use them), they form an important concept within OpenML distinguishing between
+# hyperparameter configurations.
+# A setup is the combination of a flow with all its hyperparameters set.
+#
+# A key requirement for reinstantiating a flow is to have the same scikit-learn
+# version as the flow that was uploaded. However, this tutorial will upload the
+# flow (that will later be reinstantiated) itself, so it can be run with any
+# scikit-learn version that is supported by this library. In this case, the
+# requirement of the corresponding scikit-learn versions is automatically met.
+#
+# In this tutorial we will:
+#
+# 1. Create a flow and use it to solve a task;
+# 2. Download the flow, reinstantiate the model with the same hyperparameters,
+#    and solve the same task again;
+# 3. Verify that the obtained results are exactly the same.
+
+# %%
import numpy as np
import openml
@@ -39,24 +33,28 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
-############################################################################
+# %% [markdown]
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+
+# %%
openml.config.start_using_configuration_for_example()
-###############################################################################
+# %% [markdown]
-# 1) Create a flow and use it to solve a task
+# ## 1) Create a flow and use it to solve a task
-###############################################################################
-# first, let's download the task that we are interested in
-task = openml.tasks.get_task(6)
+# First, let's download the task that we are interested in
+# %%
+task = openml.tasks.get_task(6)
+# %% [markdown]
# we will create a fairly complex model, with many preprocessing components and
# many potential hyperparameters. Of course, the model can be as complex and as
# easy as you want it to be
+# %%
cat_imp = make_pipeline(
OneHotEncoder(handle_unknown="ignore"),
TruncatedSVD(),
@@ -70,10 +68,13 @@
]
)
+# %% [markdown]
# Let's change some hyperparameters. Of course, in any good application we
# would tune them using, e.g., Random Search or Bayesian Optimization, but for
# the purpose of this tutorial we set them to some specific values that might
# or might not be optimal
+
+# %%
hyperparameters_original = {
"estimator__criterion": "gini",
"estimator__n_estimators": 50,
@@ -86,10 +87,10 @@
run = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)
run_original = run.publish() # this implicitly uploads the flow
-###############################################################################
-# 2) Download the flow and solve the same task again.
-###############################################################################
+# %% [markdown]
+# ## 2) Download the flow and solve the same task again.
+# %%
# obtain setup id (note that the setup id is assigned by the OpenML server -
# therefore it was not yet available in our local copy of the run)
run_downloaded = openml.runs.get_run(run_original.run_id)
@@ -103,13 +104,16 @@
run_duplicate = openml.runs.run_model_on_task(model_duplicate, task, avoid_duplicate_runs=False)
-###############################################################################
-# 3) We will verify that the obtained results are exactly the same.
-###############################################################################
+# %% [markdown]
+# ## 3) We will verify that the obtained results are exactly the same.
+# %%
# the run has stored all predictions in the field data content
np.testing.assert_array_equal(run_original.data_content, run_duplicate.data_content)
-###############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+
+# By: Jan N. van Rijn
+# License: BSD 3-Clause
diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py
index c0874b944..416e543bb 100644
--- a/examples/30_extended/study_tutorial.py
+++ b/examples/30_extended/study_tutorial.py
@@ -1,50 +1,58 @@
-"""
-=================
-Benchmark studies
-=================
-How to list, download and upload benchmark studies.
-In contrast to `benchmark suites `_ which
-hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and
-tasks, all required information about a study can be retrieved.
-"""
-############################################################################
-
-# License: BSD 3-Clause
-
+# %% [markdown]
+# # Benchmark studies
+# How to list, download and upload benchmark studies.
+# In contrast to
+# [benchmark suites](https://docs.openml.org/benchmark/#benchmarking-suites) which
+# hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and
+# tasks, all required information about a study can be retrieved.
+
+# %%
import uuid
from sklearn.ensemble import RandomForestClassifier
import openml
-############################################################################
-# Listing studies
-# ***************
-studies = openml.study.list_studies(status="all")
+# %% [markdown]
+# ## Listing studies
+#
+# * Use the output_format parameter to select output type
+# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
+# easier-to-work-with data structure
+
+# %%
+studies = openml.study.list_studies(output_format="dataframe", status="all")
print(studies.head(n=10))
-############################################################################
-# Downloading studies
-# ===================
+# %% [markdown]
+# ## Downloading studies
-############################################################################
+# %% [markdown]
# This is done based on the study ID.
+
+# %%
study = openml.study.get_study(123)
print(study)
-############################################################################
+# %% [markdown]
# Studies also features a description:
+
+# %%
print(study.description)
-############################################################################
+# %% [markdown]
# Studies are a container for runs:
+
+# %%
print(study.runs)
-############################################################################
+# %% [markdown]
# And we can use the evaluation listing functionality to learn more about
# the evaluations available for the conducted runs:
+
+# %%
evaluations = openml.evaluations.list_evaluations(
function="predictive_accuracy",
study=study.study_id,
@@ -52,21 +60,23 @@
)
print(evaluations.head())
-############################################################################
+# %% [markdown]
# We'll use the test server for the rest of this tutorial.
#
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
-# Uploading studies
-# =================
+# %% [markdown]
+# ## Uploading studies
#
# Creating a study is as simple as creating any kind of other OpenML entity.
# In this examples we'll create a few runs for the OpenML-100 benchmark
# suite which is available on the OpenML test server.
+# %%
# Model to be used
clf = RandomForestClassifier()
@@ -100,5 +110,6 @@
print(new_study)
-############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py
index 19f5cdc1a..a92c1cdb5 100644
--- a/examples/30_extended/suites_tutorial.py
+++ b/examples/30_extended/suites_tutorial.py
@@ -1,69 +1,79 @@
-"""
-================
-Benchmark suites
-================
-
-How to list, download and upload benchmark suites.
-
-If you want to learn more about benchmark suites, check out our
-brief introductory tutorial :ref:`sphx_glr_examples_20_basic_simple_suites_tutorial.py` or the
-`OpenML benchmark docs `_.
-"""
-############################################################################
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Benchmark suites
+#
+# How to list, download and upload benchmark suites.
+#
+# If you want to learn more about benchmark suites, check out our
+# brief introductory tutorial ["Simple suites tutorial"](../20_basic/simple_suites_tutorial) or the
+# [OpenML benchmark docs](https://docs.openml.org/benchmark/#benchmarking-suites).
+# %%
import uuid
import numpy as np
import openml
-############################################################################
-# Listing suites
-# **************
-suites = openml.study.list_suites(status="all")
+# %% [markdown]
+# ## Listing suites
+#
+# * Use the output_format parameter to select output type
+# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
+# easier-to-work-with data structure
+
+# %%
+suites = openml.study.list_suites(output_format="dataframe", status="all")
print(suites.head(n=10))
-############################################################################
-# Downloading suites
-# ==================
+# %% [markdown]
+# ## Downloading suites
-############################################################################
+# %% [markdown]
# This is done based on the dataset ID.
-# https://www.openml.org/api/v1/study/99
-suite = openml.study.get_suite("OpenML-CC18")
+
+# %%
+suite = openml.study.get_suite(99)
print(suite)
-############################################################################
+# %% [markdown]
# Suites also feature a description:
+
+# %%
print(suite.description)
-############################################################################
+# %% [markdown]
# Suites are a container for tasks:
+
+# %%
print(suite.tasks)
-############################################################################
+# %% [markdown]
# And we can use the task listing functionality to learn more about them:
-tasks = openml.tasks.list_tasks()
-# Using ``@`` in `pd.DataFrame.query <
-# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
+# %%
+tasks = openml.tasks.list_tasks(output_format="dataframe")
+
+# %% [markdown]
+# Using ``@`` in
+# [pd.DataFrame.query](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html)
# accesses variables outside of the current dataframe.
+
+# %%
tasks = tasks.query("tid in @suite.tasks")
print(tasks.describe().transpose())
-############################################################################
+# %% [markdown]
# We'll use the test server for the rest of this tutorial.
#
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
-# Uploading suites
-# ================
+# %% [markdown]
+# ## Uploading suites
#
# Uploading suites is as simple as uploading any kind of other OpenML
# entity - the only reason why we need so much code in this example is
@@ -71,7 +81,9 @@
# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
-all_tasks = list(openml.tasks.list_tasks()["tid"])
+
+# %%
+all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
# The study needs a machine-readable and unique alias. To obtain this,
@@ -88,6 +100,6 @@
new_suite.publish()
print(new_suite)
-
-############################################################################
+# %%
openml.config.stop_using_configuration_for_example()
+# License: BSD 3-Clause
diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py
index dda40de50..8b35633a2 100644
--- a/examples/30_extended/task_manual_iteration_tutorial.py
+++ b/examples/30_extended/task_manual_iteration_tutorial.py
@@ -1,47 +1,49 @@
-"""
-Tasks: retrieving splits
-========================
-
-Tasks define a target and a train/test split. Normally, they are the input to the function
-``openml.runs.run_model_on_task`` which automatically runs the model on all splits of the task.
-However, sometimes it is necessary to manually split a dataset to perform experiments outside of
-the functions provided by OpenML. One such example is in the benchmark library
-`HPOBench `_ which extensively uses data from OpenML,
-but not OpenML's functionality to conduct runs.
-"""
+# %% [markdown]
+# # Tasks: retrieving splits
+
+# Tasks define a target and a train/test split. Normally, they are the input to the function
+# ``openml.runs.run_model_on_task`` which automatically runs the model on all splits of the task.
+# However, sometimes it is necessary to manually split a dataset to perform experiments outside of
+# the functions provided by OpenML. One such example is in the benchmark library
+# [HPOBench](https://github.com/automl/HPOBench) which extensively uses data from OpenML,
+# but not OpenML's functionality to conduct runs.
-# License: BSD 3-Clause
+# %%
import openml
-####################################################################################################
+# %% [markdown]
# For this tutorial we will use the famous King+Rook versus King+Pawn on A7 dataset, which has
-# the dataset ID 3 (`dataset on OpenML `_), and for which there exist
+# the dataset ID 3 ([dataset on OpenML](https://www.openml.org/d/3)), and for which there exist
# tasks with all important estimation procedures. It is small enough (less than 5000 samples) to
# efficiently use it in an example.
#
-# We will first start with (`task 233 `_), which is a task with a
+# We will first start with ([task 233](https://www.openml.org/t/233)), which is a task with a
# holdout estimation procedure.
+
+# %%
task_id = 233
task = openml.tasks.get_task(task_id)
-####################################################################################################
+# %% [markdown]
# Now that we have a task object we can obtain the number of repetitions, folds and samples as
# defined by the task:
+# %%
n_repeats, n_folds, n_samples = task.get_split_dimensions()
-####################################################################################################
+# %% [markdown]
# * ``n_repeats``: Number of times the model quality estimation is performed
# * ``n_folds``: Number of folds per repeat
# * ``n_samples``: How many data points to use. This is only relevant for learning curve tasks
#
# A list of all available estimation procedures is available
-# `here `_.
+# [here](https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure).
#
# Task ``233`` is a simple task using the holdout estimation procedure and therefore has only a
# single repeat, a single fold and a single sample size:
+# %%
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
task_id,
@@ -51,11 +53,12 @@
)
)
-####################################################################################################
+# %% [markdown]
# We can now retrieve the train/test split for this combination of repeats, folds and number of
# samples (indexing is zero-based). Usually, one would loop over all repeats, folds and sample
# sizes, but we can neglect this here as there is only a single repetition.
+# %%
train_indices, test_indices = task.get_train_test_split_indices(
repeat=0,
fold=0,
@@ -65,10 +68,11 @@
print(train_indices.shape, train_indices.dtype)
print(test_indices.shape, test_indices.dtype)
-####################################################################################################
+# %% [markdown]
# And then split the data based on this:
-X, y = task.get_X_and_y()
+# %%
+X, y = task.get_X_and_y(dataset_format="dataframe")
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
@@ -83,9 +87,10 @@
)
)
-####################################################################################################
+# %% [markdown]
# Obviously, we can also retrieve cross-validation versions of the dataset used in task ``233``:
+# %%
task_id = 3
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
@@ -99,8 +104,10 @@
)
)
-####################################################################################################
+# %% [markdown]
# And then perform the aforementioned iteration over all splits:
+
+# %%
for repeat_idx in range(n_repeats):
for fold_idx in range(n_folds):
for sample_idx in range(n_samples):
@@ -127,9 +134,10 @@
)
)
-####################################################################################################
+# %% [markdown]
# And also versions with multiple repeats:
+# %%
task_id = 1767
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
@@ -143,8 +151,10 @@
)
)
-####################################################################################################
+# %% [markdown]
# And then again perform the aforementioned iteration over all splits:
+
+# %%
for repeat_idx in range(n_repeats):
for fold_idx in range(n_folds):
for sample_idx in range(n_samples):
@@ -171,9 +181,10 @@
)
)
-####################################################################################################
+# %% [markdown]
# And finally a task based on learning curves:
+# %%
task_id = 1702
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
@@ -187,8 +198,10 @@
)
)
-####################################################################################################
+# %% [markdown]
# And then again perform the aforementioned iteration over all splits:
+
+# %%
for repeat_idx in range(n_repeats):
for fold_idx in range(n_folds):
for sample_idx in range(n_samples):
@@ -214,3 +227,4 @@
y_test.shape,
)
)
+# License: BSD 3-Clause
diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py
index 63821c7a2..54a373fca 100644
--- a/examples/30_extended/tasks_tutorial.py
+++ b/examples/30_extended/tasks_tutorial.py
@@ -1,16 +1,12 @@
-"""
-Tasks
-=====
-
-A tutorial on how to list and download tasks.
-"""
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Tasks
+# A tutorial on how to list and download tasks.
+# %%
import openml
from openml.tasks import TaskType
-############################################################################
+# %% [markdown]
#
# Tasks are identified by IDs and can be accessed in two different ways:
#
@@ -24,67 +20,75 @@
# metric, the splits and an iterator which can be used to access the
# splits in a useful manner.
-############################################################################
-# Listing tasks
-# ^^^^^^^^^^^^^
+# %% [markdown]
+# ## Listing tasks
#
# We will start by simply listing only *supervised classification* tasks.
-# **openml.tasks.list_tasks()** getting a
-# `pandas dataframe `_
-# to have good visualization capabilities and easier access:
+#
+# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
+# request a
+# [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
+# instead to have better visualization capabilities and easier access:
-tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
+# %%
+tasks = openml.tasks.list_tasks(
+ task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
+)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
-############################################################################
+# %% [markdown]
# We can filter the list of tasks to only contain datasets with more than
# 500 samples, but less than 1000 samples:
+# %%
filtered_tasks = tasks.query("NumberOfInstances > 500 and NumberOfInstances < 1000")
print(list(filtered_tasks.index))
-############################################################################
+# %%
# Number of tasks
print(len(filtered_tasks))
-############################################################################
+# %% [markdown]
# Then, we can further restrict the tasks to all have the same resampling strategy:
+# %%
filtered_tasks = filtered_tasks.query('estimation_procedure == "10-fold Crossvalidation"')
print(list(filtered_tasks.index))
-############################################################################
-
+# %%
# Number of tasks
print(len(filtered_tasks))
-############################################################################
+# %% [markdown]
# Resampling strategies can be found on the
-# `OpenML Website `_.
+# [OpenML Website](https://www.openml.org/search?type=measure&q=estimation%20procedure).
#
# Similar to listing tasks by task type, we can list tasks by tags:
-tasks = openml.tasks.list_tasks(tag="OpenML100")
+# %%
+tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
-############################################################################
+# %% [markdown]
# Furthermore, we can list tasks based on the dataset id:
-tasks = openml.tasks.list_tasks(data_id=1471)
+# %%
+tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
-############################################################################
+# %% [markdown]
# In addition, a size limit and an offset can be applied both separately and simultaneously:
-tasks = openml.tasks.list_tasks(size=10, offset=50)
+# %%
+tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
print(tasks)
-############################################################################
+# %% [markdown]
#
# **OpenML 100**
# is a curated list of 100 tasks to start using OpenML. They are all
@@ -92,48 +96,46 @@
# instances per task. To make things easier, the tasks do not contain highly
# unbalanced data and sparse data. However, the tasks include missing values and
# categorical features. You can find out more about the *OpenML 100* on
-# `the OpenML benchmarking page `_.
+# [the OpenML benchmarking page](https://docs.openml.org/benchmark/).
#
# Finally, it is also possible to list all tasks on OpenML with:
-############################################################################
-tasks = openml.tasks.list_tasks()
+# %%
+tasks = openml.tasks.list_tasks(output_format="dataframe")
print(len(tasks))
-############################################################################
-# Exercise
-# ########
+# %% [markdown]
+# ## Exercise
#
# Search for the tasks on the 'eeg-eye-state' dataset.
+# %%
tasks.query('name=="eeg-eye-state"')
-############################################################################
-# Downloading tasks
-# ^^^^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Downloading tasks
#
# We provide two functions to download tasks, one which downloads only a
# single task by its ID, and one which takes a list of IDs and downloads
# all of these tasks:
+# %%
task_id = 31
task = openml.tasks.get_task(task_id)
-############################################################################
+# %%
# Properties of the task are stored as member variables:
-
print(task)
-############################################################################
+# %%
# And:
ids = [2, 1891, 31, 9983]
tasks = openml.tasks.get_tasks(ids)
print(tasks[0])
-############################################################################
-# Creating tasks
-# ^^^^^^^^^^^^^^
+# %% [markdown]
+# ## Creating tasks
#
# You can also create new tasks. Take the following into account:
#
@@ -159,16 +161,16 @@
# necessary (e.g. when other measure make no sense), since it will create a new task, which
# scatters results across tasks.
-############################################################################
+# %% [markdown]
# We'll use the test server for the rest of this tutorial.
#
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
+# %%
openml.config.start_using_configuration_for_example()
-############################################################################
-# Example
-# #######
+# %% [markdown]
+# ## Example
#
# Let's create a classification task on a dataset. In this example we will do this on the
# Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1),
@@ -177,7 +179,7 @@
# If such a task doesn't exist, a task will be created and the corresponding task_id
# will be returned.
-
+# %%
try:
my_task = openml.tasks.create_task(
task_type=TaskType.SUPERVISED_CLASSIFICATION,
@@ -200,12 +202,14 @@
task_id = tasks.loc[:, "tid"].values[0]
print("Task already exists. Task ID is", task_id)
+# %%
# reverting to prod server
openml.config.stop_using_configuration_for_example()
-############################################################################
-# * `Complete list of task types `_.
-# * `Complete list of model estimation procedures `_.
-# * `Complete list of evaluation measures `_.
+# %% [markdown]
+# * [Complete list of task types](https://www.openml.org/search?type=task_type).
+# * [Complete list of model estimation procedures](https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure).
+# * [Complete list of evaluation measures](https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure).
#
+# License: BSD 3-Clause
diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py
index 28015557b..8b1ac02f9 100644
--- a/examples/40_paper/2015_neurips_feurer_example.py
+++ b/examples/40_paper/2015_neurips_feurer_example.py
@@ -1,28 +1,27 @@
-"""
-Feurer et al. (2015)
-====================
+# %% [markdown]
+# # Feurer et al. (2015)
-A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al..
-
-Auto-sklearn website: https://automl.github.io/auto-sklearn/
-
-Publication
-~~~~~~~~~~~
+# A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al.
+#
+# Auto-sklearn website: https://automl.github.io/auto-sklearn/
+#
+# ## Publication
+#
+# | Efficient and Robust Automated Machine Learning
+# | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
+# | In *Advances in Neural Information Processing Systems 28*, 2015
+# | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-| Efficient and Robust Automated Machine Learning
-| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
-| In *Advances in Neural Information Processing Systems 28*, 2015
-| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""
-
-# License: BSD 3-Clause
+# %%
+import pandas as pd
import openml
-####################################################################################################
+# %% [markdown]
# List of dataset IDs given in the supplementary material of Feurer et al.:
# https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
-# fmt: off
+
+# %%
dataset_ids = [
3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
@@ -35,9 +34,8 @@
1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
1134, 1138, 1139, 1142, 1146, 1161, 1166,
]
-# fmt: on
-####################################################################################################
+# %% [markdown]
# The dataset IDs could be used directly to load the dataset and split the data into a training set
# and a test set. However, to be reproducible, we will first obtain the respective tasks from
# OpenML, which define both the target feature and the train/test split.
@@ -50,11 +48,13 @@
# Please check the `OpenML documentation of tasks `_ if you
# want to learn more about them.
-####################################################################################################
+# %% [markdown]
# This lists both active and inactive tasks (because of ``status='all'``). Unfortunately,
# this is necessary as some of the datasets contain issues found after the publication and became
# deactivated, which also deactivated the tasks on them. More information on active or inactive
-# datasets can be found in the `online docs `_.
+# datasets can be found in the [online docs](https://docs.openml.org/#dataset-status).
+
+# %%
tasks = openml.tasks.list_tasks(
task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
status="all",
@@ -88,3 +88,5 @@
# These are the tasks to work with:
print(task_ids)
+
+# License: BSD 3-Clause
diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py
index d9fdc78a7..1a873a01c 100644
--- a/examples/40_paper/2018_ida_strang_example.py
+++ b/examples/40_paper/2018_ida_strang_example.py
@@ -1,26 +1,22 @@
-"""
-Strang et al. (2018)
-====================
-
-A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models
-Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*.
-
-Publication
-~~~~~~~~~~~
-
-| Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML
-| Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter
-| In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018
-| Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25
-"""
-
-# License: BSD 3-Clause
+# %% [markdown]
+# # Strang et al. (2018)
+#
+# A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models
+# Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*.
+#
+# ## Publication
+#
+# | Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML
+# | Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter
+# | In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018
+# | Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25
+# %%
import matplotlib.pyplot as plt
import openml
-##############################################################################
+# %% [markdown]
# A basic step for each data-mining or machine learning task is to determine
# which model to choose based on the problem and the data at hand. In this
# work we investigate when non-linear classifiers outperform linear
@@ -35,6 +31,7 @@
# more effort to distinguish the same flow with different hyperparameter
# values.
+# %%
study_id = 123
# for comparing svms: flow_ids = [7754, 7756]
# for comparing nns: flow_ids = [7722, 7729]
@@ -69,10 +66,10 @@
# adds column that indicates the difference between the two classifiers
evaluations["diff"] = evaluations[flow_ids[0]] - evaluations[flow_ids[1]]
-
-##############################################################################
+# %% [markdown]
# makes the s-plot
+# %%
fig_splot, ax_splot = plt.subplots()
ax_splot.plot(range(len(evaluations)), sorted(evaluations["diff"]))
ax_splot.set_title(classifier_family)
@@ -82,11 +79,12 @@
plt.show()
-##############################################################################
+# %% [markdown]
# adds column that indicates the difference between the two classifiers,
# needed for the scatter plot
+# %%
def determine_class(val_lin, val_nonlin):
if val_lin < val_nonlin:
return class_values[0]
@@ -112,10 +110,11 @@ def determine_class(val_lin, val_nonlin):
ax_scatter.set_yscale("log")
plt.show()
-##############################################################################
+# %% [markdown]
# makes a scatter plot where each data point represents the performance of the
# two algorithms on various axis (not in the paper)
+# %%
fig_diagplot, ax_diagplot = plt.subplots()
ax_diagplot.grid(linestyle="--")
ax_diagplot.plot([0, 1], ls="-", color="black")
@@ -125,3 +124,4 @@ def determine_class(val_lin, val_nonlin):
ax_diagplot.set_xlabel(measure)
ax_diagplot.set_ylabel(measure)
plt.show()
+# License: BSD 3-Clause
diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py
index 751f53470..315c27dc3 100644
--- a/examples/40_paper/2018_kdd_rijn_example.py
+++ b/examples/40_paper/2018_kdd_rijn_example.py
@@ -1,38 +1,18 @@
-"""
-This example is deprecated! You will need to manually remove adapt this code to make it run.
-We deprecated this example in our CI as it requires fanova as a dependency. However, fanova is not supported in all Python versions used in our CI/CD.
-
-van Rijn and Hutter (2018)
-==========================
-
-A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*.
-
-Example Deprecation Warning!
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This example is not supported anymore by the OpenML-Python developers. The example is kept for reference purposes but not tested anymore.
-
-Publication
-~~~~~~~~~~~
-
-| Hyperparameter importance across datasets
-| Jan N. van Rijn and Frank Hutter
-| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018
-| Available at https://dl.acm.org/doi/10.1145/3219819.3220058
-
-Requirements
-~~~~~~~~~~~~
-
-This is a Unix-only tutorial, as the requirements can not be satisfied on a Windows machine (Untested on other
-systems).
-
-The following Python packages are required:
-
-pip install openml[examples,docs] fanova ConfigSpace<1.0
-"""
+# %% [markdown]
+# # van Rijn and Hutter (2018)
+#
+# A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*.
+#
+# This example is deprecated and kept for reference only: it requires the ``fanova``
+# dependency, which is not supported on all Python versions used in our CI, and is
+# therefore no longer tested.
+#
+# This is a Unix-only tutorial, as the requirements cannot be satisfied on a Windows
+# machine (untested on other systems).
+#
+# ## Publication
+#
+# | Hyperparameter importance across datasets
+# | Jan N. van Rijn and Frank Hutter
+# | In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018
+# | Available at https://dl.acm.org/doi/10.1145/3219819.3220058
-# License: BSD 3-Clause
-run_code = False
import sys
# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
print("This example is deprecated, remove this code to use it manually.")
@@ -107,22 +87,64 @@
size=limit_per_task,
)
- performance_column = "value"
- # make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance
- # value (in setup['value']). The following line looks a bit complicated, but combines 2 tasks: a) combine
- # hyperparameters and performance data in a single dict, b) cast hyperparameter values to the appropriate format
- # Note that the ``json.loads(...)`` requires the content to be in JSON format, which is only the case for
- # scikit-learn setups (and even there some legacy setups might violate this requirement). It will work for the
- # setups that belong to the flows embedded in this example though.
- try:
- setups_evals = pd.DataFrame(
- [
- dict(
- **{name: json.loads(value) for name, value in setup["parameters"].items()},
- **{performance_column: setup[performance_column]},
- )
- for _, setup in evals.iterrows()
- ]
+# %% [markdown]
+# With the advent of automated machine learning, automated hyperparameter
+# optimization methods are by now routinely used in data mining. However, this
+# progress is not yet matched by equal progress on automatic analyses that
+# yield information beyond performance-optimizing hyperparameter settings.
+# In this example, we aim to answer the following question: given an
+# algorithm, what are generally its most important hyperparameters?
+#
+# This work is carried out on the OpenML-100 benchmark suite, which can be
+# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we
+# conduct the experiment on the Support Vector Machine (``flow_id=7707``)
+# with specific kernel (we will perform a post-process filter operation for
+# this). We should set some other experimental parameters (number of results
+# per task, evaluation measure and the number of trees of the internal
+# functional Anova) before the fun can begin.
+#
+# Note that we simplify the example in several ways:
+#
+# 1) We only consider numerical hyperparameters
+# 2) We consider all hyperparameters that are numerical (in reality, some
+# hyperparameters might be inactive (e.g., ``degree``) or irrelevant
+#    (e.g., ``random_state``))
+# 3) We assume all hyperparameters to be on uniform scale
+#
+# Any difference in conclusion between the actual paper and the presented
+# results is most likely due to one of these simplifications. For example,
+# the hyperparameter C looks rather insignificant, whereas it is quite
+# important when it is put on a log-scale. All these simplifications can be
+# addressed by defining a ConfigSpace. For a more elaborated example that uses
+# this, please see:
+# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401
+
+# %%
+ suite = openml.study.get_suite("OpenML100")
+ flow_id = 7707
+ parameter_filters = {"sklearn.svm.classes.SVC(17)_kernel": "sigmoid"}
+ evaluation_measure = "predictive_accuracy"
+ limit_per_task = 500
+ limit_nr_tasks = 15
+ n_trees = 16
+
+ fanova_results = []
+ # we will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the
+ # communication with OpenML, and for iterated experimenting it is better to cache the results in a local file.
+ for idx, task_id in enumerate(suite.tasks):
+ if limit_nr_tasks is not None and idx >= limit_nr_tasks:
+ continue
+ print(
+ "Starting with task %d (%d/%d)"
+ % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
+ )
+ # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
+ evals = openml.evaluations.list_evaluations_setups(
+ evaluation_measure,
+ flows=[flow_id],
+ tasks=[task_id],
+ size=limit_per_task,
+ output_format="dataframe",
)
except json.decoder.JSONDecodeError as e:
print("Task %d error: %s" % (task_id, e))
@@ -174,11 +196,13 @@
# transform ``fanova_results`` from a list of dicts into a DataFrame
fanova_results = pd.DataFrame(fanova_results)
- ##############################################################################
- # make the boxplot of the variance contribution. Obviously, we can also use
- # this data to make the Nemenyi plot, but this relies on the rather complex
- # ``Orange`` dependency (``pip install Orange3``). For the complete example,
- # the reader is referred to the more elaborate script (referred to earlier)
+# %% [markdown]
+# make the boxplot of the variance contribution. Obviously, we can also use
+# this data to make the Nemenyi plot, but this relies on the rather complex
+# ``Orange`` dependency (``pip install Orange3``). For the complete example,
+# the reader is referred to the more elaborate script (referred to earlier)
+
+ # %%
fig, ax = plt.subplots()
sns.boxplot(x="hyperparameter", y="fanova", data=fanova_results, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
@@ -186,3 +210,4 @@
ax.set_xlabel(None)
plt.tight_layout()
plt.show()
+ # License: BSD 3-Clause
diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py
index 91768e010..feb107cba 100644
--- a/examples/40_paper/2018_neurips_perrone_example.py
+++ b/examples/40_paper/2018_neurips_perrone_example.py
@@ -1,32 +1,28 @@
-"""
-Perrone et al. (2018)
-=====================
-
-A tutorial on how to build a surrogate model based on OpenML data as done for *Scalable
-Hyperparameter Transfer Learning* by Perrone et al..
-
-Publication
-~~~~~~~~~~~
-
-| Scalable Hyperparameter Transfer Learning
-| Valerio Perrone and Rodolphe Jenatton and Matthias Seeger and Cedric Archambeau
-| In *Advances in Neural Information Processing Systems 31*, 2018
-| Available at https://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf
-
-This example demonstrates how OpenML runs can be used to construct a surrogate model.
-
-In the following section, we shall do the following:
-
-* Retrieve tasks and flows as used in the experiments by Perrone et al. (2018).
-* Build a tabular data by fetching the evaluations uploaded to OpenML.
-* Impute missing values and handle categorical data before building a Random Forest model that
- maps hyperparameter values to the area under curve score.
-"""
-
-############################################################################
+# %% [markdown]
+# # Perrone et al. (2018)
+#
+# A tutorial on how to build a surrogate model based on OpenML data as done for *Scalable
+# Hyperparameter Transfer Learning* by Perrone et al.
+#
+# ## Publication
+#
+# | Scalable Hyperparameter Transfer Learning
+# | Valerio Perrone and Rodolphe Jenatton and Matthias Seeger and Cedric Archambeau
+# | In *Advances in Neural Information Processing Systems 31*, 2018
+# | Available at https://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf
+#
+# This example demonstrates how OpenML runs can be used to construct a surrogate model.
+#
+# In the following section, we shall do the following:
+#
+# * Retrieve tasks and flows as used in the experiments by Perrone et al. (2018).
+# * Build tabular data by fetching the evaluations uploaded to OpenML.
+# * Impute missing values and handle categorical data before building a Random Forest model that
+# maps hyperparameter values to the area under curve score.
-# License: BSD 3-Clause
+# %%
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
@@ -40,11 +36,13 @@
import openml
flow_type = "svm" # this example will use the smaller svm flow evaluations
-############################################################################
+
+# %% [markdown]
# The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
# a tabular format that can be used to build models.
+# %%
def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):
"""
Fetch a list of evaluations based on the flows and tasks used in the experiments.
@@ -154,25 +152,26 @@ def list_categorical_attributes(flow_type="svm"):
return ["booster"]
-#############################################################################
+# %% [markdown]
-# Fetching the data from OpenML
-# *****************************
+# ## Fetching the data from OpenML
# Now, we read all the tasks and evaluations for them and collate into a table.
# Here, we are reading all the tasks and evaluations for the SVM flow and
# pre-processing all retrieved evaluations.
+# %%
eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)
X, y = create_table_from_evaluations(eval_df, flow_type=flow_type)
print(X.head())
print("Y : ", y[:5])
-#############################################################################
-# Creating pre-processing and modelling pipelines
-# ***********************************************
+# %% [markdown]
+# ## Creating pre-processing and modelling pipelines
# The two primary tasks are to impute the missing values, that is, account for the hyperparameters
# that are not available with the runs from OpenML. And secondly, to handle categorical variables
# using One-hot encoding prior to modelling.
+# %%
# Separating data into categorical and non-categorical (numeric for this example) columns
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))
@@ -201,13 +200,13 @@ def list_categorical_attributes(flow_type="svm"):
model = Pipeline(steps=[("preprocess", ct), ("surrogate", clf)])
-#############################################################################
-# Building a surrogate model on a task's evaluation
-# *************************************************
+# %% [markdown]
+# ## Building a surrogate model on a task's evaluation
# The same set of functions can be used for a single task to retrieve a singular table which can
# be used for the surrogate model construction. We shall use the SVM flow here to keep execution
# time simple and quick.
+# %%
# Selecting a task for the surrogate
task_id = task_ids[-1]
print("Task ID : ", task_id)
@@ -218,10 +217,8 @@ def list_categorical_attributes(flow_type="svm"):
print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")
-
-#############################################################################
-# Evaluating the surrogate model
-# ******************************
+# %% [markdown]
+# ## Evaluating the surrogate model
# The surrogate model built from a task's evaluations fetched from OpenML is now put to a simple
# use: we randomly sample configurations and observe the trajectory of the best area under the
# ROC curve (AUC) that can be obtained from the surrogate we've built.
@@ -229,6 +226,7 @@ def list_categorical_attributes(flow_type="svm"):
# NOTE: This section is written exclusively for the SVM flow
+# %%
# Sampling random configurations
def random_sample_configurations(num_samples=100):
    colnames = ["cost", "degree", "gamma", "kernel"]
@@ -251,7 +249,7 @@ def random_sample_configurations(num_samples=100):
configs = random_sample_configurations(num_samples=1000)
print(configs)
-#############################################################################
+# %%
preds = model.predict(configs)
# tracking the maximum AUC obtained over the function evaluations
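+# One way (a sketch; the variable names are assumptions) to track the running best AUC and the
+# regret that is plotted below:
+best_so_far = np.maximum.accumulate(preds)  # best predicted AUC after each function evaluation
+regret = np.max(preds) - best_so_far  # gap to the best configuration in the random sample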
@@ -264,3 +262,4 @@ def random_sample_configurations(num_samples=100):
plt.title("AUC regret for Random Search on surrogate")
plt.xlabel("Numbe of function evaluations")
plt.ylabel("Regret")
+# License: BSD 3-Clause
diff --git a/examples/test_server_usage_warning.txt b/examples/test_server_usage_warning.txt
new file mode 100644
index 000000000..c551480b6
--- /dev/null
+++ b/examples/test_server_usage_warning.txt
@@ -0,0 +1,3 @@
+This example uploads data. For that reason, it connects to the test server at test.openml.org.
+This prevents the main server from being crowded with example datasets, tasks, runs, and so on.
+Using the test server can affect the behaviour and performance of the OpenML-Python API.
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 000000000..20394ed32
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,45 @@
+site_name: openml-python
+theme:
+  name: material
+  features:
+    - content.code.copy
+  palette:
+    - scheme: default
+
+extra_css:
+  - stylesheets/extra.css
+
+nav:
+  - index.md
+  - Code Reference: reference/
+  - Examples: examples/
+  - Usage: usage.md
+  - Contributing: contributing.md
+  - Extensions: extensions.md
+  - Changelog: progress.md
+
+markdown_extensions:
+  - pymdownx.highlight:
+      anchor_linenums: true
+  - pymdownx.superfences
+  - pymdownx.snippets
+  - attr_list
+  - pymdownx.tabbed:
+      alternate_style: true
+
+plugins:
+  - search
+  - autorefs
+  - section-index
+  - mkdocstrings:
+      handlers:
+        python:
+          options:
+            docstring_style: numpy
+  - gen-files:
+      scripts:
+        - scripts/gen_ref_pages.py
+  - literate-nav:
+      nav_file: SUMMARY.md
+  - mkdocs-jupyter:
+      theme: light
diff --git a/pyproject.toml b/pyproject.toml
index fa9a70dc1..8019f981d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,10 +93,17 @@ examples=[
"seaborn",
]
docs=[
- "sphinx>=3",
- "sphinx-gallery",
- "sphinx_bootstrap_theme",
+ "mkdocs",
"numpydoc",
+ "mkdocs-material",
+ "mkdocs-autorefs",
+ "mkdocstrings[python]",
+ "mkdocs-gen-files",
+ "mkdocs-literate-nav",
+ "mkdocs-section-index",
+ "mkdocs-jupyter",
+ "mkdocs-linkcheck",
+ "mike"
]
[project.urls]
diff --git a/scripts/gen_ref_pages.py b/scripts/gen_ref_pages.py
new file mode 100644
index 000000000..730a98024
--- /dev/null
+++ b/scripts/gen_ref_pages.py
@@ -0,0 +1,55 @@
+"""Generate the code reference pages.
+
+based on https://github.com/mkdocstrings/mkdocstrings/blob/33aa573efb17b13e7b9da77e29aeccb3fbddd8e8/docs/recipes.md
+but modified for lack of "src/" file structure.
+
+"""
+
+from pathlib import Path
+import shutil
+
+import mkdocs_gen_files
+
+nav = mkdocs_gen_files.Nav()
+
+root = Path(__file__).parent.parent
+src = root / "openml"
+
+for path in sorted(src.rglob("*.py")):
+ module_path = path.relative_to(root).with_suffix("")
+ doc_path = path.relative_to(src).with_suffix(".md")
+ full_doc_path = Path("reference", doc_path)
+
+ parts = tuple(module_path.parts)
+
+ if parts[-1] == "__init__":
+ parts = parts[:-1]
+ doc_path = doc_path.with_name("index.md")
+ full_doc_path = full_doc_path.with_name("index.md")
+ elif parts[-1] == "__main__":
+ continue
+
+ nav[parts] = doc_path.as_posix()
+
+ with mkdocs_gen_files.open(full_doc_path, "w") as fd:
+ identifier = ".".join(parts)
+ print("::: " + identifier, file=fd)
+
+ mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))
+
+ with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
+ nav_file.writelines(nav.build_literate_nav())
+
+nav = mkdocs_gen_files.Nav()
+examples_dir = root / "examples"
+examples_doc_dir = root / "docs" / "examples"
+for path in sorted(examples_dir.rglob("*.py")):
+ dest_path = Path("examples") / path.relative_to(examples_dir)
+ with mkdocs_gen_files.open(dest_path, "w") as dest_file:
+ print(path.read_text(), file=dest_file)
+
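+    # Register the copied example in the literate nav: the key is its path inside examples/
+    # (dropping the leading "../examples" parts) and the value points back at the generated
+    # copy, relative to the examples/SUMMARY.md that literate-nav reads.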
+    new_relative_location = Path("../") / dest_path
+    nav[new_relative_location.parts[2:]] = new_relative_location.as_posix()
+
+with mkdocs_gen_files.open("examples/SUMMARY.md", "w") as nav_file:
+    nav_file.writelines(nav.build_literate_nav())