diff --git a/.github/workflows/benchmarks-last-release.yml b/.github/workflows/benchmarks-last-release.yml deleted file mode 100644 index bf3f5de480f..00000000000 --- a/.github/workflows/benchmarks-last-release.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: Benchmark compare last release - -on: - push: - branches: - - main - workflow_dispatch: - -jobs: - benchmark: - name: Linux - runs-on: ubuntu-latest - env: - ASV_DIR: "./asv_bench" - CONDA_ENV_FILE: ci/requirements/environment.yml - - steps: - # We need the full repo to avoid this issue - # https://github.com/actions/checkout/issues/23 - - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Set up conda environment - uses: mamba-org/setup-micromamba@v2 - with: - micromamba-version: "1.5.10-0" - environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-tests - cache-environment: true - cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" - create-args: >- - asv - - - name: "Get Previous tag" - id: previoustag - uses: "WyriHaximus/github-action-get-previous-tag@v1" - # with: - # fallback: 1.0.0 # Optional fallback tag to use when no tag can be found - - - name: Run benchmarks - shell: bash -l {0} - id: benchmark - env: - OPENBLAS_NUM_THREADS: 1 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - ASV_FACTOR: 1.5 - ASV_SKIP_SLOW: 1 - run: | - set -x - # ID this runner - asv machine --yes - echo "Baseline: ${{ steps.previoustag.outputs.tag }} " - echo "Contender: ${{ github.sha }}" - # Use mamba for env creation - # export CONDA_EXE=$(which mamba) - export CONDA_EXE=$(which conda) - # Run benchmarks for current commit against base - ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" - asv continuous $ASV_OPTIONS ${{ steps.previoustag.outputs.tag }} ${{ github.sha }} \ - | sed "/Traceback \|failed$\|PERFORMANCE DECREASED/ s/^/::error::/" \ - | tee benchmarks.log - # Report and export results for subsequent steps - 
if grep "Traceback \|failed\|PERFORMANCE DECREASED" benchmarks.log > /dev/null ; then - exit 1 - fi - working-directory: ${{ env.ASV_DIR }} - - - name: Add instructions to artifact - if: always() - run: | - cp benchmarks/README_CI.md benchmarks.log .asv/results/ - working-directory: ${{ env.ASV_DIR }} - - - uses: actions/upload-artifact@v4 - if: always() - with: - name: asv-benchmark-results-${{ runner.os }} - path: ${{ env.ASV_DIR }}/.asv/results diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml deleted file mode 100644 index 113e8184f56..00000000000 --- a/.github/workflows/benchmarks.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: Benchmark - -on: - pull_request: - types: [opened, reopened, synchronize, labeled] - workflow_dispatch: - -env: - PR_HEAD_LABEL: ${{ github.event.pull_request.head.label }} - -jobs: - benchmark: - if: ${{ contains( github.event.pull_request.labels.*.name, 'run-benchmark') && github.event_name == 'pull_request' || contains( github.event.pull_request.labels.*.name, 'topic-performance') && github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} - name: Linux - runs-on: ubuntu-latest - env: - ASV_DIR: "./asv_bench" - CONDA_ENV_FILE: ci/requirements/environment-benchmark.yml - - steps: - # We need the full repo to avoid this issue - # https://github.com/actions/checkout/issues/23 - - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Set up conda environment - uses: mamba-org/setup-micromamba@v2 - with: - micromamba-version: "1.5.10-0" - environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-benchmark - cache-environment: true - cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" - # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 - create-args: >- - asv - python-build - mamba<=1.5.10 - - - name: Run benchmarks - shell: bash -l {0} - id: benchmark 
- env: - OPENBLAS_NUM_THREADS: 1 - MKL_NUM_THREADS: 1 - OMP_NUM_THREADS: 1 - ASV_FACTOR: 1.5 - ASV_SKIP_SLOW: 1 - run: | - set -x - # ID this runner - asv machine --yes - echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" - echo "Contender: ${GITHUB_SHA} ($PR_HEAD_LABEL)" - # Run benchmarks for current commit against base - ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" - asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \ - | sed "/Traceback \|failed$\|PERFORMANCE DECREASED/ s/^/::error::/" \ - | tee benchmarks.log - # Report and export results for subsequent steps - if grep "Traceback \|failed\|PERFORMANCE DECREASED" benchmarks.log > /dev/null ; then - exit 1 - fi - working-directory: ${{ env.ASV_DIR }} - - - name: Add instructions to artifact - if: always() - run: | - cp benchmarks/README_CI.md benchmarks.log .asv/results/ - working-directory: ${{ env.ASV_DIR }} - - - uses: actions/upload-artifact@v4 - if: always() - with: - name: asv-benchmark-results-${{ runner.os }} - path: ${{ env.ASV_DIR }}/.asv/results diff --git a/.github/workflows/benchmarks_report.yml b/.github/workflows/benchmarks_report.yml new file mode 100644 index 00000000000..bdaf76e0391 --- /dev/null +++ b/.github/workflows/benchmarks_report.yml @@ -0,0 +1,83 @@ +# Post any reports generated by benchmarks_run.yml . 
+# Separated for security: +# https://securitylab.github.com/research/github-actions-preventing-pwn-requests/ + +name: benchmarks-report +run-name: Report benchmark results + +on: + workflow_run: + workflows: [benchmarks-run] + types: + - completed + +jobs: + download: + runs-on: ubuntu-latest + outputs: + reports_exist: ${{ steps.unzip.outputs.reports_exist }} + steps: + - name: Download artifact + id: download-artifact + # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#using-data-from-the-triggering-workflow + uses: actions/github-script@v8 + with: + script: | + let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id, + }); + let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => { + return artifact.name == "benchmark_reports" + })[0]; + if (typeof matchArtifact != 'undefined') { + let download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + let fs = require('fs'); + fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/benchmark_reports.zip`, Buffer.from(download.data)); + }; + + - name: Unzip artifact + id: unzip + run: | + if test -f "benchmark_reports.zip"; then + reports_exist=1 + unzip benchmark_reports.zip -d benchmark_reports + else + reports_exist=0 + fi + echo "reports_exist=$reports_exist" >> "$GITHUB_OUTPUT" + + - name: Store artifact + uses: actions/upload-artifact@v4 + with: + name: benchmark_reports + path: benchmark_reports + + post_reports: + runs-on: ubuntu-latest + needs: download + if: needs.download.outputs.reports_exist == 1 + steps: + - name: Checkout repo + uses: actions/checkout@v5 + + - name: Download artifact + uses: actions/download-artifact@v5 + with: + name: benchmark_reports + path: .github/workflows/benchmark_reports + + - name: Set up Python + # 
benchmarks/bm_runner.py only needs builtins to run. + uses: actions/setup-python@v6 + + - name: Post reports + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: benchmarks/bm_runner.py _gh_post diff --git a/.github/workflows/benchmarks_run.yml b/.github/workflows/benchmarks_run.yml new file mode 100644 index 00000000000..cd88559c13b --- /dev/null +++ b/.github/workflows/benchmarks_run.yml @@ -0,0 +1,175 @@ +# Use ASV to check for performance regressions, either: +# - In the last 24 hours' commits. +# - Introduced by this pull request. + +name: benchmarks-run +run-name: Run benchmarks + +on: + schedule: + # Runs every day at 23:00. + - cron: "0 23 * * *" + workflow_dispatch: + inputs: + first_commit: + description: "First commit to benchmark (see bm_runner.py > Overnight)." + required: false + type: string + pull_request: + # Add the `labeled` type to the default list. + types: [labeled, opened, synchronize, reopened] + +jobs: + pre-checks: + # This workflow supports two different scenarios (overnight and branch). + # The pre-checks job determines which scenario is being run. + runs-on: ubuntu-latest + if: github.repository == 'pydata/xarray' + outputs: + overnight: ${{ steps.overnight.outputs.check }} + branch: ${{ steps.branch.outputs.check }} + steps: + - uses: actions/checkout@v5 + with: + fetch-depth: 2 + - id: files-changed + uses: marceloprado/has-changed-path@df1b7a3161b8fb9fd8c90403c66a9e66dfde50cb + with: + # SEE ALSO .github/labeler.yml . 
+ paths: ci/requirements/locks/*.lock + - id: overnight + name: Check overnight scenario + if: github.event_name != 'pull_request' + run: echo "check=true" >> "$GITHUB_OUTPUT" + - id: branch + name: Check branch scenario + if: > + github.event_name == 'pull_request' + && + ( + steps.files-changed.outputs.changed == 'true' + || + github.event.label.name == 'benchmark_this' + || + github.event.label.name == 'run-benchmarks' + || + github.event.label.name == 'topic-performance' + ) + run: echo "check=true" >> "$GITHUB_OUTPUT" + + + benchmark: + runs-on: ubuntu-latest + needs: pre-checks + if: > + needs.pre-checks.outputs.overnight == 'true' || + needs.pre-checks.outputs.branch == 'true' + + env: + IRIS_TEST_DATA_LOC_PATH: benchmarks + IRIS_TEST_DATA_PATH: benchmarks/iris-test-data + IRIS_TEST_DATA_VERSION: "2.28" + # Lets us manually bump the cache to rebuild + ENV_CACHE_BUILD: "0" + TEST_DATA_CACHE_BUILD: "2" + + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - name: Checkout repo + uses: actions/checkout@v5 + with: + fetch-depth: 0 + + - name: Install run dependencies + run: pip install asv nox!=2025.05.01 + + - name: Cache environment directories + id: cache-env-dir + uses: actions/cache@v4 + with: + path: | + .nox + benchmarks/.asv/env + $CONDA/pkgs + key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }} + + - name: Cache test data directory + id: cache-test-data + uses: actions/cache@v4 + with: + path: | + ${{ env.IRIS_TEST_DATA_PATH }} + key: + test-data-${{ env.IRIS_TEST_DATA_VERSION }}-${{ env.TEST_DATA_CACHE_BUILD }} + + - name: Fetch the test data + if: steps.cache-test-data.outputs.cache-hit != 'true' + run: | + wget --quiet https://github.com/SciTools/iris-test-data/archive/v${IRIS_TEST_DATA_VERSION}.zip -O iris-test-data.zip + unzip -q iris-test-data.zip + mkdir --parents ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_LOC_PATH} + mv iris-test-data-${IRIS_TEST_DATA_VERSION} 
${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH} + + - name: Set test data var + run: | + echo "OVERRIDE_TEST_DATA_REPOSITORY=${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}/test_data" >> $GITHUB_ENV + + - name: Benchmark this pull request + # If the 'branch' condition(s) are met: use the bm_runner to compare + # the proposed merge with the base branch. + if: needs.pre-checks.outputs.branch == 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.number }} + run: | + nox -s benchmarks -- branch origin/${{ github.base_ref }} + + - name: Run overnight benchmarks + # If the 'overnight' condition(s) are met: use the bm_runner to compare + # each of the last 24 hours' commits to their parents. + id: overnight + if: needs.pre-checks.outputs.overnight == 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # The first_commit argument allows a custom starting point - useful + # for manual re-running. + run: | + first_commit=${{ inputs.first_commit }} + if [ "$first_commit" == "" ] + then + first_commit=$(git log --after="$(date -d "1 day ago" +"%Y-%m-%d") 23:00:00" --pretty=format:"%h" | tail -n 1) + fi + + if [ "$first_commit" != "" ] + then + nox -s benchmarks -- overnight $first_commit + fi + + - name: Warn of failure + # The overnight run is not on a pull request, so a failure could go + # unnoticed without being actively advertised. + if: > + failure() && + steps.overnight.outcome == 'failure' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + title="Overnight benchmark workflow failed: \`${{ github.run_id }}\`" + body="Generated by GHA run [\`${{github.run_id}}\`](https://github.com/${{github.repository}}/actions/runs/${{github.run_id}})" + gh issue create --title "$title" --body "$body" --label "Bot" --label "Type: Performance" --repo $GITHUB_REPOSITORY + + - name: Upload any benchmark reports + # Uploading enables more downstream processing e.g. posting a PR comment. 
+ if: success() || steps.overnight.outcome == 'failure' + uses: actions/upload-artifact@v4 + with: + name: benchmark_reports + path: .github/workflows/benchmark_reports + + - name: Archive asv results + # Store the raw ASV database(s) to help manual investigations. + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: asv-raw-results + path: benchmarks/.asv/results diff --git a/.github/workflows/benchmarks_validate.yml b/.github/workflows/benchmarks_validate.yml new file mode 100644 index 00000000000..e3f090b32c0 --- /dev/null +++ b/.github/workflows/benchmarks_validate.yml @@ -0,0 +1,48 @@ +name: benchmarks-validate +run-name: Validate the benchmarking setup + +on: + push: + branches: + - "main" + - "v*x" + tags: + - "v*" + pull_request: + branches: + - "*" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + validate: + runs-on: ubuntu-latest + + env: + # Lets us manually bump the cache to rebuild + ENV_CACHE_BUILD: "0" + + steps: + - name: Checkout repo + uses: actions/checkout@v5 + with: + fetch-depth: 0 + + - name: Install run dependencies + run: pip install asv nox!=2025.05.01 + + - name: Cache environment directories + id: cache-env-dir + uses: actions/cache@v4 + with: + path: | + .nox + benchmarks/.asv/env + $CONDA/pkgs + key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }} + + - name: Validate setup + run: nox -s benchmarks -- validate diff --git a/.github/workflows/refresh-lockfiles.yml b/.github/workflows/refresh-lockfiles.yml new file mode 100644 index 00000000000..7aa8803426a --- /dev/null +++ b/.github/workflows/refresh-lockfiles.yml @@ -0,0 +1,111 @@ +# This workflow periodically creates new environment lock files based on the newest +# available packages and dependencies. +# +# Environment specifications are given as conda environment.yml files found in +# `requirements/py**.yml`. 
These state the packages required, the conda channels +# that the packages will be pulled from, and any versions of packages that need to be +# pinned at specific versions. +# +# For environments that have changed, a pull request will be made and submitted +# to the main branch + +name: Refresh Lockfiles + + +on: + pull_request: + branches: + - "*" + workflow_call: + +jobs: + get_python_matrix: + # Determines which Python versions should be included in the matrix used in + # the gen_lockfiles job. + if: "github.repository_owner == 'pydata' || github.event_name == 'workflow_dispatch'" + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.get_py.outputs.matrix }} + steps: + - uses: actions/checkout@v5 + - id: get_py + run: echo "MATRIX=$(ls -1 ci/requirements/environment-benchmark.yml | xargs -n1 basename | sed 's/....$//' | jq -cnR '[inputs]')" >> ${GITHUB_OUTPUT} + + gen_lockfiles: + # This is a matrix job: it splits to create new lockfiles for each + # of the CI test python versions. + if: "github.repository_owner == 'pydata' || github.event_name == 'workflow_dispatch'" + runs-on: ubuntu-latest + needs: get_python_matrix + + strategy: + matrix: + python: ${{ fromJSON(needs.get_python_matrix.outputs.MATRIX) }} + + steps: + - uses: actions/checkout@v5 + - name: install requirements + run: | + source $CONDA/bin/activate base + conda update -n base --all + - name: generate lockfile + run: | + pipx run conda-lock -k explicit -p linux-64 -f ci/requirements/${{matrix.python}}.yml + mv conda-linux-64.lock ${{matrix.python}}-linux-64.lock + - name: output lockfile + uses: actions/upload-artifact@v4 + with: + name: lock-artifacts-${{matrix.python}} + path: ${{matrix.python}}-linux-64.lock + + create_pr: + # Once the matrix job has completed all the lock files will have been + # uploaded as artifacts. + # Download the artifacts, add them to the repo, and create a PR. 
+ if: "github.repository_owner == 'pydata' || github.event_name == 'workflow_dispatch'" + runs-on: ubuntu-latest + needs: gen_lockfiles + + steps: + - uses: actions/checkout@v5 + - name: get artifacts + uses: actions/download-artifact@v5 + with: + path: ${{ github.workspace }}/ci/requirements/locks + merge-multiple: true + + - name: "Generate token" + uses: actions/create-github-app-token@v2 + id: generate-token + with: + app-id: ${{ secrets.AUTH_APP_ID }} + private-key: ${{ secrets.AUTH_APP_PRIVATE_KEY }} + + - name: Create Pull Request + id: cpr + uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e + with: + token: ${{ steps.generate-token.outputs.token }} + commit-message: Updated environment lockfiles + committer: "Lockfile bot " + author: "Lockfile bot " + delete-branch: true + branch: auto-update-lockfiles + title: "[CI Bot] environment lockfiles auto-update" + body: | + Lockfiles updated to the latest resolvable environment. + ### If the CI tasks fail, create a new branch based on this PR and add the required fixes to that branch. 
+ labels: | + New: Pull Request + Bot + + - name: Check Pull Request + if: steps.cpr.outputs.pull-request-number != '' + run: | + echo "### :rocket: Pull-Request Summary" >> ${GITHUB_STEP_SUMMARY} + echo "" >> ${GITHUB_STEP_SUMMARY} + echo "The following lock-files pull-request has been auto-generated:" >> ${GITHUB_STEP_SUMMARY} + echo "- **PR** #${{ steps.cpr.outputs.pull-request-number }}" >> ${GITHUB_STEP_SUMMARY} + echo "- **URL** ${{ steps.cpr.outputs.pull-request-url }}" >> ${GITHUB_STEP_SUMMARY} + echo "- **Operation** [${{ steps.cpr.outputs.pull-request-operation }}]" >> ${GITHUB_STEP_SUMMARY} + echo "- **SHA** ${{ steps.cpr.outputs.pull-request-head-sha }}" >> ${GITHUB_STEP_SUMMARY} diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000000..09ea920176f --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,175 @@ +# SciTools Performance Benchmarking + +SciTools uses an [Airspeed Velocity](https://github.com/airspeed-velocity/asv) +(ASV) setup to benchmark performance. This is primarily designed to check for +performance shifts between commits using statistical analysis, but can also +be easily repurposed for manual comparative and scalability analyses. + +The benchmarks are automatically run overnight +[by a GitHub Action](../.github/workflows/benchmarks_run.yml), with any notable +shifts in performance being flagged in a new GitHub issue. + +## Running benchmarks + +On GitHub: a Pull Request can be benchmarked by adding the +https://github.com/SciTools/iris/labels/benchmark_this +label to the PR (to run a second time: just remove and re-add the label). +Note that a benchmark run could take an hour or more to complete. +This runs a comparison between the PR branch's ``HEAD`` and its merge-base with +the PR's base branch, thus showing performance differences introduced +by the PR. (This run is managed by +[the aforementioned GitHub Action](../.github/workflows/benchmarks_run.yml)).
+ +To run locally: the **benchmark runner** provides conveniences for +common benchmark setup and run tasks, including replicating the benchmarking +performed by GitHub Actions workflows. This can be accessed by: + +- The Nox `benchmarks` session - (use + `nox -s benchmarks -- --help` for details). +- `benchmarks/bm_runner.py` (use the `--help` argument for details). +- Directly running `asv` commands from the `benchmarks/` directory (check + whether environment setup has any extra dependencies - see + [Benchmark environments](#benchmark-environments)). + +### Reducing run time + +A significant portion of benchmark run time is environment management. Run-time +can be reduced by co-locating the benchmark environment and your +[Conda package cache](https://docs.conda.io/projects/conda/en/latest/user-guide/configuration/custom-env-and-pkg-locations.html) +on the same [file system](https://en.wikipedia.org/wiki/File_system), if they +are not already. This can be done in several ways: + +- Temporarily reconfiguring `env_parent` in + [`_asv_delegated_abc`](_asv_delegated_abc.py) to reference a location on the same + file system as the Conda package cache. +- Using an alternative Conda package cache location during the benchmark run, + e.g. via the `$CONDA_PKGS_DIRS` environment variable. +- Moving your repo checkout to the same file system as the Conda package cache. + +### Environment variables + +* `OVERRIDE_TEST_DATA_REPOSITORY` - required - some benchmarks use +`iris-test-data` content, and your local `site.cfg` is not available for +benchmark scripts. The benchmark runner defers to any value already set in +the shell, but will otherwise download `iris-test-data` and set the variable +accordingly. +* `DATA_GEN_PYTHON` - required - path to a Python executable that can be +used to generate benchmark test objects/files; see +[Data generation](#data-generation). The benchmark runner sets this +automatically, but will defer to any value already set in the shell. 
Note that +[Mule](https://github.com/metomi/mule) will be automatically installed into +this environment, and sometimes +[iris-test-data](https://github.com/SciTools/iris-test-data) (see +`OVERRIDE_TEST_DATA_REPOSITORY`). +* `BENCHMARK_DATA` - optional - path to a directory for benchmark synthetic +test data, which the benchmark scripts will create if it doesn't already +exist. Defaults to `/benchmarks/.data/` if not set. Note that some of +the generated files, especially in the 'SPerf' suite, are many GB in size so +plan accordingly. +* `ON_DEMAND_BENCHMARKS` - optional - when set (to any value): benchmarks +decorated with `@on_demand_benchmark` are included in the ASV run. Usually +coupled with the ASV `--bench` argument to only run the benchmark(s) of +interest. Is set during the benchmark runner `cperf` and `sperf` sub-commands. +* `ASV_COMMIT_ENVS` - optional - instruct the +[delegated environment management](#benchmark-environments) to create a +dedicated environment for each commit being benchmarked when set (to any +value). This means that benchmarking commits with different environment +requirements will not be delayed by repeated environment setup - especially +relevant given the [benchmark runner](bm_runner.py)'s use of +[--interleave-rounds](https://asv.readthedocs.io/en/stable/commands.html?highlight=interleave-rounds#asv-run), +or any time you know you will repeatedly benchmark the same commit. **NOTE:** +SciTools environments tend to be large so this option can consume a lot of disk +space. + +## Writing benchmarks + +[See the ASV docs](https://asv.readthedocs.io/) for full detail. + +### What benchmarks to write + +It is not possible to maintain a full suite of 'unit style' benchmarks: + +* Benchmarks take longer to run than tests. +* Small benchmarks are more vulnerable to noise - they report a lot of false +positive regressions.
+ +We therefore recommend writing benchmarks representing scripts or single +operations that are likely to be run at the user level. + +The drawback of this approach: a reported regression is less likely to reveal +the root cause (e.g. if a commit caused a regression in coordinate-creation +time, but the only benchmark covering this was for file-loading). Be prepared +for manual investigations; and consider committing any useful benchmarks as +[on-demand benchmarks](#on-demand-benchmarks) for future developers to use. + +### Data generation + +**Important:** be sure not to use the benchmarking environment to generate any +test objects/files, as this environment changes with each commit being +benchmarked, creating inconsistent benchmark 'conditions'. The +[generate_data](./benchmarks/generate_data/__init__.py) module offers a +solution; read more detail there. + +### ASV re-run behaviour + +Note that ASV re-runs a benchmark multiple times between its `setup()` routine. +This is a problem for benchmarking certain SciTools operations such as data +realisation, since the data will no longer be lazy after the first run. +Consider writing extra steps to restore objects' original state _within_ the +benchmark itself. + +If adding steps to the benchmark will skew the result too much then re-running +can be disabled by setting an attribute on the benchmark: `number = 1`. To +maintain result accuracy this should be accompanied by increasing the number of +repeats _between_ `setup()` calls using the `repeat` attribute. +`warmup_time = 0` is also advisable since ASV performs independent re-runs to +estimate run-time, and these will still be subject to the original problem. +The `@disable_repeat_between_setup` decorator in +[`benchmarks/__init__.py`](benchmarks/__init__.py) offers a convenience for +all this. + +### Custom benchmarks + +SciTools benchmarking implements custom benchmark types, such as a `tracemalloc` +benchmark to measure memory growth. 
See [custom_bms/](./custom_bms) for more +detail. + +### Scaling / non-Scaling Performance Differences + +**(We no longer advocate the below for benchmarks run during CI, given the +limited available runtime and risk of false-positives. It remains useful for +manual investigations).** + +When comparing performance between commits/file-type/whatever it can be helpful +to know if the differences exist in scaling or non-scaling parts of the +operation under test. This can be done using a size parameter, setting +one value to be as small as possible (e.g. a scalar value), and the other to +be significantly larger (e.g. a 1000x1000 array). Performance differences +might only be seen for the larger value, or the smaller, or both, getting you +closer to the root cause. + +### On-demand benchmarks + +Some benchmarks provide useful insight but are inappropriate to be included in +a benchmark run by default, e.g. those with long run-times or requiring a local +file. These benchmarks should be decorated with `@on_demand_benchmark` +(see [benchmarks init](./benchmarks/__init__.py)), which +sets the benchmark to only be included in a run when the `ON_DEMAND_BENCHMARKS` +environment variable is set. Examples include the CPerf and SPerf benchmark +suites for the UK Met Office NG-VAT project. + +## Benchmark environments + +We have disabled ASV's standard environment management, instead using an +environment built using the same scripts that set up the package test +environments. +This is done using ASV's plugin architecture - see +[`asv_delegated.py`](asv_delegated.py) and associated +references in [`asv.conf.json`](asv.conf.json) (`environment_type` and +`plugins`). + +(ASV is written to control the environment(s) that benchmarks are run in - +minimising external factors and also allowing it to compare between a matrix +of dependencies (each in a separate environment). 
We have chosen to sacrifice +these features in favour of testing each commit with its intended dependencies, +controlled by the test environment setup script(s)). diff --git a/benchmarks/_asv_delegated_abc.py b/benchmarks/_asv_delegated_abc.py new file mode 100644 index 00000000000..0546a3c6a2d --- /dev/null +++ b/benchmarks/_asv_delegated_abc.py @@ -0,0 +1,249 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""ASV plug-in providing an alternative :class:`asv.environments.Environment` subclass. + +Preps an environment via custom user scripts, then uses that as the +benchmarking environment. + +This module is intended as the generic code that can be shared between +repositories. Providing a functional benchmarking environment relies on correct +subclassing of the :class:`_DelegatedABC` class to specialise it for the repo in +question. The parent and subclass are separated into their own dedicated files, +which isolates ALL repo-specific code to a single file, thus simplifying the +templating process. + +""" + +from abc import ABC, abstractmethod +from contextlib import contextmanager, suppress +from os import environ +from pathlib import Path +import sys + +from asv.console import log +from asv.environment import Environment, EnvironmentUnavailable +from asv.repo import Repo + + +class _DelegatedABC(Environment, ABC): + """Manage a benchmark environment using custom user scripts, run at each commit. + + Ignores user input variations - ``matrix`` / ``pythons`` / + ``exclude``, since environment is being managed outside ASV. + + A vanilla :class:`asv.environment.Environment` is created for containing + the expected ASV configuration files and checked-out project. The actual + 'functional' environment is created/updated using + :meth:`_prep_env_override`, then the location is recorded via + a symlink within the ASV environment. 
The symlink is used as the + environment path used for any executable calls (e.g. + ``python my_script.py``). + + Intended as the generic parent class that can be shared between + repositories. Providing a functional benchmarking environment relies on + correct subclassing of this class to specialise it for the repo in question. + + Warnings + -------- + :class:`_DelegatedABC` is an abstract base class. It MUST ONLY be used via + subclasses implementing their own :meth:`_prep_env_override`, and also + :attr:`tool_name`, which must be unique. + + """ + + tool_name = "delegated-ABC" + """Required by ASV as a unique identifier of the environment type.""" + + DELEGATED_LINK_NAME = "delegated_env" + """The name of the symlink to the delegated environment.""" + + COMMIT_ENVS_VAR = "ASV_COMMIT_ENVS" + """Env var that instructs a dedicated environment be created per commit.""" + + def __init__(self, conf, python, requirements, tagged_env_vars): + """Get a 'delegated' environment based on the given ASV config object. + + Parameters + ---------- + conf : dict + ASV configuration object. + + python : str + Ignored - environment management is delegated. The value is always + ``DELEGATED``. + + requirements : dict (str -> str) + Ignored - environment management is delegated. The value is always + an empty dict. + + tagged_env_vars : dict (tag, key) -> value + Ignored - environment management is delegated. The value is always + an empty dict. + + Raises + ------ + EnvironmentUnavailable + The original environment or delegated environment cannot be created. + + """ + ignored = [] + if python: + ignored.append(f"{python=}") + if requirements: + ignored.append(f"{requirements=}") + if tagged_env_vars: + ignored.append(f"{tagged_env_vars=}") + message = ( + f"Ignoring ASV setting(s): {', '.join(ignored)}. Benchmark " + "environment management is delegated to third party script(s)." 
+ ) + log.warning(message) + self._python = "DELEGATED" + self._requirements = {} + self._tagged_env_vars = {} + super().__init__( + conf, + self._python, + self._requirements, + self._tagged_env_vars, + ) + + self._path_undelegated = Path(self._path) + """Preserves the 'true' path of the environment so that self._path can + be safely modified and restored.""" + + @property + def _path_delegated(self) -> Path: + """The path of the symlink to the delegated environment.""" + return self._path_undelegated / self.DELEGATED_LINK_NAME + + @property + def _delegated_found(self) -> bool: + """Whether self._path_delegated successfully resolves to a directory.""" + resolved = None + with suppress(FileNotFoundError): + resolved = self._path_delegated.resolve(strict=True) + result = resolved is not None and resolved.is_dir() + return result + + def _symlink_to_delegated(self, delegated_env_path: Path) -> None: + """Create the symlink to the delegated environment.""" + self._path_delegated.unlink(missing_ok=True) + self._path_delegated.parent.mkdir(parents=True, exist_ok=True) + self._path_delegated.symlink_to(delegated_env_path, target_is_directory=True) + assert self._delegated_found + + def _setup(self): + """Temporarily try to set the user's active env as the delegated env. + + Environment prep will be run anyway once ASV starts checking out + commits, but this step tries to provide a usable environment (with + python, etc.) at the moment that ASV expects it. + + """ + current_env = Path(sys.executable).parents[1] + message = ( + "Temporarily using user's active environment as benchmarking " + f"environment: {current_env} . " + ) + try: + self._symlink_to_delegated(current_env) + _ = self.find_executable("python") + except Exception: + message = ( + f"Delegated environment {self.name} not yet set up (unable to " + "determine current environment)." 
+ ) + self._path_delegated.unlink(missing_ok=True) + + message += "Correct environment will be set up at the first commit checkout." + log.warning(message) + + @abstractmethod + def _prep_env_override(self, env_parent_dir: Path) -> Path: + """Run aspects of :meth:`_prep_env` that vary between repos. + + This is the method that is expected to do the preparing + (:meth:`_prep_env` only performs pre- and post- steps). MUST be + overridden in any subclass environments before they will work. + + Parameters + ---------- + env_parent_dir : Path + The directory that the prepared environment should be placed in. + + Returns + ------- + Path + The path to the prepared environment. + """ + pass + + def _prep_env(self, commit_hash: str) -> None: + """Prepare the delegated environment for the given commit hash.""" + message = ( + f"Running delegated environment management for: {self.name} " + f"at commit: {commit_hash[:8]}" + ) + log.info(message) + + env_parent = Path(self._env_dir).resolve() + new_env_per_commit = self.COMMIT_ENVS_VAR in environ + if new_env_per_commit: + env_parent = env_parent / commit_hash[:8] + + delegated_env_path = self._prep_env_override(env_parent) + assert delegated_env_path.is_relative_to(env_parent) + + # Record the environment's path via a symlink within this environment. 
+ self._symlink_to_delegated(delegated_env_path) + + message = f"Environment {self.name} updated to spec at {commit_hash[:8]}" + log.info(message) + + def checkout_project(self, repo: Repo, commit_hash: str) -> None: + """Check out the working tree of the project at given commit hash.""" + super().checkout_project(repo, commit_hash) + self._prep_env(commit_hash) + + @contextmanager + def _delegate_path(self): + """Context manager to use the delegated env path as this env's path.""" + if not self._delegated_found: + message = f"Delegated environment not found at: {self._path_delegated}" + log.error(message) + raise EnvironmentUnavailable(message) + + try: + self._path = str(self._path_delegated) + yield + finally: + self._path = str(self._path_undelegated) + + def find_executable(self, executable): + """Find an executable (e.g. python, pip) in the DELEGATED environment. + + Raises + ------ + OSError + If the executable is not found in the environment. + """ + if not self._delegated_found: + # Required during environment setup. OSError expected if executable + # not found. + raise OSError + + with self._delegate_path(): + return super().find_executable(executable) + + def run_executable(self, executable, args, **kwargs): + """Run a given executable (e.g. python, pip) in the DELEGATED environment.""" + with self._delegate_path(): + return super().run_executable(executable, args, **kwargs) + + def run(self, args, **kwargs): + # This is not a specialisation - just implementing the abstract method. 
+ log.debug(f"Running '{' '.join(args)}' in {self.name}") + return self.run_executable("python", args, **kwargs) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 00000000000..bc0f6e55e35 --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,27 @@ +{ + "version": 1, + "project": "scitools-iris", + "project_url": "https://github.com/SciTools/iris", + "repo": "..", + "environment_type": "delegated", + "show_commit_url": "https://github.com/scitools/iris/commit/", + "branches": ["upstream/main"], + + "benchmark_dir": "./benchmarks", + "env_dir": ".asv/env", + "results_dir": ".asv/results", + "html_dir": ".asv/html", + "plugins": [".asv_delegated"], + + "command_comment": [ + "The inherited setup of the Iris test environment takes care of ", + "Iris-installation too, and in the case of Iris no specialised ", + "uninstall or build commands are needed to get it working either.", + + "We do however need to install the custom benchmarks for them to be", + "usable." + ], + "install_command": [], + "uninstall_command": [], + "build_command": ["python {conf_dir}/custom_bms/install.py"] +} diff --git a/benchmarks/asv_delegated.py b/benchmarks/asv_delegated.py new file mode 100644 index 00000000000..85ba432ac75 --- /dev/null +++ b/benchmarks/asv_delegated.py @@ -0,0 +1,153 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Repository-specific adaptation of :mod:`_asv_delegated_abc`.""" + +import ast +import enum +from os import environ +from os.path import getmtime +from pathlib import Path +import re + +from asv import util as asv_util + +from _asv_delegated_abc import _DelegatedABC + + +class Delegated(_DelegatedABC): + """Specialism of :class:`_DelegatedABC` for benchmarking this repo.""" + + tool_name = "delegated" + + def _prep_env_override(self, env_parent_dir: Path) -> Path: + """Environment preparation specialised for this repo. + + Scans the checked-out commit of Iris to work out the appropriate + preparation command, including gathering any extra information that said + command needs. + + Parameters + ---------- + env_parent_dir : Path + The directory that the prepared environment should be placed in. + + Returns + ------- + Path + The path to the prepared environment. + """ + # The project checkout. + build_dir = Path(self._build_root) / self._repo_subdir + + # Older iterations of setup.py are incompatible with setuptools>=80. + # (Most dependencies are protected by lock-files, but build + # dependencies in pyproject.toml are independent). 
+ setup_py = build_dir / "setup.py" + pyproject = build_dir / "pyproject.toml" + if setup_py.is_file() and "setuptools.command.develop" in setup_py.read_text(): + with pyproject.open("r+") as file_write: + lines = file_write.readlines() + for i, line in enumerate(lines): + if line == "requires = [\n": + next_line = lines[i + 1] + indent = next_line[: len(next_line) - len(next_line.lstrip())] + + lines.insert(i + 1, f'{indent}"setuptools<80",\n') + break + file_write.seek(0) + file_write.writelines(lines) + + class Mode(enum.Enum): + """The scenarios where the correct env setup script is known.""" + + NOX = enum.auto() + """``PY_VER=x.xx nox --session=tests --install-only`` is supported.""" + + mode = None + + noxfile = build_dir / "noxfile.py" + if noxfile.is_file(): + # Our noxfile originally did not support `--install-only` - you + # could either run the tests, or run nothing at all. Adding + # `run_always` to `prepare_venv` enabled environment setup without + # running tests. + noxfile_tree = ast.parse(source=noxfile.read_text()) + prep_session = next( + filter( + lambda node: getattr(node, "name", "") == "prepare_venv", + ast.walk(noxfile_tree), + ) + ) + prep_session_code = ast.unparse(prep_session) + if ( + "session.run(" not in prep_session_code + and "session.run_always(" in prep_session_code + ): + mode = Mode.NOX + + match mode: + # Just NOX for now but the architecture is here for future cases. + case Mode.NOX: + # Need to determine a single Python version to run with. + # req_dir = build_dir / "ci" / "requirements" + # lockfile_dir = req_dir / "locks" + # if not lockfile_dir.is_dir(): + # lockfile_dir = req_dir / "ci" / "nox.lock" + + # if not lockfile_dir.is_dir(): + # message = f"No lockfile directory found in the expected locations, got '{lockfile_dir}'." 
+ # raise FileNotFoundError(message) + + # def py_ver_from_lockfiles(lockfile: Path) -> str: + # pattern = re.compile(r"py(\d+)-") + # search = pattern.search(lockfile.name) + # assert search is not None + # version = search.group(1) + # return f"{version[0]}.{version[1:]}" + + # python_versions = [ + # py_ver_from_lockfiles(lockfile) + # for lockfile in lockfile_dir.glob("*.lock") + # ] + # python_version = max(python_versions) + python_version = "3.13" + + # Construct and run the environment preparation command. + local_envs = dict(environ) + local_envs["PY_VER"] = python_version + # Prevent Nox re-using env with wrong Python version. + env_parent_dir = ( + env_parent_dir / f"nox{python_version.replace('.', '')}" + ) + env_command = [ + "nox", + f"--envdir={env_parent_dir}", + "--session=tests", + "--install-only", + "--no-error-on-external-run", + "--verbose", + ] + _ = asv_util.check_output( + env_command, + timeout=self._install_timeout, + cwd=build_dir, + env=local_envs, + ) + + env_parent_contents = list(env_parent_dir.iterdir()) + if len(env_parent_contents) != 1: + message = ( + f"{env_parent_dir} contains {len(env_parent_contents)} " + "items, expected 1. Cannot determine the environment " + "directory." + ) + raise FileNotFoundError(message) + (delegated_env_path,) = env_parent_contents + + case _: + message = "No environment setup is known for this commit of Iris." 
+ raise NotImplementedError(message) + + return delegated_env_path diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 00000000000..b8a54f71a42 --- /dev/null +++ b/benchmarks/benchmarks/__init__.py @@ -0,0 +1,74 @@ +import itertools +import os + +import numpy as np + +_counter = itertools.count() + + +def parameterized(names, params): + def decorator(func): + func.param_names = names + func.params = params + return func + + return decorator + + +def requires_dask(): + try: + import dask # noqa: F401 + except ImportError as err: + raise NotImplementedError() from err + + +def requires_sparse(): + try: + import sparse # noqa: F401 + except ImportError as err: + raise NotImplementedError() from err + + +def randn(shape, frac_nan=None, chunks=None, seed=0): + rng = np.random.default_rng(seed) + if chunks is None: + x = rng.standard_normal(shape) + else: + import dask.array as da + + rng = da.random.default_rng(seed) + x = rng.standard_normal(shape, chunks=chunks) + + if frac_nan is not None: + inds = rng.choice(range(x.size), int(x.size * frac_nan)) + x.flat[inds] = np.nan + + return x + + +def randint(low, high=None, size=None, frac_minus=None, seed=0): + rng = np.random.default_rng(seed) + x = rng.integers(low, high, size) + if frac_minus is not None: + inds = rng.choice(range(x.size), int(x.size * frac_minus)) + x.flat[inds] = -1 + + return x + + +def _skip_slow(): + """ + Use this function to skip slow or highly demanding tests. + + Use it as a `Class.setup` method or a `function.setup` attribute. + + Examples + -------- + >>> from . import _skip_slow + >>> def time_something_slow(): + ... pass + ... 
+ >>> time_something.setup = _skip_slow + """ + if os.environ.get("ASV_SKIP_SLOW", "0") == "1": + raise NotImplementedError("Skipping this test...") diff --git a/benchmarks/benchmarks/accessors.py b/benchmarks/benchmarks/accessors.py new file mode 100644 index 00000000000..259c06160ac --- /dev/null +++ b/benchmarks/benchmarks/accessors.py @@ -0,0 +1,25 @@ +import numpy as np + +import xarray as xr + +from . import parameterized + +NTIME = 365 * 30 + + +@parameterized(["calendar"], [("standard", "noleap")]) +class DateTimeAccessor: + def setup(self, calendar): + np.random.randn(NTIME) + time = xr.date_range("2000", periods=30 * 365, calendar=calendar) + data = np.ones((NTIME,)) + self.da = xr.DataArray(data, dims="time", coords={"time": time}) + + def time_dayofyear(self, calendar): + _ = self.da.time.dt.dayofyear + + def time_year(self, calendar): + _ = self.da.time.dt.year + + def time_floor(self, calendar): + _ = self.da.time.dt.floor("D") diff --git a/benchmarks/benchmarks/alignment.py b/benchmarks/benchmarks/alignment.py new file mode 100644 index 00000000000..5a6ee3fa0a6 --- /dev/null +++ b/benchmarks/benchmarks/alignment.py @@ -0,0 +1,54 @@ +import numpy as np + +import xarray as xr + +from . 
import parameterized, requires_dask + +ntime = 365 * 30 +nx = 50 +ny = 50 + +rng = np.random.default_rng(0) + + +class Align: + def setup(self, *args, **kwargs): + data = rng.standard_normal((ntime, nx, ny)) + self.ds = xr.Dataset( + {"temperature": (("time", "x", "y"), data)}, + coords={ + "time": xr.date_range("2000", periods=ntime), + "x": np.arange(nx), + "y": np.arange(ny), + }, + ) + self.year = self.ds.time.dt.year + self.idx = np.unique(rng.integers(low=0, high=ntime, size=ntime // 2)) + self.year_subset = self.year.isel(time=self.idx) + + @parameterized(["join"], [("outer", "inner", "left", "right", "exact", "override")]) + def time_already_aligned(self, join): + xr.align(self.ds, self.year, join=join) + + @parameterized(["join"], [("outer", "inner", "left", "right")]) + def time_not_aligned(self, join): + xr.align(self.ds, self.year[-100:], join=join) + + @parameterized(["join"], [("outer", "inner", "left", "right")]) + def time_not_aligned_random_integers(self, join): + xr.align(self.ds, self.year_subset, join=join) + + +class AlignCFTime(Align): + def setup(self, *args, **kwargs): + super().setup() + self.ds["time"] = xr.date_range("2000", periods=ntime, calendar="noleap") + self.year = self.ds.time.dt.year + self.year_subset = self.year.isel(time=self.idx) + + +class AlignDask(Align): + def setup(self, *args, **kwargs): + requires_dask() + super().setup() + self.ds = self.ds.chunk({"time": 100}) diff --git a/benchmarks/benchmarks/coding.py b/benchmarks/benchmarks/coding.py new file mode 100644 index 00000000000..c39555243c0 --- /dev/null +++ b/benchmarks/benchmarks/coding.py @@ -0,0 +1,18 @@ +import numpy as np + +import xarray as xr + +from . 
import parameterized + + +@parameterized(["calendar"], [("standard", "noleap")]) +class EncodeCFDatetime: + def setup(self, calendar): + self.units = "days since 2000-01-01" + self.dtype = np.dtype("int64") + self.times = xr.date_range( + "2000", freq="D", periods=10000, calendar=calendar + ).values + + def time_encode_cf_datetime(self, calendar): + xr.coding.times.encode_cf_datetime(self.times, self.units, calendar, self.dtype) diff --git a/benchmarks/benchmarks/combine.py b/benchmarks/benchmarks/combine.py new file mode 100644 index 00000000000..772d888306c --- /dev/null +++ b/benchmarks/benchmarks/combine.py @@ -0,0 +1,79 @@ +import numpy as np + +import xarray as xr + +from . import requires_dask + + +class Combine1d: + """Benchmark concatenating and merging large datasets""" + + def setup(self) -> None: + """Create 2 datasets with two different variables""" + + t_size = 8000 + t = np.arange(t_size) + data = np.random.randn(t_size) + + self.dsA0 = xr.Dataset({"A": xr.DataArray(data, coords={"T": t}, dims=("T"))}) + self.dsA1 = xr.Dataset( + {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T"))} + ) + + def time_combine_by_coords(self) -> None: + """Also has to load and arrange t coordinate""" + datasets = [self.dsA0, self.dsA1] + + xr.combine_by_coords(datasets) + + +class Combine1dDask(Combine1d): + """Benchmark concatenating and merging large datasets""" + + def setup(self) -> None: + """Create 2 datasets with two different variables""" + requires_dask() + + t_size = 8000 + t = np.arange(t_size) + var = xr.Variable(dims=("T",), data=np.random.randn(t_size)).chunk() + + data_vars = {f"long_name_{v}": ("T", var) for v in range(500)} + + self.dsA0 = xr.Dataset(data_vars, coords={"T": t}) + self.dsA1 = xr.Dataset(data_vars, coords={"T": t + t_size}) + + +class Combine3d: + """Benchmark concatenating and merging large datasets""" + + def setup(self): + """Create 4 datasets with two different variables""" + + t_size, x_size, y_size = 50, 450, 400 + t = 
np.arange(t_size) + data = np.random.randn(t_size, x_size, y_size) + + self.dsA0 = xr.Dataset( + {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))} + ) + self.dsA1 = xr.Dataset( + {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} + ) + self.dsB0 = xr.Dataset( + {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))} + ) + self.dsB1 = xr.Dataset( + {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} + ) + + def time_combine_nested(self): + datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]] + + xr.combine_nested(datasets, concat_dim=[None, "T"]) + + def time_combine_by_coords(self): + """Also has to load and arrange t coordinate""" + datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1] + + xr.combine_by_coords(datasets) diff --git a/benchmarks/benchmarks/dataarray_missing.py b/benchmarks/benchmarks/dataarray_missing.py new file mode 100644 index 00000000000..83de65b7fe4 --- /dev/null +++ b/benchmarks/benchmarks/dataarray_missing.py @@ -0,0 +1,72 @@ +import pandas as pd + +import xarray as xr + +from . 
import parameterized, randn, requires_dask + + +def make_bench_data(shape, frac_nan, chunks): + vals = randn(shape, frac_nan) + coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])} + da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords) + + if chunks is not None: + da = da.chunk(chunks) + + return da + + +class DataArrayMissingInterpolateNA: + def setup(self, shape, chunks, limit): + if chunks is not None: + requires_dask() + self.da = make_bench_data(shape, 0.1, chunks) + + @parameterized( + ["shape", "chunks", "limit"], + ( + [(365, 75, 75)], + [None, {"x": 25, "y": 25}], + [None, 3], + ), + ) + def time_interpolate_na(self, shape, chunks, limit): + actual = self.da.interpolate_na(dim="time", method="linear", limit=limit) + + if chunks is not None: + actual = actual.compute() + + +class DataArrayMissingBottleneck: + def setup(self, shape, chunks, limit): + if chunks is not None: + requires_dask() + self.da = make_bench_data(shape, 0.1, chunks) + + @parameterized( + ["shape", "chunks", "limit"], + ( + [(365, 75, 75)], + [None, {"x": 25, "y": 25}], + [None, 3], + ), + ) + def time_ffill(self, shape, chunks, limit): + actual = self.da.ffill(dim="time", limit=limit) + + if chunks is not None: + actual = actual.compute() + + @parameterized( + ["shape", "chunks", "limit"], + ( + [(365, 75, 75)], + [None, {"x": 25, "y": 25}], + [None, 3], + ), + ) + def time_bfill(self, shape, chunks, limit): + actual = self.da.bfill(dim="time", limit=limit) + + if chunks is not None: + actual = actual.compute() diff --git a/benchmarks/benchmarks/dataset.py b/benchmarks/benchmarks/dataset.py new file mode 100644 index 00000000000..d8a6d6df9d8 --- /dev/null +++ b/benchmarks/benchmarks/dataset.py @@ -0,0 +1,32 @@ +import numpy as np + +from xarray import Dataset + +from . 
import requires_dask + + +class DatasetBinaryOp: + def setup(self): + self.ds = Dataset( + { + "a": (("x", "y"), np.ones((300, 400))), + "b": (("x", "y"), np.ones((300, 400))), + } + ) + self.mean = self.ds.mean() + self.std = self.ds.std() + + def time_normalize(self): + (self.ds - self.mean) / self.std + + +class DatasetChunk: + def setup(self): + requires_dask() + self.ds = Dataset() + array = np.ones(1000) + for i in range(250): + self.ds[f"var{i}"] = ("x", array) + + def time_chunk(self): + self.ds.chunk(x=(1,) * 1000) diff --git a/benchmarks/benchmarks/dataset_io.py b/benchmarks/benchmarks/dataset_io.py new file mode 100644 index 00000000000..b8afabe802e --- /dev/null +++ b/benchmarks/benchmarks/dataset_io.py @@ -0,0 +1,755 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass + +import numpy as np +import pandas as pd + +import xarray as xr + +from . import _skip_slow, parameterized, randint, randn, requires_dask + +try: + import dask + import dask.multiprocessing +except ImportError: + pass + +os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" + +_ENGINES = tuple(xr.backends.list_engines().keys() - {"store"}) + + +class IOSingleNetCDF: + """ + A few examples that benchmark reading/writing a single netCDF file with + xarray + """ + + timeout = 300.0 + repeat = 1 + number = 5 + + def make_ds(self): + # single Dataset + self.ds = xr.Dataset() + self.nt = 1000 + self.nx = 90 + self.ny = 45 + + self.block_chunks = { + "time": self.nt / 4, + "lon": self.nx / 3, + "lat": self.ny / 3, + } + + self.time_chunks = {"time": int(self.nt / 36)} + + times = pd.date_range("1970-01-01", periods=self.nt, freq="D") + lons = xr.DataArray( + np.linspace(0, 360, self.nx), + dims=("lon",), + attrs={"units": "degrees east", "long_name": "longitude"}, + ) + lats = xr.DataArray( + np.linspace(-90, 90, self.ny), + dims=("lat",), + attrs={"units": "degrees north", "long_name": "latitude"}, + ) + self.ds["foo"] = xr.DataArray( + randn((self.nt, self.nx, 
self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="foo", + attrs={"units": "foo units", "description": "a description"}, + ) + self.ds["bar"] = xr.DataArray( + randn((self.nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="bar", + attrs={"units": "bar units", "description": "a description"}, + ) + self.ds["baz"] = xr.DataArray( + randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), + coords={"lon": lons, "lat": lats}, + dims=("lon", "lat"), + name="baz", + attrs={"units": "baz units", "description": "a description"}, + ) + + self.ds.attrs = {"history": "created for xarray benchmarking"} + + self.oinds = { + "time": randint(0, self.nt, 120), + "lon": randint(0, self.nx, 20), + "lat": randint(0, self.ny, 10), + } + self.vinds = { + "time": xr.DataArray(randint(0, self.nt, 120), dims="x"), + "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"), + "lat": slice(3, 20), + } + + +class IOWriteSingleNetCDF3(IOSingleNetCDF): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + self.format = "NETCDF3_64BIT" + self.make_ds() + + def time_write_dataset_netcdf4(self): + self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format) + + def time_write_dataset_scipy(self): + self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format) + + +class IOReadSingleNetCDF4(IOSingleNetCDF): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. 
+ _skip_slow() + + self.make_ds() + + self.filepath = "test_single_file.nc4.nc" + self.format = "NETCDF4" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_netcdf4(self): + xr.open_dataset(self.filepath, engine="netcdf4").load() + + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4") + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4") + ds = ds.isel(**self.vinds).load() + + +class IOReadSingleNetCDF3(IOReadSingleNetCDF4): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + self.make_ds() + + self.filepath = "test_single_file.nc3.nc" + self.format = "NETCDF3_64BIT" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_scipy(self): + xr.open_dataset(self.filepath, engine="scipy").load() + + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy") + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy") + ds = ds.isel(**self.vinds).load() + + +class IOReadSingleNetCDF4Dask(IOSingleNetCDF): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. 
+ _skip_slow() + + requires_dask() + + self.make_ds() + + self.filepath = "test_single_file.nc4.nc" + self.format = "NETCDF4" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_netcdf4_with_block_chunks(self): + xr.open_dataset( + self.filepath, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_netcdf4_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + + def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_time_chunks(self): + xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load() + + def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="netcdf4", chunks=self.time_chunks + ).load() + + +class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. 
+ _skip_slow() + + requires_dask() + + self.make_ds() + + self.filepath = "test_single_file.nc3.nc" + self.format = "NETCDF3_64BIT" + self.ds.to_netcdf(self.filepath, format=self.format) + + def time_load_dataset_scipy_with_block_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="scipy", chunks=self.block_chunks + ).load() + + def time_load_dataset_scipy_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_scipy_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + + def time_load_dataset_scipy_with_time_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_dataset( + self.filepath, engine="scipy", chunks=self.time_chunks + ).load() + + +class IOMultipleNetCDF: + """ + A few examples that benchmark reading/writing multiple netCDF files with + xarray + """ + + timeout = 300.0 + repeat = 1 + number = 5 + + def make_ds(self, nfiles=10): + # multiple Dataset + self.ds = xr.Dataset() + self.nt = 1000 + self.nx = 90 + self.ny = 45 + self.nfiles = nfiles + + self.block_chunks = { + "time": self.nt / 4, + "lon": self.nx / 3, + "lat": self.ny / 3, + } + + self.time_chunks = {"time": int(self.nt / 36)} + + self.time_vars = np.split( + pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles + ) + + self.ds_list = [] + self.filenames_list = [] + for i, times in enumerate(self.time_vars): + ds = xr.Dataset() + nt = len(times) + lons = xr.DataArray( + np.linspace(0, 360, self.nx), + dims=("lon",), + attrs={"units": "degrees east", "long_name": "longitude"}, + ) + lats = xr.DataArray( + np.linspace(-90, 90, self.ny), + dims=("lat",), + attrs={"units": "degrees north", "long_name": "latitude"}, + ) + ds["foo"] = xr.DataArray( + randn((nt, self.nx, self.ny), 
frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="foo", + attrs={"units": "foo units", "description": "a description"}, + ) + ds["bar"] = xr.DataArray( + randn((nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="bar", + attrs={"units": "bar units", "description": "a description"}, + ) + ds["baz"] = xr.DataArray( + randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), + coords={"lon": lons, "lat": lats}, + dims=("lon", "lat"), + name="baz", + attrs={"units": "baz units", "description": "a description"}, + ) + + ds.attrs = {"history": "created for xarray benchmarking"} + + self.ds_list.append(ds) + self.filenames_list.append(f"test_netcdf_{i}.nc") + + +class IOWriteMultipleNetCDF3(IOMultipleNetCDF): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + self.make_ds() + self.format = "NETCDF3_64BIT" + + def time_write_dataset_netcdf4(self): + xr.save_mfdataset( + self.ds_list, self.filenames_list, engine="netcdf4", format=self.format + ) + + def time_write_dataset_scipy(self): + xr.save_mfdataset( + self.ds_list, self.filenames_list, engine="scipy", format=self.format + ) + + +class IOReadMultipleNetCDF4(IOMultipleNetCDF): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + requires_dask() + + self.make_ds() + self.format = "NETCDF4" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_netcdf4(self): + xr.open_mfdataset(self.filenames_list, engine="netcdf4").load() + + def time_open_dataset_netcdf4(self): + xr.open_mfdataset(self.filenames_list, engine="netcdf4") + + +class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. 
+ # Improve times and remove errors. + _skip_slow() + + requires_dask() + + self.make_ds() + self.format = "NETCDF3_64BIT" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_scipy(self): + xr.open_mfdataset(self.filenames_list, engine="scipy").load() + + def time_open_dataset_scipy(self): + xr.open_mfdataset(self.filenames_list, engine="scipy") + + +class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + requires_dask() + + self.make_ds() + self.format = "NETCDF4" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_netcdf4_with_block_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ).load() + + def time_load_dataset_netcdf4_with_time_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ).load() + + def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ).load() + + def time_open_dataset_netcdf4_with_block_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ) + + def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.block_chunks + ) + + def time_open_dataset_netcdf4_with_time_chunks(self): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", 
chunks=self.time_chunks + ) + + def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="netcdf4", chunks=self.time_chunks + ) + + +class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + requires_dask() + + self.make_ds() + self.format = "NETCDF3_64BIT" + xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format) + + def time_load_dataset_scipy_with_block_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.block_chunks + ).load() + + def time_load_dataset_scipy_with_time_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.time_chunks + ).load() + + def time_open_dataset_scipy_with_block_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.block_chunks + ) + + def time_open_dataset_scipy_with_time_chunks(self): + with dask.config.set(scheduler="multiprocessing"): + xr.open_mfdataset( + self.filenames_list, engine="scipy", chunks=self.time_chunks + ) + + +def create_delayed_write(): + import dask.array as da + + vals = da.random.random(300, chunks=(1,)) + ds = xr.Dataset({"vals": (["a"], vals)}) + return ds.to_netcdf("file.nc", engine="netcdf4", compute=False) + + +class IONestedDataTree: + """ + A few examples that benchmark reading/writing a heavily nested netCDF datatree with + xarray + """ + + timeout = 300.0 + repeat = 1 + number = 5 + + def make_datatree(self, nchildren=10): + # multiple Dataset + self.ds = xr.Dataset() + self.nt = 1000 + self.nx = 90 + self.ny = 45 + self.nchildren = nchildren + + self.block_chunks = { + "time": 
self.nt / 4, + "lon": self.nx / 3, + "lat": self.ny / 3, + } + + self.time_chunks = {"time": int(self.nt / 36)} + + times = pd.date_range("1970-01-01", periods=self.nt, freq="D") + lons = xr.DataArray( + np.linspace(0, 360, self.nx), + dims=("lon",), + attrs={"units": "degrees east", "long_name": "longitude"}, + ) + lats = xr.DataArray( + np.linspace(-90, 90, self.ny), + dims=("lat",), + attrs={"units": "degrees north", "long_name": "latitude"}, + ) + self.ds["foo"] = xr.DataArray( + randn((self.nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="foo", + attrs={"units": "foo units", "description": "a description"}, + ) + self.ds["bar"] = xr.DataArray( + randn((self.nt, self.nx, self.ny), frac_nan=0.2), + coords={"lon": lons, "lat": lats, "time": times}, + dims=("time", "lon", "lat"), + name="bar", + attrs={"units": "bar units", "description": "a description"}, + ) + self.ds["baz"] = xr.DataArray( + randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32), + coords={"lon": lons, "lat": lats}, + dims=("lon", "lat"), + name="baz", + attrs={"units": "baz units", "description": "a description"}, + ) + + self.ds.attrs = {"history": "created for xarray benchmarking"} + + self.oinds = { + "time": randint(0, self.nt, 120), + "lon": randint(0, self.nx, 20), + "lat": randint(0, self.ny, 10), + } + self.vinds = { + "time": xr.DataArray(randint(0, self.nt, 120), dims="x"), + "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"), + "lat": slice(3, 20), + } + root = {f"group_{group}": self.ds for group in range(self.nchildren)} + nested_tree1 = { + f"group_{group}/subgroup_1": xr.Dataset() for group in range(self.nchildren) + } + nested_tree2 = { + f"group_{group}/subgroup_2": xr.DataArray(np.arange(1, 10)).to_dataset( + name="a" + ) + for group in range(self.nchildren) + } + nested_tree3 = { + f"group_{group}/subgroup_2/sub-subgroup_1": self.ds + for group in range(self.nchildren) + } + dtree = root 
| nested_tree1 | nested_tree2 | nested_tree3 + self.dtree = xr.DataTree.from_dict(dtree) + + +class IOReadDataTreeNetCDF4(IONestedDataTree): + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + requires_dask() + + self.make_datatree() + self.format = "NETCDF4" + self.filepath = "datatree.nc4.nc" + dtree = self.dtree + dtree.to_netcdf(filepath=self.filepath) + + def time_load_datatree_netcdf4(self): + xr.open_datatree(self.filepath, engine="netcdf4").load() + + def time_open_datatree_netcdf4(self): + xr.open_datatree(self.filepath, engine="netcdf4") + + +class IOWriteNetCDFDask: + timeout = 60 + repeat = 1 + number = 5 + + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + + requires_dask() + + self.write = create_delayed_write() + + def time_write(self): + self.write.compute() + + +class IOWriteNetCDFDaskDistributed: + def setup(self): + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. 
+ _skip_slow() + + requires_dask() + + try: + import distributed + except ImportError as err: + raise NotImplementedError() from err + + self.client = distributed.Client() + self.write = create_delayed_write() + + def cleanup(self): + self.client.shutdown() + + def time_write(self): + self.write.compute() + + +class IOReadSingleFile(IOSingleNetCDF): + def setup(self, *args, **kwargs): + self.make_ds() + + self.filepaths = {} + for engine in _ENGINES: + self.filepaths[engine] = f"test_single_file_with_{engine}.nc" + self.ds.to_netcdf(self.filepaths[engine], engine=engine) + + @parameterized(["engine", "chunks"], (_ENGINES, [None, {}])) + def time_read_dataset(self, engine, chunks): + xr.open_dataset(self.filepaths[engine], engine=engine, chunks=chunks) + + +class IOReadCustomEngine: + def setup(self, *args, **kwargs): + """ + The custom backend does the bare minimum to be considered a lazy backend. But + the data in it is still in memory so slow file reading shouldn't affect the + results. + """ + requires_dask() + + @dataclass + class PerformanceBackendArray(xr.backends.BackendArray): + filename_or_obj: str | os.PathLike | None + shape: tuple[int, ...] 
+ dtype: np.dtype + lock: xr.backends.locks.SerializableLock + + def __getitem__(self, key: tuple): + return xr.core.indexing.explicit_indexing_adapter( + key, + self.shape, + xr.core.indexing.IndexingSupport.BASIC, + self._raw_indexing_method, + ) + + def _raw_indexing_method(self, key: tuple): + raise NotImplementedError + + @dataclass + class PerformanceStore(xr.backends.common.AbstractWritableDataStore): + manager: xr.backends.CachingFileManager + mode: str | None = None + lock: xr.backends.locks.SerializableLock | None = None + autoclose: bool = False + + def __post_init__(self): + self.filename = self.manager._args[0] + + @classmethod + def open( + cls, + filename: str | os.PathLike | None, + mode: str = "r", + lock: xr.backends.locks.SerializableLock | None = None, + autoclose: bool = False, + ): + locker = lock or xr.backends.locks.SerializableLock() + + manager = xr.backends.CachingFileManager( + xr.backends.DummyFileManager, + filename, + mode=mode, + ) + return cls(manager, mode=mode, lock=locker, autoclose=autoclose) + + def load(self) -> tuple: + """ + Load a bunch of test data quickly. + + Normally this method would've opened a file and parsed it. + """ + n_variables = 2000 + + # Important to have a shape and dtype for lazy loading. + shape = (1000,) + dtype = np.dtype(int) + variables = { + f"long_variable_name_{v}": xr.Variable( + data=PerformanceBackendArray( + self.filename, shape, dtype, self.lock + ), + dims=("time",), + fastpath=True, + ) + for v in range(n_variables) + } + attributes = {} + + return variables, attributes + + class PerformanceBackend(xr.backends.BackendEntrypoint): + def open_dataset( + self, + filename_or_obj: str | os.PathLike | None, + drop_variables: tuple[str, ...] 
| None = None, + *, + mask_and_scale=True, + decode_times=True, + concat_characters=True, + decode_coords=True, + use_cftime=None, + decode_timedelta=None, + lock=None, + **kwargs, + ) -> xr.Dataset: + filename_or_obj = xr.backends.common._normalize_path(filename_or_obj) + store = PerformanceStore.open(filename_or_obj, lock=lock) + + store_entrypoint = xr.backends.store.StoreBackendEntrypoint() + + ds = store_entrypoint.open_dataset( + store, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + drop_variables=drop_variables, + use_cftime=use_cftime, + decode_timedelta=decode_timedelta, + ) + return ds + + self.engine = PerformanceBackend + + @parameterized(["chunks"], ([None, {}, {"time": 10}])) + def time_open_dataset(self, chunks): + """ + Time how fast xr.open_dataset is without the slow data reading part. + Test with and without dask. + """ + xr.open_dataset(None, engine=self.engine, chunks=chunks) diff --git a/benchmarks/benchmarks/datatree.py b/benchmarks/benchmarks/datatree.py new file mode 100644 index 00000000000..9f1774f60ac --- /dev/null +++ b/benchmarks/benchmarks/datatree.py @@ -0,0 +1,15 @@ +import xarray as xr +from xarray.core.datatree import DataTree + + +class Datatree: + def setup(self): + run1 = DataTree.from_dict({"run1": xr.Dataset({"a": 1})}) + self.d_few = {"run1": run1} + self.d_many = {f"run{i}": xr.Dataset({"a": 1}) for i in range(100)} + + def time_from_dict_few(self): + DataTree.from_dict(self.d_few) + + def time_from_dict_many(self): + DataTree.from_dict(self.d_many) diff --git a/benchmarks/benchmarks/groupby.py b/benchmarks/benchmarks/groupby.py new file mode 100644 index 00000000000..681fd6ed734 --- /dev/null +++ b/benchmarks/benchmarks/groupby.py @@ -0,0 +1,191 @@ +# import flox to avoid the cost of first import +import cftime +import flox.xarray # noqa: F401 +import numpy as np +import pandas as pd + +import xarray as xr + +from . 
import _skip_slow, parameterized, requires_dask + + +class GroupBy: + def setup(self, *args, **kwargs): + self.n = 100 + self.ds1d = xr.Dataset( + { + "a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]), + "b": xr.DataArray(np.arange(2 * self.n)), + "c": xr.DataArray(np.arange(2 * self.n)), + } + ) + self.ds2d = self.ds1d.expand_dims(z=10).copy() + self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds2d_mean = self.ds2d.groupby("b").mean() + + @parameterized(["ndim"], [(1, 2)]) + def time_init(self, ndim): + getattr(self, f"ds{ndim}d").groupby("b") + + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_small_num_groups(self, method, ndim, use_flox): + ds = getattr(self, f"ds{ndim}d") + with xr.set_options(use_flox=use_flox): + getattr(ds.groupby("a"), method)().compute() + + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_large_num_groups(self, method, ndim, use_flox): + ds = getattr(self, f"ds{ndim}d") + with xr.set_options(use_flox=use_flox): + getattr(ds.groupby("b"), method)().compute() + + def time_binary_op_1d(self): + (self.ds1d.groupby("b") - self.ds1d_mean).compute() + + def time_binary_op_2d(self): + (self.ds2d.groupby("b") - self.ds2d_mean).compute() + + def peakmem_binary_op_1d(self): + (self.ds1d.groupby("b") - self.ds1d_mean).compute() + + def peakmem_binary_op_2d(self): + (self.ds2d.groupby("b") - self.ds2d_mean).compute() + + +class GroupByDask(GroupBy): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + + self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2)) + self.ds1d["c"] = self.ds1d["c"].chunk({"dim_0": 50}) + self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)) + self.ds2d["c"] = self.ds2d["c"].chunk({"dim_0": 50, "z": 5}) + self.ds1d_mean = self.ds1d.groupby("b").mean().compute() + self.ds2d_mean = self.ds2d.groupby("b").mean().compute() + + +# TODO: These don't work now 
because we are calling `.compute` explicitly. +class GroupByPandasDataFrame(GroupBy): + """Run groupby tests using pandas DataFrame.""" + + def setup(self, *args, **kwargs): + # Skip testing in CI as it won't ever change in a commit: + _skip_slow() + + super().setup(**kwargs) + self.ds1d = self.ds1d.to_dataframe() + self.ds1d_mean = self.ds1d.groupby("b").mean() + + def time_binary_op_2d(self): + raise NotImplementedError + + def peakmem_binary_op_2d(self): + raise NotImplementedError + + +class GroupByDaskDataFrame(GroupBy): + """Run groupby tests using dask DataFrame.""" + + def setup(self, *args, **kwargs): + # Skip testing in CI as it won't ever change in a commit: + _skip_slow() + + requires_dask() + super().setup(**kwargs) + self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dask_dataframe() + self.ds1d_mean = self.ds1d.groupby("b").mean().compute() + + def time_binary_op_2d(self): + raise NotImplementedError + + def peakmem_binary_op_2d(self): + raise NotImplementedError + + +class Resample: + def setup(self, *args, **kwargs): + self.ds1d = xr.Dataset( + { + "b": ("time", np.arange(365.0 * 24)), + }, + coords={"time": pd.date_range("2001-01-01", freq="h", periods=365 * 24)}, + ) + self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.resample(time="48h").mean() + self.ds2d_mean = self.ds2d.resample(time="48h").mean() + + @parameterized(["ndim"], [(1, 2)]) + def time_init(self, ndim): + getattr(self, f"ds{ndim}d").resample(time="D") + + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_small_num_groups(self, method, ndim, use_flox): + ds = getattr(self, f"ds{ndim}d") + with xr.set_options(use_flox=use_flox): + getattr(ds.resample(time="3ME"), method)().compute() + + @parameterized( + ["method", "ndim", "use_flox"], [("sum", "mean"), (1, 2), (True, False)] + ) + def time_agg_large_num_groups(self, method, ndim, use_flox): + ds = getattr(self, f"ds{ndim}d") + with 
xr.set_options(use_flox=use_flox): + getattr(ds.resample(time="48h"), method)().compute() + + +class ResampleDask(Resample): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.ds1d = self.ds1d.chunk({"time": 50}) + self.ds2d = self.ds2d.chunk({"time": 50, "z": 4}) + + +class ResampleCFTime(Resample): + def setup(self, *args, **kwargs): + self.ds1d = xr.Dataset( + { + "b": ("time", np.arange(365.0 * 24)), + }, + coords={ + "time": xr.date_range( + "2001-01-01", freq="h", periods=365 * 24, calendar="noleap" + ) + }, + ) + self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.resample(time="48h").mean() + self.ds2d_mean = self.ds2d.resample(time="48h").mean() + + +@parameterized(["use_cftime", "use_flox"], [[True, False], [True, False]]) +class GroupByLongTime: + def setup(self, use_cftime, use_flox): + arr = np.random.randn(10, 10, 365 * 30) + time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime) + + # GH9426 - deep-copying CFTime object arrays is weirdly slow + asda = xr.DataArray(time) + labeled_time = [] + for year, month in zip(asda.dt.year, asda.dt.month, strict=True): + labeled_time.append(cftime.datetime(year, month, 1)) + + self.da = xr.DataArray( + arr, + dims=("y", "x", "time"), + coords={"time": time, "time2": ("time", labeled_time)}, + ) + + def time_setup(self, use_cftime, use_flox): + self.da.groupby("time.month") + + def time_mean(self, use_cftime, use_flox): + with xr.set_options(use_flox=use_flox): + self.da.groupby("time.year").mean() diff --git a/benchmarks/benchmarks/import.py b/benchmarks/benchmarks/import.py new file mode 100644 index 00000000000..f9d0bcc336b --- /dev/null +++ b/benchmarks/benchmarks/import.py @@ -0,0 +1,18 @@ +class Import: + """Benchmark importing xarray""" + + def timeraw_import_xarray(self): + return "import xarray" + + def timeraw_import_xarray_plot(self): + return "import xarray.plot" + + def timeraw_import_xarray_backends(self): + return """ + from 
xarray.backends import list_engines + list_engines() + """ + + def timeraw_import_xarray_only(self): + # import numpy and pandas in the setup stage + return "import xarray", "import numpy, pandas" diff --git a/benchmarks/benchmarks/indexing.py b/benchmarks/benchmarks/indexing.py new file mode 100644 index 00000000000..50bb8a5ee99 --- /dev/null +++ b/benchmarks/benchmarks/indexing.py @@ -0,0 +1,201 @@ +import os + +import numpy as np +import pandas as pd + +import xarray as xr + +from . import parameterized, randint, randn, requires_dask + +nx = 2000 +ny = 1000 +nt = 500 + +basic_indexes = { + "1scalar": {"x": 0}, + "1slice": {"x": slice(0, 3)}, + "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)}, + "2slicess-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)}, +} + +basic_assignment_values = { + "1scalar": 0, + "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]), + "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]), + "2slicess-1scalar": xr.DataArray( + randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"] + ), +} + +outer_indexes = { + "1d": {"x": randint(0, nx, 400)}, + "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)}, + "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)}, +} + +outer_assignment_values = { + "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]), + "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]), + "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]), +} + + +def make_vectorized_indexes(n_index): + return { + "1-1d": {"x": xr.DataArray(randint(0, nx, n_index), dims="a")}, + "2-1d": { + "x": xr.DataArray(randint(0, nx, n_index), dims="a"), + "y": xr.DataArray(randint(0, ny, n_index), dims="a"), + }, + "3-2d": { + "x": xr.DataArray( + randint(0, nx, n_index).reshape(n_index // 100, 100), dims=["a", "b"] + ), + "y": xr.DataArray( + randint(0, ny, n_index).reshape(n_index // 100, 100), 
dims=["a", "b"] + ), + "t": xr.DataArray( + randint(0, nt, n_index).reshape(n_index // 100, 100), dims=["a", "b"] + ), + }, + } + + +vectorized_indexes = make_vectorized_indexes(400) +big_vectorized_indexes = make_vectorized_indexes(400_000) + +vectorized_assignment_values = { + "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}), + "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}), + "3-2d": xr.DataArray( + randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)} + ), +} + + +class Base: + def setup(self, key): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)), + "var2": (("x", "t"), randn((nx, nt))), + "var3": (("t",), randn(nt)), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + # Benchmark how indexing is slowed down by adding many scalar variable + # to the dataset + # https://github.com/pydata/xarray/pull/9003 + self.ds_large = self.ds.merge({f"extra_var{i}": i for i in range(400)}) + + +class Indexing(Base): + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_indexing_basic(self, key): + self.ds.isel(**basic_indexes[key]).load() + + @parameterized(["key"], [list(outer_indexes.keys())]) + def time_indexing_outer(self, key): + self.ds.isel(**outer_indexes[key]).load() + + @parameterized(["key"], [list(vectorized_indexes.keys())]) + def time_indexing_vectorized(self, key): + self.ds.isel(**vectorized_indexes[key]).load() + + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_indexing_basic_ds_large(self, key): + # https://github.com/pydata/xarray/pull/9003 + self.ds_large.isel(**basic_indexes[key]).load() + + +class IndexingOnly(Base): + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_indexing_basic(self, key): + self.ds.isel(**basic_indexes[key]) + + @parameterized(["key"], 
[list(outer_indexes.keys())]) + def time_indexing_outer(self, key): + self.ds.isel(**outer_indexes[key]) + + @parameterized(["key"], [list(big_vectorized_indexes.keys())]) + def time_indexing_big_vectorized(self, key): + self.ds.isel(**big_vectorized_indexes[key]) + + +class Assignment(Base): + @parameterized(["key"], [list(basic_indexes.keys())]) + def time_assignment_basic(self, key): + ind = basic_indexes[key] + val = basic_assignment_values[key] + self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val + + @parameterized(["key"], [list(outer_indexes.keys())]) + def time_assignment_outer(self, key): + ind = outer_indexes[key] + val = outer_assignment_values[key] + self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val + + @parameterized(["key"], [list(vectorized_indexes.keys())]) + def time_assignment_vectorized(self, key): + ind = vectorized_indexes[key] + val = vectorized_assignment_values[key] + self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val + + +class IndexingDask(Indexing): + def setup(self, key): + requires_dask() + super().setup(key) + self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) + + +class BooleanIndexing: + # https://github.com/pydata/xarray/issues/2227 + def setup(self): + self.ds = xr.Dataset( + {"a": ("time", np.arange(10_000_000))}, + coords={"time": np.arange(10_000_000)}, + ) + self.time_filter = self.ds.time > 50_000 + + def time_indexing(self): + self.ds.isel(time=self.time_filter) + + +class HugeAxisSmallSliceIndexing: + # https://github.com/pydata/xarray/pull/4560 + def setup(self): + self.filepath = "test_indexing_huge_axis_small_slice.nc" + if not os.path.isfile(self.filepath): + xr.Dataset( + {"a": ("x", np.arange(10_000_000))}, + coords={"x": np.arange(10_000_000)}, + ).to_netcdf(self.filepath, format="NETCDF4") + + self.ds = xr.open_dataset(self.filepath) + + def time_indexing(self): + self.ds.isel(x=slice(100)) + + def cleanup(self): + self.ds.close() + + 
+class AssignmentOptimized: + # https://github.com/pydata/xarray/pull/7382 + def setup(self): + self.ds = xr.Dataset(coords={"x": np.arange(500_000)}) + self.da = xr.DataArray(np.arange(500_000), dims="x") + + def time_assign_no_reindex(self): + # assign with non-indexed DataArray of same dimension size + self.ds.assign(foo=self.da) + + def time_assign_identical_indexes(self): + # fastpath index comparison (same index object) + self.ds.assign(foo=self.ds.x) diff --git a/benchmarks/benchmarks/interp.py b/benchmarks/benchmarks/interp.py new file mode 100644 index 00000000000..ca1d0a2dd89 --- /dev/null +++ b/benchmarks/benchmarks/interp.py @@ -0,0 +1,65 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . import parameterized, randn, requires_dask + +nx = 1500 +ny = 1000 +nt = 500 + +randn_xy = randn((nx, ny), frac_nan=0.1) +randn_xt = randn((nx, nt)) +randn_t = randn((nt,)) + +new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100) +new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500) +new_y_long = np.linspace(0.1, 0.9, 500) + + +class Interpolation: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + "var4": (("z",), np.array(["text"])), + "var5": (("k",), np.array(["a", "b", "c"])), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + "z": np.array([1]), + "k": np.linspace(0, nx, 3), + }, + ) + + @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False])) + def time_interpolation_numeric_1d(self, method, is_short): + new_x = new_x_short if is_short else new_x_long + self.ds.interp(x=new_x, method=method).compute() + + @parameterized(["method"], (["linear", "nearest"])) + def time_interpolation_numeric_2d(self, method): + self.ds.interp(x=new_x_long, y=new_y_long, method=method).compute() + + 
@parameterized(["is_short"], ([True, False])) + def time_interpolation_string_scalar(self, is_short): + new_z = new_x_short if is_short else new_x_long + self.ds.interp(z=new_z).compute() + + @parameterized(["is_short"], ([True, False])) + def time_interpolation_string_1d(self, is_short): + new_k = new_x_short if is_short else new_x_long + self.ds.interp(k=new_k).compute() + + +class InterpolationDask(Interpolation): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.ds = self.ds.chunk({"t": 50}) diff --git a/benchmarks/benchmarks/merge.py b/benchmarks/benchmarks/merge.py new file mode 100644 index 00000000000..6c8c1e9da90 --- /dev/null +++ b/benchmarks/benchmarks/merge.py @@ -0,0 +1,77 @@ +import numpy as np + +import xarray as xr + + +class DatasetAddVariable: + param_names = ["existing_elements"] + params = [[0, 10, 100, 1000]] + + def setup(self, existing_elements): + self.datasets = {} + # Dictionary insertion is fast(er) than xarray.Dataset insertion + d = {} + for i in range(existing_elements): + d[f"var{i}"] = i + self.dataset = xr.merge([d]) + + d = {f"set_2_{i}": i for i in range(existing_elements)} + self.dataset2 = xr.merge([d]) + + def time_variable_insertion(self, existing_elements): + dataset = self.dataset + dataset["new_var"] = 0 + + def time_merge_two_datasets(self, existing_elements): + xr.merge([self.dataset, self.dataset2]) + + +class DatasetCreation: + # The idea here is to time how long it takes to go from numpy + # and python data types, to a full dataset + # See discussion + # https://github.com/pydata/xarray/issues/7224#issuecomment-1292216344 + param_names = ["strategy", "count"] + params = [ + ["dict_of_DataArrays", "dict_of_Variables", "dict_of_Tuples"], + [0, 1, 10, 100, 1000], + ] + + def setup(self, strategy, count): + data = np.array(["0", "b"], dtype=str) + self.dataset_coords = dict(time=np.array([0, 1])) + self.dataset_attrs = dict(description="Test data") + attrs = dict(units="Celsius") + if 
strategy == "dict_of_DataArrays": + + def create_data_vars(): + return { + f"long_variable_name_{i}": xr.DataArray( + data=data, dims=("time"), attrs=attrs + ) + for i in range(count) + } + + elif strategy == "dict_of_Variables": + + def create_data_vars(): + return { + f"long_variable_name_{i}": xr.Variable("time", data, attrs=attrs) + for i in range(count) + } + + elif strategy == "dict_of_Tuples": + + def create_data_vars(): + return { + f"long_variable_name_{i}": ("time", data, attrs) + for i in range(count) + } + + self.create_data_vars = create_data_vars + + def time_dataset_creation(self, strategy, count): + data_vars = self.create_data_vars() + xr.Dataset( + data_vars=data_vars, coords=self.dataset_coords, attrs=self.dataset_attrs + ) diff --git a/benchmarks/benchmarks/pandas.py b/benchmarks/benchmarks/pandas.py new file mode 100644 index 00000000000..ebe61081916 --- /dev/null +++ b/benchmarks/benchmarks/pandas.py @@ -0,0 +1,64 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . 
import parameterized, requires_dask + + +class MultiIndexSeries: + def setup(self, dtype, subset): + data = np.random.rand(100000).astype(dtype) + index = pd.MultiIndex.from_product( + [ + list("abcdefhijk"), + list("abcdefhijk"), + pd.date_range(start="2000-01-01", periods=1000, freq="D"), + ] + ) + series = pd.Series(data, index) + if subset: + series = series[::3] + self.series = series + + @parameterized(["dtype", "subset"], ([int, float], [True, False])) + def time_from_series(self, dtype, subset): + xr.DataArray.from_series(self.series) + + +class ToDataFrame: + def setup(self, *args, **kwargs): + xp = kwargs.get("xp", np) + nvars = kwargs.get("nvars", 1) + random_kws = kwargs.get("random_kws", {}) + method = kwargs.get("method", "to_dataframe") + + dim1 = 10_000 + dim2 = 10_000 + + var = xr.Variable( + dims=("dim1", "dim2"), data=xp.random.random((dim1, dim2), **random_kws) + ) + data_vars = {f"long_name_{v}": (("dim1", "dim2"), var) for v in range(nvars)} + + ds = xr.Dataset( + data_vars, coords={"dim1": np.arange(0, dim1), "dim2": np.arange(0, dim2)} + ) + self.to_frame = getattr(ds, method) + + def time_to_dataframe(self): + self.to_frame() + + def peakmem_to_dataframe(self): + self.to_frame() + + +class ToDataFrameDask(ToDataFrame): + def setup(self, *args, **kwargs): + requires_dask() + + import dask.array as da + + super().setup( + xp=da, random_kws=dict(chunks=5000), method="to_dask_dataframe", nvars=500 + ) diff --git a/benchmarks/benchmarks/polyfit.py b/benchmarks/benchmarks/polyfit.py new file mode 100644 index 00000000000..429ffa19baa --- /dev/null +++ b/benchmarks/benchmarks/polyfit.py @@ -0,0 +1,38 @@ +import numpy as np + +import xarray as xr + +from . 
import parameterized, randn, requires_dask + +NDEGS = (2, 5, 20) +NX = (10**2, 10**6) + + +class Polyval: + def setup(self, *args, **kwargs): + self.xs = {nx: xr.DataArray(randn((nx,)), dims="x", name="x") for nx in NX} + self.coeffs = { + ndeg: xr.DataArray( + randn((ndeg,)), dims="degree", coords={"degree": np.arange(ndeg)} + ) + for ndeg in NDEGS + } + + @parameterized(["nx", "ndeg"], [NX, NDEGS]) + def time_polyval(self, nx, ndeg): + x = self.xs[nx] + c = self.coeffs[ndeg] + xr.polyval(x, c).compute() + + @parameterized(["nx", "ndeg"], [NX, NDEGS]) + def peakmem_polyval(self, nx, ndeg): + x = self.xs[nx] + c = self.coeffs[ndeg] + xr.polyval(x, c).compute() + + +class PolyvalDask(Polyval): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(*args, **kwargs) + self.xs = {k: v.chunk({"x": 10000}) for k, v in self.xs.items()} diff --git a/benchmarks/benchmarks/reindexing.py b/benchmarks/benchmarks/reindexing.py new file mode 100644 index 00000000000..61e6b2213f3 --- /dev/null +++ b/benchmarks/benchmarks/reindexing.py @@ -0,0 +1,52 @@ +import numpy as np + +import xarray as xr + +from . 
import requires_dask + +ntime = 500 +nx = 50 +ny = 50 + + +class Reindex: + def setup(self): + data = np.random.default_rng(0).random((ntime, nx, ny)) + self.ds = xr.Dataset( + {"temperature": (("time", "x", "y"), data)}, + coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)}, + ) + + def time_1d_coarse(self): + self.ds.reindex(time=np.arange(0, ntime, 5)).load() + + def time_1d_fine_all_found(self): + self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load() + + def time_1d_fine_some_missing(self): + self.ds.reindex( + time=np.arange(0, ntime, 0.5), method="nearest", tolerance=0.1 + ).load() + + def time_2d_coarse(self): + self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load() + + def time_2d_fine_all_found(self): + self.ds.reindex( + x=np.arange(0, nx, 0.5), y=np.arange(0, ny, 0.5), method="nearest" + ).load() + + def time_2d_fine_some_missing(self): + self.ds.reindex( + x=np.arange(0, nx, 0.5), + y=np.arange(0, ny, 0.5), + method="nearest", + tolerance=0.1, + ).load() + + +class ReindexDask(Reindex): + def setup(self): + requires_dask() + super().setup() + self.ds = self.ds.chunk({"time": 100}) diff --git a/benchmarks/benchmarks/renaming.py b/benchmarks/benchmarks/renaming.py new file mode 100644 index 00000000000..3ade5d8df70 --- /dev/null +++ b/benchmarks/benchmarks/renaming.py @@ -0,0 +1,27 @@ +import numpy as np + +import xarray as xr + + +class SwapDims: + param_names = ["size"] + params = [[int(1e3), int(1e5), int(1e7)]] + + def setup(self, size: int) -> None: + self.ds = xr.Dataset( + {"a": (("x", "t"), np.ones((size, 2)))}, + coords={ + "x": np.arange(size), + "y": np.arange(size), + "z": np.arange(size), + "x2": ("x", np.arange(size)), + "y2": ("y", np.arange(size)), + "z2": ("z", np.arange(size)), + }, + ) + + def time_swap_dims(self, size: int) -> None: + self.ds.swap_dims({"x": "xn", "y": "yn", "z": "zn"}) + + def time_swap_dims_newindex(self, size: int) -> None: + self.ds.swap_dims({"x": "x2", "y": 
"y2", "z": "z2"}) diff --git a/benchmarks/benchmarks/repr.py b/benchmarks/benchmarks/repr.py new file mode 100644 index 00000000000..68a082fcc4f --- /dev/null +++ b/benchmarks/benchmarks/repr.py @@ -0,0 +1,87 @@ +import numpy as np +import pandas as pd + +import xarray as xr + + +class Repr: + def setup(self): + a = np.arange(0, 100) + data_vars = dict() + for i in a: + data_vars[f"long_variable_name_{i}"] = xr.DataArray( + name=f"long_variable_name_{i}", + data=np.arange(0, 20), + dims=[f"long_coord_name_{i}_x"], + coords={f"long_coord_name_{i}_x": np.arange(0, 20) * 2}, + ) + self.ds = xr.Dataset(data_vars) + self.ds.attrs = {f"attr_{k}": 2 for k in a} + + def time_repr(self): + repr(self.ds) + + def time_repr_html(self): + self.ds._repr_html_() + + +class ReprDataTree: + def setup(self): + # construct a datatree with 500 nodes + number_of_files = 20 + number_of_groups = 25 + tree_dict = {} + for f in range(number_of_files): + for g in range(number_of_groups): + tree_dict[f"file_{f}/group_{g}"] = xr.Dataset({"g": f * g}) + + self.dt = xr.DataTree.from_dict(tree_dict) + + def time_repr(self): + repr(self.dt) + + def time_repr_html(self): + self.dt._repr_html_() + + +class ReprMultiIndex: + def setup(self): + index = pd.MultiIndex.from_product( + [range(1000), range(1000)], names=("level_0", "level_1") + ) + series = pd.Series(range(1000 * 1000), index=index) + self.da = xr.DataArray(series) + + def time_repr(self): + repr(self.da) + + def time_repr_html(self): + self.da._repr_html_() + + +class ReprPandasRangeIndex: + # display a memory-saving pandas.RangeIndex shouldn't trigger memory + # expensive conversion into a numpy array + def setup(self): + index = xr.indexes.PandasIndex(pd.RangeIndex(1_000_000), "x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() + + +class ReprXarrayRangeIndex: + # display an Xarray RangeIndex shouldn't trigger memory 
expensive conversion + # of its lazy coordinate into a numpy array + def setup(self): + index = xr.indexes.RangeIndex.arange(1_000_000, dim="x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() diff --git a/benchmarks/benchmarks/rolling.py b/benchmarks/benchmarks/rolling.py new file mode 100644 index 00000000000..4fa2e09c9c0 --- /dev/null +++ b/benchmarks/benchmarks/rolling.py @@ -0,0 +1,142 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . import _skip_slow, parameterized, randn, requires_dask + +nx = 3000 +long_nx = 30000 +ny = 200 +nt = 1000 +window = 20 + +randn_xy = randn((nx, ny), frac_nan=0.1) +randn_xt = randn((nx, nt)) +randn_t = randn((nt,)) +randn_long = randn((long_nx,), frac_nan=0.1) + + +class Rolling: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + self.da_long = xr.DataArray( + randn_long, dims="x", coords={"x": np.arange(long_nx) * 0.1} + ) + + @parameterized( + ["func", "center", "use_bottleneck"], + (["mean", "count"], [True, False], [True, False]), + ) + def time_rolling(self, func, center, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + getattr(self.ds.rolling(x=window, center=center), func)().load() + + @parameterized( + ["func", "pandas", "use_bottleneck"], + (["mean", "count"], [True, False], [True, False]), + ) + def time_rolling_long(self, func, pandas, use_bottleneck): + if pandas: + se = self.da_long.to_series() + getattr(se.rolling(window=window, min_periods=window), func)() + else: + with xr.set_options(use_bottleneck=use_bottleneck): + getattr( + self.da_long.rolling(x=window, 
min_periods=window), func + )().load() + + @parameterized( + ["window_", "min_periods", "use_bottleneck"], ([20, 40], [5, 5], [True, False]) + ) + def time_rolling_np(self, window_, min_periods, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce( + np.nansum + ).load() + + @parameterized( + ["center", "stride", "use_bottleneck"], ([True, False], [1, 1], [True, False]) + ) + def time_rolling_construct(self, center, stride, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + self.ds.rolling(x=window, center=center).construct( + "window_dim", stride=stride + ).sum(dim="window_dim").load() + + +class RollingDask(Rolling): + def setup(self, *args, **kwargs): + requires_dask() + # TODO: Lazily skipped in CI as it is very demanding and slow. + # Improve times and remove errors. + _skip_slow() + super().setup(**kwargs) + self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) + self.da_long = self.da_long.chunk({"x": 10000}) + + +class RollingMemory: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + + +class DataArrayRollingMemory(RollingMemory): + @parameterized(["func", "use_bottleneck"], (["sum", "max", "mean"], [True, False])) + def peakmem_ndrolling_reduce(self, func, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + roll = self.ds.var1.rolling(x=10, y=4) + getattr(roll, func)() + + @parameterized(["func", "use_bottleneck"], (["sum", "max", "mean"], [True, False])) + def peakmem_1drolling_reduce(self, func, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + roll = self.ds.var3.rolling(t=100) + getattr(roll, func)() + + 
@parameterized(["stride"], ([None, 5, 50])) + def peakmem_1drolling_construct(self, stride): + self.ds.var2.rolling(t=100).construct("w", stride=stride) + self.ds.var3.rolling(t=100).construct("w", stride=stride) + + +class DatasetRollingMemory(RollingMemory): + @parameterized(["func", "use_bottleneck"], (["sum", "max", "mean"], [True, False])) + def peakmem_ndrolling_reduce(self, func, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + roll = self.ds.rolling(x=10, y=4) + getattr(roll, func)() + + @parameterized(["func", "use_bottleneck"], (["sum", "max", "mean"], [True, False])) + def peakmem_1drolling_reduce(self, func, use_bottleneck): + with xr.set_options(use_bottleneck=use_bottleneck): + roll = self.ds.rolling(t=100) + getattr(roll, func)() + + @parameterized(["stride"], ([None, 5, 50])) + def peakmem_1drolling_construct(self, stride): + self.ds.rolling(t=100).construct("w", stride=stride) diff --git a/benchmarks/benchmarks/unstacking.py b/benchmarks/benchmarks/unstacking.py new file mode 100644 index 00000000000..b3af5eac19c --- /dev/null +++ b/benchmarks/benchmarks/unstacking.py @@ -0,0 +1,64 @@ +import numpy as np +import pandas as pd + +import xarray as xr + +from . 
import requires_dask, requires_sparse + + +class Unstacking: + def setup(self): + data = np.random.default_rng(0).random((250, 500)) + self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) + self.da_missing = self.da_full[:-1] + self.df_missing = self.da_missing.to_pandas() + + def time_unstack_fast(self): + self.da_full.unstack("flat_dim") + + def time_unstack_slow(self): + self.da_missing.unstack("flat_dim") + + def time_unstack_pandas_slow(self): + self.df_missing.unstack() + + +class UnstackingDask(Unstacking): + def setup(self, *args, **kwargs): + requires_dask() + super().setup(**kwargs) + self.da_full = self.da_full.chunk({"flat_dim": 25}) + + +class UnstackingSparse(Unstacking): + def setup(self, *args, **kwargs): + requires_sparse() + + import sparse + + data = sparse.random((500, 1000), random_state=0, fill_value=0) + self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) + self.da_missing = self.da_full[:-1] + + mindex = pd.MultiIndex.from_arrays([np.arange(100), np.arange(100)]) + self.da_eye_2d = xr.DataArray(np.ones((100,)), dims="z", coords={"z": mindex}) + self.da_eye_3d = xr.DataArray( + np.ones((100, 50)), + dims=("z", "foo"), + coords={"z": mindex, "foo": np.arange(50)}, + ) + + def time_unstack_to_sparse_2d(self): + self.da_eye_2d.unstack(sparse=True) + + def time_unstack_to_sparse_3d(self): + self.da_eye_3d.unstack(sparse=True) + + def peakmem_unstack_to_sparse_2d(self): + self.da_eye_2d.unstack(sparse=True) + + def peakmem_unstack_to_sparse_3d(self): + self.da_eye_3d.unstack(sparse=True) + + def time_unstack_pandas_slow(self): + pass diff --git a/benchmarks/bm_runner.py b/benchmarks/bm_runner.py new file mode 100644 index 00000000000..c0da5b5fb47 --- /dev/null +++ b/benchmarks/bm_runner.py @@ -0,0 +1,739 @@ +#!/usr/bin/env python3 +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Argparse conveniences for executing common types of benchmark runs.""" + +from abc import ABC, abstractmethod +import argparse +from datetime import datetime +from importlib import import_module +from os import environ +from pathlib import Path +import re +import shlex +import subprocess +from tempfile import NamedTemporaryFile +from textwrap import dedent +from typing import Literal, Protocol + +# The threshold beyond which shifts are 'notable'. See `asv compare`` docs +# for more. +COMPARE_FACTOR = 1.2 + +BENCHMARKS_DIR = Path(__file__).parent +ROOT_DIR = BENCHMARKS_DIR.parent +# Storage location for reports used in GitHub actions. +GH_REPORT_DIR = ROOT_DIR.joinpath(".github", "workflows", "benchmark_reports") + +# Common ASV arguments for all run_types except `custom`. +ASV_HARNESS = "run {posargs} --attribute rounds=3 --interleave-rounds --show-stderr" + + +def echo(echo_string: str): + # Use subprocess for printing to reduce chance of printing out of sequence + # with the subsequent calls. + subprocess.run(["echo", f"BM_RUNNER DEBUG: {echo_string}"]) + + +def _subprocess_runner(args, asv=False, **kwargs): + # Avoid permanent modifications if the same arguments are used more than once. + args = args.copy() + kwargs = kwargs.copy() + if asv: + args.insert(0, "asv") + kwargs["cwd"] = BENCHMARKS_DIR + echo(" ".join(args)) + kwargs.setdefault("check", True) + return subprocess.run(args, **kwargs) + + +def _subprocess_runner_capture(args, **kwargs) -> str: + result = _subprocess_runner(args, capture_output=True, **kwargs) + return result.stdout.decode().rstrip() + + +def _check_requirements(package: str) -> None: + try: + import_module(package) + except ImportError as exc: + message = ( + f"No {package} install detected. Benchmarks can only " + f"be run in an environment including {package}." 
+ ) + raise Exception(message) from exc + + +def _prep_data_gen_env() -> None: + """Create or access a separate, unchanging environment for generating test data.""" + python_version = "3.13" + data_gen_var = "DATA_GEN_PYTHON" + if data_gen_var in environ: + echo("Using existing data generation environment.") + else: + echo("Setting up the data generation environment ...") + # Get Nox to build an environment for the `tests` session, but don't + # run the session. Will reuse a cached environment if appropriate. + _subprocess_runner( + [ + "nox", + f"--noxfile={ROOT_DIR / 'noxfile.py'}", + "--session=tests", + "--install-only", + f"--python={python_version}", + ] + ) + # Find the environment built above, set it to be the data generation + # environment. + env_directory: Path = next((ROOT_DIR / ".nox").rglob(f"tests*")) + data_gen_python = (env_directory / "bin" / "python").resolve() + environ[data_gen_var] = str(data_gen_python) + + def clone_resource(name: str, clone_source: str) -> Path: + resource_dir = data_gen_python.parents[1] / "resources" + resource_dir.mkdir(exist_ok=True) + clone_dir = resource_dir / name + if not clone_dir.is_dir(): + _subprocess_runner(["git", "clone", clone_source, str(clone_dir)]) + return clone_dir + + echo("Installing Mule into data generation environment ...") + mule_dir = clone_resource("mule", "https://github.com/metomi/mule.git") + _subprocess_runner( + [ + str(data_gen_python), + "-m", + "pip", + "install", + str(mule_dir / "mule"), + ] + ) + + test_data_var = "OVERRIDE_TEST_DATA_REPOSITORY" + if test_data_var not in environ: + echo("Installing iris-test-data into data generation environment ...") + test_data_dir = clone_resource( + "iris-test-data", "https://github.com/SciTools/iris-test-data.git" + ) + environ[test_data_var] = str(test_data_dir / "test_data") + + echo("Data generation environment ready.") + + +def _setup_common() -> None: + _check_requirements("asv") + _check_requirements("nox") + + _prep_data_gen_env() + + 
echo("Setting up ASV ...") + _subprocess_runner(["machine", "--yes"], asv=True) + + echo("Setup complete.") + + +def _asv_compare( + *commits: str, + overnight_mode: bool = False, + fail_on_regression: bool = False, +) -> None: + """Run through a list of commits comparing each one to the next.""" + commits = tuple(commit[:8] for commit in commits) + + machine_script = [ + "from asv.machine import Machine", + "print(Machine.get_unique_machine_name())", + ] + machine_name = _subprocess_runner_capture( + ["python", "-c", ";".join(machine_script)] + ) + + for i in range(len(commits) - 1): + before = commits[i] + after = commits[i + 1] + asv_command = shlex.split( + f"compare {before} {after} " + f"--machine {machine_name} --factor={COMPARE_FACTOR} --split" + ) + + comparison = _subprocess_runner_capture(asv_command, asv=True) + echo(comparison) + shifts = _subprocess_runner_capture([*asv_command, "--only-changed"], asv=True) + + if shifts or (not overnight_mode): + # For the overnight run: only post if there are shifts. + _gh_create_reports(after, comparison, shifts) + + if shifts and fail_on_regression: + # fail_on_regression supports setups that expect CI failures. + message = ( + f"Performance shifts detected between commits {before} and {after}.\n" + ) + raise RuntimeError(message) + + +def _gh_create_reports(commit_sha: str, results_full: str, results_shifts: str) -> None: + """If running under GitHub Actions: record the results in report(s). + + Posting the reports is done by :func:`_gh_post_reports`, which must be run + within a separate action to comply with GHA's security limitations. + """ + if "GITHUB_ACTIONS" not in environ: + # Only run when within GHA. 
+ return + + pr_number = environ.get("PR_NUMBER", None) + on_pull_request = pr_number is not None + run_id = environ["GITHUB_RUN_ID"] + repo = environ["GITHUB_REPOSITORY"] + gha_run_link = f"[`{run_id}`](https://github.com/{repo}/actions/runs/{run_id})" + + GH_REPORT_DIR.mkdir(exist_ok=True) + commit_dir = GH_REPORT_DIR / commit_sha + commit_dir.mkdir() + command_path = commit_dir / "command.txt" + body_path = commit_dir / "body.txt" + + performance_report = dedent( + ( + """ + # :stopwatch: Performance Benchmark Report: {commit_sha} + +
+ Performance shifts + + ``` + {results_shifts} + ``` + +
+ +
+ Full benchmark results + + ``` + {results_full} + ``` + +
+ + Generated by GHA run {gha_run_link} + """ + ) + ) + performance_report = performance_report.format( + commit_sha=commit_sha, + results_shifts=results_shifts, + results_full=results_full, + gha_run_link=gha_run_link, + ) + + if on_pull_request: + # Command to post the report as a comment on the active PR. + body_path.write_text(performance_report) + command = ( + f"gh pr comment {pr_number} " + f"--body-file {body_path.absolute()} " + f"--repo {repo}" + ) + command_path.write_text(command) + + else: + # Command to post the report as new issue. + commit_msg = _subprocess_runner_capture( + f"git log {commit_sha}^! --oneline".split(" ") + ) + # Intended for benchmarking commits on trunk - should include a PR + # number due to our squash policy. + pr_tag_match = re.search("#[0-9]*", commit_msg) + + assignee = "" + pr_tag = "pull request number unavailable" + if pr_tag_match is not None: + pr_tag = pr_tag_match.group(0) + + for login_type in ("author", "mergedBy"): + gh_query = f'.["{login_type}"]["login"]' + commandlist = shlex.split( + f"gh pr view {pr_tag[1:]} " + f"--json {login_type} -q '{gh_query}' " + f"--repo {repo}" + ) + login = _subprocess_runner_capture(commandlist) + + commandlist = [ + "curl", + "-s", + f"https://api.github.com/users/{login}", + ] + login_info = _subprocess_runner_capture(commandlist) + is_user = '"type": "User"' in login_info + if is_user: + assignee = login + break + + title = f"Performance Shift(s): `{commit_sha}`" + body = dedent( + ( + f""" + Benchmark comparison has identified performance shifts at: + + * commit {commit_sha} ({pr_tag}). + +

+ Please review the report below and + take corrective/congratulatory action as appropriate + :slightly_smiling_face: +

+ """ + ) + ) + body += performance_report + body_path.write_text(body) + + command = ( + "gh issue create " + f'--title "{title}" ' + f"--body-file {body_path.absolute()} " + '--label "Bot" ' + '--label "Type: Performance" ' + f"--repo {repo}" + ) + if assignee: + command += f" --assignee {assignee}" + command_path.write_text(command) + + +def _gh_post_reports() -> None: + """If running under GitHub Actions: post pre-prepared benchmark reports. + + Reports are prepared by :func:`_gh_create_reports`, which must be run + within a separate action to comply with GHA's security limitations. + """ + if "GITHUB_ACTIONS" not in environ: + # Only run when within GHA. + return + + commit_dirs = [x for x in GH_REPORT_DIR.iterdir() if x.is_dir()] + for commit_dir in commit_dirs: + command_path = commit_dir / "command.txt" + command = command_path.read_text() + + # Security: only accept certain commands to run. + assert command.startswith(("gh issue create", "gh pr comment")) + + _subprocess_runner(shlex.split(command)) + + +class _SubParserGenerator(ABC): + """Convenience for holding all the necessary argparse info in 1 place.""" + + name: str = NotImplemented + description: str = NotImplemented + epilog: str = NotImplemented + + class _SubParsersType(Protocol): + """Duck typing since argparse._SubParsersAction is private.""" + + def add_parser(self, name, **kwargs) -> argparse.ArgumentParser: ... 
+ + def __init__(self, subparsers: _SubParsersType) -> None: + self.subparser = subparsers.add_parser( + self.name, + description=self.description, + epilog=self.epilog, + formatter_class=argparse.RawTextHelpFormatter, + ) + self.add_arguments() + self.add_asv_arguments() + self.subparser.set_defaults(func=self.func) + + @abstractmethod + def add_arguments(self) -> None: + """All custom self.subparser.add_argument() calls.""" + _ = NotImplemented + + def add_asv_arguments(self) -> None: + self.subparser.add_argument( + "asv_args", + nargs=argparse.REMAINDER, + help="Any number of arguments to pass down to the ASV benchmark command.", + ) + + @staticmethod + @abstractmethod + def func(args: argparse.Namespace): + """Return when the subparser is parsed. + + `func` is then called, performing the user's selected sub-command. + + """ + _ = args + return NotImplemented + + +class Overnight(_SubParserGenerator): + name = "overnight" + description = ( + "Benchmarks all commits between the input **first_commit** to ``HEAD``, " + "comparing each to its parent for performance shifts. If running on " + "GitHub Actions: performance shift(s) will be reported in a new issue.\n" + "Designed for checking the previous 24 hours' commits, typically in a " + "scheduled script.\n" + "Uses `asv run`." + ) + epilog = ( + "e.g. python bm_runner.py overnight a1b23d4\n" + "e.g. python bm_runner.py overnight a1b23d4 --bench=regridding" + ) + + def add_arguments(self) -> None: + self.subparser.add_argument( + "first_commit", + type=str, + help="The first commit in the benchmarking commit sequence.", + ) + + @staticmethod + def func(args: argparse.Namespace) -> None: + _setup_common() + + commit_range = f"{args.first_commit}^^.." + # git rev-list --first-parent is the command ASV uses. 
+ git_command = shlex.split(f"git rev-list --first-parent {commit_range}") + commit_string = _subprocess_runner_capture(git_command) + commit_list = commit_string.split("\n") + + asv_command = shlex.split(ASV_HARNESS.format(posargs=commit_range)) + try: + _subprocess_runner([*asv_command, *args.asv_args], asv=True) + finally: + # Designed for long running - want to compare/post any valid + # results even if some are broken. + _asv_compare(*reversed(commit_list), overnight_mode=True) + + +class Branch(_SubParserGenerator): + name = "branch" + description = ( + "Performs the same operations as ``overnight``, but always on two " + "commits only - ``HEAD``, and ``HEAD``'s merge-base with the input " + "**base_branch**.\n" + "If running on GitHub Actions: HEAD will be GitHub's " + "merge commit and merge-base will be the merge target. Performance " + "comparisons will be posted in a comment on the relevant pull request.\n" + "Designed for testing if the active branch's changes cause performance " + "shifts - anticipating what would be caught by ``overnight`` once " + "merged.\n\n" + "**For maximum accuracy, avoid using the machine that is running this " + "session. Run time could be >1 hour for the full benchmark suite.**\n" + "Uses `asv run`." + ) + epilog = ( + "e.g. python bm_runner.py branch upstream/main\n" + "e.g. 
python bm_runner.py branch upstream/main --bench=regridding" + ) + + def add_arguments(self) -> None: + self.subparser.add_argument( + "base_branch", + type=str, + help="A branch that has the merge-base with ``HEAD`` - ``HEAD`` will be benchmarked against that merge-base.", + ) + + @staticmethod + def func(args: argparse.Namespace) -> None: + _setup_common() + + git_command = shlex.split("git rev-parse HEAD") + head_sha = _subprocess_runner_capture(git_command)[:8] + + git_command = shlex.split(f"git merge-base {head_sha} {args.base_branch}") + merge_base = _subprocess_runner_capture(git_command)[:8] + + with NamedTemporaryFile("w") as hashfile: + hashfile.writelines([merge_base, "\n", head_sha]) + hashfile.flush() + commit_range = f"HASHFILE:{hashfile.name}" + asv_command = shlex.split(ASV_HARNESS.format(posargs=commit_range)) + _subprocess_runner([*asv_command, *args.asv_args], asv=True) + + _asv_compare(merge_base, head_sha) + + +class _CSPerf(_SubParserGenerator, ABC): + """Common code used by both CPerf and SPerf.""" + + description = ( + "Run the on-demand {} suite of benchmarks (part of the UK Met " + "Office NG-VAT project) for the ``HEAD`` of ``upstream/main`` only, " + "and publish the results to the input **publish_dir**, within a " + "unique subdirectory for this run.\n" + "Uses `asv run`." + ) + epilog = ( + "e.g. python bm_runner.py {0} my_publish_dir\n" + "e.g. 
python bm_runner.py {0} my_publish_dir --bench=regridding" + ) + + def add_arguments(self) -> None: + self.subparser.add_argument( + "publish_dir", + type=str, + help="HTML results will be published to a sub-dir in this dir.", + ) + + @staticmethod + def csperf(args: argparse.Namespace, run_type: Literal["cperf", "sperf"]) -> None: + _setup_common() + + publish_dir = Path(args.publish_dir) + if not publish_dir.is_dir(): + message = f"Input 'publish directory' is not a directory: {publish_dir}" + raise NotADirectoryError(message) + publish_subdir = ( + publish_dir / f"{run_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + ) + publish_subdir.mkdir() + + # Activate on demand benchmarks (C/SPerf are deactivated for + # 'standard' runs). + environ["ON_DEMAND_BENCHMARKS"] = "True" + commit_range = "upstream/main^!" + + asv_command_str = ( + ASV_HARNESS.format(posargs=commit_range) + f" --bench={run_type}" + ) + + # Only do a single round. + asv_command = shlex.split(re.sub(r"rounds=\d", "rounds=1", asv_command_str)) + try: + _subprocess_runner([*asv_command, *args.asv_args], asv=True) + except subprocess.CalledProcessError as err: + # C/SPerf benchmarks are much bigger than the CI ones: + # Don't fail the whole run if memory blows on 1 benchmark. + # ASV produces return code of 2 if the run includes crashes. + if err.returncode != 2: + raise + + asv_command = shlex.split(f"publish {commit_range} --html-dir={publish_subdir}") + _subprocess_runner(asv_command, asv=True) + + # Print completion message. + location = BENCHMARKS_DIR / ".asv" + echo( + f'New ASV results for "{run_type}".\n' + f'See "{publish_subdir}",' + f'\n or JSON files under "{location / "results"}".' 
+ ) + + +class CPerf(_CSPerf): + name = "cperf" + description = _CSPerf.description.format("CPerf") + epilog = _CSPerf.epilog.format("cperf") + + @staticmethod + def func(args: argparse.Namespace) -> None: + _CSPerf.csperf(args, "cperf") + + +class SPerf(_CSPerf): + name = "sperf" + description = _CSPerf.description.format("SPerf") + epilog = _CSPerf.epilog.format("sperf") + + @staticmethod + def func(args: argparse.Namespace) -> None: + _CSPerf.csperf(args, "sperf") + + +class Custom(_SubParserGenerator): + name = "custom" + description = ( + "Run ASV with the input **ASV sub-command**, without any preset " + "arguments - must all be supplied by the user. So just like running " + "ASV manually, with the convenience of re-using the runner's " + "scripted setup steps." + ) + epilog = "e.g. python bm_runner.py custom continuous a1b23d4 HEAD --quick" + + def add_arguments(self) -> None: + self.subparser.add_argument( + "asv_sub_command", + type=str, + help="The ASV command to run.", + ) + + @staticmethod + def func(args: argparse.Namespace) -> None: + _setup_common() + _subprocess_runner([args.asv_sub_command, *args.asv_args], asv=True) + + +class TrialRun(_SubParserGenerator): + name = "trialrun" + description = ( + "Fast trial-run a given benchmark, to check it works : " + "in a provided or latest-lockfile environment, " + "with no repeats for accuracy of measurement." + ) + epilog = ( + "e.g. python bm_runner.py trialrun " + "MyBenchmarks.time_calc ${DATA_GEN_PYTHON}" + "\n\nNOTE: 'runpath' also replaces $DATA_GEN_PYTHON during the run." + ) + + def add_arguments(self) -> None: + self.subparser.add_argument( + "benchmark", + type=str, + help=( + "A benchmark name, possibly including wildcards, " + "as supported by the ASV '--bench' argument." + ), + ) + self.subparser.add_argument( + "runpath", + type=str, + help=( + "A path to an existing python executable, " + "to completely bypass environment building." 
+ ), + ) + + @staticmethod + def func(args: argparse.Namespace) -> None: + if args.runpath: + # Shortcut creation of a data-gen environment + # - which is also the trial-run env. + python_path = Path(args.runpath).resolve() + environ["DATA_GEN_PYTHON"] = str(python_path) + _setup_common() + # get path of data-gen environment, setup by previous call + python_path = Path(environ["DATA_GEN_PYTHON"]) + # allow 'on-demand' benchmarks + environ["ON_DEMAND_BENCHMARKS"] = "1" + asv_command = [ + "run", + "--bench", + args.benchmark, + # no repeats for timing accuracy + "--quick", + "--show-stderr", + # do not build a unique env : run test in data-gen environment + "--environment", + f"existing:{python_path}", + ] + args.asv_args + _subprocess_runner(asv_command, asv=True) + + +class Validate(_SubParserGenerator): + name = "validate" + description = ( + "Quickly check that the benchmark architecture works as intended with " + "the current codebase. Things that are checked: env creation/update, " + "package build/install/uninstall, artificial data creation." + ) + epilog = "Sole acceptable syntax: python bm_runner.py validate" + + @staticmethod + def func(args: argparse.Namespace) -> None: + _setup_common() + + git_command = shlex.split("git rev-parse HEAD") + head_sha = _subprocess_runner_capture(git_command)[:8] + + # Find the most recent commit where the lock-files are not + # identical to HEAD - will force environment updates. 
+ locks_dir = Path(__file__).parents[1] / "ci" / "requirements" / "locks" + assert locks_dir.is_dir() + git_command = shlex.split( + f"git log -1 --pretty=format:%P -- {locks_dir.resolve()}" + ) + locks_sha = _subprocess_runner_capture(git_command)[:8] + + with NamedTemporaryFile("w") as hashfile: + hashfile.writelines([locks_sha, "\n", head_sha]) + hashfile.flush() + asv_command = shlex.split( + f"run HASHFILE:{hashfile.name} --bench ValidateSetup " + "--attribute rounds=1 --show-stderr" + ) + extra_env = environ | {"ON_DEMAND_BENCHMARKS": "1"} + _subprocess_runner(asv_command, asv=True, env=extra_env) + + # No arguments permitted for this subclass: + + def add_arguments(self) -> None: + pass + + def add_asv_arguments(self) -> None: + pass + + +class GhPost(_SubParserGenerator): + name = "_gh_post" + description = ( + "Used by GitHub Actions to post benchmark reports that were prepared " + "during previous actions. Separated to comply with GitHub's security " + "requirements." + ) + epilog = "Sole acceptable syntax: python bm_runner.py _gh_post" + + @staticmethod + def func(args: argparse.Namespace) -> None: + _gh_post_reports() + + # No arguments permitted for this subclass: + + def add_arguments(self) -> None: + pass + + def add_asv_arguments(self) -> None: + pass + + +def main() -> None: + parser = argparse.ArgumentParser( + description=( + "Run the repository performance benchmarks (using Airspeed Velocity)." + ), + epilog=( + "More help is available within each sub-command." + "\n\nNOTE(1): a separate python environment is created to " + "construct test files.\n Set $DATA_GEN_PYTHON to avoid the cost " + "of this." + "\nNOTE(2): iris-test-data is downloaded and cached within the " + "data generation environment.\n Set " + "$OVERRIDE_TEST_DATA_REPOSITORY to avoid the cost of this." 
+ "\nNOTE(3): test data is cached within the " + "benchmarks code directory, and uses a lot of disk space " + "of disk space (Gb).\n Set $BENCHMARK_DATA to specify where this " + "space can be safely allocated." + ), + formatter_class=argparse.RawTextHelpFormatter, + ) + subparsers = parser.add_subparsers(required=True) + + parser_generators: tuple[type[_SubParserGenerator], ...] = ( + Overnight, + Branch, + CPerf, + SPerf, + Custom, + TrialRun, + Validate, + GhPost, + ) + + for gen in parser_generators: + _ = gen(subparsers).subparser + + parsed = parser.parse_args() + parsed.func(parsed) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/custom_bms/README.md b/benchmarks/custom_bms/README.md new file mode 100644 index 00000000000..eea85d74fe9 --- /dev/null +++ b/benchmarks/custom_bms/README.md @@ -0,0 +1,11 @@ +# Iris custom benchmarks + +To be recognised by ASV, these benchmarks must be packaged and installed in +line with the +[ASV guidelines](https://asv.readthedocs.io/projects/asv-runner/en/latest/development/benchmark_plugins.html). +This is achieved using the custom build in [install.py](./install.py). + +Installation is into the environment where the benchmarks are run (i.e. not +the environment containing ASV + Nox, but the one built to the same +specifications as the Tests environment). This is done via `build_command` +in [asv.conf.json](../asv.conf.json). diff --git a/benchmarks/custom_bms/install.py b/benchmarks/custom_bms/install.py new file mode 100644 index 00000000000..bda9f1cc3cd --- /dev/null +++ b/benchmarks/custom_bms/install.py @@ -0,0 +1,55 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Install the SciTools custom benchmarks for detection by ASV. 
+ +See the requirements for being detected as an ASV plugin: +https://asv.readthedocs.io/projects/asv-runner/en/latest/development/benchmark_plugins.html +""" + +from pathlib import Path +import shutil +from subprocess import run +from tempfile import TemporaryDirectory + +this_dir = Path(__file__).parent + + +def package_files(new_dir: Path) -> None: + """Package SciTools' custom benchmarks for detection by ASV. + + Parameters + ---------- + new_dir : Path + The directory to package the custom benchmarks in. + """ + asv_bench_scitools = new_dir / "asv_bench_scitools" + benchmarks = asv_bench_scitools / "benchmarks" + benchmarks.mkdir(parents=True) + (asv_bench_scitools / "__init__.py").touch() + + for py_file in this_dir.glob("*.py"): + if py_file != Path(__file__): + shutil.copy2(py_file, benchmarks) + + # Create this on the fly, as having multiple pyproject.toml files in 1 + # project causes problems. + py_project = new_dir / "pyproject.toml" + py_project.write_text( + """ + [project] + name = "asv_bench_scitools" + version = "0.1" + """ + ) + + +def main(): + with TemporaryDirectory() as temp_dir: + package_files(Path(temp_dir)) + run(["python", "-m", "pip", "install", temp_dir]) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/custom_bms/tracemallocbench.py b/benchmarks/custom_bms/tracemallocbench.py new file mode 100644 index 00000000000..486c67aeb99 --- /dev/null +++ b/benchmarks/custom_bms/tracemallocbench.py @@ -0,0 +1,196 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. + +"""Benchmark for growth in process resident memory, repeating for accuracy. + +Uses a modified version of the repeat logic in +:class:`asv_runner.benchmarks.time.TimeBenchmark`. 
+""" + +import re +from timeit import Timer +import tracemalloc +from typing import Callable + +from asv_runner.benchmarks.time import TimeBenchmark, wall_timer + + +class TracemallocBenchmark(TimeBenchmark): + """Benchmark for growth in process resident memory, repeating for accuracy. + + Obviously limited as to what it actually measures : Relies on the current + process not having significant unused (de-allocated) memory when the + tested codeblock runs, and only reliable when the code allocates a + significant amount of new memory. + + Benchmark operations prefixed with ``tracemalloc_`` or ``Tracemalloc`` will + use this benchmark class. + + Inherits behaviour from :class:`asv_runner.benchmarks.time.TimeBenchmark`, + with modifications for memory measurement. See the below Attributes section + and https://asv.readthedocs.io/en/stable/writing_benchmarks.html#timing-benchmarks. + + Attributes + ---------- + Mostly identical to :class:`asv_runner.benchmarks.time.TimeBenchmark`. See + https://asv.readthedocs.io/en/stable/benchmarks.html#timing-benchmarks + Make sure to use the inherited ``repeat`` attribute if greater accuracy + is needed. Below are the attributes where inherited behaviour is + overridden. + + number : int + The number of times the benchmarked operation will be called per + ``repeat``. Memory growth is measured after ALL calls - + i.e. `number` should make no difference to the result if the operation + has perfect garbage collection. The parent class's intelligent + modification of `number` is NOT inherited. A minimum value of ``1`` is + enforced. + warmup_time, sample_time, min_run_count, timer + Not used. + type : str = "tracemalloc" + The name of this benchmark type. + unit : str = "bytes" + The units of the measured metric (i.e. the growth in memory). 
+ + """ + + name_regex = re.compile("^(Tracemalloc[A-Z_].+)|(tracemalloc_.+)$") + + param: tuple + + def __init__(self, name: str, func: Callable, attr_sources: list) -> None: + """Initialize a new instance of `TracemallocBenchmark`. + + Parameters + ---------- + name : str + The name of the benchmark. + func : callable + The function to benchmark. + attr_sources : list + A list of objects from which to draw attributes. + """ + super().__init__(name, func, attr_sources) + self.type = "tracemalloc" + self.unit = "bytes" + + def _load_vars(self): + """Load benchmark variables from attribute sources. + + Downstream handling of ``number`` is not the same as in the parent, so + need to make sure it is at least 1. + """ + super()._load_vars() + self.number = max(1, self.number) + + def run(self, *param: tuple) -> dict: + """Run the benchmark with the given parameters. + + Downstream handling of ``param`` is not the same as in the parent, so + need to store it now. + + Parameters + ---------- + *param : tuple + The parameters to pass to the benchmark function. + + Returns + ------- + dict + A dictionary with the benchmark results. It contains the samples + taken, and "the number of times the function was called in each + sample" - for this benchmark that is always ``1`` to avoid the + parent class incorrectly modifying the results. + """ + self.param = param + return super().run(*param) + + def benchmark_timing( + self, + timer: Timer, + min_repeat: int, + max_repeat: int, + max_time: float, + warmup_time: float, + number: int, + min_run_count: int, + ) -> tuple[list[int], int]: + """Benchmark the timing of the function execution. + + Heavily modified from the parent method + - Directly performs setup and measurement (parent used timeit). + - `number` used differently (see Parameters). + - No warmup phase. + + Parameters + ---------- + timer : timeit.Timer + Not used. + min_repeat : int + The minimum number of times to repeat the function execution. 
+ max_repeat : int + The maximum number of times to repeat the function execution. + max_time : float + The maximum total time to spend on the benchmarking. + warmup_time : float + Not used. + number : int + The number of times the benchmarked operation will be called per + repeat. Memory growth is measured after ALL calls - i.e. `number` + should make no difference to the result if the operation + has perfect garbage collection. The parent class's intelligent + modification of `number` is NOT inherited. + min_run_count : int + Not used. + + Returns + ------- + list + A list of the measured memory growths, in bytes. + int = 1 + Part of the inherited return signature. Must be 1 to avoid + the parent incorrectly modifying the results. + """ + start_time = wall_timer() + samples: list[int] = [] + + def too_slow(num_samples) -> bool: + """Stop taking samples if limits exceeded. + + Parameters + ---------- + num_samples : int + The number of samples taken so far. + + Returns + ------- + bool + True if the benchmark should stop, False otherwise. + """ + if num_samples < min_repeat: + return False + return wall_timer() > start_time + max_time + + # Collect samples + while len(samples) < max_repeat: + self.redo_setup() + tracemalloc.start() + for _ in range(number): + __ = self.func(*self.param) + _, peak_mem_bytes = tracemalloc.get_traced_memory() + tracemalloc.stop() + + samples.append(peak_mem_bytes) + + if too_slow(len(samples)): + break + + # ``number`` is not used in the same way as in the parent class. Must + # be returned as 1 to avoid parent incorrectly modifying the results. 
+ return samples, 1 + + +# https://asv.readthedocs.io/projects/asv-runner/en/latest/development/benchmark_plugins.html +export_as_benchmark = [TracemallocBenchmark] diff --git a/ci/requirements/locks/environment-benchmark-linux-64.lock b/ci/requirements/locks/environment-benchmark-linux-64.lock new file mode 100644 index 00000000000..c8c67b1bcf2 --- /dev/null +++ b/ci/requirements/locks/environment-benchmark-linux-64.lock @@ -0,0 +1,164 @@ +# Generated by conda-lock. +# platform: linux-64 +# input_hash: 98a4801aafacb13b98a04850d749d850a6ee57bf6f26506db814286373f2ebaf +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_1.conda#9e298d76f543deb06eb0f3413675e13a +https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h3f2d84a_0.conda#d76872d096d063e226482c99337209dc +https://conda.anaconda.org/conda-forge/noarch/nomkl-1.0-h5ca1d4c_0.tar.bz2#9a66894dfd07c4510beb6b3f9672ccc0 +https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda#94305520c52a4aa3f6c2b1ff6008d9f8 +https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda#4222072737ccff51314b5ece9c7d6f5a +https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda#74784ee3d225fc3dca89edb635b4e5cc +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda#0be7c6e070c19105f966d3758448d018 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_5.conda#dcd5ff1940cd38f6df777cac86819d60 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_5.conda#264fbfba7fb20acf3b29cde153e345ce +https://conda.anaconda.org/conda-forge/linux-64/attr-2.5.2-h39aace5_0.conda#791365c5f65975051e4e017b5da3abf5 
+https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.4-hb03c661_0.conda#ae5621814cb99642c9308977fe90ed0d +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_8.conda#51a19bba1b8ebfb60df25cde030b7ebc +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda#f7f0d6cc2dc986d42ac2689ec88192be +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.3-hb9d3cd8_0.conda#b38117a3c920364aff79f870c984b4a3 +https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb03c661_4.conda#1d29d2e33fe59954af82ef54a8af3fe1 +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda#4211416ecba1866fab0c6470986c22d6 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda#ede4673863426c0883c0063d853bbd85 +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_5.conda#069afdf8ea72504e48d23ae1171d951c +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_5.conda#fbd4008644add05032b6764807ee2cba +https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h3b78370_2.conda#915f5995e94f60e9a4826e0b0920ee88 +https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda#9fa334557db9f63da6c9285fd2a48638 +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda#1a580f7796c7bf6393fddb8bbbde58dc +https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda#c7e925f37e3b40d893459e625f6a53f1 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_5.conda#4e02a49aaa9d5190cb630fa43528fbe6 +https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.11.0-hb04c3b8_0.conda#34fb73fd2d5a613d8f17ce2eaa15a8a5 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.1-he9a06e4_0.conda#af930c65e9a79a3423d6d36e265cef65 +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda#47e340acb35de30501a76c7c799c41d7 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.3-h26f9b46_0.conda#72b3dd72e4f0b88cdacf3421313480f0 +https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda#a77f85f77be52ff59391544bfe73390a +https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.2-he7b75e1_1.conda#c04d1312e7feec369308d656c18e7f3e +https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-h92c474e_6.conda#3490e744cb8b9d5a3b9785839d618a17 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-h92c474e_1.conda#4ab554b102065910f098f88b40163835 +https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-h92c474e_2.conda#248831703050fe9a5b2680a7589fdba9 +https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda#d411fc29e338efb48c5fd4576d71d881 +https://conda.anaconda.org/conda-forge/linux-64/libabseil-20250512.1-cxx17_hba17884_0.conda#83b160d4da3e1e847bf044997621ed63 +https://conda.anaconda.org/conda-forge/linux-64/libaec-1.1.4-h3f801dc_0.conda#01ba04e414e47f95c03d6ddd81fd37be +https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb03c661_4.conda#5cb5a1c9a94a78f5b23684bcb845338d +https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb03c661_4.conda#2e55011fa483edb8bfe3fd92e860cd79 +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda#c277e0a4d549b03ac1e9d6cbbe3d017b +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda#a1cfcc585f0c42bf8d5546bb1dfb668d +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_5.conda#0c91408b3dec0b97e8a3c694845bd63b +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda#0b367fad34931cb79e0d6b7e5c06bb1c 
+https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda#eecce068c7e4eddeb169591baac20ac4 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_5.conda#8bba50c7f4679f08c861b597ad2bda6b +https://conda.anaconda.org/conda-forge/linux-64/libzip-1.11.2-h6991a6a_0.conda#a7b27c075c9b7f459f1c022090697cba +https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda#9de5350a85c4a20c685259b889aa6393 +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda#283b96675859b20a825f8fa30f311446 +https://conda.anaconda.org/conda-forge/linux-64/s2n-1.5.23-h8e187f5_0.conda#edd15d7a5914dc1d87617a2b7c582d23 +https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.2-h03e3b7b_0.conda#3d8da0248bdae970b4ade636a104b7f5 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda#a0116df4f4ed05c303811a837d5b39d8 +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda#c9f075ab2f33b3bbee9e62d4ad0a6cd8 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda#6432cb5d4ac0046c3ac0a8a0f95842f9 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.21.2-h6252d9a_1.conda#cf5e9b21384fdb75b15faf397551c247 +https://conda.anaconda.org/conda-forge/linux-64/blosc-1.21.6-he440d0b_1.conda#2c2fae981fd2afd00812c92ac47d023d +https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda#ff862eebdfeb2fd048ae9dc92510baca +https://conda.anaconda.org/conda-forge/linux-64/hdf4-4.2.15-h2a13503_7.conda#bd77f8da987968ec3927990495dc22e4 +https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda#8b189310083baabfb622af68fd9d3ae3 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 +https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2#c965a5aa0d5c1c37ffc62dff36e28400 
+https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.67.0-had1ee68_0.conda#b499ce4b026493a13774bcf0f4c33849 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_2.conda#dfc5aae7b043d9f56ba99514d5e60625 +https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-6.31.1-h9ef548d_1.conda#b92e2a26764fcadb4304add7e698ccf2 +https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.08.12-h7b12aa8_1.conda#0a801dabf8776bb86b12091d2f99377e +https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.22.0-h454ac66_1.conda#8ed82d90e6b1686f5e98f8b7825a15ef +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.7-h2b335a9_100_cp313.conda#724dcf9960e933838247971da07fe5cf +https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.5-h149bd38_3.conda#f9bff8c2a205ee0f28b0c61dad849a98 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.4-h37a7233_0.conda#d828cb0be64d51e27eebe354a2907a98 +https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py313h7033f15_4.conda#bc8624c405856b1d047dd0a81829b08c +https://conda.anaconda.org/conda-forge/noarch/certifi-2025.8.3-pyhd8ed1ab_0.conda#11f59985f49df4620890f3e746ed7102 +https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda#94b550b8d3a614dbd326af798c7dfb40 +https://conda.anaconda.org/conda-forge/noarch/cloudpickle-3.1.1-pyhd8ed1ab_0.conda#364ba6c9fb03886ac979b482f39ebb92 +https://conda.anaconda.org/conda-forge/linux-64/crc32c-2.7.1-py313h54dd161_2.conda#1b52ef3cbbb8a4108c78c7a73fe31450 +https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.9.0-pyhd8ed1ab_0.conda#76f492bd8ba8a0fb80ffe16fc1a75b3b +https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda#0a802cb9888dd14eeefc611f05c40b6e +https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda#8e6923fc12f1fe8f8c4e5c9f343256ac 
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-35_h4a7cf45_openblas.conda#6da7e852c812a84096b68158574398d0 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda#45f6713cb00f124af300342512219182 +https://conda.anaconda.org/conda-forge/linux-64/libxml2-16-2.15.0-ha9997c6_0.conda#84bed2bfefc14e4878bd16979782e522 +https://conda.anaconda.org/conda-forge/linux-64/llvmlite-0.44.0-py313hfdae721_2.conda#dd0d7947635c0c524608eab7db55dcc9 +https://conda.anaconda.org/conda-forge/noarch/locket-1.0.0-pyhd8ed1ab_0.tar.bz2#91e27ef3d05cc772ce627e51cff111c4 +https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda#21b62c55924f01b6eef6827167b46acb +https://conda.anaconda.org/conda-forge/linux-64/msgpack-python-1.1.1-py313h7037e92_1.conda#cc41d40a7ec345da56c496767d4bb61b +https://conda.anaconda.org/conda-forge/noarch/opt_einsum-3.4.0-pyhd8ed1ab_1.conda#52919815cd35c4e1a0298af658ccda04 +https://conda.anaconda.org/conda-forge/linux-64/orc-2.2.0-h1bc01a4_0.conda#53ab33c0b0ba995d2546e54b2160f3fd +https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda#58335b26c38bf4a20f399384c33cbcf9 +https://conda.anaconda.org/conda-forge/linux-64/psutil-7.0.0-py313h07c4f96_1.conda#5a7c24c9dc49128731ae565cf598cde4 +https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda#12c566707c80111f9799308d9e265aef +https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda#461219d1a5bd61342293efa2c0c90eac +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda#88476ae6ebd24f39261e0854ac244f33 +https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda#bc8e3267d44011051f2eb14d22fb0960 +https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py313h8060acc_2.conda#50992ba61a8a1f8c2d346168ae1c86df +https://conda.anaconda.org/conda-forge/linux-64/re2-2025.08.12-h5301d42_1.conda#4637c13ff87424af0f6a981ab6f5ffa5 
+https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda#3339e3b65d58accf4ca4fb8748ab16b3 +https://conda.anaconda.org/conda-forge/noarch/sortedcontainers-2.4.0-pyhd8ed1ab_1.conda#0401a17ae845fa72c7210e206ec5647d +https://conda.anaconda.org/conda-forge/noarch/tblib-3.1.0-pyhd8ed1ab_0.conda#a15c62b8a306b8978f094f76da2f903f +https://conda.anaconda.org/conda-forge/noarch/toolz-1.0.0-pyhd8ed1ab_1.conda#40d0ed782a8aaa16ef248e68c06c168d +https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.2-py313h07c4f96_1.conda#45821154b9cb2fb63c2b354c76086954 +https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda#0caa1af407ecff61170c9437a808404d +https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.17.3-py313h07c4f96_1.conda#c2662497e9a9ff2153753682f53989c9 +https://conda.anaconda.org/conda-forge/noarch/zict-3.0.0-pyhd8ed1ab_1.conda#e52c2ef711ccf31bb7f70ca87d144b9e +https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda#df5e78d904988eb55042c0c97446079f +https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-h0fbd49f_19.conda#24139f2990e92effbeb374a0eb33fdb1 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.3-h19deb91_3.conda#1680d64986f8263978c3624f677656c8 +https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.16.0-h3a458e0_1.conda#682cb082bbd998528c51f1e77d9ce415 +https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py313hf01b4d8_1.conda#c4a0f01c46bc155d205694bec57bd709 +https://conda.anaconda.org/conda-forge/linux-64/cytoolz-1.0.1-py313h536fd9c_0.conda#e886bb6a3c24f8b9dd4fcd1d617a1f64 +https://conda.anaconda.org/conda-forge/noarch/deprecated-1.2.18-pyhd8ed1ab_0.conda#0cef44b1754ae4d6924ac0eef6b9fdbe +https://conda.anaconda.org/conda-forge/noarch/donfig-0.8.1.post1-pyhd8ed1ab_1.conda#c56a7fa5597ad78b62e1f5d21f7f8b8f +https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda#164fc43f0b53b6e3a7bc7dce5e4f1dc9 
+https://conda.anaconda.org/conda-forge/linux-64/hdf5-1.14.6-nompi_h6e4c0c1_103.conda#c74d83614aec66227ae5199d98852aaf +https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.7.0-pyhe01879c_1.conda#63ccfdc3a3ce25b027b8767eb722fca8 +https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda#446bd6c8cb26050d528881df495ce646 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-35_h0358290_openblas.conda#8aa3389d36791ecd31602a247b1f3641 +https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.73.1-h1e535eb_0.conda#8075d8550f773a17288c7ec2cf2f2d56 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-35_h47877c9_openblas.conda#aa0b36b71d44f74686f13b9bfabec891 +https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.15.0-h26afc86_0.conda#c52b54db4660b44ca75b6a61c533b9f5 +https://conda.anaconda.org/conda-forge/noarch/partd-1.4.2-pyhd8ed1ab_0.conda#0badf9c54e24cecfb0ad2f99d680c163 +https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda#a83f6a2fdc079e643237887a37460668 +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhe01879c_2.conda#5b8d21249ff20967101ffa321cab24e8 +https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.6-h800fcd2_2.conda#50e0900a33add0c715f17648de6be786 +https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.12.0-ha729027_0.conda#3dab8d6fa3d10fe4104f1fbe59c10176 +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.10.0-h4bb41a7_3.conda#1efaf34774bfb92ecf2fa8fa985b2752 +https://conda.anaconda.org/conda-forge/noarch/dask-core-2025.9.1-pyhcf101f3_0.conda#c49de33395d775a92ea90e0cb34c3577 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.39.0-hdb79228_0.conda#a2e30ccd49f753fd30de0d30b1569789 +https://conda.anaconda.org/conda-forge/linux-64/libnetcdf-4.9.3-nompi_h11f7409_103.conda#3ccff1066c05a1e6c221356eecc40581 
+https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hb9b0907_1.conda#1c0320794855f457dea27d35c4c71e23 +https://conda.anaconda.org/conda-forge/linux-64/numpy-2.2.6-py313h17eae1a_0.conda#7a2d2f9adecd86ed5c29c2115354f615 +https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.25.0-py313h54dd161_0.conda#1fe43bd1fc86e22ad3eb0edec637f8a2 +https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.33.1-hb4fd278_2.conda#81c545e27e527ca1be0cc04b74c20386 +https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.14.0-hb1c9500_1.conda#30da390c211967189c58f83ab58a6f0c +https://conda.anaconda.org/conda-forge/linux-64/bottleneck-1.6.0-py313h29aa505_0.conda#02405ff909c10e59bf13527f8df3910c +https://conda.anaconda.org/conda-forge/linux-64/cftime-1.6.4-py313h29aa505_2.conda#1363e8db910e403edc8fd486f8470ec6 +https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.39.0-hdbdcf42_0.conda#bd21962ff8a9d1ce4720d42a35a4af40 +https://conda.anaconda.org/conda-forge/linux-64/numba-0.61.2-py313h50b8c88_1.conda#53c79b7cdee329ed4c77cafe27600cdb +https://conda.anaconda.org/conda-forge/linux-64/numcodecs-0.16.1-py313h08cd8bf_1.conda#5c1c296392a81820e2332b3315f58b66 +https://conda.anaconda.org/conda-forge/linux-64/numexpr-2.12.1-py313h24ae7f9_100.conda#8e5d3d84d8091537034c021420853613 +https://conda.anaconda.org/conda-forge/noarch/numpy_groupies-0.11.3-pyhd8ed1ab_0.conda#5402c2b046432ceb2d192a82802e7854 +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.3.2-py313h08cd8bf_0.conda#5f4cc42e08d6d862b7b919a3c8959e0b +https://conda.anaconda.org/conda-forge/linux-64/scipy-1.16.2-py313h11c21cd_0.conda#85a80978a04be9c290b8fe6d9bccff1c +https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda#436c165519e140cb08d246a4472a9d6a +https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.606-h31ade35_1.conda#e33b3d2a2d44ba0fb35373d2343b71dd 
+https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.12.0-h8b27e44_3.conda#7b738aea4f1b8ae2d1118156ad3ae993 +https://conda.anaconda.org/conda-forge/noarch/distributed-2025.9.1-pyhcf101f3_0.conda#f140b63da44c9a3fc7ae75cb9cc53c47 +https://conda.anaconda.org/conda-forge/noarch/flox-0.10.6-pyhd8ed1ab_0.conda#40136da5d8e93ccbd406518154763fd9 +https://conda.anaconda.org/conda-forge/linux-64/netcdf4-1.7.2-nompi_py313hfae5b86_104.conda#b6ddba788230a41a534cf288d41a1df4 +https://conda.anaconda.org/conda-forge/noarch/numbagg-0.9.2-pyhd8ed1ab_0.conda#5e01f678d82477576cb4d56cc6e9357f +https://conda.anaconda.org/conda-forge/noarch/sparse-0.17.0-pyhcf101f3_0.conda#1b59de14a7e5888f939611e1fe329e00 +https://conda.anaconda.org/conda-forge/noarch/zarr-3.1.2-pyhcf101f3_0.conda#2bdb3950ea64a365bfe9e6414e748a9b +https://conda.anaconda.org/conda-forge/linux-64/libarrow-21.0.0-hb708d0b_3_cpu.conda#2d0305c8802fcba095d8d4e14e66ed3b +https://conda.anaconda.org/conda-forge/linux-64/libarrow-compute-21.0.0-h8c2c5c3_3_cpu.conda#b0b73752adfcbe6b73ef9f2eb5d5cf03 +https://conda.anaconda.org/conda-forge/linux-64/libparquet-21.0.0-h790f06f_3_cpu.conda#0568ba99a1f6c0ef7a04ca23dc78905a +https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-21.0.0-h635bf11_3_cpu.conda#12fe67afbd946adae49856b275478d0f +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-21.0.0-py313he109ebe_0_cpu.conda#3018b7f30825c21c47a7a1e061459f96 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-21.0.0-h635bf11_3_cpu.conda#630dfffcaf67b800607164d4b5b08bf7 +https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-21.0.0-h3f74fd7_3_cpu.conda#595ca398ad8dcac76a315f358e3312a6 +https://conda.anaconda.org/conda-forge/linux-64/pyarrow-21.0.0-py313h78bf25f_0.conda#1580ddd94606ccb60270877cb8838562 diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 00000000000..a90ab3bf304 --- /dev/null +++ b/noxfile.py @@ -0,0 +1,292 @@ +"""Perform test automation with 
nox. + +For further details, see https://nox.thea.codes/en/stable/# + +""" + +import hashlib +import os +from pathlib import Path + +import nox +from nox.logger import logger + +#: Default to reusing any pre-existing nox environments. +nox.options.reuse_existing_virtualenvs = True + +#: Python versions we can run sessions under +_PY_VERSIONS_ALL = ["3.11", "3.12", "3.13"] +_PY_VERSION_LATEST = _PY_VERSIONS_ALL[-1] + +#: One specific python version for docs builds +_PY_VERSION_DOCSBUILD = _PY_VERSION_LATEST + +#: Cirrus-CI environment variable hook. +PY_VER = os.environ.get("PY_VER", _PY_VERSIONS_ALL) + +#: Default cartopy cache directory. +CARTOPY_CACHE_DIR = os.environ.get("HOME") / Path(".local/share/cartopy") + +# https://github.com/numpy/numpy/pull/19478 +# https://github.com/matplotlib/matplotlib/pull/22099 +#: Common session environment variables. +ENV = dict(NPY_DISABLE_CPU_FEATURES="AVX512F,AVX512CD,AVX512_SKX") + + +def session_lockfile(session: nox.sessions.Session) -> Path: + """Return the path of the session lockfile.""" + # return Path(f"ci/requirements/locks/py{session.python.replace('.', '')}-linux-64.lock") + return Path(f"ci/requirements/locks/environment-benchmark-linux-64.lock") + + +def session_cachefile(session: nox.sessions.Session) -> Path: + """Return the path of the session lockfile cache.""" + lockfile = session_lockfile(session) + tmp_dir = Path(session.create_tmp()) + cache = tmp_dir / lockfile.name + return cache + + +def venv_populated(session: nox.sessions.Session) -> bool: + """List of packages in the lockfile installed. + + Returns True if the conda venv has been created. + """ + return session_cachefile(session).is_file() + + +def venv_changed(session: nox.sessions.Session) -> bool: + """Return True if the installed session is different. + + Compares to that specified in the lockfile. 
+ """ + changed = False + cache = session_cachefile(session) + lockfile = session_lockfile(session) + if cache.is_file(): + with open(lockfile, "rb") as fi: + expected = hashlib.sha256(fi.read()).hexdigest() + with open(cache, "r") as fi: + actual = fi.read() + changed = actual != expected + return changed + + +def cache_venv(session: nox.sessions.Session) -> None: + """Cache the nox session environment. + + This consists of saving a hexdigest (sha256) of the associated + conda lock file. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. + + """ + lockfile = session_lockfile(session) + cache = session_cachefile(session) + with open(lockfile, "rb") as fi: + hexdigest = hashlib.sha256(fi.read()).hexdigest() + with open(cache, "w") as fout: + fout.write(hexdigest) + + +def cache_cartopy(session: nox.sessions.Session) -> None: + """Determine whether to cache the cartopy natural earth shapefiles. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. + + """ + if not CARTOPY_CACHE_DIR.is_dir(): + session.run_always( + "python", + "-c", + "import cartopy; cartopy.io.shapereader.natural_earth()", + ) + + +def prepare_venv(session: nox.sessions.Session) -> None: + """Create and cache the nox session conda environment. + + Additionally provide conda environment package details and info. + + Note that, iris is installed into the environment using pip. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. 
+ + Notes + ----- + See + - https://github.com/theacodes/nox/issues/346 + - https://github.com/theacodes/nox/issues/260 + + """ + lockfile = session_lockfile(session) + venv_dir = session.virtualenv.location_name + + if not venv_populated(session): + # environment has been created but packages not yet installed + # populate the environment from the lockfile + logger.debug(f"Populating conda env at {venv_dir} using {lockfile}") + session.conda_install("--file", str(lockfile)) + cache_venv(session) + + elif venv_changed(session): + # destroy the environment and rebuild it + logger.debug(f"Lockfile changed. Re-creating conda env at {venv_dir}") + _re_orig = session.virtualenv.reuse_existing + session.virtualenv.reuse_existing = False + session.virtualenv.create() + session.conda_install("--file", str(lockfile)) + session.virtualenv.reuse_existing = _re_orig + cache_venv(session) + + logger.debug(f"Environment {venv_dir} is up to date") + + # cache_cartopy(session) + + # Determine whether verbose diagnostics have been requested + # from the command line. + verbose = "-v" in session.posargs or "--verbose" in session.posargs + + if verbose: + session.run_always("conda", "info") + session.run_always("conda", "list", f"--prefix={venv_dir}") + session.run_always( + "conda", + "list", + f"--prefix={venv_dir}", + "--explicit", + ) + + +@nox.session(python=PY_VER, venv_backend="conda") +def tests(session: nox.sessions.Session): + """Perform iris system, integration and unit tests. + + Coverage testing is enabled if the "--coverage" or "-c" flag is used. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. 
+ + """ + prepare_venv(session) + session.install("--no-deps", "--editable", ".") + session.env.update(ENV) + run_args = [ + "pytest", + "-n", + "auto", + "lib/iris/tests", + ] + if "-c" in session.posargs or "--coverage" in session.posargs: + run_args[-1:-1] = ["--cov=lib/iris", "--cov-report=xml"] + session.run(*run_args) + + +@nox.session(python=_PY_VERSION_DOCSBUILD, venv_backend="conda") +def doctest(session: nox.sessions.Session): + """Perform iris doctests and gallery. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. + + """ + prepare_venv(session) + session.install("--no-deps", "--editable", ".") + session.env.update(ENV) + session.cd("docs") + session.run( + "make", + "clean", + "html", + external=True, + ) + session.run( + "make", + "doctest", + external=True, + ) + + +@nox.session(python=_PY_VERSION_DOCSBUILD, venv_backend="conda") +def gallery(session: nox.sessions.Session): + """Perform iris gallery doc-tests. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. + + """ + prepare_venv(session) + session.install("--no-deps", "--editable", ".") + session.env.update(ENV) + session.run( + "pytest", + "-n", + "auto", + "docs/gallery_tests", + ) + + +@nox.session(python=PY_VER, venv_backend="conda") +def wheel(session: nox.sessions.Session): + """Perform iris local wheel install and import test. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. + + """ + prepare_venv(session) + session.cd("dist") + fname = list(Path(".").glob("scitools_iris-*.whl")) + if len(fname) == 0: + raise ValueError("Cannot find wheel to install.") + if len(fname) > 1: + emsg = f"Expected to find 1 wheel to install, found {len(fname)} instead." 
+ raise ValueError(emsg) + session.install(fname[0].name) + session.run( + "python", + "-c", + "import iris; print(f'{iris.__version__=}')", + external=True, + ) + + +@nox.session +def benchmarks(session: nox.sessions.Session): + """Run the Iris benchmark runner. Run session with `-- --help` for help. + + Parameters + ---------- + session : object + A `nox.sessions.Session` object. + + """ + if len(session.posargs) == 0: + message = ( + "This session MUST be run with at least one argument. The " + "arguments are passed down to the benchmark runner script. E.g:\n" + "nox -s benchmarks -- --help\n" + "nox -s benchmarks -- something --help\n" + "nox -s benchmarks -- something\n" + ) + session.error(message) + session.install("asv", "nox") + bm_runner_path = Path(__file__).parent / "benchmarks" / "bm_runner.py" + session.run("python", bm_runner_path, *session.posargs)