diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 7767b4c8d34c..e38703903929 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -57,7 +57,6 @@ benchmarks/asv.conf.json  @larsoner
 # CI config
 .circleci/  @larsoner
 .github/workflows/  @larsoner @andyfaff
-.cirrus.star  @larsoner @andyfaff
 
 # Doc
 requirements/doc.txt  @tupui
diff --git a/.github/label-globs.yml b/.github/label-globs.yml
index 18b6e046a254..e19700021f2c 100644
--- a/.github/label-globs.yml
+++ b/.github/label-globs.yml
@@ -156,7 +156,6 @@ CI:
     - .circleci/**
     - .github/workflows/**
     - ci/**
-    - .cirrus.star
 
 DX:
 - changed-files:
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index b06323bc4c97..b8faec729b26 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -285,7 +285,7 @@ jobs:
   #################################################################################
   prerelease_deps_coverage_64bit_blas:
     # TODO: re-enable ILP64 build.
-    name: Prerelease deps & coverage report, full, py3.11/npMin & py3.11/npPre, dev.py
+    name: Prerelease deps & coverage report, full, py3.11/npMin & py3.11/npPre, dev.py, SCIPY_ARRAY_API=1
     needs: get_commit_message
     if: >
       needs.get_commit_message.outputs.message == 1
@@ -362,6 +362,7 @@ jobs:
     - name: Test SciPy
       run: |
         export OPENBLAS_NUM_THREADS=1
+        export SCIPY_ARRAY_API=1
         python dev.py --no-build test --coverage -j2 --mode full -- --cov --cov-report term-missing
 
   #################################################################################
@@ -508,7 +509,6 @@ jobs:
       if: ${{ matrix.parallel == '1'}}
       env:
         # Excluded modules:
-        # - scipy.special and scipy.stats are waiting on special.errstate being made thread-safe
         # - scipy.spatial has multiple issues  in kdtree/qhull, and gh-20655 is pending.
         TEST_SUBMODULES: >-
           -t scipy.cluster
@@ -527,6 +527,8 @@ jobs:
           -t scipy.optimize
           -t scipy.signal
           -t scipy.sparse
+          -t scipy.special
+          -t scipy.stats
       run: |
         # Note: only fast tests; full test suite is unlikely to uncover anything more,
         #       and it'll be quite slow with pytest-run-parallel
diff --git a/.github/workflows/linux_blas_ilp64.yml b/.github/workflows/linux_blas_ilp64.yml
new file mode 100644
index 000000000000..5f4f4b9eda93
--- /dev/null
+++ b/.github/workflows/linux_blas_ilp64.yml
@@ -0,0 +1,129 @@
+name: BLAS tests (Linux)
+
+# This file is meant for testing different BLAS/LAPACK flavors and build
+# options on Linux. All other yml files for Linux will only test without BLAS
+# (mostly because that's easier and faster to build) or with the same 64-bit
+# OpenBLAS build that is used in the wheel jobs.
+#
+# Jobs and their purpose:
+#
+#   - mkl-lp64, mkl-ilp64:
+#         Tests MKL installed from PyPI (because easiest/fastest, if broken) in
+#         2 ways: an LP64 build and an ILP64 build, each checked via
+#         pkg-config.
+#
+#   - scipy-openblas-ilp64:
+#         Test ILP64-enabled build with scipy-openblas32 and scipy-openblas64.
+#
+
+on:
+  pull_request:
+    branches:
+      - main
+      - maintenance/**
+
+defaults:
+  run:
+    shell: bash
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+
+  mkl-lp64:
+    runs-on: ubuntu-latest
+    name: "MKL LP64"
+    steps:
+    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      with:
+        python-version: '3.11'
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y gfortran
+        pip install cython numpy pybind11 pythran pytest hypothesis pytest-xdist pooch
+        pip install -r requirements/dev.txt
+        pip install git+https://github.com/numpy/meson.git@main-numpymeson
+        pip install mkl mkl-devel
+
+    - name: Build with defaults (LP64)
+      run: |
+        pkg-config --libs mkl-dynamic-lp64-seq  # check link flags
+        python dev.py build -C-Dblas=mkl
+
+    - name: Test
+      run: python dev.py test -j 2
+
+
+  mkl-ilp64:
+    runs-on: ubuntu-latest
+    name: "MKL ILP64"
+    steps:
+    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      with:
+        python-version: '3.11'
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y gfortran
+        pip install cython numpy pybind11 pythran pytest hypothesis pytest-xdist pooch
+        pip install -r requirements/dev.txt
+        pip install git+https://github.com/numpy/meson.git@main-numpymeson
+        pip install mkl mkl-devel
+
+    - name: Build with ILP64
+      run: |
+        pkg-config --libs mkl-dynamic-ilp64-seq  # check link flags
+        python dev.py build -C-Dblas=mkl -C-Duse-ilp64=true
+
+    - name: Test
+      run: python dev.py test -j 2
+
+
+  scipy-openblas-ilp64:
+    runs-on: ubuntu-latest
+    name: "scipy-openblas ILP64"
+    steps:
+    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
+      with:
+        python-version: '3.11'
+
+    - name: Install dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y gfortran
+        pip install cython numpy pybind11 pythran pytest hypothesis pytest-xdist pooch
+        pip install -r requirements/dev.txt
+        pip install git+https://github.com/numpy/meson.git@main-numpymeson
+        pip install scipy-openblas32 scipy-openblas64
+        # dev.py does this for scipy-openblas32
+        python -c'import scipy_openblas64 as so64; print(so64.get_pkg_config())' > scipy-openblas64.pc
+        echo "PKG_CONFIG_PATH=$PWD" >> "$GITHUB_ENV"  # a plain `export` would not outlive this step
+
+    - name: Build with ILP64
+      run: |
+        python dev.py build --with-scipy-openblas -C-Duse-ilp64=true
+
+    - name: Test
+      run: python dev.py test -j 2
+
+
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index cc2f83189737..ed1a529ae0b0 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -217,3 +217,4 @@ jobs:
 
         pip install pooch pytest hypothesis
         python dev.py -n test
+
diff --git a/.github/workflows/macos_blas_ilp64.yml b/.github/workflows/macos_blas_ilp64.yml
new file mode 100644
index 000000000000..59164c0f0370
--- /dev/null
+++ b/.github/workflows/macos_blas_ilp64.yml
@@ -0,0 +1,56 @@
+name: macOS BLAS ILP64 tests
+
+on:
+  push:
+    branches:
+      - maintenance/**
+  pull_request:
+    branches:
+      - main
+      - maintenance/**
+
+permissions:
+   contents: read  # to fetch code (actions/checkout)
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  accelerate:
+    name: Accelerate (ILP64)
+    if: "github.repository == 'ev-br/scipy'"  # NOTE(review): job only runs in the ev-br/scipy fork; confirm this is intended
+    runs-on: macos-15
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        submodules: recursive
+
+    - name: Setup Python
+      uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'
+
+    - name: Build and Install SciPy
+      run: |
+        sudo xcode-select -s /Applications/Xcode_16.app
+
+        git submodule update --init
+        GFORTRAN_LOC=$(which gfortran-13)
+        ln -s $GFORTRAN_LOC gfortran
+        export PATH=$PWD:$PATH
+
+        # Ensure we have gfortran dylib
+        GFORTRAN_LIB=$(dirname `gfortran --print-file-name libgfortran.dylib`)
+        export DYLD_LIBRARY_PATH=$GFORTRAN_LIB
+
+        pip install click doit pydevtool rich_click meson cython pythran pybind11 ninja numpy
+        python dev.py build -C-Dblas=accelerate -C-Duse-ilp64=true
+
+        pip install pooch pytest hypothesis
+        python dev.py -n test -v
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 825823ba579e..8c8c18f80661 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -216,8 +216,7 @@ jobs:
         env:
           CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}*
           CIBW_ARCHS: ${{ matrix.buildplat[2] }}
-          CIBW_PRERELEASE_PYTHONS: True
-          CIBW_FREE_THREADED_SUPPORT: True
+          CIBW_ENABLE: cpython-freethreading cpython-prerelease
 
       - name: Rename macOS wheels
         if: startsWith( matrix.buildplat[0], 'macos-' )
diff --git a/doc/source/building/blas_lapack.rst b/doc/source/building/blas_lapack.rst
index 284c3672bb1e..61e27b0e618e 100644
--- a/doc/source/building/blas_lapack.rst
+++ b/doc/source/building/blas_lapack.rst
@@ -96,6 +96,44 @@ user wants to override this autodetection mechanism for building against plain
     $ python -m build -C-Duse-g77-abi=true -Csetup-args=-Dblas=blas -Csetup-args=-Dlapack=lapack 
 
 
+64-bit integer (ILP64) BLAS/LAPACK
+----------------------------------
+
+Support for ILP64 BLAS and LAPACK is still experimental; at the time of writing
+(Apr 2025) it is only available for two BLAS/LAPACK configurations: MKL and
+``scipy-openblas``.
+
+SciPy always requires LP64 (32-bit integer size) BLAS/LAPACK. You can build SciPy
+with *additional* ILP64 support. This will result in SciPy requiring both BLAS and
+LAPACK variants, where some extensions link to the ILP64 variant, while other
+extensions link to the LP64 variant. From Python, choosing the variant is done
+through the ``get_blas_funcs`` and ``get_lapack_funcs`` functions::
+
+    >>> from scipy.linalg.blas import get_blas_funcs
+    >>> daxpy = get_blas_funcs('axpy', (np.ones(3),), ilp64='preferred')
+    >>> daxpy.int_dtype
+    dtype('int64')
+
+Building with ILP64 support requires several NumPy additions to ``meson``, which have
+not been merged to upstream yet::
+
+    $ pip install git+https://github.com/numpy/meson.git@main-numpymeson
+
+For a development build with MKL, install the library and its development headers, and
+pass the ``use-ilp64=true`` option on the command line::
+
+    $ pip install mkl mkl-devel
+    $ python dev.py build -C-Dblas=mkl -C-Duse-ilp64=true
+
+For a development build with ``scipy-openblas64``, make sure you have installed both
+``scipy-openblas32`` and ``scipy-openblas64``, and generate the pkg-config file
+for the ILP64 variant::
+
+    $ python -c'import scipy_openblas64 as so64; print(so64.get_pkg_config())' > scipy-openblas64.pc
+    $ export PKG_CONFIG_PATH=`pwd`
+    $ python dev.py build --with-scipy-openblas -C-Duse-ilp64=true
+
+
 Work-in-progress
 ----------------
 
diff --git a/doc/source/dev/contributor/continuous_integration.rst b/doc/source/dev/contributor/continuous_integration.rst
index 242e5293eee3..389bf957a98e 100644
--- a/doc/source/dev/contributor/continuous_integration.rst
+++ b/doc/source/dev/contributor/continuous_integration.rst
@@ -56,12 +56,6 @@ CircleCI
 * ``run_benchmarks``: verify how the changes impact performance
 * ``refguide_check``: doctests from examples and benchmarks
 
-CirrusCI
---------
-* ``Tests``: test suite for specific architecture like
-  ``musllinux, arm, aarch``
-* ``Wheels``: build and upload some wheels
-
 .. _skip-ci:
 
 Skipping
@@ -79,7 +73,6 @@ Skipping CI can be achieved by adding a special text in the commit message:
 
 * ``[skip actions]``: will skip GitHub Actions
 * ``[skip circle]``: will skip CircleCI
-* ``[skip cirrus]``: will skip CirrusCI
 * ``[docs only]``: will skip *all but* the CircleCI checks and the linter
 * ``[lint only]``: will skip *all but* the linter
 * ``[skip ci]``: will skip *all* CI
@@ -88,7 +81,7 @@ Of course, you can combine these to skip multiple workflows.
 
 This skip information should be placed on a new line. In this example, we
 just updated a ``.rst`` file in the documentation and ask to skip all but the
-relevant docs checks (skip Cirrus and GitHub Actions' workflows)::
+relevant docs checks (skip GitHub Actions' workflows)::
 
     DOC: improve QMCEngine examples.
 
diff --git a/meson.options b/meson.options
index 3257cb8a8ff5..b7c3968ebdb0 100644
--- a/meson.options
+++ b/meson.options
@@ -2,6 +2,15 @@ option('blas', type: 'string', value: 'openblas',
         description: 'option for BLAS library switching')
 option('lapack', type: 'string', value: 'openblas',
         description: 'option for LAPACK library switching')
+
+# NB ILP64 build is experimental.
+# See https://scipy.github.io/devdocs/building/blas_lapack.html for details
+option('use-ilp64', type: 'boolean', value: false,
+       description: 'Use ILP64 (64-bit integer) BLAS and LAPACK interfaces')
+option('blas-symbol-suffix', type: 'string', value: 'auto',
+        description: 'BLAS and LAPACK symbol suffix to use, if any')
+option('mkl-threading', type: 'string', value: 'auto',
+        description: 'MKL threading method, one of: `seq`, `iomp`, `gomp`, `tbb`')
 option('use-g77-abi', type: 'boolean', value: false,
         description: 'If set to true, forces using g77 compatibility wrappers ' +
                      'for LAPACK functions. The default is to use gfortran ' +
diff --git a/mypy.ini b/mypy.ini
index e45c588cadf7..ffc23fb21bb9 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -120,7 +120,7 @@ ignore_missing_imports = True
 [mypy-scipy.optimize._bglu_dense]
 ignore_missing_imports = True
 
-[mypy-scipy.optimize._slsqp]
+[mypy-scipy.optimize._slsqplib]
 ignore_missing_imports = True
 
 [mypy-scipy.interpolate._dfitpack]
diff --git a/pyproject.toml b/pyproject.toml
index 1d7f207e9dab..1181de3fe34f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,7 +82,7 @@ test = [
     "scikit-umfpack",
     "pooch",
     "hypothesis>=6.30",
-    "array-api-strict>=2.3",
+    "array-api-strict>=2.3.1",
     "Cython",
     "meson",
     'ninja; sys_platform != "emscripten"',
diff --git a/requirements/test.txt b/requirements/test.txt
index 666ff489f4ed..78a5873d9038 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -11,7 +11,7 @@ threadpoolctl
 # scikit-umfpack  # circular dependency issues
 pooch
 hypothesis>=6.30
-array-api-strict>=2.0,<2.1.1
+array-api-strict>=2.3.1
 Cython
 meson
 ninja; sys_platform != "emscripten"
diff --git a/scipy/_build_utils/int64.f2cmap.in b/scipy/_build_utils/int64.f2cmap.in
new file mode 100644
index 000000000000..86ffa326b6e6
--- /dev/null
+++ b/scipy/_build_utils/int64.f2cmap.in
@@ -0,0 +1 @@
+{'integer': {'': '@int64_name@'}, 'logical': {'': '@int64_name@'}}
diff --git a/scipy/_build_utils/src/_blas64_defines.h b/scipy/_build_utils/src/_blas64_defines.h
new file mode 100644
index 000000000000..20d9834079ca
--- /dev/null
+++ b/scipy/_build_utils/src/_blas64_defines.h
@@ -0,0 +1,33 @@
+/*
+ * A common include for fblas_64 and flapack_64 f2py sources.
+ *
+ * f2py accounts for the Fortran name mangling (uppercase/lowercase, trailing underscore),
+ * via its hardcoded F_FUNC define.
+ *
+ * For ILP64 variants, we need a more flexible naming scheme, to potentially include
+ * the _64 or 64_ suffixes. This is what the `BLAS_FUNC` macro from `npy_cblas.h` does.
+ *
+ * We therefore inject the define into the f2py-generated sources. 
+ */
+
+#ifdef F_FUNC
+#undef F_FUNC
+#endif
+
+#include "npy_cblas.h"
+#define F_FUNC(f, F) BLAS_FUNC(f)
+
+#ifdef FIX_MKL_2025_ILP64_MISSING_SYMBOL
+#define cspr_64_ cspr_64
+#endif
+
+#define F_INT npy_int64
+
+
+#ifndef HAVE_BLAS_ILP64
+#error("HAVE_BLAS_ILP64 not defined.")
+#endif
+
+#ifndef BLAS_SYMBOL_SUFFIX
+#error("BLAS_SYMBOL_SUFFIX  not defined")
+#endif
diff --git a/scipy/_build_utils/src/npy_cblas.h b/scipy/_build_utils/src/npy_cblas.h
index de65ad903284..56c5e05916bd 100644
--- a/scipy/_build_utils/src/npy_cblas.h
+++ b/scipy/_build_utils/src/npy_cblas.h
@@ -26,6 +26,21 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
 
 #define CBLAS_INDEX size_t  /* this may vary between platforms */
 
+#ifdef ACCELERATE_NEW_LAPACK
+    #if __MAC_OS_X_VERSION_MAX_ALLOWED < 130300
+        #ifdef HAVE_BLAS_ILP64
+            #error "Accelerate ILP64 support is only available with macOS 13.3 SDK or later"
+        #endif
+    #else
+       /* #define NO_APPEND_FORTRAN */
+        #ifdef HAVE_BLAS_ILP64
+            #define BLAS_SYMBOL_SUFFIX $NEWLAPACK$ILP64
+        #else
+            #define BLAS_SYMBOL_SUFFIX $NEWLAPACK
+        #endif
+    #endif
+#endif
+
 #ifdef NO_APPEND_FORTRAN
 #define BLAS_FORTRAN_SUFFIX
 #else
@@ -50,7 +65,6 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
 #define BLAS_FUNC_CONCAT(name,prefix,suffix,suffix2) prefix ## name ## suffix ## suffix2
 #define BLAS_FUNC_EXPAND(name,prefix,suffix,suffix2) BLAS_FUNC_CONCAT(name,prefix,suffix,suffix2)
 
-#define CBLAS_FUNC(name) BLAS_FUNC_EXPAND(name,BLAS_SYMBOL_PREFIX,,BLAS_SYMBOL_SUFFIX)
 /*
  * Use either the OpenBLAS scheme with the `64_` suffix behind the Fortran
  * compiler symbol mangling, or the MKL scheme (and upcoming
@@ -62,6 +76,12 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
 #define BLAS_FUNC(name) BLAS_FUNC_EXPAND(name,BLAS_SYMBOL_PREFIX,BLAS_SYMBOL_SUFFIX,BLAS_FORTRAN_SUFFIX)
 #endif
 
+/*
+ * Note that CBLAS doesn't include Fortran compiler symbol mangling, so ends up
+ * being the same in both schemes
+ */
+#define CBLAS_FUNC(name) BLAS_FUNC_EXPAND(name,BLAS_SYMBOL_PREFIX,,BLAS_SYMBOL_SUFFIX)
+
 #ifdef HAVE_BLAS_ILP64
 #define CBLAS_INT npy_int64
 #define CBLAS_INT_MAX NPY_MAX_INT64
diff --git a/scipy/_build_utils/src/wrap_g77_abi.c b/scipy/_build_utils/src/wrap_g77_abi.c
index ac11f9c53c57..f8ff60d5a325 100644
--- a/scipy/_build_utils/src/wrap_g77_abi.c
+++ b/scipy/_build_utils/src/wrap_g77_abi.c
@@ -25,6 +25,12 @@ return values, struct complex arguments work without segfaulting.
 #include "npy_cblas.h"
 #include "fortran_defs.h"
 
+#ifdef HAVE_BLAS_ILP64
+/* NB: this redefines F_FUNC */
+#include "_blas64_defines.h"
+#endif
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/scipy/_lib/_array_api.py b/scipy/_lib/_array_api.py
index b7fb0c25ce35..a724ff74144a 100644
--- a/scipy/_lib/_array_api.py
+++ b/scipy/_lib/_array_api.py
@@ -39,12 +39,13 @@
 
 __all__ = [
     '_asarray', 'array_namespace', 'assert_almost_equal', 'assert_array_almost_equal',
-    'get_xp_devices', 'default_xp', 'is_lazy_array', 'is_marray',
+    'default_xp', 'is_lazy_array', 'is_marray',
     'is_array_api_strict', 'is_complex', 'is_cupy', 'is_jax', 'is_numpy', 'is_torch', 
     'SCIPY_ARRAY_API', 'SCIPY_DEVICE', 'scipy_namespace_for',
     'xp_assert_close', 'xp_assert_equal', 'xp_assert_less',
     'xp_copy', 'xp_device', 'xp_ravel', 'xp_size',
     'xp_unsupported_param_msg', 'xp_vector_norm', 'xp_capabilities',
+    'xp_result_type', 'xp_promote'
 ]
 
 
@@ -420,42 +421,6 @@ def is_complex(x: Array, xp: ModuleType) -> bool:
     return xp.isdtype(x.dtype, 'complex floating')
 
 
-def get_xp_devices(xp: ModuleType) -> list[str] | list[None]:
-    """Returns a list of available devices for the given namespace."""
-    devices: list[str] = []
-    if is_torch(xp):
-        devices += ['cpu']
-        import torch # type: ignore[import]
-        num_cuda = torch.cuda.device_count()
-        for i in range(0, num_cuda):
-            devices += [f'cuda:{i}']
-        if torch.backends.mps.is_available():
-            devices += ['mps']
-        return devices
-    elif is_cupy(xp):
-        import cupy # type: ignore[import]
-        num_cuda = cupy.cuda.runtime.getDeviceCount()
-        for i in range(0, num_cuda):
-            devices += [f'cuda:{i}']
-        return devices
-    elif is_jax(xp):
-        import jax # type: ignore[import]
-        num_cpu = jax.device_count(backend='cpu')
-        for i in range(0, num_cpu):
-            devices += [f'cpu:{i}']
-        num_gpu = jax.device_count(backend='gpu')
-        for i in range(0, num_gpu):
-            devices += [f'gpu:{i}']
-        num_tpu = jax.device_count(backend='tpu')
-        for i in range(0, num_tpu):
-            devices += [f'tpu:{i}']
-        return devices
-
-    # given namespace is not known to have a list of available devices;
-    # return `[None]` so that one can use this in tests for `device=None`.
-    return [None]
-
-
 def scipy_namespace_for(xp: ModuleType) -> ModuleType | None:
     """Return the `scipy`-like namespace of a non-NumPy backend
 
@@ -513,31 +478,87 @@ def xp_ravel(x: Array, /, *, xp: ModuleType | None = None) -> Array:
     return xp.reshape(x, (-1,))
 
 
-# utility to broadcast arrays and promote to common dtype
-def xp_broadcast_promote(*args, ensure_writeable=False, force_floating=False, xp=None):
-    xp = array_namespace(*args) if xp is None else xp
-
-    args = [(_asarray(arg, subok=True) if arg is not None else arg) for arg in args]
+# utility to find common dtype with option to force floating
+def xp_result_type(*args, force_floating=False, xp):
+    """
+    Returns the dtype that results from applying type promotion rules
+    (see Array API Standard Type Promotion Rules) to the arguments. Augments
+    standard `result_type` in a few ways:
+
+    - There is a `force_floating` argument that ensures that the result type
+      is floating point, even when all args are integer.     
+    - When a TypeError is raised (e.g. due to an unsupported promotion)
+      and `force_floating=True`, we define a custom rule: use the result type
+      of the default float and any other floats passed. See
+      https://github.com/scipy/scipy/pull/22695/files#r1997905891
+      for rationale.
+    - This function accepts array-like iterables, which are immediately converted
+      to the namespace's arrays before result type calculation. Consequently, the
+      result dtype may be different when an argument is `1.` vs `[1.]`.
+
+    Typically, this function will be called shortly after `array_namespace`
+    on a subset of the arguments passed to `array_namespace`.
+    """
+    args = [(_asarray(arg, subok=True, xp=xp) if np.iterable(arg) else arg)
+            for arg in args]
     args_not_none = [arg for arg in args if arg is not None]
+    if force_floating:
+        args_not_none.append(1.0)
 
-    # determine minimum dtype
-    default_float = xp.asarray(1.).dtype
-    dtypes = [arg.dtype for arg in args_not_none]
-    try:  # follow library's prefered mixed promotion rules
-        dtype = xp.result_type(*dtypes)
-        if force_floating and xp.isdtype(dtype, 'integral'):
-            # If we were to add `default_float` before checking whether the result
-            # type is otherwise integral, we risk promotion from lower float.
-            dtype = xp.result_type(dtype, default_float)
+    if is_numpy(xp) and xp.__version__ < '2.0':
+        # Follow NEP 50 promotion rules anyway
+        args_not_none = [arg.dtype if getattr(arg, 'size', 0) == 1 else arg
+                         for arg in args_not_none]
+        return xp.result_type(*args_not_none)
+
+    try:  # follow library's preferred promotion rules
+        return xp.result_type(*args_not_none)
     except TypeError:  # mixed type promotion isn't defined
-        float_dtypes = [dtype for dtype in dtypes
-                        if not xp.isdtype(dtype, 'integral')]
-        if float_dtypes:
-            dtype = xp.result_type(*float_dtypes, default_float)
-        elif force_floating:
-            dtype = default_float
-        else:
-            dtype = xp.result_type(*dtypes)
+        if not force_floating:
+            raise
+        # use `result_type` of default floating point type and any floats present
+        # This can be revisited, but right now, the only backends that get here
+        # are array-api-strict (which is not for production use) and PyTorch
+        # (due to data-apis/array-api-compat#279).
+        float_args = []
+        for arg in args_not_none:
+            arg_array = xp.asarray(arg) if np.isscalar(arg) else arg
+            dtype = getattr(arg_array, 'dtype', arg)
+            if xp.isdtype(dtype, ('real floating', 'complex floating')):
+                float_args.append(arg)
+        return xp.result_type(*float_args, xp_default_dtype(xp))
+
+
+def xp_promote(*args, broadcast=False, force_floating=False, xp):
+    """
+    Promotes elements of *args to result dtype, ignoring `None`s.
+    Includes options for forcing promotion to floating point and
+    broadcasting the arrays, again ignoring `None`s.
+    Type promotion rules follow `xp_result_type` instead of `xp.result_type`.
+
+    Typically, this function will be called shortly after `array_namespace`
+    on a subset of the arguments passed to `array_namespace`.
+
+    This function accepts array-like iterables, which are immediately converted
+    to the namespace's arrays before result type calculation. Consequently, the
+    result dtype may be different when an argument is `1.` vs `[1.]`.
+    
+    See Also
+    --------
+    xp_result_type
+    """
+    args = [(_asarray(arg, subok=True, xp=xp) if np.iterable(arg) else arg)
+            for arg in args]  # solely to prevent double conversion of iterable to array
+
+    dtype = xp_result_type(*args, force_floating=force_floating, xp=xp)
+
+    args = [(_asarray(arg, dtype=dtype, subok=True, xp=xp) if arg is not None else arg)
+            for arg in args]
+
+    if not broadcast:
+        return args[0] if len(args)==1 else tuple(args)
+
+    args_not_none = [arg for arg in args if arg is not None]
 
     # determine result shape
     shapes = {arg.shape for arg in args_not_none}
@@ -561,12 +582,13 @@ def xp_broadcast_promote(*args, ensure_writeable=False, force_floating=False, xp
             kwargs = {'subok': True} if is_numpy(xp) else {}
             arg = xp.broadcast_to(arg, shape, **kwargs)
 
-        # convert dtype/copy only if needed
-        if (arg.dtype != dtype) or ensure_writeable:
-            arg = xp.astype(arg, dtype, copy=True)
+        # This is much faster than xp.astype(arg, dtype, copy=False)
+        if arg.dtype != dtype:
+            arg = xp.astype(arg, dtype)
+
         out.append(arg)
 
-    return out
+    return out[0] if len(out)==1 else tuple(out)
 
 
 def xp_float_to_complex(arr: Array, xp: ModuleType | None = None) -> Array:
diff --git a/scipy/_lib/_elementwise_iterative_method.py b/scipy/_lib/_elementwise_iterative_method.py
index 05efe86d31c1..c0d5a3d06ae0 100644
--- a/scipy/_lib/_elementwise_iterative_method.py
+++ b/scipy/_lib/_elementwise_iterative_method.py
@@ -15,7 +15,7 @@
 import math
 import numpy as np
 from ._util import _RichResult, _call_callback_maybe_halt
-from ._array_api import array_namespace, xp_size
+from ._array_api import array_namespace, xp_size, xp_result_type
 
 _ESIGNERR = -1
 _ECONVERR = -2
@@ -82,9 +82,8 @@ def _initialize(func, xs, args, complex_ok=False, preserve_shape=None, xp=None):
     # and cause failure.
     # There might be benefit to combining the `xs` into a single array and
     # calling `func` once on the combined array. For now, keep them separate.
+    xat = xp_result_type(*xs, force_floating=True, xp=xp)
     xas = xp.broadcast_arrays(*xs, *args)  # broadcast and rename
-    xat = xp.result_type(*[xa.dtype for xa in xas])
-    xat = xp.asarray(1.).dtype if xp.isdtype(xat, "integral") else xat
     xs, args = xas[:nx], xas[nx:]
     xs = [xp.asarray(x, dtype=xat) for x in xs]  # use copy=False when implemented
     fs = [xp.asarray(func(x, *args)) for x in xs]
diff --git a/scipy/_lib/_util.py b/scipy/_lib/_util.py
index 88ecb85c5bfc..2de7552b45d9 100644
--- a/scipy/_lib/_util.py
+++ b/scipy/_lib/_util.py
@@ -12,11 +12,11 @@
 
 import numpy as np
 from scipy._lib._array_api import (Array, array_namespace, is_lazy_array,
-                                   is_numpy, is_marray, xp_size)
+                                   is_numpy, is_marray, xp_size, xp_result_type)
 from scipy._lib._docscrape import FunctionDoc, Parameter
 from scipy._lib._sparse import issparse
 
-from numpy.exceptions import AxisError, DTypePromotionError
+from numpy.exceptions import AxisError
 
 
 np_long: type
@@ -1012,13 +1012,7 @@ def _rng_spawn(rng, n_children):
 def _get_nan(*data, xp=None):
     xp = array_namespace(*data) if xp is None else xp
     # Get NaN of appropriate dtype for data
-    data = [xp.asarray(item) for item in data]
-    try:
-        min_float = getattr(xp, 'float16', xp.float32)
-        dtype = xp.result_type(*data, min_float)  # must be at least a float
-    except DTypePromotionError:
-        # fallback to float64
-        dtype = xp.float64
+    dtype = xp_result_type(*data, force_floating=True, xp=xp)
     res = xp.asarray(xp.nan, dtype=dtype)[()]
     # whenever mdhaber/marray#89 is resolved, could just return `res`
     return res.data if is_marray(xp) else res
diff --git a/scipy/_lib/array_api_compat b/scipy/_lib/array_api_compat
index 8d991b437cdc..621494be1bd8 160000
--- a/scipy/_lib/array_api_compat
+++ b/scipy/_lib/array_api_compat
@@ -1 +1 @@
-Subproject commit 8d991b437cdcdf2cd91bef33fbfd491a409cb64f
+Subproject commit 621494be1bd8682f1d76ae874272c12464953d3d
diff --git a/scipy/_lib/array_api_extra b/scipy/_lib/array_api_extra
index de481f2cac82..0d26a7462a3f 160000
--- a/scipy/_lib/array_api_extra
+++ b/scipy/_lib/array_api_extra
@@ -1 +1 @@
-Subproject commit de481f2cac821c2db7ab2a45b83ed29963c2e1eb
+Subproject commit 0d26a7462a3fbf5ed9e42e261bdb3b39f25e2faf
diff --git a/scipy/_lib/meson.build b/scipy/_lib/meson.build
index acf54b58c2cf..01cedf0d94a1 100644
--- a/scipy/_lib/meson.build
+++ b/scipy/_lib/meson.build
@@ -211,6 +211,7 @@ py3.install_sources(
     'array_api_compat/array_api_compat/torch/__init__.py',
     'array_api_compat/array_api_compat/torch/_aliases.py',
     'array_api_compat/array_api_compat/torch/_info.py',
+    'array_api_compat/array_api_compat/torch/_typing.py',
     'array_api_compat/array_api_compat/torch/fft.py',
     'array_api_compat/array_api_compat/torch/linalg.py',
   ],
diff --git a/scipy/_lib/tests/test_array_api.py b/scipy/_lib/tests/test_array_api.py
index d0da4d45135c..663ee0ac0024 100644
--- a/scipy/_lib/tests/test_array_api.py
+++ b/scipy/_lib/tests/test_array_api.py
@@ -1,14 +1,17 @@
+import re
+
 import numpy as np
 import pytest
 
 from scipy._lib._array_api import (
     _GLOBAL_CONFIG, array_namespace, _asarray, xp_copy, xp_assert_equal, is_numpy,
-    np_compat, xp_default_dtype
+    np_compat, xp_default_dtype, xp_result_type, is_torch
 )
 from scipy._lib import array_api_extra as xpx
 from scipy._lib._array_api_no_0d import xp_assert_equal as xp_assert_equal_no_0d
 from scipy._lib.array_api_extra.testing import lazy_xp_function
 
+
 lazy_xp_function(_asarray, static_argnames=(
                  "dtype", "order", "copy", "xp", "check_finite", "subok"))
 lazy_xp_function(xp_copy, static_argnames=("xp", ))
@@ -225,3 +228,96 @@ def test_check_scalar_no_0d(self, xp):
 
     def test_default_dtype(self, xp):
         assert xp_default_dtype(xp) == xp.asarray(1.).dtype
+
+
+scalars = [1, 1., 1. + 1j]
+lists = [[1], [1.], [1. + 1j]]
+types = ('int8 int16 int32 int64 '
+         'uint8 uint16 uint32 uint64 '
+         'float32 float64 complex64 complex128').split()
+arrays = [np.asarray([1], dtype=getattr(np, t)) for t in types]
+
+
+def convert_type(x, xp):
+    # Map one parametrized test input onto namespace `xp`: NumPy arrays become
+    # `xp` arrays and dtype-name strings become the `xp` attribute of that name;
+    # anything else (Python scalars and lists here) is returned unchanged.
+    if isinstance(x, np.ndarray):
+        return xp.asarray(x)
+    elif isinstance(x, str):
+        return getattr(xp, x)
+    return x
+
+
+def is_inexact(x, xp):
+    # Determine whether `x` is of inexact (real or complex floating) dtype
+    x = xp.asarray(x) if np.isscalar(x) or isinstance(x, list) else x
+    dtype = getattr(x, 'dtype', x)  # `x` may already be a dtype object
+    return xp.isdtype(dtype, ('real floating', 'complex floating'))
+
+
+@pytest.mark.parametrize('x', scalars + lists + types + arrays)
+@pytest.mark.parametrize('y', scalars + lists + types + arrays)
+def test_xp_result_type_no_force(x, y, xp):
+    # With the default force_floating=False, `xp_result_type` must agree with
+    # `xp.result_type` on the same arguments (lists converted to `xp` arrays):
+    # same dtype, or the same exception type and message if the reference raises.
+    x = convert_type(x, xp)
+    y = convert_type(y, xp)
+    x_ref = xp.asarray(x) if isinstance(x, list) else x
+    y_ref = xp.asarray(y) if isinstance(y, list) else y
+
+    try:
+        dtype_ref = xp.result_type(x_ref, y_ref)
+        expected_error = None
+    except Exception as e:
+        expected_error = (type(e), str(e))
+
+    if expected_error is not None:
+        with pytest.raises(expected_error[0], match=re.escape(expected_error[1])):
+            xp_result_type(x, y, xp=xp)
+        return
+
+    dtype_res = xp_result_type(x, y, xp=xp)
+    assert dtype_res == dtype_ref
+
+
+@pytest.mark.parametrize('x', scalars + lists + types + arrays)
+@pytest.mark.parametrize('y', scalars + lists + types + arrays)
+def test_xp_result_type_force_floating(x, y, xp):
+    # When `force_floating==True`, behavior of `xp_result_type`
+    # should match that of `xp.result_type` with `1.0` appended to the set of
+    # arguments (after converting lists to arrays of type `xp`).
+    # If this raises a `TypeError`, which is the case when the result
+    # type is not defined by the standard, the result type should be
+    # the result type of any inexact (real or complex floating) arguments
+    # and the default floating point type.
+    if (is_torch(xp) and not(isinstance(x, str) or isinstance(y, str))
+            and np.isscalar(x) and np.isscalar(y)):
+        pytest.skip("See 3/27/2024 comment at  data-apis/array-api-compat#277")
+
+    x = convert_type(x, xp)
+    y = convert_type(y, xp)
+    x_ref = xp.asarray(x) if isinstance(x, list) else x
+    y_ref = xp.asarray(y) if isinstance(y, list) else y
+
+    expected_error = None
+    try:
+        dtype_ref = xp.result_type(x_ref, y_ref, 1.0)
+    except TypeError:  # result type not defined by the standard
+        args = []
+        if is_inexact(x_ref, xp):
+            args.append(x_ref)
+        if is_inexact(y_ref, xp):
+            args.append(y_ref)
+        dtype_ref = xp.result_type(*args, xp.asarray(1.0))
+    except Exception as e:  # any other reference failure must be reproduced
+        expected_error = (type(e), str(e))
+
+    if expected_error is not None:
+        with pytest.raises(expected_error[0], match=re.escape(expected_error[1])):
+            xp_result_type(x, y, force_floating=True, xp=xp)
+        return
+
+    dtype_res = xp_result_type(x, y, force_floating=True, xp=xp)
+    assert dtype_res == dtype_ref
diff --git a/scipy/cluster/hierarchy.py b/scipy/cluster/hierarchy.py
index 2fb11dcf0651..56620ad9a8d9 100644
--- a/scipy/cluster/hierarchy.py
+++ b/scipy/cluster/hierarchy.py
@@ -134,7 +134,8 @@
 import numpy as np
 from . import _hierarchy, _optimal_leaf_ordering
 import scipy.spatial.distance as distance
-from scipy._lib._array_api import array_namespace, _asarray, xp_copy, is_jax
+from scipy._lib._array_api import (_asarray, array_namespace, is_dask, is_jax,
+                                   is_lazy_array, xp_copy)
 from scipy._lib._disjoint_set import DisjointSet
 import scipy._lib.array_api_extra as xpx
 
@@ -1005,6 +1006,7 @@ def linkage(y, method='single', metric='euclidean', optimal_ordering=False):
     """
     xp = array_namespace(y)
     y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
+    lazy = is_lazy_array(y)
 
     if method not in _LINKAGE_METHODS:
         raise ValueError(f"Invalid method: {method}")
@@ -1016,35 +1018,40 @@ def linkage(y, method='single', metric='euclidean', optimal_ordering=False):
     if y.ndim == 1:
         distance.is_valid_y(y, throw=True, name='y')
     elif y.ndim == 2:
-        if (y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0) and
-                xp.all(y >= 0) and np.allclose(y, y.T)):
+        if (not lazy and y.shape[0] == y.shape[1]
+            and xp.all(xpx.isclose(xp.linalg.diagonal(y), 0))
+            and xp.all(y >= 0) and xp.all(xpx.isclose(y, y.T))):
             warnings.warn('The symmetric non-negative hollow observation '
                           'matrix looks suspiciously like an uncondensed '
                           'distance matrix',
                           ClusterWarning, stacklevel=2)
         y = distance.pdist(y, metric)
-        y = xp.asarray(y)
     else:
         raise ValueError("`y` must be 1 or 2 dimensional.")
 
-    if not xp.all(xp.isfinite(y)):
+    if not lazy and not xp.all(xp.isfinite(y)):
         raise ValueError("The condensed distance matrix must contain only "
                          "finite values.")
 
-    n = int(distance.num_obs_y(y))
+    n = distance.num_obs_y(y)
     method_code = _LINKAGE_METHODS[method]
 
-    y = np.asarray(y)
-    if method == 'single':
-        result = _hierarchy.mst_single_linkage(y, n)
-    elif method in ['complete', 'average', 'weighted', 'ward']:
-        result = _hierarchy.nn_chain(y, n, method_code)
-    else:
-        result = _hierarchy.fast_linkage(y, n, method_code)
-    result = xp.asarray(result)
+    def cy_linkage(y, validate):  # NumPy-side kernel run via xpx.lazy_apply
+        if validate and not np.all(np.isfinite(y)):
+            raise ValueError("The condensed distance matrix must contain only "
+                             "finite values.")
+
+        if method == 'single':
+            return _hierarchy.mst_single_linkage(y, n)
+        elif method in ('complete', 'average', 'weighted', 'ward'):
+            return _hierarchy.nn_chain(y, n, method_code)
+        else:
+            return _hierarchy.fast_linkage(y, n, method_code)
+
+    result = xpx.lazy_apply(cy_linkage, y, validate=lazy,
+                            shape=(n - 1, 4), dtype=xp.float64, as_numpy=True)
 
     if optimal_ordering:
-        y = xp.asarray(y)
         return optimal_leaf_ordering(result, y)
     else:
         return result
@@ -1514,31 +1521,39 @@ def optimal_leaf_ordering(Z, y, metric='euclidean'):
     """
     xp = array_namespace(Z, y)
     Z = _asarray(Z, order='C', xp=xp)
-    is_valid_linkage(Z, throw=True, name='Z')
-
     y = _asarray(y, order='C', dtype=xp.float64, xp=xp)
+    lazy = is_lazy_array(Z)
+    _is_valid_linkage(Z, throw=True, name='Z')
 
     if y.ndim == 1:
         distance.is_valid_y(y, throw=True, name='y')
     elif y.ndim == 2:
-        if (y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0) and
-                np.all(y >= 0) and np.allclose(y, y.T)):
+        if (not lazy and y.shape[0] == y.shape[1]
+            and xp.all(xpx.isclose(xp.linalg.diagonal(y), 0))
+            and xp.all(y >= 0) and xp.all(xpx.isclose(y, y.T))):
             warnings.warn('The symmetric non-negative hollow observation '
                           'matrix looks suspiciously like an uncondensed '
                           'distance matrix',
                           ClusterWarning, stacklevel=2)
         y = distance.pdist(y, metric)
-        y = xp.asarray(y)
     else:
         raise ValueError("`y` must be 1 or 2 dimensional.")
-
-    if not xp.all(xp.isfinite(y)):
+    if not lazy and not xp.all(xp.isfinite(y)):
         raise ValueError("The condensed distance matrix must contain only "
                          "finite values.")
 
-    Z = np.asarray(Z)
-    y = np.asarray(y)
-    return xp.asarray(_optimal_leaf_ordering.optimal_leaf_ordering(Z, y))
+    # The function name is prominently visible on the user-facing Dask dashboard;
+    # make sure it is meaningful.
+    def optimal_leaf_ordering_(Z, y, validate):
+        if validate:
+            is_valid_linkage(Z, throw=True, name='Z')
+            if not np.all(np.isfinite(y)):
+                raise ValueError("The condensed distance matrix must contain only "
+                                 "finite values.")
+        return _optimal_leaf_ordering.optimal_leaf_ordering(Z, y)
+
+    return xpx.lazy_apply(optimal_leaf_ordering_, Z, y, validate=lazy,
+                          shape=Z.shape, dtype=Z.dtype, as_numpy=True)
 
 
 def cophenet(Z, Y=None):
@@ -1924,10 +1939,9 @@ def to_mlab_linkage(Z):
     """
     xp = array_namespace(Z)
     Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
-    Zs = Z.shape
-    if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0):
+    if Z.ndim == 0 or (Z.ndim == 1 and Z.shape[0] == 0):
         return xp_copy(Z, xp=xp)
-    is_valid_linkage(Z, throw=True, name='Z')
+    _is_valid_linkage(Z, throw=True, name='Z')
 
     return xp.concat((Z[:, :2] + 1.0, Z[:, 2:3]), axis=1)
 
@@ -2012,7 +2026,7 @@ def is_monotonic(Z):
     """
     xp = array_namespace(Z)
     Z = _asarray(Z, order='c', xp=xp)
-    is_valid_linkage(Z, throw=True, name='Z')
+    _is_valid_linkage(Z, throw=True, name='Z')
 
     # We expect the i'th value to be greater than its successor.
     return xp.all(Z[1:, 2] >= Z[:-1, 2])
@@ -2042,7 +2056,13 @@ def is_valid_im(R, warning=False, throw=False, name=None):
     Returns
     -------
     b : bool
-        True if the inconsistency matrix is valid.
+        True if the inconsistency matrix is valid; False otherwise.
+
+    Notes
+    -----
+    *Array API support (experimental):* If the input is a lazy Array (e.g. Dask
+    or JAX), the return value may be a 0-dimensional bool Array. When warning=True
+    or throw=True, calling this function materializes the array.
 
     See Also
     --------
@@ -2104,10 +2124,17 @@ def is_valid_im(R, warning=False, throw=False, name=None):
     >>> is_valid_im(R)
     False
 
+    """
+    return _is_valid_im(R, warning=warning, throw=throw, name=name, materialize=True)
+
+
+def _is_valid_im(R, warning=False, throw=False, name=None, materialize=False):
+    """Variant of `is_valid_im` to be called internally by other scipy functions,
+    which by default does not materialize lazy input arrays (Dask, JAX, etc.) when
+    warning=True or throw=True.
     """
     xp = array_namespace(R)
-    R = _asarray(R, order='c', xp=xp)
-    valid = True
+    R = _asarray(R, xp=xp)
     name_str = f"{name!r} " if name else ''
     try:
         if R.dtype != xp.float64:
@@ -2122,23 +2149,23 @@ def is_valid_im(R, warning=False, throw=False, name=None):
         if R.shape[0] < 1:
             raise ValueError(f'Inconsistency matrix {name_str}'
                              'must have at least one row.')
-        if xp.any(R[:, 0] < 0):
-            raise ValueError(f'Inconsistency matrix {name_str}'
-                             'contains negative link height means.')
-        if xp.any(R[:, 1] < 0):
-            raise ValueError(f'Inconsistency matrix {name_str}'
-                             'contains negative link height standard deviations.')
-        if xp.any(R[:, 2] < 0):
-            raise ValueError(f'Inconsistency matrix {name_str}'
-                             'contains negative link counts.')
-    except Exception as e:
+    except (TypeError, ValueError) as e:
         if throw:
             raise
         if warning:
             _warning(str(e))
-        valid = False
+        return False
 
-    return valid
+    return _lazy_valid_checks(  # name_str already ends with a space, or is ''
+        (xp.any(R[:, 0] < 0),
+         f'Inconsistency matrix {name_str}contains negative link height means.'),
+        (xp.any(R[:, 1] < 0),
+         f'Inconsistency matrix {name_str}contains negative link height standard '
+         'deviations.'),
+        (xp.any(R[:, 2] < 0),
+         f'Inconsistency matrix {name_str}contains negative link counts.'),
+        throw=throw, warning=warning, materialize=materialize, xp=xp
+    )
 
 
 def is_valid_linkage(Z, warning=False, throw=False, name=None):
@@ -2179,7 +2206,13 @@ def is_valid_linkage(Z, warning=False, throw=False, name=None):
     Returns
     -------
     b : bool
-        True if the inconsistency matrix is valid.
+        True if the inconsistency matrix is valid; False otherwise.
+
+    Notes
+    -----
+    *Array API support (experimental):* If the input is a lazy Array (e.g. Dask
+    or JAX), the return value may be a 0-dimensional bool Array. When warning=True
+    or throw=True, calling this function materializes the array.
 
     See Also
     --------
@@ -2225,10 +2258,18 @@ def is_valid_linkage(Z, warning=False, throw=False, name=None):
     >>> is_valid_linkage(Z)
     False
 
+    """
+    return _is_valid_linkage(Z, warning=warning, throw=throw,
+                             name=name, materialize=True)
+
+
+def _is_valid_linkage(Z, warning=False, throw=False, name=None, materialize=False):
+    """Variant of `is_valid_linkage` to be called internally by other scipy functions,
+    which by default does not materialize lazy input arrays (Dask, JAX, etc.) when
+    warning=True or throw=True.
     """
     xp = array_namespace(Z)
-    Z = _asarray(Z, order='c', xp=xp)
-    valid = True
+    Z = _asarray(Z, xp=xp)
     name_str = f"{name!r} " if name else ''
     try:
         if Z.dtype != xp.float64:
@@ -2241,32 +2282,85 @@ def is_valid_linkage(Z, warning=False, throw=False, name=None):
         if Z.shape[0] == 0:
             raise ValueError('Linkage must be computed on at least two '
                              'observations.')
-        n = Z.shape[0]
-        if n > 1:
-            if (xp.any(Z[:, 0] < 0) or xp.any(Z[:, 1] < 0)):
-                raise ValueError(f'Linkage {name_str}contains negative indices.')
-            if xp.any(Z[:, 2] < 0):
-                raise ValueError(f'Linkage {name_str}contains negative distances.')
-            if xp.any(Z[:, 3] < 0):
-                raise ValueError(f'Linkage {name_str}contains negative counts.')
-            if xp.any(Z[:, 3] > (Z.shape[0] + 1)):
-                raise ValueError('Linkage matrix contains excessive observations'
-                                 'in a cluster')
-        if xp.any(
-            xp.max(Z[:, :2], axis=1) >= xp.arange(n + 1, 2 * n + 1, dtype=Z.dtype)
-        ):
-            raise ValueError(f'Linkage {name_str}uses non-singleton cluster before'
-                             ' it is formed.')
-        if xpx.nunique(Z[:, :2]) < n * 2:
-            raise ValueError(f'Linkage {name_str}uses the same cluster more than once.')
-    except Exception as e:
+    except (TypeError, ValueError) as e:
         if throw:
             raise
         if warning:
             _warning(str(e))
-        valid = False
+        return False
+
+    n = Z.shape[0]
+    if n < 2:
+        return True
+
+    return _lazy_valid_checks(
+        (xp.any(Z[:, :2] < 0),
+         f'Linkage {name_str}contains negative indices.'),
+        (xp.any(Z[:, 2] < 0),
+         f'Linkage {name_str}contains negative distances.'),
+        (xp.any(Z[:, 3] < 0),
+         f'Linkage {name_str}contains negative counts.'),
+        (xp.any(Z[:, 3] > n + 1),
+         f'Linkage {name_str}contains excessive observations in a cluster'),
+        (xp.any(xp.max(Z[:, :2], axis=1) >= xp.arange(n + 1, 2 * n + 1, dtype=Z.dtype)),
+         f'Linkage {name_str}uses non-singleton cluster before it is formed.'),
+        (xpx.nunique(Z[:, :2]) < n * 2,
+         f'Linkage {name_str}uses the same cluster more than once.'),
+        throw=throw, warning=warning, materialize=materialize, xp=xp
+    )
+
+
+def _lazy_valid_checks(*args, throw=False, warning=False, materialize=False, xp):
+    """Validate a set of conditions on the contents of possibly lazy arrays.
 
-    return valid
+    Parameters
+    ----------
+    args : tuples of (Array, str)
+        The first element of each tuple must be a 0-dimensional Array
+        that evaluates to bool; the second element must be the message to convey
+        if the first element evaluates to True.
+    throw : bool
+        Set to True to `raise ValueError(args[i][1])` if `args[i][0]` is True.
+    warning : bool
+        Set to True to issue a warning with message `args[i][1]` if `args[i][0]`
+        is True.
+    materialize : bool
+        Set to True to force materialization of lazy arrays when throw=True or
+        warning=True. If the inputs are lazy and materialize=False, ignore the
+        `throw` and `warning` flags.
+    xp : module
+        Array API namespace
+
+    Returns
+    -------
+    If xp is an eager backend (e.g. numpy) and all conditions are False, return True.
+    If throw is True, raise. Otherwise, return False.
+
+    If the checks stay lazy (e.g. Dask or JAX), return a 0-dimensional bool Array.
+    """
+    conds = xp.concat([xp.reshape(cond, (1, )) for cond, _ in args])
+
+    lazy = is_lazy_array(conds)
+    if not throw and not warning or (lazy and not materialize):
+        out = ~xp.any(conds)
+        return out if lazy else bool(out)
+
+    if is_dask(xp):
+        # Only materialize the graph once, instead of once per check
+        conds = conds.compute()
+
+    # Don't call np.asarray(conds), as it would be blocked by the device transfer
+    # guard on CuPy and PyTorch and the densification guard on Sparse, whereas
+    # bool() will not.
+    conds = [bool(cond) for cond in conds]
+
+    for cond, (_, msg) in zip(conds, args):
+        if throw and cond:
+            raise ValueError(msg)
+        elif warning and cond:
+            warnings.warn(msg, ClusterWarning, stacklevel=3)
+
+    return not any(conds)
 
 
 def num_obs_linkage(Z):
@@ -2304,8 +2398,8 @@ def num_obs_linkage(Z):
     """
     xp = array_namespace(Z)
     Z = _asarray(Z, order='c', xp=xp)
-    is_valid_linkage(Z, throw=True, name='Z')
-    return (Z.shape[0] + 1)
+    _is_valid_linkage(Z, throw=True, name='Z')
+    return Z.shape[0] + 1
 
 
 def correspond(Z, Y):
@@ -2357,7 +2451,7 @@ def correspond(Z, Y):
     True
 
     """
-    is_valid_linkage(Z, throw=True)
+    _is_valid_linkage(Z, throw=True)
     distance.is_valid_y(Y, throw=True)
     xp = array_namespace(Z, Y)
     Z = _asarray(Z, order='c', xp=xp)
@@ -2640,11 +2734,9 @@ def fclusterdata(X, t, criterion='inconsistent',
     X = _asarray(X, order='C', dtype=xp.float64, xp=xp)
 
     if X.ndim != 2:
-        raise TypeError('The observation matrix X must be an n by m '
-                        'array.')
+        raise TypeError('The observation matrix X must be an n by m array.')
 
     Y = distance.pdist(X, metric=metric)
-    Y = xp.asarray(Y)
     Z = linkage(Y, method=method)
     if R is None:
         R = inconsistent(Z, d=depth)
@@ -4121,11 +4213,16 @@ def leaders(Z, T):
     >>> M
     array([1, 2, 3, 4], dtype=int32)
 
+    Notes
+    -----
+    *Array API support (experimental):* This function returns arrays
+    with data-dependent shape. In JAX, at the moment of writing this makes it
+    impossible to execute it inside `@jax.jit`.
     """
     xp = array_namespace(Z, T)
     Z = _asarray(Z, order='C', dtype=xp.float64, xp=xp)
     T = _asarray(T, order='C', xp=xp)
-    is_valid_linkage(Z, throw=True, name='Z')
+    _is_valid_linkage(Z, throw=True, name='Z')
 
     if T.dtype != xp.int32:
         raise TypeError('T must be a 1-D array of dtype int32.')
@@ -4133,15 +4230,20 @@ def leaders(Z, T):
     if T.shape[0] != Z.shape[0] + 1:
         raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.')
 
-    n_clusters = int(xpx.nunique(T))
-    n_obs = int(Z.shape[0] + 1)
-    L = np.zeros(n_clusters, dtype=np.int32)
-    M = np.zeros(n_clusters, dtype=np.int32)
-    Z = np.asarray(Z)
-    T = np.asarray(T, dtype=np.int32)
-    s = _hierarchy.leaders(Z, T, L, M, n_clusters, n_obs)
-    if s >= 0:
-        raise ValueError('T is not a valid assignment vector. Error found '
-                          f'when examining linkage node {s} (< 2n-1).')
-    L, M = xp.asarray(L), xp.asarray(M)
-    return (L, M)
+    n_obs = Z.shape[0] + 1
+
+    def leaders_(Z, T, validate):
+        if validate:
+            is_valid_linkage(Z, throw=True, name='Z')
+        n_clusters = int(xpx.nunique(T))
+        L = np.zeros(n_clusters, dtype=np.int32)
+        M = np.zeros(n_clusters, dtype=np.int32)
+        s = _hierarchy.leaders(Z, T, L, M, n_clusters, n_obs)
+        if s >= 0:
+            raise ValueError('T is not a valid assignment vector. Error found '
+                             f'when examining linkage node {s} (< 2n-1).')
+        return L, M
+
+    return xpx.lazy_apply(leaders_, Z, T, validate=is_lazy_array(Z),
+                          shape=((None,), (None, )), dtype=(xp.int32, xp.int32),
+                          as_numpy=True)
diff --git a/scipy/cluster/tests/test_hierarchy.py b/scipy/cluster/tests/test_hierarchy.py
index b4c806b2ac6b..50622ace2f93 100644
--- a/scipy/cluster/tests/test_hierarchy.py
+++ b/scipy/cluster/tests/test_hierarchy.py
@@ -41,15 +41,16 @@
 from scipy.cluster.hierarchy import (
     ClusterWarning, linkage, from_mlab_linkage, to_mlab_linkage,
     num_obs_linkage, inconsistent, cophenet, fclusterdata, fcluster,
-    is_isomorphic, single, leaders,
+    is_isomorphic, single, ward, leaders,
     correspond, is_monotonic, maxdists, maxinconsts, maxRstat,
     is_valid_linkage, is_valid_im, to_tree, leaves_list, dendrogram,
     set_link_color_palette, cut_tree, optimal_leaf_ordering,
-    _order_cluster_tree, _hierarchy, _LINKAGE_METHODS)
+    _order_cluster_tree, _hierarchy, _EUCLIDEAN_METHODS, _LINKAGE_METHODS)
 from scipy.spatial.distance import pdist
 from scipy.cluster._hierarchy import Heap
 from scipy._lib._array_api import xp_assert_close, xp_assert_equal
 import scipy._lib.array_api_extra as xpx
+from scipy._lib.array_api_extra.testing import lazy_xp_function
 
 from threading import Lock
 
@@ -69,24 +70,61 @@
     have_matplotlib = False
 
 skip_xp_backends = pytest.mark.skip_xp_backends
-
-
+xfail_xp_backends = pytest.mark.xfail_xp_backends
+use_linkage = skip_xp_backends(cpu_only=True, exceptions=["jax.numpy"],
+                               reason="linkage() invokes Cython code")
+
+lazy_xp_function(single)
+lazy_xp_function(ward)
+lazy_xp_function(linkage, static_argnames=('method', 'metric', 'optimal_ordering'))
+lazy_xp_function(cut_tree, static_argnames=('n_clusters', 'height'))
+lazy_xp_function(to_tree, jax_jit=False, allow_dask_compute=999,
+                 static_argnames=('rd', ))
+lazy_xp_function(optimal_leaf_ordering, static_argnames=('metric',))
+lazy_xp_function(cophenet, jax_jit=False, allow_dask_compute=2)
+lazy_xp_function(inconsistent, jax_jit=False, allow_dask_compute=2,
+                 static_argnames=('d',))
+lazy_xp_function(from_mlab_linkage, jax_jit=False, allow_dask_compute=2)
+lazy_xp_function(to_mlab_linkage, jax_jit=False, allow_dask_compute=1)
+lazy_xp_function(is_monotonic)
+
+# Note: these functions materialize lazy arrays when warning=True or throw=True
+lazy_xp_function(is_valid_im, static_argnames=("warning", "throw", "name"))
+lazy_xp_function(is_valid_linkage, static_argnames=("warning", "throw", "name"))
+
+lazy_xp_function(num_obs_linkage)
+lazy_xp_function(correspond)
+lazy_xp_function(fcluster, jax_jit=False, allow_dask_compute=999,
+                 static_argnames=('criterion', 'depth'))
+lazy_xp_function(fclusterdata, jax_jit=False, allow_dask_compute=999,
+                 static_argnames=('criterion', 'metric', 'depth', 'method'))
+lazy_xp_function(leaves_list, jax_jit=False, allow_dask_compute=2)
+lazy_xp_function(dendrogram, jax_jit=False, allow_dask_compute=999)
+lazy_xp_function(is_isomorphic, jax_jit=False, allow_dask_compute=2)
+lazy_xp_function(maxdists, jax_jit=False, allow_dask_compute=999)
+lazy_xp_function(maxinconsts, jax_jit=False, allow_dask_compute=999)
+lazy_xp_function(maxRstat, jax_jit=False, allow_dask_compute=999,
+                 static_argnames=('i',))
+
+# Returns data-dependent shape
+lazy_xp_function(leaders, jax_jit=False)
+
+
+@use_linkage
 class TestLinkage:
 
-    @skip_xp_backends(cpu_only=True)
+    @skip_xp_backends("jax.numpy", reason="Can't raise inside jax.pure_callback")
     def test_linkage_non_finite_elements_in_distance_matrix(self, xp):
         # Tests linkage(Y) where Y contains a non-finite element (e.g. NaN or Inf).
         # Exception expected.
         y = xp.asarray([xp.nan] + [0.0]*5)
         assert_raises(ValueError, linkage, y)
 
-    @skip_xp_backends(cpu_only=True)
     def test_linkage_empty_distance_matrix(self, xp):
         # Tests linkage(Y) where Y is a 0x4 linkage matrix. Exception expected.
         y = xp.zeros((0,))
         assert_raises(ValueError, linkage, y)
 
-    @skip_xp_backends(cpu_only=True)
     def test_linkage_tdist(self, xp):
         for method in ['single', 'complete', 'average', 'weighted']:
             self.check_linkage_tdist(method, xp)
@@ -97,7 +135,6 @@ def check_linkage_tdist(self, method, xp):
         expectedZ = getattr(hierarchy_test_data, 'linkage_ytdist_' + method)
         xp_assert_close(Z, xp.asarray(expectedZ), atol=1e-10)
 
-    @skip_xp_backends(cpu_only=True)
     def test_linkage_X(self, xp):
         for method in ['centroid', 'median', 'ward']:
             self.check_linkage_q(method, xp)
@@ -108,12 +145,11 @@ def check_linkage_q(self, method, xp):
         expectedZ = getattr(hierarchy_test_data, 'linkage_X_' + method)
         xp_assert_close(Z, xp.asarray(expectedZ), atol=1e-06)
 
-        y = scipy.spatial.distance.pdist(hierarchy_test_data.X,
-                                         metric="euclidean")
-        Z = linkage(xp.asarray(y), method)
+        X = xp.asarray(hierarchy_test_data.X)
+        y = pdist(X, metric="euclidean")
+        Z = linkage(y, method)
         xp_assert_close(Z, xp.asarray(expectedZ), atol=1e-06)
 
-    @skip_xp_backends(cpu_only=True)
     def test_compare_with_trivial(self, xp):
         rng = np.random.RandomState(0)
         n = 20
@@ -125,14 +161,13 @@ def test_compare_with_trivial(self, xp):
             Z = linkage(xp.asarray(d), method)
             xp_assert_close(Z, xp.asarray(Z_trivial), rtol=1e-14, atol=1e-15)
 
-    @skip_xp_backends(cpu_only=True)
     def test_optimal_leaf_ordering(self, xp):
         Z = linkage(xp.asarray(hierarchy_test_data.ytdist), optimal_ordering=True)
         expectedZ = getattr(hierarchy_test_data, 'linkage_ytdist_single_olo')
         xp_assert_close(Z, xp.asarray(expectedZ), atol=1e-10)
 
 
-@skip_xp_backends(cpu_only=True)
+@use_linkage
 class TestLinkageTies:
 
     _expectations = {
@@ -321,9 +356,8 @@ class TestLeaders:
 
     def test_leaders_single(self, xp):
         # Tests leaders using a flat clustering generated by single linkage.
-        X = hierarchy_test_data.Q_X
+        X = xp.asarray(hierarchy_test_data.Q_X)
         Y = pdist(X)
-        Y = xp.asarray(Y)
         Z = linkage(Y)
         T = fcluster(Z, criterion='maxclust', t=3)
         Lright = (xp.asarray([53, 55, 56]), xp.asarray([2, 3, 1]))
@@ -373,8 +407,8 @@ def test_is_isomorphic_4B(self, xp):
         # (3 flat clusters, different labelings, nonisomorphic)
         a = xp.asarray([1, 2, 3, 3])
         b = xp.asarray([1, 3, 2, 3])
-        assert is_isomorphic(a, b) is False
-        assert is_isomorphic(b, a) is False
+        assert not is_isomorphic(a, b)
+        assert not is_isomorphic(b, a)
 
     def test_is_isomorphic_4C(self, xp):
         # Tests is_isomorphic on test case #4C
@@ -419,20 +453,16 @@ def help_is_isomorphic_randperm(self, nobs, nclusters, noniso=False, nerrors=0,
             assert is_isomorphic(b, a) == (not noniso)
 
 
-@skip_xp_backends(cpu_only=True)
 class TestIsValidLinkage:
 
-    def test_is_valid_linkage_various_size(self, xp):
-        for nrow, ncol, valid in [(2, 5, False), (2, 3, False),
-                                  (1, 4, True), (2, 4, True)]:
-            self.check_is_valid_linkage_various_size(nrow, ncol, valid, xp)
-
-    def check_is_valid_linkage_various_size(self, nrow, ncol, valid, xp):
+    @pytest.mark.parametrize("nrow, ncol, valid", [(2, 5, False), (2, 3, False),
+                                                  (1, 4, True), (2, 4, True)])
+    def test_is_valid_linkage_various_size(self, nrow, ncol, valid, xp):
         # Tests is_valid_linkage(Z) with linkage matrices of various sizes
         Z = xp.asarray([[0, 1, 3.0, 2, 5],
                         [3, 2, 4.0, 3, 3]], dtype=xp.float64)
         Z = Z[:nrow, :ncol]
-        assert_(is_valid_linkage(Z) == valid)
+        xp_assert_equal(is_valid_linkage(Z), valid, check_namespace=False)
         if not valid:
             assert_raises(ValueError, is_valid_linkage, Z, throw=True)
 
@@ -440,15 +470,16 @@ def test_is_valid_linkage_int_type(self, xp):
         # Tests is_valid_linkage(Z) with integer type.
         Z = xp.asarray([[0, 1, 3.0, 2],
                         [3, 2, 4.0, 3]], dtype=xp.int64)
-        assert_(is_valid_linkage(Z) is False)
+        xp_assert_equal(is_valid_linkage(Z), False, check_namespace=False)
         assert_raises(TypeError, is_valid_linkage, Z, throw=True)
 
     def test_is_valid_linkage_empty(self, xp):
         # Tests is_valid_linkage(Z) with empty linkage.
         Z = xp.zeros((0, 4), dtype=xp.float64)
-        assert_(is_valid_linkage(Z) is False)
+        xp_assert_equal(is_valid_linkage(Z), False, check_namespace=False)
         assert_raises(ValueError, is_valid_linkage, Z, throw=True)
 
+    @use_linkage
     def test_is_valid_linkage_4_and_up(self, xp):
         # Tests is_valid_linkage(Z) on linkage on observation sets between
         # sizes 4 and 15 (step size 3).
@@ -456,8 +487,9 @@ def test_is_valid_linkage_4_and_up(self, xp):
             y = np.random.rand(i*(i-1)//2)
             y = xp.asarray(y)
             Z = linkage(y)
-            assert_(is_valid_linkage(Z) is True)
+            xp_assert_equal(is_valid_linkage(Z), True, check_namespace=False)
 
+    @use_linkage
     def test_is_valid_linkage_4_and_up_neg_index_left(self, xp):
         # Tests is_valid_linkage(Z) on linkage on observation sets between
         # sizes 4 and 15 (step size 3) with negative indices (left).
@@ -466,9 +498,13 @@ def test_is_valid_linkage_4_and_up_neg_index_left(self, xp):
             y = xp.asarray(y)
             Z = linkage(y)
             Z = xpx.at(Z)[i//2, 0].set(-2)
-            assert_(is_valid_linkage(Z) is False)
-            assert_raises(ValueError, is_valid_linkage, Z, throw=True)
+            xp_assert_equal(is_valid_linkage(Z), False, check_namespace=False)
+            # Use fully-qualified function name to bypass lazy_xp_function(),
+            # because `is_valid_*` materializes.
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_linkage(Z, throw=True)
 
+    @use_linkage
     def test_is_valid_linkage_4_and_up_neg_index_right(self, xp):
         # Tests is_valid_linkage(Z) on linkage on observation sets between
         # sizes 4 and 15 (step size 3) with negative indices (right).
@@ -477,9 +513,12 @@ def test_is_valid_linkage_4_and_up_neg_index_right(self, xp):
             y = xp.asarray(y)
             Z = linkage(y)
             Z = xpx.at(Z)[i//2, 1].set(-2)
-            assert_(is_valid_linkage(Z) is False)
-            assert_raises(ValueError, is_valid_linkage, Z, throw=True)
+            xp_assert_equal(is_valid_linkage(Z), False, check_namespace=False)
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_linkage(Z, throw=True)
+
 
+    @use_linkage
     def test_is_valid_linkage_4_and_up_neg_dist(self, xp):
         # Tests is_valid_linkage(Z) on linkage on observation sets between
         # sizes 4 and 15 (step size 3) with negative distances.
@@ -488,9 +527,11 @@ def test_is_valid_linkage_4_and_up_neg_dist(self, xp):
             y = xp.asarray(y)
             Z = linkage(y)
             Z = xpx.at(Z)[i//2, 2].set(-0.5)
-            assert_(is_valid_linkage(Z) is False)
-            assert_raises(ValueError, is_valid_linkage, Z, throw=True)
+            xp_assert_equal(is_valid_linkage(Z), False, check_namespace=False)
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_linkage(Z, throw=True)
 
+    @use_linkage
     def test_is_valid_linkage_4_and_up_neg_counts(self, xp):
         # Tests is_valid_linkage(Z) on linkage on observation sets between
         # sizes 4 and 15 (step size 3) with negative counts.
@@ -499,40 +540,38 @@ def test_is_valid_linkage_4_and_up_neg_counts(self, xp):
             y = xp.asarray(y)
             Z = linkage(y)
             Z = xpx.at(Z)[i//2, 3].set(-2)
-            assert_(is_valid_linkage(Z) is False)
-            assert_raises(ValueError, is_valid_linkage, Z, throw=True)
+            xp_assert_equal(is_valid_linkage(Z), False, check_namespace=False)
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_linkage(Z, throw=True)
 
 
-@skip_xp_backends(cpu_only=True)
 class TestIsValidInconsistent:
 
     def test_is_valid_im_int_type(self, xp):
         # Tests is_valid_im(R) with integer type.
         R = xp.asarray([[0, 1, 3.0, 2],
                         [3, 2, 4.0, 3]], dtype=xp.int64)
-        assert_(is_valid_im(R) is False)
+        xp_assert_equal(is_valid_im(R), False, check_namespace=False)
         assert_raises(TypeError, is_valid_im, R, throw=True)
 
-    def test_is_valid_im_various_size(self, xp):
-        for nrow, ncol, valid in [(2, 5, False), (2, 3, False),
-                                  (1, 4, True), (2, 4, True)]:
-            self.check_is_valid_im_various_size(nrow, ncol, valid, xp)
-
-    def check_is_valid_im_various_size(self, nrow, ncol, valid, xp):
+    @pytest.mark.parametrize("nrow, ncol, valid", [(2, 5, False), (2, 3, False),
+                                                  (1, 4, True), (2, 4, True)])
+    def test_is_valid_im_various_size(self, nrow, ncol, valid, xp):
         # Tests is_valid_im(R) with linkage matrices of various sizes
         R = xp.asarray([[0, 1, 3.0, 2, 5],
                         [3, 2, 4.0, 3, 3]], dtype=xp.float64)
         R = R[:nrow, :ncol]
-        assert_(is_valid_im(R) == valid)
+        xp_assert_equal(is_valid_im(R), valid, check_namespace=False)
         if not valid:
             assert_raises(ValueError, is_valid_im, R, throw=True)
 
     def test_is_valid_im_empty(self, xp):
         # Tests is_valid_im(R) with empty inconsistency matrix.
         R = xp.zeros((0, 4), dtype=xp.float64)
-        assert_(is_valid_im(R) is False)
+        xp_assert_equal(is_valid_im(R), False, check_namespace=False)
         assert_raises(ValueError, is_valid_im, R, throw=True)
 
+    @use_linkage
     def test_is_valid_im_4_and_up(self, xp):
         # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
         # (step size 3).
@@ -541,8 +580,9 @@ def test_is_valid_im_4_and_up(self, xp):
             y = xp.asarray(y)
             Z = linkage(y)
             R = inconsistent(Z)
-            assert_(is_valid_im(R) is True)
+            xp_assert_equal(is_valid_im(R), True, check_namespace=False)
 
+    @use_linkage
     def test_is_valid_im_4_and_up_neg_index_left(self, xp):
         # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
         # (step size 3) with negative link height means.
@@ -552,9 +592,13 @@ def test_is_valid_im_4_and_up_neg_index_left(self, xp):
             Z = linkage(y)
             R = inconsistent(Z)
             R = xpx.at(R)[i//2 , 0].set(-2.0)
-            assert_(is_valid_im(R) is False)
-            assert_raises(ValueError, is_valid_im, R, throw=True)
+            xp_assert_equal(is_valid_im(R), False, check_namespace=False)
+            # Use fully-qualified function name to bypass lazy_xp_function(),
+            # because `is_valid_*` materializes.
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_im(R, throw=True)
 
+    @use_linkage
     def test_is_valid_im_4_and_up_neg_index_right(self, xp):
         # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
         # (step size 3) with negative link height standard deviations.
@@ -564,9 +608,11 @@ def test_is_valid_im_4_and_up_neg_index_right(self, xp):
             Z = linkage(y)
             R = inconsistent(Z)
             R = xpx.at(R)[i//2 , 1].set(-2.0)
-            assert_(is_valid_im(R) is False)
-            assert_raises(ValueError, is_valid_im, R, throw=True)
+            xp_assert_equal(is_valid_im(R), False, check_namespace=False)
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_im(R, throw=True)
 
+    @use_linkage
     def test_is_valid_im_4_and_up_neg_dist(self, xp):
         # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
         # (step size 3) with negative link counts.
@@ -576,13 +622,13 @@ def test_is_valid_im_4_and_up_neg_dist(self, xp):
             Z = linkage(y)
             R = inconsistent(Z)
             R = xpx.at(R)[i//2, 2].set(-0.5)
-            assert_(is_valid_im(R) is False)
-            assert_raises(ValueError, is_valid_im, R, throw=True)
+            xp_assert_equal(is_valid_im(R), False, check_namespace=False)
+            with pytest.raises(ValueError):
+                scipy.cluster.hierarchy.is_valid_im(R, throw=True)
 
 
 class TestNumObsLinkage:
 
-    @skip_xp_backends(cpu_only=True)
     def test_num_obs_linkage_empty(self, xp):
         # Tests num_obs_linkage(Z) with empty linkage.
         Z = xp.zeros((0, 4), dtype=xp.float64)
@@ -591,15 +637,15 @@ def test_num_obs_linkage_empty(self, xp):
     def test_num_obs_linkage_1x4(self, xp):
         # Tests num_obs_linkage(Z) on linkage over 2 observations.
         Z = xp.asarray([[0, 1, 3.0, 2]], dtype=xp.float64)
-        assert_equal(num_obs_linkage(Z), 2)
+        assert num_obs_linkage(Z) == 2
 
     def test_num_obs_linkage_2x4(self, xp):
         # Tests num_obs_linkage(Z) on linkage over 3 observations.
         Z = xp.asarray([[0, 1, 3.0, 2],
                         [3, 2, 4.0, 3]], dtype=xp.float64)
-        assert_equal(num_obs_linkage(Z), 3)
+        assert num_obs_linkage(Z) == 3
 
-    @skip_xp_backends(cpu_only=True)
+    @use_linkage
     def test_num_obs_linkage_4_and_up(self, xp):
         # Tests num_obs_linkage(Z) on linkage on observation sets between sizes
         # 4 and 15 (step size 3).
@@ -607,7 +653,16 @@ def test_num_obs_linkage_4_and_up(self, xp):
             y = np.random.rand(i*(i-1)//2)
             y = xp.asarray(y)
             Z = linkage(y)
-            assert_equal(num_obs_linkage(Z), i)
+            assert num_obs_linkage(Z) == i
+
+    @use_linkage
+    def test_num_obs_linkage_multi_matrix(self, xp):
+        # Tests num_obs_linkage with observation matrices of multiple sizes.
+        for n in range(2, 10):
+            X = xp.asarray(np.random.rand(n, 4))
+            Y = pdist(X)
+            Z = linkage(Y)
+            assert num_obs_linkage(Z) == n
 
 
 @skip_xp_backends(cpu_only=True)
@@ -648,7 +703,6 @@ def test_Q_subtree_pre_order(self, xp):
                         rtol=1e-15)
 
 
-@skip_xp_backends(cpu_only=True)
 class TestCorrespond:
 
     def test_correspond_empty(self, xp):
@@ -657,6 +711,7 @@ def test_correspond_empty(self, xp):
         Z = xp.zeros((0,4), dtype=xp.float64)
         assert_raises(ValueError, correspond, Z, y)
 
+    @use_linkage
     def test_correspond_2_and_up(self, xp):
         # Tests correspond(Z, y) on linkage and CDMs over observation sets of
         # different sizes.
@@ -671,6 +726,7 @@ def test_correspond_2_and_up(self, xp):
             Z = linkage(y)
             assert_(correspond(Z, y))
 
+    @use_linkage
     def test_correspond_4_and_up(self, xp):
         # Tests correspond(Z, y) on linkage and CDMs over observation sets of
         # different sizes. Correspondence should be false.
@@ -685,6 +741,7 @@ def test_correspond_4_and_up(self, xp):
             assert not correspond(Z, y2)
             assert not correspond(Z2, y)
 
+    @use_linkage
     def test_correspond_4_and_up_2(self, xp):
         # Tests correspond(Z, y) on linkage and CDMs over observation sets of
         # different sizes. Correspondence should be false.
@@ -699,17 +756,7 @@ def test_correspond_4_and_up_2(self, xp):
             assert not correspond(Z, y2)
             assert not correspond(Z2, y)
 
-    def test_num_obs_linkage_multi_matrix(self, xp):
-        # Tests num_obs_linkage with observation matrices of multiple sizes.
-        for n in range(2, 10):
-            X = np.random.rand(n, 4)
-            Y = pdist(X)
-            Y = xp.asarray(Y)
-            Z = linkage(Y)
-            assert_equal(num_obs_linkage(Z), n)
 
-
-@skip_xp_backends(cpu_only=True)
 class TestIsMonotonic:
 
     def test_is_monotonic_empty(self, xp):
@@ -762,12 +809,14 @@ def test_is_monotonic_3x4_F3(self, xp):
                         [4, 5, 0.2, 4]], dtype=xp.float64)
         assert not is_monotonic(Z)
 
+    @use_linkage
     def test_is_monotonic_tdist_linkage1(self, xp):
         # Tests is_monotonic(Z) on clustering generated by single linkage on
         # tdist data set. Expecting True.
         Z = linkage(xp.asarray(hierarchy_test_data.ytdist), 'single')
         assert is_monotonic(Z)
 
+    @use_linkage
     def test_is_monotonic_tdist_linkage2(self, xp):
         # Tests is_monotonic(Z) on clustering generated by single linkage on
         # tdist data set. Perturbing. Expecting False.
@@ -775,6 +824,7 @@ def test_is_monotonic_tdist_linkage2(self, xp):
         Z = xpx.at(Z)[2, 2].set(0.0)
         assert not is_monotonic(Z)
 
+    @use_linkage
     def test_is_monotonic_Q_linkage(self, xp):
         # Tests is_monotonic(Z) on clustering generated by single linkage on
         # Q data set. Expecting True.
@@ -1166,18 +1216,19 @@ def calculate_maximum_inconsistencies(Z, R, k=3, xp=np):
 
 
 @pytest.mark.thread_unsafe
-@skip_xp_backends(cpu_only=True)
+@use_linkage
+@skip_xp_backends(eager_only=True)
 def test_unsupported_uncondensed_distance_matrix_linkage_warning(xp):
     assert_warns(ClusterWarning, linkage, xp.asarray([[0, 1], [1, 0]]))
 
 
 def test_euclidean_linkage_value_error(xp):
-    for method in scipy.cluster.hierarchy._EUCLIDEAN_METHODS:
+    for method in _EUCLIDEAN_METHODS:
         assert_raises(ValueError, linkage, xp.asarray([[1, 1], [1, 1]]),
                       method=method, metric='cityblock')
 
 
-@skip_xp_backends(cpu_only=True)
+@use_linkage
 def test_2x2_linkage(xp):
     Z1 = linkage(xp.asarray([1]), method='single', metric='euclidean')
     Z2 = linkage(xp.asarray([[0, 1], [0, 0]]), method='single', metric='euclidean')
@@ -1190,7 +1241,7 @@ def test_node_compare(xp):
     nobs = 50
     X = np.random.randn(nobs, 4)
     X = xp.asarray(X)
-    Z = scipy.cluster.hierarchy.ward(X)
+    Z = ward(X)
     tree = to_tree(Z)
     assert_(tree > tree.get_left())
     assert_(tree.get_right() > tree.get_left())
@@ -1204,7 +1255,7 @@ def test_cut_tree(xp):
     nobs = 50
     X = np.random.randn(nobs, 4)
     X = xp.asarray(X)
-    Z = scipy.cluster.hierarchy.ward(X)
+    Z = ward(X)
     cutree = cut_tree(Z)
 
     # cutree.dtype varies between int32 and int64 over platforms
@@ -1275,7 +1326,8 @@ def test_Heap(xp):
     assert_equal(pair['value'], 10)
 
 
-@skip_xp_backends(cpu_only=True)
+@use_linkage
+@skip_xp_backends("jax.numpy", reason="Can't raise inside jax.pure_callback")
 def test_centroid_neg_distance(xp):
     # gh-21011
     values = xp.asarray([0, 0, -1])
diff --git a/scipy/conftest.py b/scipy/conftest.py
index 247a69d8a716..b3fc5a96d8a5 100644
--- a/scipy/conftest.py
+++ b/scipy/conftest.py
@@ -13,7 +13,8 @@
 
 from scipy._lib._fpumode import get_fpu_mode
 from scipy._lib._array_api import (
-    SCIPY_ARRAY_API, SCIPY_DEVICE, array_namespace, default_xp
+    SCIPY_ARRAY_API, SCIPY_DEVICE, array_namespace, default_xp,
+    is_cupy, is_dask, is_jax,
 )
 from scipy._lib._testutils import FPUModeChangeWarning
 from scipy._lib.array_api_extra.testing import patch_lazy_xp_functions
@@ -154,7 +155,6 @@ def num_parallel_threads():
     try:
         import torch  # type: ignore[import-not-found]
         xp_available_backends.update({'torch': torch})
-        # can use `mps` or `cpu`
         torch.set_default_device(SCIPY_DEVICE)
         if SCIPY_DEVICE != "cpu":
             xp_skip_cpu_only_backends.add('torch')
@@ -409,6 +409,41 @@ def skip_or_xfail_xp_backends(request: pytest.FixtureRequest,
         skip_or_xfail(reason=reason)
 
 
+@pytest.fixture
+def devices(xp):
+    """Fixture that returns a list of all devices for the backend, plus None.
+    Used to test input->output device propagation.
+
+    Usage
+    -----
+    from scipy._lib._array_api import xp_device
+
+    def test_device(xp, devices):
+        for d in devices:
+            x = xp.asarray(..., device=d)
+            y = f(x)
+            assert xp_device(y) == xp_device(x)
+    """
+    if is_cupy(xp):
+        # CuPy does not support devices other than the current one
+        # data-apis/array-api-compat#293
+        pytest.xfail(reason="data-apis/array-api-compat#293")
+    if is_dask(xp):
+        # Skip dummy DASK_DEVICE from array-api-compat, which does not propagate
+        return ["cpu", None]
+    if is_jax(xp):
+        # The .device attribute is not accessible inside jax.jit; the consequence
+        # (downstream of array-api-compat hacks) is that a non-default device in
+        # input is not guaranteed to propagate to the output even if the scipy code
+        # states `device=xp_device(arg)` in all array creation functions.
+        # While this issue is specific to jax.jit, it would be unnecessarily
+        # verbose to skip the test for each jit-capable function and run it for
+        # those that only support eager mode.
+        pytest.xfail(reason="jax-ml/jax#26000")
+
+    return xp.__array_namespace_info__().devices() + [None]
+
+
 # Following the approach of NumPy's conftest.py...
 # Use a known and persistent tmpdir for hypothesis' caches, which
 # can be automatically cleared by the OS or user.
diff --git a/scipy/differentiate/_differentiate.py b/scipy/differentiate/_differentiate.py
index 010ab1a0f172..8dbf3f755541 100644
--- a/scipy/differentiate/_differentiate.py
+++ b/scipy/differentiate/_differentiate.py
@@ -3,7 +3,7 @@
 import numpy as np
 import scipy._lib._elementwise_iterative_method as eim
 from scipy._lib._util import _RichResult
-from scipy._lib._array_api import array_namespace, xp_copy
+from scipy._lib._array_api import array_namespace, xp_copy, xp_promote
 
 _EERRORINCREASE = -1  # used in derivative
 
@@ -906,9 +906,7 @@ def f(x):
 
     """
     xp = array_namespace(x)
-    x = xp.asarray(x)
-    int_dtype = xp.isdtype(x.dtype, 'integral')
-    x0 = xp.asarray(x, dtype=xp.asarray(1.0).dtype) if int_dtype else x
+    x0 = xp_promote(x, force_floating=True, xp=xp)
 
     if x0.ndim < 1:
         message = "Argument `x` must be at least 1-D."
@@ -1097,9 +1095,9 @@ def hessian(f, x, *, tolerances=None, maxiter=10,
     rtol = tolerances.get('rtol', None)
 
     xp = array_namespace(x)
-    x = xp.asarray(x)
-    dtype = x.dtype if not xp.isdtype(x.dtype, 'integral') else xp.asarray(1.).dtype
-    finfo = xp.finfo(dtype)
+    x0 = xp_promote(x, force_floating=True, xp=xp)
+
+    finfo = xp.finfo(x0.dtype)
     rtol = finfo.eps**0.5 if rtol is None else rtol  # keep same as `derivative`
 
     # tighten the inner tolerance to make the inner error negligible
diff --git a/scipy/fft/_basic.py b/scipy/fft/_basic.py
index a3fc021c9ef9..8c07ee697b3c 100644
--- a/scipy/fft/_basic.py
+++ b/scipy/fft/_basic.py
@@ -1166,6 +1166,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm=None, overwrite_x=False, workers=None,
     This is really just `rfftn` with different default behavior.
     For more details see `rfftn`.
 
+    Examples
+    --------
+    >>> import scipy.fft
+    >>> import numpy as np
+    >>> x = np.broadcast_to([1, 0, -1, 0], (4, 4))
+    >>> scipy.fft.rfft2(x)
+    array([[0.+0.j, 8.+0.j, 0.+0.j],
+           [0.+0.j, 0.+0.j, 0.+0.j],
+           [0.+0.j, 0.+0.j, 0.+0.j],
+           [0.+0.j, 0.+0.j, 0.+0.j]])
+
     """
     return (Dispatchable(x, np.ndarray),)
 
@@ -1484,6 +1495,15 @@ def hfft2(x, s=None, axes=(-2, -1), norm=None, overwrite_x=False, workers=None,
     This is really just `hfftn` with different default behavior.
     For more details see `hfftn`.
 
+    Examples
+    --------
+    >>> import scipy.fft
+    >>> import numpy as np
+    >>> x = np.array([[1+0j, 2+0j], [2+0j, 1+0j]])  # Hermitian-symmetric input
+    >>> scipy.fft.hfft2(x, s=(2, 2))
+    array([[ 6.,  0.],
+           [ 0., -2.]])
+
     """
     return (Dispatchable(x, np.ndarray),)
 
diff --git a/scipy/fft/_realtransforms.py b/scipy/fft/_realtransforms.py
index 1c7a3d683dd7..b7324f5d9a81 100644
--- a/scipy/fft/_realtransforms.py
+++ b/scipy/fft/_realtransforms.py
@@ -621,6 +621,19 @@ def dst(x, type=2, n=None, axis=-1, norm=None, overwrite_x=False, workers=None,
     The (unnormalized) DST-IV is its own inverse, up to a factor :math:`2N`. The
     orthonormalized DST-IV is exactly its own inverse.
 
+    Examples
+    --------
+    Compute the DST of a simple 1D array:
+
+    >>> import numpy as np
+    >>> from scipy.fft import dst
+    >>> x = np.array([1, -1, 1, -1])
+    >>> dst(x, type=2)
+    array([0., 0., 0., 8.])
+
+    This computes the Discrete Sine Transform (DST) of type-II for the input array. The
+    output contains the transformed values corresponding to the given input sequence.
+
     References
     ----------
     .. [1] Wikipedia, "Discrete sine transform",
diff --git a/scipy/fft/tests/test_helper.py b/scipy/fft/tests/test_helper.py
index 6a5fa58492fa..5492ce844d5d 100644
--- a/scipy/fft/tests/test_helper.py
+++ b/scipy/fft/tests/test_helper.py
@@ -10,9 +10,7 @@
 import pytest
 import numpy as np
 import sys
-from scipy._lib._array_api import (
-    xp_assert_close, get_xp_devices, xp_device
-)
+from scipy._lib._array_api import xp_assert_close, xp_device
 from scipy import fft
 
 skip_xp_backends = pytest.mark.skip_xp_backends
@@ -507,12 +505,7 @@ def test_uneven_dims(self, xp):
         xp_assert_close(fft.ifftshift(shift_dim_both), freqs)
 
 
-@skip_xp_backends("cupy",
-                  reason="CuPy has not implemented the `device` param")
-@skip_xp_backends("jax.numpy",
-                  reason="JAX has not implemented the `device` param")
 class TestFFTFreq:
-
     def test_definition(self, xp):
         x = xp.asarray([0, 1, 2, 3, 4, -4, -3, -2, -1], dtype=xp.float64)
         x2 = xp.asarray([0, 1, 2, 3, 4, -5, -4, -3, -2, -1], dtype=xp.float64)
@@ -531,18 +524,13 @@ def test_definition(self, xp):
         y = 10 * xp.pi * fft.fftfreq(10, xp.pi, xp=xp)
         xp_assert_close(y, x2, check_dtype=False)
 
-    def test_device(self, xp):
-        devices = get_xp_devices(xp)
+    def test_device(self, xp, devices):
         for d in devices:
             y = fft.fftfreq(9, xp=xp, device=d)
             x = xp.empty(0, device=d)
             assert xp_device(y) == xp_device(x)
 
 
-@skip_xp_backends("cupy",
-                  reason="CuPy has not implemented the `device` param")
-@skip_xp_backends("jax.numpy",
-                  reason="JAX has not implemented the `device` param")
 class TestRFFTFreq:
 
     def test_definition(self, xp):
@@ -563,8 +551,7 @@ def test_definition(self, xp):
         y = 10 * xp.pi * fft.rfftfreq(10, xp.pi, xp=xp)
         xp_assert_close(y, x2, check_dtype=False)
 
-    def test_device(self, xp):
-        devices = get_xp_devices(xp)
+    def test_device(self, xp, devices):
         for d in devices:
             y = fft.rfftfreq(9, xp=xp, device=d)
             x = xp.empty(0, device=d)
diff --git a/scipy/integrate/_cubature.py b/scipy/integrate/_cubature.py
index 3e6d8911d13e..ad806d6c2373 100644
--- a/scipy/integrate/_cubature.py
+++ b/scipy/integrate/_cubature.py
@@ -10,7 +10,7 @@
     array_namespace,
     xp_size,
     xp_copy,
-    xp_broadcast_promote
+    xp_promote
 )
 from scipy._lib._util import MapWrapper
 
@@ -323,7 +323,8 @@ def cubature(f, a, b, *, rule="gk21", rtol=1e-8, atol=0, max_subdivisions=10000,
 
     # Convert a and b to arrays and convert each point in points to an array, promoting
     # each to a common floating dtype.
-    a, b, *points = xp_broadcast_promote(a, b, *points, force_floating=True)
+    a, b, *points = xp_promote(a, b, *points, broadcast=True, force_floating=True,
+                               xp=xp)
     result_dtype = a.dtype
 
     if xp_size(a) == 0 or xp_size(b) == 0:
diff --git a/scipy/integrate/_quadrature.py b/scipy/integrate/_quadrature.py
index 4f4d508216e3..86d70feed7a4 100644
--- a/scipy/integrate/_quadrature.py
+++ b/scipy/integrate/_quadrature.py
@@ -8,7 +8,7 @@
 from scipy.special import roots_legendre
 from scipy.special import gammaln, logsumexp
 from scipy._lib._util import _rng_spawn
-from scipy._lib._array_api import _asarray, array_namespace, xp_broadcast_promote
+from scipy._lib._array_api import _asarray, array_namespace, xp_result_type
 
 
 __all__ = ['fixed_quad', 'romb',
@@ -124,7 +124,7 @@ def trapezoid(y, x=None, dx=1.0, axis=-1):
     # Cannot just use the broadcasted arrays that are returned
     # because trapezoid does not follow normal broadcasting rules
     # cf. https://github.com/scipy/scipy/pull/21524#issuecomment-2354105942
-    result_dtype = xp_broadcast_promote(y, force_floating=True, xp=xp)[0].dtype
+    result_dtype = xp_result_type(y, force_floating=True, xp=xp)
     nd = y.ndim
     slice1 = [slice(None)]*nd
     slice2 = [slice(None)]*nd
diff --git a/scipy/integrate/_tanhsinh.py b/scipy/integrate/_tanhsinh.py
index b577511a24f6..f973bcd2c1a7 100644
--- a/scipy/integrate/_tanhsinh.py
+++ b/scipy/integrate/_tanhsinh.py
@@ -4,7 +4,8 @@
 from scipy import special
 import scipy._lib._elementwise_iterative_method as eim
 from scipy._lib._util import _RichResult
-from scipy._lib._array_api import array_namespace, xp_copy, xp_ravel
+from scipy._lib._array_api import (array_namespace, xp_copy, xp_ravel,
+                                   xp_promote)
 
 
 __all__ = ['nsum']
@@ -97,8 +98,9 @@ def tanhsinh(f, a, b, *, args=(), log=False, maxlevel=None, minlevel=2,
         Absolute termination tolerance (default: 0) and relative termination
         tolerance (default: ``eps**0.75``, where ``eps`` is the precision of
         the result dtype), respectively.  Iteration will stop when
-        ``res.error < atol + rtol * abs(res.df)``. The error estimate is as
-        described in [1]_ Section 5. While not theoretically rigorous or
+        ``res.error < atol`` or ``res.error < res.integral * rtol``. The error
+        estimate is as described in [1]_ Section 5 but with a lower bound of
+        ``eps * res.integral``. While not theoretically rigorous or
         conservative, it is said to work well in practice. Must be non-negative
         and finite if `log` is False, and must be expressed as the log of a
         non-negative and finite number if `log` is True.
@@ -443,9 +445,9 @@ def check_termination(work):
             stop[i] = True
         else:
             # Terminate if convergence criterion is met
-            work.rerr, work.aerr = _estimate_error(work, xp)
-            i = ((work.rerr < rtol) | (work.rerr + xp.real(work.Sn) < atol) if log
-                 else (work.rerr < rtol) | (work.rerr * xp.abs(work.Sn) < atol))
+            rerr, aerr = _estimate_error(work, xp)
+            i = (rerr < rtol) | (aerr < atol)
+            work.aerr =  xp.reshape(xp.astype(aerr, work.dtype), work.Sn.shape)
             work.status[i] = eim._ECONVERGED
             stop[i] = True
 
@@ -767,22 +769,23 @@ def _estimate_error(work, xp):
         d2 = xp.real(special.logsumexp(xp.stack([work.Sn, Snm2 + work.pi*1j]), axis=0))
         d3 = log_e1 + xp.max(xp.real(work.fjwj), axis=-1)
         d4 = work.d4
-        ds = xp.stack([d1 ** 2 / d2, 2 * d1, d3, d4])
+        d5 = log_e1 + xp.real(work.Sn)
+        temp = xp.where(d1 > -xp.inf, d1 ** 2 / d2, -xp.inf)
+        ds = xp.stack([temp, 2 * d1, d3, d4, d5])
         aerr = xp.max(ds, axis=0)
-        rerr = xp.maximum(log_e1, aerr - xp.real(work.Sn))
+        rerr = aerr - xp.real(work.Sn)
     else:
         # Note: explicit computation of log10 of each of these is unnecessary.
         d1 = xp.abs(work.Sn - Snm1)
         d2 = xp.abs(work.Sn - Snm2)
         d3 = e1 * xp.max(xp.abs(work.fjwj), axis=-1)
         d4 = work.d4
-        # If `d1` is 0, no need to warn. This does the right thing.
-        # with np.errstate(divide='ignore'):
-        ds = xp.stack([d1**(xp.log(d1)/xp.log(d2)), d1**2, d3, d4])
+        d5 = e1 * xp.abs(work.Sn)
+        temp = xp.where(d1 > 0, d1**(xp.log(d1)/xp.log(d2)), 0)
+        ds = xp.stack([temp, d1**2, d3, d4, d5])
         aerr = xp.max(ds, axis=0)
-        rerr = xp.maximum(e1, aerr/xp.abs(work.Sn))
+        rerr = aerr/xp.abs(work.Sn)
 
-    aerr = xp.reshape(xp.astype(aerr, work.dtype), work.Sn.shape)
     return rerr, aerr
 
 
@@ -818,14 +821,13 @@ def _tanhsinh_iv(f, a, b, log, maxfun, maxlevel, minlevel,
     # Input validation and standardization
 
     xp = array_namespace(a, b)
+    a, b = xp_promote(a, b, broadcast=True, force_floating=True, xp=xp)
 
     message = '`f` must be callable.'
     if not callable(f):
         raise ValueError(message)
 
     message = 'All elements of `a` and `b` must be real numbers.'
-    a, b = xp.asarray(a), xp.asarray(b)
-    a, b = xp.broadcast_arrays(a, b)
     if (xp.isdtype(a.dtype, 'complex floating')
             or xp.isdtype(b.dtype, 'complex floating')):
         raise ValueError(message)
@@ -894,16 +896,15 @@ def _tanhsinh_iv(f, a, b, log, maxfun, maxlevel, minlevel,
 def _nsum_iv(f, a, b, step, args, log, maxterms, tolerances):
     # Input validation and standardization
 
-    xp = array_namespace(a, b)
+    xp = array_namespace(a, b, step)
+    a, b, step = xp_promote(a, b, step, broadcast=True, force_floating=True, xp=xp)
 
     message = '`f` must be callable.'
     if not callable(f):
         raise ValueError(message)
 
     message = 'All elements of `a`, `b`, and `step` must be real numbers.'
-    a, b, step = xp.broadcast_arrays(xp.asarray(a), xp.asarray(b), xp.asarray(step))
-    dtype = xp.result_type(a.dtype, b.dtype, step.dtype)
-    if not xp.isdtype(dtype, 'numeric') or xp.isdtype(dtype, 'complex floating'):
+    if not xp.isdtype(a.dtype, ('integral', 'real floating')):
         raise ValueError(message)
 
     valid_b = b >= a  # NaNs will be False
diff --git a/scipy/integrate/lsoda.pyf b/scipy/integrate/lsoda.pyf
index d09e32a59184..20cc688320f5 100644
--- a/scipy/integrate/lsoda.pyf
+++ b/scipy/integrate/lsoda.pyf
@@ -23,11 +23,20 @@ python module lsoda__user__routines
 end python module lsoda__user__routines
 
 python module _lsoda
+  usercode '''
+
+#ifdef HAVE_BLAS_ILP64
+typedef npy_int64 F_INT;
+#else
+typedef int F_INT;
+#endif
+'''
+
     interface
        subroutine lsoda(f,neq,y,t,tout,itol,rtol,atol,itask,istate,iopt,rwork,lrw,iwork,liw,jac,jt)
          ! y1,t,istate = lsoda(f,jac,y0,t0,t1,rtol,atol,itask,istate,rwork,iwork,mf)
          callstatement (*f2py_func)(cb_f_in_lsoda__user__routines,&neq,y,&t,&tout,&itol,rtol,atol,&itask,&istate,&iopt,rwork,&lrw,iwork,&liw,cb_jac_in_lsoda__user__routines,&jt)
-         callprotoargument void*,int*,double*,double*,double*,int*,double*,double*,int*,int*,int*,double*,int*,int*,int*,void*,int*
+         callprotoargument void*,F_INT*,double*,double*,double*,F_INT*,double*,double*,F_INT*,F_INT*,F_INT*,double*,F_INT*,F_INT*,F_INT*,void*,F_INT*
          use lsoda__user__routines
          external f
          external jac
diff --git a/scipy/integrate/tests/test_tanhsinh.py b/scipy/integrate/tests/test_tanhsinh.py
index 9eeae2519619..aa847a815dd4 100644
--- a/scipy/integrate/tests/test_tanhsinh.py
+++ b/scipy/integrate/tests/test_tanhsinh.py
@@ -745,6 +745,16 @@ def test_compress_nodes_weights_gh21496(self, xp):
         x[-1] = 1000
         _tanhsinh(np.sin, 1, x)
 
+    def test_gh_22681_finite_error(self, xp):
+        # gh-22681 noted a case in which the error was NaN on some platforms;
+        # check that this does in fact fail in CI.
+        a = complex(12, -10)
+        b = complex(12, 39)
+        def f(t):
+            return xp.sin(a * (1 - t) + b * t)
+        res = _tanhsinh(f, xp.asarray(0.), xp.asarray(1.), atol=0, rtol=0, maxlevel=10)
+        assert xp.isfinite(res.error)
+
 
 @pytest.mark.skip_xp_backends('torch', reason='data-apis/array-api-compat#271')
 @pytest.mark.skip_xp_backends('array_api_strict', reason='No fancy indexing.')
diff --git a/scipy/integrate/vode.pyf b/scipy/integrate/vode.pyf
index 90774653486f..94181009d268 100644
--- a/scipy/integrate/vode.pyf
+++ b/scipy/integrate/vode.pyf
@@ -50,11 +50,18 @@ python module zvode__user__routines
 end python module zvode__user__routines
 
 python module _vode
+  usercode '''
+#ifdef HAVE_BLAS_ILP64
+typedef npy_int64 F_INT;
+#else
+typedef int F_INT;
+#endif
+'''
     interface
        subroutine dvode(f,jac,neq,y,t,tout,itol,rtol,atol,itask,istate,iopt,rwork,lrw,iwork,liw,mf,rpar,ipar)
          ! y1,t,istate = dvode(f,jac,y0,t0,t1,rtol,atol,itask,istate,rwork,iwork,mf)
          callstatement (*f2py_func)(cb_f_in_dvode__user__routines,&neq,y,&t,&tout,&itol,rtol,atol,&itask,&istate,&iopt,rwork,&lrw,iwork,&liw,cb_jac_in_dvode__user__routines,&mf,&rpar,&ipar)
-         callprotoargument void*,int*,double*,double*,double*,int*,double*,double*,int*,int*,int*,double*,int*,int*,int*,void*,int*,double*,int*
+         callprotoargument void*,F_INT*,double*,double*,double*,F_INT*,double*,double*,F_INT*,F_INT*,F_INT*,double*,F_INT*,F_INT*,F_INT*,void*,F_INT*,double*,F_INT*
          use dvode__user__routines
          external f
          external jac
@@ -85,7 +92,7 @@ python module _vode
        subroutine zvode(f,jac,neq,y,t,tout,itol,rtol,atol,itask,istate,iopt,zwork,lzw,rwork,lrw,iwork,liw,mf,rpar,ipar)
          ! y1,t,istate = zvode(f,jac,y0,t0,t1,rtol,atol,itask,istate,rwork,iwork,mf)
          callstatement (*f2py_func)(cb_f_in_zvode__user__routines,&neq,y,&t,&tout,&itol,rtol,atol,&itask,&istate,&iopt,zwork,&lzw,rwork,&lrw,iwork,&liw,cb_jac_in_zvode__user__routines,&mf,&rpar,&ipar)
-         callprotoargument void*,int*,complex_double*,double*,double*,int*,double*,double*,int*,int*,int*,complex_double*,int*,double*,int*,int*,int*,void*,int*,double*,int*
+         callprotoargument void*,F_INT*,complex_double*,double*,double*,F_INT*,double*,double*,F_INT*,F_INT*,F_INT*,complex_double*,F_INT*,double*,F_INT*,F_INT*,F_INT*,void*,F_INT*,double*,F_INT*
          use zvode__user__routines
          external f
          external jac
diff --git a/scipy/interpolate/_bspl.pyx b/scipy/interpolate/_bspl.pyx
deleted file mode 100644
index a47590b8541a..000000000000
--- a/scipy/interpolate/_bspl.pyx
+++ /dev/null
@@ -1,376 +0,0 @@
-"""
-Routines for evaluating and manipulating B-splines.
-
-"""
-
-import numpy as np
-cimport numpy as cnp
-
-from numpy cimport npy_intp, npy_int64, npy_int32
-
-cimport cython
-from libc.math cimport NAN
-
-cnp.import_array()
-
-cdef extern from "src/__fitpack.h" namespace "fitpack":
-    void _deBoor_D(const double *t, double x, int k, int ell, int m, double *result
-    ) noexcept nogil
-    npy_int64 _find_interval(const double* tptr, npy_int64 len_t,
-                           int k,
-                           double xval,
-                           npy_int64 prev_l,
-                           int extrapolate
-    ) noexcept nogil
-
-
-#------------------------------------------------------------------------------
-# B-splines
-#------------------------------------------------------------------------------
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.nonecheck(False)
-def evaluate_ndbspline(const double[:, ::1] xi,
-                       const double[:, ::1] t,
-                       const npy_int32[::1] len_t,
-                       const npy_int32[::1] k,
-                       int[::1] nu,
-                       bint extrapolate,
-                       const double[::1] c1r,
-                       npy_intp num_c_tr,
-                       const npy_intp[::1] strides_c1,
-                       const npy_intp[:, ::] indices_k1d,
-                       double[:, ::1] out,
-                      ):
-        """Evaluate an N-dim tensor product spline or its derivative.
-
-        Parameters
-        ----------
-        xi : ndarray, shape(npoints, ndim)
-            ``npoints`` values to evaluate the spline at, each value is
-            a point in an ``ndim``-dimensional space.
-        t : ndarray, shape(ndim, max_len_t)
-            Array of knots for each dimension.
-            This array packs the tuple of knot arrays per dimension into a single
-            2D array. The array is ragged (knot lengths may differ), hence
-            the real knots in dimension ``d`` are ``t[d, :len_t[d]]``.
-        len_t : ndarray, 1D, shape (ndim,)
-            Lengths of the knot arrays, per dimension.
-        k : tuple of ints, len(ndim)
-            Spline degrees in each dimension.
-        nu : ndarray of ints, shape(ndim,)
-            Orders of derivatives to compute, per dimension.
-        extrapolate : int
-            Whether to extrapolate out of bounds or return nans.
-        c1r: ndarray, one-dimensional
-            Flattened array of coefficients.
-            The original N-dimensional coefficient array ``c`` has shape
-            ``(n1, ..., nd, ...)`` where each ``ni == len(t[d]) - k[d] - 1``,
-            and the second "..." represents trailing dimensions of ``c``.
-            In code, given the C-ordered array ``c``, ``c1r`` is
-            ``c1 = c.reshape(c.shape[:ndim] + (-1,)); c1r = c1.ravel()``
-        num_c_tr : int
-            The number of elements of ``c1r``, which correspond to the trailing
-            dimensions of ``c``. In code, this is
-            ``c1 = c.reshape(c.shape[:ndim] + (-1,)); num_c_tr = c1.shape[-1]``.
-        strides_c1 : ndarray, one-dimensional
-            Pre-computed strides of the ``c1`` array.
-            Note: These are *data* strides, not numpy-style byte strides.
-            This array is equivalent to
-            ``[stride // s1.dtype.itemsize for stride in s1.strides]``.
-        indices_k1d : ndarray, shape((k+1)**ndim, ndim)
-            Pre-computed mapping between indices for iterating over a flattened
-            array of shape ``[k[d] + 1) for d in range(ndim)`` and
-            ndim-dimensional indices of the ``(k+1,)*ndim`` dimensional array.
-            This is essentially a transposed version of
-            ``np.unravel_index(np.arange((k+1)**ndim), (k+1,)*ndim)``.
-        out : ndarray, shape (npoints, num_c_tr)
-            Output values of the b-spline at given ``xi`` points.
-
-        Notes
-        -----
-
-        This function is essentially equivalent to the following: given an
-        N-dimensional vector ``x = (x1, x2, ..., xN)``, iterate over the
-        dimensions, form linear combinations of products,
-        B(x1) * B(x2) * ... B(xN) of (k+1)**N b-splines which are non-zero
-        at ``x``.
-
-        Since b-splines are localized, the sum has (k+1)**N non-zero elements.
-
-        If ``i = (i1, i2, ..., iN)`` is a vector if intervals of the knot
-        vectors, ``t[d, id] <= xd < t[d, id+1]``, for ``d=1, 2, ..., N``, then
-        the core loop of this function is nothing but
-
-        ```
-        result = 0
-        iters = [range(i[d] - self.k[d], i[d] + 1) for d in range(ndim)]
-        for idx in itertools.product(*iters):
-            term = self.c[idx] * np.prod([B(x[d], self.k[d], idx[d], self.t[d])
-                                          for d in range(ndim)])
-            result += term
-        ```
-
-        For efficiency reasons, we iterate over the flattened versions of the
-        arrays.
-
-        """
-        cdef:
-            npy_intp ndim = len(t)
-
-            # 'intervals': indices for a point in xi into the knot arrays t
-            npy_intp[::1] i = np.empty(ndim, dtype=np.intp)
-
-            # container for non-zero b-splines at each point in xi
-            double[:, ::1] b = np.empty((ndim, max(k) + 1), dtype=float)
-
-            const double[::1] xv     # an ndim-dimensional input point
-            double xd               # d-th component of x
-
-            const double[::1] td    # knots in dimension d
-
-            npy_intp kd             # d-th component of k
-
-            npy_intp i_c      # index to loop over range(num_c_tr)
-            npy_intp iflat    # index to loop over (k+1)**ndim non-zero terms
-            npy_intp volume   # the number of non-zero terms
-            const npy_intp[:] idx_b   # ndim-dimensional index corresponding to iflat
-
-            int out_of_bounds
-            npy_intp idx_cflat_base, idx
-            double factor
-            double[::1] wrk = np.empty(2*max(k) + 2, dtype=float)
-
-        if xi.shape[1] != ndim:
-            raise ValueError(f"Expacted data points in {ndim}-D space, got"
-                             f" {xi.shape[1]}-D points.")
-
-        if out.shape[0] != xi.shape[0]:
-            raise ValueError(f"out and xi are inconsistent: expected"
-                             f" {xi.shape[0]} output values, got"
-                             f" {out.shape[0]}.")
-        if out.shape[1] != num_c_tr:
-            raise ValueError(f"out and c are inconsistent: num_c={num_c_tr} "
-                             f" and out.shape[1] = {out.shape[1]}.")
-
-
-        with nogil:
-            # the number of non-zero terms for each point in ``xi``.
-            volume = 1
-            for d in range(ndim):
-                volume *= k[d] + 1
-
-            ### Iterate over the data points
-            for j in range(xi.shape[0]):
-                xv = xi[j, :]
-
-                # For each point, iterate over the dimensions
-                out_of_bounds = 0
-                for d in range(ndim):
-                    td = t[d, :len_t[d]]
-                    xd = xv[d]
-                    kd = k[d]
-
-                    # get the location of x[d] in t[d]
-                    i[d] = _find_interval(&td[0], td.shape[0], kd, xd, kd, extrapolate)
-
-                    if i[d] < 0:
-                        out_of_bounds = 1
-                        break
-
-                    # compute non-zero b-splines at this value of xd in dimension d
-                    _deBoor_D(&td[0], xd, kd, i[d], nu[d], &wrk[0])
-                    b[d, :kd+1] = wrk[:kd+1]
-
-                if out_of_bounds:
-                    # xd was nan or extrapolate=False: Fill the output array
-                    # *for this xv value*, and continue to the next xv in xi.
-                    for i_c in range(num_c_tr):
-                        out[j, i_c] = NAN
-                    continue
-
-                for i_c in range(num_c_tr):
-                    out[j, i_c] = 0.0
-
-                # iterate over the direct products of non-zero b-splines
-                for iflat in range(volume):
-                    idx_b = indices_k1d[iflat, :]
-                    # The line above is equivalent to
-                    # idx_b = np.unravel_index(iflat, (k+1,)*ndim)
-
-                    # From the indices in ``idx_b``, we prepare to index into
-                    # c1.ravel() : for each dimension d, need to shift the index
-                    # by ``i[d] - k[d]`` (see the docstring above).
-                    #
-                    # Since the strides of `c1` are pre-computed, and the array
-                    # is already raveled and is guaranteed to be C-ordered, we only
-                    # need to compute the base index for iterating over ``num_c_tr``
-                    # elements which represent the trailing dimensions of ``c``.
-                    #
-                    # This all is essentially equivalent to iterating over
-                    # idx_cflat = np.ravel_multi_index(tuple(idx_c) + (i_c,),
-                    #                                  c1.shape)
-                    idx_cflat_base = 0
-                    factor = 1.0
-                    for d in range(ndim):
-                        factor *= b[d, idx_b[d]]
-                        idx = idx_b[d] + i[d] - k[d]
-                        idx_cflat_base += idx * strides_c1[d]
-
-                    ### collect linear combinations of coef * factor
-                    for i_c in range(num_c_tr):
-                        out[j, i_c] = out[j, i_c] + c1r[idx_cflat_base + i_c] * factor
-
-
-@cython.wraparound(False)
-@cython.nonecheck(False)
-@cython.boundscheck(False)
-def _colloc_nd(const double[:, ::1] xvals,
-               const double[:, ::1] _t,
-               const npy_int32[::1] len_t,
-               const npy_int32[::1] k,
-               const npy_intp[:, ::1] _indices_k1d,
-               const npy_intp[::1] _cstrides):
-    """Construct the N-D tensor product collocation matrix as a CSR array.
-
-    In the dense representation, each row of the collocation matrix corresponds
-    to a data point and contains non-zero b-spline basis functions which are
-    non-zero at this data point.
-
-    Parameters
-    ----------
-    xvals : ndarray, shape(size, ndim)
-        Data points. ``xvals[j, :]`` gives the ``j``-th data point as an
-        ``ndim``-dimensional array.
-    t : tuple of 1D arrays, length-ndim
-        Tuple of knot vectors
-    k : ndarray, shape (ndim,)
-        Spline degrees
-
-    Returns
-    -------
-    csr_data, csr_indices, csr_indptr
-        The collocation matrix in the CSR array format.
-
-    Notes
-    -----
-    Algorithm: given `xvals` and the tuple of knots `t`, we construct a tensor
-    product spline, i.e. a linear combination of
-
-       B(x1; i1, t1) * B(x2; i2, t2) * ... * B(xN; iN, tN)
-
-
-    Here ``B(x; i, t)`` is the ``i``-th b-spline defined by the knot vector
-    ``t`` evaluated at ``x``.
-
-    Since ``B`` functions are localized, for each point `(x1, ..., xN)` we
-    loop over the dimensions, and
-    - find the location in the knot array, `t[i] <= x < t[i+1]`,
-    - compute all non-zero `B` values
-    - place these values into the relevant row
-
-    In the dense representation, the collocation matrix would have had a row per
-    data point, and each row has the values of the basis elements (i.e., tensor
-    products of B-splines) evaluated at this data point. Since the matrix is very
-    sparse (has size = len(x)**ndim, with only (k+1)**ndim non-zero elements per
-    row), we construct it in the CSR format.
-    """
-    cdef:
-        npy_intp size = xvals.shape[0]
-        npy_intp ndim = xvals.shape[1]
-
-        # 'intervals': indices for a point in xi into the knot arrays t
-        npy_intp[::1] i = np.empty(ndim, dtype=np.intp)
-
-        # container for non-zero b-splines at each point in xi
-        double[:, ::1] b = np.empty((ndim, max(k) + 1), dtype=float)
-
-        double xd               # d-th component of x
-        const double[::1] td    # knots in the dimension d
-        npy_intp kd             # d-th component of k
-
-        npy_intp iflat    # index to loop over (k+1)**ndim non-zero terms
-        npy_intp volume   # the number of non-zero terms
-
-        # shifted indices into the data array
-        npy_intp[::1] idx_c = np.ones(ndim, dtype=np.intp) * (-101)  # any sentinel would do, really
-        npy_intp idx_cflat
-
-        npy_intp[::1] nu = np.zeros(ndim, dtype=np.intp)
-
-        int out_of_bounds
-        double factor
-        double[::1] wrk = np.empty(2*max(k) + 2, dtype=float)
-
-        # output
-        double[::1] csr_data
-        npy_int64[::1] csr_indices
-
-        int j, d
-
-    # the number of non-zero b-splines for each data point.
-    volume = 1
-    for d in range(ndim):
-        volume *= k[d] + 1
-
-    # Allocate the collocation matrix in the CSR format.
-    # If dense, this would have been
-    # >>> matr = np.zeros((size, max_row_index), dtype=float)
-    csr_indices = np.empty(shape=(size*volume,), dtype=np.int64)
-    csr_data = np.empty(shape=(size*volume,), dtype=float)
-    csr_indptr = np.arange(0, volume*size + 1, volume, dtype=np.int64)
-
-    # ### Iterate over the data points ###
-    for j in range(size):
-        xv = xvals[j, :]
-
-        # For each point, iterate over the dimensions
-        out_of_bounds = 0
-        for d in range(ndim):
-            td = _t[d, :len_t[d]]
-            xd = xv[d]
-            kd = k[d]
-
-            # get the location of x[d] in t[d]
-            i[d] = _find_interval(&td[0], td.shape[0], kd, xd, kd, True)
-
-            if i[d] < 0:
-                out_of_bounds = 1
-                break
-
-            # compute non-zero b-splines at this value of xd in dimension d
-            _deBoor_D(&td[0], xd, kd, i[d], nu[d], &wrk[0])
-            b[d, :kd+1] = wrk[:kd+1]
-
-        if out_of_bounds:
-            raise ValueError(f"Out of bounds in {d = }, with {xv = }")
-
-        # Iterate over the products of non-zero b-splines and place them
-        # into the current row of the design matrix
-        for iflat in range(volume):
-            # the line below is an unrolled version of
-            # idx_b = np.unravel_index(iflat,  tuple(kd+1 for kd in k))
-            idx_b = _indices_k1d[iflat, :]
-
-            factor = 1.0
-            idx_cflat = 0
-            for d in range(ndim):
-                factor *= b[d, idx_b[d]]
-                idx_c[d] = idx_b[d] + i[d] - k[d]
-                idx_cflat += idx_c[d] * _cstrides[d]
-
-            # The `idx_cflat` computation above is an unrolled version of
-            # idx_cflat = np.ravel_multi_index(tuple(idx_c), c_shape)
-
-            # Fill the row of the collocation matrix in the CSR format.
-            # If it were dense, it would have been just
-            # >>> matr[j, idx_cflat] = factor
-
-            # Each row of the full matrix has `volume` non-zero elements.
-            # Thus the CSR format `indptr` increases in steps of `volume`
-            csr_indices[j*volume + iflat] = idx_cflat
-            csr_data[j*volume + iflat] = factor
-
-    return np.asarray(csr_data), np.asarray(csr_indices), csr_indptr
-
diff --git a/scipy/interpolate/_ndbspline.py b/scipy/interpolate/_ndbspline.py
index 51ac566ed5ff..0b2dea8ca5fc 100644
--- a/scipy/interpolate/_ndbspline.py
+++ b/scipy/interpolate/_ndbspline.py
@@ -5,7 +5,7 @@
 
 from math import prod
 
-from . import _bspl   # type: ignore[attr-defined]
+from . import _dierckx  # type: ignore[attr-defined]
 
 import scipy.sparse.linalg as ssl
 from scipy.sparse import csr_array
@@ -139,9 +139,9 @@ def __call__(self, xi, *, nu=None, extrapolate=None):
         extrapolate = bool(extrapolate)
 
         if nu is None:
-            nu = np.zeros((ndim,), dtype=np.intc)
+            nu = np.zeros((ndim,), dtype=np.int64)
         else:
-            nu = np.asarray(nu, dtype=np.intc)
+            nu = np.asarray(nu, dtype=np.int64)
             if nu.ndim != 1 or nu.shape[0] != ndim:
                 raise ValueError(
                     f"invalid number of derivative orders {nu = } for "
@@ -173,12 +173,10 @@ def __call__(self, xi, *, nu=None, extrapolate=None):
 
         # replacement for np.ravel_multi_index for indexing of `c1`:
         _strides_c1 = np.asarray([s // c1.dtype.itemsize
-                                  for s in c1.strides], dtype=np.intp)
+                                  for s in c1.strides], dtype=np.int64)
 
         num_c_tr = c1.shape[-1]  # # of trailing coefficients
-        out = np.empty(xi.shape[:-1] + (num_c_tr,), dtype=c1.dtype)
-
-        _bspl.evaluate_ndbspline(xi,
+        out = _dierckx.evaluate_ndbspline(xi,
                                  self._t,
                                  self._len_t,
                                  self._k,
@@ -188,7 +186,7 @@ def __call__(self, xi, *, nu=None, extrapolate=None):
                                  num_c_tr,
                                  _strides_c1,
                                  self._indices_k1d,
-                                 out,)
+        )
         out = out.view(self.c.dtype)
         return out.reshape(xi_shape[:-1] + self.c.shape[ndim:])
 
@@ -235,15 +233,12 @@ def design_matrix(cls, xvals, t, k, extrapolate=True):
         # The strides of the coeffs array: the computation is equivalent to
         # >>> cstrides = [s // 8 for s in np.empty(c_shape).strides]
         cs = c_shape[1:] + (1,)
-        cstrides = np.cumprod(cs[::-1], dtype=np.intp)[::-1].copy()
+        cstrides = np.cumprod(cs[::-1], dtype=np.int64)[::-1].copy()
 
         # heavy lifting happens here
-        data, indices, indptr = _bspl._colloc_nd(xvals,
-                                                _t,
-                                                len_t,
-                                                k,
-                                                _indices_k1d,
-                                                cstrides)
+        data, indices, indptr = _dierckx._coloc_nd(xvals,
+                _t, len_t, k, _indices_k1d, cstrides)
+
         return csr_array((data, indices, indptr))
 
 
@@ -271,7 +266,7 @@ def _preprocess_inputs(k, t_tpl):
         # make k a tuple
         k = (k,)*ndim
 
-    k = np.asarray([operator.index(ki) for ki in k], dtype=np.int32)
+    k = np.asarray([operator.index(ki) for ki in k], dtype=np.int64)
 
     if len(k) != ndim:
         raise ValueError(f"len(t) = {len(t_tpl)} != {len(k) = }.")
@@ -305,7 +300,7 @@ def _preprocess_inputs(k, t_tpl):
     # non-zero b-spline elements
     shape = tuple(kd + 1 for kd in k)
     indices = np.unravel_index(np.arange(prod(shape)), shape)
-    _indices_k1d = np.asarray(indices, dtype=np.intp).T.copy()
+    _indices_k1d = np.asarray(indices, dtype=np.int64).T.copy()
 
     # 5. pack the knots into a single array:
     #    ([1, 2, 3, 4], [5, 6], (7, 8, 9)) -->
@@ -318,7 +313,7 @@ def _preprocess_inputs(k, t_tpl):
     _t.fill(np.nan)
     for d in range(ndim):
         _t[d, :len(t_tpl[d])] = t_tpl[d]
-    len_t = np.asarray(len_t, dtype=np.int32)
+    len_t = np.asarray(len_t, dtype=np.int64)
 
     return k, _indices_k1d, (_t, len_t)
 
diff --git a/scipy/interpolate/meson.build b/scipy/interpolate/meson.build
index 72e2d20fd1dd..8223134b05db 100644
--- a/scipy/interpolate/meson.build
+++ b/scipy/interpolate/meson.build
@@ -87,10 +87,15 @@ fitpack_src = [
   'fitpack/pardtc.f'
 ]
 
-# TODO: Add flags for 64 bit ints
+if use_ilp64
+  _fflag_intsize = _fflag_ilp64
+else
+  _fflag_intsize = _fflag_lp64
+endif
+
 fitpack_lib = static_library('fitpack_lib',
   fitpack_src,
-  fortran_args: _fflag_Wno_maybe_uninitialized,
+  fortran_args: [_fflag_Wno_maybe_uninitialized, _fflag_intsize],
   override_options: ['b_lto=false'],
   gnu_symbol_visibility: 'hidden',
 )
@@ -123,23 +128,13 @@ py3.extension_module('_rgi_cython',
 
 __fitpack_lib = static_library('__fitpack',
     ['src/__fitpack.h', 'src/__fitpack.cc'],
-    dependencies:[lapack, np_dep, py3_dep],
+    dependencies:[lapack_ilp64, np_dep, py3_dep],
 )
 
 __fitpack_dep = declare_dependency(
     link_with: __fitpack_lib,
 )
 
-py3.extension_module('_bspl',
-  cython_gen_cpp.process('_bspl.pyx'),
-  cpp_args: cython_cpp_args,
-  include_directories: 'src/',
-  dependencies: [lapack, np_dep, __fitpack_dep],
-  link_args: version_link_args,
-  install: true,
-  subdir: 'scipy/interpolate'
-)
-
 
 py3.extension_module('_dierckx',
     ['src/_dierckxmodule.cc'],
@@ -150,9 +145,14 @@ py3.extension_module('_dierckx',
     subdir: 'scipy/interpolate'
 )
 
-# TODO: Add flags for 64 bit ints
+# Build _fitpack and dfitpack extensions: both are FITPACK wrappers.
+# XXX: some functions from dfitpack use the F_INT macro defined in dfitpack.pyf, while
+#      others rely on the build flags only. Consider cleaning this up at some point.
+
+
 py3.extension_module('_fitpack',
   ['src/_fitpackmodule.c'],
+  c_args: c_flags_ilp64,
   link_with: [fitpack_lib],
   include_directories: 'src/',
   dependencies: np_dep,
@@ -162,12 +162,20 @@ py3.extension_module('_fitpack',
   subdir: 'scipy/interpolate'
 )
 
-# TODO: Add flags for 64 bit ints
+if use_ilp64
+   # generator only accepts strings, not files
+   f2c_map_file = f2py_ilp64_opts[1]
+   extra_arg = f2py_ilp64_opts[0] + '=' + fs.parent(f2c_map_file) / fs.name(f2c_map_file)
+  _dfitpackmodule_obj = f2py_gen.process('src/dfitpack.pyf', extra_args: extra_arg)
+else
+  _dfitpackmodule_obj = f2py_gen.process('src/dfitpack.pyf')
+endif
+
 py3.extension_module('_dfitpack',
-  f2py_gen.process('src/dfitpack.pyf'),
-  c_args: [Wno_unused_variable],
+  _dfitpackmodule_obj,
+  c_args: [Wno_unused_variable] +  c_flags_ilp64,
   link_args: version_link_args,
-  dependencies: [lapack_dep, fortranobject_dep],
+  dependencies: [fortranobject_dep],
   link_with: [fitpack_lib],
   override_options: ['b_lto=false'],
   install: true,
diff --git a/scipy/interpolate/src/__fitpack.cc b/scipy/interpolate/src/__fitpack.cc
index 247f28119fd7..27d9e25c9e9e 100644
--- a/scipy/interpolate/src/__fitpack.cc
+++ b/scipy/interpolate/src/__fitpack.cc
@@ -1,4 +1,7 @@
 #include <string>
+#include <cstdint>
+#include <vector>
+#include <algorithm>
 #include "__fitpack.h"
 
 namespace fitpack{
@@ -215,7 +218,6 @@ data_matrix( /* inputs */
     triangularized matrix.
 
     This routine MODIFIES `a` & `y` in-place.
-
  */
 void
 qr_reduce(double *aptr, const int64_t m, const int64_t nz, // a(m, nz), packed
@@ -426,7 +428,6 @@ fpknot(const double *x_ptr, int64_t m,
 }
 
 
-
 /*
  * Evaluate the spline function
 */
@@ -470,7 +471,6 @@ _evaluate_spline(
                     out(ip, jp) += c(interval + a -k, jp) * wrk[a];
                 }
             }
-
         }
     }
 }
@@ -515,6 +515,7 @@ _coloc_matrix(const double *xptr, int64_t m,       // x, shape(m,)
     }
 }
 
+
 void
 norm_eq_lsq(const double *xptr, int64_t m,            // x, shape (m,)
               const double *tptr, int64_t len_t,        // t, shape (len_t,)
@@ -568,4 +569,234 @@ norm_eq_lsq(const double *xptr, int64_t m,            // x, shape (m,)
     }
 }
 
+
+/*** NDBSpline ***/
+
+/* Evaluate an N-dim tensor product spline or its derivative: fill out(j, :) for each xi(j, :) */
+void
+_evaluate_ndbspline(const double *xi_ptr, int64_t npts, int64_t ndim,  // xi, shape(npts, ndim)
+                    const double *t_ptr, int64_t max_len_t,            // t, shape (ndim, max_len_t)
+                    const int64_t *len_t_ptr,                          // len_t, shape (ndim,)
+                    const int64_t *k_ptr,                              // k, shape (ndim,)
+                    const int64_t *nu_ptr,                             // nu, shape (ndim,)
+                    int i_extrap,                                      // forwarded to _find_interval
+                    const double *c1_ptr,  int64_t num_c1,             // flattened coefficients
+                    // pre-tabulated helpers for iterating over (k+1)**ndim subarrays
+                    const int64_t *strides_c1_ptr,                           // shape (ndim,)
+                    const int64_t *indices_k1d_ptr,  int64_t num_k1d,        // shape (num_k1d, ndim)
+                    double *out_ptr, int64_t num_c_tr    // out, shape(npts, num_c_tr)
+)
+{
+    auto xi = ConstRealArray2D(xi_ptr, npts, ndim);
+    auto t = ConstRealArray2D(t_ptr, ndim, max_len_t);
+    auto len_t = ConstIndexArray1D(len_t_ptr, ndim);
+    auto k = ConstIndexArray1D(k_ptr, ndim);
+    auto nu = ConstIndexArray1D(nu_ptr, ndim);
+    auto c1 = ConstRealArray1D(c1_ptr, num_c1);
+    auto strides_c1 = ConstIndexArray1D(strides_c1_ptr, ndim);
+    auto indices_k1d = ConstIndexArray2D(indices_k1d_ptr, num_k1d, ndim);
+    auto out = RealArray2D(out_ptr, npts, num_c_tr);
+
+    // allocate work arrays (small, allocations unlikely to fail)
+    int64_t max_k = *std::max_element(k_ptr, k_ptr + ndim);
+    std::vector<double> wrk(2*max_k + 2);
+    std::vector<int64_t> i(ndim);
+
+    std::vector<double> v_b(ndim * (max_k + 1));
+    auto b = RealArray2D(v_b.data(), ndim, max_k + 1);
+
+    // the number of non-zero terms for each point in ``xi``
+    int64_t volume = 1;
+    for (int64_t d=0; d < ndim; d++) {
+        volume *= k(d) + 1;
+    }
+
+    // Iterate over the data points
+    for (int64_t j=0; j < npts; j++){
+
+        // For each point, iterate over the dimensions
+        bool out_of_bounds = false;
+        for(int64_t d=0; d < ndim; d++) {
+            double xd = xi(j, d);
+            int64_t kd = k(d);
+
+            // knots in the dimension d: row d of the (ndim, max_len_t) array
+            const double *td = t.data + max_len_t*d;
+
+            // get the location of x[d] in td; a negative result means "no valid interval"
+            int64_t i_d = _find_interval(td, len_t(d), kd, xd, kd, i_extrap);
+
+            if (i_d < 0) {
+                out_of_bounds = true;
+                break;
+            }
+
+            // compute non-zero b-splines at this value of xd in dimension d
+            _deBoor_D(td, xd, kd, i_d, nu(d), wrk.data());
+
+            for (int64_t s=0; s < kd + 1; s++) {
+                b(d, s) = wrk[s];
+            }
+            i[d] = i_d;
+        } // for (d=...
+
+        if (out_of_bounds) {
+            // xd was nan or extrapolate=False: Fill the output array
+            // for this data point, xi(j, :), and continue to the next xv in xi.
+
+            for (int64_t i_c=0; i_c < num_c_tr; i_c++) {
+                out(j, i_c) = std::numeric_limits<double>::quiet_NaN();
+            }
+            continue;
+        }
+
+        // proceed to combining non-zero terms
+        for(int64_t i_c=0; i_c < num_c_tr; i_c++) {
+            out(j, i_c) = 0;
+        }
+
+        // iterate over the direct product of non-zero b-splines
+        for (int64_t iflat=0; iflat < volume; iflat++) {
+            /* `idx_b = indices_k1d[iflat, :]` assignment is equivalent to
+             * idx_b = np.unravel_index(iflat, (k+1,)*ndim)
+             * i.e. `idx_b` would be an ndim-dimensional index corresponding to
+             * `iflat`.
+             *
+             * From the indices in ``idx_b``, we prepare to index into
+             * c1.ravel() : for each dimension d, need to shift the index
+             * by ``i[d] - k[d]``, where ``i[d]`` is the knot interval found above.
+             *
+             * Since the strides of `c1` are pre-computed, and the array
+             * is already raveled and is guaranteed to be C-ordered, we only
+             * need to compute the base index for iterating over ``num_c_tr``
+             * elements which represent the trailing dimensions of ``c``.
+             *
+             * This all is essentially equivalent to iterating over
+             * idx_cflat = np.ravel_multi_index(tuple(idx_c) + (i_c,),
+             *                                  c1.shape)
+             */
+            int64_t idx_cflat_base = 0;
+            double factor = 1.0;
+            for (int64_t d=0; d < ndim; d++) {
+                int64_t idx_d = indices_k1d(iflat, d);
+                factor *= b(d, idx_d);
+                int64_t idx = idx_d + i[d] - k(d);
+                idx_cflat_base += idx * strides_c1(d);
+            }
+
+            // finally, collect linear combinations of coef * factor
+            for (int64_t i_c=0; i_c < num_c_tr; i_c++) {
+                out(j, i_c) += c1(idx_cflat_base + i_c) * factor;
+            }
+        }
+    } // for (j=...
+}
+
+
+/*
+ * Construct the N-D tensor product collocation matrix as a CSR array.
+ * Return 0 on success. If the 0-based data point `j` is out of bounds (or nan),
+ * return `-(j + 1)`: strictly negative, so that `j == 0` is not mistaken for success.
+ */
+int
+_coloc_nd(/* inputs */
+          const double *xi_ptr, int64_t npts, int64_t ndim,  // xi, shape(npts, ndim)
+          const double *t_ptr, int64_t max_len_t,            // t, shape (ndim, max_len_t)
+          const int64_t *len_t_ptr,                          // len_t, shape (ndim,)
+          const int64_t *k_ptr,                              // k, shape (ndim,)
+          /* pre-tabulated helpers for iterating over (k+1)**ndim subarrays */
+          const int64_t *indices_k1d_ptr, int64_t num_k1d,        // shape (num_k1d, ndim)
+          const int64_t *strides_c1_ptr,                          // shape (ndim,)
+          /* outputs */
+          int64_t *csr_indices_ptr, int64_t volume,               // shape (npts*volume,)
+          double *csr_data_ptr
+)
+{
+    auto xi = ConstRealArray2D(xi_ptr, npts, ndim);
+    auto t = ConstRealArray2D(t_ptr, ndim, max_len_t);
+    auto len_t = ConstIndexArray1D(len_t_ptr, ndim);
+    auto k = ConstIndexArray1D(k_ptr, ndim);
+
+    auto strides_c1 = ConstIndexArray1D(strides_c1_ptr, ndim);
+    auto indices_k1d = ConstIndexArray2D(indices_k1d_ptr, num_k1d, ndim);
+
+    auto csr_indices = IndexArray1D(csr_indices_ptr, npts*volume);
+    auto csr_data = RealArray1D(csr_data_ptr, npts*volume);
+
+    // allocate work arrays (small, allocations unlikely to fail)
+    int64_t max_k = *std::max_element(k_ptr, k_ptr + ndim);
+    std::vector<double> wrk(2*max_k + 2);
+    std::vector<int64_t> i(ndim);
+
+    std::vector<double> v_b(ndim * (max_k + 1));
+    auto b = RealArray2D(v_b.data(), ndim, max_k + 1);
+
+    // Iterate over the data points
+    for (int64_t j=0; j < npts; j++){
+
+        // For each point, iterate over the dimensions
+        bool out_of_bounds = false;
+        for(int64_t d=0; d < ndim; d++) {
+            double xd = xi(j, d);
+            int64_t kd = k(d);
+
+            // knots in the dimension d: row d of the (ndim, max_len_t) array
+            const double *td = t.data + max_len_t*d;
+
+            // get the location of x[d] in td (extrapolation flag is always on here)
+            int64_t i_d = _find_interval(td, len_t(d), kd, xd, kd, 1);
+
+            if (i_d < 0) {
+                out_of_bounds = true;
+                break;
+            }
+
+            // compute non-zero b-splines (zeroth derivative) at this value of xd in dimension d
+            _deBoor_D(td, xd, kd, i_d, 0, wrk.data());
+
+            for (int64_t s=0; s < kd + 1; s++) {
+                b(d, s) = wrk[s];
+            }
+            i[d] = i_d;
+        } // for (d=...
+
+        if (out_of_bounds) {
+            // bail out: -(j + 1) is negative even for j == 0
+            return static_cast<int>(-(j + 1));
+        }
+
+        // Iterate over the products of non-zero b-splines and place them
+        // into the current row of the design matrix
+        for (int64_t iflat=0; iflat < volume; iflat++) {
+            // The `idx_cflat` computation is an unrolled version of
+            // idx_cflat = np.ravel_multi_index(tuple(idx_c), c_shape)
+            //
+            // `indices_k1d` array is pre-tabulated such that `idx_d` is a d-th component
+            // of `idx_b = np.unravel_index(iflat,  tuple(kd+1 for kd in k))`
+            int64_t idx_cflat = 0;
+            double factor = 1.0;
+            for (int64_t d=0; d < ndim; d++) {
+                int64_t idx_d = indices_k1d(iflat, d);
+                factor *= b(d, idx_d);
+                int64_t idx = idx_d + i[d] - k(d);
+                idx_cflat += idx * strides_c1(d);
+            }
+
+            /*
+             *  Fill the row of the colocation matrix in the CSR format.
+             * If it were dense, it would have been just
+             * >>> matr[j, idx_cflat] = factor
+             *
+             * Each row of the full matrix has `volume` non-zero elements.
+             * Thus the CSR format `indptr` increases in steps of `volume`
+             */
+            csr_indices(j*volume + iflat) = idx_cflat;
+            csr_data(j*volume + iflat) = factor;
+        }  // for (iflat=...
+    } // for( j=...
+
+    return 0;
+}
+
+
 } // namespace fitpack
diff --git a/scipy/interpolate/src/__fitpack.h b/scipy/interpolate/src/__fitpack.h
index c39e2dacb7c8..d803844ae0cf 100644
--- a/scipy/interpolate/src/__fitpack.h
+++ b/scipy/interpolate/src/__fitpack.h
@@ -101,6 +101,10 @@ typedef Array1D<double, BOUNDS_CHECK> RealArray1D;
 typedef Array1D<const double, BOUNDS_CHECK> ConstRealArray1D;
 typedef Array2D<const double, BOUNDS_CHECK> ConstRealArray2D;
 
+typedef Array1D<int64_t, BOUNDS_CHECK> IndexArray1D;             // 1D, int64 elements (writable)
+typedef Array1D<const int64_t, BOUNDS_CHECK> ConstIndexArray1D;  // 1D, const int64 elements
+typedef Array2D<const int64_t, BOUNDS_CHECK> ConstIndexArray2D;  // 2D, const int64 elements
+
 
 
 /*
@@ -231,4 +235,41 @@ norm_eq_lsq(const double *xptr, int64_t m,      // x, shape (m,)
               double *wrk
 );
 
+
+/*
+ * Evaluate an N-D tensor product spline, or its derivative of orders `nu`, at `npts` points.
+ */
+void
+_evaluate_ndbspline(/* inputs */
+                    const double *xi_ptr, int64_t npts, int64_t ndim,  // xi, shape(npts, ndim)
+                    const double *t_ptr, int64_t max_len_t,            // t, shape (ndim, max_len_t)
+                    const int64_t *len_t_ptr,                          // len_t, shape (ndim,)
+                    const int64_t *k_ptr,                              // k, shape (ndim,)
+                    const int64_t *nu_ptr,                             // nu, shape (ndim,)
+                    int i_extrap,                                      // whether to extrapolate out of bounds
+                    /* flattened coefficients */
+                    const double *c1_ptr, int64_t num_c1,              // c1r, shape (num_c1,)
+                    /* pre-tabulated helpers for iterating over (k+1)**ndim subarrays */
+                    const int64_t *strides_c1_ptr,                           // shape (ndim,)
+                    const int64_t *indices_k1d_ptr,  int64_t num_k1d,        // shape (num_k1d, ndim)
+                    /* output */
+                    double *out_ptr, int64_t num_c_tr            // out, shape(npts, num_c_tr)
+);
+
+
+int  // 0 on success, -j when the j-th data point is out of bounds (so j == 0 also yields 0)
+_coloc_nd(/* inputs */
+          const double *xi_ptr, int64_t npts, int64_t ndim,  // xi, shape(npts, ndim)
+          const double *t_ptr, int64_t max_len_t,            // t, shape (ndim, max_len_t)
+          const int64_t *len_t_ptr,                          // len_t, shape (ndim,)
+          const int64_t *k_ptr,                              // k, shape (ndim,)
+          /* pre-tabulated helpers for iterating over (k+1)**ndim subarrays */
+          const int64_t *indices_k1d_ptr, int64_t num_k1d,   // shape (num_k1d, ndim)
+          const int64_t *strides_c1_ptr,                     // shape (ndim,)
+          /* outputs */
+          int64_t *csr_indices_ptr, int64_t volume,          // shape (npts*volume,)
+          double *csr_data_ptr                               // shape (npts*volume,)
+);
+
+
 } // namespace fitpack
diff --git a/scipy/interpolate/src/_dierckxmodule.cc b/scipy/interpolate/src/_dierckxmodule.cc
index c74207c69205..6d51f20dca81 100644
--- a/scipy/interpolate/src/_dierckxmodule.cc
+++ b/scipy/interpolate/src/_dierckxmodule.cc
@@ -706,14 +706,320 @@ py_find_interval(PyObject *self, PyObject *args)
 
     PyObject *py_interval = PyLong_FromSsize_t(interval);
     return py_interval;
+}
+
+
+/*** NDBspline ***/
+
+
+static char doc_evaluate_ndbspline[] =
+        "Evaluate an N-dim tensor product spline or its derivative.\n"
+        "\n"
+        "Parameters\n"
+        "----------\n"
+        "xi : ndarray, shape(npoints, ndim)\n"
+        "    ``npoints`` values to evaluate the spline at, each value is\n"
+        "    a point in an ``ndim``-dimensional space.\n"
+        "t : ndarray, shape(ndim, max_len_t)\n"
+        "    Array of knots for each dimension.\n"
+        "    This array packs the tuple of knot arrays per dimension into a single\n"
+        "    2D array. The array is ragged (knot lengths may differ), hence\n"
+        "    the real knots in dimension ``d`` are ``t[d, :len_t[d]]``.\n"
+        "len_t : ndarray, 1D, shape (ndim,)\n"
+        "    Lengths of the knot arrays, per dimension.\n"
+        "k : ndarray of ints, shape(ndim,)\n"
+        "    Spline degrees in each dimension.\n"
+        "nu : ndarray of ints, shape(ndim,)\n"
+        "    Orders of derivatives to compute, per dimension.\n"
+        "extrapolate : int\n"
+        "    Whether to extrapolate out of bounds or return nans.\n"
+        "c1r : ndarray, one-dimensional\n"
+        "    Flattened array of coefficients.\n"
+        "    The original N-dimensional coefficient array ``c`` has shape\n"
+        "    ``(n1, ..., nd, ...)`` where each ``ni == len(t[d]) - k[d] - 1``,\n"
+        "    and the second '...' represents trailing dimensions of ``c``.\n"
+        "    In code, given the C-ordered array ``c``, ``c1r`` is\n"
+        "    ``c1 = c.reshape(c.shape[:ndim] + (-1,)); c1r = c1.ravel()``\n"
+        "num_c_tr : int\n"
+        "    The number of elements of ``c1r``, which correspond to the trailing\n"
+        "    dimensions of ``c``. In code, this is\n"
+        "    ``c1 = c.reshape(c.shape[:ndim] + (-1,)); num_c_tr = c1.shape[-1]``.\n"
+        "strides_c1 : ndarray, one-dimensional\n"
+        "    Pre-computed strides of the ``c1`` array.\n"
+        "    Note: These are *data* strides, not numpy-style byte strides.\n"
+        "    This array is equivalent to\n"
+        "    ``[stride // c1.dtype.itemsize for stride in c1.strides]``.\n"
+        "indices_k1d : ndarray, shape((k+1)**ndim, ndim)\n"
+        "    Pre-computed mapping between indices for iterating over a flattened\n"
+        "    array of shape ``[k[d] + 1 for d in range(ndim)]`` and\n"
+        "    ndim-dimensional indices of the ``(k+1,)*ndim`` dimensional array.\n"
+        "    This is essentially a transposed version of\n"
+        "    ``np.unravel_index(np.arange((k+1)**ndim), (k+1,)*ndim)``.\n"
+        "\n"
+        "Returns\n"
+        "-------\n"
+        "out : ndarray, shape (npoints, num_c_tr)\n"
+        "    Output values of the b-spline at given ``xi`` points.\n"
+        "\n"
+        "Notes\n"
+        "-----\n"
+        "\n"
+        "This function is essentially equivalent to the following: given an\n"
+        "N-dimensional vector ``x = (x1, x2, ..., xN)``, iterate over the\n"
+        "dimensions, form linear combinations of products,\n"
+        "B(x1) * B(x2) * ... B(xN) of (k+1)**N b-splines which are non-zero\n"
+        "at ``x``.\n"
+        "\n"
+        "Since b-splines are localized, the sum has (k+1)**N non-zero elements.\n"
+        "\n"
+        "If ``i = (i1, i2, ..., iN)`` is a vector of intervals of the knot\n"
+        "vectors, ``t[d, id] <= xd < t[d, id+1]``, for ``d=1, 2, ..., N``, then\n"
+        "the core loop of this function is nothing but\n"
+        "\n"
+        "```\n"
+        "result = 0\n"
+        "iters = [range(i[d] - self.k[d], i[d] + 1) for d in range(ndim)]\n"
+        "for idx in itertools.product(*iters):\n"
+        "    term = self.c[idx] * np.prod([B(x[d], self.k[d], idx[d], self.t[d])\n"
+        "                                  for d in range(ndim)])\n"
+        "    result += term\n"
+        "```\n"
+        "\n"
+        "For efficiency reasons, we iterate over the flattened versions of the arrays.\n";
+/*
+def evaluate_ndbspline(const double[:, ::1] xi,
+                       const double[:, ::1] t,
+                       const npy_int64[::1] len_t,
+                       const npy_int64[::1] k,
+                       npy_int64[::1] nu,
+                       bint extrapolate,
+                       const double[::1] c1r,
+                       int num_c_tr,
+                       const npy_int64[::1] strides_c1,
+                       const npy_int64[:, ::1] indices_k1d):
+*/
+static PyObject*
+py_evaluate_ndbspline(PyObject *self, PyObject *args)
+{
+    // Validate the inputs, allocate `out` of shape (npoints, num_c_tr) and let
+    // fitpack::_evaluate_ndbspline fill it. Returns `out`, or NULL with an error set.
+    PyObject *py_xi=NULL;
+    PyObject *py_t=NULL, *py_c1r=NULL, *py_strides_c1=NULL, *py_indices_k1d=NULL;
+    PyObject *py_len_t=NULL, *py_k=NULL, *py_nu=NULL;
+    int num_c_tr;
+    int i_extrap;
+
+    if(!PyArg_ParseTuple(args, "OOOOOiOiOO",
+                         &py_xi, &py_t, &py_len_t, &py_k, &py_nu, &i_extrap,
+                         &py_c1r, &num_c_tr, &py_strides_c1, &py_indices_k1d)) {
+        return NULL;
+    }
+
+    if (!(check_array(py_xi, 2, NPY_DOUBLE) &&
+          check_array(py_t, 2, NPY_DOUBLE) &&
+          check_array(py_len_t, 1, NPY_INT64) &&
+          check_array(py_k, 1, NPY_INT64) &&
+          check_array(py_nu, 1, NPY_INT64) &&
+          check_array(py_c1r, 1, NPY_DOUBLE) &&
+          check_array(py_strides_c1, 1, NPY_INT64) &&
+          check_array(py_indices_k1d, 2, NPY_INT64))) {
+        return NULL;
+    }
+    PyArrayObject *a_xi = (PyArrayObject *)py_xi;
+    PyArrayObject *a_t = (PyArrayObject *)py_t;
+    PyArrayObject *a_len_t = (PyArrayObject *)py_len_t;
+    PyArrayObject *a_k = (PyArrayObject *)py_k;
+    PyArrayObject *a_nu = (PyArrayObject *)py_nu;
+    PyArrayObject *a_c1r = (PyArrayObject *)py_c1r;
+    PyArrayObject *a_strides_c1 = (PyArrayObject *)py_strides_c1;
+    PyArrayObject *a_indices_k1d = (PyArrayObject *)py_indices_k1d;
+
+    // sanity checks
+    int64_t ndim = PyArray_DIM(a_t, 0);
+    if (PyArray_DIM(a_xi, 1) != ndim) {
+        std::string msg = ("Expected data points in " + std::to_string(ndim) + "-D"
+                           " space, got " + std::to_string(PyArray_DIM(a_xi, 1)) +
+                           "-D points.");
+        PyErr_SetString(PyExc_ValueError, msg.c_str());
+        return NULL;
+    }
+
+    // allocate the output
+    npy_intp dims[2] = {PyArray_DIM(a_xi, 0), num_c_tr};
+    PyArrayObject *a_out = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_DOUBLE);
+    if (a_out == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    // heavy lifting happens here
+    try {
+        fitpack::_evaluate_ndbspline(
+            /* inputs */
+            static_cast<const double *>(PyArray_DATA(a_xi)), PyArray_DIM(a_xi, 0), PyArray_DIM(a_xi, 1),
+            static_cast<const double *>(PyArray_DATA(a_t)), PyArray_DIM(a_t, 1),
+            static_cast<const int64_t *>(PyArray_DATA(a_len_t)),
+            static_cast<const int64_t *>(PyArray_DATA(a_k)),
+            static_cast<const int64_t *>(PyArray_DATA(a_nu)),
+            i_extrap,
+            /* flattened coefficients */
+            static_cast<const double *>(PyArray_DATA(a_c1r)), PyArray_DIM(a_c1r, 0),
+            /* tabulated helpers */
+            static_cast<const int64_t *>(PyArray_DATA(a_strides_c1)),
+            static_cast<const int64_t *>(PyArray_DATA(a_indices_k1d)), PyArray_DIM(a_indices_k1d, 0),
+            /* output */
+            static_cast<double*>(PyArray_DATA(a_out)), num_c_tr
+        );
+
+        return (PyObject *)(a_out);
+    }
+    catch (std::exception& e) {
+        // `a_out` is not handed over to the caller on this path: drop our reference
+        Py_DECREF(a_out);
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return NULL;
+    }
+}
+
+
+static char doc_coloc_nd[] =
+    "Construct the N-D tensor product collocation matrix as a CSR array.\n"
+    "\n"
+    "In the dense representation, each row of the collocation matrix corresponds\n"
+    "to a data point and contains non-zero b-spline basis functions which are\n"
+    "non-zero at this data point.\n"
+    "\n"
+    "Parameters\n"
+    "----------\n"
+    "xvals : ndarray, shape(size, ndim)\n"
+    "    Data points. ``xvals[j, :]`` gives the ``j``-th data point as an\n"
+    "    ``ndim``-dimensional array.\n"
+    "t : ndarray, shape(ndim, max_len_t)\n    Knots: ``t[d, :len_t[d]]`` for dimension ``d``.\n"
+    "len_t : ndarray, shape (ndim,)\n    Lengths of the knot arrays, per dimension.\n"
+    "k : ndarray, shape (ndim,)\n    Spline degrees\n"
+    "indices_k1d, strides_c1 : ndarray\n    Pre-tabulated index helpers, cf. ``evaluate_ndbspline``.\n"
+    "\n"
+    "Returns\n"
+    "-------\n"
+    "csr_data, csr_indices, csr_indptr\n"
+    "    The collocation matrix in the CSR array format.\n"
+    "\n"
+    "Notes\n"
+    "-----\n"
+    "Algorithm: given `xvals` and the tuple of knots `t`, we construct a tensor\n"
+    "product spline, i.e. a linear combination of\n"
+    "\n"
+    "   B(x1; i1, t1) * B(x2; i2, t2) * ... * B(xN; iN, tN)\n"
+    "\n"
+    "Here ``B(x; i, t)`` is the ``i``-th b-spline defined by the knot vector\n"
+    "``t`` evaluated at ``x``.\n"
+    "\n"
+    "Since ``B`` functions are localized, for each point `(x1, ..., xN)` we\n"
+    "loop over the dimensions, and\n"
+    "- find the location in the knot array, `t[i] <= x < t[i+1]`,\n"
+    "- compute all non-zero `B` values\n"
+    "- place these values into the relevant row\n"
+    "\n"
+    "In the dense representation, the collocation matrix would have had a row per\n"
+    "data point, and each row has the values of the basis elements (i.e., tensor\n"
+    "products of B-splines) evaluated at this data point. Since the matrix is very\n"
+    "sparse (has size = len(x)**ndim, with only (k+1)**ndim non-zero elements per\n"
+    "row), we construct it in the CSR format.\n";
+/*
+def _colloc_nd(const double[:, ::1] xvals,
+               const double[:, ::1] _t,
+               const npy_int64[::1] len_t,
+               const npy_int64[::1] k,
+               const npy_int64[:, ::1] _indices_k1d,
+               const npy_int64[::1] _cstrides):
+*/
+static PyObject*
+py_coloc_nd(PyObject *self, PyObject *args)
+{
+    // Validate the inputs, allocate the CSR arrays and let fitpack::_coloc_nd fill them.
+    // Returns a new tuple (csr_data, csr_indices, csr_indptr), or NULL with an error set.
+    PyObject *py_xi, *py_t, *py_len_t, *py_k, *py_indices_k1d, *py_strides;
 
+    if(!PyArg_ParseTuple(args, "OOOOOO", &py_xi, &py_t, &py_len_t, &py_k,
+                         &py_indices_k1d, &py_strides)) {
+        return NULL;
+    }
+    if (!(check_array(py_xi, 2, NPY_DOUBLE) &&
+          check_array(py_t, 2, NPY_DOUBLE) &&
+          check_array(py_len_t, 1, NPY_INT64) &&
+          check_array(py_k, 1, NPY_INT64) &&
+          check_array(py_indices_k1d, 2, NPY_INT64) &&
+          check_array(py_strides, 1, NPY_INT64))) {
+        return NULL;
+    }
+    PyArrayObject *a_xi = (PyArrayObject *)py_xi;
+    PyArrayObject *a_t = (PyArrayObject *)py_t;
+    PyArrayObject *a_len_t = (PyArrayObject *)py_len_t;
+    PyArrayObject *a_k = (PyArrayObject *)py_k;
+    PyArrayObject *a_indices_k1d = (PyArrayObject *)py_indices_k1d;
+    PyArrayObject *a_strides = (PyArrayObject *)py_strides;
+    npy_intp npts = PyArray_DIM(a_xi, 0);
+    npy_intp ndim = PyArray_DIM(a_xi, 1);
+    // `k` is read for d in [0, ndim) below: reject inconsistent shapes up front
+    if ((PyArray_DIM(a_t, 0) != ndim) || (PyArray_DIM(a_k, 0) != ndim)) {
+        PyErr_SetString(PyExc_ValueError, "Shapes of xi, t and k are inconsistent.");
+        return NULL;
+    }
+
+    // the number of non-zero b-splines at each data point
+    npy_intp volume = 1;
+    int64_t *k_data = static_cast<int64_t *>(PyArray_DATA(a_k));
+    for (int d=0; d < ndim; d++) {
+        volume *= k_data[d] + 1;
+    }
+
+    // Allocate the collocation matrix in the CSR format.
+    npy_intp dims[1] = {npts*volume};
+    PyObject *py_csr_data = PyArray_SimpleNew(1, dims, NPY_DOUBLE);
+    PyObject *py_csr_indices = PyArray_SimpleNew(1, dims, NPY_INT64);
+    PyObject *py_csr_indptr = PyArray_Arange(0, volume*npts + 1, volume, NPY_INT64);
+    // Every failure path below must drop whichever outputs were allocated.
+    auto fail = [&]() -> PyObject* {
+        Py_XDECREF(py_csr_data); Py_XDECREF(py_csr_indices); Py_XDECREF(py_csr_indptr);
+        return NULL;
+    };
+    if ((py_csr_data == NULL) || (py_csr_indices == NULL) || (py_csr_indptr == NULL)) {
+        PyErr_NoMemory();
+        return fail();
+    }
+    // heavy lifting happens here (NOTE(review): status is -j, so j == 0 reads as success)
+    try {
+        int status = fitpack::_coloc_nd(
+            static_cast<const double *>(PyArray_DATA(a_xi)), npts, ndim,
+            static_cast<const double *>(PyArray_DATA(a_t)), PyArray_DIM(a_t, 1),
+            static_cast<const int64_t *>(PyArray_DATA(a_len_t)),
+            static_cast<const int64_t *>(PyArray_DATA(a_k)),
+            /* tabulated helpers */
+            static_cast<const int64_t *>(PyArray_DATA(a_indices_k1d)), PyArray_DIM(a_indices_k1d, 0),
+            static_cast<const int64_t *>(PyArray_DATA(a_strides)),
+            /* outputs */
+            static_cast<int64_t *>(PyArray_DATA((PyArrayObject *)py_csr_indices)), volume,
+            static_cast<double *>(PyArray_DATA((PyArrayObject *)py_csr_data))
+        );
+        if (status < 0) {
+            std::string mesg = ("Data point " + std::to_string(-status) + " is out of bounds");
+            PyErr_SetString(PyExc_ValueError, mesg.c_str());
+            return fail();   // never return a result with an exception set
+        }
+    }
+    catch (std::exception& e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return fail();
+    }
+    return Py_BuildValue("(NNN)", PyArray_Return((PyArrayObject *)py_csr_data),
+                                  PyArray_Return((PyArrayObject *)py_csr_indices), py_csr_indptr);
 }
 
 
 /////////////////////////////////////
 
 static PyMethodDef DierckxMethods[] = {
-    //...
+    /* FITPACK replacement helpers*/
     {"fpknot", py_fpknot, METH_VARARGS, 
      "fpknot replacement"},
     {"fpback", py_fpback, METH_VARARGS,
@@ -722,16 +1028,23 @@ static PyMethodDef DierckxMethods[] = {
      "row-by-row QR triangularization"},
     {"data_matrix", py_data_matrix, METH_VARARGS,
      "(m, k+1) array of non-zero b-splines"},
-    {"_coloc", py_coloc, METH_VARARGS,
-      doc_coloc},
-    {"_norm_eq_lsq", py_norm_eq_lsq, METH_VARARGS,
-     doc_norm_eq_lsq},
+    /* BSpline helpers */
     {"evaluate_spline", py_evaluate_spline, METH_VARARGS,
      doc_evaluate_spline},
     {"evaluate_all_bspl", py_evaluate_all_bspl, METH_VARARGS,
      doc_evaluate_all_bspl},
     {"find_interval", py_find_interval, METH_VARARGS,
      doc_find_interval},
+    /* make_{interp,lsq}_spline helpers*/
+    {"_coloc", py_coloc, METH_VARARGS,
+      doc_coloc},
+    {"_norm_eq_lsq", py_norm_eq_lsq, METH_VARARGS,
+     doc_norm_eq_lsq},
+    /* NdBSpline helpers */
+    {"evaluate_ndbspline", py_evaluate_ndbspline, METH_VARARGS,
+     doc_evaluate_ndbspline},
+    {"_coloc_nd", py_coloc_nd, METH_VARARGS,
+     doc_coloc_nd},
     //...
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
diff --git a/scipy/interpolate/src/_fitpackmodule.c b/scipy/interpolate/src/_fitpackmodule.c
index e569fe93034f..59ebf4600e85 100644
--- a/scipy/interpolate/src/_fitpackmodule.c
+++ b/scipy/interpolate/src/_fitpackmodule.c
@@ -5,7 +5,7 @@
 
 static PyObject *fitpack_error;
 
-#ifdef HAVE_ILP64
+#ifdef HAVE_BLAS_ILP64
 
 #define F_INT npy_int64
 #define F_INT_NPY NPY_INT64
diff --git a/scipy/interpolate/src/dfitpack.pyf b/scipy/interpolate/src/dfitpack.pyf
index 829e1cdbbe7c..35b5d0846a73 100644
--- a/scipy/interpolate/src/dfitpack.pyf
+++ b/scipy/interpolate/src/dfitpack.pyf
@@ -13,7 +13,7 @@ python module _dfitpack ! in
 
   usercode '''
 
-#ifdef HAVE_ILP64
+#ifdef HAVE_BLAS_ILP64
 typedef npy_int64 F_INT;
 #else
 typedef int F_INT;
diff --git a/scipy/interpolate/tests/test_bsplines.py b/scipy/interpolate/tests/test_bsplines.py
index 58e7fe3fb6f1..b6898752be28 100644
--- a/scipy/interpolate/tests/test_bsplines.py
+++ b/scipy/interpolate/tests/test_bsplines.py
@@ -2377,6 +2377,11 @@ def test_2D_separable(self):
         xp_assert_close(bspl2(xi),
                         target, atol=1e-14)
 
+        # test that a nan in -> nan out
+        xi = np.asarray(xi)
+        xi[0, 1] = np.nan
+        xp_assert_equal(np.isnan(bspl2(xi)), np.asarray([True, False, False]))
+
         # now check on a multidim xi
         rng = np.random.default_rng(12345)
         xi = rng.uniform(size=(4, 3, 2)) * 5
@@ -2828,6 +2833,15 @@ def test_2D_mixed(self, k):
         bspl = make_ndbspl((x, y), values, k=k, solver=ssl.spsolve)
         xp_assert_close(bspl(xi), values.ravel(), atol=1e-15)
 
+    def test_2D_nans(self):
+        x = np.arange(6)
+        y = np.arange(6) + 0.5
+        y[-1] = np.nan  # a nan in the grid (and hence in `values`) must be rejected
+        values = x[:, None]**3 * (y**3 + 2*y)[None, :]
+
+        with assert_raises(ValueError):
+            make_ndbspl((x, y), values, k=1)
+
     def _get_sample_2d_data(self):
         # from test_rgi.py::TestIntepN
         x = np.array([.5, 2., 3., 4., 5.5, 6.])
diff --git a/scipy/interpolate/tests/test_fitpack.py b/scipy/interpolate/tests/test_fitpack.py
index d798f0eda4eb..2c30112ea3a1 100644
--- a/scipy/interpolate/tests/test_fitpack.py
+++ b/scipy/interpolate/tests/test_fitpack.py
@@ -1,5 +1,6 @@
 import itertools
 import os
+import sys
 
 import numpy as np
 from scipy._lib._array_api import (
@@ -448,6 +449,7 @@ def test_splprep_segfault():
     tck, u = splprep([x, y], task=-1, t=uknots)  # here is the crash
 
 
+@pytest.mark.skipif(sys.platform == 'darwin', reason='XXX: crashes on ILP64 CI, why')
 def test_bisplev_integer_overflow():
     np.random.seed(1)
 
diff --git a/scipy/linalg/fblas_64.pyf.src b/scipy/linalg/fblas_64.pyf.src
index 40aa47151dae..fb0f16dc5696 100644
--- a/scipy/linalg/fblas_64.pyf.src
+++ b/scipy/linalg/fblas_64.pyf.src
@@ -1,9 +1,6 @@
 python module _fblas_64
     usercode '''
-#if defined(BLAS_SYMBOL_PREFIX) || defined(BLAS_SYMBOL_SUFFIX)
-#include "blas64-prefix-defines.h"
-#endif
-#define F_INT npy_int64
+#include "_blas64_defines.h"
 '''
 
     interface
diff --git a/scipy/linalg/fblas_l1.pyf.src b/scipy/linalg/fblas_l1.pyf.src
index ccaebcc6b5ca..3fe5c5684b9d 100644
--- a/scipy/linalg/fblas_l1.pyf.src
+++ b/scipy/linalg/fblas_l1.pyf.src
@@ -136,9 +136,9 @@ subroutine <tchar>rot(n,x,offx,incx,y,offy,incy,c,s)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in), depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-1-offx)/abs(incx)+1
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-1-offx)/labs(incx)+1
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <tchar>rot
 
@@ -167,9 +167,9 @@ subroutine <prefix2>rotm(n,x,offx,incx,y,offy,incy,param)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <prefix2>rotm
 
@@ -188,9 +188,9 @@ subroutine <prefix>swap(n,x,offx,incx,y,offy,incy)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <prefix>swap
 
@@ -206,8 +206,8 @@ subroutine <prefix>scal(n,a,x,offx,incx)
   integer optional, intent(in), check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in), depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
-  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end subroutine <prefix>scal
 
@@ -225,8 +225,8 @@ subroutine <tchar2c>scal(n,a,x,offx,incx)
   integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in),depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
-  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end subroutine <tchar2c>scal
 
@@ -246,9 +246,9 @@ subroutine <prefix>copy(n,x,offx,incx,y,offy,incy)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <prefix>copy
 
@@ -269,9 +269,9 @@ subroutine <prefix>axpy(n,a,x,offx,incx,y,offy,incy)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <prefix>axpy
 
@@ -294,9 +294,9 @@ function sdot(n,x,offx,incx,y,offy,incy) result (xy)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end function sdot
 
@@ -319,9 +319,9 @@ function ddot(n,x,offx,incx,y,offy,incy) result (xy)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
   integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
-       n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end function ddot
 
@@ -345,9 +345,9 @@ subroutine <prefix2c>dotu(n,x,offx,incx,y,offy,incy,xy)
   check(offy>=0 && offy<len(y)) :: offy
 
   integer optional,intent(in),depend(x,incx,offx,y,incy,offy) &
-       :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+       :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <prefix2c>dotu
 
@@ -370,9 +370,9 @@ subroutine <prefix2c>dotc(n,x,offx,incx,y,offy,incy,xy)
   check(offx>=0 && offx<len(x)) :: offx
   check(offy>=0 && offy<len(y)) :: offy
 
-  integer optional,intent(in),depend(x,incx,offx,y,incy,offy) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
-  check(len(y)-offy>(n-1)*abs(incy)) :: n
+  integer optional,intent(in),depend(x,incx,offx,y,incy,offy) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
+  check(len(y)-offy>(n-1)*labs(incy)) :: n
 
 end subroutine <prefix2c>dotc
 
@@ -393,8 +393,8 @@ function <prefix3>nrm2(n,x,offx,incx) result(n2)
   integer optional,intent(in),depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
 
-  integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end function <prefix3>nrm2
 
@@ -415,8 +415,8 @@ function <prefix4>nrm2(n,x,offx,incx) result(n2)
   integer optional,intent(in),depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
 
-  integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end function <prefix4>nrm2
 
@@ -434,8 +434,8 @@ function <prefix3>asum(n,x,offx,incx) result (s)
   integer optional, intent(in), check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in), depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
-  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end function <prefix3>asum
 
@@ -453,8 +453,8 @@ function <prefix4>asum(n,x,offx,incx) result (s)
   integer optional, intent(in), check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in), depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
-  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end function <prefix4>asum
 
@@ -473,8 +473,8 @@ function i<prefix>amax(n,x,offx,incx) result(k)
   integer optional, intent(in), check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in), depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
-  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
-  check(len(x)-offx>(n-1)*abs(incx)) :: n
+  integer optional, intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/labs(incx)
+  check(len(x)-offx>(n-1)*labs(incx)) :: n
 
 end function i<prefix>amax
 
diff --git a/scipy/linalg/fblas_l2.pyf.src b/scipy/linalg/fblas_l2.pyf.src
index 27b9972c0df7..f74a98200b87 100644
--- a/scipy/linalg/fblas_l2.pyf.src
+++ b/scipy/linalg/fblas_l2.pyf.src
@@ -34,7 +34,7 @@ subroutine <prefix>gemv(m,n,alpha,a,x,beta,y,offx,incx,offy,incy,trans,rows,cols
   <ftype> dimension(*), intent(in) :: x
   <ftype> dimension(ly), intent(in,copy,out), depend(ly),optional :: y
   integer intent(hide), depend(incy,rows,offy) :: ly = &
-       (y_capi==Py_None?1+offy+(rows-1)*abs(incy):-1)
+       (y_capi==Py_None?1+offy+(rows-1)*labs(incy):-1)
   <ftype> dimension(m,n), intent(in) :: a
   integer depend(a), intent(hide):: m = shape(a,0)
   integer depend(a), intent(hide):: n = shape(a,1)
@@ -42,11 +42,11 @@ subroutine <prefix>gemv(m,n,alpha,a,x,beta,y,offx,incx,offy,incy,trans,rows,cols
   integer optional, intent(in) :: offx=0
   integer optional, intent(in) :: offy=0
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(cols-1)*abs(incx)) :: x
+  check(len(x)>offx+(cols-1)*labs(incx)) :: x
   depend(offx,cols,incx) :: x
 
   check(offy>=0 && offy<len(y)) :: y
-  check(len(y)>offy+(rows-1)*abs(incy)) :: y
+  check(len(y)>offy+(rows-1)*labs(incy)) :: y
   depend(offy,rows,incy) :: y
 
   integer depend(m,n,trans), intent(hide) :: rows = (trans?n:m)
@@ -76,7 +76,7 @@ subroutine <prefix>gbmv(m,n,kl,ku,alpha,a,lda,x,incx,offx,beta,y,incy,offy,trans
   integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
   integer intent(hide),depend(m,n,incy,offy,trans) :: ly = &
-      (y_capi==Py_None?1+offy+(trans==0?m-1:n-1)*abs(incy):-1)
+      (y_capi==Py_None?1+offy+(trans==0?m-1:n-1)*labs(incy):-1)
   integer optional, intent(in) :: offx=0
   integer optional, intent(in) :: offy=0
 
@@ -87,12 +87,12 @@ subroutine <prefix>gbmv(m,n,kl,ku,alpha,a,lda,x,incx,offx,beta,y,incy,offy,trans
 
   <ftype> dimension(ly), intent(in,out,copy,out=yout),depend(ly),optional :: y
   check(offy>=0 && offy<len(y)) :: y
-  check(len(y)>offy+(trans==0?m-1:n-1)*abs(incy)) :: y
+  check(len(y)>offy+(trans==0?m-1:n-1)*labs(incy)) :: y
   depend(offy,n,incy) :: y
 
   <ftype> dimension(*), intent(in) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(trans==0?n-1:m-1)*abs(incx)) :: x
+  check(len(x)>offx+(trans==0?n-1:m-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix>gbmv
@@ -115,7 +115,7 @@ subroutine <prefix><s,s,h,h>bmv(n,k,alpha,a,lda,x,incx,offx,beta,y,incy,offy,low
   integer intent(in),depend(lda),check(k>=0&&k<=lda-1) :: k
   integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
-  integer intent(hide),depend(incy,n,offy) :: ly = (y_capi==Py_None?1+offy+(n-1)*abs(incy):-1)
+  integer intent(hide),depend(incy,n,offy) :: ly = (y_capi==Py_None?1+offy+(n-1)*labs(incy):-1)
   integer optional, intent(in) :: offx=0
   integer optional, intent(in) :: offy=0
 
@@ -126,12 +126,12 @@ subroutine <prefix><s,s,h,h>bmv(n,k,alpha,a,lda,x,incx,offx,beta,y,incy,offy,low
 
   <ftype> dimension(ly), intent(in,out,copy,out=yout),depend(ly),optional :: y
   check(offy>=0 && offy<len(y)) :: y
-  check(len(y)>offy+(n-1)*abs(incy)) :: y
+  check(len(y)>offy+(n-1)*labs(incy)) :: y
   depend(offy,n,incy) :: y
 
   <ftype> dimension(*), intent(in) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix><s,s,h,h>bmv
@@ -149,7 +149,7 @@ subroutine <prefix6><s,s,s,s,h,h>pmv(n,alpha,ap,x,incx,offx,beta,y,incy,offy,low
   integer intent(in),check(n>=0) :: n
   integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
   integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
-  integer intent(hide),depend(incy,n,offy) :: ly = (y_capi==Py_None?1+offy+(n-1)*abs(incy):-1)
+  integer intent(hide),depend(incy,n,offy) :: ly = (y_capi==Py_None?1+offy+(n-1)*labs(incy):-1)
   integer optional, intent(in) :: offx=0
   integer optional, intent(in) :: offy=0
 
@@ -160,12 +160,12 @@ subroutine <prefix6><s,s,s,s,h,h>pmv(n,alpha,ap,x,incx,offx,beta,y,incy,offy,low
 
   <ftype6> dimension(ly), intent(in,out,copy,out=yout),depend(ly),optional :: y
   check(offy>=0 && offy<len(y)) :: y
-  check(len(y)>offy+(n-1)*abs(incy)) :: y
+  check(len(y)>offy+(n-1)*labs(incy)) :: y
   depend(offy,n,incy) :: y
 
   <ftype6> dimension(*), intent(in) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix6><s,s,s,s,h,h>pmv
@@ -190,18 +190,18 @@ subroutine <prefix><symv,\0,hemv,\2>(n,alpha,a,x,beta,y,offx,incx,offy,incy,lowe
   <ftype> dimension(*), intent(in) :: x
   <ftype> dimension(ly), intent(in,copy,out),depend(ly),optional :: y
   integer intent(hide),depend(incy,n,offy) :: ly = &
-       (y_capi==Py_None?1+offy+(n-1)*abs(incy):-1)
+       (y_capi==Py_None?1+offy+(n-1)*labs(incy):-1)
   <ftype> dimension(n,n), intent(in),check(shape(a,0)==shape(a,1)) :: a
   integer depend(a), intent(hide):: n = shape(a,0)
 
   integer optional, intent(in) :: offx=0
   integer optional, intent(in) :: offy=0
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
   check(offy>=0 && offy<len(y)) :: y
-  check(len(y)>offy+(n-1)*abs(incy)) :: y
+  check(len(y)>offy+(n-1)*labs(incy)) :: y
   depend(offy,n,incy) :: y
 
 end subroutine  <prefix><symv,\0,hemv,\2>
@@ -246,9 +246,9 @@ subroutine <prefix6><sy,\0,\0,\0, he,\4>r(alpha,x,lower,incx,offx,n,a)
     integer, intent(in), optional :: offx = 0
     integer, intent(in), optional, check(incx>0||incx<0) :: incx = 1
 
-    integer, intent(in), optional :: n = (len(x)-1-offx)/abs(incx)+1
+    integer, intent(in), optional :: n = (len(x)-1-offx)/labs(incx)+1
     check(n >= 0) :: n
-    check(n <= (len(x)-1-offx)/abs(incx)+1) :: n
+    check(n <= (len(x)-1-offx)/labs(incx)+1) :: n
     depend(x, offx, incx) :: n
 
     <ftype6> dimension(n,n), intent(in,copy,out), optional :: a
@@ -275,11 +275,11 @@ subroutine <prefix><sy, \0, he, \2>r2(alpha,x,y,lower,incx,offx,incy,offy,n,a)
     integer intent(in), optional, check(incy>0||incy<0) :: incy = 1
     integer intent(in), optional :: offy = 0
 
-    integer intent(in), optional :: n = ((len(x)-1-offx)/abs(incx)+1 <= (len(y)-1-offy)/abs(incy)+1 ? (len(x)-1-offx)/abs(incx)+1 : (len(y)-1-offy)/abs(incy)+1)
+    integer intent(in), optional :: n = ((len(x)-1-offx)/labs(incx)+1 <= (len(y)-1-offy)/labs(incy)+1 ? (len(x)-1-offx)/labs(incx)+1 : (len(y)-1-offy)/labs(incy)+1)
     depend(x,incx,offx,y,incy,offy) :: n
     check(n>=0) :: n
-    check(n <= (len(x)-1-offx)/abs(incx)+1) :: n
-    check(n <= (len(y)-1-offy)/abs(incy)+1) :: n
+    check(n <= (len(x)-1-offx)/labs(incx)+1) :: n
+    check(n <= (len(y)-1-offy)/labs(incy)+1) :: n
 
     <ftype> dimension(n,n), intent(in,copy,out), optional :: a
     depend(incx, offx, x, incy, offy, y, n) :: a
@@ -305,7 +305,7 @@ subroutine <prefix6><s,s,s,s,h,h>pr(n,alpha,x,incx,offx,ap,lower)
 
     <ftype6> dimension(*), intent(in) :: x
     check(offx>=0 && offx<len(x)) :: x
-    check(len(x)>offx+(n-1)*abs(incx)) :: x
+    check(len(x)>offx+(n-1)*labs(incx)) :: x
     depend(offx,n,incx) :: x
 
     <ftype6> dimension(*),depend(n),intent(in,out,copy,out=apu) :: ap
@@ -334,12 +334,12 @@ subroutine <prefix><s,s,h,h>pr2(n,alpha,x,incx,offx,y,incy,offy,ap,lower)
 
     <ftype> dimension(*), intent(in) :: x
     check(offx>=0 && offx<len(x)) :: x
-    check(len(x)>offx+(n-1)*abs(incx)) :: x
+    check(len(x)>offx+(n-1)*labs(incx)) :: x
     depend(offx,n,incx) :: x
 
     <ftype> dimension(*), intent(in) :: y
     check(offy>=0 && offy<len(y)) :: y
-    check(len(y)>offy+(n-1)*abs(incy)) :: y
+    check(len(y)>offy+(n-1)*labs(incy)) :: y
     depend(offy,n,incy) :: y
 
     <ftype> dimension(*),depend(n),intent(in,out,copy,out=apu) :: ap
@@ -374,7 +374,7 @@ subroutine <prefix>tbsv(n,k,a,lda,x,incx,offx,lower,trans,diag)
 
   <ftype> dimension(*), intent(in,out,copy,out=xout) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix>tbsv
@@ -405,7 +405,7 @@ subroutine <prefix>tpsv(n,ap,x,incx,offx,lower,trans,diag)
 
   <ftype> dimension(*), intent(in,out,copy,out=xout) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix>tpsv
@@ -433,7 +433,7 @@ subroutine <prefix>trmv(n,a,x,offx,incx,lower,trans,diag)
   integer optional, intent(in), depend(x) :: offx=0
   check(offx>=0 && offx<len(x)) :: offx
 
-  check(len(x)>offx+(n-1)*abs(incx)) :: n
+  check(len(x)>offx+(n-1)*labs(incx)) :: n
   depend(x,offx,incx) :: n
 
 end subroutine <prefix>trmv
@@ -465,7 +465,7 @@ subroutine <prefix>trsv(n,a,lda,x,incx,offx,lower,trans,diag)
 
   <ftype> dimension(*), intent(in,out,copy,out=xout) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix>trsv
@@ -495,7 +495,7 @@ subroutine <prefix>tbmv(n,k,a,lda,x,incx,offx,lower,trans,diag)
 
   <ftype> dimension(*), intent(in,out,copy,out=xout) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix>tbmv
@@ -523,7 +523,7 @@ subroutine <prefix>tpmv(n,ap,x,incx,offx,lower,trans,diag)
 
   <ftype> dimension(*), intent(in,out,copy,out=xout) :: x
   check(offx>=0 && offx<len(x)) :: x
-  check(len(x)>offx+(n-1)*abs(incx)) :: x
+  check(len(x)>offx+(n-1)*labs(incx)) :: x
   depend(offx,n,incx) :: x
 
 end subroutine <prefix>tpmv
diff --git a/scipy/linalg/flapack_64.pyf.src b/scipy/linalg/flapack_64.pyf.src
index 28e276c7030f..d997981bae44 100644
--- a/scipy/linalg/flapack_64.pyf.src
+++ b/scipy/linalg/flapack_64.pyf.src
@@ -16,10 +16,7 @@
 
 python module _flapack_64
     usercode '''
-#if defined(BLAS_SYMBOL_PREFIX) || defined(BLAS_SYMBOL_SUFFIX)
-#include "blas64-prefix-defines.h"
-#endif
-#define F_INT npy_int64
+#include "_blas64_defines.h"
 '''
 
 interface
diff --git a/scipy/linalg/flapack_other.pyf.src b/scipy/linalg/flapack_other.pyf.src
index 1db51bbd8363..2e772887c2c8 100644
--- a/scipy/linalg/flapack_other.pyf.src
+++ b/scipy/linalg/flapack_other.pyf.src
@@ -2107,7 +2107,7 @@ subroutine <prefix>laswp(n,a,nrows,k1,k2,piv,off,inc,m,npiv)
 
     integer optional, intent(in),check(inc>0||inc<0) :: inc = 1
     integer optional,intent(in),depend(npiv),check(off>=0 && off<len(piv)) :: off=0
-    integer intent(hide),depend(npiv,inc,off),check(npiv-off>(m-1)*abs(inc)) :: m = (len(piv)-off)/abs(inc)
+    integer intent(hide),depend(npiv,inc,off),check(npiv-off>(m-1)*labs(inc)) :: m = (len(piv)-off)/labs(inc)
 
 end subroutine <prefix>laswp
 
@@ -2227,7 +2227,7 @@ subroutine <prefix>larf(side,m,n,v,incv,tau,c,ldc,work,lwork)
     character intent(in), check(side[0]=='L'||side[0]=='R') :: side = 'L'
     integer intent(in,hide), depend(c) :: m = shape(c,0)
     integer intent(in,hide), depend(c) :: n = shape(c,1)
-    <ftype> intent(in),dimension((side[0]=='L'?(1 + (m-1)*abs(incv)):(1 + (n-1)*abs(incv)))),depend(n,m,side,incv) :: v
+    <ftype> intent(in),dimension((side[0]=='L'?(1 + (m-1)*labs(incv)):(1 + (n-1)*labs(incv)))),depend(n,m,side,incv) :: v
     integer intent(in), check(incv>0||incv<0) :: incv = 1
     <ftype> intent(in) :: tau
     <ftype> dimension(m,n), intent(in,copy,out) :: c
@@ -2258,9 +2258,9 @@ subroutine <prefix2c>rot(n,x,offx,incx,y,offy,incy,c,s,lx,ly)
     integer optional, intent(in), check(incy>0||incy<0) :: incy = 1
     integer optional, intent(in), depend(lx), check(offx>=0 && offx<lx) :: offx=0
     integer optional, intent(in), depend(ly), check(offy>=0 && offy<ly) :: offy=0
-    integer optional, intent(in), depend(lx,incx,offx,ly,incy,offy) :: n = (lx-1-offx)/abs(incx)+1
-    check(lx-offx>(n-1)*abs(incx)) :: n
-    check(ly-offy>(n-1)*abs(incy)) :: n
+    integer optional, intent(in), depend(lx,incx,offx,ly,incy,offy) :: n = (lx-1-offx)/labs(incx)+1
+    check(lx-offx>(n-1)*labs(incx)) :: n
+    check(ly-offy>(n-1)*labs(incy)) :: n
 end subroutine <prefix2c>rot
 
 subroutine ilaver(major, minor, patch)
diff --git a/scipy/linalg/meson.build b/scipy/linalg/meson.build
index 95925a791cbc..4df5f8d2d468 100644
--- a/scipy/linalg/meson.build
+++ b/scipy/linalg/meson.build
@@ -50,7 +50,7 @@ linalg_cython_gen = generator(cython,
 
 # fblas
 fblas_module = custom_target('fblas_module',
-  output: ['_fblasmodule.c'],
+  output: ['_fblasmodule.c', '_fblas-f2pywrappers.f'],
   input: 'fblas.pyf.src',
   command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'] + f2py_freethreading_arg,
   depend_files:
@@ -66,8 +66,10 @@ fblas_module = custom_target('fblas_module',
 # LAPACK - we have historically put these in `_fblas`.
 py3.extension_module('_fblas',
   fblas_module,
+  fortran_args: _fflag_lp64,
   link_args: version_link_args,
   dependencies: [lapack_dep, blas_dep, fortranobject_dep],
+  link_with: [g77_abi_wrappers],
   install: true,
   subdir: 'scipy/linalg'
 )
@@ -95,12 +97,74 @@ flapack_module = custom_target('flapack_module',
 py3.extension_module('_flapack',
   flapack_module,
   c_args: [Wno_empty_body],
+  fortran_args: _fflag_lp64,
   link_args: version_link_args,
   dependencies: [lapack_dep, blas_dep, fortranobject_dep],
   install: true,
   subdir: 'scipy/linalg'
 )
 
+# Add _fblas_64 and _flapack_64 if we're building with ILP64 support
+#
+# NOTE: what happened in the setup.py build was that we were linking LP64
+# libopenblas.so to `_fblas` and ILP64 `libopenblas64_.so` to `_fblas_64`
+# and used both at the same time. We never shipped wheels that way, it only
+# worked in a CI job. We are re-exporting the LP64 symbols in
+# `cython_blas`/`cython_lapack`, so we can't use only ILP64 even if we support
+# it in all SciPy code.
+# TODO: right now we're only detecting one BLAS library (like NumPy does), but
+#       we need two blas and two lapack dependency objects here.
+#       The ILP64 CI job in the 1.10.x branch downloads two OpenBLAS tarballs
+#       and then uses both in the build (search for `Download-OpenBLAS('1')`
+#       in azure-pipelines.yml if you want to check that).
+if use_ilp64
+  fblas64_module = custom_target('fblas64_module',
+    output: ['_fblas_64module.c', '_fblas_64-f2pywrappers.f'],
+    input: 'fblas_64.pyf.src',
+    command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'] + f2py_ilp64_opts + f2py_freethreading_arg,
+    depend_files:
+      [
+        'fblas_l1.pyf.src',
+        'fblas_l2.pyf.src',
+        'fblas_l3.pyf.src',
+      ]
+  )
+
+  py3.extension_module('_fblas_64',
+    fblas64_module,
+    #['_fblas_64module.c'],
+    fortran_args: _fflag_ilp64,
+    link_args: version_link_args,
+    include_directories: ['../_build_utils/src'],   # for npy_cblas.h
+    dependencies: [lapack_ilp64, blas_ilp64, fortranobject_dep],
+    link_with: [g77_abi_wrappers_ilp64],
+    install: true,
+    link_language: 'fortran',
+    subdir: 'scipy/linalg'
+  )
+
+  flapack64_module = custom_target('flapack64_module',
+    output: ['_flapack_64module.c', '_flapack_64-f2pywrappers.f'],
+    input: 'flapack_64.pyf.src',
+    command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'] + f2py_ilp64_opts + f2py_freethreading_arg,
+  )
+
+  py3.extension_module('_flapack_64',
+    flapack64_module,
+    #['_flapack_64module.c'],
+    c_args: [Wno_empty_body],
+    fortran_args: _fflag_ilp64,
+    link_args: version_link_args,
+    include_directories: ['../_build_utils/src'],   # for npy_cblas.h
+    dependencies: [lapack_ilp64, blas_ilp64, fortranobject_dep],
+    link_with: [g77_abi_wrappers_ilp64],
+    install: true,
+    link_language: 'fortran',
+    subdir: 'scipy/linalg'
+  )
+endif
+
+
 # TODO: cblas/clapack are built *only* for ATLAS. Why? Is it still needed?
 
 # _decomp_interpolative
diff --git a/scipy/linalg/tests/test_batch.py b/scipy/linalg/tests/test_batch.py
index 7a32e7a5cd66..7871a31ffa1f 100644
--- a/scipy/linalg/tests/test_batch.py
+++ b/scipy/linalg/tests/test_batch.py
@@ -441,8 +441,8 @@ def test_solve(self, bdim, dtype):
         if len(bdim) == 1:
             x = x[..., np.newaxis]
             b = b[..., np.newaxis]
-        assert_allclose(A @ x - b, 0, atol=1.5e-6)
-        assert_allclose(x, np.linalg.solve(A, b), atol=2e-6)
+        assert_allclose(A @ x - b, 0, atol=2e-6)
+        assert_allclose(x, np.linalg.solve(A, b), atol=3e-6)
 
     @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)])
     @pytest.mark.parametrize('dtype', floating)
@@ -455,8 +455,8 @@ def test_lu_solve(self, bdim, dtype):
         if len(bdim) == 1:
             x = x[..., np.newaxis]
             b = b[..., np.newaxis]
-        assert_allclose(A @ x - b, 0, atol=1.5e-6)
-        assert_allclose(x, np.linalg.solve(A, b), atol=2e-6)
+        assert_allclose(A @ x - b, 0, atol=2e-6)
+        assert_allclose(x, np.linalg.solve(A, b), atol=3e-6)
 
     @pytest.mark.parametrize('l_and_u', [(1, 1), ([2, 1, 0], [0, 1 , 2])])
     @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)])
diff --git a/scipy/meson.build b/scipy/meson.build
index b3803a4f4592..218ef4cdacd9 100644
--- a/scipy/meson.build
+++ b/scipy/meson.build
@@ -218,22 +218,30 @@ endif
 # 2. targets with #include's (due to no `depend_files` - see feature request
 #    at meson#8295)
 f2py_gen = generator(generate_f2pymod,
-  arguments : ['@INPUT@', '-o', '@BUILD_DIR@'] + f2py_freethreading_arg,
+  arguments : ['@INPUT@', '-o', '@BUILD_DIR@', '@EXTRA_ARGS@'] + f2py_freethreading_arg,
   output : ['_@BASENAME@module.c', '_@BASENAME@-f2pywrappers.f'],
 )
 
 
-# TODO: 64-bit BLAS and LAPACK
-#
-# Note that this works as long as BLAS and LAPACK are detected properly via
-# pkg-config. By default we look for OpenBLAS, other libraries can be configured via
-# `meson configure -Dblas=blas -Dlapack=lapack` (example to build with Netlib
-# BLAS and LAPACK).
-# For MKL and for auto-detecting one of multiple libs, we'll need a custom
-# dependency in Meson (like is done for scalapack) - see
-# https://github.com/mesonbuild/meson/issues/2835
+# Start of BLAS/LAPACK detection
+
 blas_name = get_option('blas')
 lapack_name = get_option('lapack')
+blas_symbol_suffix = get_option('blas-symbol-suffix')
+use_ilp64 = get_option('use-ilp64')
+
+# MKL-specific options
+_threading_opt = get_option('mkl-threading')
+if _threading_opt == 'auto'
+  # Switch default to iomp once conda-forge missing openmp.pc issue is fixed
+  mkl_opts = ['threading: seq']
+else
+  mkl_opts = ['threading: ' + _threading_opt]
+endif
+blas_opts = {'mkl': mkl_opts}
+mkl_version_req = '>=2023.0'  # see gh-24824
+mkl_may_use_sdl = not use_ilp64 and _threading_opt in ['auto', 'iomp']
+
 
 macOS13_3_or_later = false
 if host_machine.system() == 'darwin'
@@ -268,6 +276,7 @@ if blas_name == 'openblas' or blas_name == 'auto'
   endif
 endif
 
+# Try any other OpenBLAS
 # pkg-config uses a lower-case name while CMake uses a capitalized name, so try
 # that too to make the fallback detection with CMake work
 if blas_name == 'openblas'
@@ -275,10 +284,7 @@ if blas_name == 'openblas'
 elif blas_name != 'scipy-openblas'  # if so, we found it already
   blas = dependency(blas_name)
 endif
-blas_dep = declare_dependency(
-  dependencies: blas,
-  compile_args: _args_blas_lapack
-)
+
 if blas_name == 'blas'
   # Netlib BLAS has a separate `libcblas.so` which we use directly in the g77
   # ABI wrappers, so detect it and error out if we cannot find it.
@@ -290,6 +296,30 @@ else
   cblas = []
 endif
 
+if blas_name == 'mkl'
+  blas = dependency('mkl',
+    modules: ['interface: lp64'] + mkl_opts,
+    required: false,  # may be required, but we need to emit a custom error message
+    version: mkl_version_req,
+  )
+  # Insert a second try with MKL, because we may be rejecting older versions
+  # or missing it because no pkg-config installed. If so, we need to retry
+  # with MKL SDL, and drop the version constraint (this always worked).
+  if not blas.found() and mkl_may_use_sdl
+    blas = dependency('mkl', modules: ['sdl: true'], required: false)
+  endif
+endif
+
+# fallback BLAS detection
+blas_dep = declare_dependency(
+  dependencies: blas,
+  compile_args: _args_blas_lapack
+)
+
+if not blas.found()
+  error('No BLAS library detected! SciPy needs one, please install it.')
+endif
+
 if 'mkl' in blas.name() or blas.name().to_lower() == 'accelerate' or blas_name == 'scipy-openblas'
   # For these libraries we know that they contain LAPACK, and it's desirable to
   # use that - no need to run the full detection twice.
@@ -299,6 +329,11 @@ elif lapack_name == 'openblas'
 else
   lapack = dependency(lapack_name)
 endif
+
+if not lapack.found()
+  error('No LAPACK library detected! SciPy needs one, please install it.')
+endif
+
 lapack_dep = declare_dependency(
   dependencies: lapack,
   compile_args: _args_blas_lapack
@@ -310,12 +345,11 @@ dependency_map = {
   'PYBIND11': pybind11_dep,
 }
 
-# FIXME: conda-forge sets MKL_INTERFACE_LAYER=LP64,GNU, see gh-11812.
-#        This needs work on gh-16200 to make MKL robust. We should be
-#        requesting `mkl-dynamic-lp64-seq` here. And then there's work needed
-#        in general to enable the ILP64 interface (also for OpenBLAS).
-uses_mkl = blas.name().to_lower().startswith('mkl') or lapack.name().to_lower().startswith('mkl')
-uses_accelerate = blas.name().to_lower().startswith('accelerate') or lapack.name().to_lower().startswith('accelerate')
+# NB: from this point on blas_name is e.g. 'mkl-lp64-dynamic-seq'
+blas_name = blas.name()
+lapack_name = lapack.name()
+uses_mkl = blas_name.to_lower().startswith('mkl')
+uses_accelerate = blas_name.to_lower().startswith('accelerate')
 use_g77_abi = uses_mkl or uses_accelerate or get_option('use-g77-abi')
 if use_g77_abi
   g77_abi_wrappers = static_library(
@@ -333,6 +367,107 @@ else
   )
 endif
 
+# Reuse the names, so we ensure we don't lose the arguments wrapped in with
+# declare_dependency. Also, avoids changing `dependencies: blas` to blas_dep in other files.
+# XXX: unused, remove?
+#blas = blas_dep
+#lapack = lapack_dep
+
+# Run ILP64 BLAS detection, if asked
+if use_ilp64
+  # Okay, we need ILP64 BLAS and LAPACK *in addition to LP64*. So we need to
+  # detect the ILP64 variants of the found LP64 libraries now.
+  _args_blas_ilp64 = ['-DHAVE_BLAS_ILP64']
+  c_flags_ilp64 = ['-DHAVE_BLAS_ILP64']
+  blas_interface = ['interface: ilp64']
+
+  if 'openblas' in blas_name
+    _args_blas_ilp64 += ['-DOPENBLAS_ILP64_NAMING_SCHEME']
+  endif
+
+  # Run the detection
+  if uses_mkl
+    mkl_uses_sdl = false   # FIXME, why
+    if mkl_uses_sdl
+      mkl_opts = ['sdl: true']
+    endif
+    blas_ilp64 = dependency('mkl', modules: ['interface: ilp64'] + mkl_opts)
+    lapack_ilp64 = blas_ilp64  # MKL provides LAPACK too
+
+    _args_blas_ilp64 += [
+      '-DBLAS_SYMBOL_SUFFIX=_64',
+      '-DFIX_MKL_2025_ILP64_MISSING_SYMBOL'
+    ]
+    c_flags_ilp64 += ['-DBLAS_SYMBOL_SUFFIX=_64']
+
+  elif blas_name == 'scipy-openblas'
+    # scipy_openblas64, a separate library
+    blas_ilp64 = dependency('scipy-openblas64')
+    lapack_ilp64 = blas_ilp64
+
+    if not blas_ilp64.found()
+      error('scipy-openblas64 not found.')
+    endif
+
+  elif blas_name == 'accelerate'
+    blas_ilp64 = dependency(blas_name, modules: blas_interface)
+    lapack_ilp64 = blas_ilp64  # was left undefined; Accelerate contains LAPACK too
+    _args_blas_ilp64 += ['-DACCELERATE_NEW_LAPACK']
+  else
+    # XXX: ILP64 detection has only been tested for MKL and scipy-openblas64
+    if blas_name == 'openblas'
+      # We cannot allow plain openblas here, that's already the LP64 library and
+      # will lead to problems (there is, as of now, no combined OpenBLAS build
+      # with 32 and 64 bit symbols)
+      blas_name = ['openblas64', 'openblas_ilp64']
+    endif
+    blas_ilp64 = dependency(blas_name, modules: blas_interface)
+    lapack_ilp64 = dependency(lapack_name, modules: ['lapack'] + blas_interface)
+  endif
+
+  # Pick up the symbol suffix, it may be auto-detected by Meson and different from LP64
+  if blas_symbol_suffix == 'auto'
+    if blas_name == 'scipy-openblas'
+      blas_symbol_suffix = '64_'
+    else
+      blas_symbol_suffix = blas_ilp64.get_variable('symbol_suffix', default_value: '')
+    endif
+    message(f'BLAS symbol suffix (ILP64): @blas_symbol_suffix@')
+  endif
+  _blas_incdir = []
+  if blas_symbol_suffix != ''
+    _args_blas_ilp64 += ['-DBLAS_SYMBOL_SUFFIX=' + blas_symbol_suffix]
+    _blas_incdir = ['.']
+  endif
+  # Declare the ILP64 dependencies
+  message('BLAS / LAPACK ILP64 detected: ', blas_ilp64.name(), ', ', lapack_ilp64.name())
+
+  blas_ilp64 = declare_dependency(
+    dependencies: [blas_ilp64],
+    compile_args: _args_blas_ilp64,
+    include_directories: _blas_incdir,
+  )
+  lapack_ilp64 = declare_dependency(dependencies: [lapack_ilp64, blas_ilp64])
+
+  g77_abi_wrappers_ilp64 = static_library(
+    'g77_abi_wrappers_ilp64',
+     ['_build_utils/src/wrap_g77_abi.c'],
+     dependencies: [py3_dep, blas_ilp64, np_dep],
+     c_args: _args_blas_ilp64,
+     gnu_symbol_visibility: 'hidden',
+   )
+else
+  # we're not using ILP64; user code will link to the always-available LP64 blas/lapack
+  # (all users must use preprocessor macros BLAS_NAME to handle the two options)
+  blas_ilp64 = blas
+  lapack_ilp64 = lapack
+  c_flags_ilp64 = []
+  message('LAPACK ILP64 not requested / not detected.')
+endif
+
+# End of BLAS/LAPACK handling
+
+
 scipy_dir = py3.get_install_dir() / 'scipy'
 
 # Generate version.py for sdist
@@ -380,18 +515,41 @@ _cython_tree = [fs.copyfile('__init__.py')]
 cython_args = ['-3', '--fast-fail', '--output-file', '@OUTPUT@', '--include-dir', '@BUILD_ROOT@', '@INPUT@']
 if cy.version().version_compare('>=3.1.0')
   cython_args += ['-Xfreethreading_compatible=True']
+
+  cython_shared_src = custom_target(
+    install: false,
+    output: '_cyutility.c',
+    command: [
+      cython, '-3', '--fast-fail', '-Xfreethreading_compatible=True',
+      '--generate-shared=' + meson.current_build_dir()/'_cyutility.c'
+    ],
+  )
+
+  cython_shared_module = py3.extension_module('_cyutility',
+    cython_shared_src,
+    subdir: 'scipy',
+    cython_args: cython_args,
+    install: true,
+    install_tag: 'python-runtime',
+  )
+
+  cython_args += ['--shared=scipy._cyutility']
+else
+  cython_shared_module = []
 endif
 cython_cplus_args = ['--cplus'] + cython_args
 
 cython_gen = generator(cython,
   arguments : cython_args,
   output : '@BASENAME@.c',
-  depends : _cython_tree)
+  depends : [_cython_tree, cython_shared_module]
+)
 
 cython_gen_cpp = generator(cython,
   arguments : cython_cplus_args,
   output : '@BASENAME@.cpp',
-  depends : [_cython_tree])
+  depends : [_cython_tree, cython_shared_module]
+)
 
 if use_pythran
   # TODO: add argument to mark extension modules as safe to run without the GIL,
@@ -479,13 +637,51 @@ fortran_ignore_warnings = ff.get_supported_arguments(
 
 # Intel Fortran (ifort) does not run the preprocessor by default, if Fortran
 # code uses preprocessor statements, add this compile flag to it.
-_fflag_fpp = []
-if ff.get_id() in ['intel-cl', 'intel-llvm-cl']
-  if is_windows
-    _fflag_fpp = ff.get_supported_arguments('/fpp')
+
+# Gfortran does run the preprocessor for .F files, and PROPACK is the only
+# component which needs the preprocessor (unless we need symbol renaming for
+# blas_symbol_suffix).
+_fflag_preprocess = []
+_gfortran_preprocess = ['-cpp', '-ffree-line-length-none', '-ffixed-line-length-none']
+if ff.has_multi_arguments(_gfortran_preprocess)
+  _fflag_preprocess = _gfortran_preprocess
+else
+  _fflag_preprocess = ff.first_supported_argument(['-fpp', '/fpp', '-cpp'])  # was bare 'cpp', which is not a flag
+endif
+
+_fflag_lp64 = []
+_fflag_ilp64 = []
+f2py_ilp64_opts = []
+if use_ilp64
+  # Gfortran and Clang use `-fdefault-integer-8` to switch to 64-bit integers by
+  # default, all other known compilers use `-i8`
+  _fflag_ilp64 = ff.first_supported_argument(['-fdefault-integer-8', '-i8'])
+
+  # Write out a mapping file for f2py for defaulting to ILP64
+  conf_data = configuration_data()
+  if cc.sizeof('long') == 8
+    conf_data.set('int64_name', 'long')
+  elif cc.sizeof('long long') == 8
+    conf_data.set('int64_name', 'long long')
   else
-    _fflag_fpp = ff.get_supported_arguments('-fpp')
+    error('Neither `long` nor `long long` is 64-bit, giving up.')
   endif
+  int64_f2cmap = configure_file(
+    input: '_build_utils/int64.f2cmap.in',
+    output: 'int64.f2cmap',
+    configuration: conf_data,
+    install: false,
+  )
+  f2py_ilp64_opts = ['--f2cmap', int64_f2cmap]
+endif
+
+if blas_symbol_suffix != ''
+  # We need to patch source files that use BLAS/LAPACK symbols.
+  # In addition, we now need to enable the Fortran preprocessor on all targets
+  # that depend on BLAS/LAPACK
+  # Note: this came from `scipy/build_utils/_fortran.py` in the distutils build.
+  _fflag_lp64 += _fflag_preprocess  # TODO: propagate _fflag_lp64
+  _fflag_ilp64 += _fflag_preprocess
 endif
 
 # Deal with M_PI & friends; add `use_math_defines` to c_args or cpp_args
@@ -625,6 +821,17 @@ lapack_dep = declare_dependency(
   link_with: [g77_abi_wrappers, blas_lapack_wrapper_lib]
 )
 
+if use_ilp64
+  blas_ilp64 = declare_dependency(
+    dependencies: blas_ilp64,
+    link_with: [g77_abi_wrappers_ilp64]
+  )
+  lapack_ilp64 = declare_dependency(
+    dependencies: lapack_ilp64,
+    link_with: [g77_abi_wrappers_ilp64]
+  )
+endif
+
 subdir('_lib')
 subdir('special')
 subdir('linalg')
diff --git a/scipy/ndimage/_filters.py b/scipy/ndimage/_filters.py
index 0ee98674ed6d..e26b9329570b 100644
--- a/scipy/ndimage/_filters.py
+++ b/scipy/ndimage/_filters.py
@@ -36,6 +36,7 @@
 import math
 
 from scipy._lib._util import normalize_axis_index
+from scipy._lib._array_api import array_namespace, is_cupy, xp_size
 from . import _ni_support
 from . import _nd_image
 from . import _ni_docstrings
@@ -53,8 +54,10 @@
 
 def _vectorized_filter_iv(input, function, size, footprint, output, mode, cval, origin,
                           axes, batch_memory):
+    xp = array_namespace(input, footprint, output)
+
     # vectorized_filter input validation and standardization
-    input = np.asarray(input)
+    input = xp.asarray(input)
 
     if not callable(function):
         raise ValueError("`function` must be a callable.")
@@ -71,12 +74,12 @@ def _vectorized_filter_iv(input, function, size, footprint, output, mode, cval,
     if size is not None:
         # If provided, size must be an integer or tuple of integers.
         size = (size,)*input.ndim if np.isscalar(size) else tuple(size)
-        valid = [np.issubdtype(np.asarray(i).dtype, np.integer) and i > 0 for i in size]
+        valid = [xp.isdtype(xp.asarray(i).dtype, 'integral') and i > 0 for i in size]
         if not all(valid):
             raise ValueError("All elements of `size` must be positive integers.")
     else:
         # If provided, `footprint` must be array-like
-        footprint = np.asarray(footprint, dtype=bool)
+        footprint = xp.asarray(footprint, dtype=xp.bool)
         size = footprint.shape
         def footprinted_function(input, *args, axis=-1, **kwargs):
             return function(input[..., footprint], *args, axis=-1, **kwargs)
@@ -108,7 +111,7 @@ def footprinted_function(input, *args, axis=-1, **kwargs):
         origin = (0,) * n_axes
     else:
         origin = (origin,)*n_axes if np.isscalar(origin) else tuple(origin)
-        integral = [np.issubdtype(np.asarray(i).dtype, np.integer) for i in origin]
+        integral = [xp.isdtype(xp.asarray(i).dtype, 'integral') for i in origin]
         if not all(integral):
             raise ValueError("All elements of `origin` must be integers.")
         if not len(origin) == n_axes:
@@ -117,7 +120,7 @@ def footprinted_function(input, *args, axis=-1, **kwargs):
             raise ValueError(message)
 
     # mode must be one of the allowed strings, and we should convert it to the
-    # value required by `np.pad` here.
+    # value required by `np.pad`/`cp.pad` here.
     valid_modes = {'reflect', 'constant', 'nearest', 'mirror', 'wrap',
                    'grid-mirror', 'grid-constant', 'grid-wrap', 'valid'}
     if mode not in valid_modes:
@@ -136,20 +139,20 @@ def footprinted_function(input, *args, axis=-1, **kwargs):
         raise ValueError("Use of `cval` is compatible only with `mode='constant'`.")
 
     # `cval` must be a scalar or "broadcastable" to a tuple with the same
-    # dimensionality of `input`. (Full input validation done by `np.pad`.)
-    if not np.issubdtype(np.asarray(cval).dtype, np.number):
+    # dimensionality of `input`. (Full input validation done by `np.pad`/`cp.pad`.)
+    if not xp.isdtype(xp.asarray(cval).dtype, 'numeric'):
         raise ValueError("`cval` must include only numbers.")
 
     # `batch_memory` must be a positive number.
-    temp = np.asarray(batch_memory)
-    if temp.ndim != 0 or (not np.issubdtype(temp.dtype, np.number)) or temp <= 0:
+    temp = xp.asarray(batch_memory)
+    if temp.ndim != 0 or (not xp.isdtype(temp.dtype, 'numeric')) or temp <= 0:
         raise ValueError("`batch_memory` must be positive number.")
 
     # For simplicity, work with `axes` at the end.
     working_axes = tuple(range(-n_axes, 0))
     if axes is not None:
-        input = np.moveaxis(input, axes, working_axes)
-        output = (np.moveaxis(output, axes, working_axes)
+        input = xp.moveaxis(input, axes, working_axes)
+        output = (xp.moveaxis(output, axes, working_axes)
                   if output is not None else output)
 
     # Wrap the function to limit maximum memory usage, deal with `footprint`,
@@ -159,7 +162,7 @@ def wrapped_function(view, output=output):
         kwargs = {'axis': working_axes}
 
         if working_axes == ():
-            return footprinted_function(view, **kwargs)
+            return footprinted_function(xp.asarray(view), **kwargs)
 
         # for now, assume we only have to iterate over zeroth axis
         chunk_size = math.prod(view.shape[1:]) * view.dtype.itemsize
@@ -169,9 +172,9 @@ def wrapped_function(view, output=output):
 
         elif slices_per_batch == view.shape[0]:
             if output is None:
-                return footprinted_function(view, **kwargs)
+                return footprinted_function(xp.asarray(view), **kwargs)
             else:
-                output[...] = footprinted_function(view, **kwargs)
+                output[...] = footprinted_function(xp.asarray(view), **kwargs)
                 return output
 
         for i in range(0, view.shape[0], slices_per_batch):
@@ -179,15 +182,16 @@ def wrapped_function(view, output=output):
             if output is None:
                 # Look at the dtype before allocating the array. (In a follow-up, we
                 # can also look at the shape to support non-scalar elements.)
-                temp = footprinted_function(view[i:i2], **kwargs)
-                output = np.empty(view.shape[:-n_axes], dtype=temp.dtype)
-                output[i:i2] = temp
+                temp = footprinted_function(xp.asarray(view[i:i2]), **kwargs)
+                output = xp.empty(view.shape[:-n_axes], dtype=temp.dtype)
+                output[i:i2, ...] = temp
             else:
-                output[i:i2] = footprinted_function(view[i:i2], **kwargs)
+                output[i:i2, ...] = footprinted_function(xp.asarray(view[i:i2]),
+                                                         **kwargs)
         return output
 
     return (input, wrapped_function, size, mode, cval,
-            origin, working_axes, n_axes, n_batch)
+            origin, working_axes, n_axes, n_batch, xp)
 
 
 @_ni_docstrings.docfiller
@@ -405,39 +409,53 @@ def vectorized_filter(input, function, *, size=None, footprint=None, output=None
 
     """  # noqa: E501
 
-    (input, function, size, mode, cval, origin, working_axes, n_axes, n_batch
+    (input, function, size, mode, cval, origin, working_axes, n_axes, n_batch, xp
      ) = _vectorized_filter_iv(input, function, size, footprint, output, mode, cval,
         origin, axes, batch_memory)
 
-    # `np.pad` raises with these sorts of cases, but the best result is probably
-    # to return the original array. It could be argued that we should call the
-    # function on the empty array with `axis=None` just to determine the output
+    # `np.pad`/`cp.pad` raises with these sorts of cases, but the best result is
+    # probably to return the original array. It could be argued that we should call
+    # the function on the empty array with `axis=None` just to determine the output
     # dtype, but I can also see rationale against that.
-    if input.size == 0:
-        return input
+    if xp_size(input) == 0:
+        return xp.asarray(input)
 
     # This seems to be defined.
     if input.ndim == 0 and size == ():
-        return np.asarray(function(input) if footprint is None
+        return xp.asarray(function(input) if footprint is None
                           else function(input[footprint]))
 
-    # Border the image according to `mode` and `offset`. `np.pad` does the work,
-    # but it uses different names; adjust `mode` accordingly.
-    # Move this to input validation.
+    if is_cupy(xp):
+        # CuPy is the only GPU backend that has `pad` (with all modes)
+        # and `sliding_window_view`. An enhancement would be to use
+        # no-copy conversion to CuPy whenever the data is on the GPU.
+        cp = xp  # let there be no ambiguity!
+        swv = cp.lib.stride_tricks.sliding_window_view
+        pad = cp.pad
+    else:
+        # Try to perform no-copy conversion to NumPy for padding and
+        # `sliding_window_view`. (If that fails, fine - for now, the only
+        # GPU backend we support is CuPy.)
+        swv = np.lib.stride_tricks.sliding_window_view
+        pad = np.pad
+        input = np.asarray(input)
+        cval = np.asarray(cval)[()] if mode == 'constant' else None
+
+    # Border the image according to `mode` and `offset`.
     if mode != 'valid':
         kwargs = {'constant_values': cval} if mode == 'constant' else {}
         borders = tuple((i//2 + j, (i-1)//2 - j) for i, j in zip(size, origin))
-        bordered_input = np.pad(input, ((0, 0),)*n_batch + borders, mode=mode, **kwargs)
+        bordered_input = pad(input, ((0, 0),)*n_batch + borders, mode=mode, **kwargs)
     else:
         bordered_input = input
 
     # Evaluate function with sliding window view. Function is already wrapped to
     # manage memory, deal with `footprint`, populate `output`, etc.
-    view = np.lib.stride_tricks.sliding_window_view(bordered_input, size, working_axes)
+    view = swv(bordered_input, size, working_axes)
     res = function(view)
 
     # move working_axes back to original positions
-    return np.moveaxis(res, working_axes, axes) if axes is not None else res
+    return xp.moveaxis(res, working_axes, axes) if axes is not None else res
 
 
 def _invalid_origin(origin, lenw):
@@ -1861,6 +1879,8 @@ def maximum_filter(input, size=None, footprint=None, output=None,
     A sequence of modes (one per axis) is only supported when the footprint is
     separable. Otherwise, a single mode string must be provided.
 
+    %(nan)s
+
     Examples
     --------
     >>> from scipy import ndimage, datasets
diff --git a/scipy/ndimage/_interpolation.py b/scipy/ndimage/_interpolation.py
index 8e5ce6bc818d..9527cdce7245 100644
--- a/scipy/ndimage/_interpolation.py
+++ b/scipy/ndimage/_interpolation.py
@@ -33,6 +33,7 @@
 
 import numpy as np
 from scipy._lib._util import normalize_axis_index
+from scipy._lib import array_api_extra as xpx
 
 from scipy import special
 from . import _ni_support
@@ -841,6 +842,12 @@ def zoom(input, zoom, output=None, order=3, mode='constant', cval=0.0,
     complex_output = np.iscomplexobj(input)
     output = _ni_support._get_output(output, input, shape=output_shape,
                                      complex_output=complex_output)
+    if all(z == 1 for z in zoom) and prefilter:  # early exit for gh-20999
+        # zoom 1 means "return original image". If `prefilter=False`,
+        # `input` is *not* the original image; processing is still needed
+        # to undo the filter. So we only early exit if `prefilter`.
+        output = xpx.at(output)[...].set(input)
+        return output
     if complex_output:
         # import under different name to avoid confusion with zoom parameter
         from scipy.ndimage._interpolation import zoom as _zoom
diff --git a/scipy/ndimage/_support_alternative_backends.py b/scipy/ndimage/_support_alternative_backends.py
index 88cb6e994932..ad7a2a27a54b 100644
--- a/scipy/ndimage/_support_alternative_backends.py
+++ b/scipy/ndimage/_support_alternative_backends.py
@@ -23,6 +23,11 @@ def _maybe_convert_arg(arg, xp):
         return arg
 
 
+# Some cupyx.scipy.ndimage functions don't exist or are incompatible with
+# their SciPy counterparts
+CUPY_BLOCKLIST = ['vectorized_filter']
+
+
 def delegate_xp(delegator, module_name):
     def inner(func):
         @functools.wraps(func)
@@ -30,7 +35,7 @@ def wrapper(*args, **kwds):
             xp = delegator(*args, **kwds)
 
             # try delegating to a cupyx/jax namesake
-            if is_cupy(xp):
+            if is_cupy(xp) and func.__name__ not in CUPY_BLOCKLIST:
                 # https://github.com/cupy/cupy/issues/8336
                 import importlib
                 cupyx_module = importlib.import_module(f"cupyx.scipy.{module_name}")
diff --git a/scipy/ndimage/tests/test_filters.py b/scipy/ndimage/tests/test_filters.py
index 8cf88b5177ac..f2542aebce9a 100644
--- a/scipy/ndimage/tests/test_filters.py
+++ b/scipy/ndimage/tests/test_filters.py
@@ -15,7 +15,8 @@
     xp_assert_close,
     xp_assert_equal,
 )
-from scipy._lib._array_api import is_cupy, is_torch, array_namespace
+from scipy._lib._array_api import (is_cupy, is_torch, is_dask, is_jax, array_namespace,
+                                   is_array_api_strict, xp_copy)
 from scipy.ndimage._filters import _gaussian_kernel1d
 
 from . import types, float_types, complex_types
@@ -2766,6 +2767,8 @@ def test_gh_22333():
     assert_array_equal(actual, expected)
 
 
+@pytest.mark.filterwarnings("ignore:The given NumPy array is not writable:UserWarning")
+@pytest.mark.skip_xp_backends(cpu_only=True, exceptions=['cupy'])
 class TestVectorizedFilter:
     @pytest.mark.parametrize("axes, size",
                              [(None, (3, 4, 5)), ((0, 2), (3, 4)), ((-1,), (5,))])
@@ -2773,45 +2776,65 @@ class TestVectorizedFilter:
     @pytest.mark.parametrize("mode",
                              ['reflect', 'nearest', 'mirror', 'wrap', 'constant'])
     @pytest.mark.parametrize("use_output", [False, True])
-    def test_against_generic_filter(self, axes, size, origin, mode, use_output):
+    def test_against_generic_filter(self, axes, size, origin, mode, use_output, xp):
         rng = np.random.default_rng(435982456983456987356)
 
+        if use_output and (is_dask(xp) or is_jax(xp)):
+            pytest.skip("Requires mutable arrays.")
+
         input = rng.random(size=(11, 12, 13))
         input_copy = input.copy()  # check that it is not modified
-        output = np.zeros_like(input) if use_output else None
-
-        kwargs = dict(axes=axes, size=size, origin=origin, mode=mode, output=output)
-        ref = ndimage.generic_filter(input, np.mean, **kwargs)
-        res = ndimage.vectorized_filter(input, np.mean, **kwargs)
-        xp_assert_close(res, ref, atol=1e-15)
-        if use_output:
-            xp_assert_equal(output, res)
+        output = xp.zeros(input.shape) if use_output else None
 
-        kwargs.pop('size')
-        kwargs['footprint'] = rng.random(size=size or input.shape) > 0.5
+        kwargs = dict(axes=axes, size=size, origin=origin, mode=mode)
         ref = ndimage.generic_filter(input, np.mean, **kwargs)
-        res = ndimage.vectorized_filter(input, np.mean, **kwargs)
-        xp_assert_close(res, ref, atol=1e-15)
+        kwargs['output'] = output
+        res = ndimage.vectorized_filter(xp.asarray(input.tolist()),
+                                        xp.mean, **kwargs)
+        xp_assert_close(res, xp.asarray(ref.tolist()), atol=1e-15)
         if use_output:
             xp_assert_equal(output, res)
 
-        xp_assert_equal(input, input_copy)
+        if not (is_array_api_strict(xp) or is_dask(xp)):
+            # currently requires support for [..., mask] indexing
+            kwargs.pop('size')
+            kwargs.pop('output')
+            kwargs['footprint'] = rng.random(size=size or input.shape) > 0.5
+            ref = ndimage.generic_filter(input, np.mean, **kwargs)
+            kwargs['footprint'] = xp.asarray(kwargs['footprint'])
+            kwargs['output'] = output
+            res = ndimage.vectorized_filter(xp.asarray(input.tolist()),
+                                            xp.mean, **kwargs)
+            xp_assert_close(res, xp.asarray(ref.tolist()), atol=1e-15)
+            if use_output:
+                xp_assert_equal(output, res)
+
+        xp_assert_equal(xp.asarray(input), xp.asarray(input_copy))
 
     @pytest.mark.parametrize("dtype",
-                             [np.uint8, np.uint16, np.uint32, np.uint64,
-                              np.int8, np.int16, np.int32, np.int64,
-                              np.float32, np.float64, np.complex64, np.complex128])
+                             ["uint8", "uint16", "uint32", "uint64",
+                              "int8", "int16", "int32", "int64",
+                              "float32", "float64", "complex64", "complex128"])
     @pytest.mark.parametrize("batch_memory", [1, 16*3, np.inf])
     @pytest.mark.parametrize("use_footprint", [False, True])
-    def test_dtype_batch_memory(self, dtype, batch_memory, use_footprint):
+    def test_dtype_batch_memory(self, dtype, batch_memory, use_footprint, xp):
         rng = np.random.default_rng(435982456983456987356)
         w = 3
 
+        if is_jax(xp) and not (batch_memory == 1):
+            pytest.skip("Requires mutable array.")
+        if is_torch(xp) and dtype in {'uint16', 'uint32', 'uint64'}:
+            pytest.skip("Needs uint support.")
+
+        dtype = getattr(xp, dtype)
+
         if use_footprint:
-            footprint = np.asarray([True, False, True])
+            if (is_dask(xp) or is_array_api_strict(xp)):
+                pytest.skip("Requires [..., mask] indexing.")
+            footprint = xp.asarray([True, False, True])
             kwargs = dict(footprint=footprint, batch_memory=batch_memory)
         else:
-            footprint = np.asarray([True, True, True])
+            footprint = xp.asarray([True, True, True])
             kwargs = dict(size=w, batch_memory=batch_memory)
 
         # The intent here is to exercise all the code paths involved in `batch_memory`
@@ -2821,44 +2844,48 @@ def test_dtype_batch_memory(self, dtype, batch_memory, use_footprint):
         # *won't* fit.
         n = 16*3 + 1
         input = rng.integers(0, 42, size=(n,))
-        input = input + input*1j if np.issubdtype(dtype, np.complexfloating) else input
-        input = input.astype(dtype)
+        input = input + input*1j if xp.isdtype(dtype, 'complex floating') else input
+        input_padded = xp.asarray(np.pad(input, [(1, 1)], mode='symmetric'),
+                                  dtype=dtype)
+        input = xp.asarray(input, dtype=dtype)
 
-        input2 = np.pad(input, [(1, 1)], mode='symmetric')
-        ref = [np.sum(input2[i: i + w][footprint]) for i in range(n)]
-        sum_dtype = np.sum(input2).dtype
+        ref = [xp.sum(input_padded[i: i + w][footprint]) for i in range(n)]
+        sum_dtype = xp.sum(input_padded).dtype
 
         message = "`batch_memory` is insufficient for minimum chunk size."
         context = (pytest.raises(ValueError, match=message)
                    if batch_memory == 1 else contextlib.nullcontext())
         with context:
-            res = ndimage.vectorized_filter(input, np.sum, **kwargs)
-            xp_assert_close(res, np.asarray(ref, dtype=sum_dtype))
+            res = ndimage.vectorized_filter(input, xp.sum, **kwargs)
+            xp_assert_close(res, xp.asarray(ref, dtype=sum_dtype))
             assert res.dtype == sum_dtype
 
-            output = np.empty_like(input)
-            res = ndimage.vectorized_filter(input, np.sum, output=output, **kwargs)
-            xp_assert_close(res, np.asarray(ref, dtype=dtype))
+            output = xp.empty_like(input)
+            res = ndimage.vectorized_filter(input, xp.sum, output=output, **kwargs)
+            xp_assert_close(res, xp.asarray(ref, dtype=dtype))
             assert res.dtype == dtype
 
-    def test_mode_valid(self):
+    def test_mode_valid(self, xp):
         rng = np.random.default_rng(435982456983456987356)
         input = rng.random(size=(10, 11))
-        input_copy = input.copy()  # check that it is not modified
+        input_xp = xp.asarray(input)
+        input_xp_copy = xp_copy(input_xp)  # check that it is not modified
         size = (3, 5)
-        function = np.mean
-        res = ndimage.vectorized_filter(input, function, size=size, mode='valid')
+
+        res = ndimage.vectorized_filter(input_xp, xp.mean, size=size, mode='valid')
+
         view = np.lib.stride_tricks.sliding_window_view(input, size)
-        ref = function(view, axis=(-2, -1))
-        xp_assert_close(res, ref)
-        xp_assert_equal(res.shape, input.shape - np.asarray(size) + 1)
-        xp_assert_equal(input, input_copy)
+        ref = np.mean(view, axis=(-2, -1))
 
-    def test_input_validation(self):
-        input = np.ones((10, 10))
-        function = np.mean
+        xp_assert_close(res, xp.asarray(ref))
+        assert res.shape == tuple(input.shape - np.asarray(size) + 1)
+        xp_assert_equal(input_xp, input_xp_copy)
+
+    def test_input_validation(self, xp):
+        input = xp.ones((10, 10))
+        function = xp.mean
         size = 2
-        footprint = np.ones((2, 2))
+        footprint = xp.ones((2, 2))
 
         message = "`function` must be a callable."
         with pytest.raises(ValueError, match=message):
@@ -2874,7 +2901,7 @@ def test_input_validation(self):
 
         message = "All elements of `size` must be positive integers."
         with pytest.raises(ValueError, match=message):
-            ndimage.vectorized_filter(input, function, size=(1, None))
+            ndimage.vectorized_filter(input, function, size=(1, -1))
         with pytest.raises(ValueError, match=message):
             ndimage.vectorized_filter(input, function, size=0)
 
@@ -2882,7 +2909,7 @@ def test_input_validation(self):
         with pytest.raises(ValueError, match=message):
             ndimage.vectorized_filter(input, function, size=(1, 2, 3))
         with pytest.raises(ValueError, match=message):
-            ndimage.vectorized_filter(input, function, footprint=np.ones((2, 2, 2)))
+            ndimage.vectorized_filter(input, function, footprint=xp.ones((2, 2, 2)))
 
         message = "`axes` must be provided if the dimensionality..."
         with pytest.raises(ValueError, match=message):
@@ -2890,7 +2917,7 @@ def test_input_validation(self):
 
         message = "All elements of `origin` must be integers"
         with pytest.raises(ValueError, match=message):
-            ndimage.vectorized_filter(input, function, size=size, origin=(1, None))
+            ndimage.vectorized_filter(input, function, size=size, origin=(1, 1.5))
 
         message = "`origin` must be an integer or tuple of integers with length..."
         with pytest.raises(ValueError, match=message):
@@ -2909,44 +2936,45 @@ def test_input_validation(self):
         with pytest.raises(ValueError, match=message):
             ndimage.vectorized_filter(input, function, size=size, mode='valid', cval=1)
 
-        message = "`cval` must include only numbers."
-        with pytest.raises(ValueError, match=message):
+        other_messages = "|Unsupported|The array_api_strict|new|Value 'a duck'"
+        message = "`cval` must include only numbers." + other_messages
+        with pytest.raises((ValueError, TypeError), match=message):
             ndimage.vectorized_filter(input, function, size=size,
-                                      mode='constant', cval='a duck')
+                              mode='constant', cval='a duck')
 
-        message = "`batch_memory` must be positive number."
+        message = "`batch_memory` must be positive number." + other_messages
         with pytest.raises(ValueError, match=message):
             ndimage.vectorized_filter(input, function, size=size, batch_memory=0)
         with pytest.raises(ValueError, match=message):
             ndimage.vectorized_filter(input, function, size=size, batch_memory=(1, 2))
-        with pytest.raises(ValueError, match=message):
-            ndimage.vectorized_filter(input, function, size=size,
-                                      batch_memory="shrubbery")
+        with pytest.raises((ValueError, TypeError), match=message):
+            ndimage.vectorized_filter(input, function, size=size, batch_memory="a duck")
 
     @pytest.mark.parametrize('shape', [(0,), (1, 0), (0, 1, 0)])
-    def test_zero_size(self, shape):
-        input = np.empty(shape)
-        res = ndimage.vectorized_filter(input, np.mean, size=1)
+    def test_zero_size(self, shape, xp):
+        input = xp.empty(shape)
+        res = ndimage.vectorized_filter(input, xp.mean, size=1)
         xp_assert_equal(res, input)
 
-    def test_edge_cases(self):
+    @pytest.mark.filterwarnings("ignore:Mean of empty slice.:RuntimeWarning")
+    def test_edge_cases(self, xp):
         rng = np.random.default_rng(4835982345234982)
-        function = np.mean
+        function = xp.mean
 
         # 0-D input
-        input = np.asarray(1)
-        res = ndimage.vectorized_filter(1, function, size=())
-        xp_assert_equal(res, np.asarray(function(input, axis=())))
+        input = xp.asarray(1.)
+        res = ndimage.vectorized_filter(input, function, size=())
+        xp_assert_equal(res, xp.asarray(function(input, axis=())))
 
-        res = ndimage.vectorized_filter(1, function, footprint=True)
-        xp_assert_equal(res, np.asarray(function(input[True], axis=())))
+        if not (is_array_api_strict(xp) or is_dask(xp)):
+            res = ndimage.vectorized_filter(input, function, footprint=True)
+            xp_assert_equal(res, xp.asarray(function(input[True], axis=())))
 
-        with pytest.warns(RuntimeWarning, match="Mean of empty slice."):
-            res = ndimage.vectorized_filter(1, function, footprint=False)
-            xp_assert_equal(res, np.asarray(function(input[False], axis=())))
+            res = ndimage.vectorized_filter(input, function, footprint=False)
+            xp_assert_equal(res, xp.asarray(function(input[False], axis=())))
 
         # 1x1 window
-        input = rng.random((5, 5))
+        input = xp.asarray(rng.random((5, 5)))
         res = ndimage.vectorized_filter(input, function, size=1)
         xp_assert_equal(res, input)
 
diff --git a/scipy/ndimage/tests/test_interpolation.py b/scipy/ndimage/tests/test_interpolation.py
index 08edb9219c8c..03cbcdc4a19c 100644
--- a/scipy/ndimage/tests/test_interpolation.py
+++ b/scipy/ndimage/tests/test_interpolation.py
@@ -1323,6 +1323,23 @@ def test_zoom_0d_array(self, xp):
         expected = ndimage.zoom(a, factor)
         xp_assert_close(actual, expected)
 
+    @xfail_xp_backends("cupy", reason="CuPy `zoom` needs similar fix.")
+    def test_zoom_1_gh20999(self, xp):
+        # gh-20999 reported that zoom with `zoom=1` (or sequence of ones)
+        # introduced noise. Check that this is resolved.
+        x = xp.eye(3)
+        xp_assert_equal(ndimage.zoom(x, 1), x)  # scalar zoom factor
+        xp_assert_equal(ndimage.zoom(x, (1, 1)), x)  # one factor per axis
+
+    @xfail_xp_backends("cupy", reason="CuPy `zoom` needs similar fix.")
+    @skip_xp_backends("jax.numpy", reason="read-only backend")
+    @xfail_xp_backends("dask.array", reason="numpy round-trip")
+    def test_zoom_1_gh20999_output(self, xp):
+        x = xp.eye(3)
+        output = xp.zeros_like(x)  # preallocated array passed as `output=`
+        ndimage.zoom(x, 1, output=output)  # return value deliberately unused
+        xp_assert_equal(output, x)  # `output` itself must now equal the input
+
 
 class TestRotate:
 
diff --git a/scipy/optimize/__nnls.c b/scipy/optimize/__nnls.c
new file mode 100644
index 000000000000..480b0e1aa64d
--- /dev/null
+++ b/scipy/optimize/__nnls.c
@@ -0,0 +1,268 @@
+#include "__nnls.h"
+#include <stdio.h>
+
+/* Algorithm NNLS: NONNEGATIVE LEAST SQUARES
+*
+* Given an m by n matrix A and an m-vector b, compute the n-vector x which
+* solves the least squares problem: minimize ||A*x - b||_2 subject to x >= 0.
+*
+* a is stored column-major (entry (i, j) at a[i + j*m]); a and b are
+* overwritten. zz (length m) and indices (length n) are workspace, w (length n)
+* holds the dual vector and *rnorm receives the residual norm. *info is 1 on
+* success, 2 if m <= 0 or n <= 0, and 3 if maxiter iterations were reached.
+*
+* C translation of the Fortran code by Charles L. Lawson and Richard J. Hanson
+* (Jet Propulsion Laboratory, 1973 JUN 15), from "SOLVING LEAST SQUARES
+* PROBLEMS", Prentice-Hall, 1974; revised FEB 1995 for the SIAM reprint
+* (DOI: 10.1137/1.9781611971217).
+*/
+void
+__nnls(const int m, const int n, double* restrict a, double* restrict b,
+       double* restrict x, double* restrict w, double* restrict zz,
+       int* restrict indices, const int maxiter, double* rnorm, int* info)
+{
+    int i = 0, ii = 0, ip = 0, indz = 0, iteration = 0, iz = 0, izmax = 0;
+    int j = 0, jj = 0, k = 0, one = 1, tmpint = 0;
+    double tau = 0.0, unorm = 0.0, ztest, alpha, cc, ss, wmax, T, tmp_work;
+    double pivot = 1.0, pivot2 = 0.0, tmp = 0.0, spacing = 0.0;
+    *info = 1;  // success unless overwritten below
+    if (m <= 0 || n <= 0)
+    {
+        *info = 2;
+        return;
+    }
+
+    // Initialize the indices and the solution vector x.
+    for (i = 0; i < n; i++) { indices[i] = i; }
+    for (i = 0; i < n; i++) { x[i] = 0.0; }
+
+    // Outer loop
+    while (indz < (m < n ? m : n))
+    {
+        // Compute the dual vector components in set Z.
+        // Essentially a permuted gemv operation via BLAS ddot, in NumPy notation;
+        // w[indices[indz:]] = A[indz:m, indices[indz:]].T @ b[indz:m]
+        for (i = indz; i < n; i++)
+        {
+            j = indices[i];
+            tmpint = m - indz;
+            w[j] = ddot_(&tmpint, &a[indz + j*m], &one, &b[indz], &one);
+        }
+
+        // Find the next linearly independent column that corresponds to the
+        // largest entry in the dual vector w.
+        // ====================================================================
+        while (1)
+        {
+            // Finding the largest w[j] and its index
+            // izmax, wmax = argmax(w[indices[indz:]])
+            wmax = 0.0;
+            for (k = indz; k < n; k++)
+            {
+                j = indices[k];
+                if (w[j] > wmax) { wmax = w[j]; izmax = k; }
+            }
+            // If wmax <= 0.0, terminate since this is a KKT certificate.
+            if (wmax <= 0.0) { goto END; }
+            iz = izmax;
+            j = indices[iz];
+
+            // The sign of wmax is OK for j to be moved to set p. Begin the
+            // transformation and check new diagonal element to avoid near-linear
+            // dependence.
+            pivot = a[indz + j*m];
+            tmpint = m - indz;
+            dlarfgp_(&tmpint, &pivot, &a[indz + 1 + j*m], &one, &tau);
+
+            // Compute the norm of a[0:indz, j] to check for linear dependence.
+            unorm = (indz > 0 ? dnrm2_(&indz, &a[j*m], &one) : 0.0);
+            // unorm is nonnegative
+            spacing = (unorm > 0.0 ? nextafter(unorm, 2*unorm) - unorm : 0.0);
+
+            // Test for independence by checking the pivot for zero.
+            if (fabs(pivot) > 100.0*spacing)
+            {
+                // Column j is sufficiently independent. Copy b into zz and solve
+                // for ztest which is the new prospective value for x[j].
+                for (i = 0; i < m; i++) { zz[i] = b[i]; }
+                tmpint = m - indz;
+                pivot2 = a[indz + j*m];
+                a[indz + j*m] = 1.0;
+                dlarf_("L", &tmpint, &one, &a[indz + j*m], &one, &tau, &zz[indz], &tmpint, &tmp_work);
+                // See if ztest is positive. This is from the original F77 code.
+                // Probably better to use a sign test instead of a division.
+                ztest = zz[indz] / pivot;
+                if (ztest > 0.0)
+                {
+                    break;
+                } else {
+                    a[indz + j*m] = pivot2;
+                }
+            }
+            // Reject j as a candidate to be moved from set z to set p.
+            // a(indz,j) is restored, set w(j)=0., and loop back to test dual
+            // coeffs again.
+            w[j] = 0.0;
+        }
+        // ====================================================================
+
+        // the index j=indices[iz]  has been selected to be moved from set z to
+        // set p. Update b, update indices, apply householder transformations to
+        // cols in new set z,  zero subdiagonal elements in col j,  set w(j)=0.
+        for (i = 0; i < m; i++) { b[i] = zz[i]; }
+        indices[iz] = indices[indz];
+        indices[indz] = j;
+        indz++;
+        // Apply the householder transformation to the remaining columns.
+        if (indz < n)
+        {
+            tmpint = m - indz + 1;
+            for (k = indz; k < n; k++)
+            {
+                jj = indices[k];
+                dlarf_("L", &tmpint, &one, &a[indz - 1 + j*m], &one, &tau, &a[indz - 1 + jj*m], &tmpint, &tmp_work);
+            }
+        }
+        // Restore the pivot element into a, zero the subdiagonal elements in col j
+        a[indz - 1 + j*m] = pivot;
+        if (indz < m) { for (i = indz; i < m; i++) { a[j*m + i] = 0.0; } }
+        // Zero the dual coefficient for the column.
+        w[j] = 0.0;
+
+        // Solve the permuted triangular system, store in zz.
+        for (k = 0; k < indz; k++)
+        {
+            // ip traverses the indices of P set in reverse
+            ip = indz - 1 - k;
+            if (k != 0)
+            {
+                for (i = 0; i <= ip; i++)
+                {
+                    zz[i] = zz[i] - a[i + jj*m] * zz[ip + 1];
+                }
+            }
+            jj = indices[ip];
+            zz[ip] = zz[ip] / a[ip + jj*m];
+        }
+
+        // ****** Inner loop ******
+        while (1)
+        {
+            iteration++;
+            if (iteration >= maxiter) { *info = 3; goto END; }
+
+            // See if all new constrained coefficients are feasible,
+            // if not compute alpha
+            alpha = 2.0;
+            for (ip = 0; ip < indz; ip++)
+            {
+                k = indices[ip];
+                if (zz[ip] <= 0.0)
+                {
+                    T = -x[k] / (zz[ip] - x[k]);
+                    if (alpha > T)
+                    {
+                        alpha = T;
+                        jj = ip;
+                    }
+                }
+            }
+            // If all new constrained coefficients are feasible, alpha is still
+            // 2.0. If so exit from secondary loop to main loop.
+            if (alpha == 2.0) { break; }  // Get back to outer loop
+
+            // Otherwise interpolate between old x and zz.
+            for (ip = 0; ip < indz; ip++)
+            {
+                k = indices[ip];
+                x[k] = x[k] + alpha*(zz[ip] - x[k]);
+            }
+
+            // Modify a, b, and the indices to move coefficient i from set p
+            // to set z. While loop simulates a goto in the original F77 code.
+            i = indices[jj];
+            while (1)
+            {
+                x[i] = 0.0;
+
+                if (jj != indz-1)
+                {
+                    jj++;
+                    for (j = jj; j < indz; j++)
+                    {
+                        ii = indices[j];
+                        indices[j-1] = ii;
+                        dlartgp_(&a[j-1 + ii*m], &a[j + ii*m], &cc, &ss, &a[j-1 + ii*m]);
+                        a[j + ii*m] = 0.0;
+                        // Apply the Givens rotation to all columns except ii.
+                        // Because the columns are not ordered we do it manually.
+                        for (k = 0; k < n; k++)
+                        {
+                            if (k != ii)
+                            {
+                                tmp = a[j-1 + k*m];
+                                a[j-1 + k*m] =  cc*tmp + ss*a[j + k*m];
+                                a[j   + k*m] = -ss*tmp + cc*a[j + k*m];
+                            }
+                        }
+                        tmp = b[j-1];
+                        b[j-1] =  cc*tmp + ss*b[j];
+                        b[j]   = -ss*tmp + cc*b[j];
+                    }
+                }
+                indz--;
+                indices[indz] = i;
+
+                // See if remaining coefficients in set P are feasible
+                // since determination of alpha guarantees it. If still
+                // there are infeasible ones, they are due to numerical
+                // noise. Any that are nonpositive will be set to zero
+                // and moved from set p to set z.
+                // An empty set P (indz == 0) is trivially feasible: start from 1.
+                int nobreak = 1;
+                for (jj = 0; jj < indz; jj++)
+                {
+                    i = indices[jj];
+                    if (x[i] <= 0.0) { nobreak = 0; break; }
+                }
+                // If for loop completes without break, then leave the while loop
+                if (nobreak) { break; }
+            }
+
+            for (i = 0; i < m; i++) { zz[i] = b[i]; }
+            for (k = 0; k < indz; k++)
+            {
+                ip = indz - 1 - k;
+                if (k != 0)
+                {
+                    for (i = 0; i <= ip; i++)
+                    {
+                        zz[i] = zz[i] - a[i + jj*m] * zz[ip + 1];
+                    }
+                }
+                jj = indices[ip];
+                zz[ip] = zz[ip] / a[ip + jj*m];
+            }
+            // ****** end of inner loop ******
+        }
+
+        // Back in the outer loop
+        for (k = 0; k < indz; k++)
+        {
+            i = indices[k];
+            x[i] = zz[k];
+        }
+        // ****** end of outer loop ******
+    }
+END:
+    // Compute the residual vector and its norm.
+    if (indz < m)
+    {
+        tmpint = m - indz;
+        *rnorm = dnrm2_(&tmpint, &b[indz], &one);
+    } else {
+        for (i = 0; i < n; i++) { w[i] = 0.0; }
+        *rnorm = 0.0;
+    }
+    return;
+}
diff --git a/scipy/optimize/__nnls.h b/scipy/optimize/__nnls.h
new file mode 100644
index 000000000000..5644e6859eb0
--- /dev/null
+++ b/scipy/optimize/__nnls.h
@@ -0,0 +1,17 @@
+#ifndef __NNLS_H
+#define __NNLS_H
+#include <math.h>
+
+double ddot_(int* n, double* dx, int* incx, double* dy, int* incy);
+void dlarf_(char* side, int* m, int* n, double* v, int* incv, double* tau, double* c, int* ldc, double* work);
+void dlarfgp_(int* n, double* alpha, double* x, int* incx, double* tau);
+void dlartgp_(double* f, double* g, double* cs, double* sn, double* r);
+double dnrm2_(int* n, double* x, int* incx);
+
+void
+__nnls(const int m, const int n, double* restrict a, double* restrict b,
+       double* restrict x, double* restrict w, double* restrict zz,
+       int* restrict indices, const int maxiter, double* rnorm, int* info);
+
+
+#endif
diff --git a/scipy/optimize/__slsqp.c b/scipy/optimize/__slsqp.c
new file mode 100644
index 000000000000..2a9c0716f372
--- /dev/null
+++ b/scipy/optimize/__slsqp.c
@@ -0,0 +1,1010 @@
+#include "__slsqp.h"
+
+void __nnls(const int m, const int n, double* restrict a, double* restrict b, double* restrict x, double* restrict w, double* restrict zz, int* restrict indices, const int maxiter, double* rnorm, int* info);
+static void ldp(int m, int n, double* g, double* h, double* x, double* buffer, int* indices, double* xnorm, int* mode);
+static void lsi(int me, int mg, int n, double* e, double* f, double* g, double* h, double* x, double* buffer, int* jw, double* xnorm, int* mode);
+static void lsei(int ma, int me, int mg, int n, double* a, double* b, double* e, double* f, double* g, double* h, double* x, double* buffer, int* jw, double* xnorm, int* mode);
+static void lsq(int m, int meq, int n, int augment, double aug_weight, double* Lf, double* gradx, double* C, double* d, double* xl, double* xu, double* x, double* y, double* buffer, int* jw, int* mode);
+static void ldl_update(int n, double* a, double* z, double sigma, double* w);
+
+/*
+ * The main SLSQP function. The function argument naming in the Fortran code is
+ * exceedingly inconsistent and very difficult to follow. Hence we adopted the
+ * following naming convention in SLSQP and the nested function arguments:
+ *
+ *  - funx: The function value at the current point. (1)
+ *  - gradx: The gradient of the function at the current point. (n)
+ *  - C: The equality and inequality constraint normals. (m x n)
+ *  - d: The equality and inequality constraints, (m)
+ *  - xl: The lower bounds on x, (n)
+ *  - xu: The upper bounds on x, (n)
+ *  - sol: The solution vector, (n)
+ *  - mult: The Lagrange multipliers, (m + 2*n + 2)
+ *  - buffer: A buffer to hold various intermediate arrays.
+ *  - indices: An array to hold the indices of the active constraints. (m + 2*n + 2)
+ *
+ *  The buffer size should be greater than:
+ *  n*(n+1)//2 + m + 4*n + 3                                           # SLSQP
+ *  (n+1)*(n+2) + (n+1)*meq + m + (mineq + 2*n + 2)*(n+1) +  3*n + 3   # LSQ
+ *  mineq + 2n + 2 + 2*meq + (n+1) + (mineq + 3n + 3)*(n + 1 - meq)    # LSEI
+ *  (mineq + 2n + 2 + 2)*(n + 2) + mineq + 2n + 2                      # LDP
+ *  mineq + 2n + 2                                                     # NNLS
+ *
+ *
+ *  If applicable, the following are the problem matrix naming convention:
+ *
+ *  - A: The coefficient matrix of cost function |Ax - b|
+ *  - b: The RHS of cost function |Ax - b|
+ *  - E: The (E)quality constraint matrix of Ex = f
+ *  - f: The equality constraint RHS of Ex = f
+ *  - G: The inequality constraint matrix of Gx >= h
+ *  - h: The inequality constraint RHS of Gx >= h
+ *
+ */
+void
+__slsqp_body(
+    struct SLSQP_vars* S, double* funx, double* restrict gradx,
+    double* restrict C, double* restrict d, double* restrict sol,
+    double* restrict mult, double* restrict xl, double* restrict xu, double* buffer,
+    int* indices)
+{
+    // Re-entrant entry point: S->mode selects where execution resumes (see the gotos below).
+    int one = 1, lda = (S->m > 0 ? S->m : 1);
+    int j;
+    double done = 1.0, dmone = -1.0, alfmin = 0.1;
+    int n = S->n;
+    int m = S->m;
+    int n1 = n + 1;
+    int n2 = n1*n/2;  // Number of entries of the packed BFGS factor, n*(n+1)/2.
+
+    // Chop the buffer for various array pointers.
+    double* restrict bfgs       = &buffer[0];
+    double* restrict x0         = &buffer[n2];
+    double* restrict mu         = &buffer[n2 + n];
+    double* restrict s          = &buffer[n2 + n + m];
+    double* restrict u          = &buffer[n2 + n + m + n1];
+    double* restrict v          = &buffer[n2 + n + m + n1 + n1];
+    double* restrict lsq_buffer = &buffer[n2 + n + m + n1 + n1 + n1];
+
+    // badlin flags an inconsistent SQP problem on the current iteration. NOTE(review):
+    // it is a local reset to 0 on every entry, so it is always 0 when resuming at MODE1.
+    int badlin = 0;
+
+    // Fortran code uses reverse communication for the iterations hence it
+    // needs to jump back to where it left off. Thus the goto statements are
+    // kept as is. Fortunately, they do not overlap too much and have a relatively
+    // clean separation.
+    if (S->mode ==  0) { goto MODE0; }
+    if (S->mode == -1) { goto MODEM1; }
+    if (S->mode == 1) { goto MODE1; }
+    // Any other incoming mode value falls through to MODE0.
+MODE0:
+    // We always use inexact line search, since exact search is broken in the
+    // original Fortran code.
+    S->exact = 0; // (S->acc < 0.0 ? 1 : 0);
+    S->acc = fabs(S->acc);
+    S->tol = 10*S->acc;
+    S->iter = 0;
+    S->reset = 0;
+    for (int i = 0; i < n; i++) { s[i] = 0.0; }
+    for (int i = 0; i < m; i++) { mu[i] = 0.0; }
+
+RESET_BFGS:
+    // Reset the BFGS matrix stored in packed format
+    S->reset++;
+    if (S->reset > 5) { goto LABEL255;}
+    for (int i = 0; i < n2; i++) { bfgs[i] = 0.0; }
+    j = 0;
+    for (int i = 0; i < n; i++)
+    {
+        bfgs[j] = 1.0;  // Diagonal entry of column i in the packed layout.
+        j += n - i;
+    }
+    // 120
+
+ITER_START:
+    // Main iteration: Search direction, steplength, LDL'-update
+    // 130
+    S->mode = 9;
+    if (S->iter >= S->itermax) { return; }  // Returns with mode 9: iteration limit reached.
+    S->iter++;
+
+    // Search direction as solution of the QP-problem
+    for (int i = 0; i < n; i++)
+    {
+        u[i] = -sol[i] + xl[i] ;
+        v[i] = -sol[i] + xu[i] ;
+    }
+
+    S->h4 = 1.0;
+    // augment and aug_weight are not used and hence 0.
+    lsq(m, S->meq, n, 0, 0, bfgs, gradx, C, d, u, v, s, mult, lsq_buffer, indices, &S->mode);
+
+    // Augmented problem for inconsistent linearization
+
+    // If it turns out that the original SQP problem is inconsistent,
+    // disallow termination with convergence on this iteration,
+    // even if the augmented problem was solved.
+    badlin = 0;
+
+    // If equality constraints are not full rank and all are equality constrained
+    // then the problem is inconsistent.
+    if ((S->mode == 6) && (n == S->meq)) { S->mode = 4;}
+
+    // If inconsistency detected, we augment the problem and try again.
+    // Fortran code augments the problem matrices by embedding them in larger
+    // buffers then calls lsq. However, these matrices are then copied into
+    // another buffer inside lsq hence we can let lsq insert into the second
+    // buffer without modifying the original matrices. The only change lsq needs
+    // is the weight value of the augmented variable which starts at 100 and
+    // is multiplied by 10 on each retry. Hence we only pass that value
+    // with "aug_weight".
+    if (S->mode == 4)
+    {
+        badlin = 1;
+        // Zero the search direction s before solving the augmented system.
+        for (int i = 0; i < n; i++) { s[i] = 0.0; }
+        S->h3 = 0.0;
+        double rho = 100.0;
+        S->inconsistent = 0;
+        while (1)
+        {
+            lsq(m, S->meq, n, 1, rho, bfgs, gradx, C, d, u, v, s, mult, lsq_buffer, indices, &S->mode);
+            S->h4 = 1.0 - s[n];
+            if (S->mode == 4)
+            {
+                rho *= 10.0;
+                S->inconsistent++;
+                if (S->inconsistent > 5) { return; }  // Gives up with mode still 4.
+                continue;
+            } else if (S->mode != 1) {
+                return;
+            }
+            break;
+        }
+    } else if (S->mode != 1) {
+        return;
+    }
+
+    // Update multipliers for L1-test
+    for (int i = 0; i < n; i++) { v[i] = gradx[i]; }
+    dgemv_("T", &m, &n, &dmone, C, &lda, mult, &one, &done, v, &one);
+
+    S->f0 = *funx;
+    for (int i = 0; i < n; i++) { x0[i] = sol[i]; }
+    S->gs = ddot_(&n, gradx, &one, s, &one);
+    S->h1 = fabs(S->gs);
+    S->h2 = 0.0;
+    for (int j = 0; j < m; j++)
+    {
+        if (j < S->meq)
+        {
+            S->h3 = d[j];
+        } else {
+            S->h3 = 0.0;
+        }
+        S->h2 = S->h2 + fmax(-d[j], S->h3);
+        S->h3 = fabs(mult[j]);
+        mu[j] = fmax(S->h3, (mu[j] + S->h3)/2.0);
+        S->h1 = S->h1 + S->h3*fabs(d[j]);
+    }
+
+    // Check convergence
+    S->mode = 0;
+    if ((S->h1 < S->acc) && (S->h2 < S->acc) && (!badlin) && (*funx == *funx)) { return; }
+    S->h1 = 0.0;
+    for (int j = 0; j < m; j++)
+    {
+        if (j < S->meq)
+        {
+            S->h3 = d[j];
+        } else {
+            S->h3 = 0.0;
+        }
+        S->h1 += mu[j]*fmax(-d[j], S->h3);
+    }
+    // 180
+    S->t0 = *funx + S->h1;
+    S->h3 = S->gs - S->h1*S->h4;
+    S->mode = 8;
+    if (S->h3 >= 0.0) { goto RESET_BFGS; }  // h3 >= 0: reset the BFGS matrix and retry.
+
+    // Line search with an L1 test function
+    S->line = 0;
+    S->alpha = 1.0;
+
+    // Inexact line search
+LINE_SEARCH:
+
+    S->line++;
+    S->h3 = (S->alpha) * (S->h3);
+    dscal_(&n, &S->alpha, s, &one);
+    for (int i = 0; i < n; i++) { sol[i] = x0[i]; }
+    daxpy_(&n, &done, s, &one, sol, &one);
+
+    S->mode = 1;
+    return;  // Returns with mode 1; the next call with mode 1 resumes at MODE1.
+
+MODE1:
+
+    S->t = *funx;
+    for (int j = 0; j < m; j++)
+    {
+        if (j < S->meq)
+        {
+            S->h1 = d[j];
+        } else {
+            S->h1 = 0.0;
+        }
+        S->t = S->t + mu[j]*fmax(-d[j], S->h1);
+    }
+    S->h1 = S->t - S->t0;
+
+    if ((S->h1 > (S->h3 / 10.0)) && (S->line <= 10))
+    {
+        S->alpha = fmax(S->h3/(2.0*(S->h3 - S->h1)), alfmin);
+        goto LINE_SEARCH;
+    }
+
+    // Check convergence
+    S->h3 = 0.0;
+    for (int j = 0; j < m; j++)
+    {
+        if (j < S->meq)
+        {
+            S->h1 = d[j];
+        } else {
+            S->h1 = 0.0;
+        }
+        S->h3 = S->h3 + fmax(-d[j], S->h1);
+    }
+    if (
+        ((fabs(*funx - S->f0) < S->acc) || (dnrm2_(&n, s, &one) < S->acc)) &&
+        (S->h3 < S->acc) &&
+        (!badlin) &&
+        (*funx == *funx) // False only when *funx is NaN; infinities pass this test.
+    )
+    {
+        S->mode = 0;
+        return;
+    } else {
+        S->mode = -1;
+    }
+    return;  // Returns with mode -1; the next call with mode -1 resumes at MODEM1.
+
+LABEL255:
+    // Check relaxed convergence in case of positive directional derivative
+    S->h3 = 0.0;
+    for (int j = 0; j < m; j++)
+    {
+        if (j < S->meq)
+        {
+            S->h1 = d[j];
+        } else {
+            S->h1 = 0.0;
+        }
+        S->h3 = S->h3 + fmax(-d[j], S->h1);
+    }
+    if (((fabs(*funx - S->f0) < S->tol) || (dnrm2_(&n, s, &one) < S->tol)) &&
+        (S->h3 < S->tol) &&
+        (!badlin) &&
+        (*funx == *funx)
+    )
+    {
+        S->mode = 0;
+    } else {
+        S->mode = 8;
+    }
+    return;
+
+MODEM1:
+
+    // Call Jacobian at current x
+
+    // Update Cholesky factors of Hessian matrix modified by BFGS formula
+    // u[i] = gradx[i] - C.T @ mult - v[i]
+
+    for (int i = 0; i < n; i++) { u[i] = gradx[i]; }
+    dgemv_("T", &m, &n, &dmone, C, &lda, mult, &one, &done, u, &one);
+    for (int i = 0; i < n; i++)
+    {
+        u[i] = u[i] - v[i];
+    }
+
+    // L'*S
+    for (int i = 0; i < n; i++) { v[i] = s[i]; }
+    dtpmv_("L", "T", "U", &n, bfgs, v, &one);
+
+    // D*L'*S
+    j = 0;
+    for (int i = 0; i < n; i++) {
+        v[i] = bfgs[j]*v[i];
+        j += n - i;
+    }
+
+    // L*D*L'*S
+    dtpmv_("L", "N", "U", &n, bfgs, v, &one);
+
+    S->h1 = ddot_(&n, s, &one, u, &one);
+    S->h2 = ddot_(&n, s, &one, v, &one);
+    S->h3 = 0.2*(S->h2);
+    if (S->h1 < S->h3)
+    {
+        S->h4 = (S->h2 - S->h3) / (S->h2 - S->h1);
+        S->h1 = S->h3;
+        double tmp_dbl = 1.0 - S->h4;
+        dscal_(&n, &S->h4, u, &one);
+        daxpy_(&n, &tmp_dbl, v, &one, u, &one);
+    }
+
+    // Test for singular update, and reset hessian if so
+    if ((S->h1 == 0.0) || (S->h2 == 0.0)) { goto RESET_BFGS; }
+
+    ldl_update(n, bfgs, u, 1.0 / S->h1, v);
+    ldl_update(n, bfgs, v, -1.0 / S->h2, u);
+
+    // End of main iteration
+    goto ITER_START;
+
+    return;  // Not reached: the goto above always transfers control.
+}
+
+
+/*
+ *          min     |A*x - b|
+ *        E*x = f
+ *        G*x >= h
+ *      xl <= x <= xu
+ *
+ * Problem data is kept in Lf, gradx, C, d, xl, xu arrays in a rather tedious
+ * format. C(m, n) is the constraint normals, d(m) is the constraint bounds.
+ * xl(n) and xu(n) are the lower and upper bounds on x.
+ *
+ * Lf is the LDL' factor of the BFGS matrix also holding the diagonal entries.
+ *
+ * NaN entries in xl, xu, signify unconstrained variables and hence not included.
+ *
+ * The C matrix, for a problem with all x bounds are given and finite,
+ * broken into E and G as follows:
+ *
+ *                      ┌────┐    ┌────┐    ┌┐
+ *                  meq │    │    │ E  │  = ││ f
+ *                      │   ─┼────┼>   │    ││
+ *                      ┼────┼    └────┘    └┘
+ *                      │    │    ┌────┐    ┌┐
+ *                      │    │    │    │    ││
+ *      mineq = m - meq │   ─┼────┼>   │    ││
+ *                      │    │    │    │    ││
+ *                      │    │    │    │    ││
+ *                      └────┘    │    │ >= ││
+ *                        C       ┼────┼    ┼┼
+ *                              n │  I │    ││  xl
+ *                                ┼────┼    ┼┼
+ *                              n │ -I │    ││ -xu
+ *                                └────┘    └┘
+ *                                   G       h
+ *
+ * A and b are recovered from Lf[] and gradx[]. Lf[] holds, in LAPACK packed
+ * format, a unit lower triangular matrix whose diagonal entries are overwritten
+ * by the entries of the diagonal factor d[] of the LDL' factorization.
+ *
+ *  Lf[] = [d[0], s[1], s[2], . , d[1], s[n + 2], d[2], ...]
+ *
+ *  interpreted as:
+ *
+ *         [d[ 0 ],                          ]
+ *         [s[ 1 ], d[ 1 ], .  ,             ]
+ *  Lf[] = [s[ 2 ], s[n+2], .  ,             ]
+ *         [ .    ,   .   , .  , d[n-1]      ]
+ *         [s[ n ], s[2*n], .  ,   .   , d[n]]
+ *
+ * Then, the following relations recover the A and b
+ *
+ *          A = sqrt(d[]) * Lf[]^T
+ *          b = - inv( Lf[] * sqrt(d[]) ) * gradx[]
+ *
+ * The solution is returned in x() and the Lagrange multipliers are returned in y().
+ *
+ * For solving the problem in case of a detection of inconsistent linearization,
+ * see D. Kraft, "A software package for Sequential Quadratic Programming"
+ * Section 2.2.3
+ *
+ * In the original code, the augmented system is detected by mismatch of certain
+ * integers which is making things quite unreadable. Here we explicitly pass a
+ * flag.
+ *
+ * Inconsistent linearization augments all arrays to accommodate for the dummy
+ * variable. The function is still called with the original sizes but the flag
+ * allows for enlarging the problem and hence the supplied buffer should accommodate
+ * for this extra space.
+ *
+ * The required buffer size is given by:
+ * (2*(m - meq)*(n + 1)+2)*(n - meq +1) + 2*2*(m-meq)*(n + 1) + 2*(m-meq)*(n + 1)
+ *  + 2*meq + ld + (ld + 2*(m-meq)*(n + 1))*(n - meq)
+ *
+ */
+void lsq(
+    int m, int meq, int n, int augment, double aug_weight, double* restrict Lf,
+    double* restrict gradx, double* restrict C, double* restrict d,
+    double* restrict xl, double* restrict xu, double* restrict x,
+    double* restrict y, double* buffer, int* jw, int* mode)
+{
+    int one = 1, orign = n;  // orign keeps the un-augmented variable count.
+    int mineq = m - meq;
+    double xnorm = 0.0;
+    int cursor = 0;
+    int ld = n;
+    int n_wG_rows = 0;
+
+    if (augment) {
+        ld = n + 1;  // x, xl and xu are written at index n below, so they need n + 1 entries.
+        x[n]     = 1.0;
+        xl[n]    = 0.0;
+        xu[n]    = 1.0;
+    }
+
+    // Recover A and b from Lf and gradx
+    for (int i = 0; i < (ld+2)*ld; i++) { buffer[i] = 0.0; }
+    double* restrict wA = buffer;
+    double* restrict wb = &buffer[ld*(ld+1)];
+
+    // Depending on augmented, wA is either the full array or the top-left block.
+
+    for (int j = 0; j < n; j++)
+    {
+        double diag = sqrt(Lf[cursor++]);      // Extract the diagonal value from Lf.
+        wA[j + j * ld] = diag;                 // Place the sqrt diagonal.
+        for (int i = j + 1; i < n; i++)
+        {
+            wA[j + i * ld] = Lf[cursor++] * diag;  // Row j of wA is column j of Lf scaled by diag.
+        }
+    }
+
+    // Compute b = - 1/sqrt(d[]) * inv(Lf[]) * gradx[]. Lf is already in packed format.
+    for (int i = 0; i < n; i++) { wb[i] = gradx[i]; }
+    dtpsv_("L", "N", "U", &n, Lf, wb, &one);
+    cursor = 0;
+    for (int i = 0; i < n; i++)
+    {
+        wb[i] /= -sqrt(Lf[cursor]);
+        cursor += n - i;  // Step to the next diagonal entry of the packed array.
+    }
+
+    // If augmented, fill in the extra entry in the bottom right corner.
+    if (augment) { wA[ld*ld - 1] = aug_weight; }
+
+    // If augmented, also increase the number of variables by 1.
+    if (augment) { n++; }
+
+    // Get the equality constraints if given.
+    double* restrict wE = &buffer[n*(n+1) + n];
+    double* restrict wf = &buffer[n*(n+1) + n + n*meq];
+    if (meq > 0)
+    {
+        for (int j = 0; j < n-1; j++)
+        {
+            for (int i = 0; i < meq; i++)
+            {
+                wE[i + j*meq] = C[i + j*m];
+            }
+        }
+        if (augment)
+        {
+            // n is incremented hence all Ceq is now in wE. Add the extra column.
+            for (int i = 0; i < meq; i++) { wE[i + (n-1)*meq] = -d[i]; }
+
+        } else  {
+            // If not augmented then handle j = n - 1 that is skipped.
+            for (int i = 0; i < meq; i++) { wE[i + (n-1)*meq] = C[i + (n-1)*m]; }
+
+        }
+        for (int i = 0; i < meq; i++) { wf[i] = -d[i]; }
+    }
+
+    // Get the inequality constraints if given. First zero out wG and wh.
+    double* restrict wG = &buffer[n*(n+1) + n + n*meq + meq];
+    double* restrict wh = &buffer[n*(n+1) + n + n*meq + meq + (mineq + 2*n)*ld];
+    // Zero out wG and wh
+    for (int i = 0; i < (mineq + 2*n)*(ld + 1); i++) { wG[i] = 0.0; }
+
+    // Convert the bounds on x to +I and -I blocks in G.
+    // Augment h by xl and -xu.
+    // Unbounded constraints are signified by NaN values and they do not appear
+    // in G and h. Hence there is a nancount tab to keep track of them.
+
+    // We first populate "wh" to get the number of unbounded constraints. That will
+    // define the unskipped row number of wG. This is different than the original
+    // Fortran code where the max allocated row number and the actual row number
+    // of wG has been kept separate and it causes to be sent to every nested
+    // function call. Instead we form wG and wh once with fixed size.
+
+    int nancount = 0;
+    int nrow = mineq;
+    if (m > meq)
+    {
+        for (int i = 0; i < mineq; i++) { wh[i] = -d[meq + i]; }
+    }
+    for (int i = 0; i < n; i++)
+    {
+        if (isnan(xl[i]))
+        {
+            nancount++;
+        } else {
+            wh[nrow++] = xl[i];
+        }
+    }
+    for (int i = 0; i < n; i++)
+    {
+        if (isnan(xu[i]))
+        {
+            nancount++;
+        } else {
+            wh[nrow++] = -xu[i];
+        }
+    }
+
+    n_wG_rows = mineq + 2*n - nancount;
+
+    // Now that we know the actual row number of wG, we can finally populate
+    // the top part with C.
+    if (m > meq)
+    {
+        for (int j = 0; j < orign; j++)
+        {
+            for (int i = 0; i < mineq; i++)
+            {
+                wG[i + j*n_wG_rows] = C[meq + i + j*m];
+            }
+        }
+    }
+
+    // If augmented add the extra column.
+    if (augment)
+    {
+        for (int i = 0; i < mineq; i++)
+        {
+            wG[i + orign*n_wG_rows] = fmax(-d[meq + i], 0.0);
+        }
+    }
+
+    // Reset counter
+    nrow = mineq;
+    for (int i = 0; i < n; i++)
+    {
+        if (!isnan(xl[i]))
+        {
+            wG[nrow + i*n_wG_rows] = 1.0;
+            nrow++;
+        }
+    }
+    for (int i = 0; i < n; i++)
+    {
+        if (!isnan(xu[i]))
+        {
+            wG[nrow + i*n_wG_rows] = -1.0;
+            nrow++;
+        }
+    }
+
+    // Assign the remaining part of the buffer to the LSEI problem.
+    double* restrict lsei_scratch = &wh[mineq + 2*n];
+
+    lsei(ld, meq, n_wG_rows, n, wA, wb, wE, wf, wG, wh, x, lsei_scratch, jw, &xnorm, mode);
+
+    if (*mode == 1)
+    {
+        // Restore the Lagrange multipliers, first equality, then inequality.
+        for (int i = 0; i < meq; i++) { y[i] = lsei_scratch[i+n_wG_rows]; }
+        for (int i = 0; i < mineq; i++) { y[meq + i] = lsei_scratch[i]; }
+
+        // The multiplier slots after the first m entries (bounds on x) are filled with NaN.
+        for (int i = 0; i < 2*n; i++) { y[m + i] = NAN; }
+    }
+
+    // Clamp the solution, if given, to the finite bound interval
+    for (int i = 0; i < n; i++)
+    {
+        if ((!isnan(xl[i])) && (x[i] < xl[i])) { x[i] = xl[i]; }
+        else if ((!isnan(xu[i])) && (x[i] > xu[i])) { x[i] = xu[i]; }
+    }
+
+    return;
+}
+
+
+/*
+ * Solve equality and inequality constrained least squares problem (LSEI)
+ *      min |A*x - b|, subject to E*x = f, G*x >= h.
+ *
+ *  ma, me, mg : number of rows in A, E, G
+ *  n          : number of columns in A, x
+ *  a          : matrix A (ma x n)
+ *  b          : vector b (ma)
+ *  e          : matrix E (me x n)
+ *  f          : vector f (me)
+ *  g          : matrix G (mg x n)
+ *  h          : vector h (mg)
+ *  x          : solution vector x (n)
+ *  buffer     : work buffer (mg + 2)*(n - me +1) + 3*mg + 2*me + ma + (ma + mg)*(n - me)
+ *  jw         : integer work array, at least max(mg, n - me) entries
+ *  xnorm      : norm of the solution
+ *  mode       : return code; set here: 1 success, 2 (me > n), 6 (E rank deficient), 7 (rank < n - me when mg == 0)
+ *
+ *  The buffer pointers that will be used:
+ *  buffer[0]                : Lagrange multipliers (mg + me)
+ *  buffer[mg + me]          : wb, Modified b vector (ma)
+ *  buffer[mg + me + ma]     : tau, reflector scalars of the RQ decomposition of E (me)
+ *  buffer[mg + 2*me + ma]   : Scratch space
+ *
+ */
+void
+lsei(int ma, int me, int mg, int n,
+     double* restrict a, double* restrict b, double* restrict e,
+     double* restrict f, double* restrict g, double* restrict h,
+     double* restrict x, double* restrict buffer, int* jw,
+     double* xnorm, int* mode)
+{
+    int one = 1, nvars = 0, info = 0, lde = 0, ldg = 0;
+    double done = 1.0, dmone = -1.0, dzero = 0.0, t= 0.0;
+    const double epsmach = 2.220446049250313e-16;
+
+    for (int i = 0; i < n; i++) { x[i] = 0.0; }
+    // Return if the problem is over-constrained.
+    if (me > n) { *mode = 2; return; }
+
+    //    [E]         [E2 |  R]                                [x ]
+    //    [A] @ Q.T = [A2 | A1]  ,and, x is partitioned as x = [--]
+    //    [G]         [G2 | G1]                                [xe]
+
+    // me = 0 skips the equality constraint related computations even though it
+    // causes aliasing below. The aliased arrays are not referenced in that case.
+    // Use at least 1 for the leading dimension of E even when me = 0 for LAPACK
+    // calls.
+    nvars = (n - me);
+    double* restrict gmults      = &buffer[0];
+    double* restrict emults      = &buffer[mg];
+    double* restrict wb          = &buffer[me + mg];
+    double* restrict tau         = &buffer[me + mg + ma];
+    double* restrict a2          = &buffer[mg + 2*me + ma];
+    double* restrict g2          = &buffer[mg + 2*me + ma + ma*nvars];
+    double* restrict lsi_scratch = &buffer[mg + 2*me + ma + (ma + mg)*nvars];
+
+    // RQ decomposition of equality constraint data E and application to A, G.
+    // LAPACK RQ routine dgerq2 forms R on the right.
+    // dgerq2 is the unblocked version of dgerqf without the memory allocation.
+    // Use top of the yet unutilized scratch space for throw-away work.
+    lde = (me > 0 ? me : 1);
+    ldg = (mg > 0 ? mg : 1);
+    dgerq2_(&me, &n, e, &lde, tau, lsi_scratch, &info);
+
+    // Right triangularize E and apply Q.T to A and G from the right.
+    dormr2_("R", "T", &ma, &n, &me, e, &lde, tau, a, &ma, lsi_scratch, &info);
+    dormr2_("R", "T", &mg, &n, &me, e, &lde, tau, g, &ldg, lsi_scratch, &info);
+
+    // Check the diagonal elements of E for rank deficiency.
+    for (int i = 0; i < me; i++)
+    {
+        if (!(fabs(e[i + (nvars + i)*me]) >= epsmach)) { *mode = 6;return; }
+    }
+    // Solve E*x = f and modify b.
+    // Note: RQ forms R at the right of E instead of [0, 0] position.
+    for (int i = 0; i < me; i++) { x[nvars + i] = f[i]; }
+    dtrsv_("U", "N", "N", &me, &e[(nvars)*me], &lde, &x[nvars], &one);
+
+    *mode = 1;
+    // Zero out the inequality multiplier.
+    for (int i = 0; i < mg; i++) { gmults[i] = 0.0; }
+
+    // If the problem is fully equality-constrained, revert the basis and return.
+    if (me == n) { goto ORIGINAL_BASIS; }
+
+    // Compute the modified RHS wb = b - A1*xe
+    // Copy b into wb
+    for (int i = 0; i < ma; i++) { wb[i] = b[i]; }
+    // Compute wb -= A1*xe
+    dgemv_("N", &ma, &me, &dmone, &a[ma*nvars], &ma, &x[nvars], &one, &done, wb, &one);
+
+    // Store the transformed A2 and G2 in the buffer
+    for (int j = 0; j < nvars; j++)
+    {
+        for (int i = 0; i < ma; i++)
+        {
+            a2[i + j*ma] = a[i + j*ma];
+        }
+        for (int i = 0; i < mg; i++)
+        {
+            g2[i + j*mg] = g[i + j*mg];
+        }
+    }
+
+    if (mg == 0)
+    {
+        // No inequality constraints, solve the least squares problem directly.
+        // We deliberately use the unblocked algorithm to avoid allocation.
+        int lwork = ma*nvars + 3*nvars + 1;
+        // Save the RHS for residual computation
+        double* restrict wb_orig = &lsi_scratch[lwork];
+        for (int i = 0; i < ma; i++) { wb_orig[i] = wb[i]; }
+        for (int i = 0; i < nvars; i++) { jw[i] = 0; }  // dgelsy pins columns whose jpvt entry is nonzero on entry; mark all as free.
+        int krank = 0;
+        t = sqrt(epsmach);
+        dgelsy_(&ma, &nvars, &one, a2, &ma, wb, &ma, jw, &t, &krank, lsi_scratch, &lwork, &info);
+
+        // Copy the solution to x
+        for (int i = 0; i < nvars; i++) { x[i] = wb[i]; }
+
+        // Compute the residual and its norm, use a since a2 is overwritten.
+        dgemv_("N", &ma, &nvars, &done, a, &ma, x, &one, &dmone, wb_orig, &one);
+        *xnorm = dnrm2_(&ma, wb_orig, &one);
+
+        *mode = 7;
+        if (krank < nvars) { return; }
+        *mode = 1;
+        goto ORIGINAL_BASIS;
+    }
+
+    // Modify h, and solve the inequality constrained least squares problem.
+    // h -= G1*xe
+    dgemv_("N", &mg, &me, &dmone, &g[mg*nvars], &ldg, &x[nvars], &one, &done, h, &one);
+
+    lsi(ma, mg, nvars, a2, wb, g2, h, x, lsi_scratch, jw, xnorm, mode);
+
+    // Copy multipliers from scratch to gmults
+    for (int i = 0; i < mg; i++) { gmults[i] = lsi_scratch[i]; }
+
+    // If no equality constraints this was an LSI problem all along.
+    if (me == 0) { return; }
+
+    t = dnrm2_(&me, &x[nvars], &one);
+    // Modify the norm by adding the equality solution.
+    *xnorm = hypot(*xnorm, t);
+    if (*mode != 1) { return; }
+
+ORIGINAL_BASIS:
+    // Convert the solution and multipliers to the original basis.
+    // b = A*x - b (residuals)
+    dgemv_("N", &ma, &n, &done, a, &ma, x, &one, &dmone, b, &one);
+    // f = A1^T*b - G1^T*w
+    dgemv_("T", &ma, &me, &done, &a[nvars*ma], &ma, b, &one, &dzero, f, &one);
+    dgemv_("T", &mg, &me, &dmone, &g[nvars*mg], &ldg, gmults, &one, &done, f, &one);
+
+    // x = Q.T*x
+    dormr2_("L", "T", &n, &one, &me, e, &lde, tau, x, &n, lsi_scratch, &info);
+
+    // Solve the triangular system for the equality multipliers, emults.
+    for (int i = 0; i < me; i++) { emults[i] = f[i]; }
+    dtrsv_("U", "T", "N", &me, &e[(n - me)*me], &lde, emults, &one);
+
+    return;
+}
+
+
+/*
+ * Solve inequality constrained least squares problem
+ *      min |Ax - b|  subject to Gx >= h
+ *
+ * A is (ma x n), b is (ma), G is (mg x n), h is (mg), x is (n)
+ * buffer is at least (mg+2)*(n+1) + 2*mg
+ * jw is at least (mg)
+ * xnorm is the 2-norm of the residual vector
+ * mode is the integer return code
+ *
+ * Return codes for mode
+ *  1: successful computation
+ *  2: error return because of wrong dimensions
+ *  3: iteration count exceeded by nnls
+ *  4: inequality constraints incompatible
+ *  5: matrix A is not rank n
+ *
+*/
+void
+lsi(int ma, int mg, int n, double* restrict a, double* restrict b, double* restrict g,
+    double* restrict h, double* restrict x, double* restrict buffer, int* jw,
+    double* xnorm, int* mode)
+{
+    int one = 1, tmp_int = 0, info = 0;
+    double done = 1.0, dmone = -1.0, tmp_dbl = 0.0;
+    const double epsmach = 2.220446049250313e-16;
+
+    // QR decomposition of A and application to b.
+    // We use the unblocked versions of the LAPACK routines to avoid
+    // allocating extra "work" memory for the blocked versions.
+    tmp_int = (ma < n ? ma : n);  // min(ma, n); tau goes in buffer[0..tmp_int), work space after it.
+    dgeqr2_(&ma, &n, a, &ma, buffer, &buffer[tmp_int], &info);
+
+    // Compute Q^T b
+    dorm2r_("L", "T", &ma, &one, &tmp_int, a, &ma, buffer, b, &ma, &buffer[tmp_int], &info);
+
+    // Check the diagonal elements of R for rank deficiency.
+    *mode = 5;  // Stays 5 if the early return below is taken.
+    *xnorm = 0.0;
+    for (int i = 0; i < tmp_int; i++) {
+        if (!(fabs(a[i + i*ma]) >= epsmach)) { return; }  // Also returns when the entry is NaN.
+    }
+    // Transform G and h to form the LDP problem.
+    // Solve XR = G where R is the upper triangular matrix from the QR.
+    // The result is stored in G.
+    // Note: There is an inherent assumption that ma >= n. This is a bug carried
+    // over here from the original slsqp implementation.
+    dtrsm_("R", "U", "N", "N", &mg, &n, &done, a, &ma, g, &mg);
+    // h = h - Xf
+    dgemv_("N", &mg, &n, &dmone, g, &mg, b, &one, &done, h, &one);
+
+    // Solve the LDP problem.
+    ldp(mg, n, g, h, x, buffer, jw, xnorm, mode);
+    if (*mode != 1) { return; }
+
+    // Convert to the solution of the original problem.
+    daxpy_(&n, &done, b, &one, x, &one);
+    dtrsv_("U", "N", "N", &n, a, &ma, x, &one);
+
+    // If any, compute the norm of the tail of b and add to xnorm
+    tmp_int = ma - n;  // Length of the tail of b, starting at index min(n, ma - 1).
+    tmp_dbl = dnrm2_(&tmp_int, &b[(n + 1 > ma ? ma : n + 1) - 1], &one);
+    *xnorm = hypot(*xnorm, tmp_dbl);
+
+    return;
+}
+
+/*
+ * Solve least distance problem
+ *  min (1/2)|x|^2  subject to  Gx >= h
+ *
+ * G is (m x n), h is (m)
+ * buffer is at least (m+2)*(n+1) + 2*m
+ * indices is int(m)
+ * x is (n)
+ * xnorm is the norm of the solution if succeeded
+ * mode is the return code integer
+ *
+ * Mode return values
+ *  1  : solution found
+ *  2  : bad input dimensions
+ *  3  : iteration count exceeded by nnls
+ *  4  : inequality constraints incompatible
+ *
+*/
+void
+ldp(int m, int n, double* restrict g, double* restrict h, double* restrict x,
+    double* restrict buffer, int* indices, double* xnorm, int* mode)
+{
+    int one = 1;
+    double dzero = 0.0, rnorm = 0.0;
+    // Check for inputs and initialize x
+    if (n <= 0) { *mode = 2; return; }
+    for (int i = 0; i < n; i++) { x[i] = 0.0; }
+    if (m == 0) { *mode = 1; return; }  // No constraints: x = 0 is returned; *xnorm is left untouched.
+
+    // Define pointers for the variables on buffer
+    double* restrict a    = &buffer[0];
+    double* restrict b    = &buffer[m*(n+1)];
+    double* restrict zz   = &buffer[(m+1)*(n+1)];
+    double* restrict y    = &buffer[(m+2)*(n+1)];
+    double* restrict w    = &buffer[(m+2)*(n+1) + m];
+
+    // Save the dual problem data into buffer
+    //       dual problem [G^T] [x] = [0]
+    //                    [h^T]       [1]
+
+    // LHS, G is (m x n), h is (m). Both transposed and stacked into (n+1) x m.
+    for (int j = 0; j < m; j++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            a[i + j*(n+1)] = g[j + i*m];
+        }
+        // Place h in the last row.
+        a[n + j*(n+1)] = h[j];
+    }
+    // RHS is (n+1)
+    for (int i = 0; i < n; i++) { b[i] = 0.0; }
+    b[n] = 1.0;
+
+    // Solve the dual problem
+    __nnls(n+1, m, a, b, y, w, zz, indices, 3*m, &rnorm, mode);
+    if (*mode != 1) { return; }
+    *mode = 4;  // Stays 4 if either early return below is taken.
+    if (rnorm <= 0.0) { return; }
+
+    // Solve the primal problem
+    double fac = 1.0 - ddot_(&m, h, &one, y, &one);
+    if (!((1.0 + fac) - 1.0 > 0.0)) { return; }  // fac is not positive at working precision (or is NaN).
+    *mode = 1;
+    fac = 1.0 / fac;
+    dgemv_("T", &m, &n, &fac, g, &m, y, &one, &dzero, x, &one);
+    *xnorm = dnrm2_(&n, x, &one);
+
+    // Compute the lagrange multipliers for the primal problem
+    for (int i = 0; i < m; i++) { buffer[i] = fac*y[i]; }  // Written over the first m entries of buffer.
+    return;
+}
+
+
+/*
+ *
+ * Updates the LDL' factors of matrix a by rank-one matrix sigma*z*z'
+ * n     : order of the coefficient matrix a
+ * a     : positive definite matrix of dimension n; only the lower triangle is
+ *         used and is stored column by column as one dimensional array of
+ *         dimension n*(n+1)/2.
+ * z     : vector of dimension n of updating elements
+ * sigma : scalar factor by which the modifying dyad z*z' is multiplied
+ * w     : working array of dimension n
+ *
+ * Uses the composite-t method of Fletcher and Powell as described in "On the
+ * modification of LDL' factorizations", DOI:10.1090/S0025-5718-1974-0359297-1
+ *
+ * Implemented by: Dieter Kraft, dfvlr - Institut für Dynamik der Flugsysteme
+ *                 D-8031  Oberpfaffenhofen
+ *
+ */
+static void
+ldl_update(int n, double* restrict a, double* restrict z, double sigma, double* restrict w)
+{
+    int j, ij = 0;  // ij is the running index into the packed array a.
+    const double epsmach = 2.220446049250313e-16;
+    if (sigma == 0.0) { return; }
+    double alpha, beta, delta, gamma, u, v, tp, t = 1.0 / sigma;
+
+    if (sigma <= 0.0)
+    {
+        // Negative update
+        for (int i = 0; i < n; i++) { w[i] = z[i]; }
+        for (int i = 0; i < n; i++)
+        {
+            v = w[i];
+            t = t + v*v/a[ij];
+            for (int j = i + 1; j < n; j++)
+            {
+                ij++;
+                w[j] = w[j] - v*a[ij];
+            }
+            ij++;
+        }
+        if (t >= 0.0) { t = epsmach / sigma; }  // sigma < 0 here, so t becomes a tiny negative value.
+
+        for (int i = 0; i < n; i++)
+        {
+            j = n - i - 1;
+            ij -= i + 1;  // Walk back to the diagonal entry of column j.
+            u = w[j];
+            w[j] = t;
+            t = t - u*u / a[ij];
+        }
+    }
+
+    // Positive update
+    for (int i = 0; i < n; i++)
+    {
+        v = z[i];
+        delta = v / a[ij];
+        // sigma == 0.0 is handled at the beginning.
+        tp = (sigma < 0.0 ? w[i] : t + delta*v);
+        alpha = tp / t;
+        a[ij] = alpha*a[ij];
+        if (i == n - 1) { return; }  // Last column has no sub-diagonal entries to update.
+        beta = delta / tp;
+        if (alpha <= 4.0)
+        {
+            for (int j = i + 1; j < n; j++)
+            {
+                ij++;
+                z[j] = z[j] - v * a[ij];
+                a[ij] = a[ij] + beta * z[j];
+            }
+        } else {
+            gamma = t / tp;
+            for (int j = i + 1; j < n; j++)
+            {
+                ij++;
+                u = a[ij];
+                a[ij] = gamma * u + beta * z[j];
+                z[j] = z[j] - v * u;
+            }
+        }
+        ij++;
+        t = tp;
+    }
+
+    return;
+}
diff --git a/scipy/optimize/__slsqp.h b/scipy/optimize/__slsqp.h
new file mode 100644
index 000000000000..fabb8ae514de
--- /dev/null
+++ b/scipy/optimize/__slsqp.h
@@ -0,0 +1,412 @@
+/*
+ * This file and the accompanying __slsqp.c file are C translations of the
+ * Fortran77 code of the SLSQP algorithm for the SciPy project and hence inherit
+ * the SciPy license. The original Fortran code, written by Dieter Kraft, is
+ * available at http://www.netlib.org/toms/733, see:
+ *
+ *  ALGORITHM 733, COLLECTED ALGORITHMS FROM ACM.
+ *  TRANSACTIONS ON MATHEMATICAL SOFTWARE,
+ *  VOL. 20, NO. 3, SEPTEMBER, 1994, PP. 262-281.
+ *  https://doi.org/10.1145/192115.192124
+ *
+ *
+ * The original Fortran code is released for use under BSD license, with the
+ * following statement from the original license holder ACM publications:
+  *
+ *  https://web.archive.org/web/20170106155705/http://permalink.gmane.org/gmane.comp.python.scientific.devel/6725
+ *  ------
+ *  From: Deborah Cotton <cotton@hq.acm.org>
+ *  Date: Fri, 14 Sep 2007 12:35:55 -0500
+ *  Subject: RE: Algorithm License requested
+ *  To: Alan Isaac
+ *
+ *  Prof. Issac,
+ *
+ *  In that case, then because the author consents to [the ACM] releasing
+ *  the code currently archived at http://www.netlib.org/toms/733 under the
+ *  BSD license, the ACM hereby releases this code under the BSD license.
+ *
+ *  Regards,
+ *
+ *  Deborah Cotton, Copyright & Permissions
+ *  ACM Publications
+ *  2 Penn Plaza, Suite 701**
+ *  New York, NY 10121-0701
+ *  permissions@acm.org
+ *  212.869.7440 ext. 652
+ *  Fax. 212.869.0481
+ *  ------
+*/
+
+#ifndef __SLSQPLIB_H
+#define __SLSQPLIB_H
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#define PYERR(errobj,message) {PyErr_SetString(errobj,message); return NULL;}  // set the exception, then return NULL from the calling function
+static PyObject* slsqp_error;  // module exception object, created in PyInit__slsqplib
+
+#include <math.h>
+#include "__nnls.h"
+
+// BLAS/LAPACK function prototypes used in SLSQP (trailing-underscore symbols, every argument passed by pointer)
+void daxpy_(int* n, double* sa, double* sx, int* incx, double* sy, int* incy);
+double ddot_(int* n, double* dx, int* incx, double* dy, int* incy);
+void dgelsy_(int* m, int* n, int* nrhs, double* a, int* lda, double* b, int* ldb, int* jpvt, double* rcond, int* rank, double* work, int* lwork, int* info);
+void dgemv_(char* trans, int* m, int* n, double* alpha, double* a, int* lda, double* x, int* incx, double* beta, double* y, int* incy);
+void dgeqr2_(int* m, int* n, double* a, int* lda, double* tau, double* work, int* info);
+void dgeqrf_(int* m, int* n, double* a, int* lda, double* tau, double* work, double* lwork, int* info);  // NOTE(review): reference LAPACK declares LWORK as INTEGER (int* here, as in dgelsy_) -- confirm against the callers
+void dgerq2_(int* m, int* n, double* a, int* lda, double* tau, double* work, int* info);
+void dlarf_(char* side, int* m, int* n, double* v, int* incv, double* tau, double* c, int* ldc, double* work);
+void dlarfgp_(int* n, double* alpha, double* x, int* incx, double* tau);
+void dlartgp_(double* f, double* g, double* cs, double* sn, double* r);
+double dnrm2_(int* n, double* x, int* incx);
+void dorm2r_(char* side, char* trans, int* m, int* n, int* k, double* a, int* lda, double* tau, double* c, int* ldc, double* work, int* info);
+void dormr2_(char* side, char* trans, int* m, int* n, int* k, double* a, int* lda, double* tau, double* c, int* ldc, double* work, int* info);
+void dscal_(int* n, double* da, double* dx, int* incx);
+void dtpmv_(char* uplo, char* trans, char* diag, int* n, double* ap, double* x, int* incx);
+void dtpsv_(char* uplo, char* trans, char* diag, int* n, double* ap, double* x, int* incx);
+void dtrsm_(char* side, char* uplo, char* transa, char* diag, int* m, int* n, double* alpha, double* a, int* lda, double* b, int* ldb);
+void dtrsv_(char* uplo, char* trans, char* diag, int* n, double* a, int* lda, double* x, int* incx);
+
+
+// The SLSQP_vars struct holds the state of the algorithm. It is passed to Python
+// and back (as a dict, see slsqp) so that the algorithm is stateless and thread-safe.
+struct SLSQP_vars {
+    double acc, alpha, f0, gs, h1, h2, h3, h4, t, t0, tol;
+    int exact, inconsistent, reset, iter, itermax, line, m, meq, mode, n;
+};
+
+
+void __slsqp_body(struct SLSQP_vars* S, double* funx, double* gradx, double* C, double* d, double* sol, double* mult, double* xl, double* xu, double* buffer, int* indices);
+
+
+static PyObject*
+nnls(PyObject* Py_UNUSED(dummy), PyObject* args) {
+    // Python signature: nnls(A, b, maxiter) -> (x, rnorm, info).
+    // A is a 2D float64 array (m x n) and b a float64 array of shape (m,) or
+    // (m, 1). Both are copied (A into column-major order) into scratch memory
+    // before __nnls is called. Raises _slsqplib.error on a dtype or shape
+    // mismatch and on allocation failure.
+    int maxiter, info = 0;
+    PyArrayObject* ap_A=NULL;
+    PyArrayObject* ap_b=NULL;
+    double* buffer;
+    double rnorm;
+
+    // Get the input array
+    if (!PyArg_ParseTuple(args,
+                         ("O!O!i"),
+                         &PyArray_Type, (PyObject **)&ap_A,
+                         &PyArray_Type, (PyObject **)&ap_b,
+                         &maxiter)
+        )
+    {
+        return NULL;
+    }
+
+    // Check for dtype compatibility
+    if ((PyArray_TYPE(ap_A) != NPY_FLOAT64) || (PyArray_TYPE(ap_b) != NPY_FLOAT64))
+    {
+        PYERR(slsqp_error, "Inputs to nnls must be of type numpy.float64.");
+    }
+
+    int ndim = PyArray_NDIM(ap_A);              // Number of dimensions
+    if (ndim != 2)
+    {
+        PYERR(slsqp_error, "Input array A must be 2D.");
+    }
+    npy_intp* shape = PyArray_SHAPE(ap_A);       // Array shape
+    npy_intp m = shape[0];                       // Number of rows
+    npy_intp n = shape[1];                       // Number of columns
+
+    int ndim_b = PyArray_NDIM(ap_b);             // Number of dimensions
+    if ((ndim_b != 1) && (ndim_b != 2))
+    {
+        PYERR(slsqp_error, "Input array b must be 1D or 2D with one column.");
+    }
+    npy_intp* shape_b = PyArray_SHAPE(ap_b);     // Array shape
+    if (shape_b[0] != m)
+    {
+        PYERR(slsqp_error, "Input array b must have the same number of rows as A.");
+    }
+    if ((ndim_b == 2) && (shape_b[1] != 1))
+    {
+        PYERR(slsqp_error, "Input array b must have only one column.");
+    }
+
+    // Allocate memory for the algorithm,
+    // A is m x n, b is m, x is n, w is n, zz is m
+    // total m*(n+2) + 2*n
+    //indices is n
+    buffer = malloc((m*(n+2) + 3*n)*sizeof(double));
+    if (buffer == NULL)
+    {
+        PYERR(slsqp_error, "Memory allocation failed.");
+    }
+    int *indices = malloc(n*sizeof(int));
+    if (indices == NULL)
+    {
+        free(buffer);
+        PYERR(slsqp_error, "Memory allocation failed.");
+    }
+
+    double* x = &buffer[0];                 // Solution vector x (n)
+    double* a = &buffer[n];                 // Matrix A (m x n)
+    double* b = &buffer[n*m + n];           // Vector b (m)
+    double* w = &buffer[(n+1)*m + n];       // Vector w (n)
+    double* zz = &buffer[(n+1)*m + 2*n];    // Vector zz (m)
+
+    // Strides are in bytes and may be negative: address elements through char*.
+    const char* data_A = (const char*)PyArray_DATA(ap_A);
+    const npy_intp* strides = PyArray_STRIDES(ap_A);
+    const char* data_b = (const char*)PyArray_DATA(ap_b);
+    // b is (m,) or (m, 1): consecutive rows are separated by the first stride.
+    const npy_intp b_stride = PyArray_STRIDES(ap_b)[0];
+
+    // Copy the data from the numpy array
+    for (npy_intp j = 0; j < n; j++) {
+        for (npy_intp i = 0; i < m; i++) {
+            a[i + j*m] = *(const double*)(data_A + i*strides[0] + j*strides[1]);
+        }
+    }
+    for (npy_intp i = 0; i < m; i++)
+    {
+        b[i] = *(const double*)(data_b + i*b_stride);
+    }
+
+    // Call nnls
+    __nnls((int)m, (int)n, a, b, x, w, zz, indices, maxiter, &rnorm, &info);
+    free(indices);
+
+    // Copy x into an array that owns its memory: an array wrapping the malloc'ed
+    // buffer via PyArray_SimpleNewFromData does not own it and would never free it.
+    npy_intp shape_ret[1] = {n};
+    PyArrayObject* ap_ret = (PyArrayObject*)PyArray_SimpleNew(1, shape_ret, NPY_FLOAT64);
+    if (ap_ret == NULL)
+    {
+        free(buffer);
+        return NULL;
+    }
+    memcpy(PyArray_DATA(ap_ret), x, n*sizeof(double));
+    free(buffer);
+    // Return the result
+    return Py_BuildValue("Ndi",PyArray_Return(ap_ret), rnorm, info);
+
+}
+
+
+static PyObject*
+slsqp(PyObject* Py_UNUSED(dummy), PyObject* args)
+{
+    // Python entry point for one call into __slsqp_body. Takes the state dict,
+    // the objective value and the work arrays; the dict is updated in place
+    // (and sol may be clamped to its bounds) and None is returned.
+    // Raises _slsqplib.error on malformed inputs.
+    PyArrayObject *ap_gradx=NULL, *ap_C=NULL, *ap_d=NULL, *ap_mult=NULL;
+    PyArrayObject *ap_sol =NULL, *ap_xl=NULL, *ap_xu=NULL, *ap_buffer=NULL;
+    PyArrayObject* ap_indices=NULL;
+    PyObject* input_dict = NULL;
+    double funx;
+    struct SLSQP_vars Vars;
+
+    // The Python input should provide with a dictionary that maps to the struct
+    // SLSQP_vars. Necessary fields that would make the algorithm change
+    // behavior are m, meq, n, acc, maxiter, and mode. The rest can be left as zero.
+    // Changing values mid run is not recommended as they hold the internal state
+    // of the algorithm.
+    // The reason why they are returned is to make the algorithm stateless.
+
+    // The required arrays C, d, x, xl, xu, gradx, sol are passed as numpy arrays.
+    // The remaining arrays are going to be allocated in the buffer.
+
+    if (!PyArg_ParseTuple(args, "O!dO!O!O!O!O!O!O!O!O!",
+                          &PyDict_Type, (PyObject **)&input_dict,
+                          &funx,
+                          &PyArray_Type, (PyObject **)&ap_gradx,
+                          &PyArray_Type, (PyObject **)&ap_C,
+                          &PyArray_Type, (PyObject **)&ap_d,
+                          &PyArray_Type, (PyObject **)&ap_sol,
+                          &PyArray_Type, (PyObject **)&ap_mult,
+                          &PyArray_Type, (PyObject **)&ap_xl,
+                          &PyArray_Type, (PyObject **)&ap_xu,
+                          &PyArray_Type, (PyObject **)&ap_buffer,
+                          &PyArray_Type, (PyObject **)&ap_indices))
+    {
+        return NULL;
+    }
+
+    // Some helper x macros to pack and unpack the SLSQP_vars struct and
+    // the Python dictionary.
+
+    #define STRUCT_DOUBLE_FIELD_NAMES X(acc) X(alpha) X(f0) X(gs) X(h1) X(h2) X(h3) X(h4) X(t) X(t0) X(tol)
+    #define STRUCT_INT_FIELD_NAMES X(exact) X(inconsistent) X(reset) X(iter) X(itermax) X(line) X(m) X(meq) X(mode) X(n)
+    #define STRUCT_FIELD_NAMES STRUCT_INT_FIELD_NAMES STRUCT_DOUBLE_FIELD_NAMES
+
+    // Parse the dictionary, if the field is not found, raise an error.
+    // Do it separately for doubles and ints.
+    // Initialize the struct that will be populated from dict with zeros
+    #define X(name) Vars.name = 0;
+    STRUCT_FIELD_NAMES
+    #undef X
+
+    // PyDict_GetItemString returns a borrowed reference. A failed conversion propagates its exception.
+    #define X(name) \
+        PyObject* name##_obj = PyDict_GetItemString(input_dict, #name); \
+        if (!name##_obj) { PYERR(slsqp_error, #name " not found in the dictionary."); } \
+        Vars.name = PyFloat_AsDouble(name##_obj); \
+        if ((Vars.name == -1.0) && PyErr_Occurred()) { return NULL; }
+    STRUCT_DOUBLE_FIELD_NAMES
+    #undef X
+
+    #define X(name) \
+        PyObject* name##_obj = PyDict_GetItemString(input_dict, #name); \
+        if (!name##_obj) { PYERR(slsqp_error, #name " not found in the dictionary."); } \
+        Vars.name = (int)PyLong_AsLong(name##_obj); \
+        if ((Vars.name == -1) && PyErr_Occurred()) { return NULL; }
+    STRUCT_INT_FIELD_NAMES
+    #undef X
+
+    // Basic error checks for the numpy arrays.
+    if ((PyArray_TYPE(ap_C) != NPY_FLOAT64) || (PyArray_TYPE(ap_d) != NPY_FLOAT64) ||
+        (PyArray_TYPE(ap_gradx) != NPY_FLOAT64) || (PyArray_TYPE(ap_sol) != NPY_FLOAT64) ||
+        (PyArray_TYPE(ap_xl) != NPY_FLOAT64) || (PyArray_TYPE(ap_xu) != NPY_FLOAT64) ||
+        (PyArray_TYPE(ap_mult) != NPY_FLOAT64) || (PyArray_TYPE(ap_buffer) != NPY_FLOAT64) ||
+        (PyArray_TYPE(ap_indices) != NPY_INT32))
+    {
+        PYERR(slsqp_error, "All inputs to slsqp must be of type numpy.float64, "
+                           "except \"indices\" which must be of numpy.int32.");
+    }
+
+    // Buffer is 1D hence both F and C contiguous, test with either of them.
+    if (!PyArray_IS_C_CONTIGUOUS(ap_buffer)) { PYERR(slsqp_error, "Input array buffer must be 1d contiguous."); }
+
+    // Validate dimensionality first, then the lengths (compared as npy_intp, without truncation to int).
+    if (PyArray_NDIM(ap_sol) != 1) { PYERR(slsqp_error, "Input array sol must be 1D."); }
+    if (PyArray_DIM(ap_sol, 0) != Vars.n) { PYERR(slsqp_error, "Input array \"sol\" must have at least n elements."); }
+    if (PyArray_NDIM(ap_mult) != 1) { PYERR(slsqp_error, "Input array \"mult\" must be 1D."); }
+    if (PyArray_DIM(ap_mult, 0) != 2*Vars.n + Vars.m + 2) { PYERR(slsqp_error, "Input array \"mult\" must have m + 2*n + 2 elements."); }
+    if (PyArray_NDIM(ap_C) != 2) { PYERR(slsqp_error, "Input array \"C\" must be 2D."); }
+    if (PyArray_NDIM(ap_d) != 1) { PYERR(slsqp_error, "Input array d must be 1D."); }
+    if (PyArray_NDIM(ap_gradx) != 1) { PYERR(slsqp_error, "Input array gradx must be 1D."); }
+    if (PyArray_NDIM(ap_xl) != 1) { PYERR(slsqp_error, "Input array xl must be 1D."); }
+    if (PyArray_NDIM(ap_xu) != 1) { PYERR(slsqp_error, "Input array xu must be 1D."); }
+    // The clamping loop further down reads n entries of both xl and xu.
+    if (PyArray_DIM(ap_xl, 0) < Vars.n) { PYERR(slsqp_error, "Input array xl must have at least n elements."); }
+    if (PyArray_DIM(ap_xu, 0) < Vars.n) { PYERR(slsqp_error, "Input array xu must have at least n elements."); }
+
+    double* gradx_data = (double*)PyArray_DATA(ap_gradx);
+    double* C_data = (double*)PyArray_DATA(ap_C);
+    double* d_data = (double*)PyArray_DATA(ap_d);
+    double* restrict sol_data = (double*)PyArray_DATA(ap_sol);
+    double* mult_data = (double*)PyArray_DATA(ap_mult);
+    double* restrict xl_data = (double*)PyArray_DATA(ap_xl);
+    double* restrict xu_data = (double*)PyArray_DATA(ap_xu);
+    double* buffer_data = (double*)PyArray_DATA(ap_buffer);
+    int* indices_data = (int*)PyArray_DATA(ap_indices);
+
+    __slsqp_body(&Vars, &funx, gradx_data, C_data, d_data, sol_data, mult_data, xl_data, xu_data, buffer_data, indices_data);
+
+    // During the intermediate steps, there can be a few ULPs of bound violations,
+    // hence we clamp the solution if given, to the finite bound values when mode
+    // is 1 or -1.
+    if ((Vars.mode == 1) || (Vars.mode == -1))
+    {
+        int n = Vars.n;
+        for (int i = 0; i < n; i++)
+        {
+            if ((!isnan(xl_data[i])) && (sol_data[i] < xl_data[i])) { sol_data[i] = xl_data[i]; }
+            else if ((!isnan(xu_data[i])) && (sol_data[i] > xu_data[i])) { sol_data[i] = xu_data[i]; }
+        }
+    }
+
+    // Map struct variables back to dictionary.
+    // Py_XXX_FromXXX returns a new reference, hence needs to be decremented.
+
+    #define X(name) do { \
+            PyObject* tmp_##name = PyFloat_FromDouble(Vars.name); \
+            if ((!tmp_##name) || (PyDict_SetItemString(input_dict, #name, tmp_##name) < 0)) { \
+                Py_XDECREF(tmp_##name); \
+                PYERR(slsqp_error, "Setting '" #name "' failed."); \
+            } \
+            Py_DECREF(tmp_##name); \
+        } while (0);
+        STRUCT_DOUBLE_FIELD_NAMES
+    #undef X
+
+    #define X(name) do { \
+            PyObject* tmp_##name = PyLong_FromLong((long)Vars.name); \
+            if ((!tmp_##name) || (PyDict_SetItemString(input_dict, #name, tmp_##name) < 0)) { \
+                Py_XDECREF(tmp_##name); \
+                PYERR(slsqp_error, "Setting '" #name "' failed."); \
+            } \
+            Py_DECREF(tmp_##name); \
+        } while (0);
+        STRUCT_INT_FIELD_NAMES
+    #undef X
+    #undef STRUCT_FIELD_NAMES
+    #undef STRUCT_INT_FIELD_NAMES
+    #undef STRUCT_DOUBLE_FIELD_NAMES
+
+    Py_RETURN_NONE;
+
+}
+
+
+static char doc_nnls[] = ("Compute the nonnegative least squares solution.\n\n"
+                           "    x, rnorm, info = nnls(A, b, maxiter)\n\n");
+
+// slsqp returns None; its results are written into the dict and arrays passed in.
+static char doc_slsqp[] = (
+    "Sequential Least Squares Programming (SLSQP) optimizer.\n\n"
+    "    slsqp(S: dict, funx: np.float64, "
+    "gradx: NDArray, C: NDArray, d: NDArray, sol: NDArray, mult: NDArray, "
+    "xl: NDArray, xu: NDArray, buffer: NDArray, indices: NDArray) -> None"
+    "\n\n");
+
+
+// Methods exported by the module; the all-NULL entry marks the end of the table.
+static struct PyMethodDef slsqplib_module_methods[] = {
+  {.ml_name = "nnls", .ml_meth = nnls, .ml_flags = METH_VARARGS, .ml_doc = doc_nnls},
+  {.ml_name = "slsqp", .ml_meth = slsqp, .ml_flags = METH_VARARGS, .ml_doc = doc_slsqp},
+  {.ml_name = NULL, .ml_meth = NULL, .ml_flags = 0, .ml_doc = NULL}
+};
+
+// Module definition: no docstring, no slots or GC hooks, m_size of -1.
+struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "_slsqplib",
+    .m_doc = NULL,
+    .m_size = -1,
+    .m_methods = slsqplib_module_methods,
+    .m_slots = NULL,
+    .m_traverse = NULL,
+    .m_clear = NULL,
+    .m_free = NULL
+};
+
+
+PyMODINIT_FUNC
+PyInit__slsqplib(void)
+{
+    // Module init: set up the NumPy C-API, create the module and register the `_slsqplib.error` exception.
+    import_array();
+    PyObject* module = PyModule_Create(&moduledef);
+    if (module == NULL) { return NULL; }
+    PyObject* mdict = PyModule_GetDict(module);
+    if (mdict != NULL) { slsqp_error = PyErr_NewException("_slsqplib.error", NULL, NULL); }
+    // The module is owned here until it is returned, so release it (and the exception) on any failure.
+    if (!mdict || !slsqp_error || (PyDict_SetItemString(mdict, "error", slsqp_error) < 0))
+    {
+        Py_CLEAR(slsqp_error); Py_DECREF(module); return NULL;
+    }
+#ifdef Py_GIL_DISABLED
+    PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
+#endif
+    return module;
+}
+
+
+#endif // __SLSQPLIB_H
diff --git a/scipy/optimize/_bracket.py b/scipy/optimize/_bracket.py
index 1db2c5b49e7a..8bc5ab0dc2f5 100644
--- a/scipy/optimize/_bracket.py
+++ b/scipy/optimize/_bracket.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy._lib._elementwise_iterative_method as eim
 from scipy._lib._util import _RichResult
-from scipy._lib._array_api import array_namespace, xp_ravel, xp_default_dtype
+from scipy._lib._array_api import array_namespace, xp_ravel, xp_promote
 
 _ELIMITS = -1  # used in _bracket_root
 _ESTOPONESIDE = 2  # used in _bracket_root
@@ -14,13 +14,7 @@ def _bracket_root_iv(func, xl0, xr0, xmin, xmax, factor, args, maxiter):
     if not np.iterable(args):
         args = (args,)
 
-    xp = array_namespace(xl0)
-    xl0 = xp.asarray(xl0)[()]
-    if (not xp.isdtype(xl0.dtype, "numeric")
-        or xp.isdtype(xl0.dtype, "complex floating")):
-        raise ValueError('`xl0` must be numeric and real.')
-    if not xp.isdtype(xl0.dtype, "real floating"):
-        xl0 = xp.asarray(xl0, dtype=xp_default_dtype(xp))
+    xp = array_namespace(xl0, xr0, xmin, xmax, factor, *args)
 
     # If xr0 is not supplied, fill with a dummy value for the sake of
     # broadcasting. We need to wait until xmax has been validated to
@@ -33,8 +27,11 @@ def _bracket_root_iv(func, xl0, xr0, xmin, xmax, factor, args, maxiter):
     xmin = -xp.inf if xmin is None else xmin
     xmax = xp.inf if xmax is None else xmax
     factor = 2. if factor is None else factor
-    xl0, xr0, xmin, xmax, factor = xp.broadcast_arrays(
-        xl0, xp.asarray(xr0), xp.asarray(xmin), xp.asarray(xmax), xp.asarray(factor))
+    xl0, xr0, xmin, xmax, factor = xp_promote(
+        xl0, xr0, xmin, xmax, factor, broadcast=True, force_floating=True, xp=xp)
+
+    if not xp.isdtype(xl0.dtype, ('integral', 'real floating')):
+        raise ValueError('`xl0` must be numeric and real.')
 
     if (not xp.isdtype(xr0.dtype, "numeric")
         or xp.isdtype(xr0.dtype, "complex floating")):
@@ -425,13 +422,7 @@ def _bracket_minimum_iv(func, xm0, xl0, xr0, xmin, xmax, factor, args, maxiter):
     if not np.iterable(args):
         args = (args,)
 
-    xp = array_namespace(xm0)
-    xm0 = xp.asarray(xm0)[()]
-    if (not xp.isdtype(xm0.dtype, "numeric")
-        or xp.isdtype(xm0.dtype, "complex floating")):
-        raise ValueError('`xm0` must be numeric and real.')
-    if not xp.isdtype(xm0.dtype, "real floating"):
-        xm0 = xp.asarray(xm0, dtype=xp_default_dtype(xp))
+    xp = array_namespace(xm0, xl0, xr0, xmin, xmax, factor, *args)
 
     xmin = -xp.inf if xmin is None else xmin
     xmax = xp.inf if xmax is None else xmax
@@ -450,10 +441,12 @@ def _bracket_minimum_iv(func, xm0, xl0, xr0, xmin, xmax, factor, args, maxiter):
         xr0_not_supplied = True
 
     factor = 2.0 if factor is None else factor
-    xl0, xm0, xr0, xmin, xmax, factor = xp.broadcast_arrays(
-        xp.asarray(xl0), xm0, xp.asarray(xr0), xp.asarray(xmin),
-        xp.asarray(xmax), xp.asarray(factor)
-    )
+
+    xm0, xl0, xr0, xmin, xmax, factor = xp_promote(
+        xm0, xl0, xr0, xmin, xmax, factor, broadcast=True, force_floating=True, xp=xp)
+
+    if not xp.isdtype(xm0.dtype, ('integral', 'real floating')):
+        raise ValueError('`xm0` must be numeric and real.')
 
     if (not xp.isdtype(xl0.dtype, "numeric")
         or xp.isdtype(xl0.dtype, "complex floating")):
diff --git a/scipy/optimize/_cython_nnls.pyx b/scipy/optimize/_cython_nnls.pyx
deleted file mode 100644
index 53bfb82adf95..000000000000
--- a/scipy/optimize/_cython_nnls.pyx
+++ /dev/null
@@ -1,216 +0,0 @@
-# cython: boundscheck=False
-# cython: initializedcheck=False
-# cython: wraparound=False
-# cython: cdivision=True
-# cython: cpow=True
-
-
-__all__ = ['_nnls']
-
-from scipy.linalg.cython_lapack cimport dlarfgp, dlarf, dlartgp
-from scipy.linalg.cython_blas cimport dnrm2
-import numpy as np
-cimport numpy as cnp
-cnp.import_array()
-
-def _nnls(cnp.ndarray[cnp.float64_t, ndim=2] A_in,
-          cnp.ndarray[cnp.float64_t, ndim=1] b_in,
-          int maxiter):
-    # Make copies of the input to be mutated
-    cdef cnp.ndarray[cnp.float64_t, ndim=2, mode='c'] A = A_in.copy(order='C')
-    cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] b = b_in.copy()
-
-    cdef int m = <int>A.shape[0], n = <int>A.shape[1]
-    cdef int i = 0, ii = 0, ip = 0, iteration = 0, iz = 0, iz1 = 0, izmax = 0
-    cdef int j = 0, jj = 0, k = 0
-    cdef int col = 0, nrow = 0, nsetp = 0, one = 1, tmpint = 0
-    cdef double tau = 0.0, unorm = 0.0, ztest, tmp, alpha, beta, cc, ss, wmax, T
-    cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] w
-    cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] x
-    cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] work
-    cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] zz
-    cdef cnp.ndarray[cnp.int32_t, ndim=1, mode='c'] inds
-    cdef bint skip = False
-
-    inds = cnp.PyArray_Arange(0, n, 1, cnp.NPY_INT32)
-    w = cnp.PyArray_EMPTY(1, [n], cnp.NPY_FLOAT64, 0)
-    work = cnp.PyArray_EMPTY(1, [m], cnp.NPY_FLOAT64, 0)
-    x = cnp.PyArray_ZEROS(1, [n], cnp.NPY_FLOAT64, 0)
-    zz = cnp.PyArray_EMPTY(1, [m], cnp.NPY_FLOAT64, 0)
-
-    # Quit if all coefficients are already in the solution or if m columns of A
-    # have been triangularized.
-    while (iz1 < n) and (nsetp < m):
-        # simulating a goto from col independence check
-        if skip:
-            skip = False
-        else:
-            w[inds[iz1:]] = b[nrow:] @ A[nrow:, inds[iz1:]]
-
-        # Find the largest w[j] and its index.
-        wmax = 0.0
-        for col in range(iz1, n):
-            j = inds[col]
-            if w[j] > wmax:
-                wmax = w[j]
-                izmax = col
-        iz = izmax
-        j = inds[iz]
-
-        # If wmax <= 0.0, terminate since this is a KKT certificate.
-        if wmax <= 0.0:
-            break
-
-        # The sign of wmax is OK for j to be moved to set p. Begin the transformation
-        # and check new diagonal element to avoid near-linear dependence.
-        work[nrow:] = A[nrow:, j]
-        tmpint = m - nrow
-        # DLARFGP( N, ALPHA, X, INCX, TAU )
-        dlarfgp(&tmpint, &work[nrow], &work[nrow+1], &one, &tau)
-        beta = work[nrow]
-        work[nrow] = 1.
-        unorm = 0.0
-        if nsetp > 0:
-            unorm = dnrm2(&nsetp, &A[0, j], &n)
-
-        if ((unorm + abs(beta)*0.01) - unorm) > 0.0:
-            # Column j is sufficiently independent. Copy b into zz and solve for
-            # ztest which is the new prospective value for x[j].
-            zz[:] = b[:]
-            # dlarf(SIDE, M, N, V, INCV, TAU, C, LDC, WORK)
-            dlarf(<char*>'L', &tmpint, &one, &work[nrow], &one, &tau,
-                  &zz[nrow], &tmpint, &tmp)
-            ztest = zz[nrow] / beta
-
-            if ztest <= 0.0:
-                # reject column j as a candidate to be moved from set z to set p.
-                # Set w[j] to 0.0 and move to the next greatest entry in w.
-                w[j] = 0.0
-                skip = True
-                continue
-        else:
-            # Column j is not numerically independent, reject column j
-            w[j] = 0.0
-            skip = True
-            continue
-
-        # column j accepted
-        A[nrow, j] = beta
-        b[:] = zz[:]
-        inds[iz] = inds[iz1]
-        inds[iz1] = j
-        iz1 += 1
-        nsetp += 1
-
-        if iz1 < n:
-            # Apply the householder trafo to remaining columns
-            for col in inds[iz1:]:
-                zz[nrow:] = A[nrow:, col]
-                dlarf(<char*>'L', &tmpint, &one, &work[nrow], &one, &tau,
-                      &zz[nrow], &tmpint, &tmp)
-                A[nrow:, col] = zz[nrow:]
-
-        nrow += 1
-
-        if nsetp < m-1:
-            A[nrow:, j] = 0.0
-
-        w[j] = 0.0
-
-        # Solve triangular system, store in zz
-        zz[:] = b[:]
-        for k in range(nsetp):
-            ip = nsetp - k - 1
-            if k != 0:
-                for ii in range(ip+1):
-                    zz[ii] -= A[ii, jj]*zz[ip+1]
-            jj = inds[ip]
-            zz[ip] /= A[ip, jj]
-
-        # Inner loop
-        while True:
-            iteration += 1
-
-            if iteration == maxiter:
-                return x, 0.0, -1
-
-            # See if all new constrained coefficients are feasible
-            # otherwise compute alpha that should be in [0, 1]
-            alpha = 2.0
-            for ip in range(nsetp):
-                k = inds[ip]
-                if zz[ip] <= 0.0:
-                    T = -x[k] / (zz[ip]-x[k])
-                    if alpha > T:
-                        alpha = T
-                        jj = ip
-
-            # If all new constrained coefficients are feasible
-            # alpha is still 2 then exit otherwise interpolate
-            # between old x and zz.
-            if alpha == 2.0:
-                break
-
-            x[inds[:nsetp]] *= 1 - alpha
-            x[inds[:nsetp]] += alpha*zz[:nsetp]
-
-            # Modify A, B, and the indices to move coefficient
-            # i from set p to set z. While loop simulates a goto
-            i = inds[jj]
-            while True:
-                x[i] = 0.0
-
-                if jj != nsetp:
-                    jj += 1
-                    for j in range(jj, nsetp):
-                        ii = inds[j]
-                        inds[j-1] = ii
-                        dlartgp(&A[j-1, ii], &A[j, ii], &cc, &ss, &A[j-1, ii])
-                        A[j, ii] = 0.0
-                        # Apply Givens rotation to all cols except ii
-                        for col in range(n):
-                            if col != ii:
-                                tmp = A[j-1, col]
-                                A[j-1, col] = cc*tmp + ss*A[j, col]
-                                A[j, col] = -ss*tmp + cc*A[j, col]
-
-                        tmp = b[j-1]
-                        b[j-1] = cc*tmp + ss*b[j]
-                        b[j] = -ss*tmp + cc*b[j]
-
-                nrow -= 1
-                nsetp -= 1
-                iz1 -= 1
-                inds[iz1] = i
-
-                # See if remaining coefficients in set P are feasible
-                # since determination of alpha guarantees it. If still
-                # there are infeasible ones, they are due to numerical
-                # noise. Any that are nonpositive will be set to zero
-                # and moved from set p to set z.
-                for jj in range(nsetp):
-                    i = inds[jj]
-                    if x[i] <= 0.0:
-                        # numerical noise; back to top of while loop
-                        break
-                else:
-                    # No break; leave while loop
-                    break
-
-            zz[:] = b[:]
-            for k in range(nsetp):
-                ip = nsetp - k - 1
-                if k != 0:
-                    for ii in range(ip+1):
-                        zz[ii] -= A[ii, jj]*zz[ip+1]
-                jj = inds[ip]
-                zz[ip] /= A[ip, jj]
-
-            # Back to inner loop beginning
-
-        # Back in outer loop
-        x[inds[:nsetp]] = zz[:nsetp]
-
-        # Back to the outer loop beginning
-
-    return x, np.linalg.norm(b[nrow:]), 0
diff --git a/scipy/optimize/_differentiable_functions.py b/scipy/optimize/_differentiable_functions.py
index e990dae1ef15..c4115a1cfeb1 100644
--- a/scipy/optimize/_differentiable_functions.py
+++ b/scipy/optimize/_differentiable_functions.py
@@ -1,12 +1,11 @@
 from collections import namedtuple
-from functools import partial
 
 import numpy as np
 import scipy.sparse as sps
 from ._numdiff import approx_derivative, group_columns
 from ._hessian_update_strategy import HessianUpdateStrategy
 from scipy.sparse.linalg import LinearOperator
-from scipy._lib._array_api import array_namespace
+from scipy._lib._array_api import array_namespace, xp_copy
 from scipy._lib import array_api_extra as xpx
 from scipy._lib._util import _ScalarFunctionWrapper
 
@@ -406,8 +405,112 @@ def fun_and_grad(self, x):
         return self.f, self.g
 
 
-def _VectorFunWrapper(fun, x):
-    return np.atleast_1d(fun(x))
+class _VectorFunWrapper:  # callable wrapper: counts calls to `fun` and returns its output as at least 1-D
+    def __init__(self, fun):
+        self.fun = fun
+        self.nfev = 0  # number of times `fun` has been called through this wrapper
+
+    def __call__(self, x):
+        self.nfev += 1
+        return np.atleast_1d(self.fun(x))
+
+
+class _VectorJacWrapper:
+    """
+    Evaluate a Jacobian from a callable `jac` or by finite differences of `fun`.
+    """
+    def __init__(
+            self,
+            jac,  # a callable, or one of FD_METHODS to request finite differences
+            fun=None,  # only used on the finite-difference path
+            finite_diff_options=None,  # unpacked as keyword arguments into approx_derivative
+            sparse_jacobian=None  # truthy: results are always returned as csr_array
+    ):
+        self.fun = fun
+        self.jac = jac
+        self.finite_diff_options = finite_diff_options
+        self.sparse_jacobian = sparse_jacobian
+        # number of calls made to a callable `jac`
+        self.njev = 0
+        # number of function evaluations consumed by finite difference
+        self.nfev = 0
+
+    def __call__(self, x, f0=None, **kwds):  # `**kwds` is accepted but not used
+        # NOTE(review): `x` is forwarded as-is to `jac`/`approx_derivative`; no
+        # copy is made here, so a caller that needs `x` unchanged must copy it.
+        if callable(self.jac):
+            J = self.jac(x)
+            self.njev += 1
+        elif self.jac in FD_METHODS:  # NOTE(review): no else branch -- any other `jac` leaves J unbound below
+            J, dct = approx_derivative(  # assumes finite_diff_options requests a (J, info) pair -- TODO confirm
+                self.fun,
+                x,
+                f0=f0,
+                **self.finite_diff_options,
+            )
+            self.nfev += dct['nfev']
+        # Coerce the result: CSR if `sparse_jacobian` is truthy, otherwise densify sparse output.
+        if self.sparse_jacobian:
+            return sps.csr_array(J)
+        elif sps.issparse(J):
+            return J.toarray()
+        elif isinstance(J, LinearOperator):
+            return J
+        else:
+            return np.atleast_2d(J)
+
+
+class _VectorHessWrapper:
+    """
+    Wrapper class for Hessian calculation: calls `hess(x, v)` or differences `jac(x).T.dot(v)`.
+    """
+    def __init__(
+            self,
+            hess,  # a callable taking (x, v), or one of FD_METHODS
+            jac=None,  # only used on the finite-difference path
+            finite_diff_options=None,  # unpacked as keyword arguments into approx_derivative
+    ):
+        self.jac = jac
+        self.hess = hess
+        self.finite_diff_options = finite_diff_options
+        self.nhev = 0  # number of calls made to a callable `hess`
+        # number of jac evaluations consumed by finite difference
+        self.njev = 0
+
+    def __call__(self, x, v, J0=None, **kwds):  # `**kwds` is accepted but not used
+        # NOTE(review): `x` is forwarded as-is; no copy is made here. If `hess` is
+        # neither callable nor in FD_METHODS this method falls through and returns None.
+        if callable(self.hess):
+            self.nhev += 1
+            return self._callable_hess(x, v)
+        elif self.hess in FD_METHODS:
+            return self._fd_hess(x, v, J0=J0)
+
+    def _fd_hess(self, x, v, J0=None):
+        if J0 is None:  # no Jacobian supplied by the caller: evaluate it once here
+            J0 = self.jac(x)
+            self.njev += 1
+
+        # H will be a LinearOperator
+        H = approx_derivative(self.jac_dot_v, x,
+                              f0=J0.T.dot(v),
+                              args=(v,),
+                              **self.finite_diff_options)
+        return H
+
+    def jac_dot_v(self, x, v):  # the function that `_fd_hess` differentiates; counts each `jac` call
+        self.njev += 1
+        return self.jac(x).T.dot(v)
+
+    def _callable_hess(self, x, v):
+        H = self.hess(x, v)
+        # Normalise the result: sparse -> csr_array, LinearOperator unchanged, anything else -> 2-D ndarray.
+        if sps.issparse(H):
+            return sps.csr_array(H)
+        elif isinstance(H, LinearOperator):
+            return H
+        else:
+            return np.atleast_2d(np.asarray(H))
 
 
 class VectorFunction:
@@ -429,7 +532,8 @@ class VectorFunction:
     """
     def __init__(self, fun, x0, jac, hess,
                  finite_diff_rel_step=None, finite_diff_jac_sparsity=None,
-                 finite_diff_bounds=None, sparse_jacobian=None, workers=None):
+                 finite_diff_bounds=(-np.inf, np.inf), sparse_jacobian=None,
+                 workers=None):
         if not callable(jac) and jac not in FD_METHODS:
             raise ValueError(f"`jac` must be either callable or one of {FD_METHODS}.")
 
@@ -450,14 +554,19 @@ def __init__(self, fun, x0, jac, hess,
         if xp.isdtype(_x.dtype, "real floating"):
             _dtype = _x.dtype
 
-        # promotes to floating
+        # store original functions
+        self._orig_fun = fun
+        self._orig_jac = jac
+        self._orig_hess = hess
+
+        # promotes to floating, ensures that it's a copy
         self.x = xp.astype(_x, _dtype)
         self.x_dtype = _dtype
 
         self.n = self.x.size
-        self.nfev = 0
-        self.njev = 0
-        self.nhev = 0
+        self._nfev = 0
+        self._njev = 0
+        self._nhev = 0
         self.f_updated = False
         self.J_updated = False
         self.H_updated = False
@@ -492,125 +601,55 @@ def __init__(self, fun, x0, jac, hess,
                              "be estimated using one of the quasi-Newton "
                              "strategies.")
 
-        fun_wrapped = partial(_VectorFunWrapper, fun)
-
-        def update_fun():
-            self.nfev += 1
-            self.f = fun_wrapped(self.x)
-
-        self._update_fun_impl = update_fun
-        update_fun()
+        self.fun_wrapped = _VectorFunWrapper(fun)
+        self._update_fun()
 
         self.v = np.zeros_like(self.f)
         self.m = self.v.size
 
-        # Jacobian Evaluation
+        # Initial Jacobian Evaluation
         if callable(jac):
-            self.J = jac(self.x)
+            self.J = jac(xp_copy(self.x))
             self.J_updated = True
-            self.njev += 1
-
-            if (sparse_jacobian or
-                    sparse_jacobian is None and sps.issparse(self.J)):
-                def jac_wrapped(x):
-                    self.njev += 1
-                    return sps.csr_array(jac(x))
-                self.J = sps.csr_array(self.J)
-                self.sparse_jacobian = True
-
-            elif sps.issparse(self.J):
-                def jac_wrapped(x):
-                    self.njev += 1
-                    return jac(x).toarray()
-                self.J = self.J.toarray()
-                self.sparse_jacobian = False
-
-            else:
-                def jac_wrapped(x):
-                    self.njev += 1
-                    return np.atleast_2d(jac(x))
-                self.J = np.atleast_2d(self.J)
-                self.sparse_jacobian = False
-
-            def update_jac():
-                self.J = jac_wrapped(self.x)
-
+            self._njev += 1
         elif jac in FD_METHODS:
-            self.J, dct = approx_derivative(fun_wrapped, self.x, f0=self.f,
-                                            **finite_diff_options)
+            self.J, dct = approx_derivative(
+                self.fun_wrapped, self.x, f0=self.f, **finite_diff_options
+            )
             self.J_updated = True
-            self.nfev += dct['nfev']
-
-            if (sparse_jacobian or
-                    sparse_jacobian is None and sps.issparse(self.J)):
-                def update_jac():
-                    self._update_fun()
-                    self.J, dct = sps.csr_array(
-                        approx_derivative(fun_wrapped, self.x, f0=self.f,
-                                          **finite_diff_options))
-                    self.nfev += dct['nfev']
-                self.J = sps.csr_array(self.J)
-                self.sparse_jacobian = True
-
-            elif sps.issparse(self.J):
-                def update_jac():
-                    self._update_fun()
-                    self.J, dct = approx_derivative(fun_wrapped, self.x, f0=self.f,
-                                                    **finite_diff_options).toarray()
-                    self.nfev += dct['nfev']
-                self.J = self.J.toarray()
-                self.sparse_jacobian = False
+            self._nfev += dct['nfev']
+
+        self.sparse_jacobian = False
+        if (sparse_jacobian or
+                sparse_jacobian is None and sps.issparse(self.J)):
+            # something truthy was specified for sparse_jacobian,
+            # or it turns out that the Jacobian was sparse.
+            self.J = sps.csr_array(self.J)
+            self.sparse_jacobian = True
+        elif sps.issparse(self.J):
+            self.J = self.J.toarray()
+        elif isinstance(self.J, LinearOperator):
+            pass
+        else:
+            self.J = np.atleast_2d(self.J)
 
-            else:
-                def update_jac():
-                    self._update_fun()
-                    J, dct = approx_derivative(fun_wrapped, self.x, f0=self.f,
-                                               **finite_diff_options)
-                    self.J = np.atleast_2d(J)
-                    self.nfev += dct['nfev']
-                self.J = np.atleast_2d(self.J)
-                self.sparse_jacobian = False
+        self.jac_wrapped = _VectorJacWrapper(
+            jac,
+            fun=self.fun_wrapped,
+            finite_diff_options=finite_diff_options,
+            sparse_jacobian=self.sparse_jacobian
+        )
 
-        self._update_jac_impl = update_jac
+        self.hess_wrapped = _VectorHessWrapper(
+            hess, jac=self.jac_wrapped, finite_diff_options=finite_diff_options
+        )
 
         # Define Hessian
-        if callable(hess):
-            self.H = hess(self.x, self.v)
-            self.H_updated = True
-            self.nhev += 1
-
-            if sps.issparse(self.H):
-                def hess_wrapped(x, v):
-                    self.nhev += 1
-                    return sps.csr_array(hess(x, v))
-                self.H = sps.csr_array(self.H)
-
-            elif isinstance(self.H, LinearOperator):
-                def hess_wrapped(x, v):
-                    self.nhev += 1
-                    return hess(x, v)
-
-            else:
-                def hess_wrapped(x, v):
-                    self.nhev += 1
-                    return np.atleast_2d(np.asarray(hess(x, v)))
-                self.H = np.atleast_2d(np.asarray(self.H))
-
-            def update_hess():
-                self.H = hess_wrapped(self.x, self.v)
-        elif hess in FD_METHODS:
-            def jac_dot_v(x, v):
-                return jac_wrapped(x).T.dot(v)
-
-            def update_hess():
-                self._update_jac()
-                self.H = approx_derivative(jac_dot_v, self.x,
-                                           f0=self.J.T.dot(self.v),
-                                           args=(self.v,),
-                                           **finite_diff_options)
-
-            update_hess()
+        if callable(hess) or hess in FD_METHODS:
+            self.H = self.hess_wrapped(xp_copy(self.x), self.v, J0=self.J)
             self.H_updated = True
+            if callable(hess):
+                self._nhev += 1
         elif isinstance(hess, HessianUpdateStrategy):
             self.H = hess
             self.H.initialize(self.n, 'hess')
@@ -618,19 +657,26 @@ def update_hess():
             self.x_prev = None
             self.J_prev = None
 
-            def update_hess():
-                self._update_jac()
-                # When v is updated before x was updated, then x_prev and
-                # J_prev are None and we need this check.
-                if self.x_prev is not None and self.J_prev is not None:
-                    delta_x = self.x - self.x_prev
-                    delta_g = self.J.T.dot(self.v) - self.J_prev.T.dot(self.v)
-                    self.H.update(delta_x, delta_g)
+    @property
+    def nfev(self):
+        return self._nfev + self.jac_wrapped.nfev
 
-        self._update_hess_impl = update_hess
+    @property
+    def njev(self):
+        return self._njev + self.hess_wrapped.njev
 
-        if isinstance(hess, HessianUpdateStrategy):
-            def update_x(x):
+    @property
+    def nhev(self):
+        return self._nhev
+
+    def _update_v(self, v):
+        if not np.array_equal(v, self.v):
+            self.v = v
+            self.H_updated = False
+
+    def _update_x(self, x):
+        if not np.array_equal(x, self.x):
+            if isinstance(self._orig_hess, HessianUpdateStrategy):
                 self._update_jac()
                 self.x_prev = self.x
                 self.J_prev = self.J
@@ -640,48 +686,63 @@ def update_x(x):
                 self.J_updated = False
                 self.H_updated = False
                 self._update_hess()
-        else:
-            def update_x(x):
+            else:
                 _x = xpx.atleast_nd(self.xp.asarray(x), ndim=1, xp=self.xp)
                 self.x = self.xp.astype(_x, self.x_dtype)
                 self.f_updated = False
                 self.J_updated = False
                 self.H_updated = False
 
-        self._update_x_impl = update_x
-
-    def _update_v(self, v):
-        if not np.array_equal(v, self.v):
-            self.v = v
-            self.H_updated = False
-
-    def _update_x(self, x):
-        if not np.array_equal(x, self.x):
-            self._update_x_impl(x)
-
     def _update_fun(self):
         if not self.f_updated:
-            self._update_fun_impl()
+            self.f = self.fun_wrapped(xp_copy(self.x))
+            self._nfev += 1
             self.f_updated = True
 
     def _update_jac(self):
         if not self.J_updated:
-            self._update_jac_impl()
+            if self._orig_jac in FD_METHODS:
+                # need to update fun to get f0
+                self._update_fun()
+            else:
+                self._njev += 1
+
+            self.J = self.jac_wrapped(xp_copy(self.x), f0=self.f)
             self.J_updated = True
 
     def _update_hess(self):
         if not self.H_updated:
-            self._update_hess_impl()
+            if callable(self._orig_hess):
+                self.H = self.hess_wrapped(xp_copy(self.x), self.v)
+                self._nhev += 1
+            elif self._orig_hess in FD_METHODS:
+                self._update_jac()
+                self.H = self.hess_wrapped(xp_copy(self.x), self.v, J0=self.J)
+            elif isinstance(self._orig_hess, HessianUpdateStrategy):
+                self._update_jac()
+                # When v is updated before x was updated, then x_prev and
+                # J_prev are None and we need this check.
+                if self.x_prev is not None and self.J_prev is not None:
+                    delta_x = self.x - self.x_prev
+                    delta_g = self.J.T.dot(self.v) - self.J_prev.T.dot(self.v)
+                    self.H.update(delta_x, delta_g)
+
             self.H_updated = True
 
     def fun(self, x):
         self._update_x(x)
         self._update_fun()
-        return self.f
+        # returns a copy so that downstream can't overwrite the
+        # internal attribute
+        return xp_copy(self.f)
 
     def jac(self, x):
         self._update_x(x)
         self._update_jac()
+        if hasattr(self.J, "astype"):
+            # returns a copy so that downstream can't overwrite the
+            # internal attribute. But one can't copy a LinearOperator
+            return self.J.astype(self.J.dtype)
         return self.J
 
     def hess(self, x, v):
@@ -689,6 +750,10 @@ def hess(self, x, v):
         self._update_v(v)
         self._update_x(x)
         self._update_hess()
+        if hasattr(self.H, "astype"):
+            # returns a copy so that downstream can't overwrite the
+            # internal attribute. But one can't copy non-arrays
+            return self.H.astype(self.H.dtype)
         return self.H
 
 
diff --git a/scipy/optimize/_lbfgsb_py.py b/scipy/optimize/_lbfgsb_py.py
index 1883b31dbead..2d1f608f8357 100644
--- a/scipy/optimize/_lbfgsb_py.py
+++ b/scipy/optimize/_lbfgsb_py.py
@@ -323,11 +323,11 @@ def _minimize_lbfgsb(fun, x0, args=(), jac=None, bounds=None,
         If `jac is None` the absolute step size used for numerical
         approximation of the jacobian via forward differences.
     maxfun : int
-        Maximum number of function evaluations. Note that this function
-        may violate the limit because of evaluating gradients by numerical
-        differentiation.
+        Maximum number of function evaluations before minimization terminates.
+        Note that this function may violate the limit if the gradients
+        are evaluated by numerical differentiation.
     maxiter : int
-        Maximum number of iterations.
+        Maximum number of algorithm iterations.
     iprint : int, optional
         Deprecated option that previously controlled the text printed on the
         screen during the problem solution. Now the code does not emit any
@@ -359,6 +359,11 @@ def _minimize_lbfgsb(fun, x0, args=(), jac=None, bounds=None,
     relationship between the two is ``ftol = factr * numpy.finfo(float).eps``.
     I.e., `factr` multiplies the default machine floating-point precision to
     arrive at `ftol`.
+    If the minimization is slow to converge the optimizer may halt if the
+    total number of function evaluations exceeds `maxfun`, or the number of
+    algorithm iterations has reached `maxiter` (whichever comes first). If
+    this is the case then ``result.success=False``, and an appropriate
+    error message is contained in ``result.message``.
 
     """
     _check_unknown_options(unknown_options)
diff --git a/scipy/optimize/_lsq/dogbox.py b/scipy/optimize/_lsq/dogbox.py
index b986929626f2..7694c75d491b 100644
--- a/scipy/optimize/_lsq/dogbox.py
+++ b/scipy/optimize/_lsq/dogbox.py
@@ -308,7 +308,7 @@ def dogbox(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale,
 
             cost = cost_new
 
-            J = jac(x, f)
+            J = jac(x)
             njev += 1
 
             if loss_function is not None:
diff --git a/scipy/optimize/_lsq/least_squares.py b/scipy/optimize/_lsq/least_squares.py
index abd2aeca4073..0e32307b68f9 100644
--- a/scipy/optimize/_lsq/least_squares.py
+++ b/scipy/optimize/_lsq/least_squares.py
@@ -6,9 +6,11 @@
 
 from scipy.sparse.linalg import LinearOperator
 from scipy.optimize import _minpack, OptimizeResult
-from scipy.optimize._numdiff import approx_derivative, group_columns
+from scipy.optimize._differentiable_functions import VectorFunction
+from scipy.optimize._numdiff import group_columns
 from scipy.optimize._minimize import Bounds
 from scipy._lib._sparse import issparse
+from scipy._lib._array_api import array_namespace
 from scipy._lib._util import _workers_wrapper
 
 from .trf import trf
@@ -41,14 +43,9 @@
 }
 
 
-def call_minpack(fun, x0, jac, ftol, xtol, gtol, max_nfev, x_scale, diff_step):
+def call_minpack(fun, x0, jac, ftol, xtol, gtol, max_nfev, x_scale, jac_method=None):
     n = x0.size
 
-    if diff_step is None:
-        epsfcn = EPS
-    else:
-        epsfcn = diff_step**2
-
     # Compute MINPACK's `diag`, which is inverse of our `x_scale` and
     # ``x_scale='jac'`` corresponds to ``diag=None``.
     if isinstance(x_scale, str) and x_scale == 'jac':
@@ -60,33 +57,36 @@ def call_minpack(fun, x0, jac, ftol, xtol, gtol, max_nfev, x_scale, diff_step):
     col_deriv = False
     factor = 100.0
 
-    if jac is None:
-        if max_nfev is None:
-            # n squared to account for Jacobian evaluations.
-            max_nfev = 100 * n * (n + 1)
-        x, info, status = _minpack._lmdif(
-            fun, x0, (), full_output, ftol, xtol, gtol,
-            max_nfev, epsfcn, factor, diag)
-    else:
-        if max_nfev is None:
-            max_nfev = 100 * n
-        x, info, status = _minpack._lmder(
-            fun, jac, x0, (), full_output, col_deriv,
-            ftol, xtol, gtol, max_nfev, factor, diag)
+    if max_nfev is None:
+        max_nfev = 100 * n
 
-    f = info['fvec']
+    # lmder is typically used for systems with analytic jacobians, with lmdif being
+    # used if there is only an objective fun (lmdif uses finite differences to estimate
+    # jacobian). Otherwise they're very similar internally.
+    # We now do all the finite differencing in VectorFunction, which means we can drop
+    # lmdif and just use lmder.
 
-    if callable(jac):
-        J = jac(x)
-    else:
-        J = np.atleast_2d(approx_derivative(fun, x))
+    # for sending a copy of x0 into _lmder
+    xp = array_namespace(x0)
+
+    x, info, status = _minpack._lmder(
+        fun, jac, xp.astype(x0, x0.dtype), (), full_output, col_deriv,
+        ftol, xtol, gtol, max_nfev, factor, diag)
+
+    f = info['fvec']
+    J = jac(x)
 
     cost = 0.5 * np.dot(f, f)
     g = J.T.dot(f)
     g_norm = norm(g, ord=np.inf)
 
     nfev = info['nfev']
-    njev = info.get('njev', None)
+    if callable(jac_method):
+        # user supplied a callable ("analytic") jac
+        njev = info.get('njev', None)
+    else:
+        # If there are no analytic jacobian evaluations we need to set `njev=None`.
+        njev = None
 
     status = FROM_MINPACK_TO_COMMON[status]
     active_mask = np.zeros_like(x0, dtype=int)
@@ -242,6 +242,17 @@ def loss_function(f, cost_only=False):
     return loss_function
 
 
+class _WrapArgsKwargs:
+    # Supplies a user function with args and kwargs.
+    def __init__(self, f, args=(), kwargs=None):
+        self.f = f
+        self.args = args
+        self.kwargs = kwargs or {}
+
+    def __call__(self, x):
+        return self.f(x, *self.args, **self.kwargs)
+
+
 @_workers_wrapper
 def least_squares(
         fun, x0, jac='2-point', bounds=(-np.inf, np.inf), method='trf',
@@ -286,12 +297,16 @@ def least_squares(
         twice as many operations as '2-point' (default). The scheme 'cs'
         uses complex steps, and while potentially the most accurate, it is
         applicable only when `fun` correctly handles complex inputs and
-        can be analytically continued to the complex plane. Method 'lm'
-        always uses the '2-point' scheme. If callable, it is used as
+        can be analytically continued to the complex plane. If callable, it is used as
         ``jac(x, *args, **kwargs)`` and should return a good approximation
         (or the exact value) for the Jacobian as an array_like (np.atleast_2d
         is applied), a sparse array (csr_array preferred for performance) or
         a `scipy.sparse.linalg.LinearOperator`.
+
+        .. versionchanged:: 1.16.0
+            The '3-point' and 'cs' keywords can now be used with the 'lm' method.
+            Previously 'lm' was limited to '2-point' and callable.
+
     bounds : 2-tuple of array_like or `Bounds`, optional
         There are two ways to specify bounds:
 
@@ -390,13 +405,16 @@ def least_squares(
         no effect with ``loss='linear'``, but for other `loss` values it is
         of crucial importance.
     max_nfev : None or int, optional
-        Maximum number of function evaluations before the termination.
-        If None (default), the value is chosen automatically:
+        For all methods this parameter controls the maximum number of function
+        evaluations used by each method, separate to those used in numerical
+        approximation of the jacobian.
+        If None (default), the value is chosen automatically as 100 * n.
 
-        * For 'trf' and 'dogbox' : 100 * n.
-        * For 'lm' :  100 * n if `jac` is callable and 100 * n * (n + 1)
-          otherwise (because 'lm' counts function calls in Jacobian
-          estimation).
+        .. versionchanged:: 1.16.0
+            The default for the 'lm' method is changed to 100 * n, for both a callable
+            and a numerically estimated jacobian. Previously the default when using an
+            estimated jacobian was 100 * n * (n + 1), because the method included
+            evaluations used in the estimation.
 
     diff_step : None or array_like, optional
         Determines the relative step size for the finite difference
@@ -511,9 +529,14 @@ def least_squares(
             sequence of strictly feasible iterates and `active_mask` is
             determined within a tolerance threshold.
         nfev : int
-            Number of function evaluations done. Methods 'trf' and 'dogbox' do
-            not count function calls for numerical Jacobian approximation, as
-            opposed to 'lm' method.
+            Number of function evaluations done. This number does not include
+            the function calls used for numerical Jacobian approximation.
+
+            .. versionchanged:: 1.16.0
+                For the 'lm' method the number of function calls used in numerical
+                Jacobian approximation is no longer included. This is to bring all
+                methods into line.
+
         njev : int or None
             Number of Jacobian evaluations done. If numerical Jacobian
             approximation is used in 'lm' method, it is set to None.
@@ -541,8 +564,8 @@ def least_squares(
 
     Notes
     -----
-    Method 'lm' (Levenberg-Marquardt) calls a wrapper over least-squares
-    algorithms implemented in MINPACK (lmder, lmdif). It runs the
+    Method 'lm' (Levenberg-Marquardt) calls a wrapper over a least-squares
+    algorithm implemented in MINPACK (lmder). It runs the
     Levenberg-Marquardt algorithm formulated as a trust-region type algorithm.
     The implementation is based on paper [JJMore]_, it is very robust and
     efficient with a lot of smart tricks. It should be your first choice
@@ -861,15 +884,36 @@ def least_squares(
     if method == 'trf':
         x0 = make_strictly_feasible(x0, lb, ub)
 
-    if kwargs is None:
-        kwargs = {}
     if tr_options is None:
         tr_options = {}
 
-    def fun_wrapped(x):
-        return np.atleast_1d(fun(x, *args, **kwargs))
-
-    f0 = fun_wrapped(x0)
+    ###########################################################################
+    # assemble VectorFunction
+    ###########################################################################
+    # first wrap the args/kwargs
+    fun_wrapped = _WrapArgsKwargs(fun, args=args, kwargs=kwargs)
+    jac_wrapped = jac
+    if callable(jac):
+        jac_wrapped = _WrapArgsKwargs(jac, args=args, kwargs=kwargs)
+
+    def _dummy_hess(x, *args):
+        # we don't care about Hessian evaluations
+        return x
+
+    vector_fun = VectorFunction(
+        fun_wrapped,
+        x0,
+        jac_wrapped,
+        _dummy_hess,
+        finite_diff_rel_step=diff_step,
+        finite_diff_jac_sparsity=jac_sparsity,
+        finite_diff_bounds=bounds,
+        workers=workers
+    )
+    ###########################################################################
+
+    f0 = vector_fun.fun(x0)
+    J0 = vector_fun.jac(x0)
 
     if f0.ndim != 1:
         raise ValueError("`fun` must return at most 1-d array_like. "
@@ -897,82 +941,46 @@ def fun_wrapped(x):
     else:
         initial_cost = 0.5 * np.dot(f0, f0)
 
-    if callable(jac):
-        J0 = jac(x0, *args, **kwargs)
-
-        if issparse(J0):
-            J0 = J0.tocsr()
-
-            def jac_wrapped(x, _=None):
-                return jac(x, *args, **kwargs).tocsr()
-
-        elif isinstance(J0, LinearOperator):
-            def jac_wrapped(x, _=None):
-                return jac(x, *args, **kwargs)
-
-        else:
-            J0 = np.atleast_2d(J0)
-
-            def jac_wrapped(x, _=None):
-                return np.atleast_2d(jac(x, *args, **kwargs))
-
-    else:  # Estimate Jacobian by finite differences.
+    if not callable(jac):
+        # Estimate Jacobian by finite differences.
         if method == 'lm':
             if jac_sparsity is not None:
                 raise ValueError("method='lm' does not support "
                                  "`jac_sparsity`.")
-
-            if jac != '2-point':
-                warn(f"jac='{jac}' works equivalently to '2-point' for method='lm'.",
-                     stacklevel=2)
-
-            J0 = jac_wrapped = None
         else:
+            # this will raise a ValueError if the jac_sparsity isn't correct
+            _ = check_jac_sparsity(jac_sparsity, m, n)
+
             if jac_sparsity is not None and tr_solver == 'exact':
                 raise ValueError("tr_solver='exact' is incompatible "
                                  "with `jac_sparsity`.")
 
-            jac_sparsity = check_jac_sparsity(jac_sparsity, m, n)
+    if J0.shape != (m, n):
+        raise ValueError(
+            f"The return value of `jac` has wrong shape: expected {(m, n)}, "
+            f"actual {J0.shape}."
+        )
 
-            def jac_wrapped(x, f):
-                J = approx_derivative(fun, x, rel_step=diff_step, method=jac,
-                                      f0=f, bounds=bounds, args=args,
-                                      kwargs=kwargs, sparsity=jac_sparsity,
-                                      workers=workers)
-                if J.ndim != 2:  # J is guaranteed not sparse.
-                    J = np.atleast_2d(J)
+    if not isinstance(J0, np.ndarray):
+        if method == 'lm':
+            raise ValueError("method='lm' works only with dense "
+                             "Jacobian matrices.")
 
-                return J
+        if tr_solver == 'exact':
+            raise ValueError(
+                "tr_solver='exact' works only with dense "
+                "Jacobian matrices.")
 
-            J0 = jac_wrapped(x0, f0)
+    jac_scale = isinstance(x_scale, str) and x_scale == 'jac'
+    if isinstance(J0, LinearOperator) and jac_scale:
+        raise ValueError("x_scale='jac' can't be used when `jac` "
+                         "returns LinearOperator.")
 
-    if J0 is not None:
-        if J0.shape != (m, n):
-            raise ValueError(
-                f"The return value of `jac` has wrong shape: expected {(m, n)}, "
-                f"actual {J0.shape}."
-            )
-
-        if not isinstance(J0, np.ndarray):
-            if method == 'lm':
-                raise ValueError("method='lm' works only with dense "
-                                 "Jacobian matrices.")
-
-            if tr_solver == 'exact':
-                raise ValueError(
-                    "tr_solver='exact' works only with dense "
-                    "Jacobian matrices.")
-
-        jac_scale = isinstance(x_scale, str) and x_scale == 'jac'
-        if isinstance(J0, LinearOperator) and jac_scale:
-            raise ValueError("x_scale='jac' can't be used when `jac` "
-                             "returns LinearOperator.")
-
-        if tr_solver is None:
-            if isinstance(J0, np.ndarray):
-                tr_solver = 'exact'
-            else:
-                tr_solver = 'lsmr'
+    if tr_solver is None:
+        if isinstance(J0, np.ndarray):
+            tr_solver = 'exact'
+        else:
+            tr_solver = 'lsmr'
 
     # Wrap callback function.  If callback is None, callback_wrapped also is None
     callback_wrapped = _wrap_callback(callback)
@@ -981,11 +989,11 @@ def jac_wrapped(x, f):
         if callback is not None:
             warn("Callback function specified, but not supported with `lm` method.",
                  stacklevel=2)
-        result = call_minpack(fun_wrapped, x0, jac_wrapped, ftol, xtol, gtol,
-                              max_nfev, x_scale, diff_step)
+        result = call_minpack(vector_fun.fun, x0, vector_fun.jac, ftol, xtol, gtol,
+                              max_nfev, x_scale, jac_method=jac)
 
     elif method == 'trf':
-        result = trf(fun_wrapped, jac_wrapped, x0, f0, J0, lb, ub, ftol, xtol,
+        result = trf(vector_fun.fun, vector_fun.jac, x0, f0, J0, lb, ub, ftol, xtol,
                      gtol, max_nfev, x_scale, loss_function, tr_solver,
                      tr_options.copy(), verbose, callback=callback_wrapped)
 
@@ -997,7 +1005,7 @@ def jac_wrapped(x, f):
             tr_options = tr_options.copy()
             del tr_options['regularize']
 
-        result = dogbox(fun_wrapped, jac_wrapped, x0, f0, J0, lb, ub, ftol,
+        result = dogbox(vector_fun.fun, vector_fun.jac, x0, f0, J0, lb, ub, ftol,
                         xtol, gtol, max_nfev, x_scale, loss_function,
                         tr_solver, tr_options, verbose, callback=callback_wrapped)
 
diff --git a/scipy/optimize/_lsq/trf.py b/scipy/optimize/_lsq/trf.py
index c72fbfae00f0..f17ec17a68fa 100644
--- a/scipy/optimize/_lsq/trf.py
+++ b/scipy/optimize/_lsq/trf.py
@@ -373,7 +373,7 @@ def trf_bounds(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev,
 
             cost = cost_new
 
-            J = jac(x, f)
+            J = jac(x)
             njev += 1
 
             if loss_function is not None:
@@ -548,7 +548,7 @@ def trf_no_bounds(fun, jac, x0, f0, J0, ftol, xtol, gtol, max_nfev,
 
             cost = cost_new
 
-            J = jac(x, f)
+            J = jac(x)
             njev += 1
 
             if loss_function is not None:
diff --git a/scipy/optimize/_nnls.py b/scipy/optimize/_nnls.py
index ecd12ee2ede5..8b3c79a9fce9 100644
--- a/scipy/optimize/_nnls.py
+++ b/scipy/optimize/_nnls.py
@@ -1,5 +1,5 @@
 import numpy as np
-from ._cython_nnls import _nnls
+from ._slsqplib import nnls as _nnls
 from scipy._lib.deprecation import _deprecate_positional_args, _NoValue
 
 
@@ -72,11 +72,13 @@ def nnls(A, b, *, maxiter=None, atol=_NoValue):
     b = np.asarray_chkfinite(b, dtype=np.float64)
 
     if len(A.shape) != 2:
-        raise ValueError("Expected a two-dimensional array (matrix)" +
-                         f", but the shape of A is {A.shape}")
-    if len(b.shape) != 1:
-        raise ValueError("Expected a one-dimensional array (vector)" +
-                         f", but the shape of b is {b.shape}")
+        raise ValueError(f"Expected a 2D array, but the shape of A is {A.shape}")
+
+    if (b.ndim > 2) or ((b.ndim == 2) and (b.shape[1] != 1)):
+        raise ValueError("Expected a 1D array,(or 2D with one column), but the,"
+                         f" shape of b is {b.shape}")
+    elif (b.ndim == 2) and (b.shape[1] == 1):
+        b = b.ravel()
 
     m, n = A.shape
 
@@ -88,7 +90,7 @@ def nnls(A, b, *, maxiter=None, atol=_NoValue):
     if not maxiter:
         maxiter = 3*n
     x, rnorm, info = _nnls(A, b, maxiter)
-    if info == -1:
+    if info == 3:
         raise RuntimeError("Maximum number of iterations reached.")
 
     return x, rnorm
diff --git a/scipy/optimize/_optimize.py b/scipy/optimize/_optimize.py
index cc99dac630a5..dffffcd73be4 100644
--- a/scipy/optimize/_optimize.py
+++ b/scipy/optimize/_optimize.py
@@ -41,7 +41,7 @@
 from scipy._lib._util import (MapWrapper, check_random_state, _RichResult,
                               _call_callback_maybe_halt, _transition_to_rng)
 from scipy.optimize._differentiable_functions import ScalarFunction, FD_METHODS
-from scipy._lib._array_api import array_namespace, xp_capabilities
+from scipy._lib._array_api import array_namespace, xp_capabilities, xp_promote
 from scipy._lib import array_api_extra as xpx
 
 
@@ -382,9 +382,7 @@ def rosen(x):
     >>> plt.show()
     """
     xp = array_namespace(x)
-    x = xp.asarray(x)
-    if xp.isdtype(x.dtype, 'integral'):
-        x = xp.astype(x, xp.asarray(1.).dtype)
+    x = xp_promote(x, force_floating=True, xp=xp)
     r = xp.sum(100.0 * (x[1:] - x[:-1]**2.0)**2.0 + (1 - x[:-1])**2.0,
                axis=0, dtype=x.dtype)
     return r
@@ -419,9 +417,7 @@ def rosen_der(x):
 
     """
     xp = array_namespace(x)
-    x = xp.asarray(x)
-    if xp.isdtype(x.dtype, 'integral'):
-        x = xp.astype(x, xp.asarray(1.).dtype)
+    x = xp_promote(x, force_floating=True, xp=xp)
     xm = x[1:-1]
     xm_m1 = x[:-2]
     xm_p1 = x[2:]
@@ -465,9 +461,8 @@ def rosen_hess(x):
 
     """
     xp = array_namespace(x)
-    x = xpx.atleast_nd(x, ndim=1, xp=xp)
-    if xp.isdtype(x.dtype, 'integral'):
-        x = xp.astype(x, xp.asarray(1.).dtype)
+    x = xp_promote(x, force_floating=True, xp=xp)
+
     H = (xpx.create_diagonal(-400 * x[:-1], offset=1, xp=xp) 
          - xpx.create_diagonal(400 * x[:-1], offset=-1, xp=xp))
     diagonal = xp.zeros(x.shape[0], dtype=x.dtype)
@@ -510,9 +505,8 @@ def rosen_hess_prod(x, p):
 
     """
     xp = array_namespace(x, p)
+    x = xp_promote(x, force_floating=True, xp=xp)
     x = xpx.atleast_nd(x, ndim=1, xp=xp)
-    if xp.isdtype(x.dtype, 'integral'):
-        x = xp.astype(x, xp.asarray(1.).dtype)
     p = xp.asarray(p, dtype=x.dtype)
     Hp = xp.zeros(x.shape[0], dtype=x.dtype)
     Hp[0] = (1200 * x[0]**2 - 400 * x[1] + 2) * p[0] - 400 * x[0] * p[1]
diff --git a/scipy/optimize/_slsqp_py.py b/scipy/optimize/_slsqp_py.py
index 5c92a76d71f6..42e9fa3fee2e 100644
--- a/scipy/optimize/_slsqp_py.py
+++ b/scipy/optimize/_slsqp_py.py
@@ -16,9 +16,8 @@
 __all__ = ['approx_jacobian', 'fmin_slsqp']
 
 import numpy as np
-from scipy.optimize._slsqp import slsqp
-from numpy import (zeros, array, linalg, append, concatenate, finfo,
-                   sqrt, vstack, isfinite, atleast_1d)
+from ._slsqplib import slsqp
+from scipy.linalg import norm as lanorm
 from ._optimize import (OptimizeResult, _check_unknown_options,
                         _prepare_scalar_function, _clip_x_for_func,
                         _check_clip_x)
@@ -26,11 +25,11 @@
 from ._constraints import old_bound_to_new, _arr_to_scalar
 from scipy._lib._array_api import array_namespace
 from scipy._lib import array_api_extra as xpx
-
+from numpy.typing import NDArray
 
 __docformat__ = "restructuredtext en"
 
-_epsilon = sqrt(finfo(float).eps)
+_epsilon = np.sqrt(np.finfo(np.float64).eps)
 
 
 def approx_jacobian(x, func, epsilon, *args):
@@ -225,7 +224,11 @@ def _minimize_slsqp(func, x0, args=(), jac=None, bounds=None,
     Options
     -------
     ftol : float
-        Precision goal for the value of f in the stopping criterion.
+        Precision target for the value of f in the stopping criterion. This value
+        controls the final accuracy for checking various optimality conditions;
+        gradient of the lagrangian and absolute sum of the constraint violations
+        should be lower than ``ftol``. Similarly, the computed step size and the
+        objective function change are checked against this value. Default is 1e-6.
     eps : float
         Step size used for numerical approximation of the Jacobian.
     disp : bool
@@ -249,7 +252,6 @@ def _minimize_slsqp(func, x0, args=(), jac=None, bounds=None,
 
     """
     _check_unknown_options(unknown_options)
-    iter = maxiter - 1
     acc = ftol
     epsilon = eps
 
@@ -338,26 +340,15 @@ def cjac(x, *args):
 
     # Set the parameters that SLSQP will need
     # meq, mieq: number of equality and inequality constraints
-    meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args']))
+    meq = sum(map(len, [np.atleast_1d(c['fun'](x, *c['args']))
               for c in cons['eq']]))
-    mieq = sum(map(len, [atleast_1d(c['fun'](x, *c['args']))
+    mieq = sum(map(len, [np.atleast_1d(c['fun'](x, *c['args']))
                for c in cons['ineq']]))
     # m = The total number of constraints
     m = meq + mieq
-    # la = The number of constraints, or 1 if there are no constraints
-    la = array([1, m]).max()
     # n = The number of independent variables
     n = len(x)
 
-    # Define the workspaces for SLSQP
-    n1 = n + 1
-    mineq = m - meq + n1 + n1
-    len_w = (3*n1+m)*(n1+1)+(n1-meq+1)*(mineq+2) + 2*mineq+(n1+mineq)*(n1-meq) \
-            + 2*meq + n1 + ((n+1)*n)//2 + 2*m + 3*n + 3*n1 + 1
-    len_jw = mineq
-    w = zeros(len_w)
-    jw = zeros(len_jw)
-
     # Decompose bounds into xl and xu
     if bounds is None or len(bounds) == 0:
         xl = np.empty(n, dtype=float)
@@ -365,8 +356,8 @@ def cjac(x, *args):
         xl.fill(np.nan)
         xu.fill(np.nan)
     else:
-        bnds = array([(_arr_to_scalar(l), _arr_to_scalar(u))
-                      for (l, u) in bounds], float)
+        bnds = np.array([(_arr_to_scalar(lo), _arr_to_scalar(up))
+                      for (lo, up) in bounds], float)
         if bnds.shape[0] != n:
             raise IndexError('SLSQP Error: the length of bounds is not '
                              'compatible with that of x0.')
@@ -377,10 +368,10 @@ def cjac(x, *args):
         if bnderr.any():
             raise ValueError("SLSQP Error: lb > ub in bounds "
                              f"{', '.join(str(b) for b in bnderr)}.")
-        xl, xu = bnds[:, 0], bnds[:, 1]
+        xl, xu = bnds[:, 0].copy(), bnds[:, 1].copy()
 
-        # Mark infinite bounds with nans; the Fortran code understands this
-        infbnd = ~isfinite(bnds)
+        # Mark infinite bounds with nans; the C code expects this
+        infbnd = ~np.isfinite(bnds)
         xl[infbnd[:, 0]] = np.nan
         xu[infbnd[:, 1]] = np.nan
 
@@ -393,125 +384,180 @@ def cjac(x, *args):
     wrapped_fun = _clip_x_for_func(sf.fun, new_bounds)
     wrapped_grad = _clip_x_for_func(sf.grad, new_bounds)
 
-    # Initialize the iteration counter and the mode value
-    mode = array(0, int)
-    acc = array(acc, float)
-    majiter = array(iter, int)
-    majiter_prev = 0
-
-    # Initialize internal SLSQP state variables
-    alpha = array(0, float)
-    f0 = array(0, float)
-    gs = array(0, float)
-    h1 = array(0, float)
-    h2 = array(0, float)
-    h3 = array(0, float)
-    h4 = array(0, float)
-    t = array(0, float)
-    t0 = array(0, float)
-    tol = array(0, float)
-    iexact = array(0, int)
-    incons = array(0, int)
-    ireset = array(0, int)
-    itermx = array(0, int)
-    line = array(0, int)
-    n1 = array(0, int)
-    n2 = array(0, int)
-    n3 = array(0, int)
+    # Initialize internal SLSQP state variables dictionary
+    # This dictionary is passed to the SLSQP matching the C struct defined as
+    #
+    # struct SLSQP_static_vars {
+    #     double acc, alpha, f0, gs, h1, h2, h3, h4, t, t0, tol;
+    #     int exact, inconsistent, reset, iter, itermax, line, mode, meq;
+    # };
+    #
+    # exact : a dummy variable and should be kept 0 since the underlying code
+    #         always uses an inexact search.
+    # inconsistent: a boolean set to 1 if the linearized QP is not well-defined
+    #               while the original nonlinear problem is still solvable. Then
+    #               the problem is augmented with a regularizing dummy variable.
+    # reset: holds the count of resetting bfgs to identity matrix.
+    # iter  : the current iteration; itermax is the maximum number of iterations.
+    # line  : the current line search iteration.
+    # mode  : the exit mode of the solver.
+    # alpha, f0, gs, h1, h2, h3, h4, t, t0 : internal variables used by the solver.
+    #
+    # The dict holds the intermediate state of the solver. The keys are the same
+    # as the C struct members and will be modified in-place.
+    state_dict = {
+        "acc": acc,
+        "alpha": 0.0,
+        "f0": 0.0,
+        "gs": 0.0,
+        "h1": 0.0,
+        "h2": 0.0,
+        "h3": 0.0,
+        "h4": 0.0,
+        "t": 0.0,
+        "t0": 0.0,
+        "tol": 10.0*acc,
+        "exact": 0,
+        "inconsistent": 0,
+        "reset": 0,
+        "iter": 0,
+        "itermax": maxiter,
+        "line": 0,
+        "m": m,
+        "meq": meq,
+        "mode": 0,
+        "n": n
+    }
 
     # Print the header if iprint >= 2
     if iprint >= 2:
         print(f"{'NIT':>5} {'FC':>5} {'OBJFUN':>16} {'GNORM':>16}")
 
+    # Internal buffer and int array
+    indices = np.zeros([max(m + 2*n + 2, 1)], dtype=np.int32)
+
+    # The worst case workspace requirements for the buffer are:
+
+    # n*(n+1)//2 + m + 4*n + 3                                           # SLSQP
+    # (n+1)*(n+2) + (n+1)*meq + m + (mineq + 2*n + 2)*(n+1) +  3*n + 3   # LSQ
+    # mineq + 2n + 2 + 2*meq + (n+1) + (mineq + 3n + 3)*(n + 1 - meq)    # LSEI
+    # (mineq + 2n + 2 + 2)*(n + 2) + mineq + 2n + 2                      # LDP
+    # mineq + 2n + 2                                                     # NNLS
+
+    # If we sum them all up and simplify with the help of sympy, we get the following
+    buffer_size = (
+        n*(n+1)//2 + 3*m*n - (m + 5*n + 7)*meq + 9*m + 8*n*n + 35*n + meq*meq + 28
+    )
+    # If no inequality constraints are given, top up workspace for the missing
+    # terms.
+    if mieq == 0:
+        buffer_size += 2*n*(n + 1)
+    buffer = np.zeros(max(buffer_size, 1), dtype=np.float64)
+
     # mode is zero on entry, so call objective, constraints and gradients
     # there should be no func evaluations here because it's cached from
     # ScalarFunction
     fx = wrapped_fun(x)
-    g = append(wrapped_grad(x), 0.0)
-    c = _eval_constraint(x, cons)
-    a = _eval_con_normals(x, cons, la, n, m, meq, mieq)
+    g = wrapped_grad(x)
+
+    # Allocate the multiplier array both for constraints and user specified
+    # bounds (extra +2 is for a possible augmented problem).
+    mult = np.zeros([max(1, m + 2*n + 2)], dtype=np.float64)
+
+    # Allocate the constraints and normals once and repopulate as needed
+    C = np.zeros([max(1, m), n], dtype=np.float64, order='F')
+    d = np.zeros([max(1, m)], dtype=np.float64)
+    _eval_con_normals(C, x, cons, m, meq)
+    _eval_constraint(d, x, cons, m, meq)
 
-    while 1:
+    iter_prev = 0
+
+    while True:
         # Call SLSQP
-        slsqp(m, meq, x, xl, xu, fx, c, g, a, acc, majiter, mode, w, jw,
-              alpha, f0, gs, h1, h2, h3, h4, t, t0, tol,
-              iexact, incons, ireset, itermx, line,
-              n1, n2, n3)
+        slsqp(state_dict, fx, g, C, d, x, mult, xl, xu, buffer, indices)
 
-        if mode == 1:  # objective and constraint evaluation required
-            fx = wrapped_fun(x)
-            c = _eval_constraint(x, cons)
+        if state_dict['mode'] == 1:  # objective and constraint evaluation required
+            fx = sf.fun(x)
+            _eval_constraint(d, x, cons, m, meq)
 
-        if mode == -1:  # gradient evaluation required
-            g = append(wrapped_grad(x), 0.0)
-            a = _eval_con_normals(x, cons, la, n, m, meq, mieq)
+        if state_dict['mode'] == -1:  # gradient evaluation required
+            g = sf.grad(x)
+            _eval_con_normals(C, x, cons, m, meq)
 
-        if majiter > majiter_prev:
+        if state_dict['iter'] > iter_prev:
             # call callback if major iteration has incremented
             if callback is not None:
                 callback(np.copy(x))
 
             # Print the status of the current iterate if iprint > 2
             if iprint >= 2:
-                print(f"{majiter:5d} {sf.nfev:5d} {fx:16.6E} {linalg.norm(g):16.6E}")
+                print(f"{state_dict['iter']:5d} {sf.nfev:5d} "
+                      f"{fx:16.6E} {lanorm(g):16.6E}")
 
         # If exit mode is not -1 or 1, slsqp has completed
-        if abs(mode) != 1:
+        if abs(state_dict['mode']) != 1:
             break
 
-        majiter_prev = int(majiter)
+        iter_prev = state_dict['iter']
 
     # Optimization loop complete. Print status if requested
     if iprint >= 1:
-        print(exit_modes[int(mode)] + "    (Exit mode " + str(mode) + ')')
+        print(
+            exit_modes[state_dict['mode']] + f"    (Exit mode {state_dict['mode']})"
+        )
         print("            Current function value:", fx)
-        print("            Iterations:", majiter)
+        print("            Iterations:", state_dict['iter'])
         print("            Function evaluations:", sf.nfev)
         print("            Gradient evaluations:", sf.ngev)
 
-    return OptimizeResult(x=x, fun=fx, jac=g[:-1], nit=int(majiter),
-                          nfev=sf.nfev, njev=sf.ngev, status=int(mode),
-                          message=exit_modes[int(mode)], success=(mode == 0))
-
-
-def _eval_constraint(x, cons):
-    # Compute constraints
-    if cons['eq']:
-        c_eq = concatenate([atleast_1d(con['fun'](x, *con['args']))
-                            for con in cons['eq']])
-    else:
-        c_eq = zeros(0)
-
-    if cons['ineq']:
-        c_ieq = concatenate([atleast_1d(con['fun'](x, *con['args']))
-                             for con in cons['ineq']])
-    else:
-        c_ieq = zeros(0)
-
-    # Now combine c_eq and c_ieq into a single matrix
-    c = concatenate((c_eq, c_ieq))
-    return c
-
-
-def _eval_con_normals(x, cons, la, n, m, meq, mieq):
-    # Compute the normals of the constraints
-    if cons['eq']:
-        a_eq = vstack([con['jac'](x, *con['args'])
-                       for con in cons['eq']])
-    else:  # no equality constraint
-        a_eq = zeros((meq, n))
-
-    if cons['ineq']:
-        a_ieq = vstack([con['jac'](x, *con['args'])
-                        for con in cons['ineq']])
-    else:  # no inequality constraint
-        a_ieq = zeros((mieq, n))
-
-    # Now combine a_eq and a_ieq into a single a matrix
-    if m == 0:  # no constraints
-        a = zeros((la, n))
-    else:
-        a = vstack((a_eq, a_ieq))
-    a = concatenate((a, zeros([la, 1])), 1)
-
-    return a
+    return OptimizeResult(
+        x=x, fun=fx, jac=g, nit=state_dict['iter'], nfev=sf.nfev, njev=sf.ngev,
+        status=state_dict['mode'], message=exit_modes[state_dict['mode']],
+        success=(state_dict['mode'] == 0), multipliers=mult[:m]
+    )
+
+# The following functions modify their first input argument in-place.
+def _eval_constraint(d: NDArray, x: NDArray, cons: dict, m: int, meq: int):
+    if m == 0:
+        return
+
+    # The reason why we don't use regular increments with a sane for loop is that
+    # the constraint evaluations do not necessarily return scalars. Their
+    # output length needs to be taken into account while placing them in d.
+
+    if meq > 0:
+        row = 0
+        for con in cons['eq']:
+            temp = np.atleast_1d(con['fun'](x, *con['args'])).ravel()
+            d[row:row + len(temp)] = temp
+            row += len(temp)
+
+    if m > meq:
+        row = meq
+        for con in cons['ineq']:
+            temp = np.atleast_1d(con['fun'](x, *con['args'])).ravel()
+            d[row:row + len(temp)] = temp
+            row += len(temp)
+
+    return
+
+
+def _eval_con_normals(C: NDArray, x: NDArray, cons: dict, m: int, meq: int):
+    if m == 0:
+        return
+
+    if meq > 0:
+        row = 0
+        for con in cons['eq']:
+            temp = np.atleast_2d(con['jac'](x, *con['args']))
+            C[row:row + temp.shape[0], :] = temp
+            row += temp.shape[0]
+
+    if m > meq:
+        row = meq
+        for con in cons['ineq']:
+            temp = np.atleast_2d(con['jac'](x, *con['args']))
+            C[row:row + temp.shape[0], :] = temp
+            row += temp.shape[0]
+
+    return
diff --git a/scipy/optimize/meson.build b/scipy/optimize/meson.build
index 375f4c3aef79..d32cfe3f5ab2 100644
--- a/scipy/optimize/meson.build
+++ b/scipy/optimize/meson.build
@@ -62,6 +62,8 @@ py3.extension_module('_zeros',
   subdir: 'scipy/optimize'
 )
 
+
+# TODO: link to ILP64 LAPACK
 py3.extension_module('_lbfgsb',
   [
     '__lbfgsb.h',
@@ -85,16 +87,6 @@ py3.extension_module('_moduleTNC',
   subdir: 'scipy/optimize'
 )
 
-py3.extension_module('_slsqp',
-  [f2py_gen.process('slsqp/slsqp.pyf'), 'slsqp/slsqp_optmz.f'],
-  fortran_args: fortran_ignore_warnings,
-  link_args: version_link_args,
-  dependencies: [fortranobject_dep],
-  install: true,
-  link_language: 'fortran',
-  subdir: 'scipy/optimize'
-)
-
 py3.extension_module('_pava_pybind',
   ['_pava/pava_pybind.cpp'],
   include_directories: '_pava',
@@ -145,10 +137,14 @@ py3.extension_module('_bglu_dense',
   subdir: 'scipy/optimize'
 )
 
-py3.extension_module('_cython_nnls',
-  opt_gen.process('_cython_nnls.pyx'),
-  c_args: cython_c_args,
-  dependencies: np_dep,
+py3.extension_module('_slsqplib',
+  [
+    '__slsqp.h',
+    '__slsqp.c',
+    '__nnls.h',
+    '__nnls.c'
+  ],
+  dependencies: [lapack_dep, blas_dep, np_dep],
   link_args: version_link_args,
   install: true,
   subdir: 'scipy/optimize'
diff --git a/scipy/optimize/slsqp.py b/scipy/optimize/slsqp.py
index c2b77d2eb447..2b79d93a55d3 100644
--- a/scipy/optimize/slsqp.py
+++ b/scipy/optimize/slsqp.py
@@ -9,7 +9,6 @@
     'OptimizeResult',
     'fmin_slsqp',
     'slsqp',
-    'zeros',
 ]
 
 
diff --git a/scipy/optimize/slsqp/slsqp.pyf b/scipy/optimize/slsqp/slsqp.pyf
deleted file mode 100644
index 5799a4805a04..000000000000
--- a/scipy/optimize/slsqp/slsqp.pyf
+++ /dev/null
@@ -1,48 +0,0 @@
-!    -*- f90 -*-
-! Note: the context of this file is case sensitive.
-
-python module _slsqp ! in 
-    interface  ! in :slsqp
-        subroutine slsqp(m,meq,la,n,x,xl,xu,f,c,g,a,acc,iter,mode,w,l_w,jw,l_jw,alpha,f0,gs,h1,h2,h3,h4,t,t0,tol,iexact,incons,ireset,itermx,line,n1,n2,n3) ! in :slsqp:slsqp_optmz.f
-            integer :: m
-            integer :: meq
-            integer optional,check(len(c)>=la),depend(c) :: la=len(c)
-            integer optional,check(len(x)>=n),depend(x) :: n=len(x)
-            double precision dimension(n), intent(inout) :: x
-            double precision dimension(n),depend(n) :: xl
-            double precision dimension(n),depend(n) :: xu
-            double precision :: f
-            double precision dimension(la) :: c
-            double precision dimension(n + 1),depend(n) :: g
-            double precision dimension(la,n + 1),depend(la,n) :: a
-            double precision, intent(inout) :: acc
-            integer, intent(inout) :: iter
-            integer, intent(inout) :: mode
-            double precision dimension(l_w) :: w
-            integer optional,check(len(w)>=l_w),depend(w) :: l_w=len(w)
-            integer dimension(l_jw) :: jw
-            integer optional,check(len(jw)>=l_jw),depend(jw) :: l_jw=len(jw)
-            double precision, intent(inout) :: alpha
-            double precision, intent(inout) :: f0
-            double precision, intent(inout) :: gs
-            double precision, intent(inout) :: h1
-            double precision, intent(inout) :: h2
-            double precision, intent(inout) :: h3
-            double precision, intent(inout) :: h4
-            double precision, intent(inout) :: t
-            double precision, intent(inout) :: t0
-            double precision, intent(inout) :: tol
-            integer, intent(inout) :: iexact
-            integer, intent(inout) :: incons
-            integer, intent(inout) :: ireset
-            integer, intent(inout) :: itermx
-            integer, intent(inout) :: line
-            integer, intent(inout) :: n1
-            integer, intent(inout) :: n2
-            integer, intent(inout) :: n3
-        end subroutine slsqp
-    end interface 
-end python module slsqp
-
-! This file was auto-generated with f2py (version:2).
-! See http://cens.ioc.ee/projects/f2py2e/
diff --git a/scipy/optimize/slsqp/slsqp_optmz.f b/scipy/optimize/slsqp/slsqp_optmz.f
deleted file mode 100644
index b2e00a693e3b..000000000000
--- a/scipy/optimize/slsqp/slsqp_optmz.f
+++ /dev/null
@@ -1,2196 +0,0 @@
-C
-C      ALGORITHM 733, COLLECTED ALGORITHMS FROM ACM.
-C      TRANSACTIONS ON MATHEMATICAL SOFTWARE,
-C      VOL. 20, NO. 3, SEPTEMBER, 1994, PP. 262-281.
-C      https://doi.org/10.1145/192115.192124
-C
-C
-C      https://web.archive.org/web/20170106155705/http://permalink.gmane.org/gmane.comp.python.scientific.devel/6725
-C      ------
-C      From: Deborah Cotton <cotton@hq.acm.org>
-C      Date: Fri, 14 Sep 2007 12:35:55 -0500
-C      Subject: RE: Algorithm License requested
-C      To: Alan Isaac
-C
-C      Prof. Issac,
-C
-C      In that case, then because the author consents to [the ACM] releasing
-C      the code currently archived at http://www.netlib.org/toms/733 under the
-C      BSD license, the ACM hereby releases this code under the BSD license.
-C
-C      Regards,
-C
-C      Deborah Cotton, Copyright & Permissions
-C      ACM Publications
-C      2 Penn Plaza, Suite 701**
-C      New York, NY 10121-0701
-C      permissions@acm.org
-C      212.869.7440 ext. 652
-C      Fax. 212.869.0481
-C      ------
-C
-
-************************************************************************
-*                              optimizer                               *
-************************************************************************
-
-      SUBROUTINE slsqp (m, meq, la, n, x, xl, xu, f, c, g, a,
-     *                  acc, iter, mode, w, l_w, jw, l_jw,
-     *                  alpha, f0, gs, h1, h2, h3, h4, t, t0, tol,
-     *                  iexact, incons, ireset, itermx, line,
-     *                  n1, n2, n3)
-
-C   SLSQP       S EQUENTIAL  L EAST  SQ UARES  P ROGRAMMING
-C            TO SOLVE GENERAL NONLINEAR OPTIMIZATION PROBLEMS
-
-C***********************************************************************
-C*                                                                     *
-C*                                                                     *
-C*            A NONLINEAR PROGRAMMING METHOD WITH                      *
-C*            QUADRATIC  PROGRAMMING  SUBPROBLEMS                      *
-C*                                                                     *
-C*                                                                     *
-C*  THIS SUBROUTINE SOLVES THE GENERAL NONLINEAR PROGRAMMING PROBLEM   *
-C*                                                                     *
-C*            MINIMIZE    F(X)                                         *
-C*                                                                     *
-C*            SUBJECT TO  C (X) .EQ. 0  ,  J = 1,...,MEQ               *
-C*                         J                                           *
-C*                                                                     *
-C*                        C (X) .GE. 0  ,  J = MEQ+1,...,M             *
-C*                         J                                           *
-C*                                                                     *
-C*                        XL .LE. X .LE. XU , I = 1,...,N.             *
-C*                          I      I       I                           *
-C*                                                                     *
-C*  THE ALGORITHM IMPLEMENTS THE METHOD OF HAN AND POWELL              *
-C*  WITH BFGS-UPDATE OF THE B-MATRIX AND L1-TEST FUNCTION              *
-C*  WITHIN THE STEPLENGTH ALGORITHM.                                   *
-C*                                                                     *
-C*    PARAMETER DESCRIPTION:                                           *
-C*    ( * MEANS THIS PARAMETER WILL BE CHANGED DURING CALCULATION )    *
-C*                                                                     *
-C*    M              IS THE TOTAL NUMBER OF CONSTRAINTS, M .GE. 0      *
-C*    MEQ            IS THE NUMBER OF EQUALITY CONSTRAINTS, MEQ .GE. 0 *
-C*    LA             SEE A, LA .GE. MAX(M,1)                           *
-C*    N              IS THE NUMBER OF VARIBLES, N .GE. 1               *
-C*  * X()            X() STORES THE CURRENT ITERATE OF THE N VECTOR X  *
-C*                   ON ENTRY X() MUST BE INITIALIZED. ON EXIT X()     *
-C*                   STORES THE SOLUTION VECTOR X IF MODE = 0.         *
-C*    XL()           XL() STORES AN N VECTOR OF LOWER BOUNDS XL TO X.  *
-C*                   ELEMENTS MAY BE NAN TO INDICATE NO LOWER BOUND.   *
-C*    XU()           XU() STORES AN N VECTOR OF UPPER BOUNDS XU TO X.  *
-C*                   ELEMENTS MAY BE NAN TO INDICATE NO UPPER BOUND.   *
-C*    F              IS THE VALUE OF THE OBJECTIVE FUNCTION.           *
-C*    C()            C() STORES THE M VECTOR C OF CONSTRAINTS,         *
-C*                   EQUALITY CONSTRAINTS (IF ANY) FIRST.              *
-C*                   DIMENSION OF C MUST BE GREATER OR EQUAL LA,       *
-C*                   which must be GREATER OR EQUAL MAX(1,M).          *
-C*    G()            G() STORES THE N VECTOR G OF PARTIALS OF THE      *
-C*                   OBJECTIVE FUNCTION; DIMENSION OF G MUST BE        *
-C*                   GREATER OR EQUAL N+1.                             *
-C*    A(),LA,M,N     THE LA BY N + 1 ARRAY A() STORES                  *
-C*                   THE M BY N MATRIX A OF CONSTRAINT NORMALS.        *
-C*                   A() HAS FIRST DIMENSIONING PARAMETER LA,          *
-C*                   WHICH MUST BE GREATER OR EQUAL MAX(1,M).          *
-C*    F,C,G,A        MUST ALL BE SET BY THE USER BEFORE EACH CALL.     *
-C*  * ACC            ABS(ACC) CONTROLS THE FINAL ACCURACY.             *
-C*                   IF ACC .LT. ZERO AN EXACT LINESEARCH IS PERFORMED,*
-C*                   OTHERWISE AN ARMIJO-TYPE LINESEARCH IS USED.      *
-C*  * ITER           PRESCRIBES THE MAXIMUM NUMBER OF ITERATIONS.      *
-C*                   ON EXIT ITER INDICATES THE NUMBER OF ITERATIONS.  *
-C*  * MODE           MODE CONTROLS CALCULATION:                        *
-C*                   REVERSE COMMUNICATION IS USED IN THE SENSE THAT   *
-C*                   THE PROGRAM IS INITIALIZED BY MODE = 0; THEN IT IS*
-C*                   TO BE CALLED REPEATEDLY BY THE USER UNTIL A RETURN*
-C*                   WITH MODE .NE. IABS(1) TAKES PLACE.               *
-C*                   IF MODE = -1 GRADIENTS HAVE TO BE CALCULATED,     *
-C*                   WHILE WITH MODE = 1 FUNCTIONS HAVE TO BE CALCULATED
-C*                   MODE MUST NOT BE CHANGED BETWEEN SUBSEQUENT CALLS *
-C*                   OF SQP.                                           *
-C*                   EVALUATION MODES:                                 *
-C*        MODE = -1: GRADIENT EVALUATION, (G&A)                        *
-C*                0: ON ENTRY: INITIALIZATION, (F,G,C&A)               *
-C*                   ON EXIT : REQUIRED ACCURACY FOR SOLUTION OBTAINED *
-C*                1: FUNCTION EVALUATION, (F&C)                        *
-C*                                                                     *
-C*                   FAILURE MODES:                                    *
-C*                2: NUMBER OF EQUALITY CONSTRAINTS LARGER THAN N      *
-C*                3: MORE THAN 3*N ITERATIONS IN LSQ SUBPROBLEM        *
-C*                4: INEQUALITY CONSTRAINTS INCOMPATIBLE               *
-C*                5: SINGULAR MATRIX E IN LSQ SUBPROBLEM               *
-C*                6: SINGULAR MATRIX C IN LSQ SUBPROBLEM               *
-C*                7: RANK-DEFICIENT EQUALITY CONSTRAINT SUBPROBLEM HFTI*
-C*                8: POSITIVE DIRECTIONAL DERIVATIVE FOR LINESEARCH    *
-C*                9: MORE THAN ITER ITERATIONS IN SQP                  *
-C*             >=10: WORKING SPACE W OR JW TOO SMALL,                  *
-C*                   W SHOULD BE ENLARGED TO L_W=MODE/1000             *
-C*                   JW SHOULD BE ENLARGED TO L_JW=MODE-1000*L_W       *
-C*  * W(), L_W       W() IS A ONE DIMENSIONAL WORKING SPACE,           *
-C*                   THE LENGTH L_W OF WHICH SHOULD BE AT LEAST        *
-C*                   (3*N1+M)*(N1+1)                        for LSQ    *
-C*                  +(N1-MEQ+1)*(MINEQ+2) + 2*MINEQ         for LSI    *
-C*                  +(N1+MINEQ)*(N1-MEQ) + 2*MEQ + N1       for LSEI   *
-C*                  + N1*N/2 + 2*M + 3*N + 3*N1 + 1         for SLSQPB *
-C*                   with MINEQ = M - MEQ + 2*N1  &  N1 = N+1          *
-C*        NOTICE:    FOR PROPER DIMENSIONING OF W IT IS RECOMMENDED TO *
-C*                   COPY THE FOLLOWING STATEMENTS INTO THE HEAD OF    *
-C*                   THE CALLING PROGRAM (AND REMOVE THE COMMENT C)    *
-c#######################################################################
-C     INTEGER LEN_W, LEN_JW, M, N, N1, MEQ, MINEQ
-C     PARAMETER (M=... , MEQ=... , N=...  )
-C     PARAMETER (N1= N+1, MINEQ= M-MEQ+N1+N1)
-C     PARAMETER (LEN_W=
-c    $           (3*N1+M)*(N1+1)
-c    $          +(N1-MEQ+1)*(MINEQ+2) + 2*MINEQ
-c    $          +(N1+MINEQ)*(N1-MEQ) + 2*MEQ + N1
-c    $          +(N+1)*N/2 + 2*M + 3*N + 3*N1 + 1,
-c    $           LEN_JW=MINEQ)
-C     DOUBLE PRECISION W(LEN_W)
-C     INTEGER          JW(LEN_JW)
-c#######################################################################
-C*                   THE FIRST M+N+N*N1/2 ELEMENTS OF W MUST NOT BE    *
-C*                   CHANGED BETWEEN SUBSEQUENT CALLS OF SLSQP.        *
-C*                   ON RETURN W(1) ... W(M) CONTAIN THE MULTIPLIERS   *
-C*                   ASSOCIATED WITH THE GENERAL CONSTRAINTS, WHILE    *
-C*                   W(M+1) ... W(M+N(N+1)/2) STORE THE CHOLESKY FACTOR*
-C*                   L*D*L(T) OF THE APPROXIMATE HESSIAN OF THE        *
-C*                   LAGRANGIAN COLUMNWISE DENSE AS LOWER TRIANGULAR   *
-C*                   UNIT MATRIX L WITH D IN ITS 'DIAGONAL' and        *
-C*                   W(M+N(N+1)/2+N+2 ... W(M+N(N+1)/2+N+2+M+2N)       *
-C*                   CONTAIN THE MULTIPLIERS ASSOCIATED WITH ALL       *
-C*                   ALL CONSTRAINTS OF THE QUADRATIC PROGRAM FINDING  *
-C*                   THE SEARCH DIRECTION TO THE SOLUTION X*           *
-C*  * JW(), L_JW     JW() IS A ONE DIMENSIONAL INTEGER WORKING SPACE   *
-C*                   THE LENGTH L_JW OF WHICH SHOULD BE AT LEAST       *
-C*                   MINEQ                                             *
-C*                   with MINEQ = M - MEQ + 2*N1  &  N1 = N+1          *
-C*                                                                     *
-C*  THE USER HAS TO PROVIDE THE FOLLOWING SUBROUTINES:                 *
-C*     LDL(N,A,Z,SIG,W) :   UPDATE OF THE LDL'-FACTORIZATION.          *
-C*     LINMIN(A,B,F,TOL) :  LINESEARCH ALGORITHM IF EXACT = 1          *
-C*     LSQ(M,MEQ,LA,N,NC,C,D,A,B,XL,XU,X,LAMBDA,W,....) :              *
-C*                                                                     *
-C*        SOLUTION OF THE QUADRATIC PROGRAM                            *
-C*                QPSOL IS RECOMMENDED:                                *
-C*     PE GILL, W MURRAY, MA SAUNDERS, MH WRIGHT:                      *
-C*     USER'S GUIDE FOR SOL/QPSOL:                                     *
-C*     A FORTRAN PACKAGE FOR QUADRATIC PROGRAMMING,                    *
-C*     TECHNICAL REPORT SOL 83-7, JULY 1983                            *
-C*     DEPARTMENT OF OPERATIONS RESEARCH, STANFORD UNIVERSITY          *
-C*     STANFORD, CA 94305                                              *
-C*     QPSOL IS THE MOST ROBUST AND EFFICIENT QP-SOLVER                *
-C*     AS IT ALLOWS WARM STARTS WITH PROPER WORKING SETS               *
-C*                                                                     *
-C*     IF IT IS NOT AVAILABLE USE LSEI, A CONSTRAINT LINEAR LEAST      *
-C*     SQUARES SOLVER IMPLEMENTED USING THE SOFTWARE HFTI, LDP, NNLS   *
-C*     FROM C.L. LAWSON, R.J.HANSON: SOLVING LEAST SQUARES PROBLEMS,   *
-C*     PRENTICE HALL, ENGLEWOOD CLIFFS, 1974.                          *
-C*     LSEI COMES WITH THIS PACKAGE, together with all necessary SR's. *
-C*                                                                     *
-C*     TOGETHER WITH A COUPLE OF SUBROUTINES FROM BLAS LEVEL 1         *
-C*                                                                     *
-C*     SQP IS HEAD SUBROUTINE FOR BODY SUBROUTINE SQPBDY               *
-C*     IN WHICH THE ALGORITHM HAS BEEN IMPLEMENTED.                    *
-C*                                                                     *
-C*  IMPLEMENTED BY: DIETER KRAFT, DFVLR OBERPFAFFENHOFEN               *
-C*  as described in Dieter Kraft: A Software Package for               *
-C*                                Sequential Quadratic Programming     *
-C*                                DFVLR-FB 88-28, 1988                 *
-C*  which should be referenced if the user publishes results of SLSQP  *
-C*                                                                     *
-C*  DATE:           APRIL - OCTOBER, 1981.                             *
-C*  STATUS:         DECEMBER, 31-ST, 1984.                             *
-C*  STATUS:         MARCH   , 21-ST, 1987, REVISED TO FORTRAN 77       *
-C*  STATUS:         MARCH   , 20-th, 1989, REVISED TO MS-FORTRAN       *
-C*  STATUS:         APRIL   , 14-th, 1989, HESSE   in-line coded       *
-C*  STATUS:         FEBRUARY, 28-th, 1991, FORTRAN/2 Version 1.04      *
-C*                                         accepts Statement Functions *
-C*  STATUS:         MARCH   ,  1-st, 1991, tested with SALFORD         *
-C*                                         FTN77/386 COMPILER VERS 2.40*
-C*                                         in protected mode           *
-C*                                                                     *
-C***********************************************************************
-C*                                                                     *
-C*  Copyright 1991: Dieter Kraft, FHM                                  *
-C*                                                                     *
-C***********************************************************************
-
-      INTEGER          il, im, ir, is, iter, iu, iv, iw, ix, l_w, l_jw,
-     *                 jw(l_jw), la, m, meq, mineq, mode, n
-
-      DOUBLE PRECISION acc, a(la,n+1), c(la), f, g(n+1),
-     *                 x(n), xl(n), xu(n), w(l_w)
-
-      INTEGER          iexact, incons, ireset, itermx, line, n1, n2, n3
-
-      DOUBLE PRECISION alpha, f0, gs, h1, h2, h3, h4, t, t0, tol
-
-c     dim(W) =         N1*(N1+1) + MEQ*(N1+1) + MINEQ*(N1+1)  for LSQ
-c                    +(N1-MEQ+1)*(MINEQ+2) + 2*MINEQ          for LSI
-c                    +(N1+MINEQ)*(N1-MEQ) + 2*MEQ + N1        for LSEI
-c                    + N1*N/2 + 2*M + 3*N +3*N1 + 1           for SLSQPB
-c                      with MINEQ = M - MEQ + 2*N1  &  N1 = N+1
-
-C   CHECK LENGTH OF WORKING ARRAYS
-
-      n1 = n+1
-      mineq = m-meq+n1+n1
-      il = (3*n1+m)*(n1+1) +
-     .(n1-meq+1)*(mineq+2) + 2*mineq +
-     .(n1+mineq)*(n1-meq)  + 2*meq +
-     .n1*n/2 + 2*m + 3*n + 4*n1 + 1
-      im = MAX(mineq, n1-meq)
-      IF (l_w .LT. il .OR. l_jw .LT. im) THEN
-          mode = 1000*MAX(10,il)
-          mode = mode+MAX(10,im)
-          RETURN
-      ENDIF
-
-C   PREPARE DATA FOR CALLING SQPBDY  -  INITIAL ADDRESSES IN W
-
-      im = 1
-      il = im + MAX(1,m)
-      il = im + la
-      ix = il + n1*n/2 + 1
-      ir = ix + n
-      is = ir + n + n + MAX(1,m)
-      is = ir + n + n + la
-      iu = is + n1
-      iv = iu + n1
-      iw = iv + n1
-
-      CALL slsqpb  (m, meq, la, n, x, xl, xu, f, c, g, a, acc, iter,
-     * mode, w(ir), w(il), w(ix), w(im), w(is), w(iu), w(iv), w(iw), jw,
-     * alpha, f0, gs, h1, h2, h3, h4, t, t0, tol,
-     * iexact, incons, ireset, itermx, line,
-     * n1, n2, n3)
-
-      END
-
-      SUBROUTINE slsqpb (m, meq, la, n, x, xl, xu, f, c, g, a, acc,
-     *                   iter, mode, r, l, x0, mu, s, u, v, w, iw,
-     *                   alpha, f0, gs, h1, h2, h3, h4, t, t0, tol,
-     *                   iexact, incons, ireset, itermx, line,
-     *                   n1, n2, n3)
-
-C   NONLINEAR PROGRAMMING BY SOLVING SEQUENTIALLY QUADRATIC PROGRAMS
-
-C        -  L1 - LINE SEARCH,  POSITIVE DEFINITE  BFGS UPDATE  -
-
-C                      BODY SUBROUTINE FOR SLSQP
-
-      INTEGER          iw(*), i, iexact, incons, ireset, iter, itermx,
-     *                 k, j, la, line, m, meq, mode, n, n1, n2, n3
-      LOGICAL          badlin
-
-
-      DOUBLE PRECISION a(la,n+1), c(la), g(n+1), l((n+1)*(n+2)/2),
-     *                 mu(la), r(m+n+n+2), s(n+1), u(n+1), v(n+1), w(*),
-     *                 x(n), xl(n), xu(n), x0(n),
-     *                 ddot_sl, dnrm2_, linmin,
-     *                 acc, alfmin, alpha, f, f0, gs, h1, h2, h3, h4,
-     *                 hun, one, t, t0, ten, tol, two, ZERO
-
-c     dim(W) =         N1*(N1+1) + MEQ*(N1+1) + MINEQ*(N1+1)  for LSQ
-c                     +(N1-MEQ+1)*(MINEQ+2) + 2*MINEQ
-c                     +(N1+MINEQ)*(N1-MEQ) + 2*MEQ + N1       for LSEI
-c                      with MINEQ = M - MEQ + 2*N1  &  N1 = N+1
-
-      DATA             ZERO /0.0d0/, one /1.0d0/, alfmin /1.0d-1/,
-     *                 hun /1.0d+2/, ten /1.0d+1/, two /2.0d0/
-
-C     The badlin flag keeps track whether the SQP problem on the current
-C     iteration was inconsistent or not.
-      badlin = .false.
-
-      IF (mode) 260, 100, 220
-
-  100 itermx = iter
-      IF (acc.GE.ZERO) THEN
-          iexact = 0
-      ELSE
-          iexact = 1
-      ENDIF
-      acc = ABS(acc)
-      tol = ten*acc
-      iter = 0
-      ireset = 0
-      n1 = n + 1
-      n2 = n1*n/2
-      n3 = n2 + 1
-      s(1) = ZERO
-      mu(1) = ZERO
-      CALL dcopy_(n, s(1),  0, s,  1)
-      CALL dcopy_(m, mu(1), 0, mu, 1)
-
-C   RESET BFGS MATRIX
-
-  110 ireset = ireset + 1
-      IF (ireset.GT.5) GO TO 255
-      l(1) = ZERO
-      CALL dcopy_(n2, l(1), 0, l, 1)
-      j = 1
-      DO 120 i=1,n
-         l(j) = one
-         j = j + n1 - i
-  120 CONTINUE
-
-C   MAIN ITERATION : SEARCH DIRECTION, STEPLENGTH, LDL'-UPDATE
-
-  130 iter = iter + 1
-      mode = 9
-      IF (iter.GT.itermx) GO TO 330
-
-C   SEARCH DIRECTION AS SOLUTION OF QP - SUBPROBLEM
-
-      CALL dcopy_(n, xl, 1, u, 1)
-      CALL dcopy_(n, xu, 1, v, 1)
-      CALL daxpy_sl(n, -one, x, 1, u, 1)
-      CALL daxpy_sl(n, -one, x, 1, v, 1)
-      h4 = one
-      CALL lsq (m, meq, n , n3, la, l, g, a, c, u, v, s, r, w, iw, mode)
-
-C   AUGMENTED PROBLEM FOR INCONSISTENT LINEARIZATION
-C
-C   If it turns out that the original SQP problem is inconsistent,
-C   disallow termination with convergence on this iteration,
-C   even if the augmented problem was solved.
-
-      badlin = .false.
-      IF (mode.EQ.6) THEN
-          IF (n.EQ.meq) THEN
-              mode = 4
-          ENDIF
-      ENDIF
-      IF (mode.EQ.4) THEN
-          badlin = .true.
-          DO 140 j=1,m
-             IF (j.LE.meq) THEN
-                 a(j,n1) = -c(j)
-             ELSE
-                 a(j,n1) = MAX(-c(j),ZERO)
-             ENDIF
-  140     CONTINUE
-          s(1) = ZERO
-          CALL dcopy_(n, s(1), 0, s, 1)
-          h3 = ZERO
-          g(n1) = ZERO
-          l(n3) = hun
-          s(n1) = one
-          u(n1) = ZERO
-          v(n1) = one
-          incons = 0
-  150     CALL lsq (m, meq, n1, n3, la, l, g, a, c, u, v, s, r,
-     *              w, iw, mode)
-          h4 = one - s(n1)
-          IF (mode.EQ.4) THEN
-              l(n3) = ten*l(n3)
-              incons = incons + 1
-              IF (incons.GT.5) GO TO 330
-              GOTO 150
-          ELSE IF (mode.NE.1) THEN
-              GOTO 330
-          ENDIF
-      ELSE IF (mode.NE.1) THEN
-          GOTO 330
-      ENDIF
-
-C   UPDATE MULTIPLIERS FOR L1-TEST
-
-      DO 160 i=1,n
-         v(i) = g(i) - ddot_sl(m,a(1,i),1,r,1)
-  160 CONTINUE
-      f0 = f
-      CALL dcopy_(n, x, 1, x0, 1)
-      gs = ddot_sl(n, g, 1, s, 1)
-      h1 = ABS(gs)
-      h2 = ZERO
-      DO 170 j=1,m
-         IF (j.LE.meq) THEN
-             h3 = c(j)
-         ELSE
-             h3 = ZERO
-         ENDIF
-         h2 = h2 + MAX(-c(j),h3)
-         h3 = ABS(r(j))
-         mu(j) = MAX(h3,(mu(j)+h3)/two)
-         h1 = h1 + h3*ABS(c(j))
-  170 CONTINUE
-
-C   CHECK CONVERGENCE
-
-      mode = 0
-      IF (h1.LT.acc .AND. h2.LT.acc .AND. .NOT. badlin
-     *     .AND. f .EQ. f) GO TO 330
-      h1 = ZERO
-      DO 180 j=1,m
-         IF (j.LE.meq) THEN
-             h3 = c(j)
-         ELSE
-             h3 = ZERO
-         ENDIF
-         h1 = h1 + mu(j)*MAX(-c(j),h3)
-  180 CONTINUE
-      t0 = f + h1
-      h3 = gs - h1*h4
-      mode = 8
-      IF (h3.GE.ZERO) GO TO 110
-
-C   LINE SEARCH WITH AN L1-TESTFUNCTION
-
-      line = 0
-      alpha = one
-      IF (iexact.EQ.1) GOTO 210
-
-C   INEXACT LINESEARCH
-
-  190     line = line + 1
-          h3 = alpha*h3
-          CALL dscal_sl(n, alpha, s, 1)
-          CALL dcopy_(n, x0, 1, x, 1)
-          CALL daxpy_sl(n, one, s, 1, x, 1)
-          mode = 1
-          GO TO 330
-  200         IF (h1.LE.h3/ten .OR. line.GT.10) GO TO 240
-              alpha = MAX(h3/(two*(h3-h1)),alfmin)
-              GO TO 190
-
-C   EXACT LINESEARCH
-
-  210 IF (line.NE.3) THEN
-          alpha = linmin(line,alfmin,one,t,tol)
-          CALL dcopy_(n, x0, 1, x, 1)
-          CALL daxpy_sl(n, alpha, s, 1, x, 1)
-          mode = 1
-          GOTO 330
-      ENDIF
-      CALL dscal_sl(n, alpha, s, 1)
-      GOTO 240
-
-C   CALL FUNCTIONS AT CURRENT X
-
-  220     t = f
-          DO 230 j=1,m
-             IF (j.LE.meq) THEN
-                 h1 = c(j)
-             ELSE
-                 h1 = ZERO
-             ENDIF
-             t = t + mu(j)*MAX(-c(j),h1)
-  230     CONTINUE
-          h1 = t - t0
-          GOTO (200, 210) iexact+1
-
-C   CHECK CONVERGENCE
-
-  240 h3 = ZERO
-      DO 250 j=1,m
-         IF (j.LE.meq) THEN
-             h1 = c(j)
-         ELSE
-             h1 = ZERO
-         ENDIF
-         h3 = h3 + MAX(-c(j),h1)
-  250 CONTINUE
-      IF ((ABS(f-f0).LT.acc .OR. dnrm2_(n,s,1).LT.acc) .AND. h3.LT.acc
-     *     .AND. .NOT. badlin .AND. f .EQ. f)
-     *   THEN
-            mode = 0
-         ELSE
-            mode = -1
-         ENDIF
-      GO TO 330
-
-C   CHECK relaxed CONVERGENCE in case of positive directional derivative
-
-  255 CONTINUE
-      h3 = ZERO
-      DO 256 j=1,m
-         IF (j.LE.meq) THEN
-             h1 = c(j)
-         ELSE
-             h1 = ZERO
-         ENDIF
-         h3 = h3 + MAX(-c(j),h1)
-  256 CONTINUE
-      IF ((ABS(f-f0).LT.tol .OR. dnrm2_(n,s,1).LT.tol) .AND. h3.LT.tol
-     *     .AND. .NOT. badlin .AND. f .EQ. f)
-     *   THEN
-            mode = 0
-         ELSE
-            mode = 8
-         ENDIF
-      GO TO 330
-
-C   CALL JACOBIAN AT CURRENT X
-
-C   UPDATE CHOLESKY-FACTORS OF HESSIAN MATRIX BY MODIFIED BFGS FORMULA
-
-  260 DO 270 i=1,n
-         u(i) = g(i) - ddot_sl(m,a(1,i),1,r,1) - v(i)
-  270 CONTINUE
-
-C   L'*S
-
-      k = 0
-      DO 290 i=1,n
-         h1 = ZERO
-         k = k + 1
-         DO 280 j=i+1,n
-            k = k + 1
-            h1 = h1 + l(k)*s(j)
-  280    CONTINUE
-         v(i) = s(i) + h1
-  290 CONTINUE
-
-C   D*L'*S
-
-      k = 1
-      DO 300 i=1,n
-         v(i) = l(k)*v(i)
-         k = k + n1 - i
-  300 CONTINUE
-
-C   L*D*L'*S
-
-      DO 320 i=n,1,-1
-         h1 = ZERO
-         k = i
-         DO 310 j=1,i - 1
-            h1 = h1 + l(k)*v(j)
-            k = k + n - j
-  310    CONTINUE
-         v(i) = v(i) + h1
-  320 CONTINUE
-
-      h1 = ddot_sl(n,s,1,u,1)
-      h2 = ddot_sl(n,s,1,v,1)
-      h3 = 0.2d0*h2
-      IF (h1.LT.h3) THEN
-          h4 = (h2-h3)/(h2-h1)
-          h1 = h3
-          CALL dscal_sl(n, h4, u, 1)
-          CALL daxpy_sl(n, one-h4, v, 1, u, 1)
-      ENDIF
-      IF (h1.EQ.0 .or. h2.EQ.0) THEN
-C         Singular update: reset hessian.
-          GO TO 110
-      end if
-      CALL ldl(n, l, u, +one/h1, v)
-      CALL ldl(n, l, v, -one/h2, u)
-
-C   END OF MAIN ITERATION
-
-      GO TO 130
-
-C   END OF SLSQPB
-
-  330 END
-
-
-      SUBROUTINE lsq(m,meq,n,nl,la,l,g,a,b,xl,xu,x,y,w,jw,mode)
-
-C   MINIMIZE with respect to X
-
-C             ||E*X - F||
-C                                      1/2  T
-C   WITH UPPER TRIANGULAR MATRIX E = +D   *L ,
-
-C                                      -1/2  -1
-C                     AND VECTOR F = -D    *L  *G,
-
-C  WHERE THE UNIT LOWER TRIDIANGULAR MATRIX L IS STORED COLUMNWISE
-C  DENSE IN THE N*(N+1)/2 ARRAY L WITH VECTOR D STORED IN ITS
-C 'DIAGONAL' THUS SUBSTITUTING THE ONE-ELEMENTS OF L
-
-C   SUBJECT TO
-
-C             A(J)*X - B(J) = 0 ,         J=1,...,MEQ,
-C             A(J)*X - B(J) >=0,          J=MEQ+1,...,M,
-C             XL(I) <= X(I) <= XU(I),     I=1,...,N,
-C     ON ENTRY, THE USER HAS TO PROVIDE THE ARRAYS L, G, A, B, XL, XU.
-C     WITH DIMENSIONS: L(N*(N+1)/2), G(N), A(LA,N), B(M), XL(N), XU(N)
-C     THE WORKING ARRAY W MUST HAVE AT LEAST THE FOLLOWING DIMENSION:
-c     DIM(W) =        (3*N+M)*(N+1)                        for LSQ
-c                    +(N-MEQ+1)*(MINEQ+2) + 2*MINEQ        for LSI
-c                    +(N+MINEQ)*(N-MEQ) + 2*MEQ + N        for LSEI
-c                      with MINEQ = M - MEQ + 2*N
-C     ON RETURN, NO ARRAY WILL BE CHANGED BY THE SUBROUTINE.
-C     X     STORES THE N-DIMENSIONAL SOLUTION VECTOR
-C     Y     STORES THE VECTOR OF LAGRANGE MULTIPLIERS OF DIMENSION
-C           M+N+N (CONSTRAINTS+LOWER+UPPER BOUNDS)
-C     MODE  IS A SUCCESS-FAILURE FLAG WITH THE FOLLOWING MEANINGS:
-C          MODE=1: SUCCESSFUL COMPUTATION
-C               2: ERROR RETURN BECAUSE OF WRONG DIMENSIONS (N<1)
-C               3: ITERATION COUNT EXCEEDED BY NNLS
-C               4: INEQUALITY CONSTRAINTS INCOMPATIBLE
-C               5: MATRIX E IS NOT OF FULL RANK
-C               6: MATRIX C IS NOT OF FULL RANK
-C               7: RANK DEFECT IN HFTI
-
-c     coded            Dieter Kraft, april 1987
-c     revised                        march 1989
-
-
-      DOUBLE PRECISION l,g,a,b,w,xl,xu,x,y,
-     .                 diag,ZERO,one,ddot_sl,xnorm
-
-      INTEGER          jw(*),i,ic,id,ie,IF,ig,ih,il,im,ip,iu,iw,
-     .     i1,i2,i3,i4,la,m,meq,mineq,mode,m1,n,nl,n1,n2,n3,
-     .     nancnt,j
-
-      DIMENSION        a(la,n), b(la), g(n), l(nl),
-     .                 w(*), x(n), xl(n), xu(n), y(m+n+n)
-
-      DATA             ZERO/0.0d0/, one/1.0d0/
-
-      n1 = n + 1
-      mineq = m - meq
-      m1 = mineq + n + n
-
-c  determine whether to solve problem
-c  with inconsistent linerarization (n2=1)
-c  or not (n2=0)
-
-      n2 = n1*n/2 + 1
-      IF (n2.EQ.nl) THEN
-          n2 = 0
-      ELSE
-          n2 = 1
-      ENDIF
-      n3 = n-n2
-
-C  RECOVER MATRIX E AND VECTOR F FROM L AND G
-
-      i2 = 1
-      i3 = 1
-      i4 = 1
-      ie = 1
-      IF = n*n+1
-      DO 10 i=1,n3
-         i1 = n1-i
-         diag = SQRT (l(i2))
-         w(i3) = ZERO
-         CALL dcopy_ (i1  ,  w(i3), 0, w(i3), 1)
-         CALL dcopy_ (i1-n2, l(i2), 1, w(i3), n)
-         CALL dscal_sl (i1-n2,     diag, w(i3), n)
-         w(i3) = diag
-         w(IF-1+i) = (g(i) - ddot_sl(i-1, w(i4), 1, w(IF), 1))/diag
-         i2 = i2 + i1 - n2
-         i3 = i3 + n1
-         i4 = i4 + n
-   10 CONTINUE
-      IF (n2.EQ.1) THEN
-          w(i3) = l(nl)
-          w(i4) = ZERO
-          CALL dcopy_ (n3, w(i4), 0, w(i4), 1)
-          w(IF-1+n) = ZERO
-      ENDIF
-      CALL dscal_sl (n, - one, w(IF), 1)
-
-      ic = IF + n
-      id = ic + meq*n
-
-      IF (meq .GT. 0) THEN
-
-C  RECOVER MATRIX C FROM UPPER PART OF A
-
-          DO 20 i=1,meq
-              CALL dcopy_ (n, a(i,1), la, w(ic-1+i), meq)
-   20     CONTINUE
-
-C  RECOVER VECTOR D FROM UPPER PART OF B
-
-          CALL dcopy_ (meq, b(1), 1, w(id), 1)
-          CALL dscal_sl (meq,   - one, w(id), 1)
-
-      ENDIF
-
-      ig = id + meq
-
-C  RECOVER MATRIX G FROM LOWER PART OF A
-C  The matrix G(mineq+2*n,m1) is stored at w(ig)
-C  Not all rows will be filled if some of the upper/lower
-C  bounds are unbounded.
-
-      IF (mineq .GT. 0) THEN
-
-          DO 30 i=1,mineq
-              CALL dcopy_ (n, a(meq+i,1), la, w(ig-1+i), m1)
-   30     CONTINUE
-
-      ENDIF
-
-      ih = ig + m1*n
-      iw = ih + mineq + 2*n
-
-      IF (mineq .GT. 0) THEN
-
-C  RECOVER H FROM LOWER PART OF B
-C  The vector H(mineq+2*n) is stored at w(ih)
-
-          CALL dcopy_ (mineq, b(meq+1), 1, w(ih), 1)
-          CALL dscal_sl (mineq,       - one, w(ih), 1)
-
-      ENDIF
-
-C  AUGMENT MATRIX G BY +I AND -I, AND,
-C  AUGMENT VECTOR H BY XL AND XU
-C  NaN value indicates no bound
-
-      ip = ig + mineq
-      il = ih + mineq
-      nancnt = 0
-
-      DO 40 i=1,n
-         if (xl(i).eq.xl(i)) then
-            w(il) = xl(i)
-            do 41 j=1,n
-               w(ip + m1*(j-1)) = 0
- 41         continue
-            w(ip + m1*(i-1)) = 1
-            ip = ip + 1
-            il = il + 1
-         else
-            nancnt = nancnt + 1
-         end if
-   40 CONTINUE
-
-      DO 50 i=1,n
-         if (xu(i).eq.xu(i)) then
-            w(il) = -xu(i)
-            do 51 j=1,n
-               w(ip + m1*(j-1)) = 0
- 51         continue
-            w(ip + m1*(i-1)) = -1
-            ip = ip + 1
-            il = il + 1
-         else
-            nancnt = nancnt + 1
-         end if
- 50   CONTINUE
-
-      CALL lsei (w(ic), w(id), w(ie), w(IF), w(ig), w(ih), MAX(1,meq),
-     .           meq, n, n, m1, m1-nancnt, n, x, xnorm, w(iw), jw, mode)
-
-      IF (mode .EQ. 1) THEN
-
-c   restore Lagrange multipliers (only for user-defined variables)
-
-          CALL dcopy_ (m,  w(iw),     1, y(1),      1)
-
-c   set rest of the multipliers to nan (they are not used)
-
-          IF (n3 .GT. 0) THEN
-             y(m+1) = 0
-             y(m+1) = 0 / y(m+1)
-             do 60 i=m+2,m+n3+n3
-                y(i) = y(m+1)
- 60          continue
-          ENDIF
-
-      ENDIF
-      call bound(n, x, xl, xu)
-
-C   END OF SUBROUTINE LSQ
-
-      END
-
-
-      SUBROUTINE lsei(c,d,e,f,g,h,lc,mc,LE,me,lg,mg,n,x,xnrm,w,jw,mode)
-
-C     FOR MODE=1, THE SUBROUTINE RETURNS THE SOLUTION X OF
-C     EQUALITY & INEQUALITY CONSTRAINED LEAST SQUARES PROBLEM LSEI :
-
-C                MIN ||E*X - F||
-C                 X
-
-C                S.T.  C*X  = D,
-C                      G*X >= H.
-
-C     USING QR DECOMPOSITION & ORTHOGONAL BASIS OF NULLSPACE OF C
-C     CHAPTER 23.6 OF LAWSON & HANSON: SOLVING LEAST SQUARES PROBLEMS.
-
-C     THE FOLLOWING DIMENSIONS OF THE ARRAYS DEFINING THE PROBLEM
-C     ARE NECESSARY
-C     DIM(E) :   FORMAL (LE,N),    ACTUAL (ME,N)
-C     DIM(F) :   FORMAL (LE  ),    ACTUAL (ME  )
-C     DIM(C) :   FORMAL (LC,N),    ACTUAL (MC,N)
-C     DIM(D) :   FORMAL (LC  ),    ACTUAL (MC  )
-C     DIM(G) :   FORMAL (LG,N),    ACTUAL (MG,N)
-C     DIM(H) :   FORMAL (LG  ),    ACTUAL (MG  )
-C     DIM(X) :   FORMAL (N   ),    ACTUAL (N   )
-C     DIM(W) :   2*MC+ME+(ME+MG)*(N-MC)  for LSEI
-C              +(N-MC+1)*(MG+2)+2*MG     for LSI
-C     DIM(JW):   MAX(MG,L)
-C     ON ENTRY, THE USER HAS TO PROVIDE THE ARRAYS C, D, E, F, G, AND H.
-C     ON RETURN, ALL ARRAYS WILL BE CHANGED BY THE SUBROUTINE.
-C     X     STORES THE SOLUTION VECTOR
-C     XNORM STORES THE RESIDUUM OF THE SOLUTION IN EUCLIDIAN NORM
-C     W     STORES THE VECTOR OF LAGRANGE MULTIPLIERS IN ITS FIRST
-C           MC+MG ELEMENTS
-C     MODE  IS A SUCCESS-FAILURE FLAG WITH THE FOLLOWING MEANINGS:
-C          MODE=1: SUCCESSFUL COMPUTATION
-C               2: ERROR RETURN BECAUSE OF WRONG DIMENSIONS (N<1)
-C               3: ITERATION COUNT EXCEEDED BY NNLS
-C               4: INEQUALITY CONSTRAINTS INCOMPATIBLE
-C               5: MATRIX E IS NOT OF FULL RANK
-C               6: MATRIX C IS NOT OF FULL RANK
-C               7: RANK DEFECT IN HFTI
-
-C     18.5.1981, DIETER KRAFT, DFVLR OBERPFAFFENHOFEN
-C     20.3.1987, DIETER KRAFT, DFVLR OBERPFAFFENHOFEN
-
-      INTEGER          jw(*),i,ie,IF,ig,iw,j,k,krank,l,lc,LE,lg,
-     .                 mc,mc1,me,mg,mode,n
-
-      DOUBLE PRECISION c(lc,n),e(LE,n),g(lg,n),d(lc),f(LE),h(lg),x(n),
-     .                 w(*),t,ddot_sl,xnrm,rnorm(1),dnrm2_,epmach,ZERO
-      DATA             epmach/2.22d-16/,ZERO/0.0d+00/
-
-      mode=2
-      IF(mc.GT.n)                      GOTO 75
-      l=n-mc
-      mc1=mc+1
-      iw=(l+1)*(mg+2)+2*mg+mc
-      ie=iw+mc+1
-      IF=ie+me*l
-      ig=IF+me
-
-C  TRIANGULARIZE C AND APPLY FACTORS TO E AND G
-
-      DO 10 i=1,mc
-          j=MIN(i+1,lc)
-          CALL h12(1,i,i+1,n,c(i,1),lc,w(iw+i),c(j,1),lc,1,mc-i)
-          CALL h12(2,i,i+1,n,c(i,1),lc,w(iw+i),e     ,LE,1,me)
-   10     CALL h12(2,i,i+1,n,c(i,1),lc,w(iw+i),g     ,lg,1,mg)
-
-C  SOLVE C*X=D AND MODIFY F
-
-      mode=6
-      DO 15 i=1,mc
-          IF(ABS(c(i,i)).LT.epmach)    GOTO 75
-          x(i)=(d(i)-ddot_sl(i-1,c(i,1),lc,x,1))/c(i,i)
-   15 CONTINUE
-      mode=1
-      w(mc1) = ZERO
-      CALL dcopy_ (mg-mc,w(mc1),0,w(mc1),1)
-
-      IF(mc.EQ.n)                      GOTO 50
-
-      DO 20 i=1,me
-   20     w(IF-1+i)=f(i)-ddot_sl(mc,e(i,1),LE,x,1)
-
-C  STORE TRANSFORMED E & G
-
-      DO 25 i=1,me
-   25     CALL dcopy_(l,e(i,mc1),LE,w(ie-1+i),me)
-      DO 30 i=1,mg
-   30     CALL dcopy_(l,g(i,mc1),lg,w(ig-1+i),mg)
-
-      IF(mg.GT.0)                      GOTO 40
-
-C  SOLVE LS WITHOUT INEQUALITY CONSTRAINTS
-
-      mode=7
-      k=MAX(LE,n)
-      t=SQRT(epmach)
-      CALL hfti (w(ie),me,me,l,w(IF),k,1,t,krank,rnorm,w,w(l+1),jw)
-C  HFTI IS MORE GENERIC, BUT WE ONLY CALL IT WITH NB=1, SO RETRIEVE THE
-C  SINGLE VALUE WE NEED FROM RNORM HERE
-      xnrm = rnorm(1)
-      CALL dcopy_(l,w(IF),1,x(mc1),1)
-      IF(krank.NE.l)                   GOTO 75
-      mode=1
-                                       GOTO 50
-C  MODIFY H AND SOLVE INEQUALITY CONSTRAINED LS PROBLEM
-
-   40 DO 45 i=1,mg
-   45     h(i)=h(i)-ddot_sl(mc,g(i,1),lg,x,1)
-      CALL lsi
-     . (w(ie),w(IF),w(ig),h,me,me,mg,mg,l,x(mc1),xnrm,w(mc1),jw,mode)
-      IF(mc.EQ.0)                      GOTO 75
-      t=dnrm2_(mc,x,1)
-      xnrm=SQRT(xnrm*xnrm+t*t)
-      IF(mode.NE.1)                    GOTO 75
-
-C  SOLUTION OF ORIGINAL PROBLEM AND LAGRANGE MULTIPLIERS
-
-   50 DO 55 i=1,me
-   55     f(i)=ddot_sl(n,e(i,1),LE,x,1)-f(i)
-      DO 60 i=1,mc
-   60     d(i)=ddot_sl(me,e(1,i),1,f,1)-ddot_sl(mg,g(1,i),1,w(mc1),1)
-
-      DO 65 i=mc,1,-1
-   65     CALL h12(2,i,i+1,n,c(i,1),lc,w(iw+i),x,1,1,1)
-
-      DO 70 i=mc,1,-1
-          j=MIN(i+1,lc)
-          w(i)=(d(i)-ddot_sl(mc-i,c(j,i),1,w(j),1))/c(i,i)
-   70 CONTINUE
-
-C  END OF SUBROUTINE LSEI
-
-   75                                  END
-
-
-      SUBROUTINE lsi(e,f,g,h,LE,me,lg,mg,n,x,xnorm,w,jw,mode)
-
-C     FOR MODE=1, THE SUBROUTINE RETURNS THE SOLUTION X OF
-C     INEQUALITY CONSTRAINED LINEAR LEAST SQUARES PROBLEM:
-
-C                    MIN ||E*X-F||
-C                     X
-
-C                    S.T.  G*X >= H
-
-C     THE ALGORITHM IS BASED ON QR DECOMPOSITION AS DESCRIBED IN
-C     CHAPTER 23.5 OF LAWSON & HANSON: SOLVING LEAST SQUARES PROBLEMS
-
-C     THE FOLLOWING DIMENSIONS OF THE ARRAYS DEFINING THE PROBLEM
-C     ARE NECESSARY
-C     DIM(E) :   FORMAL (LE,N),    ACTUAL (ME,N)
-C     DIM(F) :   FORMAL (LE  ),    ACTUAL (ME  )
-C     DIM(G) :   FORMAL (LG,N),    ACTUAL (MG,N)
-C     DIM(H) :   FORMAL (LG  ),    ACTUAL (MG  )
-C     DIM(X) :   N
-C     DIM(W) :   (N+1)*(MG+2) + 2*MG
-C     DIM(JW):   LG
-C     ON ENTRY, THE USER HAS TO PROVIDE THE ARRAYS E, F, G, AND H.
-C     ON RETURN, ALL ARRAYS WILL BE CHANGED BY THE SUBROUTINE.
-C     X     STORES THE SOLUTION VECTOR
-C     XNORM STORES THE RESIDUUM OF THE SOLUTION IN EUCLIDIAN NORM
-C     W     STORES THE VECTOR OF LAGRANGE MULTIPLIERS IN ITS FIRST
-C           MG ELEMENTS
-C     MODE  IS A SUCCESS-FAILURE FLAG WITH THE FOLLOWING MEANINGS:
-C          MODE=1: SUCCESSFUL COMPUTATION
-C               2: ERROR RETURN BECAUSE OF WRONG DIMENSIONS (N<1)
-C               3: ITERATION COUNT EXCEEDED BY NNLS
-C               4: INEQUALITY CONSTRAINTS INCOMPATIBLE
-C               5: MATRIX E IS NOT OF FULL RANK
-
-C     03.01.1980, DIETER KRAFT: CODED
-C     20.03.1987, DIETER KRAFT: REVISED TO FORTRAN 77
-
-      INTEGER          i,j,LE,lg,me,mg,mode,n,jw(lg)
-
-      DOUBLE PRECISION e(LE,n),f(LE),g(lg,n),h(lg),x(n),w(*),
-     .                 ddot_sl,xnorm,dnrm2_,epmach,t,one
-      DATA             epmach/2.22d-16/,one/1.0d+00/
-
-C  QR-FACTORS OF E AND APPLICATION TO F
-
-      DO 10 i=1,n
-      j=MIN(i+1,n)
-      CALL h12(1,i,i+1,me,e(1,i),1,t,e(1,j),1,LE,n-i)
-   10 CALL h12(2,i,i+1,me,e(1,i),1,t,f     ,1,1 ,1  )
-
-C  TRANSFORM G AND H TO GET LEAST DISTANCE PROBLEM
-
-      mode=5
-      DO 30 i=1,mg
-          DO 20 j=1,n
-              IF (.NOT.(ABS(e(j,j)).GE.epmach)) GOTO 50
-   20         g(i,j)=(g(i,j)-ddot_sl(j-1,g(i,1),lg,e(1,j),1))/e(j,j)
-   30     h(i)=h(i)-ddot_sl(n,g(i,1),lg,f,1)
-
-C  SOLVE LEAST DISTANCE PROBLEM
-
-      CALL ldp(g,lg,mg,n,h,x,xnorm,w,jw,mode)
-      IF (mode.NE.1)                     GOTO 50
-
-C  SOLUTION OF ORIGINAL PROBLEM
-
-      CALL daxpy_sl(n,one,f,1,x,1)
-      DO 40 i=n,1,-1
-          j=MIN(i+1,n)
-   40     x(i)=(x(i)-ddot_sl(n-i,e(i,j),LE,x(j),1))/e(i,i)
-      j=MIN(n+1,me)
-      t=dnrm2_(me-n,f(j),1)
-      xnorm=SQRT(xnorm*xnorm+t*t)
-
-C  END OF SUBROUTINE LSI
-
-   50                                    END
-
-      SUBROUTINE ldp(g,mg,m,n,h,x,xnorm,w,INDEX,mode)
-
-C                     T
-C     MINIMIZE   1/2 X X    SUBJECT TO   G * X >= H.
-
-C       C.L. LAWSON, R.J. HANSON: 'SOLVING LEAST SQUARES PROBLEMS'
-C       PRENTICE HALL, ENGLEWOOD CLIFFS, NEW JERSEY, 1974.
-
-C     PARAMETER DESCRIPTION:
-
-C     G(),MG,M,N   ON ENTRY G() STORES THE M BY N MATRIX OF
-C                  LINEAR INEQUALITY CONSTRAINTS. G() HAS FIRST
-C                  DIMENSIONING PARAMETER MG
-C     H()          ON ENTRY H() STORES THE M VECTOR H REPRESENTING
-C                  THE RIGHT SIDE OF THE INEQUALITY SYSTEM
-
-C     REMARK: G(),H() WILL NOT BE CHANGED DURING CALCULATIONS BY LDP
-
-C     X()          ON ENTRY X() NEED NOT BE INITIALIZED.
-C                  ON EXIT X() STORES THE SOLUTION VECTOR X IF MODE=1.
-C     XNORM        ON EXIT XNORM STORES THE EUCLIDIAN NORM OF THE
-C                  SOLUTION VECTOR IF COMPUTATION IS SUCCESSFUL
-C     W()          W IS A ONE DIMENSIONAL WORKING SPACE, THE LENGTH
-C                  OF WHICH SHOULD BE AT LEAST (M+2)*(N+1) + 2*M
-C                  ON EXIT W() STORES THE LAGRANGE MULTIPLIERS
-C                  ASSOCIATED WITH THE CONSTRAINTS
-C                  AT THE SOLUTION OF PROBLEM LDP
-C     INDEX()      INDEX() IS A ONE DIMENSIONAL INTEGER WORKING SPACE
-C                  OF LENGTH AT LEAST M
-C     MODE         MODE IS A SUCCESS-FAILURE FLAG WITH THE FOLLOWING
-C                  MEANINGS:
-C          MODE=1: SUCCESSFUL COMPUTATION
-C               2: ERROR RETURN BECAUSE OF WRONG DIMENSIONS (N.LE.0)
-C               3: ITERATION COUNT EXCEEDED BY NNLS
-C               4: INEQUALITY CONSTRAINTS INCOMPATIBLE
-
-
-      DOUBLE PRECISION g,h,x,xnorm,w,u,v,
-     .                 ZERO,one,fac,rnorm,dnrm2_,ddot_sl,diff
-      INTEGER          INDEX,i,IF,iw,iwdual,iy,iz,j,m,mg,mode,n,n1
-      DIMENSION        g(mg,n),h(m),x(n),w(*),INDEX(m)
-      diff(u,v)=       u-v
-      DATA             ZERO,one/0.0d0,1.0d0/
-
-      mode=2
-      IF(n.LE.0)                    GOTO 50
-
-C  STATE DUAL PROBLEM
-
-      mode=1
-      x(1)=ZERO
-      CALL dcopy_(n,x(1),0,x,1)
-      xnorm=ZERO
-      IF(m.EQ.0)                    GOTO 50
-      iw=0
-      DO 20 j=1,m
-          DO 10 i=1,n
-              iw=iw+1
-   10         w(iw)=g(j,i)
-          iw=iw+1
-   20     w(iw)=h(j)
-      IF=iw+1
-      DO 30 i=1,n
-          iw=iw+1
-   30     w(iw)=ZERO
-      w(iw+1)=one
-      n1=n+1
-      iz=iw+2
-      iy=iz+n1
-      iwdual=iy+m
-
-C  SOLVE DUAL PROBLEM
-
-      CALL nnls (w,n1,n1,m,w(IF),w(iy),rnorm,w(iwdual),w(iz),INDEX,mode)
-
-      IF(mode.NE.1)                 GOTO 50
-      mode=4
-      IF(rnorm.LE.ZERO)             GOTO 50
-
-C  COMPUTE SOLUTION OF PRIMAL PROBLEM
-
-      fac=one-ddot_sl(m,h,1,w(iy),1)
-      IF(.NOT.(diff(one+fac,one).GT.ZERO)) GOTO 50
-      mode=1
-      fac=one/fac
-      DO 40 j=1,n
-   40     x(j)=fac*ddot_sl(m,g(1,j),1,w(iy),1)
-      xnorm=dnrm2_(n,x,1)
-
-C  COMPUTE LAGRANGE MULTIPLIERS FOR PRIMAL PROBLEM
-
-      w(1)=ZERO
-      CALL dcopy_(m,w(1),0,w,1)
-      CALL daxpy_sl(m,fac,w(iy),1,w,1)
-
-C  END OF SUBROUTINE LDP
-
-   50                               END
-
-
-      SUBROUTINE nnls (a, mda, m, n, b, x, rnorm, w, z, INDEX, mode)
-
-C     C.L.LAWSON AND R.J.HANSON, JET PROPULSION LABORATORY:
-C     'SOLVING LEAST SQUARES PROBLEMS'. PRENTICE-HALL.1974
-
-C      **********   NONNEGATIVE LEAST SQUARES   **********
-
-C     GIVEN AN M BY N MATRIX, A, AND AN M-VECTOR, B, COMPUTE AN
-C     N-VECTOR, X, WHICH SOLVES THE LEAST SQUARES PROBLEM
-
-C                  A*X = B  SUBJECT TO  X >= 0
-
-C     A(),MDA,M,N
-C            MDA IS THE FIRST DIMENSIONING PARAMETER FOR THE ARRAY,A().
-C            ON ENTRY A()  CONTAINS THE M BY N MATRIX,A.
-C            ON EXIT A() CONTAINS THE PRODUCT Q*A,
-C            WHERE Q IS AN M BY M ORTHOGONAL MATRIX GENERATED
-C            IMPLICITLY BY THIS SUBROUTINE.
-C            EITHER M>=N OR M<N IS PERMISSIBLE.
-C            THERE IS NO RESTRICTION ON THE RANK OF A.
-C     B()    ON ENTRY B() CONTAINS THE M-VECTOR, B.
-C            ON EXIT B() CONTAINS Q*B.
-C     X()    ON ENTRY X() NEED NOT BE INITIALIZED.
-C            ON EXIT X() WILL CONTAIN THE SOLUTION VECTOR.
-C     RNORM  ON EXIT RNORM CONTAINS THE EUCLIDEAN NORM OF THE
-C            RESIDUAL VECTOR.
-C     W()    AN N-ARRAY OF WORKING SPACE.
-C            ON EXIT W() WILL CONTAIN THE DUAL SOLUTION VECTOR.
-C            W WILL SATISFY W(I)=0 FOR ALL I IN SET P
-C            AND W(I)<=0 FOR ALL I IN SET Z
-C     Z()    AN M-ARRAY OF WORKING SPACE.
-C     INDEX()AN INTEGER WORKING ARRAY OF LENGTH AT LEAST N.
-C            ON EXIT THE CONTENTS OF THIS ARRAY DEFINE THE SETS
-C            P AND Z AS FOLLOWS:
-C            INDEX(1)    THRU INDEX(NSETP) = SET P.
-C            INDEX(IZ1)  THRU INDEX (IZ2)  = SET Z.
-C            IZ1=NSETP + 1 = NPP1, IZ2=N.
-C     MODE   THIS IS A SUCCESS-FAILURE FLAG WITH THE FOLLOWING MEANING:
-C            1    THE SOLUTION HAS BEEN COMPUTED SUCCESSFULLY.
-C            2    THE DIMENSIONS OF THE PROBLEM ARE WRONG,
-C                 EITHER M <= 0 OR N <= 0.
-C            3    ITERATION COUNT EXCEEDED, MORE THAN 3*N ITERATIONS.
-
-      INTEGER          i,ii,ip,iter,itmax,iz,izmax,iz1,iz2,j,jj,jz,
-     *                 k,l,m,mda,mode,n,npp1,nsetp,INDEX(n)
-
-      DOUBLE PRECISION a(mda,n),b(m),x(n),w(n),z(m),asave,diff,
-     *                 factor,ddot_sl,ZERO,one,wmax,alpha,
-     *                 c,s,t,u,v,up,rnorm,unorm,dnrm2_
-
-      diff(u,v)=       u-v
-
-      DATA             ZERO,one,factor/0.0d0,1.0d0,1.0d-2/
-
-c     revised          Dieter Kraft, March 1983
-
-      mode=2
-      IF(m.LE.0.OR.n.LE.0)            GOTO 290
-      mode=1
-      iter=0
-      itmax=3*n
-
-C STEP ONE (INITIALIZE)
-
-      DO 100 i=1,n
-  100    INDEX(i)=i
-      iz1=1
-      iz2=n
-      nsetp=0
-      npp1=1
-      x(1)=ZERO
-      CALL dcopy_(n,x(1),0,x,1)
-
-C STEP TWO (COMPUTE DUAL VARIABLES)
-C .....ENTRY LOOP A
-
-  110 IF(iz1.GT.iz2.OR.nsetp.GE.m)    GOTO 280
-      DO 120 iz=iz1,iz2
-         j=INDEX(iz)
-  120    w(j)=ddot_sl(m-nsetp,a(npp1,j),1,b(npp1),1)
-
-C STEP THREE (TEST DUAL VARIABLES)
-
-  130 wmax=ZERO
-      DO 140 iz=iz1,iz2
-      j=INDEX(iz)
-         IF(w(j).LE.wmax)             GOTO 140
-         wmax=w(j)
-         izmax=iz
-  140 CONTINUE
-
-C .....EXIT LOOP A
-
-      IF(wmax.LE.ZERO)                GOTO 280
-      iz=izmax
-      j=INDEX(iz)
-
-C STEP FOUR (TEST INDEX J FOR LINEAR DEPENDENCY)
-
-      asave=a(npp1,j)
-      CALL h12(1,npp1,npp1+1,m,a(1,j),1,up,z,1,1,0)
-      unorm=dnrm2_(nsetp,a(1,j),1)
-      t=factor*ABS(a(npp1,j))
-      IF(diff(unorm+t,unorm).LE.ZERO) GOTO 150
-      CALL dcopy_(m,b,1,z,1)
-      CALL h12(2,npp1,npp1+1,m,a(1,j),1,up,z,1,1,1)
-      IF(z(npp1)/a(npp1,j).GT.ZERO)   GOTO 160
-  150 a(npp1,j)=asave
-      w(j)=ZERO
-                                      GOTO 130
-C STEP FIVE (ADD COLUMN)
-
-  160 CALL dcopy_(m,z,1,b,1)
-      INDEX(iz)=INDEX(iz1)
-      INDEX(iz1)=j
-      iz1=iz1+1
-      nsetp=npp1
-      npp1=npp1+1
-      DO 170 jz=iz1,iz2
-         jj=INDEX(jz)
-  170    CALL h12(2,nsetp,npp1,m,a(1,j),1,up,a(1,jj),1,mda,1)
-      k=MIN(npp1,mda)
-      w(j)=ZERO
-      CALL dcopy_(m-nsetp,w(j),0,a(k,j),1)
-
-C STEP SIX (SOLVE LEAST SQUARES SUB-PROBLEM)
-C .....ENTRY LOOP B
-
-  180 DO 200 ip=nsetp,1,-1
-         IF(ip.EQ.nsetp)              GOTO 190
-         CALL daxpy_sl(ip,-z(ip+1),a(1,jj),1,z,1)
-  190    jj=INDEX(ip)
-  200    z(ip)=z(ip)/a(ip,jj)
-      iter=iter+1
-      IF(iter.LE.itmax)               GOTO 220
-  210 mode=3
-                                      GOTO 280
-C STEP SEVEN TO TEN (STEP LENGTH ALGORITHM)
-
-  220 alpha=one
-      jj=0
-      DO 230 ip=1,nsetp
-         IF(z(ip).GT.ZERO)            GOTO 230
-         l=INDEX(ip)
-         t=-x(l)/(z(ip)-x(l))
-         IF(alpha.LT.t)               GOTO 230
-         alpha=t
-         jj=ip
-  230 CONTINUE
-      DO 240 ip=1,nsetp
-         l=INDEX(ip)
-  240    x(l)=(one-alpha)*x(l) + alpha*z(ip)
-
-C .....EXIT LOOP B
-
-      IF(jj.EQ.0)                     GOTO 110
-
-C STEP ELEVEN (DELETE COLUMN)
-
-      i=INDEX(jj)
-  250 x(i)=ZERO
-      jj=jj+1
-      DO 260 j=jj,nsetp
-         ii=INDEX(j)
-         INDEX(j-1)=ii
-         CALL dsrotg(a(j-1,ii),a(j,ii),c,s)
-         t=a(j-1,ii)
-         CALL dsrot(n,a(j-1,1),mda,a(j,1),mda,c,s)
-         a(j-1,ii)=t
-         a(j,ii)=ZERO
-  260    CALL dsrot(1,b(j-1),1,b(j),1,c,s)
-      npp1=nsetp
-      nsetp=nsetp-1
-      iz1=iz1-1
-      INDEX(iz1)=i
-      IF(nsetp.LE.0)                  GOTO 210
-      DO 270 jj=1,nsetp
-         i=INDEX(jj)
-         IF(x(i).LE.ZERO)             GOTO 250
-  270 CONTINUE
-      CALL dcopy_(m,b,1,z,1)
-                                      GOTO 180
-C STEP TWELVE (SOLUTION)
-
-  280 k=MIN(npp1,m)
-      rnorm=dnrm2_(m-nsetp,b(k),1)
-      IF(npp1.GT.m) THEN
-         w(1)=ZERO
-         CALL dcopy_(n,w(1),0,w,1)
-      ENDIF
-
-C END OF SUBROUTINE NNLS
-
-  290                                 END
-
-      SUBROUTINE hfti(a,mda,m,n,b,mdb,nb,tau,krank,rnorm,h,g,ip)
-
-C     RANK-DEFICIENT LEAST SQUARES ALGORITHM AS DESCRIBED IN:
-C     C.L.LAWSON AND R.J.HANSON, JET PROPULSION LABORATORY, 1973 JUN 12
-C     TO APPEAR IN 'SOLVING LEAST SQUARES PROBLEMS', PRENTICE-HALL, 1974
-
-C     A(*,*),MDA,M,N   THE ARRAY A INITIALLY CONTAINS THE M x N MATRIX A
-C                      OF THE LEAST SQUARES PROBLEM AX = B.
-C                      THE FIRST DIMENSIONING PARAMETER MDA MUST SATISFY
-C                      MDA >= M. EITHER M >= N OR M < N IS PERMITTED.
-C                      THERE IS NO RESTRICTION ON THE RANK OF A.
-C                      THE MATRIX A WILL BE MODIFIED BY THE SUBROUTINE.
-C     B(*,*),MDB,NB    IF NB = 0 THE SUBROUTINE WILL MAKE NO REFERENCE
-C                      TO THE ARRAY B. IF NB > 0 THE ARRAY B() MUST
-C                      INITIALLY CONTAIN THE M x NB MATRIX B  OF THE
-C                      THE LEAST SQUARES PROBLEM AX = B AND ON RETURN
-C                      THE ARRAY B() WILL CONTAIN THE N x NB SOLUTION X.
-C                      IF NB>1 THE ARRAY B() MUST BE DOUBLE SUBSCRIPTED
-C                      WITH FIRST DIMENSIONING PARAMETER MDB>=MAX(M,N),
-C                      IF NB=1 THE ARRAY B() MAY BE EITHER SINGLE OR
-C                      DOUBLE SUBSCRIPTED.
-C     TAU              ABSOLUTE TOLERANCE PARAMETER FOR PSEUDORANK
-C                      DETERMINATION, PROVIDED BY THE USER.
-C     KRANK            PSEUDORANK OF A, SET BY THE SUBROUTINE.
-C     RNORM            ON EXIT, RNORM(J) WILL CONTAIN THE EUCLIDIAN
-C                      NORM OF THE RESIDUAL VECTOR FOR THE PROBLEM
-C                      DEFINED BY THE J-TH COLUMN VECTOR OF THE ARRAY B.
-C     H(), G()         ARRAYS OF WORKING SPACE OF LENGTH >= N.
-C     IP()             INTEGER ARRAY OF WORKING SPACE OF LENGTH >= N
-C                      RECORDING PERMUTATION INDICES OF COLUMN VECTORS
-
-      INTEGER          i,j,jb,k,kp1,krank,l,ldiag,lmax,m,
-     .                 mda,mdb,n,nb,ip(n)
-
-      DOUBLE PRECISION a(mda,n),b(mdb,nb),h(n),g(n),rnorm(nb),factor,
-     .                 tau,ZERO,hmax,diff,tmp,ddot_sl,dnrm2_,u,v
-      diff(u,v)=       u-v
-      DATA             ZERO/0.0d0/, factor/1.0d-3/
-
-      k=0
-      ldiag=MIN(m,n)
-      IF(ldiag.LE.0)                  GOTO 270
-
-C   COMPUTE LMAX
-
-      DO 80 j=1,ldiag
-          IF(j.EQ.1)                  GOTO 20
-          lmax=j
-          DO 10 l=j,n
-              h(l)=h(l)-a(j-1,l)**2
-   10         IF(h(l).GT.h(lmax)) lmax=l
-          IF(diff(hmax+factor*h(lmax),hmax).GT.ZERO)
-     .                                GOTO 50
-   20     lmax=j
-          DO 40 l=j,n
-              h(l)=ZERO
-              DO 30 i=j,m
-   30             h(l)=h(l)+a(i,l)**2
-   40         IF(h(l).GT.h(lmax)) lmax=l
-          hmax=h(lmax)
-
-C   COLUMN INTERCHANGES IF NEEDED
-
-   50     ip(j)=lmax
-          IF(ip(j).EQ.j)              GOTO 70
-          DO 60 i=1,m
-              tmp=a(i,j)
-              a(i,j)=a(i,lmax)
-   60         a(i,lmax)=tmp
-          h(lmax)=h(j)
-
-C   J-TH TRANSFORMATION AND APPLICATION TO A AND B
-
-   70     i=MIN(j+1,n)
-          CALL h12(1,j,j+1,m,a(1,j),1,h(j),a(1,i),1,mda,n-j)
-   80     CALL h12(2,j,j+1,m,a(1,j),1,h(j),b,1,mdb,nb)
-
-C   DETERMINE PSEUDORANK
-
-      DO 90 j=1,ldiag
-   90     IF(ABS(a(j,j)).LE.tau)      GOTO 100
-      k=ldiag
-      GOTO 110
-  100 k=j-1
-  110 kp1=k+1
-
-C   NORM OF RESIDUALS
-
-      DO 130 jb=1,nb
-  130     rnorm(jb)=dnrm2_(m-k,b(kp1,jb),1)
-      IF(k.GT.0)                      GOTO 160
-      DO 150 jb=1,nb
-          DO 150 i=1,n
-  150         b(i,jb)=ZERO
-      GOTO 270
-  160 IF(k.EQ.n)                      GOTO 180
-
-C   HOUSEHOLDER DECOMPOSITION OF FIRST K ROWS
-
-      DO 170 i=k,1,-1
-  170     CALL h12(1,i,kp1,n,a(i,1),mda,g(i),a,mda,1,i-1)
-  180 DO 250 jb=1,nb
-
-C   SOLVE K*K TRIANGULAR SYSTEM
-
-          DO 210 i=k,1,-1
-              j=MIN(i+1,n)
-  210         b(i,jb)=(b(i,jb)-ddot_sl(k-i,a(i,j),mda,b(j,jb),1))/a(i,i)
-
-C   COMPLETE SOLUTION VECTOR
-
-          IF(k.EQ.n)                  GOTO 240
-          DO 220 j=kp1,n
-  220         b(j,jb)=ZERO
-          DO 230 i=1,k
-  230         CALL h12(2,i,kp1,n,a(i,1),mda,g(i),b(1,jb),1,mdb,1)
-
-C   REORDER SOLUTION ACCORDING TO PREVIOUS COLUMN INTERCHANGES
-
-  240     DO 250 j=ldiag,1,-1
-              IF(ip(j).EQ.j)          GOTO 250
-              l=ip(j)
-              tmp=b(l,jb)
-              b(l,jb)=b(j,jb)
-              b(j,jb)=tmp
-  250 CONTINUE
-  270 krank=k
-      END
-
-      SUBROUTINE h12 (mode,lpivot,l1,m,u,iue,up,c,ice,icv,ncv)
-
-C     C.L.LAWSON AND R.J.HANSON, JET PROPULSION LABORATORY, 1973 JUN 12
-C     TO APPEAR IN 'SOLVING LEAST SQUARES PROBLEMS', PRENTICE-HALL, 1974
-
-C     CONSTRUCTION AND/OR APPLICATION OF A SINGLE
-C     HOUSEHOLDER TRANSFORMATION  Q = I + U*(U**T)/B
-
-C     MODE    = 1 OR 2   TO SELECT ALGORITHM  H1  OR  H2 .
-C     LPIVOT IS THE INDEX OF THE PIVOT ELEMENT.
-C     L1,M   IF L1 <= M   THE TRANSFORMATION WILL BE CONSTRUCTED TO
-C            ZERO ELEMENTS INDEXED FROM L1 THROUGH M.
-C            IF L1 > M THE SUBROUTINE DOES AN IDENTITY TRANSFORMATION.
-C     U(),IUE,UP
-C            ON ENTRY TO H1 U() STORES THE PIVOT VECTOR.
-C            IUE IS THE STORAGE INCREMENT BETWEEN ELEMENTS.
-C            ON EXIT FROM H1 U() AND UP STORE QUANTITIES DEFINING
-C            THE VECTOR U OF THE HOUSEHOLDER TRANSFORMATION.
-C            ON ENTRY TO H2 U() AND UP
-C            SHOULD STORE QUANTITIES PREVIOUSLY COMPUTED BY H1.
-C            THESE WILL NOT BE MODIFIED BY H2.
-C     C()    ON ENTRY TO H1 OR H2 C() STORES A MATRIX WHICH WILL BE
-C            REGARDED AS A SET OF VECTORS TO WHICH THE HOUSEHOLDER
-C            TRANSFORMATION IS TO BE APPLIED.
-C            ON EXIT C() STORES THE SET OF TRANSFORMED VECTORS.
-C     ICE    STORAGE INCREMENT BETWEEN ELEMENTS OF VECTORS IN C().
-C     ICV    STORAGE INCREMENT BETWEEN VECTORS IN C().
-C     NCV    NUMBER OF VECTORS IN C() TO BE TRANSFORMED.
-C            IF NCV <= 0 NO OPERATIONS WILL BE DONE ON C().
-
-      INTEGER          incr, ice, icv, iue, lpivot, l1, mode, ncv
-      INTEGER          i, i2, i3, i4, j, m
-      DOUBLE PRECISION u,up,c,cl,clinv,b,sm,one,ZERO
-      DIMENSION        u(iue,*), c(*)
-      DATA             one/1.0d+00/, ZERO/0.0d+00/
-
-      IF (0.GE.lpivot.OR.lpivot.GE.l1.OR.l1.GT.m) GOTO 80
-      cl=ABS(u(1,lpivot))
-      IF (mode.EQ.2)                              GOTO 30
-
-C     ****** CONSTRUCT THE TRANSFORMATION ******
-
-          DO 10 j=l1,m
-             sm=ABS(u(1,j))
-   10     cl=MAX(sm,cl)
-      IF (cl.LE.ZERO)                             GOTO 80
-      clinv=one/cl
-      sm=(u(1,lpivot)*clinv)**2
-          DO 20 j=l1,m
-   20     sm=sm+(u(1,j)*clinv)**2
-      cl=cl*SQRT(sm)
-      IF (u(1,lpivot).GT.ZERO) cl=-cl
-      up=u(1,lpivot)-cl
-      u(1,lpivot)=cl
-                                                  GOTO 40
-C     ****** APPLY THE TRANSFORMATION  I+U*(U**T)/B  TO C ******
-
-   30 IF (cl.LE.ZERO)                             GOTO 80
-   40 IF (ncv.LE.0)                               GOTO 80
-      b=up*u(1,lpivot)
-      IF (b.GE.ZERO)                              GOTO 80
-      b=one/b
-      i2=1-icv+ice*(lpivot-1)
-      incr=ice*(l1-lpivot)
-          DO 70 j=1,ncv
-          i2=i2+icv
-          i3=i2+incr
-          i4=i3
-          sm=c(i2)*up
-              DO 50 i=l1,m
-              sm=sm+c(i3)*u(1,i)
-   50         i3=i3+ice
-          IF (sm.EQ.ZERO)                         GOTO 70
-          sm=sm*b
-          c(i2)=c(i2)+sm*up
-              DO 60 i=l1,m
-              c(i4)=c(i4)+sm*u(1,i)
-   60         i4=i4+ice
-   70     CONTINUE
-   80                                             END
-
-      SUBROUTINE ldl (n,a,z,sigma,w)
-C   LDL     LDL' - RANK-ONE - UPDATE
-
-C   PURPOSE:
-C           UPDATES THE LDL' FACTORS OF MATRIX A BY RANK-ONE MATRIX
-C           SIGMA*Z*Z'
-
-C   INPUT ARGUMENTS: (* MEANS PARAMETERS ARE CHANGED DURING EXECUTION)
-C     N     : ORDER OF THE COEFFICIENT MATRIX A
-C   * A     : POSITIVE DEFINITE MATRIX OF DIMENSION N;
-C             ONLY THE LOWER TRIANGLE IS USED AND IS STORED COLUMN BY
-C             COLUMN AS ONE DIMENSIONAL ARRAY OF DIMENSION N*(N+1)/2.
-C   * Z     : VECTOR OF DIMENSION N OF UPDATING ELEMENTS
-C     SIGMA : SCALAR FACTOR BY WHICH THE MODIFYING DYADE Z*Z' IS
-C             MULTIPLIED
-
-C   OUTPUT ARGUMENTS:
-C     A     : UPDATED LDL' FACTORS
-
-C   WORKING ARRAY:
-C     W     : VECTOR OP DIMENSION N (USED ONLY IF SIGMA .LT. ZERO)
-
-C   METHOD:
-C     THAT OF FLETCHER AND POWELL AS DESCRIBED IN :
-C     FLETCHER,R.,(1974) ON THE MODIFICATION OF LDL' FACTORIZATION.
-C     POWELL,M.J.D.      MATH.COMPUTATION 28, 1067-1078.
-
-C   IMPLEMENTED BY:
-C     KRAFT,D., DFVLR - INSTITUT FUER DYNAMIK DER FLUGSYSTEME
-C               D-8031  OBERPFAFFENHOFEN
-
-C   STATUS: 15. JANUARY 1980
-
-C   SUBROUTINES REQUIRED: NONE
-
-      INTEGER          i, ij, j, n
-      DOUBLE PRECISION a(*), t, v, w(*), z(*), u, tp, one, beta, four,
-     *                 ZERO, alpha, delta, gamma, sigma, epmach
-      DATA ZERO, one, four, epmach /0.0d0, 1.0d0, 4.0d0, 2.22d-16/
-
-      IF(sigma.EQ.ZERO)               GOTO 280
-      ij=1
-      t=one/sigma
-      IF(sigma.GT.ZERO)               GOTO 220
-C PREPARE NEGATIVE UPDATE
-      DO 150 i=1,n
-  150     w(i)=z(i)
-      DO 170 i=1,n
-          v=w(i)
-          t=t+v*v/a(ij)
-          DO 160 j=i+1,n
-              ij=ij+1
-  160         w(j)=w(j)-v*a(ij)
-  170     ij=ij+1
-      IF(t.GE.ZERO) t=epmach/sigma
-      DO 210 i=1,n
-          j=n+1-i
-          ij=ij-i
-          u=w(j)
-          w(j)=t
-  210     t=t-u*u/a(ij)
-  220 CONTINUE
-C HERE UPDATING BEGINS
-      DO 270 i=1,n
-          v=z(i)
-          delta=v/a(ij)
-          IF(sigma.LT.ZERO) tp=w(i)
-          IF(sigma.GT.ZERO) tp=t+delta*v
-          alpha=tp/t
-          a(ij)=alpha*a(ij)
-          IF(i.EQ.n)                  GOTO 280
-          beta=delta/tp
-          IF(alpha.GT.four)           GOTO 240
-          DO 230 j=i+1,n
-              ij=ij+1
-              z(j)=z(j)-v*a(ij)
-  230         a(ij)=a(ij)+beta*z(j)
-                                      GOTO 260
-  240     gamma=t/tp
-          DO 250 j=i+1,n
-              ij=ij+1
-              u=a(ij)
-              a(ij)=gamma*u+beta*z(j)
-  250         z(j)=z(j)-v*u
-  260     ij=ij+1
-  270     t=tp
-  280 RETURN
-C END OF LDL
-      END
-
-      DOUBLE PRECISION FUNCTION linmin (mode, ax, bx, f, tol)
-C   LINMIN  LINESEARCH WITHOUT DERIVATIVES
-
-C   PURPOSE:
-
-C  TO FIND THE ARGUMENT LINMIN WHERE THE FUNCTION F TAKES IT'S MINIMUM
-C  ON THE INTERVAL AX, BX.
-C  COMBINATION OF GOLDEN SECTION AND SUCCESSIVE QUADRATIC INTERPOLATION.
-
-C   INPUT ARGUMENTS: (* MEANS PARAMETERS ARE CHANGED DURING EXECUTION)
-
-C *MODE   SEE OUTPUT ARGUMENTS
-C  AX     LEFT ENDPOINT OF INITIAL INTERVAL
-C  BX     RIGHT ENDPOINT OF INITIAL INTERVAL
-C  F      FUNCTION VALUE AT LINMIN WHICH IS TO BE BROUGHT IN BY
-C         REVERSE COMMUNICATION CONTROLLED BY MODE
-C  TOL    DESIRED LENGTH OF INTERVAL OF UNCERTAINTY OF FINAL RESULT
-
-C   OUTPUT ARGUMENTS:
-
-C  LINMIN ABSCISSA APPROXIMATING THE POINT WHERE F ATTAINS A MINIMUM
-C  MODE   CONTROLS REVERSE COMMUNICATION
-C         MUST BE SET TO 0 INITIALLY, RETURNS WITH INTERMEDIATE
-C         VALUES 1 AND 2 WHICH MUST NOT BE CHANGED BY THE USER,
-C         ENDS WITH CONVERGENCE WITH VALUE 3.
-
-C   WORKING ARRAY:
-
-C  NONE
-
-C   METHOD:
-
-C  THIS FUNCTION SUBPROGRAM IS A SLIGHTLY MODIFIED VERSION OF THE
-C  ALGOL 60 PROCEDURE LOCALMIN GIVEN IN
-C  R.P. BRENT: ALGORITHMS FOR MINIMIZATION WITHOUT DERIVATIVES,
-C              PRENTICE-HALL (1973).
-
-C   IMPLEMENTED BY:
-
-C     KRAFT, D., DFVLR - INSTITUT FUER DYNAMIK DER FLUGSYSTEME
-C                D-8031  OBERPFAFFENHOFEN
-
-C   STATUS: 31. AUGUST  1984
-
-C   SUBROUTINES REQUIRED: NONE
-
-      INTEGER          mode
-      DOUBLE PRECISION f, tol, a, b, c, d, e, p, q, r, u, v, w, x, m,
-     &                 fu, fv, fw, fx, eps, tol1, tol2, ZERO, ax, bx
-      DATA             c /0.381966011d0/, eps /1.5d-8/, ZERO /0.0d0/
-
-C  EPS = SQUARE - ROOT OF MACHINE PRECISION
-C  C = GOLDEN SECTION RATIO = (3-SQRT(5))/2
-
-      GOTO (10, 55), mode
-
-C  INITIALIZATION
-
-      a = ax
-      b = bx
-      e = ZERO
-      v = a + c*(b - a)
-      w = v
-      x = w
-      linmin = x
-      mode = 1
-      GOTO 100
-
-C  MAIN LOOP STARTS HERE
-
-   10 fx = f
-      fv = fx
-      fw = fv
-   20 m = 0.5d0*(a + b)
-      tol1 = eps*ABS(x) + tol
-      tol2 = tol1 + tol1
-
-C  TEST CONVERGENCE
-
-      IF (ABS(x - m) .LE. tol2 - 0.5d0*(b - a)) GOTO 90
-      r = ZERO
-      q = r
-      p = q
-      IF (ABS(e) .LE. tol1) GOTO 30
-
-C  FIT PARABOLA
-
-      r = (x - w)*(fx - fv)
-      q = (x - v)*(fx - fw)
-      p = (x - v)*q - (x - w)*r
-      q = q - r
-      q = q + q
-      IF (q .GT. ZERO) p = -p
-      IF (q .LT. ZERO) q = -q
-      r = e
-      e = d
-
-C  IS PARABOLA ACCEPTABLE
-
-   30 IF (ABS(p) .GE. 0.5d0*ABS(q*r) .OR.
-     &    p .LE. q*(a - x) .OR. p .GE. q*(b-x)) GOTO 40
-
-C  PARABOLIC INTERPOLATION STEP
-
-      d = p/q
-
-C  F MUST NOT BE EVALUATED TOO CLOSE TO A OR B
-
-      IF (u - a .LT. tol2) d = SIGN(tol1, m - x)
-      IF (b - u .LT. tol2) d = SIGN(tol1, m - x)
-      GOTO 50
-
-C  GOLDEN SECTION STEP
-
-   40 IF (x .GE. m) e = a - x
-      IF (x .LT. m) e = b - x
-      d = c*e
-
-C  F MUST NOT BE EVALUATED TOO CLOSE TO X
-
-   50 IF (ABS(d) .LT. tol1) d = SIGN(tol1, d)
-      u = x + d
-      linmin = u
-      mode = 2
-      GOTO 100
-   55 fu = f
-
-C  UPDATE A, B, V, W, AND X
-
-      IF (fu .GT. fx) GOTO 60
-      IF (u .GE. x) a = x
-      IF (u .LT. x) b = x
-      v = w
-      fv = fw
-      w = x
-      fw = fx
-      x = u
-      fx = fu
-      GOTO 85
-   60 IF (u .LT. x) a = u
-      IF (u .GE. x) b = u
-      IF (fu .LE. fw .OR. w .EQ. x) GOTO 70
-      IF (fu .LE. fv .OR. v .EQ. x .OR. v .EQ. w) GOTO 80
-      GOTO 85
-   70 v = w
-      fv = fw
-      w = u
-      fw = fu
-      GOTO 85
-   80 v = u
-      fv = fu
-   85 GOTO 20
-
-C  END OF MAIN LOOP
-
-   90 linmin = x
-      mode = 3
-  100 RETURN
-
-C  END OF LINMIN
-
-      END
-
-C## Following a selection from BLAS Level 1
-
-      SUBROUTINE daxpy_sl(n,da,dx,incx,dy,incy)
-
-C     CONSTANT TIMES A VECTOR PLUS A VECTOR.
-C     USES UNROLLED LOOPS FOR INCREMENTS EQUAL TO ONE.
-C     JACK DONGARRA, LINPACK, 3/11/78.
-
-      DOUBLE PRECISION dx(*),dy(*),da
-      INTEGER i,incx,incy,ix,iy,m,mp1,n
-
-      IF(n.LE.0)RETURN
-      IF(da.EQ.0.0d0)RETURN
-      IF(incx.EQ.1.AND.incy.EQ.1)GO TO 20
-
-C        CODE FOR UNEQUAL INCREMENTS OR EQUAL INCREMENTS
-C        NOT EQUAL TO 1
-
-      ix = 1
-      iy = 1
-      IF(incx.LT.0)ix = (-n+1)*incx + 1
-      IF(incy.LT.0)iy = (-n+1)*incy + 1
-      DO 10 i = 1,n
-        dy(iy) = dy(iy) + da*dx(ix)
-        ix = ix + incx
-        iy = iy + incy
-   10 CONTINUE
-      RETURN
-
-C        CODE FOR BOTH INCREMENTS EQUAL TO 1
-
-C        CLEAN-UP LOOP
-
-   20 m = MOD(n,4)
-      IF( m .EQ. 0 ) GO TO 40
-      DO 30 i = 1,m
-        dy(i) = dy(i) + da*dx(i)
-   30 CONTINUE
-      IF( n .LT. 4 ) RETURN
-   40 mp1 = m + 1
-      DO 50 i = mp1,n,4
-        dy(i) = dy(i) + da*dx(i)
-        dy(i + 1) = dy(i + 1) + da*dx(i + 1)
-        dy(i + 2) = dy(i + 2) + da*dx(i + 2)
-        dy(i + 3) = dy(i + 3) + da*dx(i + 3)
-   50 CONTINUE
-      RETURN
-      END
-
-      SUBROUTINE  dcopy_(n,dx,incx,dy,incy)
-
-C     COPIES A VECTOR, X, TO A VECTOR, Y.
-C     USES UNROLLED LOOPS FOR INCREMENTS EQUAL TO ONE.
-C     JACK DONGARRA, LINPACK, 3/11/78.
-
-      DOUBLE PRECISION dx(*),dy(*)
-      INTEGER i,incx,incy,ix,iy,m,mp1,n
-
-      IF(n.LE.0)RETURN
-      IF(incx.EQ.1.AND.incy.EQ.1)GO TO 20
-
-C        CODE FOR UNEQUAL INCREMENTS OR EQUAL INCREMENTS
-C        NOT EQUAL TO 1
-
-      ix = 1
-      iy = 1
-      IF(incx.LT.0)ix = (-n+1)*incx + 1
-      IF(incy.LT.0)iy = (-n+1)*incy + 1
-      DO 10 i = 1,n
-        dy(iy) = dx(ix)
-        ix = ix + incx
-        iy = iy + incy
-   10 CONTINUE
-      RETURN
-
-C        CODE FOR BOTH INCREMENTS EQUAL TO 1
-
-C        CLEAN-UP LOOP
-
-   20 m = MOD(n,7)
-      IF( m .EQ. 0 ) GO TO 40
-      DO 30 i = 1,m
-        dy(i) = dx(i)
-   30 CONTINUE
-      IF( n .LT. 7 ) RETURN
-   40 mp1 = m + 1
-      DO 50 i = mp1,n,7
-        dy(i) = dx(i)
-        dy(i + 1) = dx(i + 1)
-        dy(i + 2) = dx(i + 2)
-        dy(i + 3) = dx(i + 3)
-        dy(i + 4) = dx(i + 4)
-        dy(i + 5) = dx(i + 5)
-        dy(i + 6) = dx(i + 6)
-   50 CONTINUE
-      RETURN
-      END
-
-      DOUBLE PRECISION FUNCTION ddot_sl(n,dx,incx,dy,incy)
-
-C     FORMS THE DOT PRODUCT OF TWO VECTORS.
-C     USES UNROLLED LOOPS FOR INCREMENTS EQUAL TO ONE.
-C     JACK DONGARRA, LINPACK, 3/11/78.
-
-      DOUBLE PRECISION dx(*),dy(*),dtemp
-      INTEGER i,incx,incy,ix,iy,m,mp1,n
-
-      ddot_sl = 0.0d0
-      dtemp = 0.0d0
-      IF(n.LE.0)RETURN
-      IF(incx.EQ.1.AND.incy.EQ.1)GO TO 20
-
-C        CODE FOR UNEQUAL INCREMENTS OR EQUAL INCREMENTS
-C          NOT EQUAL TO 1
-
-      ix = 1
-      iy = 1
-      IF(incx.LT.0)ix = (-n+1)*incx + 1
-      IF(incy.LT.0)iy = (-n+1)*incy + 1
-      DO 10 i = 1,n
-        dtemp = dtemp + dx(ix)*dy(iy)
-        ix = ix + incx
-        iy = iy + incy
-   10 CONTINUE
-      ddot_sl = dtemp
-      RETURN
-
-C        CODE FOR BOTH INCREMENTS EQUAL TO 1
-
-C        CLEAN-UP LOOP
-
-   20 m = MOD(n,5)
-      IF( m .EQ. 0 ) GO TO 40
-      DO 30 i = 1,m
-        dtemp = dtemp + dx(i)*dy(i)
-   30 CONTINUE
-      IF( n .LT. 5 ) GO TO 60
-   40 mp1 = m + 1
-      DO 50 i = mp1,n,5
-        dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) +
-     *   dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4)
-   50 CONTINUE
-   60 ddot_sl = dtemp
-      RETURN
-      END
-
-      DOUBLE PRECISION FUNCTION dnrm1(n,x,i,j)
-      INTEGER n, i, j, k
-      DOUBLE PRECISION snormx, sum, x(n), ZERO, one, scale, temp
-      DATA ZERO/0.0d0/, one/1.0d0/
-
-C      DNRM1 - COMPUTES THE I-NORM OF A VECTOR
-C              BETWEEN THE ITH AND THE JTH ELEMENTS
-
-C      INPUT -
-C      N       LENGTH OF VECTOR
-C      X       VECTOR OF LENGTH N
-C      I       INITIAL ELEMENT OF VECTOR TO BE USED
-C      J       FINAL ELEMENT TO USE
-
-C      OUTPUT -
-C      DNRM1   NORM
-
-      snormx=ZERO
-      DO 10 k=i,j
- 10      snormx=MAX(snormx,ABS(x(k)))
-      dnrm1 = snormx
-      IF (snormx.EQ.ZERO) RETURN
-      scale = snormx
-      IF (snormx.GE.one) scale=SQRT(snormx)
-      sum=ZERO
-      DO 20 k=i,j
-         temp=ZERO
-         IF (ABS(x(k))+scale .NE. scale) temp = x(k)/snormx
-         IF (one+temp.NE.one) sum = sum+temp*temp
- 20      CONTINUE
-      sum=SQRT(sum)
-      dnrm1=snormx*sum
-      RETURN
-      END
-
-      DOUBLE PRECISION FUNCTION dnrm2_ ( n, dx, incx)
-      INTEGER          n, i, j, nn, next, incx
-      DOUBLE PRECISION dx(*), cutlo, cuthi, hitest, sum, xmax, ZERO, one
-      DATA             ZERO, one /0.0d0, 1.0d0/
-
-C     EUCLIDEAN NORM OF THE N-VECTOR STORED IN DX() WITH STORAGE
-C     INCREMENT INCX .
-C     IF    N .LE. 0 RETURN WITH RESULT = 0.
-C     IF N .GE. 1 THEN INCX MUST BE .GE. 1
-
-C           C.L.LAWSON, 1978 JAN 08
-
-C     FOUR PHASE METHOD     USING TWO BUILT-IN CONSTANTS THAT ARE
-C     HOPEFULLY APPLICABLE TO ALL MACHINES.
-C         CUTLO = MAXIMUM OF  SQRT(U/EPS)   OVER ALL KNOWN MACHINES.
-C         CUTHI = MINIMUM OF  SQRT(V)       OVER ALL KNOWN MACHINES.
-C     WHERE
-C         EPS = SMALLEST NO. SUCH THAT EPS + 1. .GT. 1.
-C         U   = SMALLEST POSITIVE NO.   (UNDERFLOW LIMIT)
-C         V   = LARGEST  NO.            (OVERFLOW  LIMIT)
-
-C     BRIEF OUTLINE OF ALGORITHM..
-
-C     PHASE 1    SCANS ZERO COMPONENTS.
-C     MOVE TO PHASE 2 WHEN A COMPONENT IS NONZERO AND .LE. CUTLO
-C     MOVE TO PHASE 3 WHEN A COMPONENT IS .GT. CUTLO
-C     MOVE TO PHASE 4 WHEN A COMPONENT IS .GE. CUTHI/M
-C     WHERE M = N FOR X() REAL AND M = 2*N FOR COMPLEX.
-
-C     VALUES FOR CUTLO AND CUTHI..
-C     FROM THE ENVIRONMENTAL PARAMETERS LISTED IN THE IMSL CONVERTER
-C     DOCUMENT THE LIMITING VALUES ARE AS FOLLOWS..
-C     CUTLO, S.P.   U/EPS = 2**(-102) FOR  HONEYWELL.  CLOSE SECONDS ARE
-C                   UNIVAC AND DEC AT 2**(-103)
-C                   THUS CUTLO = 2**(-51) = 4.44089E-16
-C     CUTHI, S.P.   V = 2**127 FOR UNIVAC, HONEYWELL, AND DEC.
-C                   THUS CUTHI = 2**(63.5) = 1.30438E19
-C     CUTLO, D.P.   U/EPS = 2**(-67) FOR HONEYWELL AND DEC.
-C                   THUS CUTLO = 2**(-33.5) = 8.23181D-11
-C     CUTHI, D.P.   SAME AS S.P.  CUTHI = 1.30438D19
-C     DATA CUTLO, CUTHI / 8.232D-11,  1.304D19 /
-C     DATA CUTLO, CUTHI / 4.441E-16,  1.304E19 /
-      DATA cutlo, cuthi / 8.232d-11,  1.304d19 /
-
-      IF(n .GT. 0) GO TO 10
-         dnrm2_  = ZERO
-         GO TO 300
-
-   10 assign 30 to next
-      sum = ZERO
-      nn = n * incx
-C                       BEGIN MAIN LOOP
-      i = 1
-   20    GO TO next,(30, 50, 70, 110)
-   30 IF( ABS(dx(i)) .GT. cutlo) GO TO 85
-      assign 50 to next
-      xmax = ZERO
-
-C                        PHASE 1.  SUM IS ZERO
-
-   50 IF( dx(i) .EQ. ZERO) GO TO 200
-      IF( ABS(dx(i)) .GT. cutlo) GO TO 85
-
-C                        PREPARE FOR PHASE 2.
-
-      assign 70 to next
-      GO TO 105
-
-C                        PREPARE FOR PHASE 4.
-
-  100 i = j
-      assign 110 to next
-      sum = (sum / dx(i)) / dx(i)
-  105 xmax = ABS(dx(i))
-      GO TO 115
-
-C                   PHASE 2.  SUM IS SMALL.
-C                             SCALE TO AVOID DESTRUCTIVE UNDERFLOW.
-
-   70 IF( ABS(dx(i)) .GT. cutlo ) GO TO 75
-
-C                   COMMON CODE FOR PHASES 2 AND 4.
-C                   IN PHASE 4 SUM IS LARGE.  SCALE TO AVOID OVERFLOW.
-
-  110 IF( ABS(dx(i)) .LE. xmax ) GO TO 115
-         sum = one + sum * (xmax / dx(i))**2
-         xmax = ABS(dx(i))
-         GO TO 200
-
-  115 sum = sum + (dx(i)/xmax)**2
-      GO TO 200
-
-C                  PREPARE FOR PHASE 3.
-
-   75 sum = (sum * xmax) * xmax
-
-C     FOR REAL OR D.P. SET HITEST = CUTHI/N
-C     FOR COMPLEX      SET HITEST = CUTHI/(2*N)
-
-   85 hitest = cuthi/float( n )
-
-C                   PHASE 3.  SUM IS MID-RANGE.  NO SCALING.
-
-      DO 95 j =i,nn,incx
-      IF(ABS(dx(j)) .GE. hitest) GO TO 100
-   95    sum = sum + dx(j)**2
-      dnrm2_ = SQRT( sum )
-      GO TO 300
-
-  200 CONTINUE
-      i = i + incx
-      IF ( i .LE. nn ) GO TO 20
-
-C              END OF MAIN LOOP.
-
-C              COMPUTE SQUARE ROOT AND ADJUST FOR SCALING.
-
-      dnrm2_ = xmax * SQRT(sum)
-  300 CONTINUE
-      RETURN
-      END
-
-      SUBROUTINE  dsrot (n,dx,incx,dy,incy,c,s)
-
-C     APPLIES A PLANE ROTATION.
-C     JACK DONGARRA, LINPACK, 3/11/78.
-
-      DOUBLE PRECISION dx(*),dy(*),dtemp,c,s
-      INTEGER i,incx,incy,ix,iy,n
-
-      IF(n.LE.0)RETURN
-      IF(incx.EQ.1.AND.incy.EQ.1)GO TO 20
-
-C       CODE FOR UNEQUAL INCREMENTS OR EQUAL INCREMENTS NOT EQUAL
-C         TO 1
-
-      ix = 1
-      iy = 1
-      IF(incx.LT.0)ix = (-n+1)*incx + 1
-      IF(incy.LT.0)iy = (-n+1)*incy + 1
-      DO 10 i = 1,n
-        dtemp = c*dx(ix) + s*dy(iy)
-        dy(iy) = c*dy(iy) - s*dx(ix)
-        dx(ix) = dtemp
-        ix = ix + incx
-        iy = iy + incy
-   10 CONTINUE
-      RETURN
-
-C       CODE FOR BOTH INCREMENTS EQUAL TO 1
-
-   20 DO 30 i = 1,n
-        dtemp = c*dx(i) + s*dy(i)
-        dy(i) = c*dy(i) - s*dx(i)
-        dx(i) = dtemp
-   30 CONTINUE
-      RETURN
-      END
-
-      SUBROUTINE dsrotg(da,db,c,s)
-
-C     CONSTRUCT GIVENS PLANE ROTATION.
-C     JACK DONGARRA, LINPACK, 3/11/78.
-C                    MODIFIED 9/27/86.
-
-      DOUBLE PRECISION da,db,c,s,roe,scale,r,z,one,ZERO
-      DATA one, ZERO /1.0d+00, 0.0d+00/
-
-      roe = db
-      IF( ABS(da) .GT. ABS(db) ) roe = da
-      scale = ABS(da) + ABS(db)
-      IF( scale .NE. ZERO ) GO TO 10
-         c = one
-         s = ZERO
-         r = ZERO
-         GO TO 20
-   10 r = scale*SQRT((da/scale)**2 + (db/scale)**2)
-      r = SIGN(one,roe)*r
-      c = da/r
-      s = db/r
-   20 z = s
-      IF( ABS(c) .GT. ZERO .AND. ABS(c) .LE. s ) z = one/c
-      da = r
-      db = z
-      RETURN
-      END
-
-      SUBROUTINE  dscal_sl(n,da,dx,incx)
-
-C     SCALES A VECTOR BY A CONSTANT.
-C     USES UNROLLED LOOPS FOR INCREMENT EQUAL TO ONE.
-C     JACK DONGARRA, LINPACK, 3/11/78.
-
-      DOUBLE PRECISION da,dx(*)
-      INTEGER i,incx,m,mp1,n,nincx
-
-      IF(n.LE.0)RETURN
-      IF(incx.EQ.1)GO TO 20
-
-
-C        CODE FOR INCREMENT NOT EQUAL TO 1
-
-      nincx = n*incx
-      DO 10 i = 1,nincx,incx
-        dx(i) = da*dx(i)
-   10 CONTINUE
-      RETURN
-
-C        CODE FOR INCREMENT EQUAL TO 1
-
-C        CLEAN-UP LOOP
-
-   20 m = MOD(n,5)
-      IF( m .EQ. 0 ) GO TO 40
-      DO 30 i = 1,m
-        dx(i) = da*dx(i)
-   30 CONTINUE
-      IF( n .LT. 5 ) RETURN
-   40 mp1 = m + 1
-      DO 50 i = mp1,n,5
-        dx(i) = da*dx(i)
-        dx(i + 1) = da*dx(i + 1)
-        dx(i + 2) = da*dx(i + 2)
-        dx(i + 3) = da*dx(i + 3)
-        dx(i + 4) = da*dx(i + 4)
-   50 CONTINUE
-      RETURN
-      END
-
-      subroutine bound(n, x, xl, xu)
-      integer n, i
-      double precision x(n), xl(n), xu(n)
-      do i = 1, n
-C        Note that xl(i) and xu(i) may be NaN to indicate no bound
-         if(xl(i).eq.xl(i).and.x(i) < xl(i))then
-            x(i) = xl(i)
-         else if(xu(i).eq.xu(i).and.x(i) > xu(i))then
-            x(i) = xu(i)
-         end if
-      end do
-      end subroutine bound
diff --git a/scipy/optimize/tests/test_bracket.py b/scipy/optimize/tests/test_bracket.py
index ca1b2ced20b1..95996730eca6 100644
--- a/scipy/optimize/tests/test_bracket.py
+++ b/scipy/optimize/tests/test_bracket.py
@@ -249,13 +249,13 @@ def test_input_validation(self, xp):
         with pytest.raises(ValueError, match=message):
             _bracket_root(lambda x: x, -4+1j, 4)
         with pytest.raises(ValueError, match=message):
-            _bracket_root(lambda x: x, -4, 'hello')
+            _bracket_root(lambda x: x, -4, 4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_root(lambda x: x, -4, 4, xmin=np)
+            _bracket_root(lambda x: x, -4, 4, xmin=4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_root(lambda x: x, -4, 4, xmax=object())
+            _bracket_root(lambda x: x, -4, 4, xmax=4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_root(lambda x: x, -4, 4, factor=sum)
+            _bracket_root(lambda x: x, -4, 4, factor=4+1j)
 
         message = "All elements of `factor` must be greater than 1."
         with pytest.raises(ValueError, match=message):
@@ -321,7 +321,7 @@ def f(x):
 
         # 2. bracket endpoint hits root exactly
         f.count = 0
-        res = _bracket_root(f, xp.asarray(5.), xp.asarray(10.), 
+        res = _bracket_root(f, xp.asarray(5.), xp.asarray(10.),
                             factor=2)
 
         assert res.nfev == 4
@@ -330,12 +330,12 @@ def f(x):
 
         # 3. bracket limit hits root exactly
         with np.errstate(over='ignore'):
-            res = _bracket_root(f, xp.asarray(5.), xp.asarray(10.), 
+            res = _bracket_root(f, xp.asarray(5.), xp.asarray(10.),
                                 xmin=0)
         xp_assert_close(res.xl, xp.asarray(0.), atol=1e-15)
 
         with np.errstate(over='ignore'):
-            res = _bracket_root(f, xp.asarray(-10.), xp.asarray(-5.), 
+            res = _bracket_root(f, xp.asarray(-10.), xp.asarray(-5.),
                                 xmax=0)
         xp_assert_close(res.xr, xp.asarray(0.), atol=1e-15)
 
@@ -552,23 +552,21 @@ def test_input_validation(self, xp):
         with pytest.raises(ValueError, match=message):
             _bracket_minimum(lambda x: x**2, xp.asarray(4+1j))
         with pytest.raises(ValueError, match=message):
-            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xl0='hello')
+            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xl0=4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_minimum(lambda x: x**2, xp.asarray(-4),
-                             xr0='farcical aquatic ceremony')
+            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xr0=4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xmin=np)
+            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xmin=4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xmax=object())
+            _bracket_minimum(lambda x: x**2, xp.asarray(-4), xmax=4+1j)
         with pytest.raises(ValueError, match=message):
-            _bracket_minimum(lambda x: x**2, xp.asarray(-4), factor=sum)
+            _bracket_minimum(lambda x: x**2, xp.asarray(-4), factor=4+1j)
 
         message = "All elements of `factor` must be greater than 1."
         with pytest.raises(ValueError, match=message):
             _bracket_minimum(lambda x: x, xp.asarray(-4), factor=0.5)
 
-        message = "shape mismatch: objects cannot be broadcast"
-        # raised by `xp.broadcast, but the traceback is readable IMO
+        message = "Array shapes are incompatible for broadcasting."
         with pytest.raises(ValueError, match=message):
             _bracket_minimum(lambda x: x**2, xp.asarray([-2, -3]), xl0=[-3, -4, -5])
 
@@ -803,8 +801,10 @@ def bracket_minimum_single(xm0, xl0, xr0, xmin, xmax, factor, a):
         factor = rng.random(size=shape) + 1.5
         refs = bracket_minimum_single(xm0, xl0, xr0, xmin, xmax, factor, a).ravel()
         args = tuple(xp.asarray(arg, dtype=xp.float64) for arg in args)
-        res = _bracket_minimum(f, xp.asarray(xm0), xl0=xl0, xr0=xr0, xmin=xmin,
-                               xmax=xmax, factor=factor, args=args, maxiter=maxiter)
+        res = _bracket_minimum(f, xp.asarray(xm0), xl0=xp.asarray(xl0),
+                               xr0=xp.asarray(xr0), xmin=xp.asarray(xmin),
+                               xmax=xp.asarray(xmax), factor=xp.asarray(factor),
+                               args=args, maxiter=maxiter)
 
         attrs = ['xl', 'xm', 'xr', 'fl', 'fm', 'fr', 'success', 'nfev', 'nit']
         for attr in attrs:
diff --git a/scipy/optimize/tests/test_chandrupatla.py b/scipy/optimize/tests/test_chandrupatla.py
index 8714362c5f8b..f582aeb9dc06 100644
--- a/scipy/optimize/tests/test_chandrupatla.py
+++ b/scipy/optimize/tests/test_chandrupatla.py
@@ -282,7 +282,7 @@ def test_convergence(self, xp):
         # Test that the convergence tolerances behave as expected
         rng = np.random.default_rng(2585255913088665241)
         p = xp.asarray(rng.random(size=3))
-        bracket = (xp.asarray(-5), xp.asarray(0), xp.asarray(5))
+        bracket = (xp.asarray(-5, dtype=xp.float64), xp.asarray(0), xp.asarray(5))
         args = (p,)
         kwargs0 = dict(args=args, xatol=0, xrtol=0, fatol=0, frtol=0)
 
@@ -582,7 +582,8 @@ def f(*args, **kwargs):
             return self.f(*args, **kwargs)
         f.f_evals = 0
 
-        res = find_root(f, (xp.asarray(-5.), xp.asarray(5.)), args=args_xp)
+        bracket = xp.asarray(-5., dtype=xp.float64), xp.asarray(5., dtype=xp.float64)
+        res = find_root(f, bracket, args=args_xp)
         refs = find_root_single(p).ravel()
 
         ref_x = [ref.x for ref in refs]
diff --git a/scipy/optimize/tests/test_differentiable_functions.py b/scipy/optimize/tests/test_differentiable_functions.py
index 5c16672f7371..1f8f53b794fd 100644
--- a/scipy/optimize/tests/test_differentiable_functions.py
+++ b/scipy/optimize/tests/test_differentiable_functions.py
@@ -481,8 +481,9 @@ def test_finite_difference_jac(self):
         assert_array_equal(analit.nfev, nfev)
         assert_array_equal(ex.njev, njev)
         assert_array_equal(analit.njev, njev)
-        approx = VectorFunction(ex.fun, x0, '2-point', ex.hess, None, None,
-                                (-np.inf, np.inf), None)
+        # create with defaults for the keyword arguments, to
+        # ensure that the defaults work
+        approx = VectorFunction(ex.fun, x0, '2-point', ex.hess)
         nfev += 3
         assert_array_equal(ex.nfev, nfev)
         assert_array_equal(analit.nfev+approx.nfev, nfev)
@@ -563,6 +564,24 @@ def test_finite_difference_jac(self):
         assert_array_almost_equal(f_analit, f_approx)
         assert_array_almost_equal(J_analit, J_approx)
 
+    def test_updating_on_initial_setup(self):
+        # Check that memoisation works with the freshly created VectorFunction
+        # On initialization vf.f_updated attribute wasn't being set correctly.
+        x0 = np.array([2.5, 3.0])
+        ex = ExVectorialFunction()
+        vf = VectorFunction(ex.fun, x0, ex.jac, ex.hess)
+        assert vf.f_updated
+        assert vf.nfev == 1
+        assert vf.njev == 1
+        assert ex.nfev == 1
+        assert ex.njev == 1
+        vf.fun(x0)
+        vf.jac(x0)
+        assert vf.nfev == 1
+        assert vf.njev == 1
+        assert ex.nfev == 1
+        assert ex.njev == 1
+
     @pytest.mark.fail_slow(5.0)
     def test_workers(self):
         x0 = np.array([2.5, 3.0])
@@ -761,6 +780,29 @@ def test_finite_difference_hess_linear_operator(self):
         assert_array_equal(ex.nhev, nhev)
         assert_array_equal(analit.nhev+approx.nhev, nhev)
 
+    def test_fgh_overlap(self):
+        # VectorFunction.fun/jac should return copies of internal attributes
+        ex = ExVectorialFunction()
+        x0 = np.array([1.0, 0.0])
+
+        vf = VectorFunction(ex.fun, x0, '3-point', ex.hess, None, None,
+                            (-np.inf, np.inf), None)
+        f = vf.fun(np.array([1.1, 0.1]))
+        J = vf.jac([1.1, 0.1])
+        assert vf.f is not f
+        assert vf.J is not J
+        assert_equal(f, vf.f)
+        assert_equal(J, vf.J)
+
+        vf = VectorFunction(ex.fun, x0, ex.jac, ex.hess, None, None,
+                            (-np.inf, np.inf), None)
+        f = vf.fun(np.array([1.1, 0.1]))
+        J = vf.jac([1.1, 0.1])
+        assert vf.f is not f
+        assert vf.J is not J
+        assert_equal(f, vf.f)
+        assert_equal(J, vf.J)
+
     @pytest.mark.thread_unsafe
     def test_x_storage_overlap(self):
         # VectorFunction should not store references to arrays, it should
@@ -818,6 +860,44 @@ def test_float_size(self):
         res = vf.jac(x0)
         assert res.dtype == np.float32
 
+    def test_sparse_analytic_jac(self):
+        ex = ExVectorialFunction()
+        x0 = np.array([1.0, 0.0])
+        def sparse_adapter(func):
+            def inner(x):
+                f_x = func(x)
+                return csr_array(f_x)
+            return inner
+
+        # jac(x) returns dense jacobian
+        vf1 = VectorFunction(ex.fun, x0, ex.jac, ex.hess, None, None,
+                            (-np.inf, np.inf), sparse_jacobian=None)
+        # jac(x) returns sparse jacobian, but sparse_jacobian=False requests dense
+        vf2 = VectorFunction(ex.fun, x0, sparse_adapter(ex.jac), ex.hess, None, None,
+                            (-np.inf, np.inf), sparse_jacobian=False)
+
+        res1 = vf1.jac(x0 + 1)
+        res2 = vf2.jac(x0 + 1)
+        assert_equal(res1, res2)
+
+    def test_sparse_numerical_jac(self):
+        ex = ExVectorialFunction()
+        x0 = np.array([1.0, 0.0])
+        N = len(x0)
+
+        # normal dense numerical difference
+        vf1 = VectorFunction(ex.fun, x0, '2-point', ex.hess, None, None,
+                             (-np.inf, np.inf), sparse_jacobian=None)
+        # use sparse numerical difference, but ask it to be converted to dense
+        finite_diff_jac_sparsity = csr_array(np.ones((N, N)))
+        vf2 = VectorFunction(ex.fun, x0, '2-point', ex.hess, None,
+                             finite_diff_jac_sparsity, (-np.inf, np.inf),
+                             sparse_jacobian=False)
+
+        res1 = vf1.jac(x0 + 1)
+        res2 = vf2.jac(x0 + 1)
+        assert_equal(res1, res2)
+
 
 def test_LinearVectorFunction():
     A_dense = np.array([
@@ -911,7 +991,6 @@ def test_ScalarFunctionNoReferenceCycle():
     platform.python_implementation() == "PyPy",
     reason="assert_deallocate not available on PyPy"
 )
-@pytest.mark.xfail(reason="TODO remove reference cycle from VectorFunction")
 def test_VectorFunctionNoReferenceCycle():
     """Regression test for gh-20768."""
     ex = ExVectorialFunction()
diff --git a/scipy/optimize/tests/test_least_squares.py b/scipy/optimize/tests/test_least_squares.py
index 0cfecc20d85f..0b1614b2673c 100644
--- a/scipy/optimize/tests/test_least_squares.py
+++ b/scipy/optimize/tests/test_least_squares.py
@@ -36,6 +36,15 @@ def fun_rosenbrock(x):
     return np.array([10 * (x[1] - x[0]**2), (1 - x[0])])
 
 
+class Fun_Rosenbrock:
+    def __init__(self):
+        self.nfev = 0
+
+    def __call__(self, x, a=0):
+        self.nfev += 1
+        return fun_rosenbrock(x)
+
+
 def jac_rosenbrock(x):
     return np.array([
         [-20 * x[0], 10],
@@ -235,19 +244,13 @@ def test_x_scale_options(self):
                       2.0, x_scale=1.0+2.0j, method=self.method)
 
     def test_diff_step(self):
-        # res1 and res2 should be equivalent.
-        # res2 and res3 should be different.
         res1 = least_squares(fun_trivial, 2.0, diff_step=1e-1,
                              method=self.method)
-        res2 = least_squares(fun_trivial, 2.0, diff_step=-1e-1,
-                             method=self.method)
         res3 = least_squares(fun_trivial, 2.0,
                              diff_step=None, method=self.method)
         assert_allclose(res1.x, 0, atol=1e-4)
-        assert_allclose(res2.x, 0, atol=1e-4)
         assert_allclose(res3.x, 0, atol=1e-4)
-        assert_equal(res1.x, res2.x)
-        assert_equal(res1.nfev, res2.nfev)
+
 
     def test_incorrect_options_usage(self):
         assert_raises(TypeError, least_squares, fun_trivial, 2.0,
@@ -267,7 +270,6 @@ def test_full_result(self):
         assert_allclose(res.optimality, 0, atol=1e-2)
         assert_equal(res.active_mask, 0)
         if self.method == 'lm':
-            assert_(res.nfev < 30)
             assert_(res.njev is None)
         else:
             assert_(res.nfev < 10)
@@ -295,6 +297,17 @@ def test_full_result_single_fev(self):
         assert_equal(res.status, 0)
         assert_equal(res.success, 0)
 
+    def test_nfev(self):
+        # checks that the reported nfev matches the true number of function calls
+        for i in range(1, 3):
+            rng = np.random.default_rng(128908)
+            x0 = rng.uniform(size=2) * 10
+            ftrivial = Fun_Rosenbrock()
+            res = least_squares(
+               ftrivial, x0, jac=jac_rosenbrock, method=self.method, max_nfev=i
+            )
+            assert res.nfev == ftrivial.nfev
+
     def test_rosenbrock(self):
         x0 = [-2, 1]
         x_opt = [1, 1]
diff --git a/scipy/optimize/tests/test_nnls.py b/scipy/optimize/tests/test_nnls.py
index 911ec10c9fd7..7ff42e2de884 100644
--- a/scipy/optimize/tests/test_nnls.py
+++ b/scipy/optimize/tests/test_nnls.py
@@ -433,6 +433,37 @@ def test_atol_deprecation_warning(self):
         """Test that using atol parameter triggers deprecation warning"""
         a = np.array([[1, 0], [1, 0], [0, 1]])
         b = np.array([2, 1, 1])
-        
+
         with pytest.warns(DeprecationWarning, match="{'atol'}"):
             nnls(a, b, atol=1e-8)
+
+    def test_2D_singleton_RHS_input(self):
+        # Test that a 2D singleton RHS input is accepted
+        A = np.array([[1.0, 0.5, -1.],
+                      [1.0, 0.5, 0.0],
+                      [-1., 0.0, 1.0]])
+        b = np.array([[-1.0, 2.0, 2.0]]).T
+        x, r = nnls(A, b)
+        assert_allclose(x, np.array([1.0, 2.0, 3.0]))
+        assert_allclose(r, 0.0)
+
+    def test_2D_not_singleton_RHS_input_2(self):
+        # Test that a 2D but not a column vector RHS input is rejected
+        A = np.array([[1.0, 0.5, -1.],
+                      [1.0, 0.5, 0.0],
+                      [1.0, 0.5, 0.0],
+                      [0.0, 0.0, 1.0]])
+        b = np.ones(shape=[4, 2], dtype=np.float64)
+        with pytest.raises(ValueError, match="Expected a 1D array"):
+            nnls(A, b)
+
+    def test_gh_22791_32bit(self):
+        # Scikit-learn got hit by this problem on 32-bit arch.
+        desired = [0, 0, 1.05617285, 0, 0, 0, 0, 0.23123048, 0, 0, 0, 0.26128651]
+        rng = np.random.RandomState(42)
+        n_samples, n_features = 5, 12
+        X = rng.randn(n_samples, n_features)
+        X[:2, :] = 0
+        y = rng.randn(n_samples)
+        coef, _ = nnls(X, y)
+        assert_allclose(coef, desired)
diff --git a/scipy/optimize/tests/test_optimize.py b/scipy/optimize/tests/test_optimize.py
index ff4140f65df2..5886ad0b2f48 100644
--- a/scipy/optimize/tests/test_optimize.py
+++ b/scipy/optimize/tests/test_optimize.py
@@ -3039,7 +3039,7 @@ def test_equal_bounds(method, kwds, bound_type, constraints, callback):
 
     # compare the output of a solution with FD vs that of an analytic grad
     assert res.success
-    assert_allclose(res.fun, expected.fun, rtol=1.5e-6)
+    assert_allclose(res.fun, expected.fun, rtol=2e-6)
     assert_allclose(res.x, expected.x, rtol=5e-4)
 
     if fd_needed or kwds['jac'] is False:
diff --git a/scipy/optimize/tests/test_slsqp.py b/scipy/optimize/tests/test_slsqp.py
index 45216aa296b5..33b87fc14bb5 100644
--- a/scipy/optimize/tests/test_slsqp.py
+++ b/scipy/optimize/tests/test_slsqp.py
@@ -592,7 +592,6 @@ def target(x):
         # The problem is infeasible, so it cannot succeed
         assert not res.success
 
-    @pytest.mark.thread_unsafe
     def test_parameters_stay_within_bounds(self):
         # gh11403. For some problems the SLSQP Fortran code suggests a step
         # outside one of the lower/upper bounds. When this happens
@@ -607,7 +606,40 @@ def test_parameters_stay_within_bounds(self):
         def f(x):
             assert (x >= bounds.lb).all()
             return np.linalg.norm(x)
+        # The following should not emit any warnings; the old Fortran code
+        # emitted a RuntimeWarning for this problem.
+        res = minimize(f, x0, method='SLSQP', bounds=bounds)
+        assert res.success
+
+
+def test_slsqp_segfault_wrong_workspace_computation():
+    # See gh-14915
+    # This problem is not well-defined; however, it should not cause a segfault.
+    # The previous F77 workspace computation did not handle only equality-
+    # constrained problems correctly.
+    rng = np.random.default_rng(1742651087222879)
+    x = rng.uniform(size=[22,365])
+    target = np.linspace(0.9, 4.0, 50)
+
+    def metric(v, weights):
+        return [[0, 0],[1, 1]]
+
+    def efficient_metric(v, target):
+        def metric_a(weights):
+            return metric(v, weights)[1][0]
+
+        def metric_b(weights, v):
+            return metric(v, weights)[0][0]
+
+        constraints = ({'type': 'eq', 'fun': lambda x: metric_a(x) - target},
+                       {'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
+        weights = np.array([len(v)*[1./len(v)]])[0]
+        result = minimize(metric_b,
+                          weights,
+                          args=(v,),
+                          method='SLSQP',
+                          constraints=constraints)
+        return result
+
+    efficient_metric(x, target)
 
-        with pytest.warns(RuntimeWarning, match='x were outside bounds'):
-            res = minimize(f, x0, method='SLSQP', bounds=bounds)
-            assert res.success
diff --git a/scipy/signal/_short_time_fft.py b/scipy/signal/_short_time_fft.py
index d43370a18307..5da4017fce75 100644
--- a/scipy/signal/_short_time_fft.py
+++ b/scipy/signal/_short_time_fft.py
@@ -381,14 +381,14 @@ class ShortTimeFFT:
 
     It is possible to calculate the STFT of signal parts:
 
-    >>> p_q = SFT.nearest_k_p(N // 2)
-    >>> Sx0 = SFT.stft(x[:p_q])
-    >>> Sx1 = SFT.stft(x[p_q:])
+    >>> N2 = SFT.nearest_k_p(N // 2)
+    >>> Sx0 = SFT.stft(x[:N2])
+    >>> Sx1 = SFT.stft(x[N2:])
 
     When assembling sequential STFT parts together, the overlap needs to be
     considered:
 
-    >>> p0_ub = SFT.upper_border_begin(p_q)[1] - SFT.p_min
+    >>> p0_ub = SFT.upper_border_begin(N2)[1] - SFT.p_min
     >>> p1_le = SFT.lower_border_end[1] - SFT.p_min
     >>> Sx01 = np.hstack((Sx0[:, :p0_ub],
     ...                   Sx0[:, p0_ub:] + Sx1[:, :p1_le],
@@ -1675,7 +1675,15 @@ def p_min(self) -> int:
 
     @lru_cache(maxsize=256)
     def _post_padding(self, n: int) -> tuple[int, int]:
-        """Largest signal index and slice index due to padding."""
+        """Largest signal index and slice index due to padding.
+
+        Parameters
+        ----------
+        n : int
+            Number of samples of input signal (must be ≥ half of the window length).
+        """
+        if not (n >= (m2p := self.m_num - self.m_num_mid)):
+            raise ValueError(f"Parameter n must be >= ceil(m_num/2) = {m2p}!")
         w2 = self.win.real**2 + self.win.imag**2
         # move window to the right until the overlap for t < t[n] vanishes:
         q1 = n // self.hop   # last slice index with t[p1] <= t[n]
@@ -1696,6 +1704,11 @@ def k_max(self, n: int) -> int:
         A detailed example is provided in the :ref:`tutorial_stft_sliding_win`
         section of the :ref:`user_guide`.
 
+        Parameters
+        ----------
+        n : int
+            Number of samples of input signal (must be ≥ half of the window length).
+
         See Also
         --------
         k_min: The smallest possible signal index.
@@ -1797,6 +1810,19 @@ def upper_border_begin(self, n: int) -> tuple[int, int]:
         A detailed example is given :ref:`tutorial_stft_sliding_win` section
         of the :ref:`user_guide`.
 
+        Parameters
+        ----------
+        n : int
+            Number of samples of input signal (must be ≥ half of the window length).
+
+        Returns
+        -------
+        k_ub : int
+            Lowest signal index, where a touching time slice sticks out past the
+            signal end.
+        p_ub : int
+            Lowest index of time slice of which the end sticks out past the signal end.
+
         See Also
         --------
         k_min: The smallest possible signal index.
@@ -1808,13 +1834,15 @@ def upper_border_begin(self, n: int) -> tuple[int, int]:
         p_range: Determine and validate slice index range.
         ShortTimeFFT: Class this method belongs to.
         """
+        if not (n >= (m2p := self.m_num - self.m_num_mid)):
+            raise ValueError(f"Parameter n must be >= ceil(m_num/2) = {m2p}!")
         w2 = self.win.real**2 + self.win.imag**2
         q2 = n // self.hop + 1  # first t[q] >= t[n]
         q1 = max((n-self.m_num) // self.hop - 1, -1)
         # move window left until does not stick out to the right:
         for q_ in range(q2, q1, -1):
             k_ = q_ * self.hop + (self.m_num - self.m_num_mid)
-            if k_ < n or all(w2[n-k_:] == 0):
+            if k_ <= n or all(w2[n-k_:] == 0):
                 return (q_ + 1) * self.hop - self.m_num_mid, q_ + 1
         return 0, 0  # border starts at first slice
 
diff --git a/scipy/signal/_spline_filters.py b/scipy/signal/_spline_filters.py
index 920becdffbf7..e3d080cc9cae 100644
--- a/scipy/signal/_spline_filters.py
+++ b/scipy/signal/_spline_filters.py
@@ -5,7 +5,7 @@
                    moveaxis, abs, complex64, float32)
 import numpy as np
 
-from scipy._lib._array_api import array_namespace
+from scipy._lib._array_api import array_namespace, xp_promote
 
 from scipy._lib._util import normalize_axis_index
 
@@ -714,8 +714,8 @@ def symiirorder1(signal, c0, z1, precision=-1.0):
         The filtered signal.
     """
     xp = array_namespace(signal)
-
-    # internals of symiirorder1 are numpy-only
+    signal = xp_promote(signal, force_floating=True, xp=xp)
+    # This function uses C internals
     signal = np.asarray(signal)
 
     if abs(z1) >= 1:
@@ -729,9 +729,6 @@ def symiirorder1(signal, c0, z1, precision=-1.0):
         signal = signal[None, :]
         squeeze_dim = True
 
-    if np.issubdtype(signal.dtype, np.integer):
-        signal = signal.astype(np.promote_types(signal.dtype, np.float32))
-
     y0 = symiirorder1_ic(signal, z1, precision)
 
     # Apply first the system 1 / (1 - z1 * z^-1)
@@ -797,10 +794,9 @@ def symiirorder2(input, r, omega, precision=-1.0):
         The filtered signal.
     """
     xp = array_namespace(input)
-
-    # internals are numpy-only
-    input = np.asarray(input)
-    omega = np.asarray(omega)
+    input = xp_promote(input, force_floating=True, xp=xp)
+    # This function uses C internals
+    input = np.ascontiguousarray(input)
 
     if r >= 1.0:
         raise ValueError('r must be less than 1.0')
@@ -808,22 +804,16 @@ def symiirorder2(input, r, omega, precision=-1.0):
     if input.ndim > 2:
         raise ValueError('Input must be 1D or 2D')
 
-    if not input.flags.c_contiguous:
-        input = input.copy()
-
     squeeze_dim = False
     if input.ndim == 1:
         input = input[None, :]
         squeeze_dim = True
 
-    if np.issubdtype(input.dtype, np.integer):
-        input = input.astype(np.promote_types(input.dtype, np.float32))
-
     rsq = r * r
-    a2 = 2 * r * np.cos(omega)
+    a2 = 2 * r * math.cos(omega)
     a3 = -rsq
-    cs = np.atleast_1d(1 - 2 * r * np.cos(omega) + rsq)
-    sos = np.atleast_2d(np.r_[cs, 0, 0, 1, -a2, -a3]).astype(input.dtype)
+    cs = 1 - 2 * r * math.cos(omega) + rsq
+    sos = np.asarray([cs, 0, 0, 1, -a2, -a3], dtype=input.dtype)
 
     # Find the starting (forward) conditions.
     ic_fwd = symiirorder2_ic_fwd(input, r, omega, precision)
@@ -831,7 +821,7 @@ def symiirorder2(input, r, omega, precision=-1.0):
     # Apply first the system cs / (1 - a2 * z^-1 - a3 * z^-2)
     # Compute the initial conditions in the form expected by sosfilt
     # coef = np.asarray([[a3, a2], [0, a3]], dtype=input.dtype)
-    coef = np.r_[a3, a2, 0, a3].reshape(2, 2).astype(input.dtype)
+    coef = np.asarray([[a3, a2], [0, a3]], dtype=input.dtype)
     zi = np.matmul(coef, ic_fwd[:, :, None])[:, :, 0]
 
     y_fwd, _ = sosfilt(sos, axis_slice(input, 2), zi=zi[None])
diff --git a/scipy/signal/tests/_scipy_spectral_test_shim.py b/scipy/signal/tests/_scipy_spectral_test_shim.py
index c23f310bcae4..42d3d830d0e3 100644
--- a/scipy/signal/tests/_scipy_spectral_test_shim.py
+++ b/scipy/signal/tests/_scipy_spectral_test_shim.py
@@ -103,7 +103,7 @@ def _stft_wrapper(x, fs=1.0, window='hann', nperseg=256, noverlap=None,
         # This is an edge case where shortTimeFFT returns one more time slice
         # than the Scipy stft() shorten to remove last time slice:
         if n % 2 == 1 and nperseg % 2 == 1 and noverlap % 2 == 1:
-            x = x[..., :axis - 1]
+            x = x[..., : -1]
 
         nadd = (-(x.shape[-1]-nperseg) % nstep) % nperseg
         zeros_shape = list(x.shape[:-1]) + [nadd]
@@ -124,11 +124,8 @@ def _stft_wrapper(x, fs=1.0, window='hann', nperseg=256, noverlap=None,
     k_off = nperseg // 2
     p0 = 0  # ST.lower_border_end[1] + 1
     nn = x.shape[axis] if padded else n+k_off+1
-    p1 = ST.upper_border_begin(nn)[1]  # ST.p_max(n) + 1
-
-    # This is bad hack to pass the test test_roundtrip_boundary_extension():
-    if padded is True and nperseg - noverlap == 1:
-        p1 -= nperseg // 2 - 1  # the reasoning behind this is not clear to me
+    # number of frames akin to legacy stft computation
+    p1 = (x.shape[axis] - nperseg) // nstep + 1 
 
     detr = None if detrend is False else detrend
     Sxx = ST.stft_detrend(x, detr, p0, p1, k_offset=k_off, axis=axis)
@@ -136,11 +133,6 @@ def _stft_wrapper(x, fs=1.0, window='hann', nperseg=256, noverlap=None,
     if x.dtype in (np.float32, np.complex64):
         Sxx = Sxx.astype(np.complex64)
 
-    # workaround for test_average_all_segments() - seems to be buggy behavior:
-    if boundary is None and padded is False:
-        t, Sxx = t[1:-1], Sxx[..., :-2]
-        t -= k_off / fs
-
     return ST.f, t, Sxx
 
 
diff --git a/scipy/signal/tests/test_short_time_fft.py b/scipy/signal/tests/test_short_time_fft.py
index 5e7c122c4aa5..afdeec5121eb 100644
--- a/scipy/signal/tests/test_short_time_fft.py
+++ b/scipy/signal/tests/test_short_time_fft.py
@@ -531,7 +531,11 @@ def test_border_values():
     assert SFT.p_max(10) == 4
     assert SFT.k_max(10) == 16
     assert SFT.upper_border_begin(10) == (4, 2)
-
+    # Raise exceptions:
+    with pytest.raises(ValueError, match="^Parameter n must be"):
+        SFT.upper_border_begin(3)
+    with pytest.raises(ValueError, match="^Parameter n must be"):
+        SFT._post_padding(3)
 
 def test_border_values_exotic():
     """Ensure that the border calculations are correct for windows with
@@ -541,7 +545,11 @@ def test_border_values_exotic():
     assert SFT.lower_border_end == (0, 0)
 
     SFT = ShortTimeFFT(np.flip(w), hop=20, fs=1)
-    assert SFT.upper_border_begin(4) == (0, 0)
+    assert SFT.upper_border_begin(4) == (16, 1)
+    assert SFT.upper_border_begin(5) == (16, 1)
+    assert SFT.upper_border_begin(23) == (36, 2)
+    assert SFT.upper_border_begin(24) == (36, 2)
+    assert SFT.upper_border_begin(25) == (36, 2)
 
     SFT._hop = -1  # provoke unreachable line
     with pytest.raises(RuntimeError):
diff --git a/scipy/signal/tests/test_splines.py b/scipy/signal/tests/test_splines.py
index 0d2afa42e46c..185ce2ba2d65 100644
--- a/scipy/signal/tests/test_splines.py
+++ b/scipy/signal/tests/test_splines.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 import scipy._lib.array_api_extra as xpx
-from scipy._lib._array_api import xp_assert_close, is_cupy
+from scipy._lib._array_api import is_cupy, xp_assert_close, xp_default_dtype
 
 from scipy.signal._spline import (
     symiirorder1_ic, symiirorder2_ic_fwd, symiirorder2_ic_bwd)
@@ -190,7 +190,6 @@ def test_symiir1_values(self, dtype, xp):
             0.19982875, 0.20355805, 0.47378628, 0.57232247, 0.51597393,
             0.25935107, 0.31438554, 0.41096728, 0.4190693 , 0.25812255,
             0.33671467], dtype=res.dtype)
-        assert res.dtype == dtype
         atol = {xp.float64: 1e-15, xp.float32: 1e-7}[dtype]
         xp_assert_close(res, exp_res, atol=atol)
 
@@ -332,10 +331,10 @@ def test_symiir2_initial_bwd(self, dtype, precision, xp):
     def test_symiir2(self, dtype, precision, xp):
         dtype = getattr(xp, dtype)
 
-        r = xp.asarray(0.5, dtype=dtype)
-        omega = xp.asarray(xp.pi / 3.0, dtype=dtype)
-        cs = 1 - 2 * r * xp.cos(omega) + r * r
-        a2 = 2 * r * xp.cos(omega)
+        r = 0.5
+        omega = math.pi / 3.0
+        cs = 1 - 2 * r * math.cos(omega) + r * r
+        a2 = 2 * r * math.cos(omega)
         a3 = -r * r
 
         n = 100
@@ -367,13 +366,15 @@ def test_symiir2(self, dtype, precision, xp):
         out = symiirorder2(signal, r, omega, precision)
         xp_assert_close(out, exp, atol=4e-6, rtol=6e-7)
 
-    @skip_xp_backends(cpu_only=True)
+    @skip_xp_backends(cpu_only=True, exceptions=["cupy"], reason="C internals")
     @pytest.mark.parametrize('dtyp', ['float32', 'float64'])
     def test_symiir2_values(self, dtyp, xp):
         rng = np.random.RandomState(1234)
         s = rng.uniform(size=16).astype(dtyp)
         s = xp.asarray(s)
-        dtyp = getattr(xp, dtyp)
+
+        # cupy returns f64 for f32 inputs
+        dtype = xp.float64 if is_cupy(xp) else getattr(xp, dtyp)
 
         res = symiirorder2(s, 0.1, 0.1, precision=1e-10)
 
@@ -382,13 +383,9 @@ def test_symiir2_values(self, dtyp, xp):
             [0.26572609, 0.53408018, 0.51032696, 0.72115829, 0.69486885,
              0.3649055 , 0.37349478, 0.74165032, 0.89718521, 0.80582483,
              0.46758053, 0.51898709, 0.65025605, 0.65394321, 0.45273595,
-             0.53539183], dtype=dtyp
+             0.53539183], dtype=dtype
         )
 
-        if not is_cupy(xp):
-            # cupy returns f64 for f32 inputs
-            assert res.dtype == dtyp
-
         # The values in SciPy 1.14 agree with those in SciPy 1.9.1 to this
         # accuracy only. Implementation differences are twofold:
         # 1. boundary conditions are computed differently
@@ -397,7 +394,7 @@ def test_symiir2_values(self, dtyp, xp):
         # test_symiir2_initial_{fwd,bwd} above, so the difference is likely
         # due to a different way roundoff errors accumulate in the filter.
         # In that respect, sosfilt is likely doing a better job.
-        xp_assert_close(res, exp_res, atol=2e-6, check_dtype=False)
+        xp_assert_close(res, exp_res, atol=2e-6)
 
         I1 = xp.asarray(1 + 1j, dtype=xp.result_type(s, xp.complex64))
         s = s * I1
@@ -405,7 +402,7 @@ def test_symiir2_values(self, dtyp, xp):
         with pytest.raises((TypeError, ValueError)):
             res = symiirorder2(s, 0.5, 0.1)
 
-    @skip_xp_backends(cpu_only=True)
+    @skip_xp_backends(cpu_only=True, exceptions=["cupy"], reason="C internals")
     @xfail_xp_backends("cupy", reason="cupy does not accept integer arrays")
     def test_symiir1_integer_input(self, xp):
         s = xp.where(
@@ -413,11 +410,11 @@ def test_symiir1_integer_input(self, xp):
             xp.asarray(-1),
             xp.asarray(1),
         )
-        expected = symiirorder1(xp.astype(s, xp.float64), 0.5, 0.5)
+        expected = symiirorder1(xp.astype(s, xp_default_dtype(xp)), 0.5, 0.5)
         out = symiirorder1(s, 0.5, 0.5)
         xp_assert_close(out, expected)
 
-    @skip_xp_backends(cpu_only=True)
+    @skip_xp_backends(cpu_only=True, exceptions=["cupy"], reason="C internals")
     @xfail_xp_backends("cupy", reason="cupy does not accept integer arrays")
     def test_symiir2_integer_input(self, xp):
         s = xp.where(
@@ -425,6 +422,6 @@ def test_symiir2_integer_input(self, xp):
             xp.asarray(-1),
             xp.asarray(1),
         )
-        expected = symiirorder2(xp.astype(s, xp.float64), 0.5, xp.pi / 3.0)
+        expected = symiirorder2(xp.astype(s, xp_default_dtype(xp)), 0.5, xp.pi / 3.0)
         out = symiirorder2(s, 0.5, xp.pi / 3.0)
         xp_assert_close(out, expected)
diff --git a/scipy/signal/tests/test_windows.py b/scipy/signal/tests/test_windows.py
index 0a4cf945e8b3..1dad96494b5a 100644
--- a/scipy/signal/tests/test_windows.py
+++ b/scipy/signal/tests/test_windows.py
@@ -502,6 +502,10 @@ def test_basic(self, xp):
 class TestKaiserBesselDerived:
 
     def test_basic(self, xp):
+        # cover case `M < 1`
+        w = windows.kaiser_bessel_derived(0.5, beta=4.0, xp=xp)
+        xp_assert_equal(w, xp.asarray([]))
+
         M = 100
         w = windows.kaiser_bessel_derived(M, beta=4.0, xp=xp)
         w2 = windows.get_window(('kaiser bessel derived', 4.0),
diff --git a/scipy/signal/windows/_windows.py b/scipy/signal/windows/_windows.py
index d7faeacf1a0e..d7ddde67a2fc 100644
--- a/scipy/signal/windows/_windows.py
+++ b/scipy/signal/windows/_windows.py
@@ -1379,7 +1379,7 @@ def kaiser_bessel_derived(M, beta, *, sym=True, xp=None, device=None):
             "shapes"
         )
     elif M < 1:
-        return xp.array([])
+        return xp.asarray([])
     elif M % 2:
         raise ValueError(
             "Kaiser-Bessel Derived windows are only defined for even number "
@@ -2319,8 +2319,8 @@ def _fftautocorr(x):
     x_fft = sp_fft.rfft(x, use_N, axis=-1)
     cxy = sp_fft.irfft(x_fft * x_fft.conj(), n=use_N)[:, :N]
     # Or equivalently (but in most cases slower):
-    # cxy = xp.array([xp.convolve(xx, yy[::-1], mode='full')
-    #                 for xx, yy in zip(x, x)])[:, N-1:2*N-1]
+    # cxy = xp.asarray([xp.convolve(xx, yy[::-1], mode='full')
+    #                   for xx, yy in zip(x, x)])[:, N-1:2*N-1]
     return cxy
 
 
diff --git a/scipy/sparse/linalg/_dsolve/_superlu_utils.c b/scipy/sparse/linalg/_dsolve/_superlu_utils.c
index 49b928a4312d..63951b7f33f1 100644
--- a/scipy/sparse/linalg/_dsolve/_superlu_utils.c
+++ b/scipy/sparse/linalg/_dsolve/_superlu_utils.c
@@ -15,18 +15,8 @@
    been allocated.  (It's ok to FREE unallocated memory)---will be ignored.
 */
 
-#ifndef WITH_THREAD
-static SuperLUGlobalObject superlu_py_global = {0};
-#endif
-
 static SuperLUGlobalObject *get_tls_global(void)
 {
-#ifndef WITH_THREAD
-    if (superlu_py_global.memory_dict == NULL) {
-        superlu_py_global.memory_dict = PyDict_New();
-    }
-    return &superlu_py_global;
-#else
     PyObject *thread_dict;
     SuperLUGlobalObject *obj;
     const char *key = "scipy.sparse.linalg._dsolve._superlu.__global_object";
@@ -53,7 +43,6 @@ static SuperLUGlobalObject *get_tls_global(void)
     PyDict_SetItemString(thread_dict, key, (PyObject *)obj);
 
     return obj;
-#endif
 }
 
 jmp_buf *superlu_python_jmpbuf(void)
diff --git a/scipy/sparse/linalg/_dsolve/tests/test_linsolve.py b/scipy/sparse/linalg/_dsolve/tests/test_linsolve.py
index 660e3baae015..e9e101c3166d 100644
--- a/scipy/sparse/linalg/_dsolve/tests/test_linsolve.py
+++ b/scipy/sparse/linalg/_dsolve/tests/test_linsolve.py
@@ -4,7 +4,6 @@
 import numpy as np
 from numpy import array, finfo, arange, eye, all, unique, ones, dot
 from numpy.exceptions import ComplexWarning
-import numpy.random as random
 from numpy.testing import (
         assert_array_almost_equal, assert_almost_equal,
         assert_equal, assert_array_equal, assert_, assert_allclose,
@@ -61,7 +60,6 @@ def setup_method(self):
         d = arange(n) + 1
         self.n = n
         self.A = dia_array(((d, 2*d, d[::-1]), (-3, 0, 5)), shape=(n,n)).tocsc()
-        random.seed(1234)
 
     def _check_singular(self):
         A = csc_array((5,5), dtype='d')
@@ -71,7 +69,8 @@ def _check_singular(self):
     def _check_non_singular(self):
         # Make a diagonal dominant, to make sure it is not singular
         n = 5
-        a = csc_array(random.rand(n, n))
+        rng = np.random.default_rng(14332)
+        a = csc_array(rng.random((n, n)))
         b = ones(n)
 
         expected = splu(a).solve(b)
@@ -113,9 +112,11 @@ def test_factorizes_nonsquare_matrix_with_umfpack(self):
     def test_call_with_incorrectly_sized_matrix_without_umfpack(self):
         use_solver(useUmfpack=False)
         solve = factorized(self.A)
-        b = random.rand(4)
-        B = random.rand(4, 3)
-        BB = random.rand(self.n, 3, 9)
+
+        rng = np.random.default_rng(230498)
+        b = rng.random(4)
+        B = rng.random((4, 3))
+        BB = rng.random((self.n, 3, 9))
 
         with assert_raises(ValueError, match="is of incompatible size"):
             solve(b)
@@ -129,9 +130,11 @@ def test_call_with_incorrectly_sized_matrix_without_umfpack(self):
     def test_call_with_incorrectly_sized_matrix_with_umfpack(self):
         use_solver(useUmfpack=True)
         solve = factorized(self.A)
-        b = random.rand(4)
-        B = random.rand(4, 3)
-        BB = random.rand(self.n, 3, 9)
+
+        rng = np.random.default_rng(643095823)
+        b = rng.random(4)
+        B = rng.random((4, 3))
+        BB = rng.random((self.n, 3, 9))
 
         # does not raise
         solve(b)
@@ -144,7 +147,8 @@ def test_call_with_incorrectly_sized_matrix_with_umfpack(self):
     def test_call_with_cast_to_complex_without_umfpack(self):
         use_solver(useUmfpack=False)
         solve = factorized(self.A)
-        b = random.rand(4)
+        rng = np.random.default_rng(23454)
+        b = rng.random(4)
         for t in [np.complex64, np.complex128]:
             with assert_raises(TypeError, match="Cannot cast array data"):
                 solve(b.astype(t))
@@ -153,7 +157,8 @@ def test_call_with_cast_to_complex_without_umfpack(self):
     def test_call_with_cast_to_complex_with_umfpack(self):
         use_solver(useUmfpack=True)
         solve = factorized(self.A)
-        b = random.rand(4)
+        rng = np.random.default_rng(23454)
+        b = rng.random(4)
         for t in [np.complex64, np.complex128]:
             assert_warns(ComplexWarning, solve, b.astype(t))
 
@@ -246,8 +251,8 @@ def test_bvector_smoketest(self):
                         [1., 0., 1.],
                         [0., 0., 1.]])
         As = csc_array(Adense)
-        random.seed(1234)
-        x = random.randn(3)
+        rng = np.random.default_rng(1234)
+        x = rng.standard_normal(3)
         b = As@x
         x2 = spsolve(As, b)
 
@@ -258,8 +263,8 @@ def test_bmatrix_smoketest(self):
                         [1., 0., 1.],
                         [0., 0., 1.]])
         As = csc_array(Adense)
-        random.seed(1234)
-        x = random.randn(3, 4)
+        rng = np.random.default_rng(1234)
+        x = rng.standard_normal((3, 4))
         Bdense = As.dot(x)
         Bs = csc_array(Bdense)
         x2 = spsolve(As, Bs)
@@ -452,7 +457,6 @@ def setup_method(self):
         d = arange(n) + 1
         self.n = n
         self.A = dia_array(((d, 2*d, d[::-1]), (-3, 0, 5)), shape=(n, n)).tocsc()
-        random.seed(1234)
 
     def _smoketest(self, spxlu, check, dtype, idx_dtype):
         if np.issubdtype(dtype, np.complexfloating):
@@ -465,7 +469,7 @@ def _smoketest(self, spxlu, check, dtype, idx_dtype):
         A.indptr = A.indptr.astype(idx_dtype, copy=False)
         lu = spxlu(A)
 
-        rng = random.RandomState(1234)
+        rng = np.random.RandomState(1234)
 
         # Input shapes
         for k in [None, 1, 2, self.n, self.n+2]:
@@ -552,7 +556,7 @@ def test_spilu_nnz0(self):
     def test_splu_basic(self):
         # Test basic splu functionality.
         n = 30
-        rng = random.RandomState(12)
+        rng = np.random.RandomState(12)
         a = rng.rand(n, n)
         a[a < 0.95] = 0
         # First test with a singular matrix
@@ -572,7 +576,8 @@ def test_splu_basic(self):
     def test_splu_perm(self):
         # Test the permutation vectors exposed by splu.
         n = 30
-        a = random.random((n, n))
+        rng = np.random.default_rng(1342354)
+        a = rng.random((n, n))
         a[a < 0.95] = 0
         # Make a diagonal dominant, to make sure it is not singular
         a += 4*eye(n)
@@ -621,7 +626,8 @@ def test_natural_permc(self, splu_fun, rtol):
     def test_lu_refcount(self):
         # Test that we are keeping track of the reference count with splu.
         n = 30
-        a = random.random((n, n))
+        rng = np.random.default_rng(1342354)
+        a = rng.random((n, n))
         a[a < 0.95] = 0
         # Make a diagonal dominant, to make sure it is not singular
         a += 4*eye(n)
@@ -638,14 +644,15 @@ def test_lu_refcount(self):
 
     def test_bad_inputs(self):
         A = self.A.tocsc()
+        rng = np.random.default_rng(235634)
 
         assert_raises(ValueError, splu, A[:,:4])
         assert_raises(ValueError, spilu, A[:,:4])
 
         for lu in [splu(A), spilu(A)]:
-            b = random.rand(42)
-            B = random.rand(42, 3)
-            BB = random.rand(self.n, 3, 9)
+            b = rng.random(42)
+            B = rng.random((42, 3))
+            BB = rng.random((self.n, 3, 9))
             assert_raises(ValueError, lu.solve, b)
             assert_raises(ValueError, lu.solve, B)
             assert_raises(ValueError, lu.solve, BB)
@@ -877,14 +884,14 @@ def random_triangle_matrix(n, lower=True, format="csr", choice_of_A="real"):
                 A = A.tocsr(copy=False)
             return A
 
-        np.random.seed(1234)
+        rng = np.random.default_rng(1234)
         A = random_triangle_matrix(n, lower=lower)
         if choice_of_b == "floats":
-            b = np.random.rand(n, m)
+            b = rng.random((n, m))
         elif choice_of_b == "ints":
-            b = np.random.randint(-9, 9, (n, m))
+            b = rng.integers(-9, 9, (n, m))
         elif choice_of_b == "complexints":
-            b = np.random.randint(-9, 9, (n, m)) + np.random.randint(-9, 9, (n, m)) * 1j
+            b = rng.integers(-9, 9, (n, m)) + rng.integers(-9, 9, (n, m)) * 1j
         else:
             raise ValueError(
                 "choice_of_b must be 'floats', 'ints', or 'complexints'.")
diff --git a/scipy/sparse/linalg/_interface.py b/scipy/sparse/linalg/_interface.py
index 7e8a765e2b6c..af8226343fec 100644
--- a/scipy/sparse/linalg/_interface.py
+++ b/scipy/sparse/linalg/_interface.py
@@ -821,22 +821,22 @@ def _matmat(self, X):
 
     def _adjoint(self):
         if self.__adj is None:
-            self.__adj = _AdjointMatrixOperator(self)
+            self.__adj = _AdjointMatrixOperator(self.A)
         return self.__adj
 
+
 class _AdjointMatrixOperator(MatrixLinearOperator):
-    def __init__(self, adjoint):
-        self.A = adjoint.A.T.conj()
-        self.__adjoint = adjoint
-        self.args = (adjoint,)
-        self.shape = adjoint.shape[1], adjoint.shape[0]
+    def __init__(self, adjoint_array):
+        self.A = adjoint_array.T.conj()
+        self.args = (adjoint_array,)
+        self.shape = adjoint_array.shape[1], adjoint_array.shape[0]
 
     @property
     def dtype(self):
-        return self.__adjoint.dtype
+        return self.args[0].dtype
 
     def _adjoint(self):
-        return self.__adjoint
+        return MatrixLinearOperator(self.args[0])
 
 
 class IdentityOperator(LinearOperator):
diff --git a/scipy/sparse/linalg/_propack/meson.build b/scipy/sparse/linalg/_propack/meson.build
index d33cdc0e7646..b6b5dd94d419 100644
--- a/scipy/sparse/linalg/_propack/meson.build
+++ b/scipy/sparse/linalg/_propack/meson.build
@@ -94,7 +94,7 @@ foreach ele: elements
       fortran_ignore_warnings,
       _fflag_Wno_intrinsic_shadow,
       _fflag_Wno_uninitialized,
-      _fflag_fpp,
+      _fflag_preprocess,
     ],
     gnu_symbol_visibility: 'hidden',
   )
diff --git a/scipy/sparse/linalg/tests/test_interface.py b/scipy/sparse/linalg/tests/test_interface.py
index 5fc13bc49557..a28b28fad228 100644
--- a/scipy/sparse/linalg/tests/test_interface.py
+++ b/scipy/sparse/linalg/tests/test_interface.py
@@ -13,6 +13,7 @@
 
 import scipy.sparse.linalg._interface as interface
 from scipy.sparse._sputils import matrix
+from scipy._lib._gcutils import assert_deallocated, IS_PYPY
 
 
 class TestLinearOperator:
@@ -524,3 +525,13 @@ def test_sparse_matmat_exception():
         A @ np.identity(4)
     with assert_raises(ValueError):
         np.identity(4) @ A
+
+
+@pytest.mark.skipif(IS_PYPY, reason="Test not meaningful on PyPy")
+def test_MatrixLinearOperator_refcycle():
+    # gh-10634
+    # Test that MatrixLinearOperator can be automatically garbage collected
+    A = np.eye(2)
+    with assert_deallocated(interface.MatrixLinearOperator, A) as op:
+        op.adjoint()
+        del op
diff --git a/scipy/spatial/distance.py b/scipy/spatial/distance.py
index 9b64bbf5388a..8ed75b56d0d6 100644
--- a/scipy/spatial/distance.py
+++ b/scipy/spatial/distance.py
@@ -107,20 +107,19 @@
 
 import math
 import warnings
-import numpy as np
 import dataclasses
-
 from collections.abc import Callable
 from functools import partial
-from scipy._lib._util import _asarray_validated, _transition_to_rng
-from scipy._lib.deprecation import _deprecated
 
-from . import _distance_wrap
-from . import _hausdorff
-from ..linalg import norm
-from ..special import rel_entr
+import numpy as np
 
-from . import _distance_pybind
+from scipy._lib._array_api import _asarray
+from scipy._lib._util import _asarray_validated, _transition_to_rng
+from scipy._lib import array_api_extra as xpx
+from scipy._lib.deprecation import _deprecated
+from scipy.linalg import norm
+from scipy.special import rel_entr
+from . import _hausdorff, _distance_pybind, _distance_wrap
 
 
 def _copy_array_if_base_present(a):
@@ -2293,14 +2292,33 @@ def pdist(X, metric='euclidean', *, out=None, **kwargs):
     # between all pairs of vectors in X using the distance metric 'abc' but
     # with a more succinct, verifiable, but less efficient implementation.
 
+    X = _asarray(X)
+    if X.ndim != 2:
+        raise ValueError('A 2-dimensional array must be passed.')
+
+    n = X.shape[0]
+    return xpx.lazy_apply(_np_pdist, X, out,
+                          # lazy_apply doesn't support Array kwargs
+                          kwargs.pop('w', None),
+                          kwargs.pop('V', None),
+                          kwargs.pop('VI', None),
+                          # See src/distance_pybind.cpp::pdist
+                          shape=((n * (n - 1)) // 2, ), dtype=X.dtype, 
+                          as_numpy=True, metric=metric, **kwargs)
+
+
+def _np_pdist(X, out, w, V, VI, metric='euclidean', **kwargs):
+
     X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
                            check_finite=False)
+    m, n = X.shape
 
-    s = X.shape
-    if len(s) != 2:
-        raise ValueError('A 2-dimensional array must be passed.')
-
-    m, n = s
+    if w is not None:
+        kwargs["w"] = w
+    if V is not None:
+        kwargs["V"] = V
+    if VI is not None:
+        kwargs["VI"] = VI
 
     if callable(metric):
         mstr = getattr(metric, '__name__', 'UnknownCustomMetric')
@@ -2620,7 +2638,7 @@ def is_valid_y(y, warning=False, throw=False, name=None):
     throw : bool, optional
         Throws an exception if the variable passed is not a valid
         condensed distance matrix.
-    name : bool, optional
+    name : str, optional
         Used when referencing the offending variable in the
         warning or exception message.
 
@@ -2648,34 +2666,25 @@ def is_valid_y(y, warning=False, throw=False, name=None):
     False
 
     """
-    y = np.asarray(y, order='c')
-    valid = True
+    y = _asarray(y)
+    name_str = f"'{name}' " if name else ""
     try:
         if len(y.shape) != 1:
-            if name:
-                raise ValueError(f"Condensed distance matrix '{name}' must "
-                                 "have shape=1 (i.e. be one-dimensional).")
-            else:
-                raise ValueError('Condensed distance matrix must have shape=1 '
-                                 '(i.e. be one-dimensional).')
+            raise ValueError(f"Condensed distance matrix {name_str}must "
+                             "have shape=1 (i.e. be one-dimensional).")
         n = y.shape[0]
         d = int(np.ceil(np.sqrt(n * 2)))
         if (d * (d - 1) / 2) != n:
-            if name:
-                raise ValueError(f"Length n of condensed distance matrix '{name}' "
-                                 "must be a binomial coefficient, i.e."
-                                 "there must be a k such that (k \\choose 2)=n)!")
-            else:
-                raise ValueError('Length n of condensed distance matrix must '
-                                 'be a binomial coefficient, i.e. there must '
-                                 'be a k such that (k \\choose 2)=n)!')
+            raise ValueError(f"Length n of condensed distance matrix {name_str}"
+                             "must be a binomial coefficient, i.e. "
+                             "there must be a k such that (k \\choose 2)=n)!")
     except Exception as e:
         if throw:
             raise
         if warning:
             warnings.warn(str(e), stacklevel=2)
-        valid = False
-    return valid
+        return False
+    return True
 
 
 def num_obs_dm(d):
@@ -2733,7 +2742,7 @@ def num_obs_y(Y):
     >>> num_obs_y(Y)
     4
     """
-    Y = np.asarray(Y, order='c')
+    Y = _asarray(Y)
     is_valid_y(Y, throw=True, name='Y')
     k = Y.shape[0]
     if k == 0:
diff --git a/scipy/spatial/tests/test_distance.py b/scipy/spatial/tests/test_distance.py
index 774c773ff8bc..472f99394cd3 100644
--- a/scipy/spatial/tests/test_distance.py
+++ b/scipy/spatial/tests/test_distance.py
@@ -63,6 +63,7 @@
                                     russellrao, seuclidean, sokalmichener,  # noqa: F401
                                     sokalsneath, sqeuclidean, yule)
 from scipy._lib._util import np_long, np_ulong
+from scipy.conftest import skip_xp_invalid_arg
 
 
 @pytest.fixture(params=_METRICS_NAMES, scope="session")
@@ -1370,6 +1371,7 @@ def test_pdist_canberra_ticket_711(self):
         right_y = 0.01492537
         assert_allclose(pdist_y, right_y, atol=eps, verbose=verbose > 2)
 
+    @skip_xp_invalid_arg
     def test_pdist_custom_notdouble(self):
         # tests that when using a custom metric the data type is not altered
         class myclass:
@@ -2106,7 +2108,7 @@ def test_Xdist_deprecated_args(metric):
         pdist(X1, metric, 2.)
 
     for arg in ["p", "V", "VI"]:
-        kwargs = {arg: "foo"}
+        kwargs = {arg: np.asarray(1.)}
 
         if ((arg == "V" and metric == "seuclidean")
                 or (arg == "VI" and metric == "mahalanobis")
diff --git a/scipy/spatial/transform/_rotation.pyx b/scipy/spatial/transform/_rotation.pyx
index e0f086775a7e..53675734f271 100644
--- a/scipy/spatial/transform/_rotation.pyx
+++ b/scipy/spatial/transform/_rotation.pyx
@@ -2580,17 +2580,9 @@ cdef class Rotation:
             raise ValueError("Expected input of shape (3,) or (P, 3), "
                              "got {}.".format(vectors.shape))
 
-        single_vector = False
-        if vectors.shape == (3,):
-            single_vector = True
-            vectors = vectors[None, :]
-
-        matrix = self.as_matrix()
-        if self._single:
-            matrix = matrix[None, :, :]
-
-        n_vectors = vectors.shape[0]
-        n_rotations = len(self._quat)
+        cdef bint single_vector = vectors.ndim == 1
+        cdef Py_ssize_t n_vectors = 1 if single_vector else len(vectors)
+        cdef Py_ssize_t n_rotations = 1 if self.single else len(self)
 
         if n_vectors != 1 and n_rotations != 1 and n_vectors != n_rotations:
             raise ValueError("Expected equal numbers of rotations and vectors "
@@ -2598,15 +2590,25 @@ cdef class Rotation:
                              "{} rotations and {} vectors.".format(
                                 n_rotations, n_vectors))
 
+        cdef np.ndarray matrix = self.as_matrix()
+
         if inverse:
-            result = np.einsum('ikj,ik->ij', matrix, vectors)
-        else:
-            result = np.einsum('ijk,ik->ij', matrix, vectors)
+            matrix = np.swapaxes(matrix, -1, -2)
 
-        if self._single and single_vector:
-            return result[0]
-        else:
-            return result
+        if single_vector:
+            return np.matmul(matrix, vectors)
+
+        if self.single:
+            matrix = matrix[None, :, :]
+
+        if n_rotations == 1:
+            # Single rotation/many vectors, use matmul for speed: The axes argument
+            # is such that the input arguments don't need to be transposed and the
+            # output argument is contiguous in memory.
+            return np.matmul(matrix, vectors, axes=[(-2, -1), (-1, -2), (-1, -2)])[0]
+
+        # for stacks of matrices einsum is faster
+        return np.einsum('ijk,ik->ij', matrix, vectors)
 
     @cython.embedsignature(True)
     def __mul__(Rotation self, Rotation other):
diff --git a/scipy/spatial/transform/tests/test_rotation.py b/scipy/spatial/transform/tests/test_rotation.py
index 1c06546e382f..2c91018f7a2b 100644
--- a/scipy/spatial/transform/tests/test_rotation.py
+++ b/scipy/spatial/transform/tests/test_rotation.py
@@ -5,7 +5,7 @@
 from numpy.testing import assert_allclose
 from scipy.spatial.transform import Rotation, Slerp
 from scipy.stats import special_ortho_group
-from itertools import permutations
+from itertools import permutations, product
 
 import pickle
 import copy
@@ -1206,6 +1206,23 @@ def test_apply_multiple_rotations_multiple_points():
     v_inverse = np.array([[2, -1, 3], [4, 6, -5]])
     assert_allclose(r.apply(v, inverse=True), v_inverse)
 
+def test_apply_shapes():
+    vector0 = np.array([1.0, 2.0, 3.0])
+    vector1 = np.array([vector0])
+    vector2 = np.array([vector0, vector0])
+    matrix0 = np.identity(3)
+    matrix1 = np.array([matrix0])
+    matrix2 = np.array([matrix0, matrix0])
+
+    for m, v in product([matrix0, matrix1, matrix2], [vector0, vector1, vector2]):
+        r = Rotation.from_matrix(m)
+        shape = v.shape
+        if not r.single and (v.shape == (3,) or v.shape == (1, 3)):
+            shape = (len(r), 3)
+        x = r.apply(v)
+        assert x.shape == shape
+        x = r.apply(v, inverse=True)
+        assert x.shape == shape
 
 def test_getitem():
     mat = np.empty((2, 3, 3))
diff --git a/scipy/special/_logsumexp.py b/scipy/special/_logsumexp.py
index 4bdb31251584..ac34e9b5c3fd 100644
--- a/scipy/special/_logsumexp.py
+++ b/scipy/special/_logsumexp.py
@@ -1,8 +1,9 @@
 import numpy as np
 from scipy._lib._array_api import (
     array_namespace,
+    xp_device,
     xp_size,
-    xp_broadcast_promote,
+    xp_promote,
     xp_float_to_complex,
 )
 from scipy._lib import array_api_extra as xpx
@@ -104,7 +105,7 @@ def logsumexp(a, axis=None, b=None, keepdims=False, return_sign=False):
 
     """
     xp = array_namespace(a, b)
-    a, b = xp_broadcast_promote(a, b, ensure_writeable=True, force_floating=True, xp=xp)
+    a, b = xp_promote(a, b, broadcast=True, force_floating=True, xp=xp)
     a = xpx.atleast_nd(a, ndim=1, xp=xp)
     b = xpx.atleast_nd(b, ndim=1, xp=xp) if b is not None else b
     axis = tuple(range(a.ndim)) if axis is None else axis
@@ -115,10 +116,10 @@ def logsumexp(a, axis=None, b=None, keepdims=False, return_sign=False):
             # delegate edge case handling to the behavior of `xp.log` and `xp.exp`,
             # which should follow the C99 standard for complex values.
             b_exp_a = xp.exp(a) if b is None else b * xp.exp(a)
-            sum = xp.sum(b_exp_a, axis=axis, keepdims=True)
-            sgn_inf = _sign(sum, xp) if return_sign else None
-            sum = xp.abs(sum) if return_sign else sum
-            out_inf = xp.log(sum)
+            sum_ = xp.sum(b_exp_a, axis=axis, keepdims=True)
+            sgn_inf = _sign(sum_, xp=xp) if return_sign else None
+            sum_ = xp.abs(sum_) if return_sign else sum_
+            out_inf = xp.log(sum_)
 
         with np.errstate(divide='ignore', invalid='ignore'):  # log of zero is OK
             out, sgn = _logsumexp(a, b, axis=axis, return_sign=return_sign, xp=xp)
@@ -132,17 +133,17 @@ def logsumexp(a, axis=None, b=None, keepdims=False, return_sign=False):
     else:
         shape = np.asarray(a.shape)  # NumPy is convenient for shape manipulation
         shape[axis] = 1
-        out = xp.full(tuple(shape), -xp.inf, dtype=a.dtype)
+        out = xp.full(tuple(shape), -xp.inf, dtype=a.dtype, device=xp_device(a))
         sgn = xp.sign(out)
 
     if xp.isdtype(out.dtype, 'complex floating'):
         if return_sign:
             real = xp.real(sgn)
-            imag = xp_float_to_complex(_wrap_radians(xp.imag(sgn), xp))
+            imag = xp_float_to_complex(_wrap_radians(xp.imag(sgn), xp=xp), xp=xp)
             sgn = real + imag*1j
         else:
             real = xp.real(out)
-            imag = xp_float_to_complex(_wrap_radians(xp.imag(out), xp))
+            imag = xp_float_to_complex(_wrap_radians(xp.imag(out), xp=xp), xp=xp)
             out = real + imag*1j
 
     # Deal with shape details - reducing dimensions and convert 0-D to scalar for NumPy
@@ -154,8 +155,7 @@ def logsumexp(a, axis=None, b=None, keepdims=False, return_sign=False):
     return (out, sgn) if return_sign else out
 
 
-def _wrap_radians(x, xp=None):
-    xp = array_namespace(x) if xp is None else xp
+def _wrap_radians(x, *, xp):
     # Wrap radians to (-pi, pi] interval
     wrapped = -((-x + xp.pi) % (2 * xp.pi) - xp.pi)
     # preserve relative precision
@@ -163,7 +163,7 @@ def _wrap_radians(x, xp=None):
     return xp.where(no_wrap, x, wrapped)
 
 
-def _elements_and_indices_with_max_real(a, axis=-1, xp=None):
+def _elements_and_indices_with_max_real(a, *, axis=-1, xp):
     # This is an array-API compatible `max` function that works something
     # like `np.max` for complex input. The important part is that it finds
     # the element with maximum real part. When there are multiple complex values
@@ -172,53 +172,51 @@ def _elements_and_indices_with_max_real(a, axis=-1, xp=None):
     # `take_along_axis`, and even if it did, we would have problems with axis tuples.
     # Feel free to rewrite! It's ugly, but it's not the purpose of the PR, and
     # it gets the job done.
-    xp = array_namespace(a) if xp is None else xp
 
     if xp.isdtype(a.dtype, "complex floating"):
         # select all elements with max real part.
         real_a = xp.real(a)
-        max = xp.max(real_a, axis=axis, keepdims=True)
-        mask = real_a == max
+        max_ = xp.max(real_a, axis=axis, keepdims=True)
+        mask = real_a == max_
 
         # Of those, choose one arbitrarily. This is a reasonably
         # simple, array-API compatible way of doing so that doesn't
         # have a problem with `axis` being a tuple or None.
-        i = xp.reshape(xp.arange(xp_size(a)), a.shape)
+        i = xp.reshape(xp.arange(xp_size(a), device=xp_device(a)), a.shape)
         i = xpx.at(i, ~mask).set(-1)
         max_i = xp.max(i, axis=axis, keepdims=True)
         mask = i == max_i
         a = xp.where(mask, a, 0.)
-        max = xp.sum(a, axis=axis, dtype=a.dtype, keepdims=True)
+        max_ = xp.sum(a, axis=axis, dtype=a.dtype, keepdims=True)
     else:
-        max = xp.max(a, axis=axis, keepdims=True)
-        mask = a == max
+        max_ = xp.max(a, axis=axis, keepdims=True)
+        mask = a == max_
 
-    return xp.asarray(max), xp.asarray(mask)
+    return max_, mask
 
 
-def _sign(x, xp):
+def _sign(x, *, xp):
     return x / xp.where(x == 0, 1., xp.abs(x))
 
 
-def _logsumexp(a, b, axis, return_sign, xp):
-
+def _logsumexp(a, b, *, axis, return_sign, xp):
     # This has been around for about a decade, so let's consider it a feature:
     # Even if element of `a` is infinite or NaN, it adds nothing to the sum if
     # the corresponding weight is zero.
     if b is not None:
-        a = xpx.at(a, b == 0).set(-xp.inf)
+        a = xpx.at(a, b == 0).set(-xp.inf, copy=True)
 
     # Find element with maximum real part, since this is what affects the magnitude
     # of the exponential. Possible enhancement: include log of `b` magnitude in `a`.
     a_max, i_max = _elements_and_indices_with_max_real(a, axis=axis, xp=xp)
 
     # for precision, these terms are separated out of the main sum.
-    a = xpx.at(a, i_max).set(-xp.inf)
+    a = xpx.at(a, i_max).set(-xp.inf, copy=True if b is None else None)
     i_max_dt = xp.astype(i_max, a.dtype)
     # This is an inefficient way of getting `m` because it is the sum of a sparse
     # array; however, this is the simplest way I can think of to get the right shape.
-    m = (xp.sum(i_max_dt, axis=axis, keepdims=True, dtype=a.dtype) if b is None
-         else xp.sum(b * i_max_dt, axis=axis, keepdims=True, dtype=a.dtype))
+    b_i_max = i_max_dt if b is None else b * i_max_dt
+    m = xp.sum(b_i_max, axis=axis, keepdims=True, dtype=a.dtype)
 
     # Shift, exponentiate, scale, and sum
     exp = b * xp.exp(a - a_max) if b is not None else xp.exp(a - a_max)
diff --git a/scipy/special/_support_alternative_backends.py b/scipy/special/_support_alternative_backends.py
index aa0d8fa011ed..65d98ac92059 100644
--- a/scipy/special/_support_alternative_backends.py
+++ b/scipy/special/_support_alternative_backends.py
@@ -1,11 +1,12 @@
-import sys
 import functools
 import operator
 
 import numpy as np
 from scipy._lib._array_api import (
-    array_namespace, scipy_namespace_for, is_numpy, is_marray, SCIPY_ARRAY_API
+    array_namespace, scipy_namespace_for, is_numpy, is_dask, is_marray,
+    xp_promote, SCIPY_ARRAY_API
 )
+import scipy._lib.array_api_extra as xpx
 from . import _ufuncs
 # These don't really need to be imported, but otherwise IDEs might not realize
 # that these are defined in this file / report an error in __init__.py
@@ -18,60 +19,68 @@
 array_api_compat_prefix = "scipy._lib.array_api_compat"
 
 
-def get_array_special_func(f_name, xp, n_array_args):
-    spx = scipy_namespace_for(xp)
-    f = None
+def get_array_special_func(f_name, xp):
     if is_numpy(xp):
-        f = getattr(_ufuncs, f_name, None)
-    elif spx is not None:
-        f = getattr(spx.special, f_name, None)
+        return getattr(_ufuncs, f_name)
 
-    if f is not None:
-        return f
+    spx = scipy_namespace_for(xp)
+    if spx is not None:
+        f = getattr(spx.special, f_name, None)
+        if f is not None:
+            return f
 
     # if generic array-API implementation is available, use that;
     # otherwise, fall back to NumPy/SciPy
     if f_name in _generic_implementations:
-        _f = _generic_implementations[f_name](xp=xp, spx=spx)
-        if _f is not None:
-            return _f
-
-    _f = getattr(_ufuncs, f_name, None)
-    def __f(*args, _f=_f, _xp=xp, **kwargs):
-        array_args = args[:n_array_args]
-        other_args = args[n_array_args:]
-        if is_marray(_xp):
-            data_args = [np.asarray(arg.data) for arg in array_args]
-            out = _f(*data_args, *other_args, **kwargs)
-            mask = functools.reduce(operator.or_, (arg.mask for arg in array_args))
-            return _xp.asarray(out, mask=mask)
-        else:
-            array_args = [np.asarray(arg) for arg in array_args]
-            out = _f(*array_args, *other_args, **kwargs)
-            return _xp.asarray(out)
-
-    return __f
+        f = _generic_implementations[f_name](xp=xp, spx=spx)
+        if f is not None:
+            return f
+
+    def f(*args, **kwargs):
+        if is_marray(xp):
+            _f = globals()[f_name]  # Allow nested wrapping
+            data_args = [arg.data for arg in args]
+            out = _f(*data_args, **kwargs)
+            mask = functools.reduce(operator.or_, (arg.mask for arg in args))
+            return xp.asarray(out, mask=mask)
+
+        elif is_dask(xp):
+            # IMPORTANT: map_blocks works only because all ufuncs in this module
+            # are elementwise. It would be a grave mistake to apply this to gufuncs
+            # or any other function with reductions, as they would change their
+            # output depending on chunking!
+
+            _f = globals()[f_name]  # Allow nested wrapping
+            # Hide dtype kwarg from map_blocks
+            return xp.map_blocks(functools.partial(_f, **kwargs), *args)
 
+        else:
+            _f = getattr(_ufuncs, f_name)
+            args = [np.asarray(arg) for arg in args]
+            out = _f(*args, **kwargs)
+            return xp.asarray(out)
 
-def _get_shape_dtype(*args, xp):
-    args = xp.broadcast_arrays(*args)
-    shape = args[0].shape
-    dtype = xp.result_type(*args)
-    if xp.isdtype(dtype, 'integral'):
-        dtype = xp.float64
-        args = [xp.asarray(arg, dtype=dtype) for arg in args]
-    return args, shape, dtype
+    return f
 
 
 def _rel_entr(xp, spx):
     def __rel_entr(x, y, *, xp=xp):
-        args, shape, dtype = _get_shape_dtype(x, y, xp=xp)
-        x, y = args
-        res = xp.full(x.shape, xp.inf, dtype=dtype)
-        res[(x == 0) & (y >= 0)] = xp.asarray(0, dtype=dtype)
-        i = (x > 0) & (y > 0)
-        res[i] = x[i] * (xp.log(x[i]) - xp.log(y[i]))
+        # https://github.com/data-apis/array-api-extra/issues/160
+        mxp = array_namespace(x._meta, y._meta) if is_dask(xp) else xp
+        x, y = xp_promote(x, y, broadcast=True, force_floating=True, xp=xp)
+        xy_pos = (x > 0) & (y > 0)
+        xy_inf = xp.isinf(x) & xp.isinf(y)
+        res = xpx.apply_where(
+            xy_pos & ~xy_inf,
+            (x, y),
+            # Note: for very large x, this can overflow.
+            lambda x, y: x * (mxp.log(x) - mxp.log(y)),
+            fill_value=xp.inf
+        )
+        res = xpx.at(res)[(x == 0) & (y >= 0)].set(0)
+        res = xpx.at(res)[xp.isnan(x) | xp.isnan(y) | (xy_pos & xy_inf)].set(xp.nan)
         return res
+
     return __rel_entr
 
 
@@ -83,14 +92,19 @@ def __xlogy(x, y, *, xp=xp):
     return __xlogy
 
 
+def _get_native_func(xp, spx, f_name):
+    f = getattr(spx.special, f_name, None) if spx else None
+    if f is None and hasattr(xp, 'special'):
+        f = getattr(xp.special, f_name, None)
+    return f
+
+
 def _chdtr(xp, spx):
     # The difference between this and just using `gammainc`
     # defined by `get_array_special_func` is that if `gammainc`
     # isn't found, we don't want to use the SciPy version; we'll
     # return None here and use the SciPy version of `chdtr`.
-    gammainc = getattr(spx.special, 'gammainc', None) if spx else None  # noqa: F811
-    if gammainc is None and hasattr(xp, 'special'):
-        gammainc = getattr(xp.special, 'gammainc', None)
+    gammainc = _get_native_func(xp, spx, 'gammainc')  # noqa: F811
     if gammainc is None:
         return None
 
@@ -109,9 +123,7 @@ def _chdtrc(xp, spx):
     # defined by `get_array_special_func` is that if `gammaincc`
     # isn't found, we don't want to use the SciPy version; we'll
     # return None here and use the SciPy version of `chdtrc`.
-    gammaincc = getattr(spx.special, 'gammaincc', None) if spx else None  # noqa: F811
-    if gammaincc is None and hasattr(xp, 'special'):
-        gammaincc = getattr(xp.special, 'gammaincc', None)
+    gammaincc = _get_native_func(xp, spx, 'gammaincc')  # noqa: F811
     if gammaincc is None:
         return None
 
@@ -124,9 +136,7 @@ def __chdtrc(v, x):
 
 
 def _betaincc(xp, spx):
-    betainc = getattr(spx.special, 'betainc', None) if spx else None  # noqa: F811
-    if betainc is None and hasattr(xp, 'special'):
-        betainc = getattr(xp.special, 'betainc', None)
+    betainc = _get_native_func(xp, spx, 'betainc')  # noqa: F811
     if betainc is None:
         return None
 
@@ -137,9 +147,7 @@ def __betaincc(a, b, x):
 
 
 def _stdtr(xp, spx):
-    betainc = getattr(spx.special, 'betainc', None) if spx else None  # noqa: F811
-    if betainc is None and hasattr(xp, 'special'):
-        betainc = getattr(xp.special, 'betainc', None)
+    betainc = _get_native_func(xp, spx, 'betainc')  # noqa: F811
     if betainc is None:
         return None
 
@@ -152,10 +160,7 @@ def __stdtr(df, t):
 
 
 def _stdtrit(xp, spx):
-    betainc = getattr(spx.special, 'betainc', None) if spx else None  # noqa: F811
-    if betainc is None and hasattr(xp, 'special'):
-        betainc = getattr(xp.special, 'betainc', None)
-
+    betainc = _get_native_func(xp, spx, 'betainc')  # noqa: F811
     # If betainc is not defined, the root-finding would be done with `xp`
     # despite `stdtr` being evaluated with SciPy/NumPy `stdtr`. Save the
     # conversions: in this case, just evaluate `stdtrit` with SciPy/NumPy.
@@ -185,18 +190,19 @@ def fun(t, df, p):  return stdtr(df, t) - p
 
 # functools.wraps doesn't work because:
 # 'numpy.ufunc' object has no attribute '__module__'
-def support_alternative_backends(f_name, n_array_args):
+def support_alternative_backends(f_name):
     func = getattr(_ufuncs, f_name)
 
     @functools.wraps(func)
     def wrapped(*args, **kwargs):
-        xp = array_namespace(*args[:n_array_args])
-        f = get_array_special_func(f_name, xp, n_array_args)
+        xp = array_namespace(*args)
+        f = get_array_special_func(f_name, xp)
         return f(*args, **kwargs)
 
     return wrapped
 
 
+# function name: number of args (for testing purposes)
 array_special_func_map = {
     'log_ndtr': 1,
     'ndtr': 1,
@@ -223,10 +229,11 @@ def wrapped(*args, **kwargs):
     'stdtrit': 2,
 }
 
-for f_name, n_array_args in array_special_func_map.items():
-    f = (support_alternative_backends(f_name, n_array_args)
-         if SCIPY_ARRAY_API
-         else getattr(_ufuncs, f_name))
-    sys.modules[__name__].__dict__[f_name] = f
+globals().update(
+    {f_name: support_alternative_backends(f_name)
+     if SCIPY_ARRAY_API
+     else getattr(_ufuncs, f_name)
+     for f_name in array_special_func_map}
+)
 
 __all__ = list(array_special_func_map)
diff --git a/scipy/special/meson.build b/scipy/special/meson.build
index 583c29c1f300..e18876588aa6 100644
--- a/scipy/special/meson.build
+++ b/scipy/special/meson.build
@@ -53,7 +53,7 @@ endif
 
 py3.extension_module('_special_ufuncs',
   ['_special_ufuncs.cpp', '_special_ufuncs_docs.cpp', 'sf_error.cc'],
-  include_directories: ['../_lib', '../_build_utils/src'],
+  include_directories: ['..', '../_lib', '../_build_utils/src'],
   dependencies: [np_dep],
   link_args: version_link_args,
   cpp_args: ufuncs_cpp_args,
@@ -63,7 +63,7 @@ py3.extension_module('_special_ufuncs',
 
 py3.extension_module('_gufuncs',
   ['_gufuncs.cpp', '_gufuncs_docs.cpp', 'sf_error.cc'],
-  include_directories: ['../_lib', '../_build_utils/src'],
+  include_directories: ['..', '../_lib', '../_build_utils/src'],
   dependencies: [np_dep],
   link_args: version_link_args,
   cpp_args: ufuncs_cpp_args,
@@ -121,7 +121,7 @@ py3.extension_module('_ufuncs',
   ],
   c_args: [cython_c_args, Wno_maybe_uninitialized],
   cpp_args: ['-DSP_SPECFUN_ERROR'],
-  include_directories: ['../_lib', '../_build_utils/src'],
+  include_directories: ['..', '../_lib', '../_build_utils/src'],
   dependencies: [
     lapack_dep,
     npymath_lib,
@@ -162,7 +162,7 @@ py3.extension_module('_ufuncs_cxx',
     uf_cython_gen_cpp.process(cython_special[2]),  # _ufuncs_cxx.pyx
     ],
   cpp_args: ufuncs_cxx_cpp_args,
-  include_directories: ['../_lib/boost_math/include', '../_lib',
+  include_directories: ['..', '../_lib/boost_math/include', '../_lib',
                         '../_build_utils/src'],
   link_args: version_link_args,
   dependencies: [np_dep, ellint_dep],
@@ -174,7 +174,7 @@ py3.extension_module('_ellip_harm_2',
   [uf_cython_gen.process('_ellip_harm_2.pyx'), 'sf_error.cc'],
   c_args: [cython_c_args],
   cpp_args: ['-DSP_SPECFUN_ERROR'],
-  include_directories: ['../_lib', '../_build_utils/src'],
+  include_directories: ['..', '../_lib', '../_build_utils/src'],
   link_args: version_link_args,
   dependencies: [lapack_dep, np_dep],
   install: true,
@@ -191,7 +191,7 @@ py3.extension_module('cython_special',
   ],
   c_args: [cython_c_args, Wno_maybe_uninitialized],
   cpp_args: ['-DSP_SPECFUN_ERROR'],
-  include_directories: ['../_lib', '../_build_utils/src'],
+  include_directories: ['..', '../_lib', '../_build_utils/src'],
   link_args: version_link_args,
   dependencies: [np_dep, npymath_lib, lapack_dep],
   link_with: cdflib_lib,
diff --git a/scipy/special/sf_error.cc b/scipy/special/sf_error.cc
index 89e24fe522fc..180f09926772 100644
--- a/scipy/special/sf_error.cc
+++ b/scipy/special/sf_error.cc
@@ -4,10 +4,11 @@
 #include <stdarg.h>
 #include <stdlib.h>
 
+#include "scipy_config.h"
 #include "sf_error.h"
 
 /* If this isn't volatile clang tries to optimize it away */
-static volatile sf_action_t sf_error_actions[] = {
+static volatile SCIPY_TLS sf_action_t sf_error_actions[] = {
     SF_ERROR_IGNORE, /* SF_ERROR_OK */
     SF_ERROR_IGNORE, /* SF_ERROR_SINGULAR */
     SF_ERROR_IGNORE, /* SF_ERROR_UNDERFLOW */
@@ -76,9 +77,7 @@ void sf_error_v(const char *func_name, sf_error_t code, const char *fmt, va_list
         PyOS_snprintf(msg, 2048, "scipy.special/%s: %s", func_name, sf_error_messages[(int) code]);
     }
 
-#ifdef WITH_THREAD
     save = PyGILState_Ensure();
-#endif
 
     if (PyErr_Occurred()) {
         goto skip_warn;
@@ -119,11 +118,7 @@ void sf_error_v(const char *func_name, sf_error_t code, const char *fmt, va_list
     }
 
 skip_warn:
-#ifdef WITH_THREAD
     PyGILState_Release(save);
-#else
-    ;
-#endif
 }
 
 void sf_error(const char *func_name, sf_error_t code, const char *fmt, ...) {
diff --git a/scipy/special/tests/test_logsumexp.py b/scipy/special/tests/test_logsumexp.py
index dadb3aab0ddb..31d9e72779d9 100644
--- a/scipy/special/tests/test_logsumexp.py
+++ b/scipy/special/tests/test_logsumexp.py
@@ -4,19 +4,16 @@
 
 import numpy as np
 
-from scipy._lib._array_api import is_array_api_strict, xp_default_dtype
+from scipy._lib._array_api import is_array_api_strict, xp_default_dtype, xp_device
 from scipy._lib._array_api_no_0d import (xp_assert_equal, xp_assert_close,
                                          xp_assert_less)
 
 from scipy.special import log_softmax, logsumexp, softmax
 from scipy.special._logsumexp import _wrap_radians
-from scipy.stats.tests.test_stats import skip_xp_backends
 
 from scipy._lib.array_api_extra.testing import lazy_xp_function
 
 
-
-
 dtypes = ['float32', 'float64', 'int32', 'int64', 'complex64', 'complex128']
 integral_dtypes = ['int32', 'int64']
 
@@ -31,7 +28,7 @@ def test_wrap_radians(xp):
                     0, 1e-300, 1, math.pi, math.pi+1])
     ref = xp.asarray([math.pi-1, math.pi, -1, -1e-300,
                     0, 1e-300, 1, math.pi, -math.pi+1])
-    res = _wrap_radians(x, xp)
+    res = _wrap_radians(x, xp=xp)
     xp_assert_close(res, ref, atol=0)
 
 
@@ -184,7 +181,6 @@ def test_array_like(self):
         desired = np.asarray(1000.0 + math.log(2.0))
         xp_assert_close(logsumexp(a), desired)
 
-    @skip_xp_backends('array_api_strict', reason='data-apis/array-api-strict#131')
     @pytest.mark.parametrize('dtype', dtypes)
     def test_dtypes_a(self, dtype, xp):
         dtype = getattr(xp, dtype)
@@ -194,7 +190,6 @@ def test_dtypes_a(self, dtype, xp):
         desired = xp.asarray(1000.0 + math.log(2.0), dtype=desired_dtype)
         xp_assert_close(logsumexp(a), desired)
 
-    @skip_xp_backends('array_api_strict', reason='data-apis/array-api-strict#131')
     @pytest.mark.parametrize('dtype_a', dtypes)
     @pytest.mark.parametrize('dtype_b', dtypes)
     def test_dtypes_ab(self, dtype_a, dtype_b, xp):
@@ -225,7 +220,6 @@ def test_gh18295(self, xp):
         ref = xp.logaddexp(a[0], a[1])
         xp_assert_close(res, ref)
 
-    @skip_xp_backends('array_api_strict', reason='data-apis/array-api-strict#131')
     @pytest.mark.filterwarnings(
         "ignore:The `numpy.copyto` function is not implemented:FutureWarning:dask"
     )
@@ -298,6 +292,23 @@ def test_gh22601_infinite_elements(self, x, y, xp):
         ref = xp.log(xp.sum(xp.exp(xp.asarray([x, y]))))
         xp_assert_equal(res, ref)
 
+    def test_no_writeback(self, xp):
+        """Test that logsumexp doesn't accidentally write back to its parameters."""
+        a = xp.asarray([5., 4.])
+        b = xp.asarray([3., 2.])
+        logsumexp(a)
+        logsumexp(a, b=b)
+        xp_assert_equal(a, xp.asarray([5., 4.]))
+        xp_assert_equal(b, xp.asarray([3., 2.]))
+
+    @pytest.mark.parametrize("x_raw", [1.0, 1.0j, []])
+    def test_device(self, x_raw, xp, devices):
+        """Test input device propagation to output."""
+        for d in devices:
+            x = xp.asarray(x_raw, device=d)
+            assert xp_device(logsumexp(x)) == xp_device(x)
+            assert xp_device(logsumexp(x, b=x)) == xp_device(x)
+
 
 class TestSoftmax:
     def test_softmax_fixtures(self, xp):
diff --git a/scipy/special/tests/test_sf_error.py b/scipy/special/tests/test_sf_error.py
index 2dfe8287ee4f..d86830415fbc 100644
--- a/scipy/special/tests/test_sf_error.py
+++ b/scipy/special/tests/test_sf_error.py
@@ -2,7 +2,7 @@
 import warnings
 
 import numpy as np
-from numpy.testing import assert_, assert_equal, IS_PYPY
+from numpy.testing import assert_, assert_equal, HAS_REFCOUNT
 import pytest
 from pytest import raises as assert_raises
 
@@ -73,7 +73,8 @@ def test_seterr():
         sc.seterr(**entry_err)
 
 
-@pytest.mark.skipif(IS_PYPY, reason="Test not meaningful on PyPy")
+@pytest.mark.thread_unsafe
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
 def test_sf_error_special_refcount():
     # Regression test for gh-16233.
     # Check that the reference count of scipy.special is not increased
diff --git a/scipy/special/tests/test_sph_harm.py b/scipy/special/tests/test_sph_harm.py
index 310bda00b4d8..d4fa8791ac57 100644
--- a/scipy/special/tests/test_sph_harm.py
+++ b/scipy/special/tests/test_sph_harm.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from numpy.testing import assert_allclose, suppress_warnings
+from numpy.testing import assert_allclose
 import scipy.special as sc
 
 class TestSphHarm:
@@ -47,6 +47,7 @@ def test_all(self, n_max, m_max):
 
         np.testing.assert_allclose(y_actual, y_desired, rtol=1e-05)
 
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_first_harmonics():
     # Test against explicit representations of the first four
     # spherical harmonics which use `theta` as the azimuthal angle,
@@ -78,9 +79,7 @@ def Y11(theta, phi):
     theta, phi = np.meshgrid(theta, phi)
 
     for harm, m, n in zip(harms, m, n):
-        with suppress_warnings() as sup:
-            sup.filter(category=DeprecationWarning)
-            assert_allclose(sc.sph_harm(m, n, theta, phi),
-                            harm(theta, phi),
-                            rtol=1e-15, atol=1e-15,
-                            err_msg=f"Y^{m}_{n} incorrect")
+        assert_allclose(sc.sph_harm(m, n, theta, phi),
+                        harm(theta, phi),
+                        rtol=1e-15, atol=1e-15,
+                        err_msg=f"Y^{m}_{n} incorrect")
diff --git a/scipy/special/tests/test_support_alternative_backends.py b/scipy/special/tests/test_support_alternative_backends.py
index bf19acf5a542..8f24a881344f 100644
--- a/scipy/special/tests/test_support_alternative_backends.py
+++ b/scipy/special/tests/test_support_alternative_backends.py
@@ -1,27 +1,41 @@
+from types import ModuleType
+
 import pytest
 
+from scipy import special
 from scipy.special._support_alternative_backends import (get_array_special_func,
                                                          array_special_func_map)
-from scipy import special
 from scipy._lib._array_api_no_0d import xp_assert_close
 from scipy._lib._array_api import (is_cupy, is_dask, is_jax, is_torch,
-                                   is_array_api_strict, SCIPY_DEVICE)
+                                   SCIPY_ARRAY_API, SCIPY_DEVICE)
 from scipy._lib.array_api_compat import numpy as np
+from scipy._lib.array_api_extra.testing import lazy_xp_function
+
+
+special_wrapped = ModuleType("special_wrapped")
+lazy_xp_modules = [special_wrapped]
+for f_name in array_special_func_map:
+    f = getattr(special, f_name)
+    setattr(special_wrapped, f_name, f)
+    lazy_xp_function(f)
 
 
+@pytest.mark.skipif(not SCIPY_ARRAY_API, reason="Alternative backends must be enabled.")
 def test_dispatch_to_unrecognized_library():
     xp = pytest.importorskip("array_api_strict")
-    f = get_array_special_func('ndtr', xp=xp, n_array_args=1)
+    f = get_array_special_func('ndtr', xp=xp)
     x = [1, 2, 3]
     res = f(xp.asarray(x))
     ref = xp.asarray(special.ndtr(np.asarray(x)))
     xp_assert_close(res, ref)
 
 
+@pytest.mark.skipif(not SCIPY_ARRAY_API,
+                    reason="xp_promote won't accept non-numpy objects")
 @pytest.mark.parametrize('dtype', ['float32', 'float64', 'int64'])
 def test_rel_entr_generic(dtype):
     xp = pytest.importorskip("array_api_strict")
-    f = get_array_special_func('rel_entr', xp=xp, n_array_args=2)
+    f = get_array_special_func('rel_entr', xp=xp)
     dtype_np = getattr(np, dtype)
     dtype_xp = getattr(xp, dtype)
     x = [-1, 0, 0, 1]
@@ -54,15 +68,12 @@ def test_support_alternative_backends(xp, f_name, n_args, dtype, shapes):
     ):
         pytest.skip(f"`{f_name}` does not have an array-agnostic implementation "
                     "and cannot delegate to PyTorch.")
-    if is_dask(xp) and f_name == 'rel_entr':
-        pytest.skip("boolean index assignment")
     if is_jax(xp) and f_name == "stdtrit":
         pytest.skip(f"`{f_name}` requires scipy.optimize support for immutable arrays")
-    if is_array_api_strict(xp) and f_name == "xlogy":
-        pytest.skip(f"`{f_name}` needs data-apis/array-api-strict#131 to be resolved")
 
     shapes = shapes[:n_args]
-    f = getattr(special, f_name)
+    f = getattr(special, f_name)  # Unwrapped
+    fw = getattr(special_wrapped, f_name)  # Wrapped by lazy_xp_function
 
     dtype_np = getattr(np, dtype)
     dtype_xp = getattr(xp, dtype)
@@ -91,7 +102,14 @@ def test_support_alternative_backends(xp, f_name, n_args, dtype, shapes):
 
     args_xp = [xp.asarray(arg, dtype=dtype_xp) for arg in args_np]
 
-    res = f(*args_xp)
+    if is_dask(xp):
+        # We're using map_blocks to dispatch the function to Dask.
+        # This is the correct thing to do IF all tested functions are elementwise;
+        # otherwise the output would change depending on chunking.
+        # Try to trigger bugs related to having multiple chunks.
+        args_xp = [arg.rechunk(5) for arg in args_xp]
+
+    res = fw(*args_xp)
     ref = xp.asarray(f(*args_np), dtype=dtype_xp)
 
     eps = np.finfo(dtype_np).eps
diff --git a/scipy/stats/_continued_fraction.py b/scipy/stats/_continued_fraction.py
index 7e02fa66a253..efa0411608ab 100644
--- a/scipy/stats/_continued_fraction.py
+++ b/scipy/stats/_continued_fraction.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from scipy._lib._array_api import (
-    array_namespace, xp_ravel, xp_copy, is_torch,  xp_default_dtype
+    array_namespace, xp_ravel, xp_copy, xp_promote
 )
 import scipy._lib._elementwise_iterative_method as eim
 from scipy._lib._util import _RichResult
@@ -29,6 +29,14 @@ def _continued_fraction_iv(a, b, args, tolerances, maxiter, log):
     if not np.iterable(args):
         args = (args,)
 
+    # Call each callable once to determine namespace and dtypes
+    a0, b0 = a(0, *args), b(0, *args)
+    xp = array_namespace(a0, b0, *args)
+    a0, b0, *args = xp_promote(a0, b0, *args, force_floating=True, broadcast=True,
+                               xp=xp)
+    shape, dtype = a0.shape, a0.dtype
+    a0, b0, *args = (xp_ravel(arg) for arg in (a0, b0) + tuple(args))
+
     tolerances = {} if tolerances is None else tolerances
     eps = tolerances.get('eps', None)
     tiny = tolerances.get('tiny', None)
@@ -53,7 +61,7 @@ def _continued_fraction_iv(a, b, args, tolerances, maxiter, log):
     if not isinstance(log, bool):
         raise ValueError('`log` must be boolean.')
 
-    return a, b, args, eps, tiny, maxiter, log
+    return a, b, args, eps, tiny, maxiter, log, a0, b0, shape, dtype, xp
 
 
 def _continued_fraction(a, b, *, args=(), tolerances=None, maxiter=100, log=False):
@@ -265,7 +273,7 @@ def _continued_fraction(a, b, *, args=(), tolerances=None, maxiter=100, log=Fals
     """
 
     res = _continued_fraction_iv(a, b, args, tolerances, maxiter, log)
-    a, b, args, eps, tiny, maxiter, log = res
+    a, b, args, eps, tiny, maxiter, log, a0, b0, shape, dtype, xp = res
     callback = None  # don't want to test it, but easy to add later
 
     # The EIM framework was designed for the case in where there would
@@ -274,7 +282,6 @@ def _continued_fraction(a, b, *, args=(), tolerances=None, maxiter=100, log=Fals
     # and the first argument is an integer (the number of the term). Rather
     # than complicate the framework, we wrap the user-provided callables to
     # make this problem fit within the existing framework.
-    xp = array_namespace(*args) if args else array_namespace(a(0))
 
     def a(n, *args, a=a):
         n = int(xp.real(xp_ravel(n))[0])
@@ -287,36 +294,7 @@ def b(n, *args, b=b):
     def func(n, *args):
         return xp.stack((a(n, *args), b(n, *args)), axis=-1)
 
-    # Initialization
-    # The EIM framework was written with only one callable in mind. Again,
-    # rather than complicating the framework, we call its `initialize` function
-    # on each callable to get the shape and dtype, then we broadcast these
-    # shapes, compute the result dtype, and broadcast/promote the zeroth terms
-    # and `*args` to this shape/dtype.
-
-    # `float32` here avoids influencing precision of resulting float type
-    # patch up promotion: in numpy (int64, float32) -> float64, while in torch
-    # (int64, float32) -> float32 irrespective of the default_dtype.
-    dt = {'dtype': None
-          if is_torch(xp) and xp_default_dtype(xp) == xp.float64
-          else xp.float32}
-    zero = xp.asarray(0, **dt)
-
-    temp = eim._initialize(a, (zero,), args, complex_ok=True)
-    _, _, fs_a, _, shape_a, dtype_a, xp_a = temp
-    temp = eim._initialize(b, (zero,), args, complex_ok=True)
-    _, _, fs_b, _, shape_b, dtype_b, xp_b = temp
-
-    xp = array_namespace(fs_a[0], fs_b[0], *args)
-
-    shape = np.broadcast_shapes(shape_a, shape_b)  # OK to use NumPy on tuples
-    dtype = xp.result_type(dtype_a, dtype_b)
-    an = xp.astype(xp_ravel(xp.broadcast_to(xp.reshape(fs_a[0], shape_a), shape)), dtype)  # noqa: E501
-    bn = xp.astype(xp_ravel(xp.broadcast_to(xp.reshape(fs_b[0], shape_b), shape)), dtype)  # noqa: E501
-    args = [xp.astype(xp_ravel(xp.broadcast_to(arg, shape)), dtype) for arg in args]
-
-    status = xp.full_like(an, xp.asarray(eim._EINPROGRESS),
-                          dtype=xp.int32)  # in progress
+    status = xp.full_like(a0, eim._EINPROGRESS, dtype=xp.int32)  # in progress
     nit, nfev = 0, 1  # one function evaluation (per function) performed above
     maxiter = 100 if maxiter is None else maxiter
 
@@ -331,7 +309,7 @@ def func(n, *args):
 
     # "Set f0 and C0 to the value b0 or to tiny if b0=0. Set D0 = 0.
     zero = -xp.inf if log else 0
-    fn = xp.where(bn == zero, tiny, bn)
+    fn = xp.where(b0 == zero, tiny, b0)
     Cnm1 = xp_copy(fn)
     Dnm1 = xp.full_like(fn, zero)
 
diff --git a/scipy/stats/_distribution_infrastructure.py b/scipy/stats/_distribution_infrastructure.py
index 6d41162d066d..760501219279 100644
--- a/scipy/stats/_distribution_infrastructure.py
+++ b/scipy/stats/_distribution_infrastructure.py
@@ -3510,10 +3510,10 @@ def make_distribution(dist):
             is defined. The preferred interface may change in future SciPy versions,
             in which case support for an old interface version may be deprecated
             and eventually removed.
-        parameters : dict
-            Each key is the name of a parameter,
+        parameters : dict or tuple
+            If a dictionary, each key is the name of a parameter,
             and the corresponding value is either a dictionary or tuple.
-            If a dictionary, it may have the following items, with default
+            If the value is a dictionary, it may have the following items, with default
             values used for entries which aren't present.
 
             endpoints : tuple, default: (-inf, inf)
@@ -3533,7 +3533,17 @@ def make_distribution(dist):
                 ``endpoints`` tuple above, and should define a subinterval of the
                 domain given by ``endpoints``.
 
-            A ``tuple`` value ``(a, b)`` is equivalent to ``{endpoints: (a, b)}``.
+            A tuple value ``(a, b)`` associated to a key in the ``parameters``
+            dictionary is equivalent to ``{endpoints: (a, b)}``.
+
+            Custom distributions with multiple parameterizations can be defined by
+            having the ``parameters`` attribute be a tuple of dictionaries with
+            the structure described above. In this case, ``dist``\'s class must also
+            define a method ``process_parameters`` to map between the different
+            parameterizations. It must take all parameters from all parameterizations
+            as optional keyword arguments and return a dictionary mapping parameters to
+            values, filling in values from other parameterizations using values from
+            the supplied parameterization. See example.
 
         support : dict or tuple
             A dictionary describing the support of the distribution or a tuple
@@ -3548,7 +3558,9 @@ def make_distribution(dist):
         ``moment``, and ``sample``.
         If defined, these methods must accept the parameters of the distribution as
         keyword arguments and also accept any positional-only arguments accepted by
-        the corresponding method of `ContinuousDistribution`. The ``moment`` method
+        the corresponding method of `ContinuousDistribution`.
+        When multiple parameterizations are defined, these methods must accept
+        all parameters from all parameterizations. The ``moment`` method
         must accept the ``order`` and ``kind`` arguments by position or keyword, but
         may return ``None`` if a formula is not available for the arguments; in this
         case, the infrastructure will fall back to a default implementation. The
@@ -3577,6 +3589,7 @@ class or its methods for more information.
     >>> import numpy as np
     >>> import matplotlib.pyplot as plt
     >>> from scipy import stats
+    >>> from scipy import special
 
     Create a `ContinuousDistribution` from `scipy.stats.loguniform`.
 
@@ -3651,6 +3664,45 @@ class or its methods for more information.
     >>> np.isclose(X.cdf(2.1), Y.cdf(2.1))
     np.True_
 
+    Create a custom distribution with multiple parameterizations. Here we create a
+    custom version of the beta distribution that has an alternative parameterization
+    in terms of the mean ``mu`` and a dispersion parameter ``nu``.
+
+    >>> class MyBeta:
+    ...     @property
+    ...     def __make_distribution_version__(self):
+    ...         return "1.16.0"
+    ...
+    ...     @property
+    ...     def parameters(self):
+    ...         return ({"a": (0, np.inf), "b": (0, np.inf)},
+    ...                 {"mu": (0, 1), "nu": (0, np.inf)})
+    ...
+    ...     def process_parameters(self, a=None, b=None, mu=None, nu=None):
+    ...         if a is not None and b is not None:
+    ...             nu = a + b
+    ...             mu = a / nu
+    ...         else:
+    ...             a = mu * nu
+    ...             b = nu - a
+    ...         return dict(a=a, b=b, mu=mu, nu=nu)
+    ...
+    ...     @property
+    ...     def support(self):
+    ...         return {'endpoints': (0, 1)}
+    ...
+    ...     def pdf(self, x, a, b, mu, nu):
+    ...         return special._ufuncs._beta_pdf(x, a, b)
+    ...
+    ...     def cdf(self, x, a, b, mu, nu):
+    ...         return special.betainc(a, b, x)
+    >>>
+    >>> MyBeta = stats.make_distribution(MyBeta())
+    >>> X = MyBeta(a=2.0, b=2.0)
+    >>> Y = MyBeta(mu=0.5, nu=4.0)
+    >>> np.isclose(X.pdf(0.3), Y.pdf(0.3))
+    np.True_
+
     """
     if dist in {stats.levy_stable, stats.vonmises}:
         raise NotImplementedError(f"`{dist.name}` is not supported.")
@@ -3795,13 +3847,24 @@ def _get_domain_info(info):
 
 
 def _make_distribution_custom(dist):
-    parameters = []
-
-    for name, info in dist.parameters.items():
-        domain_info, typical = _get_domain_info(info)
-        domain = _RealDomain(**domain_info)
-        param = _RealParameter(name, domain=domain, typical=typical)
-        parameters.append(param)
+    dist_parameters = (
+        dist.parameters if isinstance(dist.parameters, tuple) else (dist.parameters, )
+    )
+    parameterizations = []
+    for parameterization in dist_parameters:
+        # The user-facing ``parameters`` attribute of a custom distribution
+        # specifies what the infrastructure calls ``_parameterizations``; each
+        # dictionary in it becomes one `_Parameterization`.
+        parameters = []
+        for name, info in parameterization.items():
+            domain_info, typical = _get_domain_info(info)
+            domain = _RealDomain(**domain_info)
+            param = _RealParameter(name, domain=domain, typical=typical)
+            parameters.append(param)
+        # A parameterless distribution must leave `_parameterizations` empty
+        # (as before multiple parameterizations were supported), not `[[]]`.
+        if parameters:
+            parameterizations.append(_Parameterization(*parameters))
 
     domain_info, _ = _get_domain_info(dist.support)
     _x_support = _RealDomain(**domain_info)
@@ -3809,8 +3872,7 @@ def _make_distribution_custom(dist):
     repr_str = dist.__class__.__name__
 
     class CustomDistribution(ContinuousDistribution):
-        _parameterizations = ([_Parameterization(*parameters)] if parameters
-                              else [])
+        _parameterizations = parameterizations
         _variable = _x_param
 
         def __repr__(self):
@@ -3847,6 +3909,13 @@ def _moment_standardized_formula(self, order, **kwargs):
         CustomDistribution._moment_central_formula = _moment_central_formula
         CustomDistribution._moment_standardized_formula = _moment_standardized_formula
 
+    if hasattr(dist, 'process_parameters'):
+        setattr(
+            CustomDistribution,
+            "_process_parameters",
+            getattr(dist, "process_parameters")
+        )
+
     support_etc = _combine_docs(CustomDistribution, include_examples=False).lstrip()
     docs = [
         f"This class represents `{repr_str}` as a subclass of "
diff --git a/scipy/stats/_entropy.py b/scipy/stats/_entropy.py
index 34bdb779ab13..12e7c45a0dd7 100644
--- a/scipy/stats/_entropy.py
+++ b/scipy/stats/_entropy.py
@@ -8,7 +8,7 @@
 import numpy as np
 from scipy import special
 from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
-from scipy._lib._array_api import array_namespace
+from scipy._lib._array_api import array_namespace, xp_promote
 from scipy._lib import array_api_extra as xpx
 
 __all__ = ['entropy', 'differential_entropy']
@@ -318,9 +318,7 @@ class of statistics based on spacings. Scandinavian Journal of
 
     """
     xp = array_namespace(values)
-    values = xp.asarray(values)
-    if xp.isdtype(values.dtype, "integral"):  # type: ignore[union-attr]
-        values = xp.astype(values, xp.asarray(1.).dtype)
+    values = xp_promote(values, force_floating=True, xp=xp)
     values = xp.moveaxis(values, axis, -1)
     n = values.shape[-1]  # type: ignore[union-attr]
 
diff --git a/scipy/stats/_morestats.py b/scipy/stats/_morestats.py
index 3926856d2510..27575dcd3f9b 100644
--- a/scipy/stats/_morestats.py
+++ b/scipy/stats/_morestats.py
@@ -16,6 +16,7 @@
     array_namespace,
     xp_size,
     xp_vector_norm,
+    xp_promote,
 )
 
 from ._ansari_swilk_statistics import gscale, swilk
@@ -863,8 +864,7 @@ def _log_var(logx, xp):
     # compute log of variance of x from log(x)
     logmean = _log_mean(logx)
     # get complex dtype with component dtypes same as `logx` dtype;
-    # see data-apis/array-api#841
-    dtype = xp.result_type(logx.dtype, xp.complex64)
+    dtype = xp.result_type(logx.dtype, 1j)
     pij = xp.full(logx.shape, pi * 1j, dtype=dtype)
     logxmu = special.logsumexp(xp.stack((logx, logmean + pij)), axis=0)
     res = (xp.real(xp.asarray(special.logsumexp(2 * logxmu, axis=0)))
@@ -896,14 +896,17 @@ def boxcox_llf(lmb, data):
 
     Notes
     -----
-    The Box-Cox log-likelihood function is defined here as
+    The Box-Cox log-likelihood function :math:`l` is defined here as
 
     .. math::
 
-        llf = (\lambda - 1) \sum_i(\log(x_i)) -
-              N/2 \log(\sum_i (y_i - \bar{y})^2 / N),
+        l = (\lambda - 1) \sum_i^N \log(x_i) -
+              \frac{N}{2} \log\left(\sum_i^N (y_i - \bar{y})^2 / N\right),
 
-    where ``y`` is the Box-Cox transformed input data ``x``.
+    where :math:`N` is the number of data points in ``data`` and :math:`y` is the
+    Box-Cox transformed input data.
+    This corresponds to the *profile log-likelihood* of the original data :math:`x`
+    with some constant terms dropped.
 
     Examples
     --------
@@ -953,16 +956,12 @@ def boxcox_llf(lmb, data):
 
     """
     xp = array_namespace(data)
-    data = xp.asarray(data)
+    data = xp_promote(data, force_floating=True, xp=xp)
+
     N = data.shape[0]
     if N == 0:
         return xp.nan
 
-    dt = data.dtype
-    if xp.isdtype(dt, 'integral'):
-        data = xp.asarray(data, dtype=xp.float64)
-        dt = xp.float64
-
     logdata = xp.log(data)
 
     # Compute the variance of the transformed data.
@@ -977,7 +976,7 @@ def boxcox_llf(lmb, data):
         logvar = _log_var(logx, xp) - 2 * math.log(abs(lmb))
 
     res = (lmb - 1) * xp.sum(logdata, axis=0) - N/2 * logvar
-    res = xp.astype(res, dt)
+    res = xp.astype(res, data.dtype, copy=False)
     res = res[()] if res.ndim == 0 else res
     return res
 
@@ -1081,10 +1080,15 @@ def boxcox(x, lmbda=None, alpha=None, optimizer=None):
 
     Notes
     -----
-    The Box-Cox transform is given by::
+    The Box-Cox transform is given by:
+
+    .. math::
 
-        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
-            log(x),                  for lmbda = 0
+        y =
+        \begin{cases}
+        \frac{x^\lambda - 1}{\lambda}, &\text{for } \lambda \neq 0 \\
+        \log(x),                       &\text{for } \lambda = 0
+        \end{cases}
 
     `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
     transformation provides a shift parameter to achieve this; `boxcox` does
@@ -1096,9 +1100,9 @@ def boxcox(x, lmbda=None, alpha=None, optimizer=None):
 
     .. math::
 
-        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
+        l(\hat{\lambda}) - l(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),
 
-    with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
+    with :math:`l` the log-likelihood function and :math:`\chi^2` the chi-squared
     function.
 
     References
@@ -1537,12 +1541,24 @@ def yeojohnson(x, lmbda=None):
 
     Notes
     -----
-    The Yeo-Johnson transform is given by::
+    The Yeo-Johnson transform is given by:
+
+    .. math::
 
-        y = ((x + 1)**lmbda - 1) / lmbda,                for x >= 0, lmbda != 0
-            log(x + 1),                                  for x >= 0, lmbda = 0
-            -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda),  for x < 0, lmbda != 2
-            -log(-x + 1),                                for x < 0, lmbda = 2
+        y =
+        \begin{cases}
+        \frac{(x + 1)^\lambda - 1}{\lambda},
+        &\text{for } x \geq 0, \lambda \neq 0
+        \\
+        \log(x + 1),
+        &\text{for } x \geq 0, \lambda = 0
+        \\
+        -\frac{(-x + 1)^{2 - \lambda} - 1}{2 - \lambda},
+        &\text{for } x < 0, \lambda \neq 2
+        \\
+        -\log(-x + 1),
+        &\text{for } x < 0, \lambda = 2
+        \end{cases}
 
     Unlike `boxcox`, `yeojohnson` does not require the input data to be
     positive.
@@ -1650,15 +1666,18 @@ def yeojohnson_llf(lmb, data):
 
     Notes
     -----
-    The Yeo-Johnson log-likelihood function is defined here as
+    The Yeo-Johnson log-likelihood function :math:`l` is defined here as
 
     .. math::
 
-        llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
-              \sum_i \text{ sign }(x_i)\log(|x_i| + 1)
+        l = -\frac{N}{2} \log(\hat{\sigma}^2) + (\lambda - 1)
+              \sum_i^N \text{sign}(x_i) \log(|x_i| + 1)
 
-    where :math:`\hat{\sigma}^2` is estimated variance of the Yeo-Johnson
-    transformed input data ``x``.
+    where :math:`N` is the number of data points :math:`x` = ``data`` and
+    :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson transformed
+    input data :math:`x`.
+    This corresponds to the *profile log-likelihood* of the original data :math:`x`
+    with some constant terms dropped.
 
     .. versionadded:: 1.2.0
 
@@ -3931,9 +3950,7 @@ def median_test(*samples, ties='below', correction=True, lambda_=1,
 def _circfuncs_common(samples, period, xp=None):
     xp = array_namespace(samples) if xp is None else xp
 
-    if xp.isdtype(samples.dtype, 'integral'):
-        dtype = xp.asarray(1.).dtype  # get default float type
-        samples = xp.asarray(samples, dtype=dtype)
+    samples = xp_promote(samples, force_floating=True, xp=xp)
 
     # Recast samples as radians that range between 0 and 2 pi and calculate
     # the sine and cosine
diff --git a/scipy/stats/_multivariate.py b/scipy/stats/_multivariate.py
index 0fdb6f63bfcd..bad820cfef12 100644
--- a/scipy/stats/_multivariate.py
+++ b/scipy/stats/_multivariate.py
@@ -3859,6 +3859,7 @@ class random_correlation_gen(multi_rv_generic):
     r"""A random correlation matrix.
 
     Return a random correlation matrix, given a vector of eigenvalues.
+    The returned matrix is symmetric positive semidefinite with unit diagonal.
 
     The `eigs` keyword specifies the eigenvalues of the correlation matrix,
     and implies the dimension.
@@ -3871,7 +3872,8 @@ class random_correlation_gen(multi_rv_generic):
     Parameters
     ----------
     eigs : 1d ndarray
-        Eigenvalues of correlation matrix
+        Eigenvalues of correlation matrix. All eigenvalues need to be non-negative and
+        need to sum to the number of eigenvalues.
     seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
         If `seed` is None (or `np.random`), the `numpy.random.RandomState`
         singleton is used.
diff --git a/scipy/stats/_page_trend_test.py b/scipy/stats/_page_trend_test.py
index 87a4d0d17c07..1e11f5ac01fb 100644
--- a/scipy/stats/_page_trend_test.py
+++ b/scipy/stats/_page_trend_test.py
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
 from itertools import permutations
-import numpy as np
 import math
+import threading
+
+import numpy as np
+
 from ._continuous_distns import norm
 import scipy.stats
-from dataclasses import dataclass
 
 
 @dataclass
@@ -304,6 +307,8 @@ def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
                         method='exact')
 
     """
+    if not hasattr(_pagel_state, 'state'):
+        _pagel_state.state = _PageL()
 
     # Possible values of the method parameter and the corresponding function
     # used to evaluate the p value
@@ -405,8 +410,8 @@ def _l_p_exact(L, m, n):
     # [1] uses m, n; [5] uses n, k.
     # Switch convention here because exact calculation code references [5].
     L, n, k = int(L), int(m), int(n)
-    _pagel_state.set_k(k)
-    return _pagel_state.sf(L, n)
+    _pagel_state.state.set_k(k)
+    return _pagel_state.state.sf(L, n)
 
 
 class _PageL:
@@ -476,4 +481,6 @@ def pmf(self, l, n):
 
 
 # Maintain state for faster repeat calls to page_trend_test w/ method='exact'
-_pagel_state = _PageL()
+# _PageL() is calculated once per thread and stored as an attribute on
+# this thread-local variable inside page_trend_test().
+_pagel_state = threading.local()
diff --git a/scipy/stats/_quantile.py b/scipy/stats/_quantile.py
index 7ddfce7e992b..4d2a8778b82c 100644
--- a/scipy/stats/_quantile.py
+++ b/scipy/stats/_quantile.py
@@ -1,6 +1,6 @@
 import numpy as np
 from scipy.special import betainc
-from scipy._lib._array_api import xp_default_dtype, xp_ravel, array_namespace
+from scipy._lib._array_api import xp_ravel, array_namespace, xp_promote
 import scipy._lib.array_api_extra as xpx
 from scipy.stats._axis_nan_policy import _broadcast_arrays, _contains_nan
 from scipy.stats._stats_py import _length_nonmasked
@@ -8,17 +8,16 @@
 
 def _quantile_iv(x, p, method, axis, nan_policy, keepdims):
     xp = array_namespace(x, p)
-    x = xp.asarray(x)
-    p = xp.asarray(p)
 
-    if not xp.isdtype(x.dtype, ('integral', 'real floating')):
+    if not xp.isdtype(xp.asarray(x).dtype, ('integral', 'real floating')):
         raise ValueError("`x` must have real dtype.")
-    if xp.isdtype(x.dtype, 'integral'):
-        x = xp.astype(x, xp_default_dtype(xp))
 
-    if not xp.isdtype(p.dtype, 'real floating'):
+    if not xp.isdtype(xp.asarray(p).dtype, 'real floating'):
         raise ValueError("`p` must have real floating dtype.")
 
+    x, p = xp_promote(x, p, force_floating=True, xp=xp)
+    dtype = x.dtype
+
     axis_none = axis is None
     ndim = max(x.ndim, p.ndim)
     if axis_none:
@@ -47,10 +46,6 @@ def _quantile_iv(x, p, method, axis, nan_policy, keepdims):
         message = "If specified, `keepdims` must be True or False."
         raise ValueError(message)
 
-    dtype = xp.result_type(p, x)
-    x = xp.astype(x, dtype, copy=False)
-    p = xp.astype(p, dtype, copy=False)
-
     # If data has length zero along `axis`, the result will be an array of NaNs just
     # as if the data had length 1 along axis and were filled with NaNs. This is treated
     # naturally below whether `nan_policy` is `'propagate'` or `'omit'`.
diff --git a/scipy/stats/_resampling.py b/scipy/stats/_resampling.py
index 8552f4538427..27b1daf1375f 100644
--- a/scipy/stats/_resampling.py
+++ b/scipy/stats/_resampling.py
@@ -7,7 +7,7 @@
 
 from scipy._lib._util import (check_random_state, _rename_parameter, rng_integers,
                               _transition_to_rng)
-from scipy._lib._array_api import array_namespace, is_numpy
+from scipy._lib._array_api import array_namespace, is_numpy, xp_result_type
 from scipy.special import ndtr, ndtri, comb, factorial
 
 from ._common import ConfidenceInterval
@@ -698,6 +698,7 @@ def _monte_carlo_test_iv(data, rvs, statistic, vectorized, n_resamples,
         vectorized = 'axis' in signature
 
     xp = array_namespace(*data)
+    dtype = xp_result_type(*data, force_floating=True, xp=xp)
 
     if not vectorized:
         if is_numpy(xp):
@@ -732,10 +733,6 @@ def _monte_carlo_test_iv(data, rvs, statistic, vectorized, n_resamples,
     if alternative not in alternatives:
         raise ValueError(f"`alternative` must be in {alternatives}")
 
-    # Infer the desired p-value dtype based on the input types
-    min_float = getattr(xp, 'float16', xp.float32)
-    dtype = xp.result_type(*data_iv, min_float)
-
     return (data_iv, rvs, statistic_vectorized, vectorized, n_resamples_int,
             batch_iv, alternative, axis_int, dtype, xp)
 
diff --git a/scipy/stats/_stats_py.py b/scipy/stats/_stats_py.py
index cb9551e36069..2f7e8e1af065 100644
--- a/scipy/stats/_stats_py.py
+++ b/scipy/stats/_stats_py.py
@@ -78,7 +78,7 @@
     is_marray,
     xp_size,
     xp_vector_norm,
-    xp_broadcast_promote,
+    xp_promote,
     xp_capabilities,
     xp_ravel,
 )
@@ -145,16 +145,6 @@ def _chk2_asarray(a, b, axis):
     return a, b, outaxis
 
 
-def _convert_common_float(*arrays, xp=None):
-    xp = array_namespace(*arrays) if xp is None else xp
-    arrays = [_asarray(array, subok=True) for array in arrays]
-    dtypes = [(xp.asarray(1.).dtype if xp.isdtype(array.dtype, 'integral')
-               else array.dtype) for array in arrays]
-    dtype = xp.result_type(*dtypes)
-    arrays = [xp.astype(array, dtype, copy=False) for array in arrays]
-    return arrays[0] if len(arrays)==1 else tuple(arrays)
-
-
 SignificanceResult = _make_tuple_bunch('SignificanceResult',
                                        ['statistic', 'pvalue'], [])
 # Let's call a SignificanceResult with legacy :correlation" attribute a
@@ -630,9 +620,10 @@ def _put_val_to_limits(a, limits, inclusive, val=np.nan, xp=None):
         mask |= (a < lower_limit) if lower_include else a <= lower_limit
     if upper_limit is not None:
         mask |= (a > upper_limit) if upper_include else a >= upper_limit
-    if xp.all(mask):
+    lazy = is_lazy_array(mask)
+    if not lazy and xp.all(mask):
         raise ValueError("No array values within given limits")
-    if xp.any(mask):
+    if lazy or xp.any(mask):
         a = xp.where(mask, val, a)
     return a, mask
 
@@ -805,11 +796,15 @@ def tmin(a, lowerlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
     a, mask = _put_val_to_limits(a, (lowerlimit, None), (inclusive, None),
                                  val=max_, xp=xp)
 
-    min_ = xp.min(a, axis=axis)
-    valid = ~xp.all(mask, axis=axis)  # At least one element above lowerlimit
-    # Output dtype is data-dependent
-    # Possible loss of precision for int types
-    res = min_ if xp.all(valid) else xp.where(valid, min_, xp.nan)
+    res = xp.min(a, axis=axis)
+    invalid = xp.all(mask, axis=axis)  # All elements are below lowerlimit
+
+    # For eager backends, output dtype is data-dependent
+    if is_lazy_array(invalid) or xp.any(invalid):
+        # Possible loss of precision for int types
+        res = xp_promote(res, force_floating=True, xp=xp)
+        res = xp.where(invalid, xp.nan, res)
+
     return res[()] if res.ndim == 0 else res
 
 
@@ -864,11 +859,15 @@ def tmax(a, upperlimit=None, axis=0, inclusive=True, nan_policy='propagate'):
     a, mask = _put_val_to_limits(a, (None, upperlimit), (None, inclusive),
                                  val=min_, xp=xp)
 
-    max_ = xp.max(a, axis=axis)
-    valid = ~xp.all(mask, axis=axis)  # At least one element below upperlimit
-    # Output dtype is data-dependent
-    # Possible loss of precision for int types
-    res = max_ if xp.all(valid) else xp.where(valid, max_, xp.nan)
+    res = xp.max(a, axis=axis)
+    invalid = xp.all(mask, axis=axis)  # All elements are above upperlimit
+
+    # For eager backends, output dtype is data-dependent
+    if is_lazy_array(invalid) or xp.any(invalid):
+        # Possible loss of precision for int types
+        res = xp_promote(res, force_floating=True, xp=xp)
+        res = xp.where(invalid, xp.nan, res)
+    
     return res[()] if res.ndim == 0 else res
 
 
@@ -1118,10 +1117,7 @@ def moment(a, order=1, axis=0, nan_policy='propagate', *, center=None):
     xp = array_namespace(a)
     a, axis = _chk_asarray(a, axis, xp=xp)
 
-    if xp.isdtype(a.dtype, 'integral'):
-        a = xp.asarray(a, dtype=xp.float64)
-    else:
-        a = xp.asarray(a)
+    a = xp_promote(a, force_floating=True, xp=xp)
 
     order = xp.asarray(order, dtype=a.dtype)
     if xp_size(order) == 0:
@@ -1158,7 +1154,8 @@ def _demean(a, mean, axis, *, xp, precision_warning=True):
     # Used in e.g. `_moment`, `_zscore`, `_xp_var`. See gh-15905.
     a_zero_mean = a - mean
 
-    if xp_size(a_zero_mean) == 0:
+    if (xp_size(a_zero_mean) == 0 or not precision_warning
+        or is_lazy_array(a_zero_mean)):
         return a_zero_mean
 
     eps = xp.finfo(mean.dtype).eps * 10
@@ -1171,7 +1168,7 @@ def _demean(a, mean, axis, *, xp, precision_warning=True):
     with np.errstate(invalid='ignore'):
         precision_loss = xp.any(xp.asarray(rel_diff < eps) & xp.asarray(n > 1))
 
-    if precision_loss and precision_warning:
+    if precision_loss:
         message = ("Precision loss occurred in moment calculation due to "
                    "catastrophic cancellation. This occurs when the data "
                    "are nearly identical. Results may be unreliable.")
@@ -1188,9 +1185,7 @@ def _moment(a, order, axis, *, mean=None, xp=None):
     """
     xp = array_namespace(a) if xp is None else xp
 
-    if xp.isdtype(a.dtype, 'integral'):
-        a = xp.asarray(a, dtype=xp.float64)
-
+    a = xp_promote(a, force_floating=True, xp=xp)
     dtype = a.dtype
 
     # moment of empty array is the same regardless of order
@@ -2860,7 +2855,7 @@ def gzscore(a, *, axis=0, ddof=0, nan_policy='propagate'):
 
     """
     xp = array_namespace(a)
-    a = _convert_common_float(a, xp=xp)
+    a = xp_promote(a, force_floating=True, xp=xp)
     log = ma.log if isinstance(a, ma.MaskedArray) else xp.log
     return zscore(log(a), axis=axis, ddof=ddof, nan_policy=nan_policy)
 
@@ -2922,7 +2917,7 @@ def zmap(scores, compare, axis=0, ddof=0, nan_policy='propagate'):
 
     like_zscore = (scores is compare)
     xp = array_namespace(scores, compare)
-    scores, compare = _convert_common_float(scores, compare, xp=xp)
+    scores, compare = xp_promote(scores, compare, force_floating=True, xp=xp)
 
     with warnings.catch_warnings():
         if like_zscore:  # zscore should not emit SmallSampleWarning
@@ -3053,7 +3048,7 @@ def gstd(a, axis=0, ddof=1, *, keepdims=False, nan_policy='propagate'):
 
     """
     xp = array_namespace(a)
-    a = xp_broadcast_promote(a, force_floating=True)[0]  # just promote to correct float
+    a = xp_promote(a, force_floating=True, xp=xp)
 
     kwargs = dict(axis=axis, correction=ddof, keepdims=keepdims, nan_policy=nan_policy)
     with np.errstate(invalid='ignore', divide='ignore'):
@@ -4652,8 +4647,8 @@ def pearsonr(x, y, *, alternative='two-sided', method=None, axis=0):
 
     """
     xp = array_namespace(x, y)
-    x = xp.asarray(x)
-    y = xp.asarray(y)
+    x, y = xp_promote(x, y, force_floating=True, xp=xp)
+    dtype = x.dtype
 
     if not is_numpy(xp) and method is not None:
         method = 'invalid'
@@ -4693,10 +4688,6 @@ def pearsonr(x, y, *, alternative='two-sided', method=None, axis=0):
     y = xp.moveaxis(y, axis, -1)
     axis = -1
 
-    dtype = xp.result_type(x.dtype, y.dtype)
-    if xp.isdtype(dtype, "integral"):
-        dtype = xp.asarray(1.).dtype
-
     if xp.isdtype(dtype, "complex floating"):
         raise ValueError('This function does not support complex data')
 
@@ -6753,11 +6744,7 @@ def ttest_ind(a, b, *, axis=0, equal_var=True, nan_policy='propagate',
     """
     xp = array_namespace(a, b)
 
-    default_float = xp.asarray(1.).dtype
-    if xp.isdtype(a.dtype, 'integral'):
-        a = xp.astype(a, default_float)
-    if xp.isdtype(b.dtype, 'integral'):
-        b = xp.astype(b, default_float)
+    a, b = xp_promote(a, b, force_floating=True, xp=xp)
 
     if axis is None:
         a, b, axis = xp_ravel(a), xp_ravel(b), 0
@@ -7355,8 +7342,8 @@ def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
 
 
 def _power_divergence(f_obs, f_exp, ddof, axis, lambda_, sum_check=True):
-    xp = array_namespace(f_obs)
-    default_float = xp.asarray(1.).dtype
+    xp = array_namespace(f_obs, f_exp)
+    f_obs, f_exp = xp_promote(f_obs, f_exp, force_floating=True, xp=xp)
 
     # Convert the input argument `lambda_` to a numerical value.
     if isinstance(lambda_, str):
@@ -7368,16 +7355,9 @@ def _power_divergence(f_obs, f_exp, ddof, axis, lambda_, sum_check=True):
     elif lambda_ is None:
         lambda_ = 1
 
-    f_obs = xp.asarray(f_obs)
-    dtype = default_float if xp.isdtype(f_obs.dtype, 'integral') else f_obs.dtype
-    f_obs = xp.asarray(f_obs, dtype=dtype)
-    f_obs_float = xp.asarray(f_obs, dtype=xp.float64)
-
     if f_exp is not None:
-        f_exp = xp.asarray(f_exp)
-        dtype = default_float if xp.isdtype(f_exp.dtype, 'integral') else f_exp.dtype
-        f_exp = xp.asarray(f_exp, dtype=dtype)
-
+        # not sure why we force to float64, but not going to touch it
+        f_obs_float = xp.asarray(f_obs, dtype=xp.float64)
         bshape = _broadcast_shapes((f_obs_float.shape, f_exp.shape))
         f_obs_float = xp.broadcast_to(f_obs_float, bshape)
         f_exp = xp.broadcast_to(f_exp, bshape)
@@ -9051,8 +9031,8 @@ def combine_pvalues(pvalues, method='fisher', weights=None, *, axis=0):
 
     """
     xp = array_namespace(pvalues, weights)
-    pvalues, weights = xp_broadcast_promote(pvalues, weights,
-                                            force_floating=True, xp=xp)
+    pvalues, weights = xp_promote(pvalues, weights, broadcast=True,
+                                  force_floating=True, xp=xp)
 
     if xp_size(pvalues) == 0:
         # This is really only needed for *testing* _axis_nan_policy decorator
@@ -10914,7 +10894,7 @@ def _xp_mean(x, /, *, axis=None, weights=None, keepdims=False, nan_policy='propa
                          or (weights is not None and xp_size(weights) == 0)):
         return gmean(x, weights=weights, axis=axis, keepdims=keepdims)
 
-    x, weights = xp_broadcast_promote(x, weights, force_floating=True)
+    x, weights = xp_promote(x, weights, broadcast=True, force_floating=True, xp=xp)
     if weights is not None:
         x, weights = _share_masks(x, weights, xp=xp)
 
@@ -10938,11 +10918,12 @@ def _xp_mean(x, /, *, axis=None, weights=None, keepdims=False, nan_policy='propa
     # appear in `x` or `weights`. Emit warning if there is an all-NaN slice.
     # Test nan_policy before the implicit call to bool(contains_nan)
     # to avoid raising on lazy xps on the default nan_policy='propagate'
-    if nan_policy == 'omit' and contains_nan:
+    lazy = is_lazy_array(x)
+    if nan_policy == 'omit' and (lazy or contains_nan):
         nan_mask = xp.isnan(x)
         if weights is not None:
             nan_mask |= xp.isnan(weights)
-        if xp.any(xp.all(nan_mask, axis=axis)):
+        if not lazy and xp.any(xp.all(nan_mask, axis=axis)):
             message = (too_small_1d_omit if (x.ndim == 1 or axis is None)
                        else too_small_nd_omit)
             warnings.warn(message, SmallSampleWarning, stacklevel=2)
diff --git a/scipy/stats/tests/test_continued_fraction.py b/scipy/stats/tests/test_continued_fraction.py
index c3b9ae5cbd5d..ac7ba3280169 100644
--- a/scipy/stats/tests/test_continued_fraction.py
+++ b/scipy/stats/tests/test_continued_fraction.py
@@ -19,23 +19,25 @@ class TestContinuedFraction:
     p = rng.uniform(1, 10, size=10)
 
     def a1(self, n, x=1.5):
-        xp = array_namespace(x)
         if n == 0:
             y = 0*x
         elif n == 1:
             y = x
         else:
             y = -x**2
-        return xp.asarray(y, dtype=x.dtype)
+        if np.isscalar(y) and np.__version__ < "2.0":
+            y = np.full_like(x, y)  # preserve dtype pre NEP 50
+        return y
 
     def b1(self, n, x=1.5):
-        xp = array_namespace(x)
         if n == 0:
             y = 0*x
         else:
             one = x/x  # gets array of correct type, dtype, and shape
             y = one * (2*n - 1)
-        return xp.asarray(y, dtype=x.dtype)
+        if np.isscalar(y) and np.__version__ < "2.0":
+            y = np.full_like(x, y)  # preserve dtype pre NEP 50
+        return y
 
     def log_a1(self, n, x):
         xp = array_namespace(x)
@@ -45,7 +47,7 @@ def log_a1(self, n, x):
             y = xp.log(x)
         else:
             y = 2 * xp.log(x) + math.pi * 1j
-        return xp.asarray(y, dtype=x.dtype)
+        return y
 
     def log_b1(self, n, x):
         xp = array_namespace(x)
@@ -54,7 +56,7 @@ def log_b1(self, n, x):
         else:
             one = x - x  # gets array of correct type, dtype, and shape
             y = one + math.log(2 * n - 1)
-        return xp.asarray(y, dtype=x.dtype)
+        return y
 
     def test_input_validation(self, xp):
         a1 = self.a1
diff --git a/scipy/stats/tests/test_continuous.py b/scipy/stats/tests/test_continuous.py
index e8e4eeeabb27..94f0fd4e1f8a 100644
--- a/scipy/stats/tests/test_continuous.py
+++ b/scipy/stats/tests/test_continuous.py
@@ -10,6 +10,7 @@
 from hypothesis import strategies, given, reproduce_failure, settings  # noqa: F401
 import hypothesis.extra.numpy as npst
 
+from scipy import special
 from scipy import stats
 from scipy.stats._fit import _kolmogorov_smirnov
 from scipy.stats._ksstats import kolmogn
@@ -1272,6 +1273,70 @@ def cdf(self, x, *, c, mu, sigma):
         assert_allclose(X1.icdf(p), X2.icdf(p))
         assert_allclose(X1.iccdf(p), X2.iccdf(p))
 
+    @pytest.mark.parametrize("a", [0.5, np.asarray([0.5, 1.0, 2.0, 4.0, 8.0])])
+    @pytest.mark.parametrize("b", [0.5, np.asarray([0.5, 1.0, 2.0, 4.0, 8.0])])
+    def test_custom_multiple_parameterizations(self, a, b):
+        rng = np.random.default_rng(7548723590230982)
+        class MyBeta:
+            @property
+            def __make_distribution_version__(self):
+                return "1.16.0"
+
+            @property
+            def parameters(self):
+                return (
+                    {"a": (0, np.inf), "b": (0, np.inf)},
+                    {"mu": (0, 1), "nu": (0, np.inf)},
+                )
+
+            def process_parameters(self, a=None, b=None, mu=None, nu=None):
+                if a is not None and b is not None and mu is None and nu is None:
+                    nu = a + b
+                    mu = a / nu
+                else:
+                    a = mu * nu
+                    b = nu - a
+                return {"a": a, "b": b, "mu": mu, "nu": nu}
+
+            @property
+            def support(self):
+                return {'endpoints': (0, 1)}
+
+            def pdf(self, x, a, b, mu, nu):
+                return special._ufuncs._beta_pdf(x, a, b)
+
+            def cdf(self, x, a, b, mu, nu):
+                return special.betainc(a, b, x)
+
+        Beta = stats.make_distribution(stats.beta)
+        MyBeta = stats.make_distribution(MyBeta())
+
+        mu = a / (a + b)
+        nu = a + b
+
+        X = MyBeta(a=a, b=b)
+        Y = MyBeta(mu=mu, nu=nu)
+        Z = Beta(a=a, b=b)
+
+        x = Z.sample(shape=10, rng=rng)
+        p = Z.cdf(x)
+
+        assert_allclose(X.support(), Z.support())
+        assert_allclose(X.median(), Z.median())
+        assert_allclose(X.pdf(x), Z.pdf(x))
+        assert_allclose(X.cdf(x), Z.cdf(x))
+        assert_allclose(X.ccdf(x), Z.ccdf(x))
+        assert_allclose(X.icdf(p), Z.icdf(p))
+        assert_allclose(X.iccdf(p), Z.iccdf(p))
+
+        assert_allclose(Y.support(), Z.support())
+        assert_allclose(Y.median(), Z.median())
+        assert_allclose(Y.pdf(x), Z.pdf(x))
+        assert_allclose(Y.cdf(x), Z.cdf(x))
+        assert_allclose(Y.ccdf(x), Z.ccdf(x))
+        assert_allclose(Y.icdf(p), Z.icdf(p))
+        assert_allclose(Y.iccdf(p), Z.iccdf(p))
+
     def test_input_validation(self):
         message = '`levy_stable` is not supported.'
         with pytest.raises(NotImplementedError, match=message):
diff --git a/scipy/stats/tests/test_marray.py b/scipy/stats/tests/test_marray.py
index df6644ab1ff2..ffb3121fcf35 100644
--- a/scipy/stats/tests/test_marray.py
+++ b/scipy/stats/tests/test_marray.py
@@ -3,7 +3,7 @@
 from scipy import stats
 
 from scipy._lib._array_api import xp_assert_close, xp_assert_equal
-from scipy.stats._stats_py import _xp_mean, _xp_var
+from scipy.stats._stats_py import _xp_mean, _xp_var, _length_nonmasked
 from scipy.stats._axis_nan_policy import _axis_nan_policy_factory
 
 
@@ -38,6 +38,7 @@ def get_arrays(n_arrays, *, dtype='float64', xp=np, shape=(7, 8), seed=849121654
 
 @skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
 @skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
+@skip_backend('torch', reason="marray#99")
 @pytest.mark.parametrize('fun, kwargs', [(stats.gmean, {}),
                                          (stats.hmean, {}),
                                          (stats.pmean, {'p': 2})])
@@ -51,6 +52,7 @@ def test_xmean(fun, kwargs, axis, xp):
 
 @skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
 @skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
+@skip_backend('torch', reason="marray#99")
 @pytest.mark.parametrize('axis', [0, 1, None])
 @pytest.mark.parametrize('keepdims', [False, True])
 def test_xp_mean(axis, keepdims, xp):
@@ -272,3 +274,16 @@ def test_ttest_ind_from_stats(xp):
     xp_assert_close(res.pvalue.mask, mask)
     assert res.statistic.shape == shape
     assert res.pvalue.shape == shape
+
+def test_length_nonmasked_marray_iterable_axis_raises():
+    xp = marray._get_namespace(np)
+
+    data = [[1.0, 2.0], [3.0, 4.0]]
+    mask = [[False, False], [True, False]]
+    marr = xp.asarray(data, mask=mask)
+
+    # Axis tuples are not currently supported for MArray input.
+    # This test can be removed after support is added.
+    with pytest.raises(NotImplementedError,
+        match="`axis` must be an integer or None for use with `MArray`"):
+        _length_nonmasked(marr, axis=(0, 1), xp=xp)
diff --git a/scipy/stats/tests/test_morestats.py b/scipy/stats/tests/test_morestats.py
index 9a8209eb7831..bb1d8b5d5a2b 100644
--- a/scipy/stats/tests/test_morestats.py
+++ b/scipy/stats/tests/test_morestats.py
@@ -2035,7 +2035,7 @@ def test_gh_6873(self, xp):
         xp_assert_close(llf, xp.asarray(-17.93934208579061))
 
     def test_instability_gh20021(self, xp):
-        data = xp.asarray([2003, 1950, 1997, 2000, 2009])
+        data = xp.asarray([2003, 1950, 1997, 2000, 2009], dtype=xp.float64)
         llf = stats.boxcox_llf(1e-8, data)
         # The expected value was computed with mpsci, set mpmath.mp.dps=100
         # expect float64 output for integer input
diff --git a/scipy/stats/tests/test_quantile.py b/scipy/stats/tests/test_quantile.py
index b181f305b53d..744d9e2cea29 100644
--- a/scipy/stats/tests/test_quantile.py
+++ b/scipy/stats/tests/test_quantile.py
@@ -2,7 +2,7 @@
 import numpy as np
 
 from scipy import stats
-from scipy._lib._array_api import xp_default_dtype, is_numpy, is_torch
+from scipy._lib._array_api import xp_default_dtype, is_numpy, is_torch, SCIPY_ARRAY_API
 from scipy._lib._array_api_no_0d import xp_assert_close, xp_assert_equal
 from scipy._lib._util import _apply_over_batch
 
@@ -133,6 +133,8 @@ def test_against_reference(self, axis, keepdims, nan_policy, dtype, method, xp):
             if is_torch(xp):
                 pytest.skip("sum_cpu not implemented for UInt64, see "
                             "data-apis/array-api-compat#242")
+            if not SCIPY_ARRAY_API:
+                pytest.skip("MArray is only available if SCIPY_ARRAY_API=1")
             marray = pytest.importorskip('marray')
             kwargs = dict(axis=axis, keepdims=keepdims, method=method)
             mxp = marray._get_namespace(xp)
@@ -160,7 +162,11 @@ def test_integer_input_output_dtype(self, xp):
          ([[], []], 0.5, np.full(2, np.nan), {'axis': -1}),
          ([[], []], 0.5, np.zeros((0,)), {'axis': 0, 'keepdims': False}),
          ([[], []], 0.5, np.zeros((1, 0)), {'axis': 0, 'keepdims': True}),
-         ([], [0.5, 0.6], np.full(2, np.nan), {}),])
+         ([], [0.5, 0.6], np.full(2, np.nan), {}),
+         (np.arange(1, 28).reshape((3, 3, 3)), 0.5, [[[14.]]],
+          {'axis': None, 'keepdims': True}),
+         ([[1, 2], [3, 4]], [0.25, 0.5, 0.75], [[1.75, 2.5, 3.25]], 
+          {'axis': None, 'keepdims': True}),])
     def test_edge_cases(self, x, p, ref, kwargs, xp):
         default_dtype = xp_default_dtype(xp)
         x, p, ref = xp.asarray(x), xp.asarray(p), xp.asarray(ref, dtype=default_dtype)
diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
index 2dfd19d8e1b8..77e96e3beddf 100644
--- a/scipy/stats/tests/test_stats.py
+++ b/scipy/stats/tests/test_stats.py
@@ -44,7 +44,8 @@
                                    is_torch, xp_default_dtype, xp_size, SCIPY_ARRAY_API,
                                    make_skip_xp_backends)
 from scipy._lib._array_api_no_0d import xp_assert_close, xp_assert_equal
-from scipy._lib import array_api_extra as xpx
+import scipy._lib.array_api_extra as xpx
+from scipy._lib.array_api_extra.testing import lazy_xp_function
 
 skip_xp_backends = pytest.mark.skip_xp_backends
 boolean_index_skip_reason = 'JAX/Dask arrays do not support boolean assignment.'
@@ -74,15 +75,20 @@
 TINY = array([1e-12,2e-12,3e-12,4e-12,5e-12,6e-12,7e-12,8e-12,9e-12], float)
 ROUND = array([0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5], float)
 
+lazy_xp_modules = [stats]
+lazy_xp_function(stats.tmean, static_argnames=("inclusive", "axis"))
+lazy_xp_function(stats.tvar, static_argnames=("inclusive", "axis", "ddof"))
+lazy_xp_function(stats.tstd, static_argnames=("inclusive", "axis", "ddof"))
+lazy_xp_function(stats.tsem, static_argnames=("inclusive", "axis", "ddof"))
+lazy_xp_function(stats.tmin, static_argnames=("inclusive", "axis"))
+lazy_xp_function(stats.tmax, static_argnames=("inclusive", "axis"))
+
 
 class TestTrimmedStats:
     # TODO: write these tests to handle missing values properly
     dprec = np.finfo(np.float64).precision
 
     @make_skip_xp_backends(stats.tmean)
-    @pytest.mark.filterwarnings(
-        "ignore:invalid value encountered in divide:RuntimeWarning:dask"
-    )
     def test_tmean(self, xp):
         default_dtype = xp_default_dtype(xp)
         x = xp.asarray(X, dtype=default_dtype)
@@ -172,19 +178,17 @@ def test_tstd(self, xp):
         xp_assert_close(y, xp.std(x, correction=1))
 
     @make_skip_xp_backends(stats.tmin)
-    @pytest.mark.xfail_xp_backends("array_api_strict",
-                                   reason="broadcast int dtype vs. xp.nan")
     def test_tmin(self, xp):
-        x = xp.arange(10)
-        xp_assert_equal(stats.tmin(x), xp.asarray(0))
-        xp_assert_equal(stats.tmin(x, lowerlimit=0), xp.asarray(0))
-        xp_assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False), xp.asarray(1))
+        x = xp.arange(10.)
+        xp_assert_equal(stats.tmin(x), xp.asarray(0.))
+        xp_assert_equal(stats.tmin(x, lowerlimit=0), xp.asarray(0.))
+        xp_assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False), xp.asarray(1.))
 
         x = xp.reshape(x, (5, 2))
         xp_assert_equal(stats.tmin(x, lowerlimit=0, inclusive=False),
-                        xp.asarray([2, 1]))
-        xp_assert_equal(stats.tmin(x, axis=1), xp.asarray([0, 2, 4, 6, 8]))
-        xp_assert_equal(stats.tmin(x, axis=None), xp.asarray(0))
+                        xp.asarray([2., 1.]))
+        xp_assert_equal(stats.tmin(x, axis=1), xp.asarray([0., 2., 4., 6., 8.]))
+        xp_assert_equal(stats.tmin(x, axis=None), xp.asarray(0.))
 
         x = xpx.at(xp.arange(10.), 9).set(xp.nan)
         xp_assert_equal(stats.tmin(x), xp.asarray(xp.nan))
@@ -213,19 +217,17 @@ def test_tmin_scalar_and_nanpolicy(self, xp):
                 stats.tmin(x, nan_policy='foobar')
 
     @make_skip_xp_backends(stats.tmax)
-    @pytest.mark.xfail_xp_backends("array_api_strict",
-                                   reason="broadcast int dtype vs. xp.nan")
     def test_tmax(self, xp):
-        x = xp.arange(10)
-        xp_assert_equal(stats.tmax(x), xp.asarray(9))
-        xp_assert_equal(stats.tmax(x, upperlimit=9), xp.asarray(9))
-        xp_assert_equal(stats.tmax(x, upperlimit=9, inclusive=False), xp.asarray(8))
+        x = xp.arange(10.)
+        xp_assert_equal(stats.tmax(x), xp.asarray(9.))
+        xp_assert_equal(stats.tmax(x, upperlimit=9), xp.asarray(9.))
+        xp_assert_equal(stats.tmax(x, upperlimit=9, inclusive=False), xp.asarray(8.))
 
         x = xp.reshape(x, (5, 2))
         xp_assert_equal(stats.tmax(x, upperlimit=9, inclusive=False),
-                        xp.asarray([8, 7]))
-        xp_assert_equal(stats.tmax(x, axis=1), xp.asarray([1, 3, 5, 7, 9]))
-        xp_assert_equal(stats.tmax(x, axis=None), xp.asarray(9))
+                        xp.asarray([8., 7.]))
+        xp_assert_equal(stats.tmax(x, axis=1), xp.asarray([1., 3., 5., 7., 9.]))
+        xp_assert_equal(stats.tmax(x, axis=None), xp.asarray(9.))
 
         x = xpx.at(xp.arange(10.), 9).set(xp.nan)
         xp_assert_equal(stats.tmax(x), xp.asarray(xp.nan))
@@ -255,6 +257,29 @@ def test_tmax_scalar_and_nanpolicy(self, xp):
             with assert_raises(ValueError, match=msg):
                 stats.tmax(x, nan_policy='foobar')
 
+    @make_skip_xp_backends(stats.tmin, stats.tmax)
+    def test_tmin_tmax_int_dtype(self, xp):
+        x = xp.reshape(xp.arange(10, dtype=xp.int16), (2, 5)).T
+
+        # When tmin/tmax don't need to inject any NaNs, eager backends
+        # retain the input dtype. Lazy backends (Dask/JAX) can't inspect
+        # the data, so they always return the default float dtype.
+        expect_dtype = xp_default_dtype(xp) if is_lazy_array(x) else x.dtype
+        xp_assert_equal(stats.tmin(x), xp.asarray([0, 5], dtype=expect_dtype))
+        xp_assert_equal(stats.tmax(x), xp.asarray([4, 9], dtype=expect_dtype))
+
+        # When they do inject NaNs, all backends return a floating-point result.
+        xp_assert_equal(stats.tmin(x, lowerlimit=6), xp.asarray([xp.nan, 6.]))
+        xp_assert_equal(stats.tmax(x, upperlimit=3), xp.asarray([3., xp.nan]))
+
+    @skip_xp_backends(eager_only=True, reason="Only with data-dependent output dtype")
+    @make_skip_xp_backends(stats.tmin, stats.tmax)
+    def test_gh_22626(self, xp):
+        # Test that `tmin`/`tmax` return exact results with outrageously large integers
+        x = xp.arange(2**62, 2**62+10)
+        xp_assert_equal(stats.tmin(x[None, :]), x)
+        xp_assert_equal(stats.tmax(x[None, :]), x)
+
     @make_skip_xp_backends(stats.tsem)
     def test_tsem(self, xp):
         x = xp.asarray(X.tolist())  # use default dtype of xp
@@ -264,13 +289,6 @@ def test_tsem(self, xp):
         xp_assert_close(y, xp.std(y_ref, correction=1) / xp_size(y_ref)**0.5)
         xp_assert_close(stats.tsem(x, limits=[-1, 10]), stats.tsem(x, limits=None))
 
-    @make_skip_xp_backends(stats.tmax, stats.tmin)
-    def test_gh_22626(self, xp):
-        # Test that `tmin`/`tmax` returns exact result with outrageously large integers
-        x = xp.arange(2**62, 2**62+10)
-        xp_assert_equal(stats.tmin(x[None, :]), x)
-        xp_assert_equal(stats.tmax(x[None, :]), x)
-
 
 class TestPearsonrWilkinson:
     """ W.II.D. Compute a correlation matrix on all the variables.
@@ -2111,15 +2129,15 @@ def wkq(x, y, rank, weigher, add):
     def weigher(x):
         return 1. / (x + 1)
 
-    np.random.seed(42)
+    rng = np.random.default_rng(42)
     for s in range(3,10):
         a = []
         # Generate rankings with ties
         for i in range(s):
             a += [i]*i
         b = list(a)
-        np.random.shuffle(a)
-        np.random.shuffle(b)
+        rng.shuffle(a)
+        rng.shuffle(b)
         # First pass: use element indices as ranks
         rank = np.arange(len(a), dtype=np.intp)
         for _ in range(2):
@@ -2128,7 +2146,7 @@ def weigher(x):
                 actual = stats.weightedtau(a, b, rank, weigher, add).statistic
                 assert_approx_equal(expected, actual)
             # Second pass: use a random rank
-            np.random.shuffle(rank)
+            rng.shuffle(rank)
 
 
 class TestFindRepeats:
@@ -2746,7 +2764,7 @@ def test_gh16955(self, nan_policy):
         # was deprecated, so check for the appropriate error.
         my_dtype = np.dtype([('asdf', np.uint8), ('qwer', np.float64, (3,))])
         test = np.zeros(10, dtype=my_dtype)
-        message = "Argument `a` is not....|An argument has dtype..."
+        message = "Argument `a` is not....|An argument has dtype...|The DType..."
         with pytest.raises(TypeError, match=message):
             stats.mode(test, nan_policy=nan_policy)
 
@@ -3025,7 +3043,11 @@ def test_zscore_nan_raise(self, xp):
 
     def test_zscore_constant_input_1d(self, xp):
         x = xp.asarray([-0.087] * 3)
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(x)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
+
+        with warn_ctx:
             z = stats.zscore(x)
         xp_assert_equal(z, xp.full(x.shape, xp.nan))
 
@@ -3036,12 +3058,16 @@ def test_zscore_constant_input_1d(self, xp):
     def test_zscore_constant_input_2d(self, xp):
         x = xp.asarray([[10.0, 10.0, 10.0, 10.0],
                         [10.0, 11.0, 12.0, 13.0]])
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(x)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
+
+        with warn_ctx:
             z0 = stats.zscore(x, axis=0)
         xp_assert_close(z0, xp.asarray([[xp.nan, -1.0, -1.0, -1.0],
                                         [xp.nan, 1.0, 1.0, 1.0]]))
 
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        with warn_ctx:
             z1 = stats.zscore(x, axis=1)
         xp_assert_equal(z1, xp.stack([xp.asarray([xp.nan, xp.nan, xp.nan, xp.nan]),
                                       stats.zscore(x[1, :])]))
@@ -3050,7 +3076,7 @@ def test_zscore_constant_input_2d(self, xp):
         xp_assert_equal(z, xp.reshape(stats.zscore(xp.reshape(x, (-1,))), x.shape))
 
         y = xp.ones((3, 6))
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        with warn_ctx:
             z = stats.zscore(y, axis=None)
         xp_assert_equal(z, xp.full(y.shape, xp.asarray(xp.nan)))
 
@@ -3061,14 +3087,17 @@ def test_zscore_constant_input_2d_nan_policy_omit(self, xp):
                         [10.0, 12.0, xp.nan, 10.0]])
         s = (3/2)**0.5
         s2 = 2**0.5
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(x)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
 
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        with warn_ctx:
             z0 = stats.zscore(x, nan_policy='omit', axis=0)
         xp_assert_close(z0, xp.asarray([[xp.nan, -s, -1.0, xp.nan],
                                         [xp.nan, 0, 1.0, xp.nan],
                                         [xp.nan, s, xp.nan, xp.nan]]))
 
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        with warn_ctx:
             z1 = stats.zscore(x, nan_policy='omit', axis=1)
         xp_assert_close(z1, xp.asarray([[xp.nan, xp.nan, xp.nan, xp.nan],
                                         [-s, 0, s, xp.nan],
@@ -3145,7 +3174,11 @@ def test_degenerate_input(self, xp):
         scores = xp.arange(3)
         compare = xp.ones(3)
         ref = xp.asarray([-xp.inf, xp.nan, xp.inf])
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred..."):
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(scores)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
+
+        with warn_ctx:
             res = stats.zmap(scores, compare)
         xp_assert_equal(res, ref)
 
@@ -3698,19 +3731,29 @@ def test_skew_propagate_nan(self, xp):
 
     def test_skew_constant_value(self, xp):
         # Skewness of a constant input should be NaN (gh-16061)
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred"):
-            a = xp.asarray([-0.27829495]*10)  # xp.repeat not currently available
+        a = xp.asarray([-0.27829495]*10)  # xp.repeat not currently available
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(a)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
+
+        with warn_ctx:
             xp_assert_equal(stats.skew(a), xp.asarray(xp.nan))
+        with warn_ctx:
             xp_assert_equal(stats.skew(a*2.**50), xp.asarray(xp.nan))
+        with warn_ctx:
             xp_assert_equal(stats.skew(a/2.**50), xp.asarray(xp.nan))
+        with warn_ctx:
             xp_assert_equal(stats.skew(a, bias=False), xp.asarray(xp.nan))
 
-            # # similarly, from gh-11086:
-            a = xp.asarray([14.3]*7)
+        # similarly, from gh-11086:
+        a = xp.asarray([14.3]*7)
+        with warn_ctx:
             xp_assert_equal(stats.skew(a), xp.asarray(xp.nan))
-            a = 1. + xp.arange(-3., 4)*1e-16
+        a = 1. + xp.arange(-3., 4)*1e-16
+        with warn_ctx:
             xp_assert_equal(stats.skew(a), xp.asarray(xp.nan))
 
+    @skip_xp_backends(eager_only=True)
     def test_precision_loss_gh15554(self, xp):
         # gh-15554 was one of several issues that have reported problems with
         # constant or near-constant input. We can't always fix these, but
@@ -3721,7 +3764,6 @@ def test_precision_loss_gh15554(self, xp):
             a[:, 0] = 1.01
             stats.skew(xp.asarray(a))
 
-    @pytest.mark.skip_xp_backends('dask.array', reason=boolean_index_skip_reason)
     @pytest.mark.parametrize('axis', [-1, 0, 2, None])
     @pytest.mark.parametrize('bias', [False, True])
     def test_vectorization(self, xp, axis, bias):
@@ -3810,13 +3852,19 @@ def test_kurtosis_propagate_nan(self):
     def test_kurtosis_constant_value(self, xp):
         # Kurtosis of a constant input should be NaN (gh-16061)
         a = xp.asarray([-0.27829495]*10)
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred"):
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(a)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
+
+        with warn_ctx:
             assert xp.isnan(stats.kurtosis(a, fisher=False))
+        with warn_ctx:
             assert xp.isnan(stats.kurtosis(a * float(2**50), fisher=False))
+        with warn_ctx:
             assert xp.isnan(stats.kurtosis(a / float(2**50), fisher=False))
+        with warn_ctx:
             assert xp.isnan(stats.kurtosis(a, fisher=False, bias=False))
 
-    @pytest.mark.skip_xp_backends('dask.array', reason=boolean_index_skip_reason)
     @pytest.mark.parametrize('axis', [-1, 0, 2, None])
     @pytest.mark.parametrize('bias', [False, True])
     @pytest.mark.parametrize('fisher', [False, True])
@@ -5702,8 +5750,9 @@ def test_ttest_many_dims(self, kwds, equal_var):
     def test_nans_on_axis(self, kwds, axis):
         # confirm that with `nan_policy='propagate'`, NaN results are returned
         # on the correct location
-        a = np.random.randint(10, size=(5, 3, 10)).astype('float')
-        b = np.random.randint(10, size=(5, 3, 10)).astype('float')
+        rng = np.random.default_rng(363836384995579937222)
+        a = rng.integers(10, size=(5, 3, 10)).astype('float')
+        b = rng.integers(10, size=(5, 3, 10)).astype('float')
         # set some indices in `a` and `b` to be `np.nan`.
         a[0][2][3] = np.nan
         b[2][0][6] = np.nan
@@ -6061,8 +6110,13 @@ def test_ttest_ind_zero_division(self, xp):
         # test zero division problem
         x = xp.zeros(3)
         y = xp.ones(3)
-        with pytest.warns(RuntimeWarning, match="Precision loss occurred"):
+        warn_ctx = (
+            contextlib.nullcontext() if is_lazy_array(x)
+            else pytest.warns(RuntimeWarning, match="Precision loss occurred..."))
+
+        with warn_ctx:
             t, p = stats.ttest_ind(x, y, equal_var=False)
+
         xp_assert_equal(t, xp.asarray(-xp.inf))
         xp_assert_equal(p, xp.asarray(0.))
 
@@ -7276,16 +7330,17 @@ def test_1d_numeric_array_like_input(self, xp):
         assert_allclose(gstd_actual, self.gstd_array_1d)
 
     @skip_xp_invalid_arg
-    def test_raises_value_error_non_numeric_input(self, xp):
-        # this is raised by NumPy, but it's quite interpretable
-        with pytest.raises(TypeError, match="ufunc 'log' not supported"):
+    def test_raises_error_non_numeric_input(self, xp):
+        message = "could not convert string to float|The DType..."
+        with pytest.raises((ValueError, TypeError), match=message):
             stats.gstd('You cannot take the logarithm of a string.')
 
-    @skip_xp_backends(eager_only=True)
+    @pytest.mark.filterwarnings("ignore:divide by zero encountered:RuntimeWarning:dask")
+    @pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning:dask")
     @pytest.mark.parametrize('bad_value', (0, -1, np.inf, np.nan))
     def test_returns_nan_invalid_value(self, bad_value, xp):
         x = xp.asarray(self.array_1d + [bad_value])
-        if np.isfinite(bad_value):
+        if np.isfinite(bad_value) and not is_lazy_array(x):
             message = "The geometric standard deviation is only defined..."
             with pytest.warns(RuntimeWarning, match=message):
                 res = stats.gstd(x)
@@ -9488,6 +9543,11 @@ def test_nan_policy(self, xp):
         ref = xp.mean(x[~mask])
         xp_assert_close(res, ref)
 
+    @skip_xp_backends(eager_only=True)
+    def test_nan_policy_warns(self, xp):
+        x = xp.arange(10.)
+        x = xp.where(x == 3, xp.nan, x)
+
         # Check for warning if omitting NaNs causes empty slice
         message = 'After omitting NaNs...'
         with pytest.warns(RuntimeWarning, match=message):
@@ -9547,7 +9607,6 @@ def test_complex_gh22404(self, xp):
         xp_assert_close(res, xp.asarray(ref))
 
 
-@pytest.mark.skip_xp_backends('dask.array', reason=boolean_index_skip_reason)
 class TestXP_Var:
     @pytest.mark.parametrize('axis', [None, 1, -1, (-2, 2)])
     @pytest.mark.parametrize('keepdims', [False, True])
@@ -9596,6 +9655,11 @@ def test_nan_policy(self, xp):
         ref = xp.var(x[~mask])
         xp_assert_close(res, ref)
 
+    @skip_xp_backends(eager_only=True)
+    def test_nan_policy_warns(self, xp):
+        x = xp.arange(10.)
+        x = xp.where(x == 3, xp.nan, x)
+
         # Check for warning if omitting NaNs causes empty slice
         message = 'After omitting NaNs...'
         with pytest.warns(RuntimeWarning, match=message):
@@ -9627,17 +9691,18 @@ def test_empty(self, xp):
         ref = xp.asarray([])
         xp_assert_equal(res, ref)
 
+    @pytest.mark.filterwarnings(
+        "ignore:overflow encountered in reduce:RuntimeWarning"
+    )  # Overflow occurs for float32 input
     def test_dtype(self, xp):
         max = xp.finfo(xp.float32).max
         x_np = np.asarray([max, max/2], dtype=np.float32)
         x_xp = xp.asarray(x_np)
 
-        # Overflow occurs for float32 input
-        with np.errstate(over='ignore'):
-            res = _xp_var(x_xp)
-            ref = np.var(x_np)
-            np.testing.assert_equal(ref, np.inf)
-            xp_assert_close(res, xp.asarray(ref))
+        res = _xp_var(x_xp)
+        ref = np.var(x_np)
+        np.testing.assert_equal(ref, np.inf)
+        xp_assert_close(res, xp.asarray(ref))
 
         # correct result is returned if `float64` is used
         res = _xp_var(x_xp, dtype=xp.float64)
diff --git a/scipy/stats/tests/test_variation.py b/scipy/stats/tests/test_variation.py
index 2381a6b38e95..5fc906530a07 100644
--- a/scipy/stats/tests/test_variation.py
+++ b/scipy/stats/tests/test_variation.py
@@ -172,9 +172,9 @@ def test_neg_inf_nan(self, xp):
                       reason='`nan_policy` only supports NumPy backend')
     @pytest.mark.parametrize("nan_policy", ['propagate', 'omit'])
     def test_combined_edge_cases(self, nan_policy, xp):
-        x = xp.array([[0, 10, xp.nan, 1],
-                      [0, -5, xp.nan, 2],
-                      [0, -5, xp.nan, 3]])
+        x = xp.asarray([[0, 10, xp.nan, 1],
+                        [0, -5, xp.nan, 2],
+                        [0, -5, xp.nan, 3]])
         if nan_policy == 'omit':
             with pytest.warns(SmallSampleWarning, match=too_small_nd_omit):
                 y = variation(x, axis=0, nan_policy=nan_policy)
diff --git a/tools/generate_f2pymod.py b/tools/generate_f2pymod.py
index e61524cc39b5..aaedad2bd26f 100644
--- a/tools/generate_f2pymod.py
+++ b/tools/generate_f2pymod.py
@@ -9,6 +9,7 @@
 import os
 import re
 import subprocess
+import sys
 
 
 # START OF CODE VENDORED FROM `numpy.distutils.from_template`
@@ -266,6 +267,8 @@ def main():
     parser.add_argument("--free-threading",
                         action=argparse.BooleanOptionalAction,
                         help="Whether to add --free-threading-compatible")
+    parser.add_argument("--f2cmap", type=str,
+                        help="Path to the f2cmap file")
     args = parser.parse_args()
 
     if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')):
@@ -290,10 +293,13 @@ def main():
 
     # Now invoke f2py to generate the C API module file
     if args.infile.endswith(('.pyf.src', '.pyf')):
-        p = subprocess.Popen(
-            ['f2py', fname_pyf, '--build-dir', outdir_abs] + nogil_arg,
-            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.getcwd()
-        )
+        cmd = [sys.executable, '-m', 'numpy.f2py', fname_pyf,
+               '--build-dir', outdir_abs] + nogil_arg
+        if args.f2cmap:
+            cmd += ['--f2cmap', args.f2cmap]
+
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE, cwd=os.getcwd())
         out, err = p.communicate()
         if not (p.returncode == 0):
             raise RuntimeError(f"Processing {fname_pyf} with f2py failed!\n"