diff --git a/scipy/_build_utils/src/mdspan.h b/scipy/_build_utils/src/mdspan.h new file mode 100644 index 000000000000..4dce5c3805b1 --- /dev/null +++ b/scipy/_build_utils/src/mdspan.h @@ -0,0 +1,311 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace std { + +template +class extents; + +namespace detail { + + template + struct fill_extents { + using type = typename fill_extents::type; + }; + + template + struct fill_extents { + using type = extents; + }; + + template + using fill_extents_t = typename fill_extents::type; + +} // namespace detail + +inline constexpr size_t dynamic_extent = numeric_limits::max(); + +template +class extents { + static_assert(((Extents == dynamic_extent) && ... && true), "extents must all be dynamic"); + + public: + using index_type = Index; + using size_type = make_unsigned_t; + using rank_type = size_t; + + private: + array m_dexts; + + public: + constexpr extents() = default; + + template + constexpr explicit extents(OtherIndex... exts) : m_dexts{exts...} {} + + template + constexpr extents(const array &dexts) noexcept : m_dexts(dexts) {} + + constexpr index_type extent(rank_type i) const noexcept { return m_dexts[i]; } + + static constexpr rank_type rank() noexcept { return sizeof...(Extents); } +}; + +template +using dextents = detail::fill_extents_t; + +struct full_extent_t { + explicit full_extent_t() = default; +}; + +inline constexpr full_extent_t full_extent; + +template +struct strided_slice { + using offset_type = Offset; + using extent_type = Extent; + using stride_type = Stride; + + strided_slice() = default; + + strided_slice(offset_type offset, extent_type extent, stride_type stride) + : offset(offset), extent(extent), stride(stride) {} + + offset_type offset; + extent_type extent; + stride_type stride; +}; + +namespace detail { + + template + Index submdspan_extent(Index ext, strided_slice slice) { + return (slice.extent - slice.offset) / slice.stride; + } + + template + Index submdspan_extent(Index ext, std::tuple slice) { + return std::get<1>(slice) - std::get<0>(slice); + } + + template + Index submdspan_extent(Index ext, full_extent_t slice) { + return ext; + } + + template + auto submdspan_extents(std::index_sequence, const extents exts, Slices... slices) { + return extents{submdspan_extent(exts.extent(I), slices)...}; + } + + template + auto submdspan_extents(const extents exts, Slices... slices) { + return submdspan_extents(std::index_sequence_for(), exts, slices...); + } + +} // namespace detail + +template +auto submdspan_extents(const extents &exts, Slices... 
slices) { + return detail::submdspan_extents(exts, slices...); +} + +template +auto submdspan_mapping(const Mapping &, Slices...); + +struct layout_left; + +struct layout_right; + +struct layout_stride { + template + class mapping { + public: + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_stride; + + private: + extents_type m_exts; + array m_strides; + + public: + constexpr mapping() = default; + + constexpr mapping(const Extents &exts, const array &strides) + : m_exts(exts), m_strides(strides) {} + + constexpr const extents_type &extents() const noexcept { return m_exts; } + + constexpr const array &strides() const noexcept { return m_strides; } + + constexpr index_type extent(rank_type i) const noexcept { return m_exts.extent(i); } + + constexpr index_type stride(rank_type i) const noexcept { return m_strides[i]; } + + template + constexpr index_type operator()(Args... args) const noexcept { + static_assert(sizeof...(Args) == extents_type::rank(), "index must have same rank as extents"); + + index_type indices[extents_type::rank()] = {args...}; + index_type res = 0; + for (rank_type i = 0; i < extents_type::rank(); ++i) { + res += indices[i] * m_strides[i]; + } + + return res; + } + }; +}; + +namespace detail { + + template + Index submdspan_stride(Index stride, strided_slice slice) { + return stride * slice.stride; + } + + template + Index submdspan_stride(Index stride, std::tuple slice) { + return stride; + } + + template + Index submdspan_stride(Index stride, full_extent_t slice) { + return stride; + } + + template + auto submdspan_strides(std::index_sequence, const array strides, Slices... slices) { + array res{submdspan_stride(strides[I], slices)...}; + return res; + } + + template + auto submdspan_strides(const array strides, Slices... slices) { + return submdspan_strides(std::index_sequence_for(), strides, slices...); + } + +} // namespace detail + +template +auto submdspan_strides(const array &strides, Slices... slices) { + return detail::submdspan_strides(strides, slices...); +} + +template +auto submdspan_mapping(const layout_stride::mapping &map, Slices... 
slices) { + return layout_stride::mapping(submdspan_extents(map.extents(), slices...), + submdspan_strides(map.strides(), slices...)); +} + +template +class default_accessor { + public: + using offset_policy = default_accessor; + using element_type = Element; + using reference = Element &; + using data_handle_type = Element *; + + constexpr reference access(data_handle_type p, size_t i) const noexcept { return p[i]; } + + constexpr data_handle_type offset(data_handle_type p, size_t i) const noexcept { return p + i; } +}; + +template > +class mdspan { + public: + using extents_type = Extents; + using layout_type = LayoutPolicy; + using accessor_type = AccessorPolicy; + using mapping_type = typename LayoutPolicy::template mapping; + using element_type = T; + using value_type = remove_cv_t; + using index_type = typename Extents::index_type; + using size_type = typename Extents::size_type; + using rank_type = typename Extents::rank_type; + using data_handle_type = typename AccessorPolicy::data_handle_type; + using reference = typename AccessorPolicy::reference; + + private: + data_handle_type m_ptr; + mapping_type m_map; + accessor_type m_acc; + + public: + constexpr mdspan() = default; + + constexpr mdspan(data_handle_type p, const mapping_type &m) : m_ptr(p), m_map(m) {} + + constexpr mdspan(data_handle_type p, const mapping_type &m, const accessor_type &a) + : m_ptr(p), m_map(m), m_acc(a) {} + + template + constexpr reference operator()(OtherIndices... indices) const { + return m_acc.access(m_ptr, m_map(static_cast(std::move(indices))...)); + } + + template + constexpr reference operator[](OtherIndex index) const { + return m_acc.access(m_ptr, m_map(static_cast(index))); + } + + constexpr const data_handle_type &data_handle() const noexcept { return m_ptr; } + + constexpr const mapping_type &mapping() const noexcept { return m_map; } + + constexpr const accessor_type &accessor() const noexcept { return m_acc; } + + constexpr index_type stride(rank_type r) const { return m_map.stride(r); } + + constexpr const extents_type &extents() const noexcept { return m_map.extents(); } + + constexpr index_type extent(rank_type r) const noexcept { return m_map.extent(r); } + + constexpr size_type size() const noexcept { + size_type res = 1; + for (rank_type i = 0; i < extents_type::rank(); ++i) { + res *= m_map.extent(i); + } + + return res; + } +}; + +namespace detail { + + template + auto submdspan_offset(strided_slice slice) { + return slice.offset; + } + + template + auto submdspan_offset(std::tuple slice) { + return std::get<0>(slice); + } + + inline auto submdspan_offset(full_extent_t slice) { return 0; } + +} // namespace detail + +template +auto submdspan(const mdspan &src, SliceArgs... args) { + static_assert(Extents::rank() == sizeof...(SliceArgs), "number of slices must equal extents rank"); + + using submdspan_type = mdspan; + + auto src_map = src.mapping(); + auto src_acc = src.accessor(); + return submdspan_type(src_acc.offset(src.data_handle(), src_map(detail::submdspan_offset(args)...)), + submdspan_mapping(src.mapping(), args...), src_acc); +} + +} // namespace std diff --git a/scipy/_build_utils/src/mdspan_addl.h b/scipy/_build_utils/src/mdspan_addl.h new file mode 100644 index 000000000000..608502a7cfd0 --- /dev/null +++ b/scipy/_build_utils/src/mdspan_addl.h @@ -0,0 +1,74 @@ +#pragma once + +#include +#include "mdspan.h" + +namespace scipy { + +namespace detail { + +/* XXX + * 1. where to put it? scipy::detail namespace is a placeholder, really. + * 2. 
how to avoid duplicating the whole of the layout_stride? Tried + * following the suggestion of template struct BoundsCheckingLayoutPolicy + * but the syntax defeats me. + */ + + +struct bounds_checking_layout_stride { + template + class mapping { + public: + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = bounds_checking_layout_stride; + + private: + extents_type m_exts; + std::array m_strides; + + public: + constexpr mapping() = default; + + constexpr mapping(const Extents &exts, const std::array &strides) + : m_exts(exts), m_strides(strides) {} + + constexpr const extents_type &extents() const noexcept { return m_exts; } + + constexpr const std::array &strides() const noexcept { return m_strides; } + + constexpr index_type extent(rank_type i) const noexcept { return m_exts.extent(i); } + + constexpr index_type stride(rank_type i) const noexcept { return m_strides[i]; } + + template + constexpr index_type operator()(Args... args) const { + static_assert(sizeof...(Args) == extents_type::rank(), "index must have same rank as extents"); + + index_type indices[extents_type::rank()] = {args...}; + + // boundscheck + for (rank_type i = 0; i < extents_type::rank(); ++i) { + bool in_bounds = (0 <= indices[i]) && (indices[i] < m_exts.extent(i)); + if(!in_bounds){ + auto mesg = "OOB: index = " + std::to_string(indices[i]) + " of size = "; + mesg = mesg + std::to_string(m_exts.extent(i)) + " in dimension = " + std::to_string(i); + throw std::runtime_error(mesg); + } + } + + index_type res = 0; + for (rank_type i = 0; i < extents_type::rank(); ++i) { + res += indices[i] * m_strides[i]; + } + + return res; + } + }; +}; + +} // namespace detail + +} // namespace scipy diff --git a/scipy/interpolate/__init__.py b/scipy/interpolate/__init__.py index 9ea5ba9d88b9..000c3631d84a 100644 --- a/scipy/interpolate/__init__.py +++ b/scipy/interpolate/__init__.py @@ -80,6 +80,9 @@ make_interp_spline make_lsq_spline make_smoothing_spline + generate_knots + make_splrep + make_splprep Functional interface to FITPACK routines: @@ -181,6 +184,7 @@ from ._ndgriddata import * from ._bsplines import * +from ._fitpack_repro import generate_knots, make_splrep, make_splprep from ._pade import * diff --git a/scipy/interpolate/_bspl.pyx b/scipy/interpolate/_bspl.pyx index 04f2ca88fbd3..b25a4677b38e 100644 --- a/scipy/interpolate/_bspl.pyx +++ b/scipy/interpolate/_bspl.pyx @@ -13,8 +13,40 @@ from libc.math cimport NAN cnp.import_array() -cdef extern from "src/__fitpack.h": - void _deBoor_D(const double *t, double x, int k, int ell, int m, double *result) nogil +cdef extern from "src/__fitpack.h" namespace "fitpack": + void _deBoor_D(const double *t, double x, int k, int ell, int m, double *result + ) noexcept nogil + ssize_t _find_interval(const double* tptr, ssize_t len_t, + int k, + double xval, + ssize_t prev_l, + int extrapolate + ) noexcept nogil + void qr_reduce(double *aptr, const ssize_t m, const ssize_t nz, # a + ssize_t *offset, + const ssize_t nc, + double *yptr, const ssize_t ydim1, # y + const ssize_t startrow + ) except+ nogil + void data_matrix(const double *xptr, ssize_t m, + const double *tptr, ssize_t len_t, + int k, + const double *wptr, + double *Aptr, # outputs + ssize_t *offset_ptr, + Py_ssize_t *nc, + double *wrk + ) except+ nogil + void fpback(const double *Rptr, ssize_t m, ssize_t nz, + ssize_t nc, + const double *yptr, ssize_t ydim2, + 
double *cptr + ) except+ nogil + double fpknot(const double *x_ptr, ssize_t m, + const double *t_ptr, ssize_t len_t, + int k, + const double *residuals_ptr + ) except+ nogil ctypedef double complex double_complex @@ -63,30 +95,7 @@ cdef inline int find_interval(const double[::1] t, Suitable interval or -1 if xval was nan. """ - cdef: - int l - int n = t.shape[0] - k - 1 - double tb = t[k] - double te = t[n] - - if xval != xval: - # nan - return -1 - - if ((xval < tb) or (xval > te)) and not extrapolate: - return -1 - - l = prev_l if k < prev_l < n else k - - # xval is in support, search for interval s.t. t[interval] <= xval < t[l+1] - while(xval < t[l] and l != k): - l -= 1 - - l += 1 - while(xval >= t[l] and l != n): - l += 1 - - return l-1 + return _find_interval(&t[0], t.shape[0], k, xval, prev_l, extrapolate) @cython.wraparound(False) @@ -499,7 +508,8 @@ def _make_design_matrix(const double[::1] x, const double[::1] t, int k, bint extrapolate, - int32_or_int64[::1] indices): + int32_or_int64[::1] indices, + int nu=0): """ Returns a design matrix in CSR format. @@ -543,7 +553,7 @@ def _make_design_matrix(const double[::1] x, # extrapolate=False and out of bound values are already dealt with in # design_matrix ind = find_interval(t, k, xval, ind, extrapolate) - _deBoor_D(&t[0], xval, k, ind, 0, &work[0]) + _deBoor_D(&t[0], xval, k, ind, nu, &work[0]) # data[(k + 1) * i : (k + 1) * (i + 1)] = work[:k + 1] # indices[(k + 1) * i : (k + 1) * (i + 1)] = np.arange(ind - k, ind + 1) @@ -915,3 +925,79 @@ def _colloc_nd(double[:, ::1] xvals, tuple t not None, const npy_int32[::1] k): csr_data[j*volume + iflat] = factor return np.asarray(csr_data), np.asarray(csr_indices), csr_indptr + + +# --------------------------- +# wrappers for fitpack repro +# --------------------------- +def _qr_reduce(double[:, ::1] a, ssize_t[::1] offset, ssize_t nc, # A packed + double[:, ::1] y, + ssize_t startrow=1 +): + # (A, offset, nc) is a PackedMatrix instance, unpacked + qr_reduce(&a[0, 0], a.shape[0], a.shape[1], + &offset[0], + nc, + &y[0, 0], y.shape[1], + startrow) + + +def _data_matrix(const double[::1] x, + const double[::1] t, + int k, + const double[::1] w): + cdef: + ssize_t m = x.shape[0] + double[:, ::1] A = np.empty((m, k+1), dtype=float) + ssize_t[::1] offset = np.zeros(m, dtype=np.intp) + double[::1] wrk = np.empty(2*k+2, dtype=float) + ssize_t nc + + if w.shape[0] != x.shape[0]: + raise ValueError(f"{len(w) =} != {len(x) =}.") + + data_matrix(&x[0], m, + &t[0], t.shape[0], + k, + &w[0], + &A[0, 0], # output: (A, offset, nc) + &offset[0], + &nc, + &wrk[0], # work array + ) + return np.asarray(A), np.asarray(offset), int(nc) + + +def _fpback(const double[:, ::1] R, ssize_t nc, # (R, offset, nc) triangular => offset is range(nc) + const double[:, ::1] y +): + cdef: + ssize_t m = R.shape[0] + ssize_t nz = R.shape[1] + + if y.shape[0] != m: + raise ValueError(f"{y.shape = } != {m =}.") + if nc > m: + raise ValueError(f"{nc = } > {m = }.") + + cdef double[:, ::1] c = np.empty_like(y[:nc, :]) + + fpback(&R[0, 0], m, nz, + nc, + &y[0, 0], y.shape[1], + &c[0, 0]) + + return np.asarray(c) + + +def _fpknot(const double[::1] x, + const double[::1] t, + int k, + const double[::1] residuals): + if x.shape[0] != residuals.shape[0]: + raise ValueError(f"{len(x) = } != {len(residuals) =}") + + return fpknot(&x[0], x.shape[0], + &t[0], t.shape[0], + k, + &residuals[0]) diff --git a/scipy/interpolate/_bsplines.py b/scipy/interpolate/_bsplines.py index 8cc6eb5411ed..b3c58c06c16d 100644 --- 
a/scipy/interpolate/_bsplines.py +++ b/scipy/interpolate/_bsplines.py @@ -13,6 +13,7 @@ from scipy.special import poch from itertools import combinations + __all__ = ["BSpline", "make_interp_spline", "make_lsq_spline", "make_smoothing_spline"] @@ -336,7 +337,7 @@ def basis_element(cls, t, extrapolate=True): return cls.construct_fast(t, c, k, extrapolate) @classmethod - def design_matrix(cls, x, t, k, extrapolate=False): + def design_matrix(cls, x, t, k, extrapolate=False, nu=0): """ Returns a design matrix as a CSR format sparse array. @@ -455,7 +456,7 @@ def design_matrix(cls, x, t, k, extrapolate=False): # indptr is not passed to Cython as it is already fully computed data, indices = _bspl._make_design_matrix( - x, t, k, extrapolate, indices + x, t, k, extrapolate, indices, nu ) return csr_array( (data, indices, indptr), @@ -926,13 +927,22 @@ def insert_knot(self, x, m=1): def _not_a_knot(x, k): """Given data x, construct the knot vector w/ not-a-knot BC. - cf de Boor, XIII(12).""" + cf de Boor, XIII(12). + + For even k, it's a bit ad hoc: Greville sites + omit 2nd and 2nd-to-last + data points, a la not-a-knot. + This seems to match what Dierckx does, too: + https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L63-L80 + """ x = np.asarray(x) - if k % 2 != 1: - raise ValueError("Odd degree for now only. Got %s." % k) + if k % 2 == 1: + k2 = (k + 1) // 2 + t = x.copy() + else: + k2 = k // 2 + t = (x[1:] + x[:-1]) / 2 - m = (k - 1) // 2 - t = x[m+1:-m-1] + t = t[k2:-k2] t = np.r_[(x[0],)*(k+1), t, (x[-1],)*(k+1)] return t @@ -1417,13 +1427,6 @@ def make_interp_spline(x, y, k=3, t=None, bc_type=None, axis=0, if deriv_l is None and deriv_r is None: if bc_type == 'periodic': t = _periodic_knots(x, k) - elif k == 2: - # OK, it's a bit ad hoc: Greville sites + omit - # 2nd and 2nd-to-last points, a la not-a-knot - t = (x[1:] + x[:-1]) / 2. - t = np.r_[(x[0],)*(k+1), - t[1:-1], - (x[-1],)*(k+1)] else: t = _not_a_knot(x, k) else: @@ -1503,7 +1506,7 @@ def make_interp_spline(x, y, k=3, t=None, bc_type=None, axis=0, return BSpline.construct_fast(t, c, k, axis=axis) -def make_lsq_spline(x, y, t, k=3, w=None, axis=0, check_finite=True): +def make_lsq_spline(x, y, t, k=3, w=None, axis=0, check_finite=True, *, method="qr"): r"""Compute the (coefficients of) an LSQ (Least SQuared) based fitting B-spline. @@ -1541,6 +1544,11 @@ def make_lsq_spline(x, y, t, k=3, w=None, axis=0, check_finite=True): Disabling may give a performance gain, but may result in problems (crashes, non-termination) if the inputs do contain infinities or NaNs. Default is True. + method : str, optional + Method for solving the linear LSQ problem. Allowed values are "norm-eq" + (Explicitly construct and solve the normal system of equations), and + "qr" (Use the QR factorization of the design matrix). + Default is "qr". 
Returns ------- @@ -1623,46 +1631,79 @@ def make_lsq_spline(x, y, t, k=3, w=None, axis=0, check_finite=True): y = np.moveaxis(y, axis, 0) # now internally interp axis is zero - if x.ndim != 1 or np.any(x[1:] - x[:-1] <= 0): - raise ValueError("Expect x to be a 1-D sorted array_like.") + if x.ndim != 1: + raise ValueError("Expect x to be a 1-D sequence.") if x.shape[0] < k+1: raise ValueError("Need more x points.") if k < 0: raise ValueError("Expect non-negative k.") if t.ndim != 1 or np.any(t[1:] - t[:-1] < 0): - raise ValueError("Expect t to be a 1-D sorted array_like.") + raise ValueError("Expect t to be a 1D strictly increasing sequence.") if x.size != y.shape[0]: raise ValueError(f'Shapes of x {x.shape} and y {y.shape} are incompatible') if k > 0 and np.any((x < t[k]) | (x > t[-k])): raise ValueError('Out of bounds w/ x = %s.' % x) if x.size != w.size: raise ValueError(f'Shapes of x {x.shape} and w {w.shape} are incompatible') + if method == "norm-eq" and np.any(x[1:] - x[:-1] <= 0): + raise ValueError("Expect x to be a 1D strictly increasing sequence.") + if method == "qr" and any(x[1:] - x[:-1] < 0): + raise ValueError("Expect x to be a 1D non-decreasing sequence.") # number of coefficients n = t.size - k - 1 - # construct A.T @ A and rhs with A the collocation matrix, and - # rhs = A.T @ y for solving the LSQ problem ``A.T @ A @ c = A.T @ y`` - lower = True + # multiple r.h.s extradim = prod(y.shape[1:]) - ab = np.zeros((k+1, n), dtype=np.float64, order='F') - rhs = np.zeros((n, extradim), dtype=y.dtype, order='F') - _bspl._norm_eq_lsq(x, t, k, - y.reshape(-1, extradim), - w, - ab, rhs) - rhs = rhs.reshape((n,) + y.shape[1:]) - - # have observation matrix & rhs, can solve the LSQ problem - cho_decomp = cholesky_banded(ab, overwrite_ab=True, lower=lower, - check_finite=check_finite) - c = cho_solve_banded((cho_decomp, lower), rhs, overwrite_b=True, - check_finite=check_finite) + yy = y.reshape(-1, extradim) + + if method == "norm-eq": + # construct A.T @ A and rhs with A the collocation matrix, and + # rhs = A.T @ y for solving the LSQ problem ``A.T @ A @ c = A.T @ y`` + lower = True + ab = np.zeros((k+1, n), dtype=np.float64, order='F') + rhs = np.zeros((n, extradim), dtype=y.dtype, order='F') + _bspl._norm_eq_lsq(x, t, k, + yy, + w, + ab, rhs) + # have observation matrix & rhs, can solve the LSQ problem + cho_decomp = cholesky_banded(ab, overwrite_ab=True, lower=lower, + check_finite=check_finite) + c = cho_solve_banded((cho_decomp, lower), rhs, overwrite_b=True, + check_finite=check_finite) + elif method == "qr": + _, _, c = _lsq_solve_qr(x, yy, t, k, w) + else: + raise ValueError(f"Unknown {method =}.") + # restore the shape of `c` for both single and multiple r.h.s. + c = c.reshape((n,) + y.shape[1:]) c = np.ascontiguousarray(c) return BSpline.construct_fast(t, c, k, axis=axis) +###################### +# LSQ spline helpers # +###################### + +def _lsq_solve_qr(x, y, t, k, w): + """Solve for the LSQ spline coeffs given x, y and knots. + + `y` is always 2D: for 1D data, the shape is ``(m, 1)``. + `w` is always 1D: one weight value per `x` value. 
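+
+    Conceptually this computes the same coefficients as the dense, unpacked
+    problem sketched below (an illustration only, not what this helper actually
+    calls; it uses `BSpline.design_matrix` and `numpy.linalg.lstsq` just to
+    spell out the equivalent dense computation)::
+
+        A_dense = BSpline.design_matrix(x, t, k).toarray() * w[:, None]
+        c, *_ = np.linalg.lstsq(A_dense, y * w[:, None], rcond=None)
+
+    The actual implementation builds the banded design matrix in packed form
+    (`_data_matrix`), triangularizes it with Givens rotations (`_qr_reduce`),
+    and back-substitutes (`_fpback`).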
+ + """ + assert y.ndim == 2 + + y_w = y * w[:, None] + A, offset, nc = _bspl._data_matrix(x, t, k, w) + _bspl._qr_reduce(A, offset, nc, y_w) # modifies arguments in-place + c = _bspl._fpback(A, nc, y_w) + + return A, y_w, c + + ############################# # Smoothing spline helpers # ############################# diff --git a/scipy/interpolate/_fitpack_repro.py b/scipy/interpolate/_fitpack_repro.py new file mode 100644 index 000000000000..88accc272aae --- /dev/null +++ b/scipy/interpolate/_fitpack_repro.py @@ -0,0 +1,988 @@ +""" Replicate FITPACK's logic for constructing smoothing spline functions and curves. + + Currently provides analogs of splrep and splprep python routines, i.e. + curfit.f and parcur.f routines (the drivers are fpcurf.f and fppara.f, respectively) + + The Fortran sources are from + https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/ + + .. [1] P. Dierckx, "Algorithms for smoothing data with periodic and + parametric splines, Computer Graphics and Image Processing", + 20 (1982) 171-184. + :doi:`10.1016/0146-664X(82)90043-0`. + .. [2] P. Dierckx, "Curve and surface fitting with splines", Monographs on + Numerical Analysis, Oxford University Press, 1993. + .. [3] P. Dierckx, "An algorithm for smoothing, differentiation and integration + of experimental data using spline functions", + Journal of Computational and Applied Mathematics, vol. I, no 3, p. 165 (1975). + https://doi.org/10.1016/0771-050X(75)90034-0 +""" +import warnings +import operator +import numpy as np + +from scipy.interpolate._bsplines import ( + _not_a_knot, make_interp_spline, BSpline, fpcheck, _lsq_solve_qr +) +from scipy.interpolate._bspl import _qr_reduce, _fpback, _fpknot + +# cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc +# c part 1: determination of the number of knots and their position c +# c ************************************************************** c +# +# https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L31 + + +# Hardcoded in curfit.f +TOL = 0.001 +MAXIT = 20 + + +def _get_residuals(x, y, t, k, w): + # FITPACK has (w*(spl(x)-y))**2; make_lsq_spline has w*(spl(x)-y)**2 + w2 = w**2 + + # inline the relevant part of + # >>> spl = make_lsq_spline(x, y, w=w2, t=t, k=k) + # NB: + # 1. y is assumed to be 2D here. For 1D case (parametric=False), + # the call must have been preceded by y = y[:, None] (cf _validate_inputs) + # 2. We always sum the squares across axis=1: + # * For 1D (parametric=False), the last dimension has size one, + # so the summation is a no-op. + # * For 2D (parametric=True), the summation is actually how the + # 'residuals' are defined, see Eq. (42) in Dierckx1982 + # (the reference is in the docstring of `class F`) below. + _, _, c = _lsq_solve_qr(x, y, t, k, w) + c = np.ascontiguousarray(c) + spl = BSpline(t, c, k) + return _compute_residuals(w2, spl(x), y) + + +def _compute_residuals(w2, splx, y): + delta = ((splx - y)**2).sum(axis=1) + return w2 * delta + + +def add_knot(x, t, k, residuals): + """Add a new knot. + + (Approximately) replicate FITPACK's logic: + 1. split the `x` array into knot intervals, ``t(j+k) <= x(i) <= t(j+k+1)`` + 2. find the interval with the maximum sum of residuals + 3. insert a new knot into the middle of that interval. + + NB: a new knot is in fact an `x` value at the middle of the interval. + So *the knots are a subset of `x`*. 
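+
+    A rough pure-NumPy sketch of steps 1--3 above (an illustration only; the
+    real work, including FITPACK's exact definition of the "middle" and its
+    tie-breaking, is done by the compiled `_fpknot` routine)::
+
+        bins = np.searchsorted(t[k+1:-k-1], x, side='right')     # knot interval of each x
+        worst = np.argmax(np.bincount(bins, weights=residuals))  # largest residual sum
+        xs = x[bins == worst]
+        new_knot = xs[len(xs) // 2]                               # a data site near the middle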
+ + This routine is an analog of + https://github.com/scipy/scipy/blob/v1.11.4/scipy/interpolate/fitpack/fpcurf.f#L190-L215 + (cf _split function) + + and https://github.com/scipy/scipy/blob/v1.11.4/scipy/interpolate/fitpack/fpknot.f + """ + new_knot = _fpknot(x, t, k, residuals) + + idx_t = np.searchsorted(t, new_knot) + t_new = np.r_[t[:idx_t], new_knot, t[idx_t:]] + return t_new + + +def _validate_inputs(x, y, w, k, s, xb, xe, parametric): + """Common input validations for generate_knots and make_splrep. + """ + x = np.asarray(x, dtype=float) + y = np.asarray(y, dtype=float) + + if w is None: + w = np.ones_like(x, dtype=float) + else: + w = np.asarray(w, dtype=float) + if w.ndim != 1: + raise ValueError(f"{w.ndim = } not implemented yet.") + if (w < 0).any(): + raise ValueError("Weights must be non-negative") + + if y.ndim == 0 or y.ndim > 2: + raise ValueError(f"{y.ndim = } not supported (must be 1 or 2.)") + + parametric = bool(parametric) + if parametric: + if y.ndim != 2: + raise ValueError(f"{y.ndim = } != 2 not supported with {parametric =}.") + else: + if y.ndim != 1: + raise ValueError(f"{y.ndim = } != 1 not supported with {parametric =}.") + # all _impl functions expect y.ndim = 2 + y = y[:, None] + + if w.shape[0] != x.shape[0]: + raise ValueError(f"Weights is incompatible: {w.shape =} != {x.shape}.") + + if x.shape[0] != y.shape[0]: + raise ValueError(f"Data is incompatible: {x.shape = } and {y.shape = }.") + if x.ndim != 1 or (x[1:] < x[:-1]).any(): + raise ValueError("Expect `x` to be an ordered 1D sequence.") + + k = operator.index(k) + + if s < 0: + raise ValueError(f"`s` must be non-negative. Got {s = }") + + if xb is None: + xb = min(x) + if xe is None: + xe = max(x) + + return x, y, w, k, s, xb, xe + + +def generate_knots(x, y, *, w=None, xb=None, xe=None, k=3, s=0, nest=None): + """Replicate FITPACK's constructing the knot vector. + + Parameters + ---------- + x, y : array_like + The data points defining the curve ``y = f(x)``. + w : array_like, optional + Weights. + xb : float, optional + The boundary of the approximation interval. If None (default), + is set to ``x[0]``. + xe : float, optional + The boundary of the approximation interval. If None (default), + is set to ``x[-1]``. + k : int, optional + The spline degree. Default is cubic, ``k = 3``. + s : float, optional + The smoothing factor. Default is ``s = 0``. + nest : int, optional + Stop when at least this many knots are placed. + + Yields + ------ + t : ndarray + Knot vectors with an increasing number of knots. + The generator is finite: it stops when the smoothing critetion is + satisfied, or when then number of knots exceeds the maximum value: + the user-provided `nest` or `x.size + k + 1` --- which is the knot vector + for the interpolating spline. + + Examples + -------- + Generate some noisy data and fit a sequence of LSQ splines: + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from scipy.interpolate import make_lsq_spline, generate_knots + >>> rng = np.random.default_rng(12345) + >>> x = np.linspace(-3, 3, 50) + >>> y = np.exp(-x**2) + 0.1 * rng.standard_normal(size=50) + + >>> knots = list(generate_knots(x, y, s=1e-10)) + >>> for t in knots[::3]: + ... spl = make_lsq_spline(x, y, t) + ... xs = xs = np.linspace(-3, 3, 201) + ... 
plt.plot(xs, spl(xs), '-', label=f'n = {len(t)}', lw=3, alpha=0.7) + >>> plt.plot(x, y, 'o', label='data') + >>> plt.plot(xs, np.exp(-xs**2), '--') + >>> plt.legend() + + Note that increasing the number of knots make the result follow the data + more and more closely. + + Also note that a step of the generator may add multiple knots: + + >>> [len(t) for t in knots] + [8, 9, 10, 12, 16, 24, 40, 48, 52] + + Notes + ----- + The routine generates successive knots vectors of increasing length, starting + from ``2*(k+1)`` to ``len(x) + k + 1``, trying to make knots more dense + in the regions where the deviation of the LSQ spline from data is large. + + When the maximum number of knots, ``len(x) + k + 1`` is reached + (this happens when ``s`` is small and ``nest`` is large), the generator + stops, and the last output is the knots for the interpolation with the + not-a-knot boundary condition. + + Knots are located at data sites, unless ``k`` is even and the number of knots + is ``len(x) + k + 1``. In that case, the last output of the generator + has internal knots at Greville sites, ``(x[1:] + x[:-1]) / 2``. + + .. versionadded:: 1.14.0 + + """ + if s == 0: + if nest is not None or w is not None: + raise ValueError("s == 0 is interpolation only") + t = _not_a_knot(x, k) + yield t + return + + x, y, w, k, s, xb, xe = _validate_inputs( + x, y, w, k, s, xb, xe, parametric=np.ndim(y) == 2 + ) + + yield from _generate_knots_impl(x, y, w=w, xb=xb, xe=xe, k=k, s=s, nest=nest) + + +def _generate_knots_impl(x, y, *, w=None, xb=None, xe=None, k=3, s=0, nest=None): + + acc = s * TOL + m = x.size # the number of data points + + if nest is None: + # the max number of knots. This is set in _fitpack_impl.py line 274 + # and fitpack.pyf line 198 + nest = max(m + k + 1, 2*k + 3) + else: + if nest < 2*(k + 1): + raise ValueError(f"`nest` too small: {nest = } < 2*(k+1) = {2*(k+1)}.") + + nmin = 2*(k + 1) # the number of knots for an LSQ polynomial approximation + nmax = m + k + 1 # the number of knots for the spline interpolation + + # start from no internal knots + t = np.asarray([xb]*(k+1) + [xe]*(k+1), dtype=float) + n = t.shape[0] + fp = 0.0 + fpold = 0.0 + + # c main loop for the different sets of knots. m is a safe upper bound + # c for the number of trials. + for _ in range(m): + yield t + + # construct the LSQ spline with this set of knots + fpold = fp + residuals = _get_residuals(x, y, t, k, w=w) + fp = residuals.sum() + fpms = fp - s + + # c test whether the approximation sinf(x) is an acceptable solution. + # c if f(p=inf) < s accept the choice of knots. + if (abs(fpms) < acc) or (fpms < 0): + return + + # ### c increase the number of knots. ### + + # c determine the number of knots nplus we are going to add. + if n == nmin: + # the first iteration + nplus = 1 + else: + delta = fpold - fp + npl1 = int(nplus * fpms / delta) if delta > acc else nplus*2 + nplus = min(nplus*2, max(npl1, nplus//2, 1)) + + # actually add knots + for j in range(nplus): + t = add_knot(x, t, k, residuals) + + # check if we have enough knots already + + n = t.shape[0] + # c if n = nmax, sinf(x) is an interpolating spline. + # c if n=nmax we locate the knots as for interpolation. + if n >= nmax: + t = _not_a_knot(x, k) + yield t + return + + # c if n=nest we cannot increase the number of knots because of + # c the storage capacity limitation. 
+ if n >= nest: + yield t + return + + # recompute if needed + if j < nplus - 1: + residuals = _get_residuals(x, y, t, k, w=w) + + # this should never be reached + return + + +# cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc +# c part 2: determination of the smoothing spline sp(x). c +# c *************************************************** c +# c we have determined the number of knots and their position. c +# c we now compute the b-spline coefficients of the smoothing spline c +# c sp(x). the observation matrix a is extended by the rows of matrix c +# c b expressing that the kth derivative discontinuities of sp(x) at c +# c the interior knots t(k+2),...t(n-k-1) must be zero. the corres- c +# c ponding weights of these additional rows are set to 1/p. c +# c iteratively we then have to determine the value of p such that c +# c f(p)=sum((w(i)*(y(i)-sp(x(i))))**2) be = s. we already know that c +# c the least-squares kth degree polynomial corresponds to p=0, and c +# c that the least-squares spline corresponds to p=infinity. the c +# c iteration process which is proposed here, makes use of rational c +# c interpolation. since f(p) is a convex and strictly decreasing c +# c function of p, it can be approximated by a rational function c +# c r(p) = (u*p+v)/(p+w). three values of p(p1,p2,p3) with correspond- c +# c ing values of f(p) (f1=f(p1)-s,f2=f(p2)-s,f3=f(p3)-s) are used c +# c to calculate the new value of p such that r(p)=s. convergence is c +# c guaranteed by taking f1>0 and f3<0. c +# cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc + + +def prodd(t, i, j, k): + res = 1.0 + for s in range(k+2): + if i + s != j: + res *= (t[j] - t[i+s]) + return res + + +def disc(t, k): + """Discontinuity matrix: jumps of k-th derivatives of b-splines at internal knots. + + See Eqs. (9)-(10) of Ref. [1], or, equivalently, Eq. (3.43) of Ref. [2]. + + This routine assumes internal knots are all simple (have multiplicity =1). + + Parameters + ---------- + t : ndarray, 1D, shape(n,) + Knots. + k : int + The spline degree + + Returns + ------- + disc : ndarray, shape(n-2*k-1, k+2) + The jumps of the k-th derivatives of b-splines at internal knots, + ``t[k+1], ...., t[n-k-1]``. + + Notes + ----- + + The normalization here follows FITPACK: + (https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpdisc.f#L36) + + The k-th derivative jumps are multiplied by a factor:: + + (delta / nrint)**k / k! + + where ``delta`` is the length of the interval spanned by internal knots, and + ``nrint`` is one less the number of internal knots (i.e., the number of + subintervals between them). + + References + ---------- + .. [1] Paul Dierckx, Algorithms for smoothing data with periodic and parametric + splines, Computer Graphics and Image Processing, vol. 20, p. 171 (1982). + :doi:`10.1016/0146-664X(82)90043-0` + + .. 
[2] Tom Lyche and Knut Morken, Spline methods, + http://www.uio.no/studier/emner/matnat/ifi/INF-MAT5340/v05/undervisningsmateriale/ + + """ + n = t.shape[0] + + # the length of the base interval spanned by internal knots & the number + # of subintervas between these internal knots + delta = t[n - k - 1] - t[k] + nrint = n - 2*k - 1 + + matr = np.empty((nrint - 1, k + 2), dtype=float) + for jj in range(nrint - 1): + j = jj + k + 1 + for ii in range(k + 2): + i = jj + ii + matr[jj, ii] = (t[i + k + 1] - t[i]) / prodd(t, i, j, k) + # NB: equivalent to + # row = [(t[i + k + 1] - t[i]) / prodd(t, i, j, k) for i in range(j-k-1, j+1)] + # assert (matr[j-k-1, :] == row).all() + + # follow FITPACK + matr *= (delta/ nrint)**k + + # make it packed + offset = np.array([i for i in range(nrint-1)]) + nc = n - k - 1 + return matr, offset, nc + + +class F: + """ The r.h.s. of ``f(p) = s``. + + Given scalar `p`, we solve the system of equations in the LSQ sense: + + | A | @ | c | = | y | + | B / p | | 0 | | 0 | + + where `A` is the matrix of b-splines and `b` is the discontinuity matrix + (the jumps of the k-th derivatives of b-spline basis elements at knots). + + Since we do that repeatedly while minimizing over `p`, we QR-factorize + `A` only once and update the QR factorization only of the `B` rows of the + augmented matrix |A, B/p|. + + The system of equations is Eq. (15) Ref. [1]_, the strategy and implementation + follows that of FITPACK, see specific links below. + + References + ---------- + [1] P. Dierckx, Algorithms for Smoothing Data with Periodic and Parametric Splines, + COMPUTER GRAPHICS AND IMAGE PROCESSING vol. 20, pp 171-184 (1982.) + https://doi.org/10.1016/0146-664X(82)90043-0 + + """ + def __init__(self, x, y, t, k, s, w=None, *, R=None, Y=None): + self.x = x + self.y = y + self.t = t + self.k = k + w = np.ones_like(x, dtype=float) if w is None else w + if w.ndim != 1: + raise ValueError(f"{w.ndim = } != 1.") + self.w = w + self.s = s + + if y.ndim != 2: + raise ValueError(f"F: expected y.ndim == 2, got {y.ndim = } instead.") + + # ### precompute what we can ### + + # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L250 + # c evaluate the discontinuity jump of the kth derivative of the + # c b-splines at the knots t(l),l=k+2,...n-k-1 and store in b. + b, b_offset, b_nc = disc(t, k) + + # the QR factorization of the data matrix, if not provided + # NB: otherwise, must be consistent with x,y & s, but this is not checked + if R is None and Y is None: + R, Y, _ = _lsq_solve_qr(x, y, t, k, w) + + # prepare to combine R and the discontinuity matrix (AB); also r.h.s. (YY) + # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L269 + # c the rows of matrix b with weight 1/p are rotated into the + # c triangularised observation matrix a which is stored in g. + nc = t.shape[0] - k - 1 + nz = k + 1 + if R.shape[1] != nz: + raise ValueError(f"Internal error: {R.shape[1] =} != {k+1 =}.") + + # r.h.s. of the augmented system + z = np.zeros((b.shape[0], Y.shape[1]), dtype=float) + self.YY = np.r_[Y[:nc], z] + + # l.h.s. 
of the augmented system + AA = np.zeros((nc + b.shape[0], self.k+2), dtype=float) + AA[:nc, :nz] = R[:nc, :] + # AA[nc:, :] = b.a / p # done in __call__(self, p) + self.AA = AA + self.offset = np.r_[np.arange(nc, dtype=np.intp), b_offset] + + self.nc = nc + self.b = b + + def __call__(self, p): + # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L279 + # c the row of matrix b is rotated into triangle by givens transformation + + # copy the precomputed matrices over for in-place work + # R = PackedMatrix(self.AB.a.copy(), self.AB.offset.copy(), nc) + AB = self.AA.copy() + offset = self.offset.copy() + nc = self.nc + + AB[nc:, :] = self.b / p + QY = self.YY.copy() + + # heavy lifting happens here, in-place + _qr_reduce(AB, offset, nc, QY, startrow=nc) + + # solve for the coefficients + c = _fpback(AB, nc, QY) + + spl = BSpline(self.t, c, self.k) + residuals = _compute_residuals(self.w**2, spl(self.x), self.y) + fp = residuals.sum() + + self.spl = spl # store it + + return fp - self.s + + +def fprati(p1, f1, p2, f2, p3, f3): + """The root of r(p) = (u*p + v) / (p + w) given three points and values, + (p1, f2), (p2, f2) and (p3, f3). + + The FITPACK analog adjusts the bounds, and we do not + https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fprati.f + + NB: FITPACK uses p < 0 to encode p=infinity. We just use the infinity itself. + Since the bracket is ``p1 <= p2 <= p3``, ``p3`` can be infinite (in fact, + this is what the minimizer starts with, ``p3=inf``). + """ + h1 = f1 * (f2 - f3) + h2 = f2 * (f3 - f1) + h3 = f3 * (f1 - f2) + if p3 == np.inf: + return -(p2*h1 + p1*h2) / h3 + return -(p1*p2*h3 + p2*p3*h1 + p1*p3*h2) / (p1*h1 + p2*h2 + p3*h3) + + +class Bunch: + def __init__(self, **kwargs): + self.__dict__.update(**kwargs) + + +_iermesg = { +2: """error. a theoretically impossible result was found during +the iteration process for finding a smoothing spline with +fp = s. probably causes : s too small. +there is an approximation returned but the corresponding +weighted sum of squared residuals does not satisfy the +condition abs(fp-s)/s < tol. +""", +3: """error. the maximal number of iterations maxit (set to 20 +by the program) allowed for finding a smoothing spline +with fp=s has been reached. probably causes : s too small +there is an approximation returned but the corresponding +weighted sum of squared residuals does not satisfy the +condition abs(fp-s)/s < tol. +""" +} + + +def root_rati(f, p0, bracket, acc): + """Solve `f(p) = 0` using a rational function approximation. + + In a nutshell, since the function f(p) is known to be monotonically decreasing, we + - maintain the bracket (p1, f1), (p2, f2) and (p3, f3) + - at each iteration step, approximate f(p) by a rational function + r(p) = (u*p + v) / (p + w) + and make a step to p_new to the root of f(p): r(p_new) = 0. + The coefficients u, v and w are found from the bracket values p1..3 and f1...3 + + The algorithm and implementation follows + https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L229 + and + https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fppara.f#L290 + + Note that the latter is for parametric splines and the former is for 1D spline + functions. The minimization is indentical though [modulo a summation over the + dimensions in the computation of f(p)], so we reuse the minimizer for both + d=1 and d>1. 
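+
+    A quick numerical check of the update step (an illustration; `fprati` above
+    is the actual implementation): for any decreasing r(p) = (u*p + v)/(p + w),
+    three bracket points determine u, v and w, so `fprati` recovers the root
+    -v/u of r(p) exactly (up to rounding)::
+
+        u, v, w = -1.0, 3.0, 0.5
+        r = lambda p: (u*p + v) / (p + w)
+        p1, p2, p3 = 1.0, 2.0, 4.0        # r(p1) > 0 > r(p3): a valid bracket
+        p_new = fprati(p1, r(p1), p2, r(p2), p3, r(p3))
+        # p_new == 3.0 == -v/u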
+ """ + # Magic values from + # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L27 + con1 = 0.1 + con9 = 0.9 + con4 = 0.04 + + # bracketing flags (follow FITPACK) + # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fppara.f#L365 + ich1, ich3 = 0, 0 + + (p1, f1), (p3, f3) = bracket + p = p0 + + for it in range(MAXIT): + p2, f2 = p, f(p) + + # c test whether the approximation sp(x) is an acceptable solution. + if abs(f2) < acc: + ier, converged = 0, True + break + + # c carry out one more step of the iteration process. + if ich3 == 0: + if f2 - f3 <= acc: + # c our initial choice of p is too large. + p3 = p2 + f3 = f2 + p = p*con4 + if p <= p1: + p = p1*con9 + p2*con1 + continue + else: + if f2 < 0: + ich3 = 1 + + if ich1 == 0: + if f1 - f2 <= acc: + # c our initial choice of p is too small + p1 = p2 + f1 = f2 + p = p/con4 + if p3 != np.inf and p <= p3: + p = p2*con1 + p3*con9 + continue + else: + if f2 > 0: + ich1 = 1 + + # c test whether the iteration process proceeds as theoretically expected. + # [f(p) should be monotonically decreasing] + if f1 <= f2 or f2 <= f3: + ier, converged = 2, False + break + + # actually make the iteration step + p = fprati(p1, f1, p2, f2, p3, f3) + + # c adjust the value of p1,f1,p3 and f3 such that f1 > 0 and f3 < 0. + if f2 < 0: + p3, f3 = p2, f2 + else: + p1, f1 = p2, f2 + + else: + # not converged in MAXIT iterations + ier, converged = 3, False + + if ier != 0: + warnings.warn(RuntimeWarning(_iermesg[ier]), stacklevel=2) + + return Bunch(converged=converged, root=p, iterations=it, ier=ier) + + +def _make_splrep_impl(x, y, *, w=None, xb=None, xe=None, k=3, s=0, t=None, nest=None): + """Shared infra for make_splrep and make_splprep. + """ + acc = s * TOL + m = x.size # the number of data points + + if nest is None: + # the max number of knots. This is set in _fitpack_impl.py line 274 + # and fitpack.pyf line 198 + nest = max(m + k + 1, 2*k + 3) + else: + if nest < 2*(k + 1): + raise ValueError(f"`nest` too small: {nest = } < 2*(k+1) = {2*(k+1)}.") + if t is not None: + raise ValueError("Either supply `t` or `nest`.") + + if t is None: + gen = _generate_knots_impl(x, y, w=w, k=k, s=s, xb=xb, xe=xe, nest=nest) + t = list(gen)[-1] + else: + fpcheck(x, t, k) + + if t.shape[0] == 2 * (k + 1): + # nothing to optimize + _, _, c = _lsq_solve_qr(x, y, t, k, w) + return BSpline(t, c, k) + + ### solve ### + + # c initial value for p. + # https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L253 + R, Y, _ = _lsq_solve_qr(x, y, t, k, w) + nc = t.shape[0] -k -1 + p = nc / R[:, 0].sum() + + # ### bespoke solver #### + # initial conditions + # f(p=inf) : LSQ spline with knots t (XXX: reuse R, c) + residuals = _get_residuals(x, y, t, k, w=w) + fp = residuals.sum() + fpinf = fp - s + + # f(p=0): LSQ spline without internal knots + residuals = _get_residuals(x, y, np.array([xb]*(k+1) + [xe]*(k+1)), k, w) + fp0 = residuals.sum() + fp0 = fp0 - s + + # solve + bracket = (0, fp0), (np.inf, fpinf) + f = F(x, y, t, k=k, s=s, w=w, R=R, Y=Y) + _ = root_rati(f, p, bracket, acc) + + # solve ALTERNATIVE: is roughly equivalent, gives slightly different results + # starting from scratch, that would have probably been tolerable; + # backwards compatibility dictates that we replicate the FITPACK minimizer though. 
+ # f = F(x, y, t, k=k, s=s, w=w, R=R, Y=Y) + # from scipy.optimize import root_scalar + # res_ = root_scalar(f, x0=p, rtol=acc) + # assert res_.converged + + # f.spl is the spline corresponding to the found `p` value + return f.spl + + +def make_splrep(x, y, *, w=None, xb=None, xe=None, k=3, s=0, t=None, nest=None): + r"""Find the B-spline representation of a 1D function. + + Given the set of data points ``(x[i], y[i])``, determine a smooth spline + approximation of degree ``k`` on the interval ``xb <= x <= xe``. + + Parameters + ---------- + x, y : array_like, shape (m,) + The data points defining a curve ``y = f(x)``. + w : array_like, shape (m,), optional + Strictly positive 1D array of weights, of the same length as `x` and `y`. + The weights are used in computing the weighted least-squares spline + fit. If the errors in the y values have standard-deviation given by the + vector ``d``, then `w` should be ``1/d``. + Default is ``np.ones(m)``. + xb, xe : float, optional + The interval to fit. If None, these default to ``x[0]`` and ``x[-1]``, + respectively. + k : int, optional + The degree of the spline fit. It is recommended to use cubic splines, + ``k=3``, which is the default. Even values of `k` should be avoided, + especially with small `s` values. + s : float, optional + The smoothing condition. The amount of smoothness is determined by + satisfying the conditions:: + + sum((w * (g(x) - y))**2 ) <= s + + where ``g(x)`` is the smoothed fit to ``(x, y)``. The user can use `s` + to control the tradeoff between closeness to data and smoothness of fit. + Larger `s` means more smoothing while smaller values of `s` indicate less + smoothing. + Recommended values of `s` depend on the weights, `w`. If the weights + represent the inverse of the standard deviation of `y`, then a good `s` + value should be found in the range ``(m-sqrt(2*m), m+sqrt(2*m))`` where + ``m`` is the number of datapoints in `x`, `y`, and `w`. + Default is ``s = 0.0``, i.e. interpolation. + t : array_like, optional + The spline knots. If None (default), the knots will be constructed + automatically. + There must be at least ``2*k + 2`` and at most ``m + k + 1`` knots. + nest : int, optional + The target length of the knot vector. Should be between ``2*(k + 1)`` + (the minimum number of knots for a degree-``k`` spline), and + ``m + k + 1`` (the number of knots of the interpolating spline). + The actual number of knots returned by this routine may be slightly + larger than `nest`. + Default is None (no limit, add up to ``m + k + 1`` knots). + + Returns + ------- + spl : a `BSpline` instance + For `s=0`, ``spl(x) == y``. + For non-zero values of `s` the `spl` represents the smoothed approximation + to `(x, y)`, generally with fewer knots. + + See Also + -------- + generate_knots : is used under the hood for generating the knots + make_splprep : the analog of this routine for parametric curves + make_interp_spline : construct an interpolating spline (``s = 0``) + make_lsq_spline : construct the least-squares spline given the knot vector + splrep : a FITPACK analog of this routine + + References + ---------- + .. [1] P. Dierckx, "Algorithms for smoothing data with periodic and + parametric splines, Computer Graphics and Image Processing", + 20 (1982) 171-184. + .. [2] P. Dierckx, "Curve and surface fitting with splines", Monographs on + Numerical Analysis, Oxford University Press, 1993. 
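+
+    Examples
+    --------
+    A usage sketch (illustrative only): fit noisy data with and without
+    smoothing and compare the number of knots:
+
+    >>> import numpy as np
+    >>> from scipy.interpolate import make_splrep
+    >>> rng = np.random.default_rng(12345)
+    >>> x = np.linspace(-3, 3, 50)
+    >>> y = np.exp(-x**2) + 0.1 * rng.standard_normal(size=50)
+    >>> spl = make_splrep(x, y, s=0.5)    # smoothed fit: few knots
+    >>> spl_i = make_splrep(x, y, s=0)    # interpolation: len(t) == len(x) + k + 1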
+ + Notes + ----- + This routine constructs the smoothing spline function, :math:`g(x)`, to + minimize the sum of jumps, :math:`D_j`, of the ``k``-th derivative at the + internal knots (:math:`x_b < t_i < x_e`), where + + .. math:: + + D_i = g^{(k)}(t_i + 0) - g^{(k)}(t_i - 0) + + Specifically, the routine constructs the spline function :math:`g(x)` which + minimizes + + .. math:: + + \sum_i | D_i |^2 \to \mathrm{min} + + provided that + + .. math:: + + \sum_{j=1}^m (w_j \times (g(x_j) - y_j))^2 \leqslant s , + + where :math:`s > 0` is the input parameter. + + In other words, we balance maximizing the smoothness (measured as the jumps + of the derivative, the first criterion), and the deviation of :math:`g(x_j)` + from the data :math:`y_j` (the second criterion). + + Note that the summation in the second criterion is over all data points, + and in the first criterion it is over the internal spline knots (i.e. + those with ``xb < t[i] < xe``). The spline knots are in general a subset + of data, see `generate_knots` for details. + + Also note the difference of this routine to `make_lsq_spline`: the latter + routine does not consider smoothness and simply solves a least-squares + problem + + .. math:: + + \sum w_j \times (g(x_j) - y_j)^2 \to \mathrm{min} + + for a spline function :math:`g(x)` with a _fixed_ knot vector ``t``. + + .. versionadded:: 1.14.0 + """ + if s == 0: + if t is not None or w is not None or nest is not None: + raise ValueError("s==0 is for interpolation only") + return make_interp_spline(x, y, k=k) + + x, y, w, k, s, xb, xe = _validate_inputs(x, y, w, k, s, xb, xe, parametric=False) + + spl = _make_splrep_impl(x, y, w=w, xb=xb, xe=xe, k=k, s=s, t=t, nest=nest) + + # postprocess: squeeze out the last dimension: was added to simplify the internals. + spl.c = spl.c[:, 0] + return spl + + +def make_splprep(x, *, w=None, u=None, ub=None, ue=None, k=3, s=0, t=None, nest=None): + r""" + Find a smoothed B-spline representation of a parametric N-D curve. + + Given a list of N 1D arrays, `x`, which represent a curve in + N-dimensional space parametrized by `u`, find a smooth approximating + spline curve ``g(u)``. + + Parameters + ---------- + x : array_like, shape (m, ndim) + Sampled data points representing the curve in ``ndim`` dimensions. + The typical use is a list of 1D arrays, each of length ``m``. + w : array_like, shape(m,), optional + Strictly positive 1D array of weights. + The weights are used in computing the weighted least-squares spline + fit. If the errors in the `x` values have standard deviation given by + the vector d, then `w` should be 1/d. Default is ``np.ones(m)``. + u : array_like, optional + An array of parameter values for the curve in the parametric form. + If not given, these values are calculated automatically, according to:: + + v[0] = 0 + v[i] = v[i-1] + distance(x[i], x[i-1]) + u[i] = v[i] / v[-1] + + ub, ue : float, optional + The end-points of the parameters interval. Default to ``u[0]`` and ``u[-1]``. + k : int, optional + Degree of the spline. Cubic splines, ``k=3``, are recommended. + Even values of `k` should be avoided especially with a small ``s`` value. + Default is ``k=3`` + s : float, optional + A smoothing condition. The amount of smoothness is determined by + satisfying the conditions:: + + sum((w * (g(u) - x))**2) <= s, + + where ``g(u)`` is the smoothed approximation to ``x``. The user can + use `s` to control the trade-off between closeness and smoothness + of fit. 
Larger ``s`` means more smoothing while smaller values of ``s`` + indicate less smoothing. + Recommended values of ``s`` depend on the weights, ``w``. If the weights + represent the inverse of the standard deviation of ``x``, then a good + ``s`` value should be found in the range ``(m - sqrt(2*m), m + sqrt(2*m))``, + where ``m`` is the number of data points in ``x`` and ``w``. + t : array_like, optional + The spline knots. If None (default), the knots will be constructed + automatically. + There must be at least ``2*k + 2`` and at most ``m + k + 1`` knots. + nest : int, optional + The target length of the knot vector. Should be between ``2*(k + 1)`` + (the minimum number of knots for a degree-``k`` spline), and + ``m + k + 1`` (the number of knots of the interpolating spline). + The actual number of knots returned by this routine may be slightly + larger than `nest`. + Default is None (no limit, add up to ``m + k + 1`` knots). + + Returns + ------- + spl : a `BSpline` instance + For `s=0`, ``spl(u) == x``. + For non-zero values of ``s``, `spl` represents the smoothed approximation + to ``x``, generally with fewer knots. + u : ndarray + The values of the parameters + + See Also + -------- + generate_knots : is used under the hood for generating the knots + make_splrep : the analog of this routine 1D functions + make_interp_spline : construct an interpolating spline (``s = 0``) + make_lsq_spline : construct the least-squares spline given the knot vector + splprep : a FITPACK analog of this routine + + Notes + ----- + Given a set of :math:`m` data points in :math:`D` dimensions, :math:`\vec{x}_j`, + with :math:`j=1, ..., m` and :math:`\vec{x}_j = (x_{j; 1}, ..., x_{j; D})`, + this routine constructs the parametric spline curve :math:`g_a(u)` with + :math:`a=1, ..., D`, to minimize the sum of jumps, :math:`D_{i; a}`, of the + ``k``-th derivative at the internal knots (:math:`u_b < t_i < u_e`), where + + .. math:: + + D_{i; a} = g_a^{(k)}(t_i + 0) - g_a^{(k)}(t_i - 0) + + Specifically, the routine constructs the spline function :math:`g(u)` which + minimizes + + .. math:: + + \sum_i \sum_{a=1}^D | D_{i; a} |^2 \to \mathrm{min} + + provided that + + .. math:: + + \sum_{j=1}^m \sum_{a=1}^D (w_j \times (g_a(u_j) - x_{j; a}))^2 \leqslant s + + where :math:`u_j` is the value of the parameter corresponding to the data point + :math:`(x_{j; 1}, ..., x_{j; D})`, and :math:`s > 0` is the input parameter. + + In other words, we balance maximizing the smoothness (measured as the jumps + of the derivative, the first criterion), and the deviation of :math:`g(u_j)` + from the data :math:`x_j` (the second criterion). + + Note that the summation in the second criterion is over all data points, + and in the first criterion it is over the internal spline knots (i.e. + those with ``ub < t[i] < ue``). The spline knots are in general a subset + of data, see `generate_knots` for details. + + .. versionadded:: 1.14.0 + + References + ---------- + .. [1] P. Dierckx, "Algorithms for smoothing data with periodic and + parametric splines, Computer Graphics and Image Processing", + 20 (1982) 171-184. + .. [2] P. Dierckx, "Curve and surface fitting with splines", Monographs on + Numerical Analysis, Oxford University Press, 1993. 
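+
+    Examples
+    --------
+    A usage sketch (illustrative only): smooth a noisy planar curve given as a
+    list of two 1D arrays and evaluate the fit on a fine parameter grid:
+
+    >>> import numpy as np
+    >>> from scipy.interpolate import make_splprep
+    >>> rng = np.random.default_rng(12345)
+    >>> phi = np.linspace(0, 2*np.pi, 40)
+    >>> x = np.cos(phi) + 1e-2 * rng.standard_normal(size=40)
+    >>> y = np.sin(phi) + 1e-2 * rng.standard_normal(size=40)
+    >>> spl, u = make_splprep([x, y], s=0.1)
+    >>> curve = spl(np.linspace(0, 1, 201))   # shape (2, 201): one row per dimension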
+ """ + x = np.stack(x, axis=1) + + # construct the default parametrization of the curve + if u is None: + dp = (x[1:, :] - x[:-1, :])**2 + u = np.sqrt((dp).sum(axis=1)).cumsum() + u = np.r_[0, u / u[-1]] + + if s == 0: + if t is not None or w is not None or nest is not None: + raise ValueError("s==0 is for interpolation only") + return make_interp_spline(u, x.T, k=k, axis=1), u + + u, x, w, k, s, ub, ue = _validate_inputs(u, x, w, k, s, ub, ue, parametric=True) + + spl = _make_splrep_impl(u, x, w=w, xb=ub, xe=ue, k=k, s=s, t=t, nest=nest) + + # posprocess: `axis=1` so that spl(u).shape == np.shape(x) + # when `x` is a list of 1D arrays (cf original splPrep) + cc = spl.c.T + spl1 = BSpline(spl.t, cc, spl.k, axis=1) + + return spl1, u + diff --git a/scipy/interpolate/_ndbspline.py b/scipy/interpolate/_ndbspline.py index 826dddb311d7..c196c4aa28d8 100644 --- a/scipy/interpolate/_ndbspline.py +++ b/scipy/interpolate/_ndbspline.py @@ -5,7 +5,7 @@ from math import prod -from . import _bspl # type: ignore +from . import _bspl import scipy.sparse.linalg as ssl from scipy.sparse import csr_array diff --git a/scipy/interpolate/meson.build b/scipy/interpolate/meson.build index 43cc02102244..2ddad6f0bc5b 100644 --- a/scipy/interpolate/meson.build +++ b/scipy/interpolate/meson.build @@ -121,11 +121,21 @@ py3.extension_module('_rgi_cython', subdir: 'scipy/interpolate' ) +__fitpack_lib = static_library('__fitpack', + ['src/__fitpack.h', 'src/__fitpack.cc'], + dependencies:[lapack] + # XXX: static_library or (dynamic) library(...)? If dynamic, how to install +) + +__fitpack_dep = declare_dependency( + link_with: __fitpack_lib, +) + py3.extension_module('_bspl', - cython_gen.process('_bspl.pyx'), - c_args: cython_c_args, + cython_gen_cpp.process('_bspl.pyx'), + cpp_args: cython_cpp_args, include_directories: 'src/', - dependencies: np_dep, + dependencies: [lapack, np_dep, __fitpack_dep], link_args: version_link_args, install: true, subdir: 'scipy/interpolate' @@ -185,6 +195,7 @@ py3.install_sources([ '_fitpack_impl.py', '_fitpack_py.py', '_fitpack2.py', + '_fitpack_repro.py', '_interpolate.py', '_ndgriddata.py', '_pade.py', diff --git a/scipy/interpolate/src/__fitpack.cc b/scipy/interpolate/src/__fitpack.cc new file mode 100644 index 000000000000..d2dd36b10fda --- /dev/null +++ b/scipy/interpolate/src/__fitpack.cc @@ -0,0 +1,435 @@ +#include +#include "__fitpack.h" + +namespace fitpack{ + +/* + * B-spline evaluation routine. + */ +void +_deBoor_D(const double *t, double x, int k, int ell, int m, double *result) { + /* + * On completion the result array stores + * the k+1 non-zero values of beta^(m)_i,k(x): for i=ell, ell-1, ell-2, ell-k. + * Where t[ell] <= x < t[ell+1]. + */ + /* + * Implements a recursive algorithm similar to the original algorithm of + * deBoor. + */ + double *hh = result + k + 1; + double *h = result; + double xb, xa, w; + int ind, j, n; + + /* + * Perform k-m "standard" deBoor iterations + * so that h contains the k+1 non-zero values of beta_{ell,k-m}(x) + * needed to calculate the remaining derivatives. 
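+ *
+ * (Workspace note: `h` aliases `result` and `hh` aliases `result + k + 1`,
+ * so the caller must provide room for 2*(k+1) doubles.)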
+ */ + result[0] = 1.0; + for (j = 1; j <= k - m; j++) { + memcpy(hh, h, j*sizeof(double)); + h[0] = 0.0; + for (n = 1; n <= j; n++) { + ind = ell + n; + xb = t[ind]; + xa = t[ind - j]; + if (xb == xa) { + h[n] = 0.0; + continue; + } + w = hh[n - 1]/(xb - xa); + h[n - 1] += w*(xb - x); + h[n] = w*(x - xa); + } + } + + /* + * Now do m "derivative" recursions + * to convert the values of beta into the mth derivative + */ + for (j = k - m + 1; j <= k; j++) { + memcpy(hh, h, j*sizeof(double)); + h[0] = 0.0; + for (n = 1; n <= j; n++) { + ind = ell + n; + xb = t[ind]; + xa = t[ind - j]; + if (xb == xa) { + h[m] = 0.0; + continue; + } + w = j*hh[n - 1]/(xb - xa); + h[n - 1] -= w; + h[n] = w; + } + } +} + + +/* + * Find an interval such that t[interval] <= xval < t[interval+1]. + */ +ssize_t +_find_interval(const double* tptr, ssize_t len_t, + int k, + double xval, + ssize_t prev_l, + int extrapolate) +{ + auto t = wrap_1D(tptr, len_t); + + ssize_t n = t.extent(0) - k - 1; + + // NB: t.extent(8) does not fail? Gives the last extent (=dimension) + //std::cout<< "t.size() = " << t.size() << " t.extent(0) = "<< t.extent(0) << " t.extent(8) = " << t.extent(8) << "\n"; + + double tb = t[k]; + double te = t[n]; + + if (xval != xval) { + // nan + return -1; + } + + if (((xval < tb) || (xval > te)) && !extrapolate) { + return -1; + } + ssize_t l = (k < prev_l) && (prev_l < n) ? prev_l : k; + + // xval is in support, search for interval s.t. t[interval] <= xval < t[l+1] + while ((xval < t[l]) && (l != k)) { + l -= 1; + } + + l += 1; + while ((xval >= t[l]) && (l != n)) { + l += 1; + } + + return l-1; +} + + +/* + * Fill the (m, k+1) matrix of non-zero b-splines. + * + * A row gives b-splines which are non-zero at the corresponding value of `x`. + * Also for each row store the `offset`: with full matrices, the non-zero elements + * in row `i` start at `offset[i]`. IOW, + * A_full[i, offset[i]: offset[i] + k + 1] = A_packed[i, :]. + * + * What we construct here is `A_packed` and `offset` arrays. + * + * We also take into account possible weights for each `x` value: they + * multiply the rows of the data matrix. + * + * To reconstruct the full dense matrix, `A_full`, we would need to know the + * number of its columns, `nc`. So we return it, too. + */ +void +data_matrix( /* inputs */ + const double *xptr, ssize_t m, // x, shape (m,) + const double *tptr, ssize_t len_t, // t, shape (len_t,) + int k, + const double *wptr, // weights, shape (m,) // NB: len(w) == len(x), not checked + /* outputs */ + double *Aptr, // A, shape(m, k+1) + ssize_t *offset_ptr, // offset, shape (m,) + ssize_t *nc, // the number of coefficients + /* work array*/ + double *wrk) // work, shape (2k+2) +{ + auto x = wrap_1D(xptr, m); + auto t = wrap_1D(tptr, len_t); + auto w = wrap_1D(wptr, m); + auto offset = wrap_1D(offset_ptr, m); + auto A = wrap_2D(Aptr, m, k+1); + + ssize_t ind = k; + for (int i=0; i < m; ++i) { + double xval = x[i]; + + // find the interval + ind = _find_interval(t.data_handle(), len_t, k, xval, ind, 0); + if (ind < 0){ + // should not happen here, validation is expected on the python side + throw std::runtime_error("find_interval: out of bounds with x = " + std::to_string(xval)); + } + offset[i] = ind - k; + + // compute non-zero b-splines + _deBoor_D(t.data_handle(), xval, k, ind, 0, wrk); + + for (ssize_t j=0; j < k+1; ++j) { + A(i, j) = wrk[j] * w[i]; + } + } + + *nc = len_t - k - 1; +} + + +/* + * Solve the LSQ problem ||y - A@c||^2 via QR factorization. 
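The packed storage produced by `data_matrix` above is easy to picture from Python: row `i` of the dense design matrix carries the `k+1` B-splines that are non-zero at `x[i]`, starting at column `offset[i]`. A small sketch of the `A_full[i, offset[i]: offset[i] + k + 1] = A_packed[i, :]` relation (array values are illustrative; a fuller version lives in the test-suite `PackedMatrix` helper further below):

    import numpy as np

    def unpack(A_packed, offset, nc):
        """Reconstruct the dense (m, nc) matrix from its packed form."""
        m, nz = A_packed.shape
        A_full = np.zeros((m, nc))
        for i in range(m):
            A_full[i, offset[i]:offset[i] + nz] = A_packed[i]
        return A_full

    A_packed = np.array([[1., 2.], [3., 4.], [5., 6.]])   # m=3 rows, nz=2 non-zeros each
    offset = np.array([0, 1, 2])
    print(unpack(A_packed, offset, nc=4))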
+ * + QR factorization follows FITPACK: we reduce A row-by-row by Givens rotations. + To zero out the lower triangle, we rotate the element in row `i` and column `j < i` + against the diagonal element in that column. That way, the sequence is + (here `[x]` are the pair of elements to Givens-rotate) + + [x] x x x x x x x x x x x x x x x x x x x + [x] x x x -> 0 [x] x x -> 0 [x] x x -> 0 x x x -> 0 x x x + 0 x x x 0 [x] x x 0 0 x x 0 0 [x] x 0 0 x x + 0 x x x 0 x x x 0 [x] x x 0 0 [x] x 0 0 0 x + + The matrix A has a special structure: each row has at most (k+1) non-zeros, so + is encoded as a PackedMatrix instance. + + On exit, the return matrix, also of shape (m, k+1), contains + elements of the upper triangular matrix `R[i, i: i + k + 1]`. + When we process the element (i, j), we store the rotated row in R[i, :], + and *shift it to the left*, so that the diagonal element is always in the + zero-th place. This way, the process above becomes + + [x] x x x x x x x x x x x x x x x x x x x + [x] x x x -> [x] x x - -> [x] x x - -> x x x - -> x x x - + x x x - [x] x x - x x - - [x] x - - x x - - + x x x - x x x - [x] x x - [x] x - - x - - - + + The most confusing part is that when rotating the row `i` with a row `j` + above it, the offsets differ: for the upper row `j`, `R[j, 0]` is the diagonal + element, while for the row `i`, `R[i, 0]` is the element being annihilated. + + NB. This row-by-row Givens reduction process follows FITPACK: + https://github.com/scipy/scipy/blob/maintenance/1.11.x/scipy/interpolate/fitpack/fpcurf.f#L112-L161 + A possibly more efficient way could be to note that all data points which + lie between two knots all have the same offset: if `t[i] < x_1 .... x_s < t[i+1]`, + the `s-1` corresponding rows form an `(s-1, k+1)`-sized "block". + Then a blocked QR implementation could look like + https://people.sc.fsu.edu/~jburkardt/f77_src/band_qr/band_qr.f + The `startrow` optional argument accounts for a two-step factorization: + the preceding rows are assumed to be already processed and are skipped. + This covers the scenario where we append new rows to an already + triangularized matrix. + + This routine MODIFIES `a` & `y` in-place. + + */ +void +qr_reduce(double *aptr, const ssize_t m, const ssize_t nz, // a(m, nz), packed + ssize_t *offset, // offset(m) + const ssize_t nc, // dense would be a(m, nc) + double *yptr, const ssize_t ydim1, // y(m, ydim1) + const ssize_t startrow +) +{ + auto R = wrap_2D(aptr, m, nz); + auto y = wrap_2D(yptr, m, ydim1); + + for (ssize_t i=startrow; i < m; ++i) { + ssize_t oi = offset[i]; + for (ssize_t j=oi; j < nc; ++j) { + + // rotate only the lower diagonal + if (j >= std::min(i, nc)) { + break; + } + + // in dense format: diag a1[j, j] vs a1[i, j] + double c, s, r; + DLARTG(&R(j, 0), &R(i, 0), &c, &s, &r); + + // rotate l.h.s. + R(j, 0) = r; + for (ssize_t l=1; l < R.extent(1); ++l) { + std::tie(R(j, l), R(i, l-1)) = fprota(c, s, R(j, l), R(i, l)); + } + R(i, R.extent(1) - 1) = 0.0; + + // rotate r.h.s. + for (ssize_t l=0; l < y.extent(1); ++l) { + std::tie(y(j, l), y(i, l)) = fprota(c, s, y(j, l), y(i, l)); + } + } + if (i < nc) { + offset[i] = i; + } + + } // for(i = ... +} + + +/* + * Back substitution solve of `R @ c = y` with an upper triangular R. + * + * R is in the 'packed' format: + * 1. Each row has at most `nz` non-zero elements, stored in a (m, nz) array + * 2. All non-zeros are consecutive.
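Each elementary step of `qr_reduce` above is one Givens rotation, computed by LAPACK's `dlartg` and applied via `fprota`. A tiny Python check of that building block, using the same LAPACK routine through `scipy.linalg.lapack` (the numbers are arbitrary):

    import numpy as np
    from scipy.linalg.lapack import dlartg

    f, g = 3.0, 4.0          # R(j, 0), the diagonal, and R(i, 0), the element to annihilate
    c, s, r = dlartg(f, g)

    # the rotation maps [f, g] onto [r, 0], exactly as fprota does element-wise
    rot = np.array([[c, s], [-s, c]])
    print(rot @ np.array([f, g]))     # [5., 0.]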
+ * Since R is upper triangular, non-zeros in row `i` start at `i`. IOW, + * the 'offset' array is `np.arange(nc)`. + * + * If `R` were dense, it would have had shape `(m, nc)`. Since R is upper triangular, + * the first `nc` rows contain non-zeros; the last `m-nc` rows are all zeros + * (in fact, they may contain whatever, and are not referenced in the routine). + * + * `y` array is always two-dimensional, and has shape `(m, ydim2)`. + * IOW, if the original data is 1D, `y.shape == (m, 1)`. + * + * The output `c` array has the shape `(nc, ydim2)`. + */ +void +fpback( /* inputs*/ + const double *Rptr, ssize_t m, ssize_t nz, // R(m, nz), packed + ssize_t nc, // dense R would be (m, nc) + const double *yptr, ssize_t ydim2, // y(m, ydim2) + /* output */ + double *cptr) // c(nc, ydim2) +{ + auto R = wrap_2D(Rptr, m, nz); + auto y = wrap_2D(yptr, m, ydim2); + auto c = wrap_2D(cptr, nc, ydim2); + + // c[nc-1, ...] = y[nc-1] / R[nc-1, 0] + for (ssize_t l=0; l < ydim2; ++l) { + c(nc - 1, l) = y(nc - 1, l) / R(nc-1, 0); + } + + //for i in range(nc-2, -1, -1): + // nel = min(nz, nc-i) + // c[i, ...] = ( y[i] - (R[i, 1:nel, None] * c[i+1:i+nel, ...]).sum(axis=0) ) / R[i, 0] + for (ssize_t i=nc-2; i >= 0; --i) { + ssize_t nel = std::min(nz, nc - i); + for (ssize_t l=0; l < ydim2; ++l){ + double ssum = y(i, l); + for (ssize_t j=1; j < nel; ++j) { + ssum -= R(i, j) * c(i + j, l); + } + ssum /= R(i, 0); + c(i, l) = ssum; + } + } +} + + +/* + * A helper for _fpknot: + * + * Split the `x` array into knot "runs" and sum the residuals per "run". + * + * Here a "run" is a set of `x` values which lie between consecutive knots: + * these are `x(i)` which for a given `j` satisfy `t(j+k) <= x(i) <= t(j+k+1)`. + * + * The _split routine returns two vectors: a vector of indices into `x`, `ix`, + * and a vector of partial sums of residuals, `fparts`. + * The i-th entry is the i-th "run". IOW the pair (fparts, ix) means that + * `fparts[i]` is the sum of residuals over the "run" x[ix[i]] <= xvalue <= x[ix[i+1]]. + * + * This routine is a (best-effort) translation of + * https://github.com/scipy/scipy/blob/v1.11.4/scipy/interpolate/fitpack/fpcurf.f#L190-L215 + * + */ +pair_t +_split(array_1D_t x, + array_1D_t t, + int k, + array_1D_t residuals) +{ + /* + * c search for knot interval t(number+k) <= x <= t(number+k+1) where + * c fpint(number) is maximal on the condition that nrdata(number) + * c not equals zero. + */ + ssize_t interval = k+1; + ssize_t nc = t.extent(0) - k - 1; + + std::vector ix; + ix.push_back(0.0); + + std::vector fparts; + double fpart = 0.0; + + for(ssize_t i=0; i < x.extent(0); i++) { + double xv = x(i); + double rv = residuals(i); + fpart += rv; + + if ((xv >= t(interval)) && (interval < nc)) { + // end of the current interval: split the weight at xv by 1/2 + // between two intervals + double carry = rv / 2.0; + fpart -= carry; + fparts.push_back(fpart); + + fpart = carry; + interval++; + + ix.push_back(i); + } + } // for i + + // the last interval + ix.push_back(x.extent(0) - 1); + fparts.push_back(fpart); + + return std::make_tuple(fparts, ix); +} + + +/* + * Find a position for a new knot, a la FITPACK. + * + * (Approximately) replicate FITPACK's logic: + * 1. split the `x` array into knot intervals, ``t(j+k) <= x(i) <= t(j+k+1)`` + * 2. find the interval with the maximum sum of residuals + * 3. insert a new knot into the middle of that interval. + * + * NB: a new knot is in fact an `x` value at the middle of the interval. + * So *the knots are a subset of `x`*. 
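A rough pure-Python illustration of that knot-placement rule (a simplified sketch, not the library code: it skips the half-residual carry at internal knots and the non-empty-interval check that the real routine performs; a fuller replica appears in the tests further below):

    import numpy as np

    def new_knot(x, t, k, residuals):
        """Place a new knot at the data point mid-way through the interval
        with the largest sum of residuals (a sketch of FITPACK's fpknot)."""
        ix = np.searchsorted(x, t[k:-k])         # boundaries of the knot "runs" in x
        fparts = [residuals[ix[i]:ix[i+1]].sum() for i in range(len(ix) - 1)]
        worst = int(np.argmax(fparts))
        return x[(ix[worst] + ix[worst + 1]) // 2]

    x = np.linspace(0., 7., 8)
    t = np.array([0.]*4 + [7.]*4)                # k=3, no internal knots yet
    print(new_knot(x, t, 3, (x - 3.5)**2))       # a data point near the middle, 3.0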
+ * + * This routine is an analog of + * https://github.com/scipy/scipy/blob/v1.11.4/scipy/interpolate/fitpack/fpcurf.f#L190-L215 + * (cf _split function) + * + * and https://github.com/scipy/scipy/blob/v1.11.4/scipy/interpolate/fitpack/fpknot.f + */ +double +fpknot(const double *x_ptr, ssize_t m, + const double *t_ptr, ssize_t len_t, + int k, + const double *residuals_ptr) +{ + auto x = wrap_1D(x_ptr, m); + auto t = wrap_1D(t_ptr, len_t); + auto residuals = wrap_1D(residuals_ptr, m); + + std::vector fparts; + std::vector ix; + std::tie(fparts, ix) = _split(x, t, k, residuals); + + ssize_t idx_max = -101; + double fpart_max = -1e100; + for (size_t i=0; i < fparts.size(); i++) { + bool is_better = (ix[i+1] - ix[i] > 1) && (fparts[i] > fpart_max); + if(is_better) { + idx_max = i; + fpart_max = fparts[i]; + } + } + + if (idx_max == -101) { + throw std::runtime_error("Internal error. Please report it to SciPy developers."); + } + + // round up, like Dierckx does? This is really arbitrary though. + ssize_t idx_newknot = (ix[idx_max] + ix[idx_max+1] + 1) / 2; + + return x(idx_newknot); +} + +} // namespace fitpack diff --git a/scipy/interpolate/src/__fitpack.h b/scipy/interpolate/src/__fitpack.h index 97ac44a95015..b30dfb77dd3c 100644 --- a/scipy/interpolate/src/__fitpack.h +++ b/scipy/interpolate/src/__fitpack.h @@ -1,64 +1,158 @@ +#pragma once +#include +#include +#include +#include +#include "../_build_utils/src/fortran_defs.h" +#include "../_build_utils/src/mdspan_addl.h" + +#define DLARTG F_FUNC(dlartg, DLARTG) + + +namespace fitpack { + + /* - * B-spline evaluation routine. + * 1D and 2D array wrappers, with and without boundschecking + */ + +constexpr bool BOUNDS_CHECK = true; // XXX: flip to false in the end. + +using LayoutType = std::conditional::type; + + +template +using array_1D_t = std::mdspan, LayoutType>; + +template +using array_2D_t = std::mdspan, LayoutType>; + + +template inline array_1D_t +wrap_1D(T *ptr, ssize_t sz) { + std::array exts = {sz}; + std::array strides = {1}; + return array_1D_t(ptr, {exts, strides}); +} + +template inline array_2D_t +wrap_2D(T *ptr, ssize_t sz1, ssize_t sz2) { + std::array exts = {sz1, sz2}; + std::array strides = {sz2, 1}; + return array_2D_t(ptr, {exts, strides}); +} + + +/* + * LAPACK Givens rotation */ +extern "C" { +void DLARTG(double *f, double *g, double *cs, double *sn, double *r); +} -static inline void -_deBoor_D(const double *t, double x, int k, int ell, int m, double *result) { - /* - * On completion the result array stores - * the k+1 non-zero values of beta^(m)_i,k(x): for i=ell, ell-1, ell-2, ell-k. - * Where t[ell] <= x < t[ell+1]. - */ - /* - * Implements a recursive algorithm similar to the original algorithm of - * deBoor. - */ - double *hh = result + k + 1; - double *h = result; - double xb, xa, w; - int ind, j, n; - - /* - * Perform k-m "standard" deBoor iterations - * so that h contains the k+1 non-zero values of beta_{ell,k-m}(x) - * needed to calculate the remaining derivatives. 
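The `_deBoor_D` evaluator (the header-only version being removed here, now moved into `__fitpack.cc`) only ever produces the `k+1` B-spline values that can be non-zero at a given `x`; that is exactly what makes the packed `(m, k+1)` design-matrix storage possible. A quick NumPy check of this property (the knot vector and evaluation point are made up):

    import numpy as np
    from scipy.interpolate import BSpline

    k = 3
    t = np.array([0.]*(k+1) + [1., 2., 3.] + [4.]*(k+1))
    xval = np.array([1.7])

    row = BSpline.design_matrix(xval, t, k).toarray()[0]   # all basis values at xval
    print(np.count_nonzero(row))                           # k + 1 == 4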
- */ - result[0] = 1.0; - for (j = 1; j <= k - m; j++) { - memcpy(hh, h, j*sizeof(double)); - h[0] = 0.0; - for (n = 1; n <= j; n++) { - ind = ell + n; - xb = t[ind]; - xa = t[ind - j]; - if (xb == xa) { - h[n] = 0.0; - continue; - } - w = hh[n - 1]/(xb - xa); - h[n - 1] += w*(xb - x); - h[n] = w*(x - xa); - } - } - - /* - * Now do m "derivative" recursions - * to convert the values of beta into the mth derivative - */ - for (j = k - m + 1; j <= k; j++) { - memcpy(hh, h, j*sizeof(double)); - h[0] = 0.0; - for (n = 1; n <= j; n++) { - ind = ell + n; - xb = t[ind]; - xa = t[ind - j]; - if (xb == xa) { - h[m] = 0.0; - continue; - } - w = j*hh[n - 1]/(xb - xa); - h[n - 1] -= w; - h[n] = w; - } - } + +/* + * Apply a Givens transform. + */ +template +inline +std::tuple +fprota(T c, T s, T a, T b) +{ + return std::make_tuple( + c*a + s*b, + -s*a + c*b + ); } + + +/* + * B-spline evaluation routine. + */ +void +_deBoor_D(const double *t, double x, int k, int ell, int m, double *result); + + +/* + * Find an interval such that t[interval] <= xval < t[interval+1]. + */ +ssize_t +_find_interval(const double* tptr, ssize_t len_t, + int k, + double xval, + ssize_t prev_l, + int extrapolate); + + + +/* + * Fill the (m, k+1) matrix of non-zero b-splines. + */ +void +data_matrix(/* inputs */ + const double *xptr, ssize_t m, // x, shape (m,) + const double *tptr, ssize_t len_t, // t, shape (len_t,) + int k, + const double *wptr, // weights, shape (m,) // NB: len(w) == len(x), not checked + /* outputs */ + double *Aptr, // A, shape(m, k+1) + ssize_t *offset_ptr, // offset, shape (m,) + ssize_t *nc, // the number of coefficient + /* work array*/ + double *wrk // work, shape (2k+2) +); + + +/* + Solve the LSQ problem ||y - A@c||^2 via QR factorization. + This routine MODIFIES `a` & `y` in-place. +*/ +void +qr_reduce(double *aptr, const ssize_t m, const ssize_t nz, // a(m, nz), packed + ssize_t *offset, // offset(m) + const ssize_t nc, // dense would be a(m, nc) + double *yptr, const ssize_t ydim1, // y(m, ydim2) + const ssize_t startrow=1 +); + + +/* + * Back substitution solve of `R @ c = y` with an upper triangular R. + */ +void +fpback( /* inputs*/ + const double *Rptr, ssize_t m, ssize_t nz, // R(m, nz), packed + ssize_t nc, // dense R would be (m, nc) + const double *yptr, ssize_t ydim2, // y(m, ydim2) + /* output */ + double *cptr // c(nc, ydim2) +); + + +/* + * A helper for _fpknot: + * Split the `x` array into knot "runs" and sum the residuals per "run". 
+ */ +typedef std::tuple, std::vector> pair_t; + +pair_t +_split(array_1D_t x, + array_1D_t t, + int k, + array_1D_t residuals); + + +/* + * Find a position for a new knot, a la FITPACK + */ +double +fpknot(const double *x_ptr, ssize_t m, + const double *t_ptr, ssize_t len_t, + int k, + const double *residuals_ptr); + + +} // namespace fitpack diff --git a/scipy/interpolate/src/_fitpackmodule.c b/scipy/interpolate/src/_fitpackmodule.c index dc79b2b28833..e1e14f71e504 100644 --- a/scipy/interpolate/src/_fitpackmodule.c +++ b/scipy/interpolate/src/_fitpackmodule.c @@ -4,7 +4,6 @@ #define PyInt_AsLong PyLong_AsLong static PyObject *fitpack_error; -#include "__fitpack.h" #ifdef HAVE_ILP64 diff --git a/scipy/interpolate/tests/test_bsplines.py b/scipy/interpolate/tests/test_bsplines.py index 3a75f4d30743..640a33585d1a 100644 --- a/scipy/interpolate/tests/test_bsplines.py +++ b/scipy/interpolate/tests/test_bsplines.py @@ -1,14 +1,15 @@ import os import operator import itertools +import math import numpy as np -from numpy.testing import assert_equal, assert_allclose, assert_ +from numpy.testing import assert_equal, assert_allclose, assert_, suppress_warnings from pytest import raises as assert_raises import pytest from scipy.interpolate import ( - BSpline, BPoly, PPoly, make_interp_spline, make_lsq_spline, _bspl, + BSpline, BPoly, PPoly, make_interp_spline, make_lsq_spline, splev, splrep, splprep, splder, splantider, sproot, splint, insert, CubicSpline, NdBSpline, make_smoothing_spline, RegularGridInterpolator, ) @@ -18,6 +19,9 @@ from scipy.interpolate._bsplines import (_not_a_knot, _augknt, _woodbury_algorithm, _periodic_knots, _make_interp_per_full_matr) + +from scipy.interpolate import generate_knots, make_splrep, make_splprep + import scipy.interpolate._fitpack_impl as _impl from scipy._lib._util import AxisError @@ -26,6 +30,7 @@ from scipy.interpolate import dfitpack from scipy.interpolate import _bsplines as _b +from scipy.interpolate import _bspl class TestBSpline: @@ -1136,7 +1141,7 @@ def test_broken_x(self, k): make_interp_spline(x, y, k=k) def test_not_a_knot(self): - for k in [3, 5]: + for k in [2, 3, 4, 5, 6, 7]: b = make_interp_spline(self.xx, self.yy, k) assert_allclose(b(self.xx), self.yy, atol=1e-14, rtol=1e-14) @@ -1571,6 +1576,8 @@ def make_lsq_full_matrix(x, y, t, k=3): return c, (A, Y) +parametrize_lsq_methods = pytest.mark.parametrize("method", ["norm-eq", "qr"]) + class TestLSQ: # # Test make_lsq_spline @@ -1581,12 +1588,13 @@ class TestLSQ: y = np.random.random(n) t = _augknt(np.linspace(x[0], x[-1], 7), k) - def test_lstsq(self): + @parametrize_lsq_methods + def test_lstsq(self, method): # check LSQ construction vs a full matrix version x, y, t, k = self.x, self.y, self.t, self.k c0, AY = make_lsq_full_matrix(x, y, t, k) - b = make_lsq_spline(x, y, t, k) + b = make_lsq_spline(x, y, t, k, method=method) assert_allclose(b.c, c0) assert_equal(b.c.shape, (t.size - k - 1,)) @@ -1596,53 +1604,108 @@ def test_lstsq(self): c1, _, _, _ = np.linalg.lstsq(aa, y, rcond=-1) assert_allclose(b.c, c1) - def test_weights(self): + @parametrize_lsq_methods + def test_weights(self, method): # weights = 1 is same as None x, y, t, k = self.x, self.y, self.t, self.k w = np.ones_like(x) - b = make_lsq_spline(x, y, t, k) - b_w = make_lsq_spline(x, y, t, k, w=w) + b = make_lsq_spline(x, y, t, k, method=method) + b_w = make_lsq_spline(x, y, t, k, w=w, method=method) assert_allclose(b.t, b_w.t, atol=1e-14) assert_allclose(b.c, b_w.c, atol=1e-14) assert_equal(b.k, b_w.k) - def 
test_multiple_rhs(self): + def test_weights_same(self): + # both methods treat weights + x, y, t, k = self.x, self.y, self.t, self.k + w = np.random.default_rng(1234).uniform(size=x.shape[0]) + + b_ne = make_lsq_spline(x, y, t, k, w=w, method="norm-eq") + b_qr = make_lsq_spline(x, y, t, k, w=w, method="qr") + b_no_w = make_lsq_spline(x, y, t, k, method="qr") + + assert_allclose(b_ne.c, b_qr.c, atol=1e-14) + assert not np.allclose(b_no_w.c, b_qr.c, atol=1e-14) + + @parametrize_lsq_methods + def test_multiple_rhs(self, method): x, t, k, n = self.x, self.t, self.k, self.n y = np.random.random(size=(n, 5, 6, 7)) - b = make_lsq_spline(x, y, t, k) + b = make_lsq_spline(x, y, t, k, method=method) assert_equal(b.c.shape, (t.size-k-1, 5, 6, 7)) - def test_complex(self): + @parametrize_lsq_methods + def test_multiple_rhs_2(self, method): + x, t, k, n = self.x, self.t, self.k, self.n + nrhs = 3 + y = np.random.random(size=(n, nrhs)) + b = make_lsq_spline(x, y, t, k, method=method) + + bb = [make_lsq_spline(x, y[:, i], t, k, method=method) + for i in range(nrhs)] + coefs = np.vstack([bb[i].c for i in range(nrhs)]).T + + assert_allclose(coefs, b.c, atol=1e-15) + + def test_multiple_rhs_3(self): + x, t, k, n = self.x, self.t, self.k, self.n + nrhs = 3 + y = np.random.random(size=(n, nrhs)) + b_qr = make_lsq_spline(x, y, t, k, method="qr") + b_neq = make_lsq_spline(x, y, t, k, method="norm-eq") + assert_allclose(b_qr.c, b_neq.c, atol=1e-15) + + @parametrize_lsq_methods + def test_complex(self, method): + if method == "qr": + pytest.xfail("method='qr' does not handle complex-valued `y` yet.") + # cmplx-valued `y` x, t, k = self.x, self.t, self.k yc = self.y * (1. + 2.j) - b = make_lsq_spline(x, yc, t, k) - b_re = make_lsq_spline(x, yc.real, t, k) - b_im = make_lsq_spline(x, yc.imag, t, k) + b = make_lsq_spline(x, yc, t, k, method=method) + b_re = make_lsq_spline(x, yc.real, t, k, method=method) + b_im = make_lsq_spline(x, yc.imag, t, k, method=method) assert_allclose(b(x), b_re(x) + 1.j*b_im(x), atol=1e-15, rtol=1e-15) - def test_int_xy(self): + @parametrize_lsq_methods + def test_int_xy(self, method): x = np.arange(10).astype(int) y = np.arange(10).astype(int) t = _augknt(x, k=1) # Cython chokes on "buffer type mismatch" - make_lsq_spline(x, y, t, k=1) + make_lsq_spline(x, y, t, k=1, method=method) - def test_sliced_input(self): + @parametrize_lsq_methods + def test_f32_xy(self, method): + x = np.arange(10, dtype=np.float32) + y = np.arange(10, dtype=np.float32) + t = _augknt(x, k=1) + spl_f32 = make_lsq_spline(x, y, t, k=1, method=method) + spl_f64 = make_lsq_spline( + x.astype(float), y.astype(float), t.astype(float), k=1, method=method + ) + + x2 = (x[1:] + x[:-1]) / 2.0 + assert_allclose(spl_f32(x2), spl_f64(x2), atol=1e-15) + + @parametrize_lsq_methods + def test_sliced_input(self, method): # Cython code chokes on non C contiguous arrays xx = np.linspace(-1, 1, 100) x = xx[::3] y = xx[::3] t = _augknt(x, 1) - make_lsq_spline(x, y, t, k=1) + make_lsq_spline(x, y, t, k=1, method=method) - def test_checkfinite(self): + @parametrize_lsq_methods + def test_checkfinite(self, method): # check_finite defaults to True; nans and such trigger a ValueError x = np.arange(12).astype(float) y = x**2 @@ -1650,15 +1713,246 @@ def test_checkfinite(self): for z in [np.nan, np.inf, -np.inf]: y[-1] = z - assert_raises(ValueError, make_lsq_spline, x, y, t) + assert_raises(ValueError, make_lsq_spline, x, y, t, method=method) - def test_read_only(self): + @parametrize_lsq_methods + def test_read_only(self, method): # Check 
that make_lsq_spline works with read only arrays x, y, t = self.x, self.y, self.t x.setflags(write=False) y.setflags(write=False) t.setflags(write=False) - make_lsq_spline(x=x, y=y, t=t) + make_lsq_spline(x=x, y=y, t=t, method=method) + + @pytest.mark.parametrize('k', list(range(1, 7))) + def test_qr_vs_norm_eq(self, k): + # check that QR and normal eq solutions match + x, y = self.x, self.y + t = _augknt(np.linspace(x[0], x[-1], 7), k) + spl_norm_eq = make_lsq_spline(x, y, t, k=k, method='norm-eq') + spl_qr = make_lsq_spline(x, y, t, k=k, method='qr') + + xx = (x[1:] + x[:-1]) / 2.0 + assert_allclose(spl_norm_eq(xx), spl_qr(xx), atol=1e-15) + + def test_duplicates(self): + # method="qr" can handle duplicated data points + x = np.repeat(self.x, 2) + y = np.repeat(self.y, 2) + spl_1 = make_lsq_spline(self.x, self.y, self.t, k=3, method='qr') + spl_2 = make_lsq_spline(x, y, self.t, k=3, method='qr') + + xx = (x[1:] + x[:-1]) / 2.0 + assert_allclose(spl_1(xx), spl_2(xx), atol=1e-15) + + +class PackedMatrix: + """A simplified CSR format for when non-zeros in each row are consecutive. + + Assuming that each row of an `(m, nc)` matrix 1) only has `nz` non-zeros, and + 2) these non-zeros are consecutive, we only store an `(m, nz)` matrix of + non-zeros and a 1D array of row offsets. This way, a row `i` of the original + matrix A is ``A[i, offset[i]: offset[i] + nz]``. + + """ + def __init__(self, a, offset, nc): + self.a = a + self.offset = offset + self.nc = nc + + assert a.ndim == 2 + assert offset.ndim == 1 + assert a.shape[0] == offset.shape[0] + + @property + def shape(self): + return self.a.shape[0], self.nc + + def todense(self): + out = np.zeros(self.shape) + nelem = self.a.shape[1] + for i in range(out.shape[0]): + nel = min(self.nc - self.offset[i], nelem) + out[i, self.offset[i]:self.offset[i] + nel] = self.a[i, :nel] + return out + + +def _qr_reduce_py(a_p, y, startrow=1): + """This is a python counterpart of the `_qr_reduce` routine, + declared in interpolate/src/__fitpack.h + """ + from scipy.linalg.lapack import dlartg + + # unpack the packed format + a = a_p.a + offset = a_p.offset + nc = a_p.nc + + m, nz = a.shape + + assert y.shape[0] == m + R = a.copy() + y1 = y.copy() + + for i in range(startrow, m): + oi = offset[i] + for j in range(oi, nc): + # rotate only the lower diagonal + if j >= min(i, nc): + break + + # In dense format: diag a1[j, j] vs a1[i, j] + c, s, r = dlartg(R[j, 0], R[i, 0]) + + # rotate l.h.s. + R[j, 0] = r + for l in range(1, nz): + R[j, l], R[i, l-1] = fprota(c, s, R[j, l], R[i, l]) + R[i, -1] = 0.0 + + # rotate r.h.s. + for l in range(y1.shape[1]): + y1[j, l], y1[i, l] = fprota(c, s, y1[j, l], y1[i, l]) + + # convert to packed + offs = list(range(R.shape[0])) + R_p = PackedMatrix(R, np.array(offs), nc) + + return R_p, y1 + + +def fprota(c, s, a, b): + """Givens rotate [a, b]. + + [aa] = [ c s] @ [a] + [bb] [-s c] [b] + + """ + aa = c*a + s*b + bb = -s*a + c*b + return aa, bb + + +def fpback(R_p, y): + """Backsubsitution solve upper triangular banded `R @ c = y.` + + `R` is in the "packed" format: `R[i, :]` is `a[i, i:i+k+1]` + """ + R = R_p.a + _, nz = R.shape + nc = R_p.nc + assert y.shape[0] == R.shape[0] + + c = np.zeros_like(y[:nc]) + c[nc-1, ...] = y[nc-1] / R[nc-1, 0] + for i in range(nc-2, -1, -1): + nel = min(nz, nc-i) + # NB: broadcast R across trailing dimensions of `c`. + summ = (R[i, 1:nel, None] * c[i+1:i+nel, ...]).sum(axis=0) + c[i, ...] 
= ( y[i] - summ ) / R[i, 0] + return c + + +class TestGivensQR: + # Test row-by-row QR factorization, used for the LSQ spline construction. + # This is implementation detail; still test it separately. + def _get_xyt(self, n): + k = 3 + x = np.arange(n, dtype=float) + y = x**3 + 1/(1+x) + t = _not_a_knot(x, k) + return x, y, t, k + + def test_vs_full(self): + n = 10 + x, y, t, k = self._get_xyt(n) + + # design matrix + a_csr = BSpline.design_matrix(x, t, k) + + # dense QR + q, r = sl.qr(a_csr.todense()) + qTy = q.T @ y + + # prepare the PackedMatrix to factorize + # convert to "packed" format + m, nc = a_csr.shape + assert nc == t.shape[0] - k - 1 + + offset = a_csr.indices[::(k+1)] + offset = np.ascontiguousarray(offset, dtype=np.intp) + A = a_csr.data.reshape(m, k+1) + + R = PackedMatrix(A, offset, nc) + y_ = y[:, None] # _qr_reduce requires `y` a 2D array + _bspl._qr_reduce(A, offset, nc, y_) # modifies arguments in-place + + # signs may differ + assert_allclose(np.minimum(R.todense() + r, + R.todense() - r), 0, atol=1e-15) + assert_allclose(np.minimum(abs(qTy - y_[:, 0]), + abs(qTy + y_[:, 0])), 0, atol=2e-13) + + # sign changes are consistent between Q and R: + c_full = sl.solve(r, qTy) + c_banded = _bspl._fpback(R.a, R.nc, y_) + assert_allclose(c_full, c_banded[:, 0], atol=5e-13) + + def test_py_vs_compiled(self): + # test _qr_reduce vs a python implementation + n = 10 + x, y, t, k = self._get_xyt(n) + + # design matrix + a_csr = BSpline.design_matrix(x, t, k) + m, nc = a_csr.shape + assert nc == t.shape[0] - k - 1 + + offset = a_csr.indices[::(k+1)] + offset = np.ascontiguousarray(offset, dtype=np.intp) + A = a_csr.data.reshape(m, k+1) + + R = PackedMatrix(A, offset, nc) + y_ = y[:, None] + + RR, yy = _qr_reduce_py(R, y_) + _bspl._qr_reduce(A, offset, nc , y_) # in-place + + assert_allclose(RR.a, R.a, atol=1e-15) + assert_equal(RR.offset, R.offset) + assert RR.nc == R.nc + assert_allclose(yy, y_, atol=1e-15) + + # Test C-level construction of the design matrix + + def test_data_matrix(self): + n = 10 + x, y, t, k = self._get_xyt(n) + w = np.arange(1, n+1, dtype=float) + A, offset, nc = _bspl._data_matrix(x, t, k, w) + + m = x.shape[0] + a_csr = BSpline.design_matrix(x, t, k) + a_w = (a_csr * w[:, None]).tocsr() + A_ = a_w.data.reshape((m, k+1)) + offset_ = a_w.indices[::(k+1)] + + assert_allclose(A, A_, atol=1e-15) + assert_equal(offset, offset_) + assert nc == t.shape[0] - k - 1 + + def test_fpback(self): + n = 10 + x, y, t, k = self._get_xyt(n) + y = np.c_[y, y**2] + A, offset, nc = _bspl._data_matrix(x, t, k, w=np.ones_like(x)) + R = PackedMatrix(A, offset, nc) + _bspl._qr_reduce(A, offset, nc, y) + + c = fpback(R, y) + cc = _bspl._fpback(A, nc, y) + + assert_allclose(cc, c, atol=1e-14) def data_file(basename): @@ -2619,3 +2913,581 @@ def test_condition_5_3(self): with pytest.raises(ValueError, match="Schoenberg-Whitney*"): _b.fpcheck(x, t, k) + +# ### python replicas of generate_knots(...) implementation details, for testing. +# ### see TestGenerateKnots::test_split_and_add_knot +def _split(x, t, k, residuals): + """Split the knot interval into "runs". 
+ """ + ix = np.searchsorted(x, t[k:-k]) + # sum half-open intervals + fparts = [residuals[ix[i]:ix[i+1]].sum() for i in range(len(ix)-1)] + carries = residuals[ix[1:-1]] + + for i in range(len(carries)): # split residuals at internal knots + carry = carries[i] / 2 + fparts[i] += carry + fparts[i+1] -= carry + + fparts[-1] += residuals[-1] # add the contribution of the last knot + + assert_allclose(sum(fparts), sum(residuals), atol=1e-15) + + return fparts, ix + + +def _add_knot(x, t, k, residuals): + """Insert a new knot given reduals.""" + fparts, ix = _split(x, t, k, residuals) + + # find the interval with max fparts and non-zero number of x values inside + idx_max = -101 + fpart_max = -1e100 + for i in range(len(fparts)): + if ix[i+1] - ix[i] > 1 and fparts[i] > fpart_max: + idx_max = i + fpart_max = fparts[i] + + if idx_max == -101: + raise ValueError("Internal error, please report it to SciPy developers.") + + # round up, like Dierckx does? This is really arbitrary though. + idx_newknot = (ix[idx_max] + ix[idx_max+1] + 1) // 2 + new_knot = x[idx_newknot] + idx_t = np.searchsorted(t, new_knot) + t_new = np.r_[t[:idx_t], new_knot, t[idx_t:]] + return t_new + + +class TestGenerateKnots: + def test_split_add_knot(self): + # smoke test implementation details: insert a new knot given residuals + x = np.arange(8, dtype=float) + y = x**3 + 1./(1 + x) + k = 3 + t = np.array([0.]*(k+1) + [7.]*(k+1)) + spl = make_lsq_spline(x, y, k=k, t=t) + residuals = (spl(x) - y)**2 + + from scipy.interpolate import _fitpack_repro as _fr + new_t = _fr.add_knot(x, t, k, residuals) + new_t_py = _add_knot(x, t, k, residuals) + + assert_allclose(new_t, new_t_py, atol=1e-15) + + # redo with new knots + spl2 = make_lsq_spline(x, y, k=k, t=new_t) + residuals2 = (spl2(x) - y)**2 + + new_t2 = _fr.add_knot(x, new_t, k, residuals2) + new_t2_py = _add_knot(x, new_t, k, residuals2) + + assert_allclose(new_t2, new_t2_py, atol=1e-15) + + @pytest.mark.parametrize('k', [1, 2, 3, 4, 5]) + def test_s0(self, k): + x = np.arange(8) + y = np.sin(x*np.pi/8) + t = list(generate_knots(x, y, k=k, s=0))[-1] + + tt = splrep(x, y, k=k, s=0)[0] + assert_allclose(t, tt, atol=1e-15) + + def test_s0_1(self): + # with these data, naive algorithm tries to insert >= nmax knots + n = 10 + x = np.arange(n) + y = x**3 + knots = list(generate_knots(x, y, k=3, s=0)) # does not error out + assert_allclose(knots[-1], _not_a_knot(x, 3), atol=1e-15) + + def test_s0_n20(self): + n = 20 + x = np.arange(n) + y = x**3 + knots = list(generate_knots(x, y, k=3, s=0)) + assert_allclose(knots[-1], _not_a_knot(x, 3), atol=1e-15) + + def test_s0_nest(self): + # s=0 and non-default nest: not implemented, errors out + x = np.arange(10) + y = x**3 + with assert_raises(ValueError): + list(generate_knots(x, y, k=3, s=0, nest=10)) + + def test_s_switch(self): + # test the process switching to interpolating knots when len(t) == m + k + 1 + """ + To generate the `wanted` list below apply the following diff and rerun + the test. The stdout will contain successive iterations of the `t` + array. + +$ git diff scipy/interpolate/fitpack/fpcurf.f +diff --git a/scipy/interpolate/fitpack/fpcurf.f b/scipy/interpolate/fitpack/fpcurf.f +index 1afb1900f1..d817e51ad8 100644 +--- a/scipy/interpolate/fitpack/fpcurf.f ++++ b/scipy/interpolate/fitpack/fpcurf.f +@@ -216,6 +216,9 @@ c t(j+k) <= x(i) <= t(j+k+1) and store it in fpint(j),j=1,2,...nrint. + do 190 l=1,nplus + c add a new knot. 
+ call fpknot(x,m,t,n,fpint,nrdata,nrint,nest,1) ++ print*, l, nest, ': ', t ++ print*, "n, nmax = ", n, nmax ++ + c if n=nmax we locate the knots as for interpolation. + if(n.eq.nmax) go to 10 + c test whether we cannot further increase the number of knots. + """ # NOQA: E501 + x = np.arange(8) + y = np.sin(x*np.pi/8) + k = 3 + + knots = list(generate_knots(x, y, k=k, s=1e-7)) + wanted = [[0., 0., 0., 0., 7., 7., 7., 7.], + [0., 0., 0., 0., 4., 7., 7., 7., 7.], + [0., 0., 0., 0., 2., 4., 7., 7., 7., 7.], + [0., 0., 0., 0., 2., 4., 6., 7., 7., 7., 7.], + [0., 0., 0., 0., 2., 3., 4., 5., 7, 7., 7., 7.] + ] + + assert len(knots) == len(wanted) + for t, tt in zip(knots, wanted): + assert_allclose(t, tt, atol=1e-15) + + # also check that the last knot vector matches FITPACK + t, _, _ = splrep(x, y, k=k, s=1e-7) + assert_allclose(knots[-1], t, atol=1e-15) + + def test_list_input(self): + # test that list inputs are accepted + x = list(range(8)) + gen = generate_knots(x, x, s=0.1, k=1) + next(gen) + + def test_nest(self): + # test that nest < nmax stops the process early (and we get 10 knots not 12) + x = np.arange(8) + y = np.sin(x*np.pi/8) + s = 1e-7 + + knots = list(generate_knots(x, y, k=3, s=s, nest=10)) + assert_allclose(knots[-1], + [0., 0., 0., 0., 2., 4., 7., 7., 7., 7.], atol=1e-15) + + with assert_raises(ValueError): + # nest < 2*(k+1) + list(generate_knots(x, y, k=3, nest=4)) + + def test_weights(self): + x = np.arange(8) + y = np.sin(x*np.pi/8) + + with assert_raises(ValueError): + list(generate_knots(x, y, w=np.arange(11))) # len(w) != len(x) + + with assert_raises(ValueError): + list(generate_knots(x, y, w=-np.ones(8))) # w < 0 + + @pytest.mark.parametrize("npts", [30, 50, 100]) + @pytest.mark.parametrize("s", [0.1, 1e-2, 0]) + def test_vs_splrep(self, s, npts): + # XXX this test is brittle: differences start appearing for k=3 and s=1e-6, + # also for k != 3. Might be worth investigating at some point. + # I think we do not really guarantee exact agreement with splrep. Instead, + # we guarantee it is the same *in most cases*; otherwise slight differences + # are allowed. There is no theorem, it is all heuristics by P. Dierckx. + # The best we can do is to reproduce it on a best-effort basis. + rndm = np.random.RandomState(12345) + x = 10*np.sort(rndm.uniform(size=npts)) + y = np.sin(x*np.pi/10) + np.exp(-(x-6)**2) + + k = 3 + t = splrep(x, y, k=k, s=s)[0] + tt = list(generate_knots(x, y, k=k, s=s))[-1] + + assert_allclose(tt, t, atol=1e-15) + + def test_s_too_small(self): + n = 14 + x = np.arange(n) + y = x**3 + + # XXX splrep warns that "s too small": ier=2 + knots = list(generate_knots(x, y, k=3, s=1e-50)) + + with suppress_warnings() as sup: + r = sup.record(RuntimeWarning) + tck = splrep(x, y, k=3, s=1e-50) + assert len(r) == 1 + assert_equal(knots[-1], tck[0]) + + +def disc_naive(t, k): + """Straightforward way to compute the discontinuity matrix. For testing ONLY. + + This routine returns a dense matrix, while `_fitpack_repro.disc` returns + a packed one. + """ + n = t.shape[0] + + delta = t[n - k - 1] - t[k] + nrint = n - 2*k - 1 + + ti = t[k+1:n-k-1] # internal knots + tii = np.repeat(ti, 2) + tii[::2] += 1e-10 + tii[1::2] -= 1e-10 + m = BSpline.design_matrix(tii, t, k, nu=k).todense() + + matr = np.empty((nrint-1, m.shape[1]), dtype=float) + for i in range(0, m.shape[0], 2): + matr[i//2, :] = m[i, :] - m[i+1, :] + + matr *= (delta/nrint)**k / math.factorial(k) + return matr + + +class F_dense: + """ The r.h.s.
of ``f(p) = s``, an analog of _fitpack_repro.F + Uses full matrices, so is for tests only. + """ + def __init__(self, x, y, t, k, s, w=None): + self.x = x + self.y = y + self.t = t + self.k = k + self.w = np.ones_like(x, dtype=float) if w is None else w + assert self.w.ndim == 1 + + # lhs + a_csr = BSpline.design_matrix(x, t, k) + self.a_w = (a_csr * self.w[:, None]).tocsr() + from scipy.interpolate import _fitpack_repro as _fr + self.b = PackedMatrix(*_fr.disc(t, k)) + + self.a_dense = (a_csr * self.w[:, None]).todense() + self.b_dense = PackedMatrix(*_fr.disc(t, k)).todense() + + # rhs + assert y.ndim == 1 + yy = y * self.w + self.yy = np.r_[yy, np.zeros(self.b.shape[0])] + + self.s = s + + def __call__(self, p): + ab = np.vstack((self.a_dense, self.b_dense / p)) + + # LSQ solution of ab @ c = yy + from scipy.linalg import qr, solve + q, r = qr(ab, mode='economic') + + qy = q.T @ self.yy + + nc = r.shape[1] + c = solve(r[:nc, :nc], qy[:nc]) + + spl = BSpline(self.t, c, self.k) + fp = np.sum(self.w**2 * (spl(self.x) - self.y)**2) + + self.spl = spl # store it + + return fp - self.s + + +class TestMakeSplrep: + def test_input_errors(self): + x = np.linspace(0, 10, 11) + y = np.linspace(0, 10, 12) + with assert_raises(ValueError): + # len(x) != len(y) + make_splrep(x, y) + + with assert_raises(ValueError): + # 0D inputs + make_splrep(1, 2, s=0.1) + + with assert_raises(ValueError): + # y.ndim > 2 + y = np.ones((x.size, 2, 2, 2)) + make_splrep(x, y, s=0.1) + + w = np.ones(12) + with assert_raises(ValueError): + # len(weights) != len(x) + make_splrep(x, x**3, w=w, s=0.1) + + w = -np.ones(12) + with assert_raises(ValueError): + # w < 0 + make_splrep(x, x**3, w=w, s=0.1) + + w = np.ones((x.shape[0], 2)) + with assert_raises(ValueError): + # w.ndim != 1 + make_splrep(x, x**3, w=w, s=0.1) + + with assert_raises(ValueError): + # x not ordered + make_splrep(x[::-1], x**3, s=0.1) + + with assert_raises(TypeError): + # k != int(k) + make_splrep(x, x**3, k=2.5, s=0.1) + + with assert_raises(ValueError): + # s < 0 + make_splrep(x, x**3, s=-1) + + with assert_raises(ValueError): + # nest < 2*k + 2 + make_splrep(x, x**3, k=3, nest=2, s=0.1) + + with assert_raises(ValueError): + # nest not None and s==0 + make_splrep(x, x**3, s=0, nest=11) + + with assert_raises(ValueError): + # len(x) != len(y) + make_splrep(np.arange(8), np.arange(9), s=0.1) + + def _get_xykt(self): + x = np.linspace(0, 5, 11) + y = np.sin(x*3.14 / 5)**2 + k = 3 + s = 1.7e-4 + tt = np.array([0]*(k+1) + [2.5, 4.0] + [5]*(k+1)) + + return x, y, k, s, tt + + def test_fitpack_F(self): + # test an implementation detail: banded/packed linalg vs full matrices + from scipy.interpolate._fitpack_repro import F + + x, y, k, s, t = self._get_xykt() + f = F(x, y[:, None], t, k, s) # F expects y to be 2D + f_d = F_dense(x, y, t, k, s) + for p in [1, 10, 100]: + assert_allclose(f(p), f_d(p), atol=1e-15) + + def test_fitpack_F_with_weights(self): + # repeat test_fitpack_F, with weights + from scipy.interpolate._fitpack_repro import F + + x, y, k, s, t = self._get_xykt() + w = np.arange(x.shape[0], dtype=float) + fw = F(x, y[:, None], t, k, s, w=w) # F expects y to be 2D + fw_d = F_dense(x, y, t, k, s, w=w) + + f_d = F_dense(x, y, t, k, s) # no weights + + for p in [1, 10, 100]: + assert_allclose(fw(p), fw_d(p), atol=1e-15) + assert not np.allclose(f_d(p), fw_d(p), atol=1e-15) + + def test_disc_matrix(self): + # test an implementation detail: discontinuity matrix + # (jumps of k-th derivative at knots) + import scipy.interpolate._fitpack_repro as _fr + 
+ rng = np.random.default_rng(12345) + t = np.r_[0, 0, 0, 0, np.sort(rng.uniform(size=7))*5, 5, 5, 5, 5] + + n, k = len(t), 3 + D = PackedMatrix(*_fr.disc(t, k)).todense() + D_dense = disc_naive(t, k) + assert D.shape[0] == n - 2*k - 2 # number of internal knots + assert_allclose(D, D_dense, atol=1e-15) + + def test_simple_vs_splrep(self): + x, y, k, s, tt = self._get_xykt() + tt = np.array([0]*(k+1) + [2.5, 4.0] + [5]*(k+1)) + + t,c,k = splrep(x, y, k=k, s=s) + assert all(t == tt) + + spl = make_splrep(x, y, k=k, s=s) + assert_allclose(c[:spl.c.size], spl.c, atol=1e-15) + + def test_with_knots(self): + x, y, k, s, _ = self._get_xykt() + + t = list(generate_knots(x, y, k=k, s=s))[-1] + + spl_auto = make_splrep(x, y, k=k, s=s) + spl_t = make_splrep(x, y, t=t, k=k, s=s) + + assert_allclose(spl_auto.t, spl_t.t, atol=1e-15) + assert_allclose(spl_auto.c, spl_t.c, atol=1e-15) + assert_allclose(spl_auto.k, spl_t.k, atol=1e-15) + + def test_no_internal_knots(self): + # should not fail if there are no internal knots + n = 10 + x = np.arange(n) + y = x**3 + k = 3 + spl = make_splrep(x, y, k=k, s=1) + assert spl.t.shape[0] == 2*(k+1) + + def test_default_s(self): + n = 10 + x = np.arange(n) + y = x**3 + spl = make_splrep(x, y, k=3) + spl_i = make_interp_spline(x, y, k=3) + + assert_allclose(spl.c, spl_i.c, atol=1e-15) + + def test_s_too_small(self): + # both splrep and make_splrep warn that "s too small": ier=2 + n = 14 + x = np.arange(n) + y = x**3 + + with suppress_warnings() as sup: + r = sup.record(RuntimeWarning) + tck = splrep(x, y, k=3, s=1e-50) + spl = make_splrep(x, y, k=3, s=1e-50) + assert len(r) == 2 + assert_equal(spl.t, tck[0]) + assert_allclose(np.r_[spl.c, [0]*(spl.k+1)], + tck[1], atol=5e-13) + + def test_shape(self): + # make sure coefficients have the right shape (not extra dims) + n, k = 10, 3 + x = np.arange(n) + y = x**3 + + spl = make_splrep(x, y, k=k) + spl_1 = make_splrep(x, y, k=k, s=1e-5) + + assert spl.c.ndim == 1 + assert spl_1.c.ndim == 1 + + # force the general code path, not shortcuts + spl_2 = make_splrep(x, y + 1/(1+y), k=k, s=1e-5) + assert spl_2.c.ndim == 1 + + def test_s0_vs_not(self): + # check that the shapes are consistent + n, k = 10, 3 + x = np.arange(n) + y = x**3 + + spl_0 = make_splrep(x, y, k=3, s=0) + spl_1 = make_splrep(x, y, k=3, s=1) + + assert spl_0.c.ndim == 1 + assert spl_1.c.ndim == 1 + + assert spl_0.t.shape[0] == n + k + 1 + assert spl_1.t.shape[0] == 2 * (k + 1) + + +class TestMakeSplprep: + def _get_xyk(self, m=10, k=3): + x = np.arange(m) * np.pi / m + y = [np.sin(x), np.cos(x)] + return x, y, k + + @pytest.mark.parametrize('s', [0, 0.1, 1e-3, 1e-5]) + def test_simple_vs_splprep(self, s): + # Check/document the interface vs splPrep + # The four values of `s` are to probe all code paths and shortcuts + m, k = 10, 3 + x = np.arange(m) * np.pi / m + y = [np.sin(x), np.cos(x)] + + # the number of knots depends on `s` (this is by construction) + num_knots = {0: 14, 0.1: 8, 1e-3: 8 + 1, 1e-5: 8 + 2} + + # construct the splines + (t, c, k), u_ = splprep(y, s=s) + spl, u = make_splprep(y, s=s) + + # parameters + assert_allclose(u, u_, atol=1e-15) + + # knots + assert_allclose(spl.t, t, atol=1e-15) + assert len(t) == num_knots[s] + + # coefficients: note the transpose + cc = np.asarray(c).T + assert_allclose(spl.c, cc, atol=1e-15) + + # values: note axis=1 + assert_allclose(spl(u), + BSpline(t, c, k, axis=1)(u), atol=1e-15) + + @pytest.mark.parametrize('s', [0, 0.1, 1e-3, 1e-5]) + def test_array_not_list(self, s): + # the argument of splPrep is either 
a list of arrays or a 2D array (sigh) + _, y, _ = self._get_xyk() + assert isinstance(y, list) + assert np.shape(y)[0] == 2 + + # assert the behavior of FITPACK's splrep + tck, u = splprep(y, s=s) + tck_a, u_a = splprep(np.asarray(y), s=s) + assert_allclose(u, u_a, atol=s) + assert_allclose(tck[0], tck_a[0], atol=1e-15) + assert_allclose(tck[1], tck_a[1], atol=1e-15) + assert tck[2] == tck_a[2] + assert np.shape(splev(u, tck)) == np.shape(y) + + spl, u = make_splprep(y, s=s) + assert_allclose(u, u_a, atol=1e-15) + assert_allclose(spl.t, tck_a[0], atol=1e-15) + assert_allclose(spl.c.T, tck_a[1], atol=1e-15) + assert spl.k == tck_a[2] + assert spl(u).shape == np.shape(y) + + spl, u = make_splprep(np.asarray(y), s=s) + assert_allclose(u, u_a, atol=1e-15) + assert_allclose(spl.t, tck_a[0], atol=1e-15) + assert_allclose(spl.c.T, tck_a[1], atol=1e-15) + assert spl.k == tck_a[2] + assert spl(u).shape == np.shape(y) + + with assert_raises(ValueError): + make_splprep(np.asarray(y).T, s=s) + + def test_default_s_is_zero(self): + x, y, k = self._get_xyk(m=10) + + spl, u = make_splprep(y) + assert_allclose(spl(u), y, atol=1e-15) + + def test_s_zero_vs_near_zero(self): + # s=0 and s \approx 0 are consistent + x, y, k = self._get_xyk(m=10) + + spl_i, u_i = make_splprep(y, s=0) + spl_n, u_n = make_splprep(y, s=1e-15) + + assert_allclose(u_i, u_n, atol=1e-15) + assert_allclose(spl_i(u_i), y, atol=1e-15) + assert_allclose(spl_n(u_n), y, atol=1e-7) + assert spl_i.axis == spl_n.axis + assert spl_i.c.shape == spl_n.c.shape + + def test_1D(self): + x = np.arange(8, dtype=float) + with assert_raises(ValueError): + splprep(x) + + with assert_raises(ValueError): + make_splprep(x, s=0) + + with assert_raises(ValueError): + make_splprep(x, s=0.1) + + tck, u_ = splprep([x], s=1e-5) + spl, u = make_splprep([x], s=1e-5) + + assert spl(u).shape == (1, 8) + assert_allclose(spl(u), [x], atol=1e-15) + diff --git a/scipy/interpolate/tests/test_fitpack.py b/scipy/interpolate/tests/test_fitpack.py index c76178681de0..14d121f363bd 100644 --- a/scipy/interpolate/tests/test_fitpack.py +++ b/scipy/interpolate/tests/test_fitpack.py @@ -9,6 +9,7 @@ from scipy._lib._testutils import check_free_memory from scipy.interpolate import RectBivariateSpline +from scipy.interpolate import make_splrep from scipy.interpolate._fitpack_py import (splrep, splev, bisplrep, bisplev, sproot, splprep, splint, spalde, splder, splantider, insert, dblint) @@ -78,6 +79,15 @@ def err_est(k, d): err = norm2(f1(tt, d) - splev(tt, tck, d)) / norm2(f1(tt, d)) assert err < tol + # smoke test make_splrep + if not per: + spl = make_splrep(x, v, k=k, s=s, xb=xb, xe=xe) + if len(spl.t) == len(tck[0]): + assert_allclose(spl.t, tck[0], atol=1e-15) + assert_allclose(spl.c, tck[1][:spl.c.size], atol=1e-13) + else: + assert k == 5 # knot length differ in some k=5 cases + def check_2(self, per=0, N=20, ia=0, ib=2*np.pi): a, b, dx = 0, 2*np.pi, 0.2*np.pi x = np.linspace(a, b, N+1) # nodes diff --git a/scipy/interpolate/tests/test_fitpack2.py b/scipy/interpolate/tests/test_fitpack2.py index f1ebf0d26e87..c509073bed94 100644 --- a/scipy/interpolate/tests/test_fitpack2.py +++ b/scipy/interpolate/tests/test_fitpack2.py @@ -13,6 +13,7 @@ LSQSphereBivariateSpline, SmoothSphereBivariateSpline, RectSphereBivariateSpline) +from scipy.interpolate import make_splrep class TestUnivariateSpline: def test_linear_constant(self): @@ -24,6 +25,10 @@ def test_linear_constant(self): assert_almost_equal(lut.get_residual(),0.0) assert_array_almost_equal(lut([1,1.5,2]),[3,3,3]) + spl = 
make_splrep(x, y, k=1, s=len(x)) + assert_allclose(spl.t[1:-1], lut.get_knots(), atol=1e-15) + assert_allclose(spl.c, lut.get_coeffs(), atol=1e-15) + def test_preserve_shape(self): x = [1, 2, 3] y = [0, 2, 4]