scikit-learn-contrib · MatthewSZhang · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
@@ -9,7 +9,7 @@ jobs:
     steps:
       - uses: actions/checkout@v5
       - name: Build WASM wheel
-        uses: pypa/cibuildwheel@v3.2.0
+        uses: pypa/cibuildwheel@v3.2.1
         env:
           CIBW_PLATFORM: pyodide
       - name: Upload package

diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
@@ -33,7 +33,7 @@ jobs:
     steps:
       - uses: actions/checkout@v5
       - name: Build wheels
-        uses: pypa/cibuildwheel@v3.2.0
+        uses: pypa/cibuildwheel@v3.2.1
         env:
           CIBW_SKIP: "*_i686 *_ppc64le *_s390x *_universal2 *-musllinux_* cp314t*"
           CIBW_PROJECT_REQUIRES_PYTHON: ">=3.10"

diff --git a/fastcan/narx/_feature.py b/fastcan/narx/_feature.py
@@ -6,6 +6,7 @@
 # SPDX-License-Identifier: MIT
 
 import math
+import warnings
 from itertools import combinations_with_replacement
 from numbers import Integral
 
@@ -198,12 +199,16 @@ def make_poly_features(X, ids):
             None,
             Interval(Integral, 1, None, closed="left"),
         ],
+        "max_poly": [None, Interval(Integral, 1, None, closed="left")],
+        "random_state": ["random_state"],
     },
     prefer_skip_nested_validation=True,
 )
 def make_poly_ids(
     n_features=1,
     degree=1,
+    max_poly=None,
+    random_state=None,
 ):
     """Generate ids for polynomial features.
     (variable_index, variable_index, ...)
@@ -217,6 +222,15 @@ def make_poly_ids(
     degree : int, default=1
         The maximum degree of polynomial features.
 
+    max_poly : int, default=None
+        Maximum number of ids of polynomial features to generate.
+        Randomly selected by reservoir sampling.
+        If None, all possible ids are returned.
+
+    random_state : int, RandomState instance or None, default=None
+        Used when `max_poly` is not None to subsample ids of polynomial features.
+        See :term:`Glossary <random_state>` for details.
+
     Returns
     -------
     ids : array-like of shape (n_outputs, degree)
@@ -236,29 +250,45 @@ def make_poly_ids(
            [1, 2, 2],
            [2, 2, 2]])
     """
-    n_outputs = math.comb(n_features + degree, degree) - 1
-    if n_outputs > np.iinfo(np.intp).max:
+    n_total = math.comb(n_features + degree, degree) - 1
+    if n_total > np.iinfo(np.intp).max:
         msg = (
-            "The output that would result from the current configuration would"
-            f" have {n_outputs} features which is too large to be"
-            f" indexed by {np.intp().dtype.name}."
+            "The current configuration would "
+            f"result in {n_total} features which is too large to be "
+            f"indexed by {np.intp().dtype.name}."
         )
         raise ValueError(msg)
-
-    ids = np.array(
-        list(
-            combinations_with_replacement(
-                range(n_features + 1),
-                degree,
-            )
+    if n_total > 10_000_000:
+        warnings.warn(
+            "Total number of polynomial features is larger than 10,000,000! "
+            f"The current configuration would result in {n_total} features. "
+            "This may take a while.",
+            UserWarning,
+        )
+    if max_poly is not None and max_poly < n_total:
+        # reservoir sampling (Algorithm R) over the n_total non-constant ids
+        rng = np.random.default_rng(random_state)
+        reservoir = []
+        combs = combinations_with_replacement(range(n_features + 1), degree)
+        next(combs)  # skip all-zero constant id (always first), removed below
+        for i, comb in enumerate(combs):
+            if i < max_poly:
+                reservoir.append(comb)
+            else:
+                j = rng.integers(0, i + 1)
+                if j < max_poly:
+                    reservoir[j] = comb
+        ids = np.array(reservoir)
+    else:
+        ids = np.array(
+            list(combinations_with_replacement(range(n_features + 1), degree))
         )
-    )
 
     const_id = np.where((ids == 0).all(axis=1))
     return np.delete(ids, const_id, 0)  # remove the constant feature
 
 
-def _valiate_time_shift_poly_ids(
+def _validate_time_shift_poly_ids(
     time_shift_ids, poly_ids, n_samples=None, n_features=None, n_outputs=None
 ):
     if n_samples is None:
@@ -496,7 +526,7 @@ def tp2fd(time_shift_ids, poly_ids):
     [[-1  1]
      [ 2  3]]
     """
-    _time_shift_ids, _poly_ids = _valiate_time_shift_poly_ids(
+    _time_shift_ids, _poly_ids = _validate_time_shift_poly_ids(
         time_shift_ids,
         poly_ids,
     )

diff --git a/fastcan/narx/_utils.py b/fastcan/narx/_utils.py
@@ -7,7 +7,11 @@
 
 import numpy as np
 from scipy.stats import rankdata
-from sklearn.utils import check_array, check_consistent_length, column_or_1d
+from sklearn.utils import (
+    check_array,
+    check_consistent_length,
+    column_or_1d,
+)
 from sklearn.utils._param_validation import Interval, StrOptions, validate_params
 from sklearn.utils.validation import check_is_fitted
 
@@ -132,6 +136,8 @@ def _get_term_str(term_feat_ids, term_delay_ids):
             Interval(Integral, 1, None, closed="left"),
         ],
         "fit_intercept": ["boolean"],
+        "max_candidates": [None, Interval(Integral, 1, None, closed="left")],
+        "random_state": ["random_state"],
         "include_zero_delay": [None, "array-like"],
         "static_indices": [None, "array-like"],
         "refine_verbose": ["verbose"],
@@ -155,6 +161,8 @@ def make_narx(
     poly_degree=1,
     *,
     fit_intercept=True,
+    max_candidates=None,
+    random_state=None,
     include_zero_delay=None,
     static_indices=None,
     refine_verbose=1,
@@ -186,6 +194,15 @@ def make_narx(
     fit_intercept : bool, default=True
         Whether to fit the intercept. If set to False, intercept will be zeros.
 
+    max_candidates : int, default=None
+        Maximum number of candidate polynomial terms retained before selection.
+        Randomly selected by reservoir sampling.
+        If None, all candidates are considered.
+
+    random_state : int, RandomState instance or None, default=None
+        Used when `max_candidates` is not None to subsample candidate terms.
+        See :term:`Glossary <random_state>` for details.
+
     include_zero_delay : {None, array-like} of shape (n_features,) default=None
         Whether to include the original (zero-delay) features.
 
@@ -306,6 +323,8 @@ def make_narx(
     poly_ids_all = make_poly_ids(
         time_shift_ids_all.shape[0],
         poly_degree,
+        max_poly=max_candidates,
+        random_state=random_state,
     )
     poly_terms = make_poly_features(time_shift_vars, poly_ids_all)
 

diff --git a/fastcan/narx/tests/test_narx.py b/fastcan/narx/tests/test_narx.py
@@ -29,10 +29,18 @@ def test_narx_is_sklearn_estimator():
         check_estimator(NARX(), expected_failed_checks=expected_failures)
 
 
-def test_poly_ids():
-    with pytest.raises(ValueError, match=r"The output that would result from the .*"):
+def test_poly_ids(monkeypatch):
+    with pytest.raises(ValueError, match=r"The current configuration would .*"):
         make_poly_ids(10, 1000)
 
+    # Mock combinations_with_replacement to avoid heavy computation
+    monkeypatch.setattr(
+        "fastcan.narx._feature.combinations_with_replacement",
+        lambda *args, **kwargs: iter([[0, 0]]),
+    )
+    with pytest.warns(UserWarning, match=r"Total number of polynomial features .*"):
+        make_poly_ids(18, 10)
+
 
 def test_time_ids():
     with pytest.raises(ValueError, match=r"The length of `include_zero_delay`.*"):
@@ -553,6 +561,57 @@ def test_make_narx_refine_print(capsys):
     assert "No. of iterations: " in captured.out
 
 
+def test_make_narx_max_candidates():
+    """Test max_candidates and random_state in make_narx."""
+    rng = np.random.default_rng(12345)
+    X = rng.random((100, 2))
+    y = rng.random((100, 1))
+    max_delay = 3
+    poly_degree = 10
+    n_terms_to_select = 5
+    max_candidates = 20
+
+    # Same seed -> same subsampled candidates, so the selected terms must match
+    narx1 = make_narx(
+        X,
+        y,
+        n_terms_to_select=n_terms_to_select,
+        max_delay=max_delay,
+        poly_degree=poly_degree,
+        max_candidates=max_candidates,
+        random_state=123,
+        verbose=0,
+    )
+    narx2 = make_narx(
+        X,
+        y,
+        n_terms_to_select=n_terms_to_select,
+        max_delay=max_delay,
+        poly_degree=poly_degree,
+        max_candidates=max_candidates,
+        random_state=123,
+        verbose=0,
+    )
+    assert_array_equal(narx1.feat_ids, narx2.feat_ids)
+    assert_array_equal(narx1.delay_ids, narx2.delay_ids)
+
+    # A different seed is expected to subsample other candidates and change feat_ids
+    narx3 = make_narx(
+        X,
+        y,
+        n_terms_to_select=n_terms_to_select,
+        max_delay=max_delay,
+        poly_degree=poly_degree,
+        max_candidates=max_candidates,
+        random_state=456,
+        verbose=0,
+    )
+    assert not np.array_equal(narx1.feat_ids, narx3.feat_ids)
+
+    # Capping the candidates must still yield exactly n_terms_to_select terms
+    assert narx1.feat_ids.shape[0] == n_terms_to_select
+
+
 @pytest.mark.parametrize("max_delay", [1, 3, 7, 10])
 def test_nan_split(max_delay):
     n_sessions = 10

diff --git a/pixi.lock b/pixi.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -143,8 +143,8 @@ asv-publish = { cmd = "python -m asv publish", cwd = "asv_benchmarks" }
 asv-preview = { cmd = "python -m asv preview", cwd = "asv_benchmarks", depends-on = ["asv-publish"] }
 
 [tool.pixi.feature.test.tasks]
-test = "pytest ./tests ./fastcan/narx/tests"
-test-coverage = { cmd = "rm -rf .coverage && pytest --cov-report {{ FMT }} --cov={{ PACKAGE }} .", args = [{ arg = "PACKAGE", default = "fastcan" }, { arg = "FMT", default = "html" }] }
+test = "pytest"
+test-coverage = { cmd = "rm -rf .coverage && pytest --cov-report {{ FMT }} --cov={{ PACKAGE }}", args = [{ arg = "FMT", default = "html" }, { arg = "PACKAGE", default = "fastcan" }] }
 
 [tool.pixi.feature.build.tasks]
 build-wheel = "rm -rf dist && python -m build -wnx -Cinstall-args=--tags=runtime,python-runtime,devel"
@@ -192,6 +192,12 @@ static = { features = ["static"], no-default-feature = true }
 nogil = { features = ["nogil"], no-default-feature = true }
 wasm = { features = ["wasm"], no-default-feature = true }
 
+[tool.pytest.ini_options]
+testpaths = [
+    "./tests",
+    "./fastcan/narx/tests",
+]
+
 [tool.coverage.run]
 omit = ["**/tests/*"]