From 71460c274a927188710d1a76b059aa9bbc1c07ca Mon Sep 17 00:00:00 2001 From: sikai zhang Date: Tue, 14 Oct 2025 15:16:14 +0800 Subject: [PATCH 1/2] FEAT add max-candidates in make_narx --- .github/workflows/emscripten.yml | 2 +- .github/workflows/wheel.yml | 2 +- fastcan/narx/_feature.py | 60 ++++++++++++++++++++++-------- fastcan/narx/_utils.py | 21 ++++++++++- fastcan/narx/tests/test_narx.py | 63 +++++++++++++++++++++++++++++++- pixi.lock | 2 +- pyproject.toml | 10 ++++- 7 files changed, 137 insertions(+), 23 deletions(-) diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml index dfa4b70..a0aa9ff 100644 --- a/.github/workflows/emscripten.yml +++ b/.github/workflows/emscripten.yml @@ -9,7 +9,7 @@ jobs: steps: - uses: actions/checkout@v5 - name: Build WASM wheel - uses: pypa/cibuildwheel@v3.2.0 + uses: pypa/cibuildwheel@v3.2.1 env: CIBW_PLATFORM: pyodide - name: Upload package diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml index 14d80b5..dc5f965 100644 --- a/.github/workflows/wheel.yml +++ b/.github/workflows/wheel.yml @@ -33,7 +33,7 @@ jobs: steps: - uses: actions/checkout@v5 - name: Build wheels - uses: pypa/cibuildwheel@v3.2.0 + uses: pypa/cibuildwheel@v3.2.1 env: CIBW_SKIP: "*_i686 *_ppc64le *_s390x *_universal2 *-musllinux_* cp314t*" CIBW_PROJECT_REQUIRES_PYTHON: ">=3.10" diff --git a/fastcan/narx/_feature.py b/fastcan/narx/_feature.py index a97889e..5bd7028 100644 --- a/fastcan/narx/_feature.py +++ b/fastcan/narx/_feature.py @@ -6,6 +6,7 @@ # SPDX-License-Identifier: MIT import math +import warnings from itertools import combinations_with_replacement from numbers import Integral @@ -198,12 +199,16 @@ def make_poly_features(X, ids): None, Interval(Integral, 1, None, closed="left"), ], + "max_poly": [None, Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], }, prefer_skip_nested_validation=True, ) def make_poly_ids( n_features=1, degree=1, + max_poly=None, + random_state=None, ): 
"""Generate ids for polynomial features. (variable_index, variable_index, ...) @@ -217,6 +222,15 @@ def make_poly_ids( degree : int, default=1 The maximum degree of polynomial features. + max_poly : int, default=None + Maximum number of ids of polynomial features to generate. + Randomly selected by reservoir sampling. + If None, all possible ids are returned. + + random_state : int or RandomState instance, default=None + Used when `max_poly` is not None to subsample ids of polynomial features. + See :term:`Glossary <random_state>` for details. + Returns ------- ids : array-like of shape (n_outputs, degree) @@ -236,29 +250,45 @@ def make_poly_ids( [1, 2, 2], [2, 2, 2]]) """ - n_outputs = math.comb(n_features + degree, degree) - 1 - if n_outputs > np.iinfo(np.intp).max: + n_total = math.comb(n_features + degree, degree) - 1 + if n_total > np.iinfo(np.intp).max: msg = ( - "The output that would result from the current configuration would" - f" have {n_outputs} features which is too large to be" - f" indexed by {np.intp().dtype.name}." + "The current configuration would " + f"result in {n_total} features which is too large to be " + f"indexed by {np.intp().dtype.name}." ) raise ValueError(msg) - - ids = np.array( - list( - combinations_with_replacement( - range(n_features + 1), - degree, - ) + if n_total > 10_000_000: + warnings.warn( + "Total number of polynomial features is larger than 10,000,000! " + f"The current configuration would result in {n_total} features. 
" + "This may take a while.", + UserWarning, + ) + if max_poly is not None and max_poly < n_total: + # reservoir sampling + rng = np.random.default_rng(random_state) + reservoir = [] + for i, comb in enumerate( + combinations_with_replacement(range(n_features + 1), degree) + ): + if i < max_poly: + reservoir.append(comb) + else: + j = rng.integers(0, i + 1) + if j < max_poly: + reservoir[j] = comb + ids = np.array(reservoir) + else: + ids = np.array( + list(combinations_with_replacement(range(n_features + 1), degree)) ) - ) const_id = np.where((ids == 0).all(axis=1)) return np.delete(ids, const_id, 0) # remove the constant feature -def _valiate_time_shift_poly_ids( +def _validate_time_shift_poly_ids( time_shift_ids, poly_ids, n_samples=None, n_features=None, n_outputs=None ): if n_samples is None: @@ -496,7 +526,7 @@ def tp2fd(time_shift_ids, poly_ids): [[-1 1] [ 2 3]] """ - _time_shift_ids, _poly_ids = _valiate_time_shift_poly_ids( + _time_shift_ids, _poly_ids = _validate_time_shift_poly_ids( time_shift_ids, poly_ids, ) diff --git a/fastcan/narx/_utils.py b/fastcan/narx/_utils.py index 70b88ca..fd4667a 100644 --- a/fastcan/narx/_utils.py +++ b/fastcan/narx/_utils.py @@ -7,7 +7,11 @@ import numpy as np from scipy.stats import rankdata -from sklearn.utils import check_array, check_consistent_length, column_or_1d +from sklearn.utils import ( + check_array, + check_consistent_length, + column_or_1d, +) from sklearn.utils._param_validation import Interval, StrOptions, validate_params from sklearn.utils.validation import check_is_fitted @@ -132,6 +136,8 @@ def _get_term_str(term_feat_ids, term_delay_ids): Interval(Integral, 1, None, closed="left"), ], "fit_intercept": ["boolean"], + "max_candidates": [None, Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], "include_zero_delay": [None, "array-like"], "static_indices": [None, "array-like"], "refine_verbose": ["verbose"], @@ -155,6 +161,8 @@ def make_narx( poly_degree=1, *, 
fit_intercept=True, + max_candidates=None, + random_state=None, include_zero_delay=None, static_indices=None, refine_verbose=1, @@ -186,6 +194,15 @@ def make_narx( fit_intercept : bool, default=True Whether to fit the intercept. If set to False, intercept will be zeros. + max_candidates : int, default=None + Maximum number of candidate polynomial terms retained before selection. + Randomly selected by reservoir sampling. + If None, all candidates are considered. + + random_state : int or RandomState instance, default=None + Used when `max_candidates` is not None to subsample candidate terms. + See :term:`Glossary <random_state>` for details. + include_zero_delay : {None, array-like} of shape (n_features,) default=None Whether to include the original (zero-delay) features. @@ -306,6 +323,8 @@ def make_narx( poly_ids_all = make_poly_ids( time_shift_ids_all.shape[0], poly_degree, + max_poly=max_candidates, + random_state=random_state, ) poly_terms = make_poly_features(time_shift_vars, poly_ids_all) diff --git a/fastcan/narx/tests/test_narx.py b/fastcan/narx/tests/test_narx.py index 9eb2ef0..177f5c2 100644 --- a/fastcan/narx/tests/test_narx.py +++ b/fastcan/narx/tests/test_narx.py @@ -29,10 +29,18 @@ def test_narx_is_sklearn_estimator(): check_estimator(NARX(), expected_failed_checks=expected_failures) -def test_poly_ids(): - with pytest.raises(ValueError, match=r"The output that would result from the .*"): +def test_poly_ids(monkeypatch): + with pytest.raises(ValueError, match=r"The current configuration would .*"): make_poly_ids(10, 1000) + # Mock combinations_with_replacement to avoid heavy computation + monkeypatch.setattr( + "fastcan.narx._feature.combinations_with_replacement", + lambda *args, **kwargs: iter([[0, 0]]), + ) + with pytest.warns(UserWarning, match=r"Total number of polynomial features .*"): + make_poly_ids(18, 10) + def test_time_ids(): with pytest.raises(ValueError, match=r"The length of `include_zero_delay`.*"): @@ -553,6 +561,57 @@ def 
test_make_narx_refine_print(capsys): assert "No. of iterations: " in captured.out +def test_make_narx_max_candidates(): + """Test max_candidates and random_state in make_narx.""" + rng = np.random.default_rng(12345) + X = rng.random((100, 2)) + y = rng.random((100, 1)) + max_delay = 3 + poly_degree = 10 + n_terms_to_select = 5 + max_candidates = 20 + + # With the same random_state, the results should be identical + narx1 = make_narx( + X, + y, + n_terms_to_select=n_terms_to_select, + max_delay=max_delay, + poly_degree=poly_degree, + max_candidates=max_candidates, + random_state=123, + verbose=0, + ) + narx2 = make_narx( + X, + y, + n_terms_to_select=n_terms_to_select, + max_delay=max_delay, + poly_degree=poly_degree, + max_candidates=max_candidates, + random_state=123, + verbose=0, + ) + assert_array_equal(narx1.feat_ids, narx2.feat_ids) + assert_array_equal(narx1.delay_ids, narx2.delay_ids) + + # With different random_state, the results should be different + narx3 = make_narx( + X, + y, + n_terms_to_select=n_terms_to_select, + max_delay=max_delay, + poly_degree=poly_degree, + max_candidates=max_candidates, + random_state=456, + verbose=0, + ) + assert not np.array_equal(narx1.feat_ids, narx3.feat_ids) + + # Check if number of selected terms is correct + assert narx1.feat_ids.shape[0] == n_terms_to_select + + @pytest.mark.parametrize("max_delay", [1, 3, 7, 10]) def test_nan_split(max_delay): n_sessions = 10 diff --git a/pixi.lock b/pixi.lock index d56ed82..c837ddc 100644 --- a/pixi.lock +++ b/pixi.lock @@ -8351,7 +8351,7 @@ packages: - pypi: ./ name: fastcan version: 0.4.1 - sha256: 4cba5e10ba2470a292c43ba40897b2aa523bcd0d4d7ed2979ac6bd5dd81f4ce8 + sha256: 0c3bd4756f12a17bb430db6172b64124f1833e15085fa57c9d7e53801d2f30b5 requires_dist: - scikit-learn>=1.7.0,!=1.7.1 requires_python: '>=3.10' diff --git a/pyproject.toml b/pyproject.toml index c7091c5..36026ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,8 +143,8 @@ asv-publish = { cmd = "python -m asv 
publish", cwd = "asv_benchmarks" } asv-preview = { cmd = "python -m asv preview", cwd = "asv_benchmarks", depends-on = ["asv-publish"] } [tool.pixi.feature.test.tasks] -test = "pytest ./tests ./fastcan/narx/tests" -test-coverage = { cmd = "rm -rf .coverage && pytest --cov-report {{ FMT }} --cov={{ PACKAGE }} .", args = [{ arg = "PACKAGE", default = "fastcan" }, { arg = "FMT", default = "html" }] } +test = "pytest" +test-coverage = { cmd = "rm -rf .coverage && pytest --cov-report {{ FMT }} --cov={{ PACKAGE }}", args = [{ arg = "PACKAGE", default = "fastcan" }, { arg = "FMT", default = "html" }] } [tool.pixi.feature.build.tasks] build-wheel = "rm -rf dist && python -m build -wnx -Cinstall-args=--tags=runtime,python-runtime,devel" @@ -192,6 +192,12 @@ static = { features = ["static"], no-default-feature = true } nogil = { features = ["nogil"], no-default-feature = true } wasm = { features = ["wasm"], no-default-feature = true } +[tool.pytest.ini_options] +testpaths = [ + "./tests", + "./fastcan/narx/tests", +] + [tool.coverage.run] omit = ["**/tests/*"] From 421b775577138c6e91fe110169fb12373841b586 Mon Sep 17 00:00:00 2001 From: sikai zhang Date: Tue, 14 Oct 2025 15:32:27 +0800 Subject: [PATCH 2/2] fix codecov --- pixi.lock | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pixi.lock b/pixi.lock index c837ddc..41ec6cb 100644 --- a/pixi.lock +++ b/pixi.lock @@ -8351,7 +8351,7 @@ packages: - pypi: ./ name: fastcan version: 0.4.1 - sha256: 0c3bd4756f12a17bb430db6172b64124f1833e15085fa57c9d7e53801d2f30b5 + sha256: 07bc539901f32163cadb6d96549d0b169fcc4577e48a210c96fc82b04e953309 requires_dist: - scikit-learn>=1.7.0,!=1.7.1 requires_python: '>=3.10' diff --git a/pyproject.toml b/pyproject.toml index 36026ae..068cf12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -144,7 +144,7 @@ asv-preview = { cmd = "python -m asv preview", cwd = "asv_benchmarks", depends-o [tool.pixi.feature.test.tasks] test = "pytest" -test-coverage = { cmd = 
"rm -rf .coverage && pytest --cov-report {{ FMT }} --cov={{ PACKAGE }}", args = [{ arg = "PACKAGE", default = "fastcan" }, { arg = "FMT", default = "html" }] } +test-coverage = { cmd = "rm -rf .coverage && pytest --cov-report {{ FMT }} --cov={{ PACKAGE }}", args = [{ arg = "FMT", default = "html" }, { arg = "PACKAGE", default = "fastcan" }] } [tool.pixi.feature.build.tasks] build-wheel = "rm -rf dist && python -m build -wnx -Cinstall-args=--tags=runtime,python-runtime,devel"