Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions pymars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
# e.g., from ._basis import BasisFunction (if users need to interact with it directly)

__all__ = [
'Earth',
'EarthRegressor',
'EarthClassifier',
'CategoricalImputer',
'GLMEarth',
'EarthCV',
'plot_basis_functions',
'plot_residuals',
'plot_partial_dependence',
'plot_individual_conditional_expectation',
'get_model_explanation'
"Earth",
"EarthRegressor",
"EarthClassifier",
"CategoricalImputer",
"GLMEarth",
"EarthCV",
"plot_basis_functions",
"plot_residuals",
"plot_partial_dependence",
"plot_individual_conditional_expectation",
"get_model_explanation",
]
236 changes: 168 additions & 68 deletions pymars/_basis.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pymars/_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def fit(self, X, categorical_features):
def transform(self, X):
X_arr = np.asarray(X, dtype=object).copy()
if X_arr.ndim == 1:
X_arr = X_arr.reshape(-1,1)
X_arr = X_arr.reshape(-1, 1)
for idx, le in self.encoders.items():
col = X_arr[:, idx]
new_col = []
Expand Down
287 changes: 203 additions & 84 deletions pymars/_forward.py

Large diffs are not rendered by default.

85 changes: 51 additions & 34 deletions pymars/_missing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
Utilities for handling missing values in pymars.

Expand All @@ -13,7 +12,8 @@

logger = logging.getLogger(__name__)

def handle_missing_X(X, strategy='mean', allow_missing_for_some_strategies=False):

def handle_missing_X(X, strategy="mean", allow_missing_for_some_strategies=False):
"""
Handle missing values in the input feature matrix X.

Expand Down Expand Up @@ -50,29 +50,33 @@ def handle_missing_X(X, strategy='mean', allow_missing_for_some_strategies=False
# and contains non-numeric NaNs like None, or actual strings.
# This basic handler assumes numeric data primarily.
# More sophisticated handling for mixed types would be needed.
pass # For now, let it proceed, np.isnan will fail if not float.
pass # For now, let it proceed, np.isnan will fail if not float.

nan_present = np.isnan(X).any()

if not nan_present:
return X

if strategy == 'error':
if strategy == "error":
raise ValueError("Input X contains NaN values and strategy is 'error'.")

if strategy == 'pass_through':
if strategy == "pass_through":
if allow_missing_for_some_strategies:
return X # Basis functions must be able to handle NaNs
return X # Basis functions must be able to handle NaNs
else:
raise ValueError("Strategy 'pass_through' for NaNs requires model to be configured to allow missing values.")
raise ValueError(
"Strategy 'pass_through' for NaNs requires model to be configured to allow missing values."
)

X_processed = np.copy(X) # Work on a copy
X_processed = np.copy(X) # Work on a copy

if X_processed.ndim == 1: # Handle 1D array case
X_processed = X_processed.reshape(-1, 1) # Temporarily make it 2D for consistent processing
was_1d = True
if X_processed.ndim == 1: # Handle 1D array case
X_processed = X_processed.reshape(
-1, 1
) # Temporarily make it 2D for consistent processing
was_1d = True
else:
was_1d = False
was_1d = False

for j in range(X_processed.shape[1]):
col = X_processed[:, j]
Expand All @@ -81,30 +85,35 @@ def handle_missing_X(X, strategy='mean', allow_missing_for_some_strategies=False
if not nan_mask_col.any():
continue

if strategy == 'mean':
if strategy == "mean":
fill_value = np.nanmean(col)
elif strategy == 'median':
elif strategy == "median":
fill_value = np.nanmedian(col)
elif strategy == 'most_frequent':
elif strategy == "most_frequent":
# Simple approach for most_frequent with numbers
# For categorical, a more robust method (e.g., scipy.stats.mode) is needed
unique_vals, counts = np.unique(col[~nan_mask_col], return_counts=True)
if unique_vals.size > 0:
fill_value = unique_vals[np.argmax(counts)]
else: # All values were NaN
fill_value = 0 # Or some other default
else: # All values were NaN
fill_value = 0 # Or some other default
else:
raise ValueError(f"Unknown missing value strategy: {strategy}")

col[nan_mask_col] = fill_value

if was_1d and X_processed.shape[1] == 1:
X_processed = X_processed.ravel() # Convert back to 1D if original was 1D
X_processed = X_processed.ravel() # Convert back to 1D if original was 1D

return X_processed


def handle_missing_y(y, strategy='mean', allow_missing_for_some_strategies=False, problem_type='regression'):
def handle_missing_y(
y,
strategy="mean",
allow_missing_for_some_strategies=False,
problem_type="regression",
):
"""
Handle missing values in the target variable y.

Expand Down Expand Up @@ -137,40 +146,48 @@ def handle_missing_y(y, strategy='mean', allow_missing_for_some_strategies=False

nan_mask = np.isnan(y)
if not nan_mask.any():
return y, nan_mask # No NaNs
return y, nan_mask # No NaNs

if strategy is None: # Determine default based on problem type
strategy = 'mean' if problem_type == 'regression' else 'error'
if strategy is None: # Determine default based on problem type
strategy = "mean" if problem_type == "regression" else "error"

if strategy == 'error':
if strategy == "error":
raise ValueError("Target y contains NaN values and strategy is 'error'.")

if strategy == 'remove_samples':
if strategy == "remove_samples":
# This strategy implies X also needs to be filtered.
# The function calling this should handle that synchronization.
# Here, we just return the filtered y and the mask of what *was* NaN.
return y[~nan_mask], nan_mask

y_processed = np.copy(y)

if strategy == 'mean':
if problem_type == 'classification':
if strategy == "mean":
if problem_type == "classification":
raise ValueError("Cannot use 'mean' imputation for classification target.")
fill_value = np.nanmean(y_processed)
elif strategy == 'median':
if problem_type == 'classification':
raise ValueError("Cannot use 'median' imputation for classification target.")
elif strategy == "median":
if problem_type == "classification":
raise ValueError(
"Cannot use 'median' imputation for classification target."
)
fill_value = np.nanmedian(y_processed)
elif strategy == 'most_frequent':
elif strategy == "most_frequent":
unique_vals, counts = np.unique(y_processed[~nan_mask], return_counts=True)
if unique_vals.size > 0:
fill_value = unique_vals[np.argmax(counts)]
else: # All values were NaN
fill_value = 0 if problem_type == 'regression' else (y_processed.dtype.type(0) if np.issubdtype(y_processed.dtype, np.integer) else 0.0) # Default
else: # All values were NaN
fill_value = (
0
if problem_type == "regression"
else (
y_processed.dtype.type(0)
if np.issubdtype(y_processed.dtype, np.integer)
else 0.0
)
) # Default
else:
raise ValueError(f"Unknown missing value strategy for y: {strategy}")

y_processed[nan_mask] = fill_value
return y_processed, nan_mask


Loading
Loading