diff --git a/docs/source/user_guide/modeling_gcm/model_evaluation.rst b/docs/source/user_guide/modeling_gcm/model_evaluation.rst
index abea8881cc..df7c242805 100644
--- a/docs/source/user_guide/modeling_gcm/model_evaluation.rst
+++ b/docs/source/user_guide/modeling_gcm/model_evaluation.rst
@@ -81,7 +81,7 @@ the chain structure example X→Y→Z again:
 
     If non-root node and the data is categorical:
     A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
-    Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected.
+    Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model. Here, different model classes are evaluated using the log loss metric and the best performing model class is selected.
 
     In total, 3 nodes were analyzed:
 
diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py
index 0adef5ca8e..b888af94d8 100644
--- a/dowhy/gcm/auto.py
+++ b/dowhy/gcm/auto.py
@@ -11,7 +11,6 @@
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
-from sklearn.preprocessing import MultiLabelBinarizer
 
 from dowhy.gcm import config
 from dowhy.gcm.causal_mechanisms import AdditiveNoiseModel, ClassifierFCM, DiscreteAdditiveNoiseModel
@@ -30,6 +29,7 @@
 )
 from dowhy.gcm.ml.classification import (
     create_ada_boost_classifier,
+    create_decision_tree_classifier,
     create_extra_trees_classifier,
     create_gaussian_nb_classifier,
     create_knn_classifier,
@@ -55,8 +55,9 @@
 from dowhy.graph import get_ordered_predecessors, is_root_node
 
 _LIST_OF_POTENTIAL_CLASSIFIERS_GOOD = [
-    partial(create_logistic_regression_classifier, max_iter=10000),
     create_hist_gradient_boost_classifier,
+    partial(create_logistic_regression_classifier, max_iter=10000),
+    create_decision_tree_classifier,
 ]
 _LIST_OF_POTENTIAL_REGRESSORS_GOOD = [
     create_linear_regressor,
@@ -152,9 +153,8 @@ def __str__(self):
             summary_strings.append(
                 "A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).\n"
                 "Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a "
-                "class (category) using the conditional probability distribution produced by a "
-                "classification model."
-                "Here, different model classes are evaluated using the (negative) F1 score and the best"
+                "class (category) using the conditional probability distribution produced by a classification model. "
+                "Here, different model classes are evaluated using the log loss metric and the best"
                 " performing model class is selected."
             )
         summary_strings.append("\nIn total, %d nodes were analyzed:" % len(list(self._nodes)))
@@ -223,7 +223,7 @@ def assign_causal_mechanisms(
         A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
         Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the
         conditional probability distribution produced by a classification model. Here, different model classes are evaluated
-        using the (negative) F1 score and the best performing model class is selected.
+        using the log loss metric and the best performing model class is selected.
 
     The current model zoo is:
 
@@ -528,20 +528,13 @@ def find_best_model(
     metric_name = "given"
 
     if metric is None:
-        metric_name = "(negative) F1"
         if is_classification_problem:
-            metric = lambda y_true, y_preds: -metrics.f1_score(
-                y_true, y_preds, average="macro", zero_division=0
-            )  # Higher score is better
+            metric_name = "log loss"
+            metric = metrics.log_loss  # Lower score is better (better calibrated probabilities)
         else:
             metric_name = "mean squared error (MSE)"
             metric = metrics.mean_squared_error
 
-    labelBinarizer = None
-    if is_classification_problem:
-        labelBinarizer = MultiLabelBinarizer()
-        labelBinarizer.fit(Y)
-
     if is_classification_problem:
         if len(np.unique(Y)) == 1:
             raise ValueError(
@@ -559,20 +552,29 @@ def estimate_average_score(prediction_model_factory: Callable[[], PredictionMode
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=ConvergenceWarning)
+            warnings.filterwarnings("ignore", category=RuntimeWarning)
 
             for train_indices, test_indices in kfolds:
-                if is_classification_problem and len(np.unique(Y[train_indices[:max_samples_per_split]])) == 1:
-                    continue
+                if is_classification_problem:
+                    unique_training_labels = np.unique(Y[train_indices[:max_samples_per_split]])
+                    unique_test_labels = np.unique(Y[test_indices[:max_samples_per_split]])
+                    if len(unique_training_labels) == 1 or len(unique_test_labels) == 1:
+                        continue
 
                 model_instance = prediction_model_factory()
                 model_instance.fit(X[train_indices[:max_samples_per_split]], Y[train_indices[:max_samples_per_split]])
 
                 y_true = Y[test_indices[:max_samples_per_split]]
-                y_pred = model_instance.predict(X[test_indices[:max_samples_per_split]])
-                if labelBinarizer is not None:
-                    y_true = labelBinarizer.transform(y_true)
-                    y_pred = labelBinarizer.transform(y_pred)
-                average_result.append(metric(y_true, y_pred))
+                if is_classification_problem:
+                    # For classification, use probabilities for log loss calculation
+                    y_pred_proba = model_instance.predict_probabilities(X[test_indices[:max_samples_per_split]])
+                    # Convert string labels to label indices for log_loss
+                    label_to_idx = {label: idx for idx, label in enumerate(unique_test_labels)}
+                    y_true_indices = np.array([label_to_idx[label] for label in y_true.flatten()])
+                    average_result.append(metric(y_true_indices, y_pred_proba))
+                else:
+                    y_pred = model_instance.predict(X[test_indices[:max_samples_per_split]])
+                    average_result.append(metric(y_true, y_pred))
 
         if len(average_result) == 0:
             return float("inf")
diff --git a/dowhy/gcm/ml/classification.py b/dowhy/gcm/ml/classification.py
index 105b022465..18283913a6 100644
--- a/dowhy/gcm/ml/classification.py
+++ b/dowhy/gcm/ml/classification.py
@@ -6,6 +6,7 @@
 from packaging import version
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import PolynomialFeatures
+from sklearn.tree import DecisionTreeClassifier
 
 from dowhy.gcm.ml.prediction_model import PredictionModel
 
@@ -107,3 +108,7 @@ def create_polynom_logistic_regression_classifier(
             PolynomialFeatures(degree=degree, include_bias=False), LogisticRegression(**kwargs_logistic_regression)
         )
     )
+
+
+def create_decision_tree_classifier() -> SklearnClassificationModel:
+    return SklearnClassificationModel(DecisionTreeClassifier())
diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py
index b36074f97a..1e28231087 100644
--- a/tests/gcm/test_auto.py
+++ b/tests/gcm/test_auto.py
@@ -9,6 +9,7 @@
 from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, LogisticRegression, RidgeCV
 from sklearn.naive_bayes import GaussianNB
 from sklearn.pipeline import Pipeline
+from sklearn.tree import DecisionTreeClassifier
 
 from dowhy import gcm
 from dowhy.gcm import (
@@ -40,19 +41,84 @@ def _generate_non_linear_regression_data():
 
 
 def _generate_linear_classification_data():
-    X = np.random.normal(0, 1, (1000, 5))
+    X = np.random.normal(0, 1, (100, 5))
     Y = (np.sum(X * np.random.uniform(-5, 5, X.shape[1]), axis=1) > 0).astype(str)
 
     return X, Y
 
 
-def _generate_non_classification_data():
+def _generate_non_linear_classification_data():
     X = np.random.normal(0, 1, (1000, 5))
     Y = (np.sum(np.exp(X), axis=1) > np.median(np.sum(np.exp(X), axis=1))).astype(str)
 
     return X, Y
 
 
+def _generate_linear_multiclass_classification_data_with_mixed_features():
+    """Generate multi-class classification data with mixed categorical and numerical features (linear relationship)."""
+    n_samples = 100
+
+    # Numerical features
+    num_feat1 = np.random.normal(0, 1, n_samples)
+    num_feat2 = np.random.normal(0, 1, n_samples)
+
+    # Categorical features
+    cat_feat1 = np.random.choice(["TypeA", "TypeB"], n_samples)
+    cat_feat2 = np.random.choice(["Group1", "Group2", "Group3"], n_samples)
+
+    # Create target variable based on linear combination of features
+    # Convert categorical to numerical for decision making
+    cat1_numeric = np.where(cat_feat1 == "TypeA", 1, -1)
+    cat2_numeric = np.where(cat_feat2 == "Group1", 2, np.where(cat_feat2 == "Group2", 0, -2))
+
+    # Linear combination to determine class
+    decision_value = 2 * num_feat1 + 1.5 * num_feat2 + 0.8 * cat1_numeric + 0.5 * cat2_numeric
+
+    # Convert to 3 classes
+    Y = np.where(decision_value > 1, "Class_A", np.where(decision_value > -1, "Class_B", "Class_C"))
+
+    # Combine features
+    X = np.column_stack([num_feat1, num_feat2, cat_feat1, cat_feat2])
+
+    return X, Y
+
+
+def _generate_non_linear_multiclass_classification_data_with_mixed_features():
+    """Generate multi-class classification data with mixed categorical and numerical features (non-linear relationship)."""
+    n_samples = 1000
+
+    # Numerical features
+    num_feat1 = np.random.normal(0, 1, n_samples)
+    num_feat2 = np.random.normal(0, 1, n_samples)
+
+    # Categorical features
+    cat_feat1 = np.random.choice(["TypeA", "TypeB"], n_samples)
+    cat_feat2 = np.random.choice(["Group1", "Group2", "Group3"], n_samples)
+
+    # Create target variable based on non-linear combination of features
+    # Convert categorical to numerical for decision making
+    cat1_numeric = np.where(cat_feat1 == "TypeA", 1, -1)
+    cat2_numeric = np.where(cat_feat2 == "Group1", 2, np.where(cat_feat2 == "Group2", 0, -2))
+
+    # Non-linear combination: use exponentials and products
+    decision_value = (
+        np.exp(num_feat1 * 0.5)
+        + np.sin(num_feat2 * 2)
+        + num_feat1 * num_feat2 * 0.3
+        + cat1_numeric * np.exp(num_feat2 * 0.2)
+        + cat2_numeric * np.cos(num_feat1)
+    )
+
+    # Convert to 3 classes based on percentiles
+    p33, p67 = np.percentile(decision_value, [33, 67])
+    Y = np.where(decision_value > p67, "Class_A", np.where(decision_value > p33, "Class_B", "Class_C"))
+
+    # Combine features
+    X = np.column_stack([num_feat1, num_feat2, cat_feat1, cat_feat2])
+
+    return X, Y
+
+
 @flaky(max_runs=3)
 def test_given_linear_regression_problem_when_auto_assign_causal_models_with_good_quality_returns_linear_model():
     X, Y = _generate_linear_regression_data()
@@ -148,7 +214,7 @@ def test_given_linear_classification_problem_when_auto_assign_causal_models_with
 
 @flaky(max_runs=3)
 def test_given_non_linear_classification_problem_when_auto_assign_causal_models_with_good_quality_returns_non_linear_model():
-    X, Y = _generate_non_classification_data()
+    X, Y = _generate_non_linear_classification_data()
 
     causal_model = ProbabilisticCausalModel(
         nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y"), ("X4", "Y")])
@@ -164,7 +230,7 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
 
 @flaky(max_runs=3)
 def test_given_non_linear_classification_problem_when_auto_assign_causal_models_with_better_quality_returns_non_linear_model():
-    X, Y = _generate_non_classification_data()
+    X, Y = _generate_non_linear_classification_data()
 
     causal_model = ProbabilisticCausalModel(
         nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y"), ("X4", "Y")])
@@ -384,7 +450,7 @@ def test_given_continuous_data_when_print_auto_summary_then_returns_expected_for
 
 If non-root node and the data is categorical:
 A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
-Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected.
+Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model. Here, different model classes are evaluated using the log loss metric and the best performing model class is selected.
 
 In total, 6 nodes were analyzed:
 
@@ -459,7 +525,7 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo
 
 If non-root node and the data is categorical:
 A functional causal model based on a classifier, i.e., X_i = f(PA_i, N_i).
-Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model.Here, different model classes are evaluated using the (negative) F1 score and the best performing model class is selected.
+Here, N_i follows a uniform distribution on [0, 1] and is used to randomly sample a class (category) using the conditional probability distribution produced by a classification model. Here, different model classes are evaluated using the log loss metric and the best performing model class is selected.
 
 In total, 6 nodes were analyzed:
 
@@ -483,7 +549,7 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo
         in summary_string
     )
     assert "This represents the causal relationship as Y := f(X0,X1,X2,X3,X4,N)." in summary_string
-    assert "For the model selection, the following models were evaluated on the (negative) F1 metric:" in summary_string
+    assert "For the model selection, the following models were evaluated on the log loss metric:" in summary_string
    assert (
        """===Note===
 Note, based on the selected auto assignment quality, the set of evaluated models changes.
@@ -602,3 +668,27 @@ def test_given_missing_data_mixed_numerical_and_categorical_when_auto_assign_mec
 
     # Just check if it doesn't raise errors.
     gcm.intrinsic_causal_influence(causal_model, "Z")
+
+
+@flaky(max_runs=3)
+def test_given_linear_multiclass_mixed_features_when_auto_assign_causal_models_with_good_quality_returns_linear_model():
+    X, Y = _generate_linear_multiclass_classification_data_with_mixed_features()
+
+    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y")]))
+    data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
+    data.update({"Y": Y})
+    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
+    assert isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, LogisticRegression)
+
+
+@flaky(max_runs=3)
+def test_given_non_linear_multiclass_mixed_features_when_auto_assign_causal_models_with_good_quality_returns_non_linear_model():
+    X, Y = _generate_non_linear_multiclass_classification_data_with_mixed_features()
+
+    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y"), ("X3", "Y")]))
+    data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
+    data.update({"Y": Y})
+    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
+    assert isinstance(
+        causal_model.causal_mechanism("Y").classifier_model.sklearn_model, DecisionTreeClassifier
+    ) or isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier)
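Not part of the patch: the following is a minimal, self-contained sketch of the selection criterion the diff switches to, namely comparing candidate classifiers by cross-validated log loss on their predicted probabilities and keeping the lowest average score. The synthetic data, the candidate set (logistic regression vs. decision tree), and the 5-fold split are illustrative assumptions; it uses plain scikit-learn rather than dowhy's internal find_best_model helpers.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Hypothetical two-class data; replace with your own features/labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
y = np.where(X[:, 0] + X[:, 1] > 0, "Class_A", "Class_B")

candidates = {
    "logistic_regression": lambda: LogisticRegression(max_iter=10000),
    "decision_tree": lambda: DecisionTreeClassifier(),
}

scores = {}
for name, make_model in candidates.items():
    fold_losses = []
    for train_idx, test_idx in StratifiedKFold(n_splits=5).split(X, y):
        model = make_model().fit(X[train_idx], y[train_idx])
        proba = model.predict_proba(X[test_idx])
        # Lower log loss is better; passing labels=model.classes_ keeps the
        # probability columns aligned with the true label encoding.
        fold_losses.append(log_loss(y[test_idx], proba, labels=model.classes_))
    scores[name] = float(np.mean(fold_losses))

best = min(scores, key=scores.get)
print(scores, "->", best)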