Enable auto early stopping
ClaudioSalvatoreArcidiacono committed Sep 9, 2023
1 parent 6862162 commit 340ac3c
Showing 5 changed files with 154 additions and 62 deletions.
4 changes: 1 addition & 3 deletions python-package/lightgbm/basic.py
@@ -2538,14 +2538,12 @@ def set_categorical_feature(
self : Dataset
Dataset with set categorical features.
"""
if self.categorical_feature == categorical_feature:
if self.categorical_feature == categorical_feature or categorical_feature == 'auto':
return self
if self.data is not None:
if self.categorical_feature is None:
self.categorical_feature = categorical_feature
return self._free_handle()
elif categorical_feature == 'auto':
return self
else:
if self.categorical_feature != 'auto':
_log_warning('categorical_feature in Dataset is overridden.\n'
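The net effect of this hunk: passing 'auto' now returns early before the `self.data is not None` branch, so an 'auto' call never frees the underlying handle. A minimal sketch of the resulting behavior (synthetic data, not part of the commit):

import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 4)
X[:, 0] = np.random.randint(0, 3, size=100)  # integer-coded categorical column
y = np.random.rand(100)

ds = lgb.Dataset(X, label=y, categorical_feature=[0]).construct()
# With this change, 'auto' means "keep whatever is already set": the call
# returns self immediately and never reaches _free_handle(), even after
# construct() has dropped the raw data.
same = ds.set_categorical_feature('auto')
assert same is ds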
3 changes: 3 additions & 0 deletions python-package/lightgbm/dask.py
@@ -1145,6 +1145,7 @@ def __init__(
random_state: Optional[Union[int, np.random.RandomState]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
validation_fraction: Optional[float] = 0.1,
client: Optional[Client] = None,
**kwargs: Any
):
@@ -1350,6 +1351,7 @@ def __init__(
random_state: Optional[Union[int, np.random.RandomState]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
validation_fraction: Optional[float] = 0.1,
client: Optional[Client] = None,
**kwargs: Any
):
@@ -1520,6 +1522,7 @@ def __init__(
random_state: Optional[Union[int, np.random.RandomState]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
validation_fraction: Optional[float] = 0.1,
client: Optional[Client] = None,
**kwargs: Any
):
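The Dask estimators only gain the new constructor argument in this file; how the hold-out split behaves on distributed data is not shown in this hunk. A hedged sketch of passing the keyword through, assuming a local dask.distributed cluster (cluster setup and data are illustrative, not from the commit):

import dask.array as da
import numpy as np
from dask.distributed import Client, LocalCluster

import lightgbm as lgb

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

X = da.from_array(np.random.rand(20_000, 10), chunks=(5_000, 10))
y = da.from_array(np.random.rand(20_000), chunks=(5_000,))

# validation_fraction is accepted like any other constructor argument and
# forwarded to the underlying sklearn estimator; the default (0.1) matches
# the sklearn signature added in this commit.
reg = lgb.DaskLGBMRegressor(n_estimators=100, validation_fraction=0.2, client=client)
reg.fit(X, y)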
37 changes: 23 additions & 14 deletions python-package/lightgbm/engine.py
@@ -455,11 +455,9 @@ def _make_n_folds(
nfold: int,
params: Dict[str, Any],
seed: int,
fpreproc: Optional[_LGBM_PreprocFunction],
stratified: bool,
shuffle: bool,
eval_train_metric: bool
) -> CVBooster:
) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
"""Make a n-fold list of Booster from random indices."""
full_data = full_data.construct()
num_data = full_data.num_data()
@@ -500,7 +498,16 @@ def _make_n_folds(
test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)]
train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
folds = zip(train_id, test_id)
return folds


def _make_cvbooster(
full_data: Dataset,
params: Dict[str, Any],
folds: Iterable[Tuple[np.ndarray, np.ndarray]],
fpreproc: Optional[_LGBM_PreprocFunction],
eval_train_metric: bool,
) -> CVBooster:
ret = CVBooster()
for train_idx, test_idx in folds:
train_set = full_data.subset(sorted(train_idx))
@@ -720,8 +727,10 @@ def cv(

results = defaultdict(list)
cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold,
params=params, seed=seed, fpreproc=fpreproc,
stratified=stratified, shuffle=shuffle,
params=params, seed=seed,
stratified=stratified, shuffle=shuffle)
cvbooster = _make_cvbooster(full_data=train_set, folds=cvfolds,
params=params, fpreproc=fpreproc,
eval_train_metric=eval_train_metric)

# setup callbacks
@@ -752,34 +761,34 @@

for i in range(num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=cvfolds,
cb(callback.CallbackEnv(model=cvbooster,
params=params,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=None))
cvfolds.update(fobj=fobj) # type: ignore[call-arg]
res = _agg_cv_result(cvfolds.eval_valid(feval)) # type: ignore[call-arg]
cvbooster.update(fobj=fobj) # type: ignore[call-arg]
res = _agg_cv_result(cvbooster.eval_valid(feval)) # type: ignore[call-arg]
for _, key, mean, _, std in res:
results[f'{key}-mean'].append(mean)
results[f'{key}-stdv'].append(std)
try:
for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=cvfolds,
cb(callback.CallbackEnv(model=cvbooster,
params=params,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=res))
except callback.EarlyStopException as earlyStopException:
cvfolds.best_iteration = earlyStopException.best_iteration + 1
for bst in cvfolds.boosters:
bst.best_iteration = cvfolds.best_iteration
cvbooster.best_iteration = earlyStopException.best_iteration + 1
for bst in cvbooster.boosters:
bst.best_iteration = cvbooster.best_iteration
for k in results:
results[k] = results[k][:cvfolds.best_iteration]
results[k] = results[k][:cvbooster.best_iteration]
break

if return_cvbooster:
results['cvbooster'] = cvfolds # type: ignore[assignment]
results['cvbooster'] = cvbooster # type: ignore[assignment]

return dict(results)
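The refactor separates fold-index generation from booster construction so that callers other than cv() can reuse the index generator, which is exactly what the sklearn fit() below does to carve out a validation split. A rough sketch of using the helper on its own (both functions are private, signatures as in this diff, data is synthetic):

import numpy as np
import lightgbm as lgb
from lightgbm.engine import _make_n_folds  # private helper, new signature per this commit

X = np.random.rand(1_000, 5)
y = np.random.rand(1_000)
train_set = lgb.Dataset(X, label=y)

# Ask for 5 folds but consume only the first (train_idx, test_idx) pair,
# i.e. a single 80/20 hold-out split.
folds = _make_n_folds(full_data=train_set, folds=None, nfold=5,
                      params={'objective': 'regression'}, seed=42,
                      stratified=False, shuffle=True)
train_idx, val_idx = next(folds)
valid_set = train_set.subset(sorted(val_idx))
train_subset = train_set.subset(sorted(train_idx))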
115 changes: 76 additions & 39 deletions python-package/lightgbm/sklearn.py
@@ -16,7 +16,7 @@
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
_LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
dt_DataTable, pd_DataFrame)
from .engine import train
from .engine import _make_n_folds, train

__all__ = [
'LGBMClassifier',
@@ -412,6 +412,7 @@ def __init__(
random_state: Optional[Union[int, np.random.RandomState]] = None,
n_jobs: Optional[int] = None,
importance_type: str = 'split',
validation_fraction: Optional[float] = 0.1,
**kwargs
):
r"""Construct a gradient boosting model.
@@ -491,6 +492,10 @@
The type of feature importance to be filled into ``feature_importances_``.
If 'split', result contains numbers of times the feature is used in a model.
If 'gain', result contains total gains of splits which use the feature.
validation_fraction : float or None, optional (default=0.1)
Proportion of training data to set aside as
validation data for early stopping. If None, early stopping is done on
the training data. Only used if early stopping is performed.
**kwargs
Other parameters for the model.
Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
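A usage sketch of the parameter documented above, mirroring the new tests at the bottom of this commit (synthetic data; the exact "auto" threshold behaviour depends on the _process_params logic shown further down):

import numpy as np
import lightgbm as lgb

X = np.random.rand(50_000, 10)
y = X[:, 0] + 0.1 * np.random.rand(50_000)

# early_stopping=True resolves to early_stopping_round=10; because no eval_set
# is passed, 20% of the rows are held out as the validation set.
reg = lgb.LGBMRegressor(n_estimators=1_000, early_stopping=True,
                        validation_fraction=0.2, random_state=42)
reg.fit(X, y)
print(reg.best_iteration_)  # set by early stopping on the internal hold-out split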
@@ -566,6 +571,7 @@ def __init__(
self._n_features_in: int = -1
self._classes: Optional[np.ndarray] = None
self._n_classes: int = -1
self.validation_fraction = validation_fraction
self.set_params(**kwargs)

def _more_tags(self) -> Dict[str, Any]:
@@ -668,9 +674,24 @@ def _process_params(self, stage: str) -> Dict[str, Any]:
params.pop('importance_type', None)
params.pop('n_estimators', None)
params.pop('class_weight', None)
params.pop("validation_fraction", None)

if isinstance(params['random_state'], np.random.RandomState):
params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)

params = _choose_param_value(
main_param_name="early_stopping_round",
params=params,
default_value="auto",
)
if params["early_stopping_round"] == "auto":
params["early_stopping_round"] = 10 if hasattr(self, "n_rows_train") and self.n_rows_train > 10000 else None

if params["early_stopping_round"] is True:
params["early_stopping_round"] = 10
elif params["early_stopping_round"] is False:
params["early_stopping_round"] = None

if self._n_classes > 2:
for alias in _ConfigAliases.get('num_class'):
params.pop(alias, None)
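Read together, the lines above resolve early_stopping_round in a fixed order. The helper below is a hypothetical restatement for illustration only; it is not part of the commit:

def resolve_early_stopping_round(value, n_rows_train):
    # Hypothetical restatement of the resolution rules in _process_params above.
    if value == "auto":
        # auto: enable 10-round early stopping only for larger training sets
        return 10 if n_rows_train is not None and n_rows_train > 10_000 else None
    if value is True:
        return 10
    if value is False:
        return None
    return value  # an explicit integer is kept as-is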
@@ -765,7 +786,6 @@ def fit(
params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric']
params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
params['metric'] = [metric for metric in params['metric'] if metric is not None]

if not isinstance(X, (pd_DataFrame, dt_DataTable)):
_X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
if sample_weight is not None:
@@ -789,44 +809,61 @@ def fit(
train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
init_score=init_score, categorical_feature=categorical_feature,
params=params)
self._n_rows_train = _X.shape[0]
if params["early_stopping_round"] == "auto":
params["early_stopping_round"] = 10 if self.n_rows_train > 10000 else None
if params["early_stopping_round"] is not None and eval_set is None:
if self.validation_fraction is not None:
n_splits = max(int(np.ceil(1 / self.validation_fraction)), 2)
stratified = isinstance(self, LGBMClassifier)
cvfolds = _make_n_folds(full_data=train_set, folds=None, nfold=n_splits,
params=params, seed=self.random_state,
stratified=stratified, shuffle=True)
train_idx, val_idx = next(cvfolds)
valid_set = train_set.subset(sorted(val_idx))
train_set = train_set.subset(sorted(train_idx))
else:
valid_set = train_set
valid_set = valid_set.construct()
valid_sets = [valid_set]

valid_sets: List[Dataset] = []
if eval_set is not None:

def _get_meta_data(collection, name, i):
if collection is None:
return None
elif isinstance(collection, list):
return collection[i] if len(collection) > i else None
elif isinstance(collection, dict):
return collection.get(i, None)
else:
raise TypeError(f"{name} should be dict or list")

if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
# reduce cost for prediction training data
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
if valid_class_weight is not None:
if isinstance(valid_class_weight, dict) and self._class_map is not None:
valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
if valid_weight is None or len(valid_weight) == 0:
valid_weight = valid_class_sample_weight
else:
valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
valid_group = _get_meta_data(eval_group, 'eval_group', i)
valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
group=valid_group, init_score=valid_init_score,
categorical_feature='auto', params=params)

valid_sets.append(valid_set)
else:
valid_sets: List[Dataset] = []
if eval_set is not None:
def _get_meta_data(collection, name, i):
if collection is None:
return None
elif isinstance(collection, list):
return collection[i] if len(collection) > i else None
elif isinstance(collection, dict):
return collection.get(i, None)
else:
raise TypeError(f"{name} should be dict or list")

if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
# reduce cost for prediction training data
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
if valid_class_weight is not None:
if isinstance(valid_class_weight, dict) and self._class_map is not None:
valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
if valid_weight is None or len(valid_weight) == 0:
valid_weight = valid_class_sample_weight
else:
valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
valid_group = _get_meta_data(eval_group, 'eval_group', i)
valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
group=valid_group, init_score=valid_init_score,
categorical_feature='auto', params=params)

valid_sets.append(valid_set)

if isinstance(init_model, LGBMModel):
init_model = init_model.booster_
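To summarize the branching added to fit() above: an explicit eval_set still wins; otherwise the estimator either splits off validation_fraction of the rows or, when validation_fraction is None, evaluates early stopping on the training data itself. A sketch of the two non-default paths, using the early_stopping alias that the new tests below also rely on (dataset choice is illustrative):

import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 1) No eval_set and validation_fraction=None: the training data itself is the
#    single validation set, so early stopping monitors the training metric.
clf = lgb.LGBMClassifier(n_estimators=500, early_stopping=True, validation_fraction=None)
clf.fit(X, y)

# 2) An explicit eval_set takes priority over any automatic split.
clf = lgb.LGBMClassifier(n_estimators=500, early_stopping=True)
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])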
57 changes: 51 additions & 6 deletions tests/python_package_test/test_sklearn.py
@@ -257,6 +257,51 @@ def test_binary_classification_with_custom_objective():
assert ret < 0.05


@pytest.mark.parametrize('use_weight', [True, False])
def test_binary_classification_with_auto_early_stopping(use_weight):

X, y = load_breast_cancer(return_X_y=True)
n_estimators = 1000
gbm = lgb.LGBMClassifier(
n_estimators=n_estimators, random_state=42, verbose=-1, early_stopping=True
)
weight = np.full_like(y, 2) if use_weight else None
gbm.fit(X, y, sample_weight=weight)
assert bool(gbm.best_iteration_)


def test_regression_with_auto_early_stopping():
X, y = make_synthetic_regression()
n_estimators = 1000
gbm = lgb.LGBMRegressor(
n_estimators=n_estimators,
random_state=42,
early_stopping=True,
verbose=-1,
)
gbm.fit(X, y)
assert bool(gbm.best_iteration_)

@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
def test_lambdarank_with_auto_early_stopping():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
gbm = lgb.LGBMRanker(
n_estimators=50, random_state=42, early_stopping=True
)
gbm.fit(
X_train,
y_train,
group=q_train,
eval_at=[1, 3],
callbacks=[
lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
]
)
assert bool(gbm.best_iteration_)


def test_dart():
X, y = make_synthetic_regression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -609,21 +654,21 @@ def test_pandas_categorical():
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y)
pred0 = gbm0.predict(X_test, raw_score=True)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test, raw_score=True)
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A'])
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=['A'])
pred2 = gbm2.predict(X_test, raw_score=True)
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = gbm3.predict(X_test, raw_score=True)
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test)
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
pred5 = gbm5.predict(X_test, raw_score=True)
gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10, random_state=42).fit(X, y, categorical_feature=[])
pred6 = gbm6.predict(X_test, raw_score=True)
with pytest.raises(AssertionError):
np.testing.assert_allclose(pred0, pred1)
