Merge pull request #352 from kozistr/feature/scion-optimizer

kozistr · web-flow · commit d18bb4bde96f · 2025-02-19T23:48:18.000+09:00
[Feature] Implement `SCION` optimizer
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@
 
 ## The reasons why you use `pytorch-optimizer`.
 
-* Wide range of supported optimizers. Currently, **98 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+* Wide range of supported optimizers. Currently, **99 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
 * Including many variants such as `ADOPT`, `Cautious`, `AdamD`, `StableAdamW`, and `Gradient Centrailiaztion`
 * Easy to use, clean, and tested codes
 * Active maintenance
@@ -206,6 +206,7 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | EXAdam        | *The Power of Adaptive Cross-Moments*                                                             | [github](https://github.com/AhmedMostafa16/EXAdam)                                                             | <https://arxiv.org/abs/2412.20302>                                                          | [cite](https://github.com/AhmedMostafa16/EXAdam?tab=readme-ov-file#citation)                                                        |
 | GCSAM         | *Gradient Centralized Sharpness Aware Minimization*                                               | [github](https://github.com/mhassann22/GCSAM)                                                                  | <https://arxiv.org/abs/2501.11584>                                                          | [cite](https://github.com/mhassann22/GCSAM?tab=readme-ov-file#citation)                                                             |
 | LookSAM       | *Towards Efficient and Scalable Sharpness-Aware Minimization*                                     | [github](https://github.com/rollovd/LookSAM)                                                                   | <https://arxiv.org/abs/2203.02714>                                                          | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220302714L/exportcitation)                                                        |
+| SCION         | *Training Deep Learning Models with Norm-Constrained LMOs*                                        |                                                                                                                | <https://arxiv.org/abs/2502.07529>                                                          | [cite](https://ui.adsabs.harvard.edu/abs/2025arXiv250207529P/exportcitation)                                                        |
 
 ## Supported LR Scheduler
 
diff --git a/docs/changelogs/v3.4.2.md b/docs/changelogs/v3.4.2.md
@@ -0,0 +1,14 @@
+### Change Log
+
+### Feature
+
+* Implement `SCION` optimizer. (#348, #352)
+    * [Training Deep Learning Models with Norm-Constrained LMOs](https://arxiv.org/abs/2502.07529)
+
+### Docs
+
+* Fix `AliG` optimizer visualization. (#350)
+
+### Contributions
+
+thanks to @AidinHamedi
diff --git a/docs/index.md b/docs/index.md
@@ -10,7 +10,7 @@
 
 ## The reasons why you use `pytorch-optimizer`.
 
-* Wide range of supported optimizers. Currently, **98 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+* Wide range of supported optimizers. Currently, **99 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
 * Including many variants such as `ADOPT`, `Cautious`, `AdamD`, `StableAdamW`, and `Gradient Centrailiaztion`
 * Easy to use, clean, and tested codes
 * Active maintenance
@@ -206,6 +206,7 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | EXAdam        | *The Power of Adaptive Cross-Moments*                                                             | [github](https://github.com/AhmedMostafa16/EXAdam)                                                             | <https://arxiv.org/abs/2412.20302>                                                          | [cite](https://github.com/AhmedMostafa16/EXAdam?tab=readme-ov-file#citation)                                                        |
 | GCSAM         | *Gradient Centralized Sharpness Aware Minimization*                                               | [github](https://github.com/mhassann22/GCSAM)                                                                  | <https://arxiv.org/abs/2501.11584>                                                          | [cite](https://github.com/mhassann22/GCSAM?tab=readme-ov-file#citation)                                                             |
 | LookSAM       | *Towards Efficient and Scalable Sharpness-Aware Minimization*                                     | [github](https://github.com/rollovd/LookSAM)                                                                   | <https://arxiv.org/abs/2203.02714>                                                          | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220302714L/exportcitation)                                                        |
+| SCION         | *Training Deep Learning Models with Norm-Constrained LMOs*                                        |                                                                                                                | <https://arxiv.org/abs/2502.07529>                                                          | [cite](https://ui.adsabs.harvard.edu/abs/2025arXiv250207529P/exportcitation)                                                        |
 
 ## Supported LR Scheduler
 
diff --git a/docs/optimizer.md b/docs/optimizer.md
@@ -336,6 +336,10 @@
     :docstring:
     :members:
 
+::: pytorch_optimizer.SCION
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.StableAdamW
     :docstring:
     :members:
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,10 +18,10 @@ keywords = [
     "GrokFast", "GSAM", "Kate", "Lamb", "LaProp", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MARS", "MSVAG",
     "Muno", "Nero", "NovoGrad", "OrthoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "PSGD", "QHAdam", "QHM",
     "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "GCSAM", "LookSAM", "ScheduleFreeSGD", "ScheduleFreeAdamW",
-    "ScheduleFreeRAdam", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH", "SPAM",
-    "SRMM", "StableAdamW", "SWATS", "TAM", "Tiger", "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine",
-    "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD",
-    "QGaLore",
+    "ScheduleFreeRAdam", "SCION", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SophiaH",
+    "SPAM", "SRMM", "StableAdamW", "SWATS", "TAM", "Tiger", "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal",
+    "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge",
+    "bitsandbytes", "WSD", "QGaLore",
 ]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py
@@ -58,6 +58,7 @@
     PNM,
     QHM,
     SAM,
+    SCION,
     SGDP,
     SGDW,
     SM3,
diff --git a/pytorch_optimizer/optimizer/__init__.py b/pytorch_optimizer/optimizer/__init__.py
@@ -81,6 +81,7 @@
 from pytorch_optimizer.optimizer.rotograd import RotoGrad
 from pytorch_optimizer.optimizer.sam import BSAM, GSAM, SAM, WSAM, LookSAM
 from pytorch_optimizer.optimizer.schedulefree import ScheduleFreeAdamW, ScheduleFreeRAdam, ScheduleFreeSGD
+from pytorch_optimizer.optimizer.scion import SCION
 from pytorch_optimizer.optimizer.sgd import ASGD, SGDW, AccSGD, SGDSaI, SignSGD
 from pytorch_optimizer.optimizer.sgdp import SGDP
 from pytorch_optimizer.optimizer.shampoo import ScalableShampoo, Shampoo
@@ -300,6 +301,7 @@ def load_optimizer(optimizer: str) -> OPTIMIZER:
     SPAM,
     Kron,
     EXAdam,
+    SCION,
     Ranger25,
 ]
 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
diff --git a/pytorch_optimizer/optimizer/scion.py b/pytorch_optimizer/optimizer/scion.py
@@ -0,0 +1,131 @@
+from typing import Literal
+
+import torch
+
+from pytorch_optimizer.base.exception import NoSparseGradientError
+from pytorch_optimizer.base.optimizer import BaseOptimizer
+from pytorch_optimizer.base.type import CLOSURE, DEFAULTS, LOSS, PARAMETERS
+from pytorch_optimizer.optimizer.shampoo_utils import zero_power_via_newton_schulz_5
+
+LMO_TYPE = Literal['spectral', 'sign', 'col_norm', 'row_norm']
+
+
+class SCION(BaseOptimizer):
+    r"""Training Deep Learning Models with Norm-Constrained LMOs.
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param lr: float. learning rate.
+    :param momentum: float. momentum factor. weight of the current gradient in the running average `d`.
+    :param constraint: bool. whether to use the constrained variant of SCG or not.
+    :param lmo_type: LMO_TYPE. supported LMO types.
+    :param weight_decay: float. weight decay (L2 penalty). only applied when `constraint` is False.
+    :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
+    """
+
+    def __init__(
+        self,
+        params: PARAMETERS,
+        lr: float = 1e-4,
+        momentum: float = 0.1,
+        constraint: bool = False,
+        lmo_type: LMO_TYPE = 'spectral',
+        weight_decay: float = 0.0,
+        weight_decouple: bool = True,
+        **kwargs,
+    ):
+        self.validate_learning_rate(lr)
+        self.validate_range(momentum, 'momentum', 0.0, 1.0, '(]')
+        self.validate_options(lmo_type, 'lmo_type', ['spectral', 'sign', 'col_norm', 'row_norm'])
+
+        defaults: DEFAULTS = {
+            'lr': lr,
+            'momentum': momentum,
+            'constraint': constraint,
+            'lmo_type': lmo_type,
+            'weight_decay': weight_decay,
+            'weight_decouple': weight_decouple,
+        }
+        super().__init__(params, defaults)
+
+    def __str__(self) -> str:
+        return 'SCION'
+
+    @torch.no_grad()
+    def reset(self):
+        r"""Re-initialize the gradient average `d` of every parameter with zeros."""
+        for group in self.param_groups:
+            for p in group['params']:
+                state = self.state[p]
+                state['d'] = torch.zeros_like(p)
+
+    @staticmethod
+    def get_lmo_direction(grad: torch.Tensor, lmo_type: str) -> torch.Tensor:
+        r"""Get LMO direction.
+
+        `spectral` and `row_norm` are only used for 2D tensors. every other case, including an unknown `lmo_type`,
+        falls back to the element-wise sign.
+
+        :param grad: torch.Tensor. tensor to compute the direction from.
+        :param lmo_type: str. one of `spectral`, `sign`, `col_norm`, `row_norm`.
+        """
+        if lmo_type == 'spectral' and grad.ndim == 2:
+            return zero_power_via_newton_schulz_5(grad)
+        if lmo_type == 'sign':
+            return torch.sign(grad)
+        if lmo_type == 'col_norm':
+            # 1e-6 keeps the division finite when a norm is zero.
+            return grad / torch.norm(grad, dim=0, keepdim=True).add_(1e-6)
+        if lmo_type == 'row_norm' and grad.ndim == 2:
+            return grad / torch.norm(grad, dim=1, keepdim=True).add_(1e-6)
+        return torch.sign(grad)
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        r"""Perform a single optimization step.
+
+        :param closure: CLOSURE. optional callable, evaluated with gradients enabled, whose result is returned.
+        """
+        loss: LOSS = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            step_size: float = -group['lr']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+                if grad.is_sparse:
+                    raise NoSparseGradientError(str(self))
+
+                state = self.state[p]
+                if 'd' not in state:
+                    state['d'] = torch.zeros_like(p)
+
+                # d <- (1 - momentum) * d + momentum * grad
+                d = state['d']
+                d.mul_(1.0 - group['momentum']).add_(grad, alpha=group['momentum'])
+
+                update = self.get_lmo_direction(d, group['lmo_type'])
+
+                if not group['constraint']:
+                    # NOTE(review): `grad` was already folded into `d` above, so a decay that only edits
+                    # `grad` would not affect `update` - confirm against `apply_weight_decay`.
+                    self.apply_weight_decay(
+                        p,
+                        grad,
+                        lr=group['lr'],
+                        weight_decay=group['weight_decay'],
+                        weight_decouple=group['weight_decouple'],
+                        fixed_decay=False,
+                    )
+
+                    p.add_(update, alpha=step_size)
+                else:
+                    # p <- (1 - lr) * p - lr * update. `step_size` is `-lr`, so `1.0 - step_size` would
+                    # scale the weights by `1 + lr` and make them grow at every step.
+                    p.mul_(1.0 - group['lr']).add_(update, alpha=step_size)
+
+        return loss
diff --git a/tests/constants.py b/tests/constants.py
@@ -15,6 +15,7 @@
     PID,
     PNM,
     QHM,
+    SCION,
     SGDP,
     SGDW,
     SM3,
@@ -563,6 +564,8 @@
     (FOCUS, {'lr': 1e-1, 'weight_decay': 1e-3}, 5),
     (Kron, {'lr': 1e0, 'weight_decay': 1e-3}, 3),
     (EXAdam, {'lr': 1e-1, 'weight_decay': 1e-3}, 5),
+    (SCION, {'lr': 5e-1, 'constraint': False, 'weight_decay': 1e-3}, 10),
+    (SCION, {'lr': 1e-1, 'constraint': True}, 10),
     (Ranger25, {'lr': 1e-1}, 3),
     (Ranger25, {'lr': 1e-1, 't_alpha_beta3': 5}, 3),
     (Ranger25, {'lr': 5e-2, 'stable_adamw': False, 'orthograd': False, 'eps': None, 'lookahead_merge_time': 2}, 3),
diff --git a/tests/test_general_optimizer_parameters.py b/tests/test_general_optimizer_parameters.py
@@ -57,6 +57,7 @@ def test_epsilon(optimizer_name):
         'focus',
         'kron',
         'sgd',
+        'scion',
     ):
         pytest.skip(f'skip {optimizer_name} optimizer')
 
@@ -86,6 +87,7 @@ def test_weight_decay(optimizer_name):
         'lomo',
         'ftrl',
         'muon',
+        'scion',
     ):
         pytest.skip(f'skip {optimizer_name} optimizer')
 
diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py
@@ -34,7 +34,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names):
 
 
 def test_get_supported_optimizers():
-    assert len(get_supported_optimizers()) == 95
+    assert len(get_supported_optimizers()) == 96
     assert len(get_supported_optimizers('adam*')) == 8
     assert len(get_supported_optimizers(['adam*', 'ranger*'])) == 11
 
diff --git a/tests/test_optimizer_parameters.py b/tests/test_optimizer_parameters.py
@@ -303,3 +303,11 @@ def test_load_wrapper_optimizer(optimizer_instance):
 
     state = optimizer.state_dict()
     optimizer.load_state_dict(state)
+
+
+def test_scion_lmo_direction():
+    x = torch.zeros((1, 1), dtype=torch.float32)  # 2D input, so the `spectral` / `row_norm` branches are taken
+
+    optimizer_instance = load_optimizer('SCION')
+    for lmo_direction in ('spectral', 'sign', 'col_norm', 'row_norm'):
+        optimizer_instance.get_lmo_direction(x, lmo_direction)  # smoke test: must not raise; output is unchecked

-Original file line number
+Diff line change
     PNM,
     QHM,
     SAM,
 +    SCION,
     SGDP,
     SGDW,
     SM3,