diff --git a/README.md b/README.md index ec739e0..a494297 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ The platforms and engines in this repository are **reference implementations** | Intel XPU | Data Center GPU Max / Arc | xccl (oneCCL) | ✅ Example (requires vendor support) | TBD | | Cambricon MLU | MLU370 / MLU590 | CNCL | ✅ Supported | [User Guide](docs/user_guide_mlu/README.md) | | MetaX | MetaX GPUs (CUDA-compatible) | NCCL | ✅ Example (requires vendor support) | TBD | +| Enflame GCU | GCU | ECCL / FlagCX | ✅ Example (requires vendor support) | [User Guide](docs/user_guide_enflame/README.md) | | Huawei NPU | Ascend 910B | HCCL | Built-in (verl core) | [Ascend Tutorial](https://github.com/verl-project/verl/tree/main/docs/ascend_tutorial) | @@ -47,11 +48,13 @@ verl-FL (main framework) ├── PlatformRegistry.register("intel") → PlatformXPU ├── PlatformRegistry.register("cambricon")→ PlatformMLU ├── PlatformRegistry.register("metax") → PlatformMetaX + ├── PlatformRegistry.register("enflame") → PlatformENFLAME ├── PlatformRegistry.register("flagos") → PlatformFlagOS │ ├── EngineRegistry.register(device="xpu", vendor="intel") ├── EngineRegistry.register(device="mlu", vendor="cambricon") ├── EngineRegistry.register(device="cuda", vendor="metax") + ├── EngineRegistry.register(device="gcu", vendor="enflame") └── EngineRegistry.register(device="cuda", vendor="flagos") ``` @@ -83,6 +86,7 @@ Each hardware platform provides a standalone user guide (following the structure - **[Cambricon MLU](docs/user_guide_mlu/README.md)** — Cambricon MLU370 / MLU590 user guide - **[MetaX GPU](docs/user_guide_metax/README.md)** — MetaX GPU user guide - **[FlagOS](docs/user_guide_flagos/README.md)** — FlagOS unified heterogeneous engine user guide ([NVIDIA](docs/user_guide_flagos/nvidia/README.md)) +- **[Enflame GCU](docs/user_guide_enflame/README.md)** — Enflame GCU user guide ### Developer Guides diff --git a/docs/development.md b/docs/development.md index c5d2b96..8d12d70 
100644 --- a/docs/development.md +++ b/docs/development.md @@ -689,6 +689,7 @@ Existing reference implementations: - `docs/user_guide_mlu/` — Cambricon MLU - `docs/user_guide_metax/` — MetaX - `docs/user_guide_flagos/` — FlagOS +- `docs/user_guide_enflame/` — Enflame GCU > **Tip**: Refer to `verl/docs/ascend_tutorial` (Huawei NPU) for documentation quality and coverage expectations. That tutorial covers installation, quick start, advanced features, performance tuning, precision analysis, and FAQ. @@ -898,6 +899,7 @@ The following files in this repository serve as examples: | Intel XPU | `platforms/platform_xpu.py` | `engines/fsdp_xpu.py`, `engines/megatron_xpu.py` | | Cambricon MLU | `platforms/platform_mlu.py` | `engines/fsdp_mlu.py`, `engines/megatron_mlu.py` | | MetaX | `platforms/platform_cuda_metax.py` | `engines/fsdp_metax.py`, `engines/megatron_metax.py` | +| Enflame GCU | `platforms/platform_enflame.py` | `engines/fsdp_enflame.py`, `engines/megatron_enflame.py` | --- diff --git a/docs/user_guide_enflame/README.md b/docs/user_guide_enflame/README.md new file mode 100644 index 0000000..52fe13f --- /dev/null +++ b/docs/user_guide_enflame/README.md @@ -0,0 +1,55 @@ +# Enflame GCU User Guide + +Last updated: 06/22/2026. + +## Introduction + +This document describes how to use verl for reinforcement learning training on Enflame GCU accelerators via `torch_gcu` and ECCL/FlagCX communication. 
+ +## Platform Summary + +| Item | Description | +|------|-------------| +| Device type | `gcu` | +| Vendor identifier | `enflame` | +| PyTorch API | `torch.gcu` (via `torch_gcu`) | +| Communication backend | `eccl` (default) or `flagcx` (when `USE_FLAGCX=1`) | +| Device visibility env var | `TOPS_VISIBLE_DEVICES` | +| Ray resource name | `GPU` (built-in) | +| IPC support | Reported as supported (`is_ipc_supported()` returns `True`) so that verl uses the device tensor path for weight transfer; Python SHM is unsupported | + +## Environment Variables + +```bash +export VERL_PLATFORM=enflame +export TOPS_VISIBLE_DEVICES=0,1,2,3 +export RAY_EXPERIMENTAL_NOSET_TOPS_VISIBLE_DEVICES=1 +export USE_FLAGCX=0 # use ECCL on homogeneous ENFLAME cluster +``` + +When using **verl (upstream) + verl_hardware_plugin** (not verl-FL built-in platform), +Ray workers do not inherit shell exports. Pass these through Hydra / `ray_init.runtime_env`: + +```bash ++ray_kwargs.ray_init.runtime_env.env_vars.VERL_PLATFORM='enflame' ++ray_kwargs.ray_init.runtime_env.env_vars.VERL_USE_EXTERNAL_MODULES='verl_hardware_plugin' ++ray_kwargs.ray_init.runtime_env.env_vars.RAY_EXPERIMENTAL_NOSET_TOPS_VISIBLE_DEVICES='1' +``` + +Verify before training: + +```bash +python -c "import verl_hardware_plugin; import verl; from verl.plugin.platform import get_platform; print(get_platform().device_name)" +# Expected: gcu (not cpu) +``` + +## Notes + +- `torch_gcu` may patch `torch.cuda.is_available()`; platform auto-detection probes `torch.gcu` before CUDA. +- FlagCX Stream compatibility is handled in `PlatformENFLAME.ensure_initialized()`. +- For Migration-based runtime patches, install the Migration package before importing verl. 
+ +## Related Documentation + +- [FlagOS User Guide](../user_guide_flagos/README.md) +- [Development Guide](../development.md) diff --git a/tests/test_plugin_registration.py b/tests/test_plugin_registration.py index 6254794..7c8da5b 100644 --- a/tests/test_plugin_registration.py +++ b/tests/test_plugin_registration.py @@ -66,6 +66,56 @@ def test_mlu_detection_with_env(self): with mock.patch.dict(os.environ, {"VERL_PLATFORM": "cambricon"}): assert _detect_platform_name() == "cambricon" + def test_enflame_registered(self): + from verl.plugin.platform.platform_manager import PlatformRegistry + from verl_hardware_plugin.platforms.platform_enflame import PlatformENFLAME # noqa: F401 + + assert "enflame" in PlatformRegistry.registered_names() + cls = PlatformRegistry.get("enflame") + assert cls is PlatformENFLAME + + def test_enflame_detection_with_env(self): + from verl.plugin.platform.platform_manager import _detect_platform_name + from verl_hardware_plugin.platforms.platform_enflame import PlatformENFLAME # noqa: F401 + + with _fresh_registries(): + with mock.patch.dict(os.environ, {"VERL_PLATFORM": "enflame"}): + assert _detect_platform_name() == "enflame" + + def test_enflame_device_and_vendor_names(self): + from verl_hardware_plugin.platforms.platform_enflame import PlatformENFLAME + + platform = PlatformENFLAME() + assert platform.device_name == "gcu" + assert platform.vendor_name == "enflame" + + def test_enflame_gcu_ipc_collect_shim(self): + from types import ModuleType + from unittest import mock + + import verl_hardware_plugin.platforms.platform_enflame as platform_enflame + + fake_gcu = ModuleType("gcu") + old_patched = platform_enflame._gcu_runtime_patched + try: + platform_enflame._gcu_runtime_patched = False + with mock.patch.object(platform_enflame, "_ensure_torch_gcu", return_value=True): + with mock.patch.object(platform_enflame.torch, "gcu", fake_gcu, create=True): + module = platform_enflame._get_gcu_module() + assert module is fake_gcu + assert 
callable(module.ipc_collect) + module.ipc_collect() + finally: + platform_enflame._gcu_runtime_patched = old_patched + + def test_enflame_communication_backend(self): + from verl_hardware_plugin.platforms.platform_enflame import PlatformENFLAME + + with mock.patch.dict(os.environ, {}, clear=True): + assert PlatformENFLAME().communication_backend_name() == "eccl" + with mock.patch.dict(os.environ, {"USE_FLAGCX": "1"}, clear=False): + assert PlatformENFLAME().communication_backend_name() == "flagcx" + def test_metax_detection_with_env(self): from verl.plugin.platform.platform_manager import _detect_platform_name from verl_hardware_plugin.platforms.platform_cuda_metax import PlatformMetaX # noqa: F401 @@ -148,6 +198,26 @@ def test_megatron_metax_engine_registered(self): assert EngineRegistry._engines["language_model"]["megatron"][("cuda", "metax")] is MegatronMetaXEngineWithLMHead + def test_fsdp_enflame_engines_registered(self): + from verl.workers.engine.base import EngineRegistry + from verl_hardware_plugin.engines.fsdp_enflame import ( + FSDPEnflameEngineWithLMHead, + FSDPEnflameEngineWithValueHead, + ) + + assert EngineRegistry._engines["language_model"]["fsdp"][("gcu", "enflame")] is FSDPEnflameEngineWithLMHead + assert EngineRegistry._engines["value_model"]["fsdp"][("gcu", "enflame")] is FSDPEnflameEngineWithValueHead + + def test_megatron_enflame_engine_registered(self): + from verl.workers.engine.base import EngineRegistry + from verl_hardware_plugin.engines.megatron_enflame import MegatronEnflameEngineWithLMHead + + assert ( + EngineRegistry._engines["language_model"]["megatron"][("gcu", "enflame")] + is MegatronEnflameEngineWithLMHead + ) + + class TestFLEnvManager: """Test FLEnvManager utility.""" diff --git a/verl_hardware_plugin/__init__.py b/verl_hardware_plugin/__init__.py index 7374194..07110de 100644 --- a/verl_hardware_plugin/__init__.py +++ b/verl_hardware_plugin/__init__.py @@ -3,7 +3,7 @@ """verl hardware plugin - Multi-chip platform and engine 
support. -This package registers hardware platforms (MetaX, XPU, MLU) and their +This package registers hardware platforms (MetaX, XPU, MLU, Enflame GCU) and their corresponding training engines with verl's plugin system. Discovered automatically via setuptools entry_points (verl.plugins group). diff --git a/verl_hardware_plugin/engines/__init__.py b/verl_hardware_plugin/engines/__init__.py index 29f506d..5dc139f 100755 --- a/verl_hardware_plugin/engines/__init__.py +++ b/verl_hardware_plugin/engines/__init__.py @@ -32,6 +32,58 @@ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) +def enflame_fsdp_engine_registered() -> bool: + """Return True when Enflame FSDP engines are present on the active EngineRegistry.""" + try: + from verl.workers.engine.base import EngineRegistry + + registry = EngineRegistry._engines.get("language_model", {}).get("fsdp", {}) + return ("gcu", "enflame") in registry or ("enflame", "enflame") in registry + except Exception: + return False + + +def ensure_enflame_engines_registered() -> None: + """Register Enflame engines on the current EngineRegistry if missing. + + Migration may reload ``verl.workers.engine.base`` after the plugin first + imported, which clears ``EngineRegistry._engines``. Re-import engine modules + when lookup keys are absent. 
+ """ + enflame_required = os.getenv("VERL_PLATFORM", "").strip().lower() == "enflame" + + if not enflame_fsdp_engine_registered(): + try: + from verl_hardware_plugin.engines import fsdp_enflame # noqa: F401 + + logger.info("Registered engines: fsdp_enflame") + except Exception as e: + if enflame_required: + logger.error("Failed to register Enflame FSDP engines (required): %s", e) + raise + logger.debug("ENFLAME FSDP engines not registered: %s", e) + + try: + from verl.workers.engine.base import EngineRegistry + + megatron_registry = EngineRegistry._engines.get("language_model", {}).get("megatron", {}) + if ("gcu", "enflame") not in megatron_registry and ("enflame", "enflame") not in megatron_registry: + from verl_hardware_plugin.engines import megatron_enflame # noqa: F401 + + logger.info("Registered engines: megatron_enflame") + except Exception as e: + if enflame_required: + logger.error("Failed to register Enflame Megatron engines (required): %s", e) + raise + logger.debug("ENFLAME Megatron engines not registered: %s", e) + + if enflame_required and not enflame_fsdp_engine_registered(): + raise RuntimeError( + "Enflame FSDP engine is not registered after ensure_enflame_engines_registered(). " + "Set VERL_LOGGING_LEVEL=DEBUG and check fsdp_enflame import errors." + ) + + def register_all_engines(): """Import all engine modules to trigger their @register decorators. @@ -105,3 +157,7 @@ def register_all_engines(): logger.info("Registered engines: megatron_metax") except Exception as e: logger.debug("MetaX Megatron engines not registered: %s", e) + + # Enflame GCU engines (ECCL/FlagCX communication) + ensure_enflame_engines_registered() + diff --git a/verl_hardware_plugin/engines/fsdp_enflame.py b/verl_hardware_plugin/engines/fsdp_enflame.py new file mode 100644 index 0000000..11d955f --- /dev/null +++ b/verl_hardware_plugin/engines/fsdp_enflame.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026 BAAI. All rights reserved. +# Licensed under the Apache License, Version 2.0. 
+ +"""FSDP engine for Enflame GCU devices.""" + +import logging +import os + +from verl.trainer.config import CheckpointConfig +from verl.workers.config import FSDPEngineConfig, FSDPOptimizerConfig, HFModelConfig +from verl.workers.engine.base import EngineRegistry +from verl.workers.engine.fsdp import FSDPEngineWithLMHead +from verl.workers.engine.fsdp.transformer_impl import FSDPEngineWithValueHead + +logger = logging.getLogger(__name__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +@EngineRegistry.register(model_type="language_model", backend=["fsdp", "fsdp2"], device="gcu", vendor="enflame") +class FSDPEnflameEngineWithLMHead(FSDPEngineWithLMHead): + """FSDP Engine for Enflame GCU with ECCL/FlagCX communication backend.""" + + def __init__( + self, + model_config: HFModelConfig, + engine_config: FSDPEngineConfig, + optimizer_config: FSDPOptimizerConfig, + checkpoint_config: CheckpointConfig, + ): + super().__init__(model_config, engine_config, optimizer_config, checkpoint_config) + logger.info("FSDPEnflameEngineWithLMHead initialized") + + def initialize(self): + super().initialize() + logger.info("FSDPEnflameEngineWithLMHead initialized for ENFLAME") + + +@EngineRegistry.register(model_type="value_model", backend=["fsdp", "fsdp2"], device="gcu", vendor="enflame") +class FSDPEnflameEngineWithValueHead(FSDPEngineWithValueHead): + """FSDP Engine for Enflame GCU value model training.""" + + def __init__( + self, + model_config: HFModelConfig, + engine_config: FSDPEngineConfig, + optimizer_config: FSDPOptimizerConfig, + checkpoint_config: CheckpointConfig, + ): + super().__init__(model_config, engine_config, optimizer_config, checkpoint_config) + logger.info("FSDPEnflameEngineWithValueHead initialized") + + def initialize(self): + super().initialize() diff --git a/verl_hardware_plugin/engines/megatron_enflame.py b/verl_hardware_plugin/engines/megatron_enflame.py new file mode 100644 index 0000000..3ed7ed9 --- /dev/null +++ 
b/verl_hardware_plugin/engines/megatron_enflame.py @@ -0,0 +1,22 @@ +# Copyright (c) 2026 BAAI. All rights reserved. +# Licensed under the Apache License, Version 2.0. + +"""Megatron engine for Enflame GCU devices.""" + +import logging +import os + +from verl.workers.engine.base import EngineRegistry +from verl.workers.engine.megatron.transformer_impl import MegatronEngineWithLMHead + +logger = logging.getLogger(__name__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +@EngineRegistry.register(model_type="language_model", backend="megatron", device="gcu", vendor="enflame") +class MegatronEnflameEngineWithLMHead(MegatronEngineWithLMHead): + """Megatron Engine for Enflame GCU with ECCL/FlagCX communication backend.""" + + def initialize(self): + super().initialize() + logger.info("MegatronEnflameEngineWithLMHead initialized for ENFLAME") diff --git a/verl_hardware_plugin/platforms/__init__.py b/verl_hardware_plugin/platforms/__init__.py index 0e296b7..d0641c5 100644 --- a/verl_hardware_plugin/platforms/__init__.py +++ b/verl_hardware_plugin/platforms/__init__.py @@ -62,3 +62,12 @@ def register_all_platforms(): logger.info("Registered platform: metax (cuda)") except Exception as e: logger.debug("MetaX platform not registered: %s", e) + + # Enflame GCU — requires torch_gcu + try: + from verl_hardware_plugin.platforms import platform_enflame # noqa: F401 + + logger.info("Registered platform: enflame (gcu)") + except Exception as e: + logger.debug("ENFLAME platform not registered: %s", e) + diff --git a/verl_hardware_plugin/platforms/platform_enflame.py b/verl_hardware_plugin/platforms/platform_enflame.py new file mode 100644 index 0000000..56f2378 --- /dev/null +++ b/verl_hardware_plugin/platforms/platform_enflame.py @@ -0,0 +1,232 @@ +# Copyright (c) 2026 BAAI. All rights reserved. +# Licensed under the Apache License, Version 2.0. + +"""Enflame GCU platform implementation. 
+ +Supports Enflame GCU accelerators via torch_gcu and ECCL/FlagCX communication +backends. + +Key design decisions for Enflame GCU: +- device_name: "gcu" (torch_gcu PyTorch API via torch.gcu.*) +- vendor_name: "enflame" (engine lookup vendor key) +- communication_backend: "flagcx" when USE_FLAGCX=1, otherwise "eccl" +- ray_resource_name: "GPU" (Ray maps ENFLAME workers to the built-in GPU resource) +- visible_devices_envvar: "TOPS_VISIBLE_DEVICES" +- is_ipc_supported: True (verl uses device-tensor/reduce_tensor path; Python SHM is unsupported on torch_gcu) +- ensure_initialized: loads torch_gcu and applies gcu runtime shims (ipc_collect, Stream) + +Prerequisites: +- torch_gcu must be installed (provides torch.gcu.* API) +- Enflame driver and runtime must be installed on the host + +Example usage: + export VERL_PLATFORM=enflame + export USE_FLAGCX=0 + python -m verl.trainer.main --config config.yaml +""" + +import logging +import os +from contextlib import contextmanager +from types import ModuleType +from typing import Any, Optional + +import torch + +from verl.plugin.platform.platform_base import PlatformBase +from verl.plugin.platform.platform_manager import PlatformRegistry + +logger = logging.getLogger(__name__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +def _ensure_torch_gcu() -> bool: + """Try to import torch_gcu so that torch.gcu becomes available.""" + if hasattr(torch, "gcu"): + return True + try: + import torch_gcu # noqa: F401 + + return hasattr(torch, "gcu") + except (ImportError, RuntimeError, AttributeError): + return False + + +_gcu_runtime_patched = False + + +def _patch_gcu_runtime(gcu: ModuleType) -> None: + """Apply torch.gcu compatibility shims required by verl/vLLM weight transfer.""" + global _gcu_runtime_patched + if _gcu_runtime_patched: + return + _gcu_runtime_patched = True + + # torch_gcu stubs cuda.ipc_collect but not gcu.ipc_collect; verl cleanup calls it. 
+ if not hasattr(gcu, "ipc_collect"): + gcu.ipc_collect = lambda: None + + stream_cls = getattr(gcu, "Stream", None) + if stream_cls is not None and not hasattr(stream_cls, "cuda_stream"): + if hasattr(stream_cls, "gcu_stream"): + try: + stream_cls.cuda_stream = property(lambda self: self.gcu_stream) + except TypeError: + logger.warning("Failed to patch torch.gcu.Stream.cuda_stream (Stream class is not mutable).") + + +def _get_gcu_module() -> ModuleType: + """Return the ``torch.gcu`` module, importing ``torch_gcu`` if needed.""" + if not _ensure_torch_gcu(): + raise RuntimeError("Enflame platform requires the 'torch_gcu' package. Please install it first.") + gcu = torch.gcu + _patch_gcu_runtime(gcu) + return gcu + + +def _gcu_vllm_runtime_env_vars() -> dict[str, str]: + """Env vars required for stable vLLM rollout on torch_gcu (verl 0.7.1 e2e parity).""" + return { + "NCCL_CUMEM_ENABLE": "0", + "TORCH_ECCL_AVOID_RECORD_STREAMS": "1", + "TORCHGCU_INDUCTOR_ENABLE": "0", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ENABLE_V1_MULTIPROCESSING": "0", + } + + +@PlatformRegistry.register(platform="enflame") +class PlatformENFLAME(PlatformBase): + """Platform backend for Enflame GCU accelerators. + + Registration key: "enflame" (``VERL_PLATFORM``) + Engines for this platform should register with: device="gcu", vendor="enflame" + + Note: torch_gcu may patch torch.cuda to return True on GCU devices, so + platform auto-detection must probe torch.gcu before CUDA. 
+ """ + + @property + def device_name(self) -> str: + return "gcu" + + @property + def vendor_name(self) -> str: + return "enflame" + + @property + def device_module(self) -> ModuleType: + return _get_gcu_module() + + def is_available(self) -> bool: + if not _ensure_torch_gcu(): + return False + try: + return torch.gcu.is_available() + except (ImportError, RuntimeError, AttributeError): + return False + + def is_platform_available(self, use_smi_check: bool = False) -> bool: + """Determine if the current machine has Enflame GCU hardware. + + Must be probed before CUDA because torch_gcu may patch torch.cuda. + """ + if not _ensure_torch_gcu(): + return False + if use_smi_check: + return True + try: + gcu = getattr(torch, "gcu", None) + is_available = getattr(gcu, "is_available", None) + return callable(is_available) and is_available() + except (ImportError, RuntimeError, AttributeError): + return False + + def current_device(self) -> int: + return _get_gcu_module().current_device() + + def device_count(self) -> int: + return _get_gcu_module().device_count() + + def set_device(self, device_index: int) -> None: + _get_gcu_module().set_device(device_index) + + def synchronize(self, device_index: Optional[int] = None) -> None: + if device_index is not None: + _get_gcu_module().synchronize(device_index) + else: + _get_gcu_module().synchronize() + + def manual_seed(self, seed: int) -> None: + _get_gcu_module().manual_seed(seed) + + def manual_seed_all(self, seed: int) -> None: + _get_gcu_module().manual_seed_all(seed) + + def set_allocator_settings(self, settings: str) -> None: + gcu = _get_gcu_module() + if hasattr(gcu, "memory") and hasattr(gcu.memory, "_set_allocator_settings"): + gcu.memory._set_allocator_settings(settings) + + def empty_cache(self) -> None: + _get_gcu_module().empty_cache() + + def get_device_capability(self, device_index: int = 0) -> tuple[Optional[int], Optional[int]]: + gcu = _get_gcu_module() + if hasattr(gcu, "get_device_capability"): + return 
gcu.get_device_capability(device_index) + return (None, None) + + def communication_backend_name(self) -> str: + if os.getenv("USE_FLAGCX", "").lower() in ("1", "true"): + return "flagcx" + return "eccl" + + def visible_devices_envvar(self) -> str: + return "TOPS_VISIBLE_DEVICES" + + def ray_resource_name(self) -> str: + return "GPU" + + def ray_resource_options(self, num_gpus: float) -> dict[str, Any]: + return {"num_gpus": num_gpus} + + def ray_noset_envvars(self) -> list[str]: + return ["RAY_EXPERIMENTAL_NOSET_TOPS_VISIBLE_DEVICES"] + + def is_ipc_supported(self) -> bool: + """Tell verl to avoid Python multiprocessing SHM for weight transfer. + + verl sets ``use_shm = not is_support_ipc()`` in vLLM rollout. torch_gcu + does not support the SHM fallback; return True so verl uses on-device + buffers with ``reduce_tensor`` instead. + """ + return True + + @contextmanager + def nvtx_range(self, msg: str): + logger.debug("NVTX range (no-op on ENFLAME): %s", msg) + yield + + def profiler_start(self) -> None: + pass + + def profiler_stop(self) -> None: + pass + + def apply_model_patches(self, model_type: str) -> None: + pass + + def rollout_env_vars(self) -> dict[str, str]: + return dict(_gcu_vllm_runtime_env_vars()) + + def get_collective_module(self) -> Any: + return None + + def cudart(self) -> Any: + return None + + def ensure_initialized(self) -> None: + """Eagerly load ``torch_gcu`` and apply runtime compatibility shims.""" + _get_gcu_module() + logger.debug("torch_gcu initialised by PlatformENFLAME.ensure_initialized()")