NVIDIA-BioNeMo · jstjohn · Jun 2, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
@@ -10,7 +10,26 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements.txt,target=/workspace/requirements.txt \
     PIP_CONSTRAINT= pip install -r /workspace/requirements.txt
 
+# Sandboxed agent CLIs use these helpers on Linux.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bubblewrap \
+    uidmap \
+    && rm -rf /var/lib/apt/lists/*
+
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/
+
+# Install Node.js 22 LTS and the OpenAI Codex CLI.
+RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/* \
+    && npm install -g --no-fund --no-audit @openai/codex \
+    && npm cache clean --force
+
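+# Default Codex to Landlock where nested namespaces are restricted. NOTE(review): the devcontainer mounts list binds the host ~/.codex onto /home/ubuntu/.codex, which would shadow this baked-in config.toml at runtime; confirm the setting still takes effect.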
+RUN mkdir -p /home/ubuntu/.codex \
+    && printf '[features]\nuse_legacy_landlock = true\n' > /home/ubuntu/.codex/config.toml \
+    && chown -R ubuntu:ubuntu /home/ubuntu/.codex
+
 USER ubuntu
 RUN curl https://cursor.com/install -fsS | bash  # Install cursor-agent CLI tool
 RUN curl -fsSL https://claude.ai/install.sh | bash  # Install Claude CLI tool

@@ -8,6 +8,7 @@
         "source=${localEnv:HOME}/.cache,target=/home/ubuntu/.cache,type=bind,consistency=cached",
         "source=${localEnv:HOME}/.claude,target=/home/ubuntu/.claude,type=bind,consistency=cached",
         "source=${localEnv:HOME}/.claude.json,target=/home/ubuntu/.claude.json,type=bind,consistency=cached",
+        "source=${localEnv:HOME}/.codex,target=/home/ubuntu/.codex,type=bind,consistency=cached",
         "source=${localEnv:HOME}/.config,target=/home/ubuntu/.config,type=bind,consistency=cached",
         "source=${localEnv:HOME}/.cursor,target=/home/ubuntu/.cursor,type=bind,consistency=cached",
         "source=${localEnv:HOME}/.gnupg,target=/home/ubuntu/.gnupg,type=bind,consistency=cached",

@@ -8,6 +8,7 @@ mkdir -p ~/.gnupg
 mkdir -p ~/.config
 mkdir -p ~/.cursor
 mkdir -p ~/.claude
+mkdir -p ~/.codex
 [ ! -f ~/.netrc ] && touch ~/.netrc
 
 [ ! -f ~/.bash_history_devcontainer ] && touch ~/.bash_history_devcontainer

@@ -9,7 +9,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 DEVCONTAINER_JSON="${SCRIPT_DIR}/devcontainer.json"
 CONTAINER_NAME="${BIONEMO_CONTAINER_NAME:-bionemo-devcontainer}"
-IMAGE_NAME="${BIONEMO_IMAGE_NAME:-bionemo-devcontainer:latest}"
+IMAGE_NAME="${BIONEMO_IMAGE_NAME:-${CONTAINER_NAME}:latest}"
 
 # ---------------------------------------------------------------------------
 # Helpers

@@ -5,7 +5,7 @@
 rm -f /usr/local/lib/python*/dist-packages/transformer_engine-*.dist-info/direct_url.json
 export UV_LOCK_TIMEOUT=900  # increase to 15 minutes (900 seconds), adjust as needed
 export UV_LINK_MODE=copy
-uv venv --system-site-packages
+uv venv --clear --system-site-packages
 
 # 2. Activate the environment
 source .venv/bin/activate
@@ -38,8 +38,8 @@ for pkg_dir in "$RECIPE_ROOT/../../../sub-packages/bionemo-recipeutils" "$RECIPE
     fi
 done
 
-# 6. Install the recipe with all remaining dependencies
-uv pip install -c pip-constraints.txt -e . --no-build-isolation
+# 6. Install the recipe with all remaining dependencies, including test extras
+uv pip install -c pip-constraints.txt -e '.[test]' --no-build-isolation
 
 # 7. Restore original pyproject.toml (the edit was only needed for uv resolution)
 mv pyproject.toml.ci_bak pyproject.toml
@@ -1,3 +1,5 @@
 poetry-core
+poetry_dynamic_versioning  # build dep of nvidia-resiliency-ext (transitively pulled by megatron-bridge); needed in the venv because we install with --no-build-isolation
+grpcio-tools  # build dep of nvidia-resiliency-ext: its setup.py shells out to `python -m grpc_tools.protoc` to compile *.proto files; --no-build-isolation means we have to provide it in the venv up-front
 wheel_stub
 ninja  # should speed up causal-conv1d build
@@ -24,7 +24,7 @@ dependencies = [
     "causal_conv1d",
     "nv-grouped-gemm",
     "megatron-core",
-    "nvidia-resiliency-ext",
+    # nvidia-resiliency-ext is pulled transitively by megatron-bridge.
     "emerging_optimizers",
     "subquadratic-ops-torch-cu13",
 
@@ -88,22 +88,27 @@ override-dependencies = [
     "triton; sys_platform == 'never'",
     "transformer-engine; sys_platform == 'never'",
     "transformer-engine[pytorch]; sys_platform == 'never'",
+    # Avoid optional log-pattern-mining dependency conflicts from nvidia-resiliency-ext.
+    "logsage; sys_platform == 'never'",
+    "drain3; sys_platform == 'never'",
 ]
 
 [tool.uv.sources]
 # Shared recipe utilities (framework-agnostic)
 # External dependencies with specific git tags/commits
-causal_conv1d = { git = "https://github.com/Dao-AILab/causal-conv1d.git", tag = "v1.5.4" }
+# 1.6.1 fixes a custom-op no-storage failure in no-grad/frozen forward paths.
+causal_conv1d = { git = "https://github.com/Dao-AILab/causal-conv1d.git", tag = "v1.6.1" }
 nv-grouped-gemm = { git = "https://github.com/fanshiqing/grouped_gemm", tag = "v1.1.4.post6" }
 
 # Internal dependencies
 bionemo-recipeutils = { git = "https://github.com/NVIDIA/bionemo-framework.git", branch = "main", subdirectory = "sub-packages/bionemo-recipeutils" }
 bionemo-core = { git = "https://github.com/NVIDIA/bionemo-framework.git", branch = "main", subdirectory = "sub-packages/bionemo-core" }
-nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "54f85fe422d296cf04ea524130014bd3a2c3add1" }
+# nvidia-resiliency-ext is intentionally left to Megatron-Bridge so the transitive pin stays consistent.
 
-# Megatron Bundle. This points to a version that still supports the deprecated no_weight_decay_cond field until the API for an alternative has been finalized.
-megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "549e3cb970c170b1d7a86d021261efe05e8a5d9f" }
-megatron-core = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "549e3cb970c170b1d7a86d021261efe05e8a5d9f", subdirectory = "3rdparty/Megatron-LM" }
+# Megatron Bundle. MCore is sourced from the same Megatron-Bridge release tag.
+megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", tag = "v0.4.1" }
+megatron-core = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", tag = "v0.4.1", subdirectory = "3rdparty/Megatron-LM" }
 
 [tool.uv.extra-build-dependencies]
 warp-lang = ["wheel_stub"]
+nvidia-resiliency-ext = ["poetry_dynamic_versioning"]
@@ -21,14 +21,17 @@
 
 
 try:
-    from subquadratic_ops_torch.causal_conv1d import causal_conv1d as _subq_causal_conv1d
     from subquadratic_ops_torch.fft_causal_conv1d import fft_causal_conv1d as _subq_fft_causal_conv1d
 except ImportError as _subq_import_error:
-    _subq_causal_conv1d = None
     _subq_fft_causal_conv1d = None
     _subq_error_msg = f"subquadratic_ops_torch not available: {_subq_import_error}"
 
 
+def _linear_causal_fft_size(input_len: int, filter_len: int) -> int:
+    """Return an FFT size >= input_len + filter_len - 1 so circular wraparound cannot alias the causal prefix."""
+    return max(2 * input_len, 2 * filter_len)
+
+
 def adjust_filter_shape_for_broadcast(u, h):
     """Adjust filter shape for broadcasting compatibility with input tensor."""
     h = h.squeeze()  # Standardize to [D, L] from [1, D, L] and [D, 1, L]
@@ -50,7 +53,7 @@ def fftconv_func(*, u, k, D):  # noqa: N803
     The convolution is computed in the frequency domain and then transformed back to the time domain.
     """
     seqlen = u.shape[-1]
-    fft_size = 2 * seqlen
+    fft_size = _linear_causal_fft_size(seqlen, k.shape[-1])
 
     k_f = torch.fft.rfft(k, n=fft_size) / fft_size
     k_f = adjust_filter_shape_for_broadcast(u, k_f)
@@ -98,21 +101,14 @@ def parallel_fir(
                     D=bias,
                 ).to(dtype=u.dtype)
     else:
-        if use_subquadratic_ops:
-            # subq-ops causal_conv1d expects pre-padded [B, D, L+pad] input and [D, K] weight; dtypes must match
-            pad_size = fir_length - 1
-            x_padded = F.pad(u.to(torch.float32), (pad_size, 0))
-            w = weight.squeeze(1) if weight.dim() == 3 else weight
-            z = _subq_causal_conv1d(x_padded, w.to(torch.float32))[..., pad_size:]
-        else:
-            z = F.conv1d(
-                u.to(torch.float32),
-                weight.to(torch.float32),
-                bias=None,
-                stride=1,
-                padding=fir_length - 1,
-                groups=u.shape[1],  # always set to D, regardless of filter grouping
-            )[..., :L]
+        z = F.conv1d(
+            u.to(torch.float32),
+            weight.to(torch.float32),
+            bias=None,
+            stride=1,
+            padding=fir_length - 1,
+            groups=u.shape[1],  # always set to D, regardless of filter grouping
+        )[..., :L]
 
         z = z.to(u.dtype)
 
@@ -130,7 +126,7 @@ def parallel_fir(
 
 def parallel_iir(*, z_pre, h, D, L, poles, t, hidden_size, compute_state):  # noqa: N803
     """Compute the output state of the short convolutional filter."""
-    fft_size = 2 * L
+    fft_size = _linear_causal_fft_size(L, h.shape[-1])
     x1, x2, v = z_pre.split([hidden_size, hidden_size, hidden_size], dim=1)
 
     x1v = x1 * v
@@ -221,9 +217,9 @@ def prefill_via_modal_fft(*, x1v, L, poles, t, X_s):  # noqa: N803
     # When the model has a long convolution derived from a recurrence in modal form and prefill_style is "fft",
     # we split the filter into poles and residues and reuse FFT computation on the input.
     bs = x1v.shape[0]
-    fft_size = 2 * L
+    fft_size = X_s.shape[-1]
     state_s = (poles.to(torch.float32) * t).exp()
-    state_S = torch.fft.fft(state_s, n=fft_size).repeat(bs, 1, 1, 1)  # noqa N806: B, D, state_dim, 2 * L
+    state_S = torch.fft.fft(state_s, n=fft_size).repeat(bs, 1, 1, 1)  # noqa N806: B, D, state_dim, fft_size
     state = torch.fft.ifft(X_s[..., None, :] * state_S, n=fft_size)
     # Do not try to fix `UserWarning: Casting complex values to real discards
     # the imaginary part` by inserting state.real conversion anywhere before

@@ -16,6 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import inspect
 from contextlib import nullcontext
 from dataclasses import dataclass
 from typing import Optional, Union
@@ -120,13 +121,20 @@ def __init__(
             pp_layer_offset, layer_type_list = self._select_layers_for_pipeline_parallel(layer_type_list)
 
         if get_cpu_offload_context is not None:
-            (self.offload_context, self.group_prefetch_offload_commit_async) = get_cpu_offload_context(
+            # Megatron Core changed this helper from six to seven positional arguments
+            # across releases. Pass only the arguments accepted by the installed version.
+            offload_args = [
                 self.config.cpu_offloading,
                 self.config.cpu_offloading_num_layers,
                 self.config.num_layers,
                 self.config.cpu_offloading_activations,
                 self.config.cpu_offloading_weights,
                 self.config.cpu_offloading_double_buffering,
+                getattr(self.config, "cpu_offloading_retain_pinned_cpu_buffers", False),
+            ]
+            num_offload_params = len(inspect.signature(get_cpu_offload_context).parameters)
+            (self.offload_context, self.group_prefetch_offload_commit_async) = get_cpu_offload_context(
+                *offload_args[:num_offload_params],
             )
             self.config._cpu_offloading_context = self.offload_context if self.config.cpu_offloading else None
         else:

@@ -108,8 +108,10 @@ def bias_dropout_add_exec_handler(self):
         if self.training:
             return torch.enable_grad
         else:
-            # Validation, Test, Inference, Etc.
-            return torch.inference_mode
+            # torch.inference_mode marks outputs as inference tensors. Those flags
+            # persist after leaving the context and can break downstream autograd or
+            # torch.library custom ops that consume frozen model outputs.
+            return torch.no_grad
 
     def forward(
         self,

@@ -42,6 +42,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 try:
     from transformer_engine.common.recipe import DelayedScaling, Format
 except ImportError:
@@ -117,8 +118,10 @@ def __init__(
         self.fast_conv_proj = self.hyena_config.fast_conv_proj
         self.fast_conv_mixer = self.hyena_config.fast_conv_mixer
 
-        # Use b2b causal conv1d
         self.use_subquadratic_ops = self.transformer_config.use_subquadratic_ops
+        # TODO: Re-enable B2BCausalConv1dModule for short/medium Hyena layers once
+        # subquadratic-ops updates it to support causal_conv1d 1.6+ semantics.
+        self.use_fused_b2b_causal_conv1d = False
 
         # Per attention head and per partition values.
         assert torch.distributed.is_initialized()
@@ -197,7 +200,7 @@ def __init__(
                 use_conv_bias=self.transformer_config.use_short_conv_bias,
             )
 
-            if self.use_subquadratic_ops:
+            if self.use_fused_b2b_causal_conv1d:
                 # Create a wrapper module that doesn't register parameters
                 # Use the existing weights from the original model
                 self.b2b_kernel = B2BCausalConv1dModule(
@@ -228,7 +231,7 @@ def __init__(
                 max_sequence_length,
             )
 
-            if self.use_subquadratic_ops and self.operator_type == "hyena_medium_conv":
+            if self.use_fused_b2b_causal_conv1d and self.operator_type == "hyena_medium_conv":
                 # Create a wrapper module that doesn't register parameters
                 # Use the existing weights from the original model
                 self.b2b_kernel = B2BCausalConv1dModule(
@@ -308,7 +311,7 @@ def forward(self, x, layer_past=None, inference_context=None, _hyena_use_cp=True
         else:
             features = rearrange(features, "l b d -> b d l").contiguous()
 
-        is_b2b_eligible = self.use_subquadratic_ops and self.operator_type in [
+        is_b2b_eligible = self.use_fused_b2b_causal_conv1d and self.operator_type in [
             "hyena_short_conv",
             "hyena_medium_conv",
         ]

@@ -486,6 +486,7 @@ def fftconv_func(u, k, D, dropout_mask, gelu=True, k_rev=None, bidirectional=Fal
         if use_subquadratic_ops:
             y = fft_causal_conv1d(u, k.squeeze(0))
         else:
+            fft_size = max(fft_size, 2 * k.shape[-1])
             k_f = torch.fft.rfft(k, n=fft_size) / fft_size
             if k_rev is not None:
                 k_rev_f = torch.fft.rfft(k_rev, n=fft_size) / fft_size
@@ -754,7 +755,8 @@ def forward(self, L, *args, **kwargs):  # noqa: N803
         """
         return self.filter(L, *args, **kwargs)
 
-    @torch.compile(mode="max-autotune")
+    # Keep this eager. Compiling this helper can leave global dispatcher state
+    # that interferes with unrelated custom autograd/custom-op call sites.
     def filter(self, L, *args, **kwargs):  # noqa: N803
         """Compute the filter as a function of h and decay for the requested sequence length."""
         h = self.h[:, :L]
@@ -1456,10 +1458,7 @@ def forward(self, x, inference_context=None, _use_cp=True):
         # subquadratic_ops causal_conv1d is only applied to the projection conv of Hyena LI layer
         # Projection conv is fused with SE/MR layers (B2BCausalConv1dModule)
         if self.use_fast_causal_conv:  # hyena_proj_conv case
-            if self.use_subquadratic_ops:  # hyena_proj_conv of LI layer when subquadratic_ops is enabled
-                y = causal_conv1d(x, weight)[..., pad_size:]
-            else:
-                y = causal_conv1d_fn(x, weight, bias=None, activation=None)[..., pad_size:]
+            y = causal_conv1d_fn(x, weight, bias=None, activation=None)[..., pad_size:]
         else:  # hyena_short_conv case
             y = F.conv1d(
                 x,

@@ -76,7 +76,6 @@
 )
 from megatron.bridge.training.config import DistributedInitConfig, RNGConfig
 from megatron.bridge.training.mixed_precision import MIXED_PRECISION_RECIPES, get_mixed_precision_config
-from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
 from megatron.bridge.training.utils.checkpoint_utils import (
     file_exists,
     get_checkpoint_run_config_filename,
@@ -97,6 +96,14 @@
 from megatron.core.utils import get_batch_on_this_cp_rank
 from torch import Tensor
 
+
+try:
+    from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
+except ImportError:
+    from megatron.core.tokenizers.text.libraries.huggingface_tokenizer import (
+        HuggingFaceTokenizer as _HuggingFaceTokenizer,
+    )
+
 from bionemo.evo2.data.dataset_tokenizer import DEFAULT_HF_TOKENIZER_MODEL_PATH
 from bionemo.evo2.data.fasta_dataset import SimpleFastaDataset
 from bionemo.recipeutils.inference.collation import batch_collator
@@ -656,7 +663,6 @@ def _predict_step(
     if not parallel_state.is_pipeline_last_stage():
         return None
 
-    # Forward pass
     output_tensor = model(
         input_ids=batch["tokens"],
         position_ids=batch["position_ids"],
@@ -1036,9 +1042,11 @@ def predict(
                 f"Valid range: -{original_num_layers} to {original_num_layers - 1}."
             )
 
-        # Set the model to use fewer layers and skip post-processing (output heads)
+        # Set the model to use fewer layers and skip post-processing (output heads).
         model_provider.num_layers = target_num_layers
         model_provider.post_process = False
+        if hasattr(model_provider, "fp32_residual_connection"):
+            model_provider.fp32_residual_connection = False
 
         # Also truncate the hybrid_override_pattern if it exists, since it must match num_layers
         if hasattr(model_provider, "hybrid_override_pattern") and model_provider.hybrid_override_pattern is not None:
@@ -1276,6 +1284,12 @@ def predict(
 def main() -> None:
     """CLI entry point for Evo2 prediction."""
     args = parse_args()
+    try:
+        from megatron.bridge.utils.instantiate_utils import register_allowed_target_prefix
+
+        register_allowed_target_prefix("bionemo.")
+    except ImportError:
+        pass
     predict(
         fasta_path=args.fasta,
         ckpt_dir=args.ckpt_dir,