From 3aa6ab2d327f551ebcbe6752c823cafa9432de86 Mon Sep 17 00:00:00 2001 From: JM Huibonhoa Date: Wed, 17 Jun 2026 18:45:31 -0400 Subject: [PATCH 01/12] feat: substrate support for BYO and python runtimes for SandboxAgent CR Signed-off-by: JM Huibonhoa --- Makefile | 27 ++- .../crd/bases/kagent.dev_sandboxagents.yaml | 2 - go/api/v1alpha2/agent_runtime_test.go | 8 +- go/api/v1alpha2/agent_spec_validation.go | 23 +- go/api/v1alpha2/agent_spec_validation_test.go | 26 ++- go/api/v1alpha2/agent_types.go | 12 +- go/api/v1alpha2/sandboxagent_types.go | 1 - .../controller/sandboxagent_substrate.go | 36 +++ .../translator/agent/adk_api_translator.go | 6 +- .../translator/agent/deployments.go | 17 +- .../agent/digest_testmain_external_test.go | 1 + .../translator/agent/imageconfig_test.go | 23 +- .../testdata/outputs/agent_with_code.json | 2 +- .../outputs/agent_with_git_skills.json | 2 +- .../testdata/outputs/agent_with_skills.json | 2 +- go/core/pkg/app/app.go | 2 +- .../filter_translator_owned_test.go | 20 +- go/core/pkg/sandboxbackend/routing.go | 8 +- .../sandboxbackend/substrate/actor_errors.go | 12 + .../sandboxbackend/substrate/agent_actor.go | 138 +++++++++++- .../substrate/agent_lifecycle.go | 160 +++++++++++-- .../substrate/agent_lifecycle_test.go | 211 ++++++++++++++++-- .../substrate/agents_backend.go | 18 +- .../pkg/sandboxbackend/substrate/bluegreen.go | 107 +++++++++ .../substrate/config_hash_test.go | 165 ++++++++++++++ .../substrate/lifecycle_delete.go | 31 ++- .../substrate/lifecycle_shared.go | 58 ++++- .../templates/kagent.dev_sandboxagents.yaml | 2 - python/Dockerfile | 165 ++++++-------- python/Dockerfile.app | 5 +- python/Dockerfile.full | 116 ++++++++++ .../src/kagent/adk/_config_materialize.py | 51 +++++ .../packages/kagent-adk/src/kagent/adk/cli.py | 5 + .../unittests/test_config_materialize.py | 55 +++++ scripts/controller-digest-ldflags.sh | 5 +- ui/src/app/agents/new/page.tsx | 33 +-- .../agent-form/ByoDeploymentFields.tsx | 5 +- 
.../components/agent-form/agent-form-types.ts | 1 + ui/src/lib/__tests__/sandboxAgentForm.test.ts | 8 +- ui/src/lib/sandboxAgentForm.ts | 12 +- 40 files changed, 1327 insertions(+), 254 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/bluegreen.go create mode 100644 go/core/pkg/sandboxbackend/substrate/config_hash_test.go create mode 100644 python/Dockerfile.full create mode 100644 python/packages/kagent-adk/src/kagent/adk/_config_materialize.py create mode 100644 python/packages/kagent-adk/tests/unittests/test_config_materialize.py diff --git a/Makefile b/Makefile index c14a6e085b..5abfb561bf 100644 --- a/Makefile +++ b/Makefile @@ -60,14 +60,18 @@ SKILLS_INIT_IMAGE_NAME ?= skills-init CONTROLLER_IMAGE_TAG ?= $(VERSION) UI_IMAGE_TAG ?= $(VERSION) APP_IMAGE_TAG ?= $(VERSION) +APP_FULL_IMAGE_TAG ?= $(VERSION)-full KAGENT_ADK_IMAGE_TAG ?= $(VERSION) +KAGENT_ADK_FULL_IMAGE_TAG ?= $(VERSION)-full GOLANG_ADK_IMAGE_TAG ?= $(VERSION) GOLANG_ADK_FULL_IMAGE_TAG ?= $(VERSION)-full SKILLS_INIT_IMAGE_TAG ?= $(VERSION) CONTROLLER_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(CONTROLLER_IMAGE_NAME):$(CONTROLLER_IMAGE_TAG) UI_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(UI_IMAGE_NAME):$(UI_IMAGE_TAG) APP_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(APP_IMAGE_NAME):$(APP_IMAGE_TAG) +APP_FULL_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(APP_IMAGE_NAME):$(APP_FULL_IMAGE_TAG) KAGENT_ADK_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(KAGENT_ADK_IMAGE_NAME):$(KAGENT_ADK_IMAGE_TAG) +KAGENT_ADK_FULL_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(KAGENT_ADK_IMAGE_NAME):$(KAGENT_ADK_FULL_IMAGE_TAG) GOLANG_ADK_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(GOLANG_ADK_IMAGE_NAME):$(GOLANG_ADK_IMAGE_TAG) GOLANG_ADK_FULL_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(GOLANG_ADK_IMAGE_NAME):$(GOLANG_ADK_FULL_IMAGE_TAG) SKILLS_INIT_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(SKILLS_INIT_IMAGE_NAME):$(SKILLS_INIT_IMAGE_TAG) @@ -197,12 +201,14 @@ build-all: buildx-create .PHONY: build build: ## Build 
and push all component images -build: buildx-create build-ui build-skills-init build-golang-adk build-golang-adk-full build-app build-controller +build: buildx-create build-ui build-skills-init build-golang-adk build-golang-adk-full build-app build-app-full build-controller @echo "Build completed successfully." @echo "Controller Image: $(CONTROLLER_IMG)" @echo "UI Image: $(UI_IMG)" @echo "App Image: $(APP_IMG)" + @echo "App Full Image: $(APP_FULL_IMG)" @echo "Kagent ADK Image: $(KAGENT_ADK_IMG)" + @echo "Kagent ADK Full Image: $(KAGENT_ADK_FULL_IMG)" @echo "Golang ADK Image: $(GOLANG_ADK_IMG)" @echo "Golang ADK Full Image: $(GOLANG_ADK_FULL_IMG)" @echo "Skills Init Image: $(SKILLS_INIT_IMG)" @@ -230,7 +236,9 @@ build-img-versions: ## Print the fully-qualified image tags for all components @echo controller=$(CONTROLLER_IMG) @echo ui=$(UI_IMG) @echo app=$(APP_IMG) + @echo app-full=$(APP_FULL_IMG) @echo kagent-adk=$(KAGENT_ADK_IMG) + @echo kagent-adk-full=$(KAGENT_ADK_FULL_IMG) @echo golang-adk=$(GOLANG_ADK_IMG) @echo golang-adk-full=$(GOLANG_ADK_FULL_IMG) @echo skills-init=$(SKILLS_INIT_IMG) @@ -242,10 +250,11 @@ controller-manifests: ## Regenerate CRD manifests and copy them into the Helm ch .PHONY: build-controller build-controller: ## Build and push the controller image (embeds agent runtime digests via scripts/controller-digest-ldflags.sh) -build-controller: buildx-create controller-manifests build-app build-golang-adk build-golang-adk-full +build-controller: buildx-create controller-manifests build-app build-app-full build-golang-adk build-golang-adk-full @set -e; \ DIGEST_LDFLAGS=$$(CONTAINER_RUNTIME=$(CONTAINER_RUNTIME) \ APP_IMG=$(APP_IMG) \ + APP_FULL_IMG=$(APP_FULL_IMG) \ GOLANG_ADK_IMG=$(GOLANG_ADK_IMG) \ GOLANG_ADK_FULL_IMG=$(GOLANG_ADK_FULL_IMG) \ ./scripts/controller-digest-ldflags.sh); \ @@ -268,11 +277,23 @@ build-kagent-adk: buildx-create $(DOCKER_PUSH) $(KAGENT_ADK_IMG) .PHONY: build-app -build-app: ## Build and push the app image (depends on 
kagent-adk) +build-app: ## Build and push the app image (distroless slim; depends on kagent-adk) build-app: buildx-create build-kagent-adk $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) $(TOOLS_IMAGE_BUILD_ARGS) --build-arg KAGENT_ADK_VERSION=$(KAGENT_ADK_IMAGE_TAG) --build-arg DOCKER_REGISTRY=$(DOCKER_REGISTRY) -t $(APP_IMG) -f python/Dockerfile.app ./python $(DOCKER_PUSH) $(APP_IMG) +.PHONY: build-kagent-adk-full +build-kagent-adk-full: ## Build and push the full Python kagent ADK image (includes sandbox runtime) +build-kagent-adk-full: buildx-create + $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) $(TOOLS_IMAGE_BUILD_ARGS) -t $(KAGENT_ADK_FULL_IMG) -f python/Dockerfile.full ./python + $(DOCKER_PUSH) $(KAGENT_ADK_FULL_IMG) + +.PHONY: build-app-full +build-app-full: ## Build and push the full app image (sandbox runtime; depends on kagent-adk-full) +build-app-full: buildx-create build-kagent-adk-full + $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) $(TOOLS_IMAGE_BUILD_ARGS) --build-arg KAGENT_ADK_VERSION=$(KAGENT_ADK_FULL_IMAGE_TAG) --build-arg DOCKER_REGISTRY=$(DOCKER_REGISTRY) -t $(APP_FULL_IMG) -f python/Dockerfile.app ./python + $(DOCKER_PUSH) $(APP_FULL_IMG) + .PHONY: build-golang-adk build-golang-adk: ## Build and push the Go ADK image build-golang-adk: buildx-create diff --git a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml index 3f8f594b50..1fe25e28c7 100644 --- a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml +++ b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml @@ -11339,8 +11339,6 @@ spec: rule: '!has(self.skills) || self.platform != ''substrate''' - message: spec.substrate may only be set when spec.platform is substrate rule: '!has(self.substrate) || self.platform == ''substrate''' - - message: BYO agents are not supported when spec.platform is substrate - rule: '!has(self.type) || self.type != ''BYO'' || self.platform != ''substrate''' - message: type must be specified rule: has(self.type) - message: 
type must be either Declarative or BYO diff --git a/go/api/v1alpha2/agent_runtime_test.go b/go/api/v1alpha2/agent_runtime_test.go index 9ff0c14815..01987e1fb5 100644 --- a/go/api/v1alpha2/agent_runtime_test.go +++ b/go/api/v1alpha2/agent_runtime_test.go @@ -19,8 +19,14 @@ func TestEffectiveDeclarativeRuntimeForAgent(t *testing.T) { require.Equal(t, DeclarativeRuntime_Python, EffectiveDeclarativeRuntimeForAgent(agent)) }) - t.Run("SandboxAgent on substrate uses Go", func(t *testing.T) { + t.Run("SandboxAgent on substrate honors configured runtime", func(t *testing.T) { sa := &SandboxAgent{Spec: SandboxAgentSpec{AgentSpec: substrateSpec, Platform: SandboxPlatformSubstrate}} + require.Equal(t, DeclarativeRuntime_Python, EffectiveDeclarativeRuntimeForAgent(sa)) + }) + + t.Run("SandboxAgent on substrate honors Go runtime when set", func(t *testing.T) { + goSpec := AgentSpec{Type: AgentType_Declarative, Declarative: &DeclarativeAgentSpec{Runtime: DeclarativeRuntime_Go}} + sa := &SandboxAgent{Spec: SandboxAgentSpec{AgentSpec: goSpec, Platform: SandboxPlatformSubstrate}} require.Equal(t, DeclarativeRuntime_Go, EffectiveDeclarativeRuntimeForAgent(sa)) }) diff --git a/go/api/v1alpha2/agent_spec_validation.go b/go/api/v1alpha2/agent_spec_validation.go index baa5c62c4d..445fd84958 100644 --- a/go/api/v1alpha2/agent_spec_validation.go +++ b/go/api/v1alpha2/agent_spec_validation.go @@ -3,9 +3,8 @@ package v1alpha2 import "fmt" const ( - substrateSandboxSkillsUnsupportedMsg = "spec.skills is not supported when spec.platform is substrate" - substrateSandboxPythonRuntimeUnsupportedMsg = "spec.declarative.runtime must be \"go\" when spec.platform is substrate" - substrateSandboxBYOUnsupportedMsg = "BYO agents are not supported when spec.platform is substrate" + substrateSandboxSkillsUnsupportedMsg = "spec.skills is not supported when spec.platform is substrate" + substrateSandboxBYOMissingCommandMsg = "BYO agents on substrate must set spec.byo.deployment.cmd (substrate does not fall 
back to the image entrypoint)" ) // AgentSpecHasSkills reports whether the spec configures any skill sources. @@ -18,23 +17,23 @@ func AgentSpecHasSkills(spec *AgentSpec) bool { } // ValidateSubstrateSandboxAgentSpec rejects substrate sandbox configurations that kagent -// does not support yet (for example declarative skills on Agent Substrate). +// does not support yet (for example declarative skills on Agent Substrate). Declarative +// Python/Go and BYO (Go/Python) agents are supported; BYO agents must provide an explicit +// command because substrate copies the container Command verbatim with no image-entrypoint +// fallback. func ValidateSubstrateSandboxAgentSpec(agent *SandboxAgent) error { if agent == nil || AgentSandboxPlatform(agent) != SandboxPlatformSubstrate { return nil } spec := agent.GetAgentSpec() - if spec.Type == AgentType_BYO { - return fmt.Errorf("%s", substrateSandboxBYOUnsupportedMsg) - } if AgentSpecHasSkills(spec) { return fmt.Errorf("%s", substrateSandboxSkillsUnsupportedMsg) } - if spec.Type == AgentType_Declarative && - spec.Declarative != nil && - spec.Declarative.Runtime != "" && - spec.Declarative.Runtime != DeclarativeRuntime_Go { - return fmt.Errorf("%s", substrateSandboxPythonRuntimeUnsupportedMsg) + if spec.Type == AgentType_BYO { + dep := spec.BYO + if dep == nil || dep.Deployment == nil || dep.Deployment.Cmd == nil || *dep.Deployment.Cmd == "" { + return fmt.Errorf("%s", substrateSandboxBYOMissingCommandMsg) + } } return nil } diff --git a/go/api/v1alpha2/agent_spec_validation_test.go b/go/api/v1alpha2/agent_spec_validation_test.go index ca43ccb576..90935f7571 100644 --- a/go/api/v1alpha2/agent_spec_validation_test.go +++ b/go/api/v1alpha2/agent_spec_validation_test.go @@ -36,7 +36,7 @@ func TestValidateSubstrateSandboxAgentSpec(t *testing.T) { require.Contains(t, err.Error(), substrateSandboxSkillsUnsupportedMsg) }) - t.Run("rejects python runtime on substrate platform", func(t *testing.T) { + t.Run("allows python runtime on 
substrate platform", func(t *testing.T) { agent := &SandboxAgent{ Spec: SandboxAgentSpec{ Platform: SandboxPlatformSubstrate, @@ -48,24 +48,36 @@ func TestValidateSubstrateSandboxAgentSpec(t *testing.T) { }, }, } - err := ValidateSubstrateSandboxAgentSpec(agent) - require.Error(t, err) - require.Contains(t, err.Error(), substrateSandboxPythonRuntimeUnsupportedMsg) + require.NoError(t, ValidateSubstrateSandboxAgentSpec(agent)) }) - t.Run("rejects BYO agents on substrate platform", func(t *testing.T) { + t.Run("rejects BYO agents without an explicit command on substrate platform", func(t *testing.T) { agent := &SandboxAgent{ Spec: SandboxAgentSpec{ Platform: SandboxPlatformSubstrate, AgentSpec: AgentSpec{ Type: AgentType_BYO, - BYO: &BYOAgentSpec{}, + BYO: &BYOAgentSpec{Deployment: &ByoDeploymentSpec{Image: "example/agent:latest"}}, }, }, } err := ValidateSubstrateSandboxAgentSpec(agent) require.Error(t, err) - require.Contains(t, err.Error(), substrateSandboxBYOUnsupportedMsg) + require.Contains(t, err.Error(), substrateSandboxBYOMissingCommandMsg) + }) + + t.Run("allows BYO agents with an explicit command on substrate platform", func(t *testing.T) { + cmd := "/app" + agent := &SandboxAgent{ + Spec: SandboxAgentSpec{ + Platform: SandboxPlatformSubstrate, + AgentSpec: AgentSpec{ + Type: AgentType_BYO, + BYO: &BYOAgentSpec{Deployment: &ByoDeploymentSpec{Image: "example/agent:latest", Cmd: &cmd}}, + }, + }, + } + require.NoError(t, ValidateSubstrateSandboxAgentSpec(agent)) }) t.Run("allows BYO agents on agent-sandbox platform", func(t *testing.T) { diff --git a/go/api/v1alpha2/agent_types.go b/go/api/v1alpha2/agent_types.go index ebfdcd7325..7022cc0063 100644 --- a/go/api/v1alpha2/agent_types.go +++ b/go/api/v1alpha2/agent_types.go @@ -282,16 +282,10 @@ func EffectiveDeclarativeRuntime(spec *AgentSpec) DeclarativeRuntime { } // EffectiveDeclarativeRuntimeForAgent returns the runtime for a reconciled agent object. 
-// Substrate SandboxAgents always use Go; regular Agents honor spec.declarative.runtime. +// All agents (including substrate SandboxAgents) honor spec.declarative.runtime, defaulting +// to Python when unset. func EffectiveDeclarativeRuntimeForAgent(agent AgentObject) DeclarativeRuntime { - spec := agent.GetAgentSpec() - if agent.GetWorkloadMode() == WorkloadModeSandbox && - AgentSandboxPlatform(agent) == SandboxPlatformSubstrate && - spec != nil && - spec.Type == AgentType_Declarative { - return DeclarativeRuntime_Go - } - return EffectiveDeclarativeRuntime(spec) + return EffectiveDeclarativeRuntime(agent.GetAgentSpec()) } // NetworkConfig configures outbound network access for sandboxed execution paths. diff --git a/go/api/v1alpha2/sandboxagent_types.go b/go/api/v1alpha2/sandboxagent_types.go index fcfa3db9cf..dc9ca25160 100644 --- a/go/api/v1alpha2/sandboxagent_types.go +++ b/go/api/v1alpha2/sandboxagent_types.go @@ -39,7 +39,6 @@ type SandboxAgent struct { // +kubebuilder:validation:XValidation:rule="!has(self.skills) || self.platform != 'substrate'",message="spec.skills is not supported when spec.platform is substrate" // +kubebuilder:validation:XValidation:rule="!has(self.substrate) || self.platform == 'substrate'",message="spec.substrate may only be set when spec.platform is substrate" -// +kubebuilder:validation:XValidation:rule="!has(self.type) || self.type != 'BYO' || self.platform != 'substrate'",message="BYO agents are not supported when spec.platform is substrate" type SandboxAgentSpec struct { AgentSpec `json:",inline"` diff --git a/go/core/internal/controller/sandboxagent_substrate.go b/go/core/internal/controller/sandboxagent_substrate.go index 8ed117071a..65c5ce58df 100644 --- a/go/core/internal/controller/sandboxagent_substrate.go +++ b/go/core/internal/controller/sandboxagent_substrate.go @@ -33,9 +33,45 @@ func (r *SandboxAgentController) reconcileSubstrateSandboxAgent(ctx context.Cont } return ctrl.Result{Requeue: true}, nil } + + if 
!r.reconcileSubstrateBlueGreen(ctx, sa) { + // Retiring superseded templates / their goldens advances one ate-api step per pass; + // requeue until the rollout converges so old templates and goldens are cleaned up. + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } return ctrl.Result{}, nil } +// reconcileSubstrateBlueGreen drives the cleanup half of a config-change rollout: it retires +// ActorTemplates superseded by a newer Ready one (deleting each old template with its now-Suspended +// golden), and best-effort reaps stale per-session actors. The new template keeps serving the old +// golden until its own golden is Ready (see ResolveCurrentActorTemplate), so this never causes +// downtime. Returns true when nothing more remains to retire. Errors are logged, not surfaced, so a +// transient ate-api failure doesn't wedge reconciliation. +func (r *SandboxAgentController) reconcileSubstrateBlueGreen(ctx context.Context, sa *v1alpha2.SandboxAgent) bool { + if r.SubstrateLifecycle == nil { + return true + } + retireDone, err := r.SubstrateLifecycle.RetireSupersededTemplates(ctx, sa) + if err != nil { + sandboxAgentControllerLog.Info("retiring superseded substrate templates failed (will retry)", + "sandboxagent", sa.Namespace+"/"+sa.Name, "err", err.Error()) + return true + } + + // Best-effort reap of stale session actors keyed to a previous config. Not required for + // correctness (config-hashed ids mean they're never reused), so failures don't requeue. 
+ if r.SubstrateActorBackend != nil { + if active, err := substrate.ResolveCurrentActorTemplate(ctx, r.Client, sa.Namespace, sa.Name); err == nil && active != nil { + if _, err := r.SubstrateActorBackend.ReapStaleSessionActors(ctx, sa, active.Name); err != nil { + sandboxAgentControllerLog.Info("reap of stale substrate session actors failed (will retry)", + "sandboxagent", sa.Namespace+"/"+sa.Name, "err", err.Error()) + } + } + } + return retireDone +} + func (r *SandboxAgentController) reconcileSubstrateSandboxAgentDelete(ctx context.Context, sa *v1alpha2.SandboxAgent) (ctrl.Result, error) { if !controllerutil.ContainsFinalizer(sa, sandboxAgentSubstrateFinalizer) { return ctrl.Result{}, nil diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index 12b232048a..e1486c064d 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -116,9 +116,11 @@ var DefaultImageConfig = ImageConfig{ Repository: "kagent-dev/kagent/app", } -// PythonADKImageDigest, GoADKImageDigest, and GoADKFullImageDigest are set at -// controller link time from the pushed runtime image manifest digests. +// PythonADKImageDigest, PythonADKFullImageDigest, GoADKImageDigest, and GoADKFullImageDigest +// are set at controller link time from the pushed runtime image manifest digests. The "full" +// variants bundle the sandbox runtime (code execution / bash tools); the slim variants do not. 
var PythonADKImageDigest string +var PythonADKFullImageDigest string var GoADKImageDigest string var GoADKFullImageDigest string diff --git a/go/core/internal/controller/translator/agent/deployments.go b/go/core/internal/controller/translator/agent/deployments.go index 2a55255697..3ef1fe8a58 100644 --- a/go/core/internal/controller/translator/agent/deployments.go +++ b/go/core/internal/controller/translator/agent/deployments.go @@ -123,13 +123,20 @@ func validateExtraContainers(containers []corev1.Container) error { return nil } -func resolvePythonRuntimeImage(registry string) (string, error) { +func resolvePythonRuntimeImage(registry string, full bool) (string, error) { repo := DefaultImageConfig.Repository - if d := normalizeImageDigest(PythonADKImageDigest); d != "" { + digest := PythonADKImageDigest + imageLabel := "app" + if full { + digest = PythonADKFullImageDigest + imageLabel = "app-full" + } + if d := normalizeImageDigest(digest); d != "" { return fmt.Sprintf("%s/%s@%s", registry, repo, d), nil } return "", fmt.Errorf( - "app image digest is not set at link time; rebuild the controller after pushing agent runtime images", + "%s image digest is not set at link time; rebuild the controller after pushing agent runtime images", + imageLabel, ) } @@ -182,7 +189,7 @@ func resolveInlineDeployment(agent v1alpha2.AgentObject, mdd *modelDeploymentDat } var image string - full := runtime == v1alpha2.DeclarativeRuntime_Go && needsSRTSettings(agent, specRef.Sandbox) + full := needsSRTSettings(agent, specRef.Sandbox) switch runtime { case v1alpha2.DeclarativeRuntime_Go: var err error @@ -192,7 +199,7 @@ func resolveInlineDeployment(agent v1alpha2.AgentObject, mdd *modelDeploymentDat } default: var err error - image, err = resolvePythonRuntimeImage(registry) + image, err = resolvePythonRuntimeImage(registry, full) if err != nil { return nil, err } diff --git a/go/core/internal/controller/translator/agent/digest_testmain_external_test.go 
b/go/core/internal/controller/translator/agent/digest_testmain_external_test.go index db798c77cf..292ffb1ec8 100644 --- a/go/core/internal/controller/translator/agent/digest_testmain_external_test.go +++ b/go/core/internal/controller/translator/agent/digest_testmain_external_test.go @@ -9,6 +9,7 @@ import ( func TestMain(m *testing.M) { translator.PythonADKImageDigest = "sha256:test-app" + translator.PythonADKFullImageDigest = "sha256:test-app-full" translator.GoADKImageDigest = "sha256:test-go-base" translator.GoADKFullImageDigest = "sha256:test-go-full" os.Exit(m.Run()) diff --git a/go/core/internal/controller/translator/agent/imageconfig_test.go b/go/core/internal/controller/translator/agent/imageconfig_test.go index be1870dcf0..9ed21ec869 100644 --- a/go/core/internal/controller/translator/agent/imageconfig_test.go +++ b/go/core/internal/controller/translator/agent/imageconfig_test.go @@ -75,14 +75,33 @@ func TestResolveGoRuntimeImageWithoutDigest(t *testing.T) { func TestResolvePythonRuntimeImageWithDigest(t *testing.T) { original := PythonADKImageDigest + originalFull := PythonADKFullImageDigest t.Cleanup(func() { PythonADKImageDigest = original + PythonADKFullImageDigest = originalFull }) PythonADKImageDigest = "sha256:app-digest" + PythonADKFullImageDigest = "sha256:app-full-digest" - got, err := resolvePythonRuntimeImage("cr.kagent.dev") + got, err := resolvePythonRuntimeImage("cr.kagent.dev", false) require.NoError(t, err) require.Equal(t, "cr.kagent.dev/kagent-dev/kagent/app@sha256:app-digest", got) + + gotFull, err := resolvePythonRuntimeImage("cr.kagent.dev", true) + require.NoError(t, err) + require.Equal(t, "cr.kagent.dev/kagent-dev/kagent/app@sha256:app-full-digest", gotFull) +} + +func TestResolvePythonFullRuntimeImageWithoutDigest(t *testing.T) { + original := PythonADKFullImageDigest + t.Cleanup(func() { + PythonADKFullImageDigest = original + }) + PythonADKFullImageDigest = "" + + _, err := resolvePythonRuntimeImage("cr.kagent.dev", true) + 
require.Error(t, err) + require.Contains(t, err.Error(), "app-full") } func TestPythonADKImageDigestSupportsLinkerFlag(t *testing.T) { @@ -103,7 +122,7 @@ func TestResolvePythonRuntimeImageWithoutDigest(t *testing.T) { }) PythonADKImageDigest = "" - _, err := resolvePythonRuntimeImage("cr.kagent.dev") + _, err := resolvePythonRuntimeImage("cr.kagent.dev", false) require.Error(t, err) require.Contains(t, err.Error(), "app") } diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json index f9e03a9be8..451bf4d795 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json @@ -193,7 +193,7 @@ "value": "/config/srt-settings.json" } ], - "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app-full", "imagePullPolicy": "IfNotPresent", "name": "kagent", "ports": [ diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json index dd5efe2e78..11aaf8c5f8 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json @@ -215,7 +215,7 @@ "value": "/config/srt-settings.json" } ], - "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app-full", "imagePullPolicy": "IfNotPresent", "name": "kagent", "ports": [ diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json index 400fa7fff1..ff48dc868a 100644 --- 
a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json @@ -215,7 +215,7 @@ "value": "/config/srt-settings.json" } ], - "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app-full", "imagePullPolicy": "IfNotPresent", "name": "kagent", "ports": [ diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 3781eefab8..148ad7ecf5 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -557,7 +557,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne if atenetRouterURL == "" { atenetRouterURL = substrate.DefaultAtenetRouterURL } - substrateSandboxActorBackend = substrate.NewSandboxAgentActorBackend(substrateAteClient, atenetRouterURL) + substrateSandboxActorBackend = substrate.NewSandboxAgentActorBackend(substrateAteClient, mgr.GetClient(), atenetRouterURL) agentsSubstrate := substrate.NewAgentsBackend(substrateLifecycle, substrateAteClient) extensionCfg.SandboxBackend = sandboxbackend.NewRoutingBackend(extensionCfg.SandboxBackend, agentsSubstrate) } diff --git a/go/core/pkg/sandboxbackend/filter_translator_owned_test.go b/go/core/pkg/sandboxbackend/filter_translator_owned_test.go index 6537fdac29..d528f15460 100644 --- a/go/core/pkg/sandboxbackend/filter_translator_owned_test.go +++ b/go/core/pkg/sandboxbackend/filter_translator_owned_test.go @@ -57,7 +57,7 @@ func TestFilterTranslatorOwnedTypesForList(t *testing.T) { require.Len(t, out, len(allTypes)) }) - t.Run("substrate SandboxAgent drops agent-sandbox GVKs", func(t *testing.T) { + t.Run("substrate SandboxAgent drops agent-sandbox GVKs and ActorTemplate from prune", func(t *testing.T) { require.NoError(t, atev1alpha1.AddToScheme(scheme)) routing := sandboxbackend.NewRoutingBackend(agentsxk8s.New(), substrate.NewAgentsBackend(nil, nil)) allWithSubstrate := append(allTypes, 
&atev1alpha1.ActorTemplate{}) @@ -69,18 +69,16 @@ func TestFilterTranslatorOwnedTypesForList(t *testing.T) { } out, err := sandboxbackend.FilterTranslatorOwnedTypesForList(cl, sa, allWithSubstrate, routing) require.NoError(t, err) - require.Len(t, out, 3) - var sawSandbox, sawActorTemplate bool + // Substrate manages ActorTemplate lifecycle itself (blue-green), so it is excluded from the + // generic prune list along with the agent-sandbox Sandbox GVK — leaving only the generic + // Deployment + ConfigMap types. + require.Len(t, out, 2) for _, o := range out { - if _, ok := o.(*agentsandboxv1.Sandbox); ok { - sawSandbox = true - } - if _, ok := o.(*atev1alpha1.ActorTemplate); ok { - sawActorTemplate = true - } + _, isSandbox := o.(*agentsandboxv1.Sandbox) + require.False(t, isSandbox, "substrate agents must not list agent-sandbox Sandbox resources") + _, isActorTemplate := o.(*atev1alpha1.ActorTemplate) + require.False(t, isActorTemplate, "ActorTemplate must be excluded from generic prune (managed via blue-green)") } - require.False(t, sawSandbox, "substrate agents must not list agent-sandbox Sandbox resources") - require.True(t, sawActorTemplate) }) t.Run("agent-sandbox SandboxAgent keeps Sandbox GVK only", func(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/routing.go b/go/core/pkg/sandboxbackend/routing.go index 9f660a19a8..bdb2fa147c 100644 --- a/go/core/pkg/sandboxbackend/routing.go +++ b/go/core/pkg/sandboxbackend/routing.go @@ -58,13 +58,17 @@ func (r *RoutingBackend) GetOwnedResourceTypes() []client.Object { return out } -// OwnedResourceTypesFor returns owned-resource types for the agent's sandbox platform. +// OwnedResourceTypesFor returns the owned-resource types the reconciler should prune for the +// agent's sandbox platform. 
It delegates to the platform backend's per-agent method (NOT +// GetOwnedResourceTypes, which is the broader watch set) so a platform can watch a type without +// having it generically pruned — substrate uses this to manage ActorTemplate lifecycle itself +// (blue-green: keep the old template serving until the new golden is Ready). func (r *RoutingBackend) OwnedResourceTypesFor(agent v1alpha2.AgentObject) ([]client.Object, error) { b, err := r.backendFor(agent) if err != nil { return nil, err } - return b.GetOwnedResourceTypes(), nil + return b.OwnedResourceTypesFor(agent) } func (r *RoutingBackend) ComputeReady(ctx context.Context, cl client.Client, nn types.NamespacedName) (metav1.ConditionStatus, string, string) { diff --git a/go/core/pkg/sandboxbackend/substrate/actor_errors.go b/go/core/pkg/sandboxbackend/substrate/actor_errors.go index c75a40b4e6..221f0014cb 100644 --- a/go/core/pkg/sandboxbackend/substrate/actor_errors.go +++ b/go/core/pkg/sandboxbackend/substrate/actor_errors.go @@ -22,6 +22,18 @@ func wrapResumeActorError(actorID string, err error) error { return fmt.Errorf("substrate ResumeActor %q: %w", actorID, err) } +// wrapCreateActorError normalizes a CreateActor failure, surfacing ErrNoFreeWorkers (so the chat +// path can buffer/retry) when the WorkerPool is at capacity. 
+func wrapCreateActorError(actorID string, err error) error { + if err == nil { + return nil + } + if isNoFreeWorkersError(err) { + return fmt.Errorf("%w", ErrNoFreeWorkers) + } + return fmt.Errorf("substrate CreateActor %q: %w", actorID, err) +} + func isNoFreeWorkersError(err error) bool { if errors.Is(err, ErrNoFreeWorkers) { return true diff --git a/go/core/pkg/sandboxbackend/substrate/agent_actor.go b/go/core/pkg/sandboxbackend/substrate/agent_actor.go index 821cdbc669..8a85500e55 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_actor.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_actor.go @@ -5,33 +5,55 @@ import ( "crypto/sha256" "fmt" "strings" + "time" "github.com/agent-substrate/substrate/pkg/proto/ateapipb" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // ensureActorBufferTimeout caps how long a chat request waits for substrate worker capacity + // before giving up. During a config rollout the new golden's build can occupy the worker(s), + // so resuming/creating the session actor briefly returns "no free workers"; we buffer the + // request rather than fail it. Bounded so a genuinely stuck/zero-capacity pool still errors. + ensureActorBufferTimeout = 2 * time.Minute + ensureActorRetryInitial = 500 * time.Millisecond + ensureActorRetryMax = 4 * time.Second ) // SandboxAgentActorBackend manages ate-api actors for SandboxAgent workloads. type SandboxAgentActorBackend struct { client *Client + kube client.Client atenetRouterURL string } // NewSandboxAgentActorBackend returns a backend that ensures SandboxAgent actors on ate-api. -func NewSandboxAgentActorBackend(client *Client, atenetRouterURL string) *SandboxAgentActorBackend { +// kube is used to resolve the agent's current (config-hashed) ActorTemplate. 
+func NewSandboxAgentActorBackend(client *Client, kube client.Client, atenetRouterURL string) *SandboxAgentActorBackend { atenetRouterURL = strings.TrimSpace(atenetRouterURL) if atenetRouterURL == "" { atenetRouterURL = DefaultAtenetRouterURL } return &SandboxAgentActorBackend{ client: client, + kube: kube, atenetRouterURL: atenetRouterURL, } } // EnsureSessionActor creates and resumes the per-session actor for a SandboxAgent chat. +// +// During a config rollout the new golden's build can occupy the WorkerPool (especially a +// single-replica pool), so the underlying ResumeActor/CreateActor briefly returns "no free +// workers". Rather than failing the chat, this buffers the request: it retries with backoff +// (re-resolving the current template each pass, so once the new golden is Ready the request lands +// on it with the new config) until a worker frees, the caller's context is cancelled, or the +// buffer timeout elapses. Other errors are returned immediately. func (b *SandboxAgentActorBackend) EnsureSessionActor(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (sandboxbackend.EnsureResult, error) { if sa == nil { return sandboxbackend.EnsureResult{}, fmt.Errorf("SandboxAgent is required") @@ -47,8 +69,37 @@ func (b *SandboxAgentActorBackend) EnsureSessionActor(ctx context.Context, sa *v return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate actor backend called for platform %q", v1alpha2.AgentSandboxPlatform(sa)) } - actorID := SandboxAgentSessionActorID(sa, sessionID) - tmplNS, tmplName := sa.Namespace, SandboxAgentActorTemplateName(sa) + bufferDeadline := time.Now().Add(ensureActorBufferTimeout) + backoff := ensureActorRetryInitial + for { + res, err := b.ensureSessionActorOnce(ctx, sa, sessionID) + if err == nil || !isNoFreeWorkersError(err) { + return res, err + } + if time.Now().After(bufferDeadline) { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate worker capacity for %s/%s not available within %s: %w", sa.Namespace, 
sa.Name, ensureActorBufferTimeout, err) + } + timer := time.NewTimer(backoff) + select { + case <-ctx.Done(): + timer.Stop() + return sandboxbackend.EnsureResult{}, fmt.Errorf("waiting for substrate worker capacity for %s/%s: %w", sa.Namespace, sa.Name, ctx.Err()) + case <-timer.C: + } + if backoff *= 2; backoff > ensureActorRetryMax { + backoff = ensureActorRetryMax + } + } +} + +// ensureSessionActorOnce performs a single create/resume/reachability attempt for the session's +// actor against the currently-resolved template. +func (b *SandboxAgentActorBackend) ensureSessionActorOnce(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (sandboxbackend.EnsureResult, error) { + actorID, tmplName, err := b.sessionActorRef(ctx, sa, sessionID) + if err != nil { + return sandboxbackend.EnsureResult{}, err + } + tmplNS := sa.Namespace actor, err := b.client.GetActor(ctx, actorID) if err != nil { @@ -57,7 +108,7 @@ func (b *SandboxAgentActorBackend) EnsureSessionActor(ctx context.Context, sa *v } actor, err = b.client.CreateActor(ctx, actorID, tmplNS, tmplName) if err != nil { - return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate CreateActor %q: %w", actorID, err) + return sandboxbackend.EnsureResult{}, wrapCreateActorError(actorID, err) } } @@ -86,7 +137,10 @@ func (b *SandboxAgentActorBackend) SuspendSessionActor(ctx context.Context, sa * if b == nil || b.client == nil || sa == nil { return nil } - actorID := SandboxAgentSessionActorID(sa, sessionID) + actorID, _, err := b.sessionActorRef(ctx, sa, sessionID) + if err != nil { + return err + } actor, err := b.client.GetActor(ctx, actorID) if err != nil { if status.Code(err) == codes.NotFound { @@ -116,7 +170,64 @@ func (b *SandboxAgentActorBackend) DeleteSandboxAgentSessionActor(ctx context.Co if sa == nil { return true, nil } - return b.DeleteSandboxAgentActor(ctx, SandboxAgentSessionActorID(sa, sessionID)) + actorID, _, err := b.sessionActorRef(ctx, sa, sessionID) + if err != nil { + return false, 
err + } + return b.DeleteSandboxAgentActor(ctx, actorID) +} + +// sessionActorRef resolves the agent's current (config-hashed) ActorTemplate and returns the +// session actor id keyed to it plus the template name to create the actor from. Keying the +// id on the config hash means a config change yields a new actor id, so the next message +// creates a fresh actor from the new golden instead of resuming the stale one. +func (b *SandboxAgentActorBackend) sessionActorRef(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (actorID, templateName string, err error) { + tmpl, err := ResolveCurrentActorTemplate(ctx, b.kube, sa.Namespace, sa.Name) + if err != nil { + return "", "", err + } + if tmpl == nil { + return "", "", fmt.Errorf("no ActorTemplate generated yet for SandboxAgent %s/%s", sa.Namespace, sa.Name) + } + hash := tmpl.Annotations[SandboxConfigHashAnnotation] + return SandboxAgentSessionActorID(sa, hash, sessionID), tmpl.Name, nil +} + +// ReapStaleSessionActors deletes this agent's per-session actors that were created from a +// superseded ActorTemplate (before a config change). It only deletes SUSPENDED actors: a RUNNING +// one may be mid-request, and kagent's transport suspends session actors after each request, so a +// stale actor converges to SUSPENDED on its own — force-suspending it could cut a live response. +// With config-hashed actor ids these actors are never addressed again, so this is storage hygiene, +// not correctness, and is best-effort. (Superseded GOLDEN actors are handled by +// Lifecycle.RetireSupersededTemplates, which removes them with their template once a newer +// template is Ready.) Returns true when no stale suspended session actors remain. 
+func (b *SandboxAgentActorBackend) ReapStaleSessionActors(ctx context.Context, sa *v1alpha2.SandboxAgent, activeTemplateName string) (bool, error) { + if b == nil || b.client == nil || sa == nil { + return true, nil + } + sessionPrefix := sandboxAgentActorPrefix(sa) + "-" + actors, err := b.client.ListActors(ctx) + if err != nil { + return false, fmt.Errorf("list substrate actors: %w", err) + } + allDone := true + for _, actor := range actors { + id := strings.TrimSpace(actor.GetActorId()) + if id == "" || !strings.HasPrefix(id, sessionPrefix) { + continue // not a session actor of this agent + } + if actor.GetActorTemplateName() == activeTemplateName { + continue // belongs to the active template + } + done, err := deleteActorIfSuspended(ctx, b.client, id) + if err != nil { + return false, fmt.Errorf("delete stale session actor %q: %w", id, err) + } + if !done { + allDone = false + } + } + return allDone, nil } // DeleteAllSandboxAgentActors deletes legacy per-agent actors and all session actors for a SandboxAgent. @@ -153,14 +264,21 @@ func sandboxAgentActorPrefix(sa *v1alpha2.SandboxAgent) string { return SandboxAgentActorID(sa) } -// SandboxAgentSessionActorID returns a stable ate-api actor id for a SandboxAgent chat session. -func SandboxAgentSessionActorID(sa *v1alpha2.SandboxAgent, sessionID string) string { - raw := fmt.Sprintf("%s-%s", sandboxAgentActorPrefix(sa), sanitizeSessionID(sessionID)) +// SandboxAgentSessionActorID returns the ate-api actor id for a SandboxAgent chat session at a +// given config hash. The hash segment ties the actor to a specific golden snapshot: a config +// change produces a new id, so the next message creates a fresh actor instead of resuming the +// stale one. The id keeps the agent prefix (sandboxAgentActorPrefix(sa) plus "-") so per-agent cleanup still matches.
+func SandboxAgentSessionActorID(sa *v1alpha2.SandboxAgent, configHash, sessionID string) string { + hashSeg := "" + if configHash != "" { + hashSeg = configHash + "-" + } + raw := fmt.Sprintf("%s-%s%s", sandboxAgentActorPrefix(sa), hashSeg, sanitizeSessionID(sessionID)) raw = strings.ToLower(strings.ReplaceAll(raw, "_", "-")) if len(raw) <= 63 && dns1123Label.MatchString(raw) { return raw } - sum := sha256.Sum256([]byte(sa.Namespace + "/" + sa.Name + "/" + sessionID)) + sum := sha256.Sum256([]byte(sa.Namespace + "/" + sa.Name + "/" + configHash + "/" + sessionID)) return fmt.Sprintf("%s-%x", sandboxAgentIDPrefix, sum[:12]) } diff --git a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go index d77d97c0c6..456dadd760 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go @@ -2,6 +2,7 @@ package substrate import ( "fmt" + "strconv" "strings" atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" @@ -15,11 +16,34 @@ import ( // buildSandboxAgentActorTemplate is invoked from the translator via AgentsBackend.BuildSandbox. const ( - sandboxAgentIDPrefix = "asr" - defaultKagentContainer = "kagent" - SandboxAgentLabelKey = "kagent.dev/sandbox-agent" - defaultGoEntrypoint = "/app" + sandboxAgentIDPrefix = "asr" + defaultKagentContainer = "kagent" + SandboxAgentLabelKey = "kagent.dev/sandbox-agent" + defaultGoEntrypoint = "/app" + // defaultPythonEntrypoint is the absolute path to the kagent-adk console script in the + // Python ADK image venv. Substrate copies Command verbatim into the OCI Process.Args with + // no PATH/entrypoint fallback, so the path must be explicit and kept in sync with the + // Python Dockerfile's UV_PROJECT_ENVIRONMENT (/.kagent/.venv). 
+ defaultPythonEntrypoint = "/.kagent/.venv/bin/kagent-adk" substrateKagentListenPort int32 = 80 + // pythonRuntimeLibPath / pythonVenvPath mirror the Python ADK image layout + // (python/Dockerfile): bundled shared libs live on LD_LIBRARY_PATH and the project + // venv at UV_PROJECT_ENVIRONMENT. Substrate ignores the image's ENV directives (see + // pythonRuntimeImageEnv), so these are re-supplied via the ActorTemplate env. + pythonRuntimeLibPath = "/usr/lib/kagent-libs" + pythonVenvPath = "/.kagent/.venv" + + // SandboxConfigHashAnnotation carries the rendered-config hash on the generated + // ActorTemplate. It mirrors the translator's "kagent.dev/config-hash" pod-template + // annotation (keep in sync). A golden snapshot is an immutable memory image, so a + // config change must produce a NEW ActorTemplate (substrate snapshots once and + // no-ops in PhaseReady); folding the hash into the template name does that, and the + // annotation lets the chat path/reaper key session actors to the active template. + SandboxConfigHashAnnotation = "kagent.dev/config-hash" + + // sandboxAgentTemplateNameMaxBase reserves room in the 63-char DNS-1123 budget for + // the "-" separator and hash suffix (hash is up to 16 hex chars). + sandboxAgentTemplateNameMaxBase = 46 ) func (p *Lifecycle) buildSandboxAgentActorTemplate( @@ -35,13 +59,27 @@ func (p *Lifecycle) buildSandboxAgentActorTemplate( if err != nil { return nil, err } - command, containerEnv := buildSubstrateKagentContainerCommand(sa) + command, containerEnv, err := buildSubstrateKagentContainerCommand(sa, kagentContainer) + if err != nil { + return nil, err + } + + // The config hash is computed by the translator and stamped on the pod template. + // Folding it into the ActorTemplate name makes a config change create a new template + // (and thus a fresh golden snapshot) instead of mutating one substrate will never + // re-snapshot. The annotation carries the same hash for the chat path and reaper.
+ configHash := shortConfigHash(podTemplate.Annotations[SandboxConfigHashAnnotation]) + annotations := map[string]string{} + if configHash != "" { + annotations[SandboxConfigHashAnnotation] = configHash + } desired := &atev1alpha1.ActorTemplate{ ObjectMeta: metav1.ObjectMeta{ - Name: SandboxAgentActorTemplateName(sa), - Namespace: sa.Namespace, - Labels: sandboxAgentLifecycleLabels(sa), + Name: sandboxAgentActorTemplateName(sa, configHash), + Namespace: sa.Namespace, + Labels: sandboxAgentLifecycleLabels(sa), + Annotations: annotations, }, Spec: atev1alpha1.ActorTemplateSpec{ PauseImage: p.Defaults.PauseImage, @@ -76,31 +114,83 @@ func findKagentContainer(containers []corev1.Container) *corev1.Container { return nil } -// buildSubstrateKagentContainerCommand returns an ActorTemplate command for Substrate. -// Substrate runs Command directly (no shell). Config is materialized from secret-backed -// env vars at startup via MaterializeFromEnv in the Go ADK entrypoint. -func buildSubstrateKagentContainerCommand(sa *v1alpha2.SandboxAgent) ([]string, []corev1.EnvVar) { +// buildSubstrateKagentContainerCommand returns the ActorTemplate command and the prepended +// env for a SandboxAgent on Substrate. Substrate runs Command directly (no shell) and copies +// it verbatim into the OCI Process.Args with no PATH/entrypoint fallback, so the command must +// be fully explicit. +// +// For declarative agents the command is the runtime ADK entrypoint and config is materialized +// from secret-backed env vars at startup (Go: MaterializeFromEnv in the Go ADK; Python: the +// `static` command materializes the same env vars before reading /config). For BYO agents the +// user-provided container Command/Args are used verbatim; the BYO image must serve A2A on the +// substrate listen port (80). 
+func buildSubstrateKagentContainerCommand(sa *v1alpha2.SandboxAgent, container *corev1.Container) ([]string, []corev1.EnvVar, error) { // KAGENT_NAME / KAGENT_NAMESPACE are normally injected by the translator pod // template, but KAGENT_NAMESPACE uses a Downward API fieldRef which Substrate // ActorTemplates do not support (it gets dropped by sanitizeActorTemplateEnvVar). - // Without it the Go ADK derives a wrong app name, and the controller rejects + // Without it the ADK derives a wrong app name, and the controller rejects // session callbacks with "Session does not belong to this agent". Set both as // literals here; they are prepended before the pod env so they win deduplication. env := []corev1.EnvVar{ {Name: "KAGENT_NAME", Value: sa.Name}, {Name: "KAGENT_NAMESPACE", Value: sa.Namespace}, } + + spec := sa.GetAgentSpec() + if spec != nil && spec.Type == v1alpha2.AgentType_BYO { + // BYO: use the explicit container command + args verbatim. Validation + // (ValidateSubstrateSandboxAgentSpec) guarantees a command is set. + if len(container.Command) == 0 { + return nil, nil, fmt.Errorf("BYO substrate agent %q is missing an explicit container command", sa.Name) + } + cmd := append([]string{}, container.Command...) + cmd = append(cmd, container.Args...) + return cmd, env, nil + } + + // Declarative: secret-backed config is materialized at startup. + runtime := v1alpha2.EffectiveDeclarativeRuntimeForAgent(sa) env = append(env, kagentAgentSecretEnv(sa)...) - return buildSubstrateGoKagentCommand(), env + if runtime == v1alpha2.DeclarativeRuntime_Python { + env = append(env, pythonRuntimeImageEnv()...) + } + return buildSubstrateDeclarativeCommand(runtime), env, nil +} + +// pythonRuntimeImageEnv returns the runtime-critical ENV directives baked into the Python +// ADK image (python/Dockerfile). 
Substrate builds the OCI Process.Env from a hardcoded PATH +// plus the ActorTemplate env only — it does NOT apply the image's ENV directives (the same +// way it ignores the image entrypoint). Without LD_LIBRARY_PATH the standalone interpreter +// cannot locate its bundled shared libraries (libz, libsqlite3, ...) and crashes on import +// (e.g. numpy: "ImportError: libz.so.1: cannot open shared object file"); the failed startup +// then surfaces as a gVisor "inconsistent private memory files on restore" error because the +// golden snapshot captures only the pause container. The Go static binary needs none of this. +// Keep in sync with the final-stage ENV block of python/Dockerfile. +func pythonRuntimeImageEnv() []corev1.EnvVar { + return []corev1.EnvVar{ + {Name: "LD_LIBRARY_PATH", Value: pythonRuntimeLibPath}, + {Name: "VIRTUAL_ENV", Value: pythonVenvPath}, + {Name: "PYTHONUNBUFFERED", Value: "1"}, + {Name: "LANG", Value: "C.UTF-8"}, + {Name: "LC_ALL", Value: "C.UTF-8"}, + } } -// buildSubstrateGoKagentCommand returns the explicit command for the declarative -// Go ADK image. Substrate's atelet copies Command verbatim into the OCI spec's -// Process.Args with no fallback to the image entrypoint, so an empty command -// makes `runsc create` fail with "Spec.Process.Arg must be defined". BYO agents -// are rejected for the substrate platform by validation, so only the declarative -// entrypoint is needed here. -func buildSubstrateGoKagentCommand() []string { +// buildSubstrateDeclarativeCommand returns the explicit command for a declarative ADK image. +// Substrate's atelet copies Command verbatim into the OCI spec's Process.Args with no fallback +// to the image entrypoint, so an empty command makes `runsc create` fail with +// "Spec.Process.Arg must be defined". 
+func buildSubstrateDeclarativeCommand(runtime v1alpha2.DeclarativeRuntime) []string { + if runtime == v1alpha2.DeclarativeRuntime_Python { + // The Python ADK `static` command reads config.json/agent-card.json from its + // --filepath (default /config), which the materialization step populates from + // the secret-backed env vars before the server starts. + return []string{ + defaultPythonEntrypoint, "static", + "--host", "0.0.0.0", + "--port", fmt.Sprintf("%d", substrateKagentListenPort), + } + } return []string{ defaultGoEntrypoint, "--host", "0.0.0.0", @@ -141,11 +231,35 @@ func sandboxAgentLifecycleLabels(sa *v1alpha2.SandboxAgent) map[string]string { } } -// SandboxAgentActorTemplateName is the generated ActorTemplate name for a SandboxAgent. -func SandboxAgentActorTemplateName(sa *v1alpha2.SandboxAgent) string { +// sandboxAgentActorTemplateBaseName is the stable name prefix for a SandboxAgent's +// ActorTemplate(s), independent of config. Used as the truncation base for hashed names. +func sandboxAgentActorTemplateBaseName(sa *v1alpha2.SandboxAgent) string { return truncateDNS1123(sa.Name) } +// sandboxAgentActorTemplateName is the generated ActorTemplate name for a SandboxAgent at a +// given config hash. The hash suffix makes each distinct config a distinct template (and +// golden). When the hash is empty (no config materialized) it falls back to the stable base +// name. Consumers must NOT assume this name — they resolve the live template via +// ResolveCurrentActorTemplate, since the hash depends on rendered config they don't have. +func sandboxAgentActorTemplateName(sa *v1alpha2.SandboxAgent, configHash string) string { + if configHash == "" { + return sandboxAgentActorTemplateBaseName(sa) + } + base := truncateDNS1123To(sa.Name, sandboxAgentTemplateNameMaxBase) + return fmt.Sprintf("%s-%s", base, configHash) +} + +// shortConfigHash converts the translator's decimal config-hash annotation into a short, +// DNS-1123-safe hex token (≤16 chars). 
Returns "" when the annotation is absent/unparseable. +func shortConfigHash(annotationValue string) string { + v, err := strconv.ParseUint(strings.TrimSpace(annotationValue), 10, 64) + if err != nil { + return "" + } + return fmt.Sprintf("%x", v) +} + func sandboxAgentSnapshotsLocation(sa *v1alpha2.SandboxAgent) string { if sa == nil { return substrateSnapshotsLocationFor("", "", "") diff --git a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go index adaafb53ad..045850812b 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go @@ -3,9 +3,15 @@ package substrate import ( "testing" + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestActorTemplateEnvFromPodEnv(t *testing.T) { @@ -43,39 +49,210 @@ func TestActorTemplateEnvFromPodEnv(t *testing.T) { require.NotNil(t, got[1].ValueFrom.SecretKeyRef) } -func TestBuildSubstrateGoKagentCommand(t *testing.T) { +func TestBuildSubstrateDeclarativeCommand(t *testing.T) { t.Parallel() // Substrate's atelet copies Command verbatim into the OCI Process.Args with - // no image-entrypoint fallback, so the declarative Go command must be explicit. - require.Equal(t, []string{"/app", "--host", "0.0.0.0", "--port", "80"}, buildSubstrateGoKagentCommand()) + // no image-entrypoint fallback, so the declarative command must be explicit. 
+ require.Equal(t, + []string{"/app", "--host", "0.0.0.0", "--port", "80"}, + buildSubstrateDeclarativeCommand(v1alpha2.DeclarativeRuntime_Go), + ) + require.Equal(t, + []string{"/.kagent/.venv/bin/kagent-adk", "static", "--host", "0.0.0.0", "--port", "80"}, + buildSubstrateDeclarativeCommand(v1alpha2.DeclarativeRuntime_Python), + ) } -func TestBuildSubstrateKagentContainerCommand(t *testing.T) { +func declarativeSandboxAgent(runtime v1alpha2.DeclarativeRuntime) *v1alpha2.SandboxAgent { + sa := &v1alpha2.SandboxAgent{ + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: runtime}, + }, + }, + } + sa.Name = "my-agent" + sa.Namespace = "kagent" + return sa +} + +func TestBuildSubstrateKagentContainerCommandDeclarative(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + name string + runtime v1alpha2.DeclarativeRuntime + wantCmd []string + wantLibEnv bool // Python needs LD_LIBRARY_PATH re-supplied (substrate drops image ENV); Go does not. + }{ + {"go", v1alpha2.DeclarativeRuntime_Go, []string{"/app", "--host", "0.0.0.0", "--port", "80"}, false}, + {"python", v1alpha2.DeclarativeRuntime_Python, []string{"/.kagent/.venv/bin/kagent-adk", "static", "--host", "0.0.0.0", "--port", "80"}, true}, + } { + t.Run(tc.name, func(t *testing.T) { + sa := declarativeSandboxAgent(tc.runtime) + cmd, env, err := buildSubstrateKagentContainerCommand(sa, &corev1.Container{}) + require.NoError(t, err) + require.Equal(t, tc.wantCmd, cmd) + + // KAGENT_NAME / KAGENT_NAMESPACE must be literal values so the ADK can + // derive the correct app name (fieldRef env vars are dropped on Substrate). 
+ envByName := map[string]string{} + for _, e := range env { + envByName[e.Name] = e.Value + } + require.Equal(t, "my-agent", envByName["KAGENT_NAME"]) + require.Equal(t, "kagent", envByName["KAGENT_NAMESPACE"]) + + // Substrate ignores the image's ENV, so the Python runtime image's + // LD_LIBRARY_PATH must be re-supplied or numpy fails to load libz.so.1. + if tc.wantLibEnv { + require.Equal(t, pythonRuntimeLibPath, envByName["LD_LIBRARY_PATH"]) + require.Equal(t, "1", envByName["PYTHONUNBUFFERED"]) + } else { + _, ok := envByName["LD_LIBRARY_PATH"] + require.False(t, ok, "Go declarative must not carry the Python runtime ENV") + } + }) + } +} + +func TestBuildSubstrateKagentContainerCommandBYO(t *testing.T) { t.Parallel() + cmd := "/serve" sa := &v1alpha2.SandboxAgent{ Spec: v1alpha2.SandboxAgentSpec{ AgentSpec: v1alpha2.AgentSpec{ - Type: v1alpha2.AgentType_Declarative, - Declarative: &v1alpha2.DeclarativeAgentSpec{ - Runtime: v1alpha2.DeclarativeRuntime_Go, - }, + Type: v1alpha2.AgentType_BYO, + BYO: &v1alpha2.BYOAgentSpec{Deployment: &v1alpha2.ByoDeploymentSpec{Image: "example/agent:latest", Cmd: &cmd}}, }, }, } - sa.Name = "my-agent" + sa.Name = "byo-agent" sa.Namespace = "kagent" - cmd, env := buildSubstrateKagentContainerCommand(sa) - require.Equal(t, []string{"/app", "--host", "0.0.0.0", "--port", "80"}, cmd) + + container := &corev1.Container{Command: []string{"/serve"}, Args: []string{"--host", "0.0.0.0", "--port", "80"}} + got, env, err := buildSubstrateKagentContainerCommand(sa, container) + require.NoError(t, err) + // BYO uses the container command + args verbatim. + require.Equal(t, []string{"/serve", "--host", "0.0.0.0", "--port", "80"}, got) require.NotEmpty(t, env) - // KAGENT_NAME / KAGENT_NAMESPACE must be literal values so the Go ADK can - // derive the correct app name (fieldRef env vars are dropped on Substrate). - envByName := map[string]string{} + // A BYO agent missing an explicit command is rejected. 
+ _, _, err = buildSubstrateKagentContainerCommand(sa, &corev1.Container{}) + require.Error(t, err) +} + +func newTestLifecycle(t *testing.T) *Lifecycle { + t.Helper() + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + return &Lifecycle{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Defaults: LifecycleDefaults{ + PauseImage: "gcr.io/test/pause@sha256:deadbeef", + }, + } +} + +// actorEnvNames flattens an ActorTemplate env list into name->present for assertions. +func actorEnvNames(env []atev1alpha1.EnvVar) map[string]bool { + out := map[string]bool{} for _, e := range env { - envByName[e.Name] = e.Value + out[e.Name] = true + } + return out +} + +// TestBuildSandboxAgentActorTemplate exercises the full ActorTemplate generation for each +// supported runtime/type on substrate (Go declarative, Python declarative, BYO), asserting the +// pinned image, the explicit command, and the env wiring side by side. +func TestBuildSandboxAgentActorTemplate(t *testing.T) { + t.Parallel() + + const pinnedImage = "registry.example/kagent-dev/kagent/app@sha256:1111111111111111111111111111111111111111111111111111111111111111" + cmd := "/serve" + wpKey := types.NamespacedName{Namespace: "kagent", Name: "kagent-default"} + + podTemplateFor := func(container corev1.Container) corev1.PodTemplateSpec { + container.Name = defaultKagentContainer + container.Image = pinnedImage + return corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{container}}} + } + + for _, tc := range []struct { + name string + sa *v1alpha2.SandboxAgent + container corev1.Container + wantCommand []string + // declarative agents carry secret-backed config env; BYO does not. + wantConfigEnv bool + // Python declarative re-supplies the image's LD_LIBRARY_PATH (substrate drops image ENV).
+ wantLibEnv bool + }{ + { + name: "go declarative", + sa: &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "go-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + Platform: v1alpha2.SandboxPlatformSubstrate, + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Go}}, + }, + }, + container: corev1.Container{Args: []string{"--host", "0.0.0.0", "--port", "8080", "--filepath", "/config"}}, + wantCommand: []string{"/app", "--host", "0.0.0.0", "--port", "80"}, + wantConfigEnv: true, + wantLibEnv: false, + }, + { + name: "python declarative", + sa: &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "py-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + Platform: v1alpha2.SandboxPlatformSubstrate, + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Python}}, + }, + }, + container: corev1.Container{Args: []string{"--host", "0.0.0.0", "--port", "8080", "--filepath", "/config"}}, + wantCommand: []string{"/.kagent/.venv/bin/kagent-adk", "static", "--host", "0.0.0.0", "--port", "80"}, + wantConfigEnv: true, + wantLibEnv: true, + }, + { + name: "byo", + sa: &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "byo-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + Platform: v1alpha2.SandboxPlatformSubstrate, + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_BYO, BYO: &v1alpha2.BYOAgentSpec{Deployment: &v1alpha2.ByoDeploymentSpec{Image: pinnedImage, Cmd: &cmd}}}, + }, + }, + container: corev1.Container{Command: []string{"/serve"}, Args: []string{"--host", "0.0.0.0", "--port", "80"}}, + wantCommand: []string{"/serve", "--host", "0.0.0.0", "--port", "80"}, + wantConfigEnv: false, + wantLibEnv: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + p := newTestLifecycle(t) + tmpl, err 
:= p.buildSandboxAgentActorTemplate(tc.sa, wpKey, podTemplateFor(tc.container)) + require.NoError(t, err) + + require.Len(t, tmpl.Spec.Containers, 1) + c := tmpl.Spec.Containers[0] + require.Equal(t, pinnedImage, c.Image, "ActorTemplate must use the digest-pinned image") + require.Equal(t, tc.wantCommand, c.Command) + require.Equal(t, wpKey.Name, tmpl.Spec.WorkerPoolRef.Name) + + names := actorEnvNames(c.Env) + require.True(t, names["KAGENT_NAME"], "KAGENT_NAME must be a literal env var") + require.True(t, names["KAGENT_NAMESPACE"], "KAGENT_NAMESPACE must be a literal env var") + require.Equal(t, tc.wantConfigEnv, names["KAGENT_CONFIG_JSON"], "declarative agents materialize config from secret env; BYO does not") + require.Equal(t, tc.wantLibEnv, names["LD_LIBRARY_PATH"], "Python declarative re-supplies the image LD_LIBRARY_PATH that substrate drops") + }) } - require.Equal(t, "my-agent", envByName["KAGENT_NAME"]) - require.Equal(t, "kagent", envByName["KAGENT_NAMESPACE"]) } diff --git a/go/core/pkg/sandboxbackend/substrate/agents_backend.go b/go/core/pkg/sandboxbackend/substrate/agents_backend.go index e0e557f250..33b5a020d6 100644 --- a/go/core/pkg/sandboxbackend/substrate/agents_backend.go +++ b/go/core/pkg/sandboxbackend/substrate/agents_backend.go @@ -30,8 +30,14 @@ func (b *AgentsBackend) GetOwnedResourceTypes() []client.Object { return []client.Object{&atev1alpha1.ActorTemplate{}} } +// OwnedResourceTypesFor returns no types: substrate ActorTemplates are intentionally excluded +// from the reconciler's generic prune so a config change does not delete the currently-serving +// template. Their lifecycle is managed explicitly (blue-green) in the SandboxAgent controller — +// the old template is retired only after the new golden is Ready. ActorTemplate remains in +// GetOwnedResourceTypes for watches, and owner-reference GC still removes all templates when the +// SandboxAgent itself is deleted. 
func (b *AgentsBackend) OwnedResourceTypesFor(_ v1alpha2.AgentObject) ([]client.Object, error) { - return b.GetOwnedResourceTypes(), nil + return nil, nil } func (b *AgentsBackend) BuildSandbox(ctx context.Context, in sandboxbackend.BuildInput) ([]client.Object, error) { @@ -71,12 +77,14 @@ func (b *AgentsBackend) ComputeReady(ctx context.Context, cl client.Client, nn t if b.Lifecycle == nil { return metav1.ConditionUnknown, "SubstrateLifecycleNotConfigured", "substrate lifecycle is not configured" } - tmplKey := types.NamespacedName{Namespace: nn.Namespace, Name: SandboxAgentActorTemplateName(sa)} - ready, err := b.Lifecycle.actorTemplateReady(ctx, tmplKey) + tmpl, err := ResolveCurrentActorTemplate(ctx, cl, nn.Namespace, sa.Name) if err != nil { - return metav1.ConditionUnknown, "ActorTemplateGetFailed", err.Error() + return metav1.ConditionUnknown, "ActorTemplateListFailed", err.Error() } - if !ready { + if tmpl == nil { + return metav1.ConditionFalse, "ActorTemplateNotFound", "ActorTemplate has not been generated yet" + } + if tmpl.Status.Phase != atev1alpha1.PhaseReady { return metav1.ConditionFalse, "ActorTemplateNotReady", "ActorTemplate golden snapshot is not ready" } return metav1.ConditionTrue, "ActorTemplateReady", "ActorTemplate golden snapshot is ready" diff --git a/go/core/pkg/sandboxbackend/substrate/bluegreen.go b/go/core/pkg/sandboxbackend/substrate/bluegreen.go new file mode 100644 index 0000000000..7adcdccd25 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/bluegreen.go @@ -0,0 +1,107 @@ +package substrate + +import ( + "context" + "fmt" + + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" + "github.com/agent-substrate/substrate/pkg/proto/ateapipb" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +// RetireSupersededTemplates implements the blue-green half of config-change rollout: it deletes +// a 
SandboxAgent's older ActorTemplates (and their golden actors) once a newer template is +// serving, so the previous golden keeps answering traffic until the new one is Ready. +// +// It keeps two templates: the newest (the desired/just-applied one, possibly still building) and +// the active one (newest with a Ready golden, which chat resolves to). Every other template is +// retired — but ONLY if its own golden is already Suspended (Phase==Ready). A template whose +// golden is still building is left alone: deleting it would orphan a RUNNING golden that can +// never be suspended afterwards (it would permanently pin a worker). The golden actor is deleted +// before the template object so we never leave a golden without its template. +// +// Performs at most one mutating ate-api step per superseded golden per call; returns done==false +// to be requeued until all superseded templates are gone. +func (p *Lifecycle) RetireSupersededTemplates(ctx context.Context, sa *v1alpha2.SandboxAgent) (bool, error) { + if sa == nil || p == nil || p.Client == nil { + return true, nil + } + templates, err := listSandboxAgentActorTemplates(ctx, p.Client, sa.Namespace, sa.Name) + if err != nil { + return false, err + } + if len(templates) <= 1 { + return true, nil + } + + var newest, active *atev1alpha1.ActorTemplate + for _, t := range templates { + if newest == nil || t.CreationTimestamp.After(newest.CreationTimestamp.Time) { + newest = t + } + if t.Status.Phase == atev1alpha1.PhaseReady { + if active == nil || t.CreationTimestamp.After(active.CreationTimestamp.Time) { + active = t + } + } + } + + done := true + for _, t := range templates { + if t.Name == newest.Name || (active != nil && t.Name == active.Name) { + continue // keep the desired template and the one currently serving + } + if t.Status.Phase != atev1alpha1.PhaseReady { + // Golden still building — retiring it now would orphan a RUNNING golden that can't + // be suspended once its template is gone. 
Let it reach Ready, then retire next pass. + done = false + continue + } + // Golden is Suspended (Ready) — delete it first, then the template object. + if goldenID := t.Status.GoldenActorID; goldenID != "" { + gone, err := deleteGoldenActor(ctx, p.AteClient, goldenID) + if err != nil { + return false, fmt.Errorf("delete superseded golden %q for template %s: %w", goldenID, t.Name, err) + } + if !gone { + done = false + continue + } + } + if err := p.Client.Delete(ctx, t); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete superseded ActorTemplate %s: %w", t.Name, err) + } + } + return done, nil +} + +// deleteActorIfSuspended deletes an actor only when it is Suspended (idle). RUNNING/RESUMING +// actors are left untouched — for substrate, RUNNING means an actor is resumed on a worker, and +// kagent's transport suspends a session actor after each request completes. Force-suspending a +// RUNNING actor could cut a live response mid-stream, so we wait for it to quiesce instead. +// Returns done==true when the actor no longer exists (or never did). +func deleteActorIfSuspended(ctx context.Context, c *Client, actorID string) (bool, error) { + if actorID == "" || c == nil { + return true, nil + } + actor, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + return true, nil + } + return false, fmt.Errorf("get actor %q: %w", actorID, err) + } + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + if err := c.DeleteActor(ctx, actorID); err != nil && status.Code(err) != codes.NotFound { + return false, fmt.Errorf("delete actor %q: %w", actorID, err) + } + return true, nil + default: + // RUNNING / RESUMING / SUSPENDING — actively (or transitionally) in use; skip. 
+ return false, nil + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/config_hash_test.go b/go/core/pkg/sandboxbackend/substrate/config_hash_test.go new file mode 100644 index 0000000000..687f06bcf0 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/config_hash_test.go @@ -0,0 +1,165 @@ +package substrate + +import ( + "context" + "strings" + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestShortConfigHash(t *testing.T) { + t.Parallel() + // Matches the translator's decimal uint64 annotation; rendered as hex. + require.Equal(t, "ff", shortConfigHash("255")) + require.Equal(t, "", shortConfigHash("")) + require.Equal(t, "", shortConfigHash("not-a-number")) + require.NotEqual(t, shortConfigHash("100"), shortConfigHash("101")) +} + +func TestSandboxAgentActorTemplateNameWithHash(t *testing.T) { + t.Parallel() + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + + // Distinct configs → distinct template names → distinct golden snapshots. + n1 := sandboxAgentActorTemplateName(sa, "abc123") + n2 := sandboxAgentActorTemplateName(sa, "def456") + require.Equal(t, "my-agent-abc123", n1) + require.NotEqual(t, n1, n2) + require.LessOrEqual(t, len(n1), 63) + + // Empty hash falls back to the stable base name (preserves prior behavior). + require.Equal(t, "my-agent", sandboxAgentActorTemplateName(sa, "")) + + // Long agent names stay within the DNS-1123 budget once the hash suffix is added. 
+ long := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: strings.Repeat("a", 80)}} + require.LessOrEqual(t, len(sandboxAgentActorTemplateName(long, "deadbeefdeadbeef")), 63) +} + +func TestSandboxAgentSessionActorIDVariesWithHash(t *testing.T) { + t.Parallel() + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + + id1 := SandboxAgentSessionActorID(sa, "abc123", "sess-1") + id2 := SandboxAgentSessionActorID(sa, "def456", "sess-1") + require.NotEqual(t, id1, id2, "config change must yield a new actor id so a fresh actor is created") + + // Same hash + session is stable so repeated messages resume the warm actor. + require.Equal(t, id1, SandboxAgentSessionActorID(sa, "abc123", "sess-1")) + + // Keeps the per-agent prefix so DeleteAll / reaping still match by prefix. + prefix := sandboxAgentActorPrefix(sa) + require.True(t, strings.HasPrefix(id1, prefix+"-")) +} + +func TestBuildActorTemplateStampsConfigHash(t *testing.T) { + t.Parallel() + p := newTestLifecycle(t) + sa := &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "py-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + Platform: v1alpha2.SandboxPlatformSubstrate, + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Python}}, + }, + } + pod := corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{SandboxConfigHashAnnotation: "255"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{ + Name: defaultKagentContainer, + Image: "registry.example/app@sha256:1111111111111111111111111111111111111111111111111111111111111111", + }}}, + } + wpKey := types.NamespacedName{Namespace: "kagent", Name: "kagent-default"} + tmpl, err := p.buildSandboxAgentActorTemplate(sa, wpKey, pod) + require.NoError(t, err) + require.Equal(t, "py-agent-ff", tmpl.Name, "template name must carry the config-hash suffix") + 
require.Equal(t, "ff", tmpl.Annotations[SandboxConfigHashAnnotation]) +} + +func TestResolveCurrentActorTemplate(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + // Old template is Ready (serving); newer one is still building. Blue-green: serve the old + // Ready golden until the new is Ready, so the resolver must prefer the Ready one even though + // it's older. + oldReady := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "my-agent-old", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "my-agent"}, + CreationTimestamp: metav1.Unix(100, 0), + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseReady}} + newerBuilding := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "my-agent-new", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "my-agent"}, + CreationTimestamp: metav1.Unix(200, 0), + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseResumeGoldenActor}} + other := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "other-agent", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "other-agent"}, + }} + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(oldReady, newerBuilding, other).Build() + + got, err := ResolveCurrentActorTemplate(context.Background(), cl, "kagent", "my-agent") + require.NoError(t, err) + require.NotNil(t, got) + require.Equal(t, "my-agent-old", got.Name, "must prefer the newest READY template (no downtime during rebuild)") + + none, err := ResolveCurrentActorTemplate(context.Background(), cl, "kagent", "absent") + require.NoError(t, err) + require.Nil(t, none) + + // When none is Ready yet (first build), fall back to the newest. 
+ firstBuild := fake.NewClientBuilder().WithScheme(scheme).WithObjects(newerBuilding).Build() + got, err = ResolveCurrentActorTemplate(context.Background(), firstBuild, "kagent", "my-agent") + require.NoError(t, err) + require.NotNil(t, got) + require.Equal(t, "my-agent-new", got.Name) +} + +func TestRetireSupersededTemplates(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + lbl := map[string]string{SandboxAgentLabelKey: "my-agent"} + mk := func(name string, created int64, phase atev1alpha1.PhaseType) *atev1alpha1.ActorTemplate { + return &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "kagent", Labels: lbl, CreationTimestamp: metav1.Unix(created, 0)}, + Status: atev1alpha1.ActorTemplateStatus{Phase: phase}, // GoldenActorID empty → no ate-api call + } + } + oldReady := mk("my-agent-old", 100, atev1alpha1.PhaseReady) // superseded → retire + activeReady := mk("my-agent-active", 150, atev1alpha1.PhaseReady) // newest Ready → keep (serving) + newestBuilding := mk("my-agent-new", 200, atev1alpha1.PhaseResumeGoldenActor) // desired/building → keep + + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(oldReady, activeReady, newestBuilding).Build() + p := &Lifecycle{Client: cl} + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + + // The superseded oldReady (Ready, golden Suspended) is retired now; the serving activeReady and + // the building newest are kept. No retirement is pending (the building one is kept, not retired), + // so done==true — the controller's ActorTemplate watch re-triggers retirement of activeReady once + // newest becomes Ready. 
+ done, err := p.RetireSupersededTemplates(context.Background(), sa) + require.NoError(t, err) + require.True(t, done, "no retirement pending: oldReady removed, active+building kept") + + remaining, err := listSandboxAgentActorTemplates(context.Background(), cl, "kagent", "my-agent") + require.NoError(t, err) + names := map[string]bool{} + for _, t := range remaining { + names[t.Name] = true + } + require.False(t, names["my-agent-old"], "superseded Ready template must be retired") + require.True(t, names["my-agent-active"], "newest Ready (serving) template must be kept") + require.True(t, names["my-agent-new"], "newest (building) template must be kept") +} diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go index 0c274e6a5c..16f47d8b6b 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go @@ -9,6 +9,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" ) // CleanupGeneratedTemplate removes external Substrate actors that Kubernetes garbage collection cannot see. @@ -72,13 +73,29 @@ func (p *Lifecycle) CleanupSandboxAgentTemplate(ctx context.Context, sa *v1alpha if sa == nil || p == nil || p.Client == nil { return true, nil } - tmplKey := types.NamespacedName{Namespace: sa.Namespace, Name: SandboxAgentActorTemplateName(sa)} - goldenID, err := p.goldenActorID(ctx, tmplKey) - if err != nil { - return false, err + // A SandboxAgent may have multiple generated ActorTemplates in flight (a config change + // creates a new hashed template before the old one is pruned). Clean the golden actor of + // every template carrying the agent's lifecycle label. 
+ list := &atev1alpha1.ActorTemplateList{} + if err := p.Client.List(ctx, list, + client.InNamespace(sa.Namespace), + client.MatchingLabels{SandboxAgentLabelKey: sa.Name}, + ); err != nil { + return false, fmt.Errorf("list ActorTemplates for %s/%s: %w", sa.Namespace, sa.Name, err) } - if goldenID == "" { - return true, nil + allDone := true + for i := range list.Items { + goldenID := strings.TrimSpace(list.Items[i].Status.GoldenActorID) + if goldenID == "" { + continue + } + done, err := deleteGoldenActor(ctx, p.AteClient, goldenID) + if err != nil { + return false, fmt.Errorf("delete golden actor %q: %w", goldenID, err) + } + if !done { + allDone = false + } } - return deleteGoldenActor(ctx, p.AteClient, goldenID) + return allDone, nil } diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go index 3489cf7a4d..6236a23616 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go @@ -133,13 +133,67 @@ func actorTemplateName(ah *v1alpha2.AgentHarness) string { } func truncateDNS1123(s string) string { + return truncateDNS1123To(s, 63) +} + +func truncateDNS1123To(s string, max int) string { s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) - if len(s) > 63 { - s = strings.TrimRight(s[:63], "-") + if len(s) > max { + s = strings.TrimRight(s[:max], "-") } return s } +// ResolveCurrentActorTemplate returns the ActorTemplate a SandboxAgent should currently serve +// from: the newest non-terminating template whose golden snapshot is Ready. This is the +// blue-green pivot — during a config change the new template builds while this keeps returning +// the previous Ready template, so chat and readiness stay on the working golden with no downtime +// and flip atomically once the new golden is Ready. Falls back to the newest template when none +// is Ready yet (the very first build). Returns (nil, nil) when no template exists. 
+func ResolveCurrentActorTemplate(ctx context.Context, kube client.Client, namespace, agentName string) (*atev1alpha1.ActorTemplate, error) { + templates, err := listSandboxAgentActorTemplates(ctx, kube, namespace, agentName) + if err != nil { + return nil, err + } + var newestReady, newest *atev1alpha1.ActorTemplate + for i := range templates { + t := templates[i] + if newest == nil || t.CreationTimestamp.After(newest.CreationTimestamp.Time) { + newest = t + } + if t.Status.Phase == atev1alpha1.PhaseReady { + if newestReady == nil || t.CreationTimestamp.After(newestReady.CreationTimestamp.Time) { + newestReady = t + } + } + } + if newestReady != nil { + return newestReady, nil + } + return newest, nil +} + +// listSandboxAgentActorTemplates returns the non-terminating generated ActorTemplates for an agent. +func listSandboxAgentActorTemplates(ctx context.Context, kube client.Client, namespace, agentName string) ([]*atev1alpha1.ActorTemplate, error) { + if kube == nil { + return nil, fmt.Errorf("kubernetes client is required") + } + list := &atev1alpha1.ActorTemplateList{} + if err := kube.List(ctx, list, + client.InNamespace(namespace), + client.MatchingLabels{SandboxAgentLabelKey: agentName}, + ); err != nil { + return nil, fmt.Errorf("list ActorTemplates for %s/%s: %w", namespace, agentName, err) + } + out := make([]*atev1alpha1.ActorTemplate, 0, len(list.Items)) + for i := range list.Items { + if list.Items[i].DeletionTimestamp.IsZero() { + out = append(out, &list.Items[i]) + } + } + return out, nil +} + // pinImageRef ensures image refs satisfy Substrate ActorTemplate validation (must contain "@"). 
func pinImageRef(image string) (string, error) { image = strings.TrimSpace(image) diff --git a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml index 3f8f594b50..1fe25e28c7 100644 --- a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml +++ b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml @@ -11339,8 +11339,6 @@ spec: rule: '!has(self.skills) || self.platform != ''substrate''' - message: spec.substrate may only be set when spec.platform is substrate rule: '!has(self.substrate) || self.platform == ''substrate''' - - message: BYO agents are not supported when spec.platform is substrate - rule: '!has(self.type) || self.type != ''BYO'' || self.platform != ''substrate''' - message: type must be specified rule: has(self.type) - message: type must be either Declarative or BYO diff --git a/python/Dockerfile b/python/Dockerfile index 03b9bf492f..06a792f428 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -1,140 +1,101 @@ -### STAGE 1: base image -ARG BASE_IMAGE_REGISTRY=cgr.dev +### STAGE 1: uv binary ARG UV_VERSION=0.11.15 FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv-bin -FROM $BASE_IMAGE_REGISTRY/chainguard/wolfi-base:latest AS base-os -# Build arg to control SSL verification (set DISABLE_SSL_VERIFY=1 to skip SSL checks) -ARG DISABLE_SSL_VERIFY=0 +### STAGE 2: builder +# Build the uv-managed standalone Python interpreter and the project venv on a full base +# (debian-slim, digest-pinnable). Nothing from this stage ships except /python and the venv. 
+FROM debian:12-slim AS builder +ARG TOOLS_PYTHON_VERSION=3.13 ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 - -# Install packages with conditional SSL verification -# When DISABLE_SSL_VERIFY=1, use --no-check-certificate to bypass SSL checks (development only) -RUN --mount=type=cache,target=/var/cache/apk,rw \ - if [ "$DISABLE_SSL_VERIFY" = "1" ]; then \ - echo "WARNING: Disabling SSL verification for apk (development only)"; \ - apk update --no-check-certificate && apk add --no-check-certificate \ - curl openssl bash git ca-certificates libstdc++; \ - else \ - apk update && apk add \ - curl openssl bash git ca-certificates libstdc++; \ - fi - -# Install uv from upstream so we control the version and pick up rkyv fixes -# independently of the Wolfi apk release cadence. -COPY --from=uv-bin /uv /uvx /usr/local/bin/ - -### STAGE 2: python -FROM base-os AS python-os -ARG TOOLS_PYTHON_VERSION=3.13 - ENV PYTHONOPTIMIZE=2 ENV PYTHONUNBUFFERED=1 -# Optimize malloc for containerized Python workloads -# 256KB threshold balances memory efficiency with performance -ENV MALLOC_TRIM_THRESHOLD_=262144 -ENV MALLOC_ARENA_MAX=2 - -ENV GIT_LFS_SKIP_SMUDGE=1 - +# uv configuration: install a managed standalone Python under /python and a copy-mode venv +# (no editable installs) so the final image is self-contained with no source tree dependency. 
ENV UV_LINK_MODE=copy ENV UV_COMPILE_BYTECODE=1 -ENV UV_COMPILE_BYTECODE_TIMEOUT=300 -ENV UV_SYSTEM_PYTHON=1 ENV UV_NO_PROGRESS=1 ENV UV_HTTP_TIMEOUT=60 -ENV UV_CONCURRENT_DOWNLOADS=10 - -# Configure the Python directories ENV UV_CACHE_DIR=/.kagent/cache/packages -ENV UV_TOOL_DIR=/.kagent/cache/tools ENV UV_PYTHON_DOWNLOADS_DIR=/.kagent/cache/downloads ENV UV_PROJECT_ENVIRONMENT=/.kagent/.venv - ENV UV_PYTHON_INSTALL_DIR=/python ENV UV_PYTHON_PREFERENCE=only-managed -RUN addgroup -g 1001 pythongroup && \ - adduser -u 1001 -G pythongroup -s /bin/bash -D python -h /.kagent/ && \ - mkdir -p $UV_PYTHON_DOWNLOADS_DIR && \ - mkdir -p $UV_TOOL_DIR && \ - mkdir -p $UV_CACHE_DIR && \ - mkdir -p /python && \ - chown -vR 1001:1001 /.kagent /python - -# Install anthropic sandbox runtime and dependencies -RUN --mount=type=cache,target=/var/cache/apk,rw \ - apk add \ - nodejs npm node-gyp bubblewrap socat ripgrep - -# Install sandbox runtime from a specific commit of the GitHub repo without using global prefix -# This avoids scope-related rename issues in global node_modules -# Using BuildKit cache for npm to speed up rebuilds -# Keep the pinned sandbox-runtime revision, but replace its vulnerable locked lodash-es version. -RUN --mount=type=cache,target=/root/.npm \ - mkdir -p /opt && \ - cd /opt && \ - git clone --depth 1 --revision=ef4afdef4d711ba21a507d7f7369e305f7d3dbfa https://github.com/anthropic-experimental/sandbox-runtime.git && \ - cd sandbox-runtime && \ - npm install --save-exact lodash-es@4.18.1 @types/lodash-es@4.17.12 && \ - npm install --save-exact brace-expansion@5.0.6 && \ - npm run build && \ - # CVE-2026-26996: all minimatch instances (3.1.2, 9.0.5) are transitive dev - # deps (eslint, typescript-eslint). Prune dev deps after build to remove them. 
- npm prune --omit=dev && \ - npm install -g --ignore-scripts - -# Ensure the sandbox runtime binaries are on PATH -ENV PATH="/opt/sandbox-runtime/node_modules/.bin:$PATH" - -USER python -WORKDIR /.kagent +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates \ + && rm -rf /var/lib/apt/lists/* -### STAGE 3: final -FROM python-os AS builder -ARG TOOLS_PYTHON_VERSION +COPY --from=uv-bin /uv /uvx /usr/local/bin/ WORKDIR /.kagent -ENV PATH=$PATH:/.kagent/bin:/.kagent/.venv/bin - # Copy dependency files first for better layer caching -COPY --chown=python:pythongroup pyproject.toml . -COPY --chown=python:pythongroup .python-version . -COPY --chown=python:pythongroup uv.lock . -COPY --chown=python:pythongroup packages/kagent-adk packages/kagent-adk -COPY --chown=python:pythongroup packages/kagent-core packages/kagent-core -COPY --chown=python:pythongroup packages/kagent-skills packages/kagent-skills -COPY --chown=python:pythongroup packages/agentsts-adk packages/agentsts-adk -COPY --chown=python:pythongroup packages/agentsts-core packages/agentsts-core -COPY --chown=python:pythongroup README.md . +COPY pyproject.toml . +COPY .python-version . +COPY uv.lock . +COPY README.md . +COPY packages/kagent-adk packages/kagent-adk +COPY packages/kagent-core packages/kagent-core +COPY packages/kagent-skills packages/kagent-skills +COPY packages/agentsts-adk packages/agentsts-adk +COPY packages/agentsts-core packages/agentsts-core ARG VERSION -# Install dependencies - make sure /.kagent/.venv/bin in path and not in cache mount -RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ +# Create the venv and install kagent-adk. --no-editable copies the workspace packages into +# site-packages so the runtime does not need the source tree. +RUN --mount=type=cache,target=/.kagent/cache,rw \ echo "Creating virtual environment and installing dependencies..." 
\ && uv venv --python=python$TOOLS_PYTHON_VERSION \ - && uv lock && uv sync --package kagent-adk \ + && uv lock && uv sync --package kagent-adk --no-editable \ && uv cache prune \ && echo "Installation complete." -# Create a separate venv for bash tool commands (sandbox environment) -# This venv does not have pip installed -RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ - echo "Creating bash tool sandbox environment..." \ - && mkdir -p /.kagent/sandbox-venv \ - && uv venv --python=python$TOOLS_PYTHON_VERSION /.kagent/sandbox-venv \ - && echo "Bash tool sandbox environment created." +# Pre-create the config dir owned by the runtime user. On Agent Substrate the config is +# materialized into /config at startup (the env-injected path); distroless runs as nonroot and +# cannot create top-level dirs, so it must exist with the right owner ahead of time. On the +# normal Deployment path /config is overlaid by the mounted Secret volume, so this is harmless. +RUN mkdir -p /staging/config + +### STAGE 3: final (distroless) +# distroless/cc provides glibc + libstdc++ (required by the standalone CPython build) but no +# shell or package manager. Agents that need in-container code execution / bash tools use the +# "full" image (python/Dockerfile.full) instead. +FROM gcr.io/distroless/cc-debian12:nonroot +ARG VERSION +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PYTHONUNBUFFERED=1 ENV PATH="/.kagent/.venv/bin:$PATH" -ENV UV_PROJECT_ENVIRONMENT=/app/.venv -ENV BASH_VENV_PATH=/.kagent/sandbox-venv ENV VIRTUAL_ENV=/.kagent/.venv +# The standalone interpreter and the venv (the venv's python is linked to /python). +COPY --from=builder /python /python +COPY --from=builder /.kagent/.venv /.kagent/.venv +# Writable config dir for substrate config materialization (see builder stage). 
+COPY --from=builder --chown=65532:65532 /staging/config /config + +# The standalone CPython build and numpy's C-extensions dynamically link a handful of system +# libraries that distroless/cc does not ship (zlib, bz2, lzma, ffi, sqlite). Copy them from the +# builder into an arch-agnostic dir on LD_LIBRARY_PATH. The *-linux-gnu glob matches the single +# multiarch dir present for the target arch (amd64/arm64). +COPY --from=builder /usr/lib/*-linux-gnu/libz.so.1 /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/libbz2.so.1* /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/liblzma.so.5* /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/libffi.so.8* /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/libsqlite3.so.0* /usr/lib/kagent-libs/ +ENV LD_LIBRARY_PATH=/usr/lib/kagent-libs + WORKDIR /app +USER 65532:65532 + +LABEL org.opencontainers.image.source=https://github.com/kagent-dev/kagent +LABEL org.opencontainers.image.description="Kagent ADK Python runtime (distroless, no sandbox runtime)." +LABEL org.opencontainers.image.version="$VERSION" -ENTRYPOINT ["kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] +ENTRYPOINT ["/.kagent/.venv/bin/kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] diff --git a/python/Dockerfile.app b/python/Dockerfile.app index 8cbfac077c..4d5c5f8042 100644 --- a/python/Dockerfile.app +++ b/python/Dockerfile.app @@ -1,6 +1,9 @@ ARG KAGENT_ADK_VERSION=latest ARG DOCKER_REGISTRY=ghcr.io ARG DOCKER_REPO=kagent-dev/kagent +# The base runtime is selected by tag: KAGENT_ADK_VERSION=<version> for the distroless slim +# base, or <version>-full for the base that includes the sandbox runtime. The app image just +# overrides the entrypoint to serve declarative agents. 
FROM $DOCKER_REGISTRY/$DOCKER_REPO/kagent-adk:$KAGENT_ADK_VERSION # Offline mode @@ -15,4 +18,4 @@ LABEL org.opencontainers.image.authors="Kagent Creators 🤖" LABEL org.opencontainers.image.version="$VERSION" ENTRYPOINT ["kagent-adk", "static"] -CMD ["--host", "0.0.0.0", "--port", "8080"] \ No newline at end of file +CMD ["--host", "0.0.0.0", "--port", "8080"] diff --git a/python/Dockerfile.full b/python/Dockerfile.full new file mode 100644 index 0000000000..6129676445 --- /dev/null +++ b/python/Dockerfile.full @@ -0,0 +1,116 @@ +# Full Python ADK runtime image: includes the Anthropic sandbox-runtime (node, bubblewrap, +# socat, ripgrep) and a bash tool venv for agents that execute code / run shell tools. Unlike +# python/Dockerfile (distroless slim), this image needs a shell and package tooling, so it is +# built on a digest-pinnable debian-slim base rather than distroless. The controller selects +# this image (PythonADKFullImageDigest) for declarative agents that need SRT (skills, +# executeCodeBlocks) and for sandboxed BYO agents. + +### STAGE 1: uv binary +ARG UV_VERSION=0.11.15 +FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv-bin + +### STAGE 2: base os + sandbox runtime +# node:20-bookworm-slim is a digest-pinnable debian-bookworm base that ships Node 20 (the +# sandbox-runtime requires node >= 20; debian's own nodejs package is still on 18). 
+FROM node:20-bookworm-slim AS python-os +ARG TOOLS_PYTHON_VERSION=3.13 + +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PYTHONOPTIMIZE=2 +ENV PYTHONUNBUFFERED=1 + +# Optimize malloc for containerized Python workloads +ENV MALLOC_TRIM_THRESHOLD_=262144 +ENV MALLOC_ARENA_MAX=2 +ENV GIT_LFS_SKIP_SMUDGE=1 + +ENV UV_LINK_MODE=copy +ENV UV_COMPILE_BYTECODE=1 +ENV UV_NO_PROGRESS=1 +ENV UV_HTTP_TIMEOUT=60 +ENV UV_CACHE_DIR=/.kagent/cache/packages +ENV UV_TOOL_DIR=/.kagent/cache/tools +ENV UV_PYTHON_DOWNLOADS_DIR=/.kagent/cache/downloads +ENV UV_PROJECT_ENVIRONMENT=/.kagent/.venv +ENV UV_PYTHON_INSTALL_DIR=/python +ENV UV_PYTHON_PREFERENCE=only-managed + +# node/npm come from the base image; add the remaining runtime tooling. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl openssl bash git ca-certificates \ + bubblewrap socat ripgrep \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=uv-bin /uv /uvx /usr/local/bin/ + +RUN groupadd -g 1001 pythongroup \ + && useradd -u 1001 -g pythongroup -s /bin/bash -d /.kagent -m python \ + && mkdir -p $UV_PYTHON_DOWNLOADS_DIR $UV_TOOL_DIR $UV_CACHE_DIR /python \ + && chown -R 1001:1001 /.kagent /python + +# Install the Anthropic sandbox runtime from a pinned revision. Replace its vulnerable locked +# transitive deps, build, then prune dev deps (matches the previous Chainguard-based image). +# Fetch the pinned revision via init+fetch (debian's git predates `git clone --revision`). +# GitHub allows fetching an arbitrary commit SHA directly. 
+RUN --mount=type=cache,target=/root/.npm \ + mkdir -p /opt/sandbox-runtime && cd /opt/sandbox-runtime \ + && git init -q \ + && git remote add origin https://github.com/anthropic-experimental/sandbox-runtime.git \ + && git fetch --depth 1 origin ef4afdef4d711ba21a507d7f7369e305f7d3dbfa \ + && git checkout -q FETCH_HEAD \ + && npm pkg delete scripts.prepare \ + && npm install --ignore-scripts --save-exact lodash-es@4.18.1 @types/lodash-es@4.17.12 \ + && npm install --ignore-scripts --save-exact brace-expansion@5.0.6 \ + && npm run build \ + && npm prune --omit=dev \ + && npm install -g --ignore-scripts + +ENV PATH="/opt/sandbox-runtime/node_modules/.bin:$PATH" + +USER python +WORKDIR /.kagent + +### STAGE 3: final (install project) +FROM python-os AS builder +ARG TOOLS_PYTHON_VERSION + +WORKDIR /.kagent +ENV PATH=$PATH:/.kagent/bin:/.kagent/.venv/bin + +COPY --chown=python:pythongroup pyproject.toml . +COPY --chown=python:pythongroup .python-version . +COPY --chown=python:pythongroup uv.lock . +COPY --chown=python:pythongroup packages/kagent-adk packages/kagent-adk +COPY --chown=python:pythongroup packages/kagent-core packages/kagent-core +COPY --chown=python:pythongroup packages/kagent-skills packages/kagent-skills +COPY --chown=python:pythongroup packages/agentsts-adk packages/agentsts-adk +COPY --chown=python:pythongroup packages/agentsts-core packages/agentsts-core +COPY --chown=python:pythongroup README.md . + +ARG VERSION + +RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ + echo "Creating virtual environment and installing dependencies..." \ + && uv venv --python=python$TOOLS_PYTHON_VERSION \ + && uv lock && uv sync --package kagent-adk \ + && uv cache prune \ + && echo "Installation complete." + +# Separate venv for bash tool commands (sandbox environment); no pip installed. 
+RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ + mkdir -p /.kagent/sandbox-venv \ + && uv venv --python=python$TOOLS_PYTHON_VERSION /.kagent/sandbox-venv + +ENV PATH="/.kagent/.venv/bin:$PATH" +ENV BASH_VENV_PATH=/.kagent/sandbox-venv +ENV VIRTUAL_ENV=/.kagent/.venv + +LABEL org.opencontainers.image.source=https://github.com/kagent-dev/kagent +LABEL org.opencontainers.image.description="Kagent ADK Python runtime (full: includes sandbox runtime)." +LABEL org.opencontainers.image.version="$VERSION" + +WORKDIR /app + +ENTRYPOINT ["kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] diff --git a/python/packages/kagent-adk/src/kagent/adk/_config_materialize.py b/python/packages/kagent-adk/src/kagent/adk/_config_materialize.py new file mode 100644 index 0000000000..eca766eaf3 --- /dev/null +++ b/python/packages/kagent-adk/src/kagent/adk/_config_materialize.py @@ -0,0 +1,51 @@ +"""Materialize Agent Substrate secret-backed configuration from environment variables. + +On Agent Substrate the ActorTemplate cannot mount the agent config as files; instead the +config is injected as secret-backed environment variables and the running process must write +them to the on-disk paths the ADK loads from at startup. This mirrors the Go ADK's +``MaterializeFromEnv`` (see ``go/adk/pkg/config/config_materialize.go``): the environment value +is written verbatim (raw, not base64-encoded) to the destination file. + +When the environment variables are absent (the normal Kubernetes Deployment path, where the +config is mounted as a volume) this is a no-op. +""" + +import logging +import os + +logger = logging.getLogger(__name__) + +# Environment variables injected by the substrate ActorTemplate, keyed to the file name the +# ADK loads from within the config directory. 
+_ENV_TO_CONFIG_FILE = { + "KAGENT_CONFIG_JSON": "config.json", + "KAGENT_AGENT_CARD_JSON": "agent-card.json", + "KAGENT_SRT_SETTINGS_JSON": "srt-settings.json", +} + +# The bearer token is materialized to a fixed path outside the config dir, matching the Go ADK. +_KAGENT_TOKEN_ENV = "KAGENT_TOKEN" +_KAGENT_TOKEN_PATH = "/var/run/secrets/tokens/kagent-token" + + +def _materialize_env_to_file(env_key: str, path: str) -> bool: + """Write the raw value of ``env_key`` to ``path`` (0600). Returns True if written.""" + value = os.getenv(env_key, "").strip() + if not value: + return False + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as f: + f.write(value) + os.chmod(path, 0o600) + return True + + +def materialize_from_env(config_dir: str) -> None: + """Write substrate secret-backed env vars to the paths the ADK loads from. + + No-op for any variable that is unset, so the volume-mounted Deployment path is unaffected. + """ + for env_key, filename in _ENV_TO_CONFIG_FILE.items(): + if _materialize_env_to_file(env_key, os.path.join(config_dir, filename)): + logger.info("Materialized %s from %s", filename, env_key) + _materialize_env_to_file(_KAGENT_TOKEN_ENV, _KAGENT_TOKEN_PATH) diff --git a/python/packages/kagent-adk/src/kagent/adk/cli.py b/python/packages/kagent-adk/src/kagent/adk/cli.py index e32d0aacbf..9c32d19f2d 100644 --- a/python/packages/kagent-adk/src/kagent/adk/cli.py +++ b/python/packages/kagent-adk/src/kagent/adk/cli.py @@ -14,6 +14,7 @@ from kagent.core import KAgentConfig, configure_logging, configure_tracing from . import AgentConfig, KAgentApp +from ._config_materialize import materialize_from_env from .tools import add_skills_tool_to_agent logger = logging.getLogger(__name__) @@ -60,6 +61,10 @@ def static( ): app_cfg = KAgentConfig() + # On Agent Substrate the config is injected as secret-backed env vars rather than mounted + # files; materialize them into `filepath` before loading. No-op on the Deployment path. 
+ materialize_from_env(filepath) + with open(os.path.join(filepath, "config.json"), "r") as f: config = json.load(f) agent_config = AgentConfig.model_validate(config) diff --git a/python/packages/kagent-adk/tests/unittests/test_config_materialize.py b/python/packages/kagent-adk/tests/unittests/test_config_materialize.py new file mode 100644 index 0000000000..0707a610ee --- /dev/null +++ b/python/packages/kagent-adk/tests/unittests/test_config_materialize.py @@ -0,0 +1,55 @@ +import os + +import pytest + +from kagent.adk._config_materialize import materialize_from_env + + +def test_materializes_present_env_vars(tmp_path, monkeypatch): + monkeypatch.setenv("KAGENT_CONFIG_JSON", '{"model": {"type": "openai"}}') + monkeypatch.setenv("KAGENT_AGENT_CARD_JSON", '{"name": "test"}') + monkeypatch.setenv("KAGENT_SRT_SETTINGS_JSON", '{"network": {}}') + monkeypatch.delenv("KAGENT_TOKEN", raising=False) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) + + assert (config_dir / "config.json").read_text() == '{"model": {"type": "openai"}}' + assert (config_dir / "agent-card.json").read_text() == '{"name": "test"}' + assert (config_dir / "srt-settings.json").read_text() == '{"network": {}}' + # Written with 0600 permissions, matching the Go ADK. + assert oct(os.stat(config_dir / "config.json").st_mode & 0o777) == "0o600" + + +def test_noop_when_env_absent(tmp_path, monkeypatch): + for key in ("KAGENT_CONFIG_JSON", "KAGENT_AGENT_CARD_JSON", "KAGENT_SRT_SETTINGS_JSON", "KAGENT_TOKEN"): + monkeypatch.delenv(key, raising=False) + + config_dir = tmp_path / "config" + # Should not raise and should not create the directory/files. 
+ materialize_from_env(str(config_dir)) + + assert not (config_dir / "config.json").exists() + + +def test_blank_env_is_skipped(tmp_path, monkeypatch): + monkeypatch.setenv("KAGENT_CONFIG_JSON", " ") + monkeypatch.delenv("KAGENT_AGENT_CARD_JSON", raising=False) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) + + assert not (config_dir / "config.json").exists() + + +def test_partial_env_only_writes_present(tmp_path, monkeypatch): + monkeypatch.setenv("KAGENT_CONFIG_JSON", "{}") + monkeypatch.delenv("KAGENT_AGENT_CARD_JSON", raising=False) + monkeypatch.delenv("KAGENT_SRT_SETTINGS_JSON", raising=False) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) + + assert (config_dir / "config.json").exists() + assert not (config_dir / "agent-card.json").exists() + assert not (config_dir / "srt-settings.json").exists() diff --git a/scripts/controller-digest-ldflags.sh b/scripts/controller-digest-ldflags.sh index 7f34640a78..f94371ba16 100755 --- a/scripts/controller-digest-ldflags.sh +++ b/scripts/controller-digest-ldflags.sh @@ -2,7 +2,8 @@ # Emit -X ldflags for agent runtime image digests baked into the controller binary. 
# # Required environment variables: -# APP_IMG Python agent runtime image ref (repo:tag) +# APP_IMG Python agent runtime image ref (repo:tag) - distroless slim +# APP_FULL_IMG Python agent full runtime image ref (repo:tag) - includes sandbox runtime # GOLANG_ADK_IMG Go agent runtime image ref (repo:tag) # GOLANG_ADK_FULL_IMG Go agent full runtime image ref (repo:tag) # @@ -17,6 +18,7 @@ TRANSLATOR_PKG="github.com/kagent-dev/kagent/go/core/internal/controller/transla MANIFEST_ACCEPT="application/vnd.oci.image.index.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.v2+json" : "${APP_IMG:?APP_IMG is required}" +: "${APP_FULL_IMG:?APP_FULL_IMG is required}" : "${GOLANG_ADK_IMG:?GOLANG_ADK_IMG is required}" : "${GOLANG_ADK_FULL_IMG:?GOLANG_ADK_FULL_IMG is required}" @@ -96,5 +98,6 @@ append_digest_ldflag() { } append_digest_ldflag "PythonADKImageDigest" "${APP_IMG}" +append_digest_ldflag "PythonADKFullImageDigest" "${APP_FULL_IMG}" append_digest_ldflag "GoADKImageDigest" "${GOLANG_ADK_IMG}" append_digest_ldflag "GoADKFullImageDigest" "${GOLANG_ADK_FULL_IMG}" diff --git a/ui/src/app/agents/new/page.tsx b/ui/src/app/agents/new/page.tsx index 6146277fb7..cb3785730f 100644 --- a/ui/src/app/agents/new/page.tsx +++ b/ui/src/app/agents/new/page.tsx @@ -176,7 +176,9 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo const useDeclarativeAgentFields = formUsesDeclarativeSections(state.agentType); const substrateSandboxAgent = state.runInSandbox && state.sandboxPlatform === "substrate"; - const showDeclarativeRuntimeField = useDeclarativeAgentFields && !substrateSandboxAgent; + // Substrate now supports both Python and Go declarative runtimes, so the runtime selector is + // shown for declarative agents regardless of platform. 
+ const showDeclarativeRuntimeField = useDeclarativeAgentFields; const showByoFields = formUsesByoSections(state.agentType); const showModelAndBehaviorSection = useDeclarativeAgentFields; const skillsEnabled = @@ -253,9 +255,6 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo agentResponse.workloadMode === "sandbox" ? sandboxFieldsFromApiSpec(agent.spec?.platform, agent.spec?.substrate) : {}; - const isSubstrateSandbox = - agentResponse.workloadMode === "sandbox" && - agent.spec?.platform === "substrate"; const useDeclarativeForm = agent.spec.type === "Declarative"; if (useDeclarativeForm) { const decl = agent.spec?.declarative; @@ -287,11 +286,8 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo : [newEmptyGitSkillRow()], skillsGitAuthSecretName: agent.spec?.skills?.gitAuthSecretRef?.name || "", stream: decl?.stream ?? false, - declarativeRuntime: isSubstrateSandbox - ? "go" - : decl?.runtime === "go" - ? "go" - : "python", + // Honor the persisted runtime for all platforms (substrate supports Python and Go). + declarativeRuntime: decl?.runtime === "go" ? "go" : "python", selectedMemoryModel: memoryModelConfig ? { ref: memoryModelConfig, spec: { model: memorySpec?.modelConfig || "", provider: "" } } : null, @@ -390,6 +386,12 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo const newErrors = validateAgentData(formData); + // BYO agents on substrate must set an explicit command: substrate copies the container + // command verbatim and does not fall back to the image entrypoint (mirrors the backend). 
+ if (state.agentType === "BYO" && substrateSandboxAgent && !state.byoCmd.trim()) { + newErrors.byoCmd = "Command is required for BYO agents on Agent Substrate"; + } + if (useDeclarativeAgentFields && skillsEnabled) { const skillsInput = { skillRefs: state.skillRefs || [], @@ -741,9 +743,10 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo Sandbox platform - Agent Substrate runs declarative agents as ate.dev actors using the Go ADK - runtime. Skills are not supported on substrate yet. A new substrate actor is started - for each chat session. + Agent Substrate runs declarative (Python or Go) and BYO agents as ate.dev + actors. BYO images must set an explicit command and serve A2A on port 80. + Skills are not supported on substrate yet. A new substrate actor is started for + each chat session. onByoCmdChange(e.target.value)} From 6f38b38bbbf2ed5631458e896fccf4f690072b1f Mon Sep 17 00:00:00 2001 From: JM Huibonhoa Date: Tue, 23 Jun 2026 12:28:47 -0400 Subject: [PATCH 11/12] fix: test agent CLBO Signed-off-by: JM Huibonhoa --- go/core/test/e2e/agents/kebab/Dockerfile | 4 +++- python/Dockerfile.app | 2 +- python/Dockerfile.full | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/go/core/test/e2e/agents/kebab/Dockerfile b/go/core/test/e2e/agents/kebab/Dockerfile index 37ec3b8301..337c1eb554 100644 --- a/go/core/test/e2e/agents/kebab/Dockerfile +++ b/go/core/test/e2e/agents/kebab/Dockerfile @@ -14,6 +14,8 @@ COPY README.md README.md COPY .python-version .python-version COPY uv.lock uv.lock -RUN uv sync --locked --refresh +# Install only the kebab package into the inherited ADK venv. The base image already provides +# google-adk/kagent-adk; syncing this child project's lockfile downgrades shared runtime deps. +RUN uv pip install --python /.kagent/.venv/bin/python --no-deps . 
CMD ["kebab"] \ No newline at end of file diff --git a/python/Dockerfile.app b/python/Dockerfile.app index 4d5c5f8042..b60503e419 100644 --- a/python/Dockerfile.app +++ b/python/Dockerfile.app @@ -17,5 +17,5 @@ LABEL org.opencontainers.image.description="Kagent app is the Kagent agent runti LABEL org.opencontainers.image.authors="Kagent Creators 🤖" LABEL org.opencontainers.image.version="$VERSION" -ENTRYPOINT ["kagent-adk", "static"] +ENTRYPOINT ["/.kagent/.venv/bin/kagent-adk", "static"] CMD ["--host", "0.0.0.0", "--port", "8080"] diff --git a/python/Dockerfile.full b/python/Dockerfile.full index 79c1978783..f853107948 100644 --- a/python/Dockerfile.full +++ b/python/Dockerfile.full @@ -116,4 +116,4 @@ LABEL org.opencontainers.image.version="$VERSION" WORKDIR /app -ENTRYPOINT ["kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] +ENTRYPOINT ["/.kagent/.venv/bin/kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] From 884e3c587de965ee2205d6f0b5e0484660789fb6 Mon Sep 17 00:00:00 2001 From: JM Huibonhoa Date: Wed, 24 Jun 2026 14:58:22 -0400 Subject: [PATCH 12/12] fix: ensure on HandleDeleteSession when DeleteSandboxAgentSessionActor is invoked that all actors for a given session are deleted and ensure cleanup matches actor by owning template Signed-off-by: JM Huibonhoa --- .../sandboxbackend/substrate/agent_actor.go | 81 ++++++++++++++++-- .../substrate/config_hash_test.go | 83 +++++++++++++++++++ 2 files changed, 159 insertions(+), 5 deletions(-) diff --git a/go/core/pkg/sandboxbackend/substrate/agent_actor.go b/go/core/pkg/sandboxbackend/substrate/agent_actor.go index 164ea14cd1..6af8836cc2 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_actor.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_actor.go @@ -130,16 +130,58 @@ func (b *SandboxAgentActorBackend) DeleteSandboxAgentActor(ctx context.Context, return deleteActor(ctx, b.client, actorID) } -// DeleteSandboxAgentSessionActor deletes the actor for a single chat session. 
+// DeleteSandboxAgentSessionActor deletes the actor(s) for a single chat session. Because the +// session actor id is keyed on the config hash and old templates/goldens are retained, a session +// can have actors under several hashes (one per config it was active under). Deleting only the +// current-hash actor would orphan the others, so this deletes the session's actor for every +// retained config hash. func (b *SandboxAgentActorBackend) DeleteSandboxAgentSessionActor(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (bool, error) { - if sa == nil { + if b == nil || b.client == nil || sa == nil { return true, nil } - actorID, _, err := b.sessionActorRef(ctx, sa, sessionID) + hashes, err := b.retainedSessionConfigHashes(ctx, sa) if err != nil { return false, err } - return b.DeleteSandboxAgentActor(ctx, actorID) + allDone := true + seen := make(map[string]struct{}, len(hashes)) + for _, hash := range hashes { + actorID := SandboxAgentSessionActorID(sa, hash, sessionID) + if _, ok := seen[actorID]; ok { + continue + } + seen[actorID] = struct{}{} + done, err := b.DeleteSandboxAgentActor(ctx, actorID) + if err != nil { + return false, err + } + if !done { + allDone = false + } + } + return allDone, nil +} + +// retainedSessionConfigHashes returns the distinct config-hash segments across the agent's +// retained ActorTemplates (plus "" for legacy/no-hash actors). These are the hashes a session's +// actor id could have been keyed on, mirroring sessionActorRef's per-template derivation. +func (b *SandboxAgentActorBackend) retainedSessionConfigHashes(ctx context.Context, sa *v1alpha2.SandboxAgent) ([]string, error) { + templates, err := listSandboxAgentActorTemplates(ctx, b.kube, sa.Namespace, sa.Name) + if err != nil { + return nil, err + } + // Always include "" so a session actor created before any config hash existed is still cleaned. 
+ hashes := []string{""} + seen := map[string]struct{}{"": {}} + for _, t := range templates { + hash := t.Annotations[consts.ConfigHashAnnotation] + if _, ok := seen[hash]; ok { + continue + } + seen[hash] = struct{}{} + hashes = append(hashes, hash) + } + return hashes, nil } // sessionActorRef resolves the agent's current (config-hashed) ActorTemplate and returns the @@ -164,6 +206,21 @@ func (b *SandboxAgentActorBackend) DeleteAllSandboxAgentActors(ctx context.Conte return true, nil } prefix := sandboxAgentActorPrefix(sa) + + // Build the set of ActorTemplates this agent owns (one per retained config hash). Session + // actors are created FROM these templates, so matching an actor's source template reliably + // identifies it even when its id falls back to the prefix-less asr- form (long agent + // name / session id), which id-prefix matching alone would miss. This runs before template + // cleanup in the delete path, so the templates are still present here. + templates, err := listSandboxAgentActorTemplates(ctx, b.kube, sa.Namespace, sa.Name) + if err != nil { + return false, err + } + ownedTemplates := make(map[string]struct{}, len(templates)) + for _, t := range templates { + ownedTemplates[t.Name] = struct{}{} + } + actors, err := b.client.ListActors(ctx) if err != nil { return false, fmt.Errorf("list substrate actors: %w", err) @@ -174,7 +231,7 @@ func (b *SandboxAgentActorBackend) DeleteAllSandboxAgentActors(ctx context.Conte if id == "" { continue } - if id != SandboxAgentActorID(sa) && !strings.HasPrefix(id, prefix+"-") { + if !actorBelongsToSandboxAgent(sa, actor, prefix, ownedTemplates) { continue } done, err := deleteActor(ctx, b.client, id) @@ -188,6 +245,20 @@ func (b *SandboxAgentActorBackend) DeleteAllSandboxAgentActors(ctx context.Conte return allDone, nil } +// actorBelongsToSandboxAgent reports whether an actor was created for this SandboxAgent. 
It matches +// on the actor's source ActorTemplate first (robust: survives the prefix-less asr- id +// fallback), then falls back to id-prefix matching as a backstop for orphaned actors whose +// template was already deleted. +func actorBelongsToSandboxAgent(sa *v1alpha2.SandboxAgent, actor *ateapipb.Actor, prefix string, ownedTemplates map[string]struct{}) bool { + if actor.GetActorTemplateNamespace() == sa.Namespace { + if _, ok := ownedTemplates[actor.GetActorTemplateName()]; ok { + return true + } + } + id := strings.TrimSpace(actor.GetActorId()) + return id == SandboxAgentActorID(sa) || strings.HasPrefix(id, prefix+"-") +} + func sandboxAgentActorPrefix(sa *v1alpha2.SandboxAgent) string { return SandboxAgentActorID(sa) } diff --git a/go/core/pkg/sandboxbackend/substrate/config_hash_test.go b/go/core/pkg/sandboxbackend/substrate/config_hash_test.go index 587a7d8b62..d2d06d7991 100644 --- a/go/core/pkg/sandboxbackend/substrate/config_hash_test.go +++ b/go/core/pkg/sandboxbackend/substrate/config_hash_test.go @@ -6,6 +6,7 @@ import ( "testing" atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" + "github.com/agent-substrate/substrate/pkg/proto/ateapipb" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/consts" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" @@ -217,3 +218,85 @@ func TestResolveCurrentActorTemplatePrefersDesiredGeneration(t *testing.T) { require.NoError(t, err) require.Equal(t, "agent-openai", got.Name, "while the desired golden builds, serve the most-recently-desired Ready template") } + +func TestActorBelongsToSandboxAgent(t *testing.T) { + t.Parallel() + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + prefix := sandboxAgentActorPrefix(sa) + owned := map[string]struct{}{"my-agent-abc123": {}, "my-agent": {}} + + tests := []struct { + name string + actor *ateapipb.Actor + want bool + }{ + { + // The case comment #2 flags: a long 
agent name / session id forces the prefix-less + // asr- fallback id, which id-prefix matching misses — but the owning template matches. + name: "prefix-less fallback id matched by owning template", + actor: &ateapipb.Actor{ActorId: sandboxAgentIDPrefix + "-deadbeefdeadbeefdeadbeef", ActorTemplateNamespace: "kagent", ActorTemplateName: "my-agent-abc123"}, + want: true, + }, + { + name: "normal session id matched by prefix", + actor: &ateapipb.Actor{ActorId: prefix + "-sess-1", ActorTemplateNamespace: "kagent", ActorTemplateName: "my-agent-abc123"}, + want: true, + }, + { + name: "legacy per-agent id matched exactly", + actor: &ateapipb.Actor{ActorId: SandboxAgentActorID(sa)}, + want: true, + }, + { + name: "orphan actor whose template was already deleted still matched by prefix", + actor: &ateapipb.Actor{ActorId: prefix + "-sess-2", ActorTemplateName: "gone"}, + want: true, + }, + { + name: "unrelated actor not matched", + actor: &ateapipb.Actor{ActorId: "asr-other-ns-other-agent-sess", ActorTemplateNamespace: "kagent", ActorTemplateName: "other-agent"}, + want: false, + }, + { + name: "same template name in a different namespace not matched", + actor: &ateapipb.Actor{ActorId: "asr-xyz", ActorTemplateNamespace: "elsewhere", ActorTemplateName: "my-agent-abc123"}, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.want, actorBelongsToSandboxAgent(sa, tt.actor, prefix, owned)) + }) + } +} + +func TestRetainedSessionConfigHashes(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + tmplA := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-abc123", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{consts.ConfigHashAnnotation: "abc123"}, + }} + tmplB := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-def456", Namespace: "kagent", + 
Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{consts.ConfigHashAnnotation: "def456"}, + }} + other := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "other", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "other-agent"}, + Annotations: map[string]string{consts.ConfigHashAnnotation: "zzz999"}, + }} + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tmplA, tmplB, other).Build() + + b := &SandboxAgentActorBackend{kube: cl} + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "kagent"}} + hashes, err := b.retainedSessionConfigHashes(context.Background(), sa) + require.NoError(t, err) + // "" is always included (legacy/no-hash actors), plus each retained template's hash; the other + // agent's template hash is excluded. + require.ElementsMatch(t, []string{"", "abc123", "def456"}, hashes) +}