diff --git a/Makefile b/Makefile index 22ab8f2459..13b7d3b7c4 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,9 @@ ACP_SANDBOX_OPENCLAW_IMAGE_NAME ?= acp-sandbox-openclaw CONTROLLER_IMAGE_TAG ?= $(VERSION) UI_IMAGE_TAG ?= $(VERSION) APP_IMAGE_TAG ?= $(VERSION) +APP_FULL_IMAGE_TAG ?= $(VERSION)-full KAGENT_ADK_IMAGE_TAG ?= $(VERSION) +KAGENT_ADK_FULL_IMAGE_TAG ?= $(VERSION)-full GOLANG_ADK_IMAGE_TAG ?= $(VERSION) GOLANG_ADK_FULL_IMAGE_TAG ?= $(VERSION)-full SKILLS_INIT_IMAGE_TAG ?= $(VERSION) @@ -71,7 +73,9 @@ ACP_SANDBOX_IMAGE_TAG ?= $(VERSION) CONTROLLER_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(CONTROLLER_IMAGE_NAME):$(CONTROLLER_IMAGE_TAG) UI_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(UI_IMAGE_NAME):$(UI_IMAGE_TAG) APP_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(APP_IMAGE_NAME):$(APP_IMAGE_TAG) +APP_FULL_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(APP_IMAGE_NAME):$(APP_FULL_IMAGE_TAG) KAGENT_ADK_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(KAGENT_ADK_IMAGE_NAME):$(KAGENT_ADK_IMAGE_TAG) +KAGENT_ADK_FULL_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(KAGENT_ADK_IMAGE_NAME):$(KAGENT_ADK_FULL_IMAGE_TAG) GOLANG_ADK_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(GOLANG_ADK_IMAGE_NAME):$(GOLANG_ADK_IMAGE_TAG) GOLANG_ADK_FULL_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(GOLANG_ADK_IMAGE_NAME):$(GOLANG_ADK_FULL_IMAGE_TAG) SKILLS_INIT_IMG ?= $(DOCKER_REGISTRY)/$(DOCKER_REPO)/$(SKILLS_INIT_IMAGE_NAME):$(SKILLS_INIT_IMAGE_TAG) @@ -208,12 +212,14 @@ build-all: buildx-create .PHONY: build build: ## Build and push all component images -build: buildx-create build-ui build-skills-init build-golang-adk build-golang-adk-full build-app build-controller +build: buildx-create build-ui build-skills-init build-golang-adk build-golang-adk-full build-app build-app-full build-controller @echo "Build completed successfully." 
@echo "Controller Image: $(CONTROLLER_IMG)" @echo "UI Image: $(UI_IMG)" @echo "App Image: $(APP_IMG)" + @echo "App Full Image: $(APP_FULL_IMG)" @echo "Kagent ADK Image: $(KAGENT_ADK_IMG)" + @echo "Kagent ADK Full Image: $(KAGENT_ADK_FULL_IMG)" @echo "Golang ADK Image: $(GOLANG_ADK_IMG)" @echo "Golang ADK Full Image: $(GOLANG_ADK_FULL_IMG)" @echo "Skills Init Image: $(SKILLS_INIT_IMG)" @@ -241,7 +247,9 @@ build-img-versions: ## Print the fully-qualified image tags for all components @echo controller=$(CONTROLLER_IMG) @echo ui=$(UI_IMG) @echo app=$(APP_IMG) + @echo app-full=$(APP_FULL_IMG) @echo kagent-adk=$(KAGENT_ADK_IMG) + @echo kagent-adk-full=$(KAGENT_ADK_FULL_IMG) @echo golang-adk=$(GOLANG_ADK_IMG) @echo golang-adk-full=$(GOLANG_ADK_FULL_IMG) @echo skills-init=$(SKILLS_INIT_IMG) @@ -256,10 +264,11 @@ controller-manifests: ## Regenerate CRD manifests and copy them into the Helm ch .PHONY: build-controller build-controller: ## Build and push the controller image (embeds agent runtime + acp-sandbox digests via scripts/controller-digest-ldflags.sh) -build-controller: buildx-create controller-manifests build-app build-golang-adk build-golang-adk-full build-acp-sandbox-openclaw build-acp-sandbox-hermes +build-controller: buildx-create controller-manifests build-app build-app-full build-golang-adk build-golang-adk-full build-acp-sandbox-openclaw build-acp-sandbox-hermes @set -e; \ DIGEST_LDFLAGS=$$(CONTAINER_RUNTIME=$(CONTAINER_RUNTIME) \ APP_IMG=$(APP_IMG) \ + APP_FULL_IMG=$(APP_FULL_IMG) \ GOLANG_ADK_IMG=$(GOLANG_ADK_IMG) \ GOLANG_ADK_FULL_IMG=$(GOLANG_ADK_FULL_IMG) \ ACP_SANDBOX_OPENCLAW_IMG=$(ACP_SANDBOX_OPENCLAW_IMG) \ @@ -284,11 +293,23 @@ build-kagent-adk: buildx-create $(DOCKER_PUSH) $(KAGENT_ADK_IMG) .PHONY: build-app -build-app: ## Build and push the app image (depends on kagent-adk) +build-app: ## Build and push the app image (distroless slim; depends on kagent-adk) build-app: buildx-create build-kagent-adk $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) 
$(TOOLS_IMAGE_BUILD_ARGS) --build-arg KAGENT_ADK_VERSION=$(KAGENT_ADK_IMAGE_TAG) --build-arg DOCKER_REGISTRY=$(DOCKER_REGISTRY) -t $(APP_IMG) -f python/Dockerfile.app ./python $(DOCKER_PUSH) $(APP_IMG) +.PHONY: build-kagent-adk-full +build-kagent-adk-full: ## Build and push the full Python kagent ADK image (includes sandbox runtime) +build-kagent-adk-full: buildx-create + $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) $(TOOLS_IMAGE_BUILD_ARGS) -t $(KAGENT_ADK_FULL_IMG) -f python/Dockerfile.full ./python + $(DOCKER_PUSH) $(KAGENT_ADK_FULL_IMG) + +.PHONY: build-app-full +build-app-full: ## Build and push the full app image (sandbox runtime; depends on kagent-adk-full) +build-app-full: buildx-create build-kagent-adk-full + $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) $(TOOLS_IMAGE_BUILD_ARGS) --build-arg KAGENT_ADK_VERSION=$(KAGENT_ADK_FULL_IMAGE_TAG) --build-arg DOCKER_REGISTRY=$(DOCKER_REGISTRY) -t $(APP_FULL_IMG) -f python/Dockerfile.app ./python + $(DOCKER_PUSH) $(APP_FULL_IMG) + .PHONY: build-golang-adk build-golang-adk: ## Build and push the Go ADK image build-golang-adk: buildx-create @@ -342,8 +363,8 @@ lint: ## Run linters for Go and Python make -C python lint .PHONY: push-test-agent -push-test-agent: buildx-create build-kagent-adk ## Build and push E2E test agent images to the local registry - echo "Building FROM DOCKER_REGISTRY=$(DOCKER_REGISTRY)/$(DOCKER_REPO)/kagent-adk:$(VERSION)" +push-test-agent: buildx-create build-kagent-adk build-kagent-adk-full ## Build and push E2E test agent images to the local registry + echo "Building FROM DOCKER_REGISTRY=$(DOCKER_REGISTRY)/$(DOCKER_REPO)/kagent-adk:$(VERSION)-full" $(DOCKER_BUILDER) $(DOCKER_BUILD_ARGS) $(TOOLS_IMAGE_BUILD_ARGS) -t $(DOCKER_REGISTRY)/kebab:latest -f go/core/test/e2e/agents/kebab/Dockerfile ./go/core/test/e2e/agents/kebab $(DOCKER_PUSH) $(DOCKER_REGISTRY)/kebab:latest kubectl apply --namespace kagent --context kind-$(KIND_CLUSTER_NAME) -f go/core/test/e2e/agents/kebab/agent.yaml diff --git 
a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml index bff6672d68..6bb5fbbbd2 100644 --- a/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml +++ b/go/api/config/crd/bases/kagent.dev_sandboxagents.yaml @@ -11328,8 +11328,6 @@ spec: x-kubernetes-validations: - message: spec.skills is not supported for sandbox agents rule: '!has(self.skills)' - - message: BYO agents are not supported for sandbox agents - rule: '!has(self.type) || self.type != ''BYO''' - message: type must be specified rule: has(self.type) - message: type must be either Declarative or BYO diff --git a/go/api/v1alpha2/agent_runtime_test.go b/go/api/v1alpha2/agent_runtime_test.go index f12d8f5c31..614b2a2c73 100644 --- a/go/api/v1alpha2/agent_runtime_test.go +++ b/go/api/v1alpha2/agent_runtime_test.go @@ -6,31 +6,36 @@ import ( "github.com/stretchr/testify/require" ) -func TestEffectiveDeclarativeRuntimeForAgent(t *testing.T) { - substrateSpec := AgentSpec{ - Type: AgentType_Declarative, - Declarative: &DeclarativeAgentSpec{ - Runtime: DeclarativeRuntime_Python, +func TestEffectiveDeclarativeRuntime(t *testing.T) { + tests := []struct { + name string + spec *AgentSpec + want DeclarativeRuntime + }{ + { + name: "nil spec defaults to Python", + spec: nil, + want: DeclarativeRuntime_Python, + }, + { + name: "unset runtime defaults to Python", + spec: &AgentSpec{Type: AgentType_Declarative, Declarative: &DeclarativeAgentSpec{}}, + want: DeclarativeRuntime_Python, + }, + { + name: "explicit Python runtime", + spec: &AgentSpec{Type: AgentType_Declarative, Declarative: &DeclarativeAgentSpec{Runtime: DeclarativeRuntime_Python}}, + want: DeclarativeRuntime_Python, + }, + { + name: "explicit Go runtime is honored", + spec: &AgentSpec{Type: AgentType_Declarative, Declarative: &DeclarativeAgentSpec{Runtime: DeclarativeRuntime_Go}}, + want: DeclarativeRuntime_Go, }, } - - t.Run("regular Agent keeps configured runtime", func(t *testing.T) { - agent := 
&Agent{Spec: substrateSpec} - require.Equal(t, DeclarativeRuntime_Python, EffectiveDeclarativeRuntimeForAgent(agent)) - }) - - t.Run("SandboxAgent uses Go", func(t *testing.T) { - sa := &SandboxAgent{Spec: SandboxAgentSpec{AgentSpec: substrateSpec}} - require.Equal(t, DeclarativeRuntime_Go, EffectiveDeclarativeRuntimeForAgent(sa)) - }) - - t.Run("regular Agent honors Go runtime", func(t *testing.T) { - agent := &Agent{Spec: AgentSpec{ - Type: AgentType_Declarative, - Declarative: &DeclarativeAgentSpec{ - Runtime: DeclarativeRuntime_Go, - }, - }} - require.Equal(t, DeclarativeRuntime_Go, EffectiveDeclarativeRuntimeForAgent(agent)) - }) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.want, EffectiveDeclarativeRuntime(tt.spec)) + }) + } } diff --git a/go/api/v1alpha2/agent_spec_validation.go b/go/api/v1alpha2/agent_spec_validation.go index 73e7e6e2a4..5e46592fbd 100644 --- a/go/api/v1alpha2/agent_spec_validation.go +++ b/go/api/v1alpha2/agent_spec_validation.go @@ -1,11 +1,13 @@ package v1alpha2 -import "fmt" +import ( + "fmt" + "strings" +) const ( - substrateSandboxSkillsUnsupportedMsg = "spec.skills is not supported for sandbox agents" - substrateSandboxPythonRuntimeUnsupportedMsg = "spec.declarative.runtime must be \"go\" for sandbox agents" - substrateSandboxBYOUnsupportedMsg = "BYO agents are not supported for sandbox agents" + substrateSandboxSkillsUnsupportedMsg = "spec.skills is not supported for sandbox agents" + substrateSandboxBYOMissingCommandMsg = "BYO agents on substrate must set spec.byo.deployment.cmd (substrate does not fall back to the image entrypoint)" ) // AgentSpecHasSkills reports whether the spec configures any skill sources. @@ -18,23 +20,25 @@ func AgentSpecHasSkills(spec *AgentSpec) bool { } // ValidateSubstrateSandboxAgentSpec rejects sandbox agent configurations that kagent -// does not support on Agent Substrate (for example declarative skills or BYO agents). 
+// does not support on Agent Substrate (for example declarative skills). Declarative +// Python/Go and BYO (Go/Python) agents are supported; BYO agents must provide an explicit +// command because substrate copies the container Command verbatim with no image-entrypoint +// fallback. func ValidateSubstrateSandboxAgentSpec(agent *SandboxAgent) error { if agent == nil { return nil } spec := agent.GetAgentSpec() - if spec.Type == AgentType_BYO { - return fmt.Errorf("%s", substrateSandboxBYOUnsupportedMsg) - } if AgentSpecHasSkills(spec) { return fmt.Errorf("%s", substrateSandboxSkillsUnsupportedMsg) } - if spec.Type == AgentType_Declarative && - spec.Declarative != nil && - spec.Declarative.Runtime != "" && - spec.Declarative.Runtime != DeclarativeRuntime_Go { - return fmt.Errorf("%s", substrateSandboxPythonRuntimeUnsupportedMsg) + if spec.Type == AgentType_BYO { + dep := spec.BYO + // Trim so a whitespace-only cmd is rejected like an empty one (substrate would treat it + // as no command, and the UI trims before validating — keep backend/UI aligned). 
+ if dep == nil || dep.Deployment == nil || dep.Deployment.Cmd == nil || strings.TrimSpace(*dep.Deployment.Cmd) == "" { + return fmt.Errorf("%s", substrateSandboxBYOMissingCommandMsg) + } } return nil } diff --git a/go/api/v1alpha2/agent_spec_validation_test.go b/go/api/v1alpha2/agent_spec_validation_test.go index cfe6f6a3b8..704edad3a5 100644 --- a/go/api/v1alpha2/agent_spec_validation_test.go +++ b/go/api/v1alpha2/agent_spec_validation_test.go @@ -25,7 +25,7 @@ func TestValidateSubstrateSandboxAgentSpec(t *testing.T) { require.Contains(t, err.Error(), substrateSandboxSkillsUnsupportedMsg) }) - t.Run("rejects python runtime", func(t *testing.T) { + t.Run("allows python runtime", func(t *testing.T) { agent := &SandboxAgent{ Spec: SandboxAgentSpec{ AgentSpec: AgentSpec{ @@ -36,23 +36,49 @@ func TestValidateSubstrateSandboxAgentSpec(t *testing.T) { }, }, } + require.NoError(t, ValidateSubstrateSandboxAgentSpec(agent)) + }) + + t.Run("rejects BYO agents without an explicit command", func(t *testing.T) { + agent := &SandboxAgent{ + Spec: SandboxAgentSpec{ + AgentSpec: AgentSpec{ + Type: AgentType_BYO, + BYO: &BYOAgentSpec{Deployment: &ByoDeploymentSpec{Image: "example/agent:latest"}}, + }, + }, + } err := ValidateSubstrateSandboxAgentSpec(agent) require.Error(t, err) - require.Contains(t, err.Error(), substrateSandboxPythonRuntimeUnsupportedMsg) + require.Contains(t, err.Error(), substrateSandboxBYOMissingCommandMsg) }) - t.Run("rejects BYO agents", func(t *testing.T) { + t.Run("rejects BYO agents with a whitespace-only command", func(t *testing.T) { + cmd := " " agent := &SandboxAgent{ Spec: SandboxAgentSpec{ AgentSpec: AgentSpec{ Type: AgentType_BYO, - BYO: &BYOAgentSpec{}, + BYO: &BYOAgentSpec{Deployment: &ByoDeploymentSpec{Image: "example/agent:latest", Cmd: &cmd}}, }, }, } err := ValidateSubstrateSandboxAgentSpec(agent) require.Error(t, err) - require.Contains(t, err.Error(), substrateSandboxBYOUnsupportedMsg) + require.Contains(t, err.Error(), 
substrateSandboxBYOMissingCommandMsg) + }) + + t.Run("allows BYO agents with an explicit command", func(t *testing.T) { + cmd := "/app" + agent := &SandboxAgent{ + Spec: SandboxAgentSpec{ + AgentSpec: AgentSpec{ + Type: AgentType_BYO, + BYO: &BYOAgentSpec{Deployment: &ByoDeploymentSpec{Image: "example/agent:latest", Cmd: &cmd}}, + }, + }, + } + require.NoError(t, ValidateSubstrateSandboxAgentSpec(agent)) }) t.Run("allows go runtime", func(t *testing.T) { diff --git a/go/api/v1alpha2/agent_types.go b/go/api/v1alpha2/agent_types.go index 185e3f9615..459778dcec 100644 --- a/go/api/v1alpha2/agent_types.go +++ b/go/api/v1alpha2/agent_types.go @@ -252,6 +252,7 @@ type SandboxConfig struct { } // EffectiveDeclarativeRuntime returns the ADK runtime from spec fields (defaults to Python when not set). +// All agents (including substrate SandboxAgents) honor spec.declarative.runtime. func EffectiveDeclarativeRuntime(spec *AgentSpec) DeclarativeRuntime { if spec == nil { return DeclarativeRuntime_Python @@ -263,18 +264,6 @@ func EffectiveDeclarativeRuntime(spec *AgentSpec) DeclarativeRuntime { return runtime } -// EffectiveDeclarativeRuntimeForAgent returns the runtime for a reconciled agent object. -// SandboxAgents always use Go; regular Agents honor spec.declarative.runtime. -func EffectiveDeclarativeRuntimeForAgent(agent AgentObject) DeclarativeRuntime { - spec := agent.GetAgentSpec() - if agent.GetWorkloadMode() == WorkloadModeSandbox && - spec != nil && - spec.Type == AgentType_Declarative { - return DeclarativeRuntime_Go - } - return EffectiveDeclarativeRuntime(spec) -} - // NetworkConfig configures outbound network access for sandboxed execution paths. type NetworkConfig struct { // AllowedDomains lists the domains that sandboxed execution may contact. 
diff --git a/go/api/v1alpha2/sandboxagent_types.go b/go/api/v1alpha2/sandboxagent_types.go index 4cf28fc02a..b1bceeeb1f 100644 --- a/go/api/v1alpha2/sandboxagent_types.go +++ b/go/api/v1alpha2/sandboxagent_types.go @@ -38,7 +38,6 @@ type SandboxAgent struct { } // +kubebuilder:validation:XValidation:rule="!has(self.skills)",message="spec.skills is not supported for sandbox agents" -// +kubebuilder:validation:XValidation:rule="!has(self.type) || self.type != 'BYO'",message="BYO agents are not supported for sandbox agents" type SandboxAgentSpec struct { AgentSpec `json:",inline"` diff --git a/go/core/internal/controller/sandboxagent_controller.go b/go/core/internal/controller/sandboxagent_controller.go index ecc0a53239..1c91502453 100644 --- a/go/core/internal/controller/sandboxagent_controller.go +++ b/go/core/internal/controller/sandboxagent_controller.go @@ -79,7 +79,7 @@ func (r *SandboxAgentController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, fmt.Errorf("get SandboxAgent: %w", err) } - if r.SubstrateLifecycle != nil { + if r.substrateConfigured() { if res, err := r.reconcileSubstrateSandboxAgent(ctx, &sa); err != nil || !res.IsZero() { return res, err } @@ -108,7 +108,7 @@ func (r *SandboxAgentController) SetupWithManager(mgr ctrl.Manager) error { if err != nil { return err } - if r.SubstrateLifecycle != nil { + if r.substrateConfigured() { build = build.Watches( &atev1alpha1.ActorTemplate{}, handler.EnqueueRequestsFromMapFunc(r.enqueueSandboxAgentForSubstrateResource), diff --git a/go/core/internal/controller/sandboxagent_substrate.go b/go/core/internal/controller/sandboxagent_substrate.go index 8003bf1378..c225399177 100644 --- a/go/core/internal/controller/sandboxagent_substrate.go +++ b/go/core/internal/controller/sandboxagent_substrate.go @@ -16,10 +16,18 @@ import ( const sandboxAgentSubstrateFinalizer = "kagent.dev/sandbox-agent-substrate-cleanup" +// substrateConfigured reports whether the substrate backend is wired. 
The lifecycle and actor +// backend are constructed together (only when an ate-api endpoint is set), so they are +// all-or-nothing; gating once here lets the substrate reconcile path and its helpers assume both +// are present rather than nil-checking each dependency at every call site. +func (r *SandboxAgentController) substrateConfigured() bool { + return r.SubstrateLifecycle != nil && r.SubstrateActorBackend != nil +} + +// reconcileSubstrateSandboxAgent is only reached when substrateConfigured() is true (see +// Reconcile), so SubstrateLifecycle and SubstrateActorBackend are guaranteed non-nil here and in +// the helpers it calls. func (r *SandboxAgentController) reconcileSubstrateSandboxAgent(ctx context.Context, sa *v1alpha2.SandboxAgent) (ctrl.Result, error) { - if r.SubstrateLifecycle == nil { - return ctrl.Result{}, fmt.Errorf("substrate sandbox backend is not configured") - } if !sa.DeletionTimestamp.IsZero() { return r.reconcileSubstrateSandboxAgentDelete(ctx, sa) } @@ -29,6 +37,13 @@ func (r *SandboxAgentController) reconcileSubstrateSandboxAgent(ctx context.Cont } return ctrl.Result{Requeue: true}, nil } + + // A config change creates a new config-hashed ActorTemplate (applied via the translator's + // BuildSandbox path); the previous template, its golden, and any per-session actors are left + // in place. Superseded goldens and suspended session actors are stateful and pin no workers + // (a suspended actor frees its worker), so they are retained — not retired — and cleaned up + // only when the SandboxAgent itself is deleted (see reconcileSubstrateSandboxAgentDelete). + // ResolveCurrentActorTemplate keeps chat and readiness pointed at the newest Ready template. 
return ctrl.Result{}, nil } @@ -44,24 +59,16 @@ func (r *SandboxAgentController) reconcileSubstrateSandboxAgentDelete(ctx contex return r.removeSubstrateSandboxAgentFinalizer(ctx, sa) } - if r.SubstrateActorBackend != nil { - done, err := r.SubstrateActorBackend.DeleteAllSandboxAgentActors(ctx, sa) - if err != nil { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err - } - if !done { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil - } + if done, err := r.SubstrateActorBackend.DeleteAllSandboxAgentActors(ctx, sa); err != nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + } else if !done { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil } - if r.SubstrateLifecycle != nil { - done, err := r.SubstrateLifecycle.CleanupSandboxAgentTemplate(ctx, sa) - if err != nil { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err - } - if !done { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil - } + if done, err := r.SubstrateLifecycle.CleanupSandboxAgentTemplate(ctx, sa); err != nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + } else if !done { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil } return r.removeSubstrateSandboxAgentFinalizer(ctx, sa) diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index 4b09d30a22..f5b1f18797 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -116,9 +116,11 @@ var DefaultImageConfig = ImageConfig{ Repository: "kagent-dev/kagent/app", } -// PythonADKImageDigest, GoADKImageDigest, and GoADKFullImageDigest are set at -// controller link time from the pushed runtime image manifest digests. 
+// PythonADKImageDigest, PythonADKFullImageDigest, GoADKImageDigest, and GoADKFullImageDigest +// are set at controller link time from the pushed runtime image manifest digests. The "full" +// variants bundle the sandbox runtime (code execution / bash tools); the slim variants do not. var PythonADKImageDigest string +var PythonADKFullImageDigest string var GoADKImageDigest string var GoADKFullImageDigest string diff --git a/go/core/internal/controller/translator/agent/deployments.go b/go/core/internal/controller/translator/agent/deployments.go index 2a55255697..794b7ab57c 100644 --- a/go/core/internal/controller/translator/agent/deployments.go +++ b/go/core/internal/controller/translator/agent/deployments.go @@ -123,13 +123,20 @@ func validateExtraContainers(containers []corev1.Container) error { return nil } -func resolvePythonRuntimeImage(registry string) (string, error) { +func resolvePythonRuntimeImage(registry string, full bool) (string, error) { repo := DefaultImageConfig.Repository - if d := normalizeImageDigest(PythonADKImageDigest); d != "" { + digest := PythonADKImageDigest + imageLabel := "app" + if full { + digest = PythonADKFullImageDigest + imageLabel = "app-full" + } + if d := normalizeImageDigest(digest); d != "" { return fmt.Sprintf("%s/%s@%s", registry, repo, d), nil } return "", fmt.Errorf( - "app image digest is not set at link time; rebuild the controller after pushing agent runtime images", + "%s image digest is not set at link time; rebuild the controller after pushing agent runtime images", + imageLabel, ) } @@ -172,8 +179,8 @@ func resolveInlineDeployment(agent v1alpha2.AgentObject, mdd *modelDeploymentDat spec = *specRef.Declarative.Deployment } - // Determine runtime (defaults to python if not set; substrate SandboxAgents use Go). - runtime := v1alpha2.EffectiveDeclarativeRuntimeForAgent(agent) + // Determine runtime (defaults to python when spec.declarative.runtime is unset). 
+ runtime := v1alpha2.EffectiveDeclarativeRuntime(agent.GetAgentSpec()) // Get registry registry := DefaultImageConfig.Registry @@ -182,7 +189,7 @@ func resolveInlineDeployment(agent v1alpha2.AgentObject, mdd *modelDeploymentDat } var image string - full := runtime == v1alpha2.DeclarativeRuntime_Go && needsSRTSettings(agent, specRef.Sandbox) + full := needsSRTSettings(agent, specRef.Sandbox) switch runtime { case v1alpha2.DeclarativeRuntime_Go: var err error @@ -192,7 +199,7 @@ func resolveInlineDeployment(agent v1alpha2.AgentObject, mdd *modelDeploymentDat } default: var err error - image, err = resolvePythonRuntimeImage(registry) + image, err = resolvePythonRuntimeImage(registry, full) if err != nil { return nil, err } diff --git a/go/core/internal/controller/translator/agent/digest_testmain_external_test.go b/go/core/internal/controller/translator/agent/digest_testmain_external_test.go index db798c77cf..292ffb1ec8 100644 --- a/go/core/internal/controller/translator/agent/digest_testmain_external_test.go +++ b/go/core/internal/controller/translator/agent/digest_testmain_external_test.go @@ -9,6 +9,7 @@ import ( func TestMain(m *testing.M) { translator.PythonADKImageDigest = "sha256:test-app" + translator.PythonADKFullImageDigest = "sha256:test-app-full" translator.GoADKImageDigest = "sha256:test-go-base" translator.GoADKFullImageDigest = "sha256:test-go-full" os.Exit(m.Run()) diff --git a/go/core/internal/controller/translator/agent/imageconfig_test.go b/go/core/internal/controller/translator/agent/imageconfig_test.go index be1870dcf0..9ed21ec869 100644 --- a/go/core/internal/controller/translator/agent/imageconfig_test.go +++ b/go/core/internal/controller/translator/agent/imageconfig_test.go @@ -75,14 +75,33 @@ func TestResolveGoRuntimeImageWithoutDigest(t *testing.T) { func TestResolvePythonRuntimeImageWithDigest(t *testing.T) { original := PythonADKImageDigest + originalFull := PythonADKFullImageDigest t.Cleanup(func() { PythonADKImageDigest = original + 
PythonADKFullImageDigest = originalFull }) PythonADKImageDigest = "sha256:app-digest" + PythonADKFullImageDigest = "sha256:app-full-digest" - got, err := resolvePythonRuntimeImage("cr.kagent.dev") + got, err := resolvePythonRuntimeImage("cr.kagent.dev", false) require.NoError(t, err) require.Equal(t, "cr.kagent.dev/kagent-dev/kagent/app@sha256:app-digest", got) + + gotFull, err := resolvePythonRuntimeImage("cr.kagent.dev", true) + require.NoError(t, err) + require.Equal(t, "cr.kagent.dev/kagent-dev/kagent/app@sha256:app-full-digest", gotFull) +} + +func TestResolvePythonFullRuntimeImageWithoutDigest(t *testing.T) { + original := PythonADKFullImageDigest + t.Cleanup(func() { + PythonADKFullImageDigest = original + }) + PythonADKFullImageDigest = "" + + _, err := resolvePythonRuntimeImage("cr.kagent.dev", true) + require.Error(t, err) + require.Contains(t, err.Error(), "app-full") } func TestPythonADKImageDigestSupportsLinkerFlag(t *testing.T) { @@ -103,7 +122,7 @@ func TestResolvePythonRuntimeImageWithoutDigest(t *testing.T) { }) PythonADKImageDigest = "" - _, err := resolvePythonRuntimeImage("cr.kagent.dev") + _, err := resolvePythonRuntimeImage("cr.kagent.dev", false) require.Error(t, err) require.Contains(t, err.Error(), "app") } diff --git a/go/core/internal/controller/translator/agent/manifest_builder.go b/go/core/internal/controller/translator/agent/manifest_builder.go index ebde454264..1412490097 100644 --- a/go/core/internal/controller/translator/agent/manifest_builder.go +++ b/go/core/internal/controller/translator/agent/manifest_builder.go @@ -14,6 +14,7 @@ import ( "github.com/kagent-dev/kagent/go/core/internal/controller/translator/labels" "github.com/kagent-dev/kagent/go/core/internal/skillsinit" "github.com/kagent-dev/kagent/go/core/internal/utils" + "github.com/kagent-dev/kagent/go/core/pkg/consts" "github.com/kagent-dev/kagent/go/core/pkg/env" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" appsv1 "k8s.io/api/apps/v1" @@ -26,8 +27,10 @@ 
import ( // configHashAnnotation is set on the agent pod template so a change to // the serialized config Secret (including tool URLs and ModelConfig/RMS -// Secret rotations folded in via Status hashes) rolls the pod. -const configHashAnnotation = "kagent.dev/config-hash" +// Secret rotations folded in via Status hashes) rolls the pod. It is the +// shared consts.ConfigHashAnnotation key (the substrate backend mirrors +// the same annotation onto generated ActorTemplates), defined once there. +const configHashAnnotation = consts.ConfigHashAnnotation type manifestContext struct { agent v1alpha2.AgentObject @@ -110,7 +113,7 @@ func (a *adkApiTranslator) BuildManifest( podTemplate := buildPodTemplate(manifestCtx, podRuntime, configHash) - workloadObjects, err := a.buildWorkloadObjects(ctx, manifestCtx, podTemplate) + workloadObjects, err := a.buildWorkloadObjects(ctx, manifestCtx, podTemplate, configSecret.secret) if err != nil { return nil, err } @@ -549,18 +552,20 @@ func buildPodTemplate( } func agentRuntime(agent v1alpha2.AgentObject) v1alpha2.DeclarativeRuntime { - return v1alpha2.EffectiveDeclarativeRuntimeForAgent(agent) + return v1alpha2.EffectiveDeclarativeRuntime(agent.GetAgentSpec()) } func (a *adkApiTranslator) buildWorkloadObjects( ctx context.Context, manifestCtx manifestContext, podTemplate corev1.PodTemplateSpec, + configSecret *corev1.Secret, ) ([]client.Object, error) { if manifestCtx.runInSandbox() { sbObjs, err := a.sandboxBackend.BuildSandbox(ctx, sandboxbackend.BuildInput{ - Agent: manifestCtx.agent, - PodTemplate: podTemplate, + Agent: manifestCtx.agent, + PodTemplate: podTemplate, + ConfigSecret: configSecret, }) if err != nil { return nil, fmt.Errorf("build sandbox workload: %w", err) diff --git a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go index 2e90cffd78..bb9555df03 100644 --- 
a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go +++ b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go @@ -9,6 +9,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" translator "github.com/kagent-dev/kagent/go/core/internal/controller/translator/agent" + "github.com/kagent-dev/kagent/go/core/pkg/consts" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -653,7 +654,7 @@ func Test_AdkApiTranslator_RMSTLS_SecretHashChangesAgentConfigHash(t *testing.T) outputs, err := translator.TranslateAgent(context.Background(), trans, agent) require.NoError(t, err) dep := findDeployment(t, outputs) - return dep.Spec.Template.Annotations["kagent.dev/config-hash"] + return dep.Spec.Template.Annotations[consts.ConfigHashAnnotation] } preRotate := build("deadbeef") diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json index f9e03a9be8..451bf4d795 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_code.json @@ -193,7 +193,7 @@ "value": "/config/srt-settings.json" } ], - "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app-full", "imagePullPolicy": "IfNotPresent", "name": "kagent", "ports": [ diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json index dd5efe2e78..11aaf8c5f8 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_git_skills.json @@ -215,7 +215,7 @@ "value": 
"/config/srt-settings.json" } ], - "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app-full", "imagePullPolicy": "IfNotPresent", "name": "kagent", "ports": [ diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json index 400fa7fff1..ff48dc868a 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/agent_with_skills.json @@ -215,7 +215,7 @@ "value": "/config/srt-settings.json" } ], - "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app", + "image": "cr.kagent.dev/kagent-dev/kagent/app@sha256:test-app-full", "imagePullPolicy": "IfNotPresent", "name": "kagent", "ports": [ diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 3ee2ce8cb5..bf08cc7d2c 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -538,7 +538,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne if atenetRouterURL == "" { atenetRouterURL = substrate.DefaultAtenetRouterURL } - substrateSandboxActorBackend = substrate.NewSandboxAgentActorBackend(substrateAteClient, atenetRouterURL) + substrateSandboxActorBackend = substrate.NewSandboxAgentActorBackend(substrateAteClient, mgr.GetClient(), atenetRouterURL) agentHarnessSessionActorBackend = substrate.NewAgentHarnessSessionActorBackend(substrateAteClient, atenetRouterURL) agentsSubstrate := substrate.NewAgentsBackend(substrateLifecycle, substrateAteClient) extensionCfg.SandboxBackend = agentsSubstrate diff --git a/go/core/pkg/consts/annotations.go b/go/core/pkg/consts/annotations.go new file mode 100644 index 0000000000..377ac5994b --- /dev/null +++ b/go/core/pkg/consts/annotations.go @@ -0,0 +1,9 @@ +package consts + +// ConfigHashAnnotation is the annotation key carrying the 
hash of an agent's rendered config, +// used to propagate config changes. On the Deployment path the translator stamps it on the agent +// pod template so a config change rolls the Deployment; on the substrate path it is mirrored onto +// the generated ActorTemplate to drive a new golden snapshot and session actor. It is shared here +// because the writer (translator) and the substrate backend (writer/reader) live in different +// packages and must agree on the key. +const ConfigHashAnnotation = "kagent.dev/config-hash" diff --git a/go/core/pkg/sandboxbackend/backend.go b/go/core/pkg/sandboxbackend/backend.go index 516f634561..cbd081bf7b 100644 --- a/go/core/pkg/sandboxbackend/backend.go +++ b/go/core/pkg/sandboxbackend/backend.go @@ -16,6 +16,10 @@ type BuildInput struct { PodTemplate corev1.PodTemplateSpec WorkloadName string ExtraLabels map[string]string + // ConfigSecret is the rendered agent config Secret (config.json / agent-card.json / + // srt-settings.json). The substrate backend clones it under a per-config-hash name so each + // golden snapshot materializes its own config (see AgentsBackend.BuildSandbox). May be nil. + ConfigSecret *corev1.Secret } // Backend builds sandbox CRD objects and evaluates their readiness. 
diff --git a/go/core/pkg/sandboxbackend/filter_translator_owned_test.go b/go/core/pkg/sandboxbackend/filter_translator_owned_test.go index e773219281..012c64b30b 100644 --- a/go/core/pkg/sandboxbackend/filter_translator_owned_test.go +++ b/go/core/pkg/sandboxbackend/filter_translator_owned_test.go @@ -44,18 +44,17 @@ func TestFilterTranslatorOwnedTypesForList(t *testing.T) { } }) - t.Run("SandboxAgent keeps sandbox GVKs", func(t *testing.T) { + t.Run("SandboxAgent drops ActorTemplate from prune (managed via blue-green)", func(t *testing.T) { sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "s", Namespace: "ns"}} out, err := sandboxbackend.FilterTranslatorOwnedTypesForList(cl, sa, allTypes, backend) require.NoError(t, err) - require.Len(t, out, len(allTypes)) - var sawActorTemplate bool + // Substrate manages ActorTemplate lifecycle itself (config-hashed, blue-green), so it is + // excluded from the generic prune list — leaving only the generic Deployment + ConfigMap. + require.Len(t, out, 2) for _, o := range out { - if _, ok := o.(*atev1alpha1.ActorTemplate); ok { - sawActorTemplate = true - } + _, ok := o.(*atev1alpha1.ActorTemplate) + require.False(t, ok, "ActorTemplate must be excluded from generic prune (managed via blue-green)") } - require.True(t, sawActorTemplate) }) t.Run("nil backend is passthrough", func(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/substrate/actor_errors.go b/go/core/pkg/sandboxbackend/substrate/actor_errors.go index c75a40b4e6..cd6bc288e3 100644 --- a/go/core/pkg/sandboxbackend/substrate/actor_errors.go +++ b/go/core/pkg/sandboxbackend/substrate/actor_errors.go @@ -22,6 +22,19 @@ func wrapResumeActorError(actorID string, err error) error { return fmt.Errorf("substrate ResumeActor %q: %w", actorID, err) } +// wrapCreateActorError normalizes a CreateActor failure, surfacing the clean ErrNoFreeWorkers +// (rather than an opaque gRPC FailedPrecondition) when the WorkerPool is at capacity so the chat +// caller can 
return an actionable "no free workers" error. +func wrapCreateActorError(actorID string, err error) error { + if err == nil { + return nil + } + if isNoFreeWorkersError(err) { + return fmt.Errorf("%w", ErrNoFreeWorkers) + } + return fmt.Errorf("substrate CreateActor %q: %w", actorID, err) +} + func isNoFreeWorkersError(err error) bool { if errors.Is(err, ErrNoFreeWorkers) { return true diff --git a/go/core/pkg/sandboxbackend/substrate/agent_actor.go b/go/core/pkg/sandboxbackend/substrate/agent_actor.go index 220c69f8ef..9a28ba163d 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_actor.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_actor.go @@ -8,30 +8,43 @@ import ( "github.com/agent-substrate/substrate/pkg/proto/ateapipb" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/consts" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "sigs.k8s.io/controller-runtime/pkg/client" ) // SandboxAgentActorBackend manages ate-api actors for SandboxAgent workloads. type SandboxAgentActorBackend struct { client *Client + kube client.Client atenetRouterURL string } // NewSandboxAgentActorBackend returns a backend that ensures SandboxAgent actors on ate-api. -func NewSandboxAgentActorBackend(client *Client, atenetRouterURL string) *SandboxAgentActorBackend { +// kube is used to resolve the agent's current (config-hashed) ActorTemplate. +func NewSandboxAgentActorBackend(client *Client, kube client.Client, atenetRouterURL string) *SandboxAgentActorBackend { atenetRouterURL = strings.TrimSpace(atenetRouterURL) if atenetRouterURL == "" { atenetRouterURL = DefaultAtenetRouterURL } return &SandboxAgentActorBackend{ client: client, + kube: kube, atenetRouterURL: atenetRouterURL, } } -// EnsureSessionActor creates and resumes the per-session actor for a SandboxAgent chat. 
+// EnsureSessionActor creates (or resumes) the per-session actor for a SandboxAgent chat and waits +// for it to be reachable. It resolves the agent's current (newest Ready) ActorTemplate, so during +// a config change requests keep landing on the previous Ready golden until the new one is Ready. +// +// If the WorkerPool has no free worker, CreateActor/ResumeActor surface ErrNoFreeWorkers and this +// returns it immediately (no buffering). On a single-replica pool the lone worker may be busy +// building a new golden, so a config change can briefly make chat return "no free workers"; on a +// multi-replica pool the spare workers keep serving the current golden, so a rollout does not hit +// that error. Scaling the WorkerPool is the remedy for capacity pressure, not in-process retries. func (b *SandboxAgentActorBackend) EnsureSessionActor(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (sandboxbackend.EnsureResult, error) { if sa == nil { return sandboxbackend.EnsureResult{}, fmt.Errorf("SandboxAgent is required") @@ -44,8 +57,11 @@ func (b *SandboxAgentActorBackend) EnsureSessionActor(ctx context.Context, sa *v return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate ate-api client is required") } - actorID := SandboxAgentSessionActorID(sa, sessionID) - tmplNS, tmplName := sa.Namespace, SandboxAgentActorTemplateName(sa) + actorID, tmplName, err := b.sessionActorRef(ctx, sa, sessionID) + if err != nil { + return sandboxbackend.EnsureResult{}, err + } + tmplNS := sa.Namespace actor, err := b.client.GetActor(ctx, actorID) if err != nil { @@ -54,7 +70,7 @@ func (b *SandboxAgentActorBackend) EnsureSessionActor(ctx context.Context, sa *v } actor, err = b.client.CreateActor(ctx, actorID, tmplNS, tmplName) if err != nil { - return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate CreateActor %q: %w", actorID, err) + return sandboxbackend.EnsureResult{}, wrapCreateActorError(actorID, err) } } @@ -83,7 +99,10 @@ func (b *SandboxAgentActorBackend) 
SuspendSessionActor(ctx context.Context, sa * if b == nil || b.client == nil || sa == nil { return nil } - actorID := SandboxAgentSessionActorID(sa, sessionID) + actorID, _, err := b.sessionActorRef(ctx, sa, sessionID) + if err != nil { + return err + } actor, err := b.client.GetActor(ctx, actorID) if err != nil { if status.Code(err) == codes.NotFound { @@ -108,12 +127,74 @@ func (b *SandboxAgentActorBackend) DeleteSandboxAgentActor(ctx context.Context, return deleteActor(ctx, b.client, actorID) } -// DeleteSandboxAgentSessionActor deletes the actor for a single chat session. +// DeleteSandboxAgentSessionActor deletes the actor(s) for a single chat session. Because the +// session actor id is keyed on the config hash and old templates/goldens are retained, a session +// can have actors under several hashes (one per config it was active under). Deleting only the +// current-hash actor would orphan the others, so this deletes the session's actor for every +// retained config hash. func (b *SandboxAgentActorBackend) DeleteSandboxAgentSessionActor(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (bool, error) { - if sa == nil { + if b == nil || b.client == nil || sa == nil { return true, nil } - return b.DeleteSandboxAgentActor(ctx, SandboxAgentSessionActorID(sa, sessionID)) + hashes, err := b.retainedSessionConfigHashes(ctx, sa) + if err != nil { + return false, err + } + allDone := true + seen := make(map[string]struct{}, len(hashes)) + for _, hash := range hashes { + actorID := SandboxAgentSessionActorID(sa, hash, sessionID) + if _, ok := seen[actorID]; ok { + continue + } + seen[actorID] = struct{}{} + done, err := b.DeleteSandboxAgentActor(ctx, actorID) + if err != nil { + return false, err + } + if !done { + allDone = false + } + } + return allDone, nil +} + +// retainedSessionConfigHashes returns the distinct config-hash segments across the agent's +// retained ActorTemplates (plus "" for legacy/no-hash actors). 
These are the hashes a session's +// actor id could have been keyed on, mirroring sessionActorRef's per-template derivation. +func (b *SandboxAgentActorBackend) retainedSessionConfigHashes(ctx context.Context, sa *v1alpha2.SandboxAgent) ([]string, error) { + templates, err := listSandboxAgentActorTemplates(ctx, b.kube, sa.Namespace, sa.Name) + if err != nil { + return nil, err + } + // Always include "" so a session actor created before any config hash existed is still cleaned. + hashes := []string{""} + seen := map[string]struct{}{"": {}} + for _, t := range templates { + hash := t.Annotations[consts.ConfigHashAnnotation] + if _, ok := seen[hash]; ok { + continue + } + seen[hash] = struct{}{} + hashes = append(hashes, hash) + } + return hashes, nil +} + +// sessionActorRef resolves the agent's current (config-hashed) ActorTemplate and returns the +// session actor id keyed to it plus the template name to create the actor from. Keying the +// id on the config hash means a config change yields a new actor id, so the next message +// creates a fresh actor from the new golden instead of resuming the stale one. +func (b *SandboxAgentActorBackend) sessionActorRef(ctx context.Context, sa *v1alpha2.SandboxAgent, sessionID string) (actorID, templateName string, err error) { + tmpl, err := ResolveCurrentActorTemplate(ctx, b.kube, sa.Namespace, sa.Name) + if err != nil { + return "", "", err + } + if tmpl == nil { + return "", "", fmt.Errorf("no ActorTemplate generated yet for SandboxAgent %s/%s", sa.Namespace, sa.Name) + } + hash := tmpl.Annotations[consts.ConfigHashAnnotation] + return SandboxAgentSessionActorID(sa, hash, sessionID), tmpl.Name, nil } // DeleteAllSandboxAgentActors deletes legacy per-agent actors and all session actors for a SandboxAgent. 
@@ -122,6 +203,21 @@ func (b *SandboxAgentActorBackend) DeleteAllSandboxAgentActors(ctx context.Conte return true, nil } prefix := sandboxAgentActorPrefix(sa) + + // Build the set of ActorTemplates this agent owns (one per retained config hash). Session + // actors are created FROM these templates, so matching an actor's source template reliably + // identifies it even when its id falls back to the prefix-less asr-<hash> form (long agent + // name / session id), which id-prefix matching alone would miss. This runs before template + // cleanup in the delete path, so the templates are still present here. + templates, err := listSandboxAgentActorTemplates(ctx, b.kube, sa.Namespace, sa.Name) + if err != nil { + return false, err + } + ownedTemplates := make(map[string]struct{}, len(templates)) + for _, t := range templates { + ownedTemplates[t.Name] = struct{}{} + } + actors, err := b.client.ListActors(ctx) if err != nil { return false, fmt.Errorf("list substrate actors: %w", err) @@ -132,7 +228,7 @@ func (b *SandboxAgentActorBackend) DeleteAllSandboxAgentActors(ctx context.Conte if id == "" { continue } - if id != SandboxAgentActorID(sa) && !strings.HasPrefix(id, prefix+"-") { + if !actorBelongsToSandboxAgent(sa, actor, prefix, ownedTemplates) { continue } done, err := deleteActor(ctx, b.client, id) @@ -146,18 +242,39 @@ func (b *SandboxAgentActorBackend) DeleteAllSandboxAgentActors(ctx context.Conte return allDone, nil } +// actorBelongsToSandboxAgent reports whether an actor was created for this SandboxAgent. It matches +// on the actor's source ActorTemplate first (robust: survives the prefix-less asr-<hash> id +// fallback), then falls back to id-prefix matching as a backstop for orphaned actors whose +// template was already deleted. 
+func actorBelongsToSandboxAgent(sa *v1alpha2.SandboxAgent, actor *ateapipb.Actor, prefix string, ownedTemplates map[string]struct{}) bool { + if actor.GetActorTemplateNamespace() == sa.Namespace { + if _, ok := ownedTemplates[actor.GetActorTemplateName()]; ok { + return true + } + } + id := strings.TrimSpace(actor.GetActorId()) + return id == SandboxAgentActorID(sa) || strings.HasPrefix(id, prefix+"-") +} + func sandboxAgentActorPrefix(sa *v1alpha2.SandboxAgent) string { return SandboxAgentActorID(sa) } -// SandboxAgentSessionActorID returns a stable ate-api actor id for a SandboxAgent chat session. -func SandboxAgentSessionActorID(sa *v1alpha2.SandboxAgent, sessionID string) string { - raw := fmt.Sprintf("%s-%s", sandboxAgentActorPrefix(sa), sanitizeSessionID(sessionID)) +// SandboxAgentSessionActorID returns the ate-api actor id for a SandboxAgent chat session at a +// given config hash. The hash segment ties the actor to a specific golden snapshot: a config +// change produces a new id, so the next message creates a fresh actor instead of resuming the +// stale one. The id keeps the agent prefix (asr-<ns>-<name>-) so per-agent cleanup still matches. 
+func SandboxAgentSessionActorID(sa *v1alpha2.SandboxAgent, configHash, sessionID string) string { + hashSeg := "" + if configHash != "" { + hashSeg = configHash + "-" + } + raw := fmt.Sprintf("%s-%s%s", sandboxAgentActorPrefix(sa), hashSeg, sanitizeSessionID(sessionID)) raw = strings.ToLower(strings.ReplaceAll(raw, "_", "-")) if len(raw) <= 63 && dns1123Label.MatchString(raw) { return raw } - sum := sha256.Sum256([]byte(sa.Namespace + "/" + sa.Name + "/" + sessionID)) + sum := sha256.Sum256([]byte(sa.Namespace + "/" + sa.Name + "/" + configHash + "/" + sessionID)) return fmt.Sprintf("%s-%x", sandboxAgentIDPrefix, sum[:12]) } diff --git a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go index d77d97c0c6..c34dde9ad6 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle.go @@ -2,10 +2,12 @@ package substrate import ( "fmt" + "strconv" "strings" atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/consts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -15,11 +17,37 @@ import ( // buildSandboxAgentActorTemplate is invoked from the translator via AgentsBackend.BuildSandbox. const ( - sandboxAgentIDPrefix = "asr" - defaultKagentContainer = "kagent" - SandboxAgentLabelKey = "kagent.dev/sandbox-agent" - defaultGoEntrypoint = "/app" + sandboxAgentIDPrefix = "asr" + defaultKagentContainer = "kagent" + SandboxAgentLabelKey = "kagent.dev/sandbox-agent" + // desiredGenerationAnnotation records the SandboxAgent generation that last applied a given + // ActorTemplate; the template for the current desired config carries the highest value. 
+ desiredGenerationAnnotation = "kagent.dev/desired-generation" + defaultGoEntrypoint = "/app" + // defaultPythonEntrypoint is the absolute path to the kagent-adk console script in the + // Python ADK image venv. Substrate copies Command verbatim into the OCI Process.Args with + // no PATH/entrypoint fallback, so the path must be explicit and kept in sync with the + // Python Dockerfile's UV_PROJECT_ENVIRONMENT (/.kagent/.venv). + defaultPythonEntrypoint = "/.kagent/.venv/bin/kagent-adk" substrateKagentListenPort int32 = 80 + // pythonRuntimeLibPath / pythonVenvPath mirror the Python ADK image layout + // (python/Dockerfile): bundled shared libs live on LD_LIBRARY_PATH and the project + // venv at UV_PROJECT_ENVIRONMENT. Substrate ignores the image's ENV directives (see + // pythonRuntimeImageEnv), so these are re-supplied via the ActorTemplate env. + pythonRuntimeLibPath = "/usr/lib/kagent-libs" + pythonVenvPath = "/.kagent/.venv" + // pythonRuntimePath mirrors the image's `ENV PATH="/.kagent/.venv/bin:$PATH"`. Substrate + // builds the OCI Process.Env from a hardcoded PATH that does NOT include the venv bin, so any + // bare-name console-script execution (or locating the venv interpreter without an absolute + // path) would fail; re-supply it with the venv bin first, then the standard system dirs. + pythonRuntimePath = "/.kagent/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + + // sandboxAgentTemplateNameMaxBase reserves room in the 63-char DNS-1123 budget for + // the "-<hash>" suffix (hash is up to 16 hex chars). A golden snapshot is an immutable + // memory image, so a config change must produce a NEW ActorTemplate (substrate snapshots + // once and no-ops in PhaseReady); folding the shared consts.ConfigHashAnnotation hash + // into the template name (and mirroring it as an annotation) is what achieves that. 
+ sandboxAgentTemplateNameMaxBase = 46 ) func (p *Lifecycle) buildSandboxAgentActorTemplate( @@ -35,13 +63,43 @@ func (p *Lifecycle) buildSandboxAgentActorTemplate( if err != nil { return nil, err } - command, containerEnv := buildSubstrateKagentContainerCommand(sa) + // The config hash is computed by the translator and stamped on the pod template. + // Folding it into the ActorTemplate name makes a config change create a new template + // (and thus a fresh golden snapshot) instead of mutating one substrate will never + // re-snapshot. The annotation carries the same hash for the chat path and reaper. + configHash := shortConfigHash(podTemplate.Annotations[consts.ConfigHashAnnotation]) + + // The config is read from a per-config-hash Secret (cloned in AgentsBackend.BuildSandbox), + // not the shared per-agent Secret. A golden snapshot materializes config.json from this + // Secret's env at build time; if every config revision shared one Secret name, substrate's + // secret-value cache could hand a stale revision to a new golden, so the golden would freeze + // the wrong provider's config (e.g. an OpenAI agent in a Gemini actor). The per-hash name is + // a cache-miss for each distinct config, so each golden captures exactly its own config. + secretName := sandboxAgentConfigSecretName(sa, configHash) + command, containerEnv, err := buildSubstrateKagentContainerCommand(sa, kagentContainer, secretName) + if err != nil { + return nil, err + } + + annotations := map[string]string{ + // The agent generation at the time this template was (re)applied. It bumps only on a spec + // change, so the template matching the agent's CURRENT desired config always carries the + // highest generation — including on a flip-back to a retained older config, where the + // reused template is re-applied with the new generation. 
ResolveCurrentActorTemplate uses + // this (not creationTimestamp) to pick the desired template, so chat/readiness follow the + // current config rather than whichever golden was built most recently. + desiredGenerationAnnotation: strconv.FormatInt(sa.GetGeneration(), 10), + } + if configHash != "" { + annotations[consts.ConfigHashAnnotation] = configHash + } desired := &atev1alpha1.ActorTemplate{ ObjectMeta: metav1.ObjectMeta{ - Name: SandboxAgentActorTemplateName(sa), - Namespace: sa.Namespace, - Labels: sandboxAgentLifecycleLabels(sa), + Name: sandboxAgentActorTemplateName(sa, configHash), + Namespace: sa.Namespace, + Labels: sandboxAgentLifecycleLabels(sa), + Annotations: annotations, }, Spec: atev1alpha1.ActorTemplateSpec{ PauseImage: p.Defaults.PauseImage, @@ -76,31 +134,84 @@ func findKagentContainer(containers []corev1.Container) *corev1.Container { return nil } -// buildSubstrateKagentContainerCommand returns an ActorTemplate command for Substrate. -// Substrate runs Command directly (no shell). Config is materialized from secret-backed -// env vars at startup via MaterializeFromEnv in the Go ADK entrypoint. -func buildSubstrateKagentContainerCommand(sa *v1alpha2.SandboxAgent) ([]string, []corev1.EnvVar) { +// buildSubstrateKagentContainerCommand returns the ActorTemplate command and the prepended +// env for a SandboxAgent on Substrate. Substrate runs Command directly (no shell) and copies +// it verbatim into the OCI Process.Args with no PATH/entrypoint fallback, so the command must +// be fully explicit. +// +// For declarative agents the command is the runtime ADK entrypoint and config is materialized +// from secret-backed env vars at startup (Go: MaterializeFromEnv in the Go ADK; Python: the +// `static` command materializes the same env vars before reading /config). For BYO agents the +// user-provided container Command/Args are used verbatim; the BYO image must serve A2A on the +// substrate listen port (80). 
+func buildSubstrateKagentContainerCommand(sa *v1alpha2.SandboxAgent, container *corev1.Container, configSecretName string) ([]string, []corev1.EnvVar, error) { // KAGENT_NAME / KAGENT_NAMESPACE are normally injected by the translator pod // template, but KAGENT_NAMESPACE uses a Downward API fieldRef which Substrate // ActorTemplates do not support (it gets dropped by sanitizeActorTemplateEnvVar). - // Without it the Go ADK derives a wrong app name, and the controller rejects + // Without it the ADK derives a wrong app name, and the controller rejects // session callbacks with "Session does not belong to this agent". Set both as // literals here; they are prepended before the pod env so they win deduplication. env := []corev1.EnvVar{ {Name: "KAGENT_NAME", Value: sa.Name}, {Name: "KAGENT_NAMESPACE", Value: sa.Namespace}, } - env = append(env, kagentAgentSecretEnv(sa)...) - return buildSubstrateGoKagentCommand(), env + + spec := sa.GetAgentSpec() + if spec != nil && spec.Type == v1alpha2.AgentType_BYO { + // BYO: use the explicit container command + args verbatim. Validation + // (ValidateSubstrateSandboxAgentSpec) guarantees a command is set. + if len(container.Command) == 0 { + return nil, nil, fmt.Errorf("BYO substrate agent %q is missing an explicit container command", sa.Name) + } + cmd := append([]string{}, container.Command...) + cmd = append(cmd, container.Args...) + return cmd, env, nil + } + + // Declarative: secret-backed config is materialized at startup from the per-config-hash Secret. + env = append(env, kagentAgentSecretEnv(configSecretName)...) + runtime := v1alpha2.EffectiveDeclarativeRuntime(sa.GetAgentSpec()) + if runtime == v1alpha2.DeclarativeRuntime_Python { + env = append(env, pythonRuntimeImageEnv()...) + } + return buildSubstrateDeclarativeCommand(runtime), env, nil +} + +// pythonRuntimeImageEnv returns the runtime-critical ENV directives baked into the Python +// ADK image (python/Dockerfile). 
Substrate builds the OCI Process.Env from a hardcoded PATH +// plus the ActorTemplate env only — it does NOT apply the image's ENV directives (the same +// way it ignores the image entrypoint). Without LD_LIBRARY_PATH the standalone interpreter +// cannot locate its bundled shared libraries (libz, libsqlite3, ...) and crashes on import +// (e.g. numpy: "ImportError: libz.so.1: cannot open shared object file"); the failed startup +// then surfaces as a gVisor "inconsistent private memory files on restore" error because the +// golden snapshot captures only the pause container. The Go static binary needs none of this. +// Keep in sync with the final-stage ENV block of python/Dockerfile. +func pythonRuntimeImageEnv() []corev1.EnvVar { + return []corev1.EnvVar{ + {Name: "PATH", Value: pythonRuntimePath}, + {Name: "LD_LIBRARY_PATH", Value: pythonRuntimeLibPath}, + {Name: "VIRTUAL_ENV", Value: pythonVenvPath}, + {Name: "PYTHONUNBUFFERED", Value: "1"}, + {Name: "LANG", Value: "C.UTF-8"}, + {Name: "LC_ALL", Value: "C.UTF-8"}, + } } -// buildSubstrateGoKagentCommand returns the explicit command for the declarative -// Go ADK image. Substrate's atelet copies Command verbatim into the OCI spec's -// Process.Args with no fallback to the image entrypoint, so an empty command -// makes `runsc create` fail with "Spec.Process.Arg must be defined". BYO agents -// are rejected for the substrate platform by validation, so only the declarative -// entrypoint is needed here. -func buildSubstrateGoKagentCommand() []string { +// buildSubstrateDeclarativeCommand returns the explicit command for a declarative ADK image. +// Substrate's atelet copies Command verbatim into the OCI spec's Process.Args with no fallback +// to the image entrypoint, so an empty command makes `runsc create` fail with +// "Spec.Process.Arg must be defined". 
+func buildSubstrateDeclarativeCommand(runtime v1alpha2.DeclarativeRuntime) []string { + if runtime == v1alpha2.DeclarativeRuntime_Python { + // The Python ADK `static` command reads config.json/agent-card.json from its + // --filepath (default /config), which the materialization step populates from + // the secret-backed env vars before the server starts. + return []string{ + defaultPythonEntrypoint, "static", + "--host", "0.0.0.0", + "--port", fmt.Sprintf("%d", substrateKagentListenPort), + } + } return []string{ defaultGoEntrypoint, "--host", "0.0.0.0", @@ -108,8 +219,18 @@ func buildSubstrateGoKagentCommand() []string { } } -func kagentAgentSecretEnv(sa *v1alpha2.SandboxAgent) []corev1.EnvVar { - secretName := sa.Name +// sandboxAgentConfigSecretName returns the name of the Secret holding a SandboxAgent's rendered +// config for a given config hash. It mirrors the ActorTemplate name so the config Secret and the +// template that consumes it are paired per config. When the hash is empty (no config rendered) it +// falls back to the translator's per-agent Secret name. +func sandboxAgentConfigSecretName(sa *v1alpha2.SandboxAgent, configHash string) string { + if configHash == "" { + return sa.Name + } + return sandboxAgentActorTemplateName(sa, configHash) +} + +func kagentAgentSecretEnv(secretName string) []corev1.EnvVar { return []corev1.EnvVar{ secretEnv("KAGENT_CONFIG_JSON", secretName, "config.json"), secretEnv("KAGENT_AGENT_CARD_JSON", secretName, "agent-card.json"), @@ -141,11 +262,35 @@ func sandboxAgentLifecycleLabels(sa *v1alpha2.SandboxAgent) map[string]string { } } -// SandboxAgentActorTemplateName is the generated ActorTemplate name for a SandboxAgent. -func SandboxAgentActorTemplateName(sa *v1alpha2.SandboxAgent) string { +// sandboxAgentActorTemplateBaseName is the stable name prefix for a SandboxAgent's +// ActorTemplate(s), independent of config. Used as the truncation base for hashed names. 
+func sandboxAgentActorTemplateBaseName(sa *v1alpha2.SandboxAgent) string { return truncateDNS1123(sa.Name) } +// sandboxAgentActorTemplateName is the generated ActorTemplate name for a SandboxAgent at a +// given config hash. The hash suffix makes each distinct config a distinct template (and +// golden). When the hash is empty (no config materialized) it falls back to the stable base +// name. Consumers must NOT assume this name — they resolve the live template via +// ResolveCurrentActorTemplate, since the hash depends on rendered config they don't have. +func sandboxAgentActorTemplateName(sa *v1alpha2.SandboxAgent, configHash string) string { + if configHash == "" { + return sandboxAgentActorTemplateBaseName(sa) + } + base := truncateDNS1123To(sa.Name, sandboxAgentTemplateNameMaxBase) + return fmt.Sprintf("%s-%s", base, configHash) +} + +// shortConfigHash converts the translator's decimal config-hash annotation into a short, +// DNS-1123-safe hex token (≤16 chars). Returns "" when the annotation is absent/unparseable. 
+func shortConfigHash(annotationValue string) string { + v, err := strconv.ParseUint(strings.TrimSpace(annotationValue), 10, 64) + if err != nil { + return "" + } + return fmt.Sprintf("%x", v) +} + func sandboxAgentSnapshotsLocation(sa *v1alpha2.SandboxAgent) string { if sa == nil { return substrateSnapshotsLocationFor("", "", "") diff --git a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go index adaafb53ad..e2625f7c73 100644 --- a/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go +++ b/go/core/pkg/sandboxbackend/substrate/agent_lifecycle_test.go @@ -3,9 +3,15 @@ package substrate import ( "testing" + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestActorTemplateEnvFromPodEnv(t *testing.T) { @@ -43,39 +49,219 @@ func TestActorTemplateEnvFromPodEnv(t *testing.T) { require.NotNil(t, got[1].ValueFrom.SecretKeyRef) } -func TestBuildSubstrateGoKagentCommand(t *testing.T) { +func TestBuildSubstrateDeclarativeCommand(t *testing.T) { t.Parallel() // Substrate's atelet copies Command verbatim into the OCI Process.Args with - // no image-entrypoint fallback, so the declarative Go command must be explicit. - require.Equal(t, []string{"/app", "--host", "0.0.0.0", "--port", "80"}, buildSubstrateGoKagentCommand()) + // no image-entrypoint fallback, so the declarative command must be explicit. 
+ require.Equal(t, + []string{"/app", "--host", "0.0.0.0", "--port", "80"}, + buildSubstrateDeclarativeCommand(v1alpha2.DeclarativeRuntime_Go), + ) + require.Equal(t, + []string{"/.kagent/.venv/bin/kagent-adk", "static", "--host", "0.0.0.0", "--port", "80"}, + buildSubstrateDeclarativeCommand(v1alpha2.DeclarativeRuntime_Python), + ) } -func TestBuildSubstrateKagentContainerCommand(t *testing.T) { +func declarativeSandboxAgent(runtime v1alpha2.DeclarativeRuntime) *v1alpha2.SandboxAgent { + sa := &v1alpha2.SandboxAgent{ + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: runtime}, + }, + }, + } + sa.Name = "my-agent" + sa.Namespace = "kagent" + return sa +} + +func TestBuildSubstrateKagentContainerCommandDeclarative(t *testing.T) { t.Parallel() + for _, tc := range []struct { + name string + runtime v1alpha2.DeclarativeRuntime + wantCmd []string + wantLibEnv bool // Python needs LD_LIBRARY_PATH re-supplied (substrate drops image ENV); Go does not. + }{ + {"go", v1alpha2.DeclarativeRuntime_Go, []string{"/app", "--host", "0.0.0.0", "--port", "80"}, false}, + {"python", v1alpha2.DeclarativeRuntime_Python, []string{"/.kagent/.venv/bin/kagent-adk", "static", "--host", "0.0.0.0", "--port", "80"}, true}, + } { + t.Run(tc.name, func(t *testing.T) { + sa := declarativeSandboxAgent(tc.runtime) + cmd, env, err := buildSubstrateKagentContainerCommand(sa, &corev1.Container{}, "my-agent-abc123") + require.NoError(t, err) + require.Equal(t, tc.wantCmd, cmd) + + // KAGENT_NAME / KAGENT_NAMESPACE must be literal values so the ADK can + // derive the correct app name (fieldRef env vars are dropped on Substrate). 
+ envByName := map[string]string{} + for _, e := range env { + envByName[e.Name] = e.Value + } + require.Equal(t, "my-agent", envByName["KAGENT_NAME"]) + require.Equal(t, "kagent", envByName["KAGENT_NAMESPACE"]) + + // Config env must reference the per-config-hash Secret (so a golden materializes its + // own config), not the shared per-agent Secret. + for _, e := range env { + if e.Name == "KAGENT_CONFIG_JSON" { + require.NotNil(t, e.ValueFrom) + require.NotNil(t, e.ValueFrom.SecretKeyRef) + require.Equal(t, "my-agent-abc123", e.ValueFrom.SecretKeyRef.Name) + } + } + + // Substrate ignores the image's ENV, so the Python runtime image's + // LD_LIBRARY_PATH and PATH must be re-supplied (or numpy fails to load + // libz.so.1, and bare-name console scripts aren't found). + if tc.wantLibEnv { + require.Equal(t, pythonRuntimeLibPath, envByName["LD_LIBRARY_PATH"]) + require.Equal(t, "1", envByName["PYTHONUNBUFFERED"]) + require.Contains(t, envByName["PATH"], pythonVenvPath+"/bin", "venv bin must be on PATH") + } else { + _, ok := envByName["LD_LIBRARY_PATH"] + require.False(t, ok, "Go declarative must not carry the Python runtime ENV") + } + }) + } +} + +func TestBuildSubstrateKagentContainerCommandBYO(t *testing.T) { + t.Parallel() + + cmd := "/serve" sa := &v1alpha2.SandboxAgent{ Spec: v1alpha2.SandboxAgentSpec{ AgentSpec: v1alpha2.AgentSpec{ - Type: v1alpha2.AgentType_Declarative, - Declarative: &v1alpha2.DeclarativeAgentSpec{ - Runtime: v1alpha2.DeclarativeRuntime_Go, - }, + Type: v1alpha2.AgentType_BYO, + BYO: &v1alpha2.BYOAgentSpec{Deployment: &v1alpha2.ByoDeploymentSpec{Image: "example/agent:latest", Cmd: &cmd}}, }, }, } - sa.Name = "my-agent" + sa.Name = "byo-agent" sa.Namespace = "kagent" - cmd, env := buildSubstrateKagentContainerCommand(sa) - require.Equal(t, []string{"/app", "--host", "0.0.0.0", "--port", "80"}, cmd) + + container := &corev1.Container{Command: []string{"/serve"}, Args: []string{"--host", "0.0.0.0", "--port", "80"}} + got, env, err := 
buildSubstrateKagentContainerCommand(sa, container, "byo-agent") + require.NoError(t, err) + // BYO uses the container command + args verbatim. + require.Equal(t, []string{"/serve", "--host", "0.0.0.0", "--port", "80"}, got) require.NotEmpty(t, env) - // KAGENT_NAME / KAGENT_NAMESPACE must be literal values so the Go ADK can - // derive the correct app name (fieldRef env vars are dropped on Substrate). - envByName := map[string]string{} + // A BYO agent missing an explicit command is rejected. + _, _, err = buildSubstrateKagentContainerCommand(sa, &corev1.Container{}, "byo-agent") + require.Error(t, err) +} + +func newTestLifecycle(t *testing.T) *Lifecycle { + t.Helper() + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + return &Lifecycle{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Defaults: LifecycleDefaults{ + PauseImage: "gcr.io/test/pause@sha256:deadbeef", + }, + } +} + +// envByName flattens an ActorTemplate env list into name->present for assertions. +func actorEnvNames(env []atev1alpha1.EnvVar) map[string]bool { + out := map[string]bool{} for _, e := range env { - envByName[e.Name] = e.Value + out[e.Name] = true + } + return out +} + +// TestBuildSandboxAgentActorTemplate exercises the full ActorTemplate generation for each +// supported runtime/type on substrate (Go declarative, Python declarative, BYO), asserting the +// pinned image, the explicit command, and the env wiring side by side. 
+func TestBuildSandboxAgentActorTemplate(t *testing.T) { + t.Parallel() + + const pinnedImage = "registry.example/kagent-dev/kagent/app@sha256:1111111111111111111111111111111111111111111111111111111111111111" + cmd := "/serve" + wpKey := types.NamespacedName{Namespace: "kagent", Name: "kagent-default"} + + podTemplateFor := func(container corev1.Container) corev1.PodTemplateSpec { + container.Name = defaultKagentContainer + container.Image = pinnedImage + return corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{container}}} + } + + for _, tc := range []struct { + name string + sa *v1alpha2.SandboxAgent + container corev1.Container + wantCommand []string + // declarative agents carry secret-backed config env; BYO does not. + wantConfigEnv bool + // Python declarative re-supplies the image's LD_LIBRARY_PATH (substrate drops image ENV). + wantLibEnv bool + }{ + { + name: "go declarative", + sa: &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "go-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Go}}, + }, + }, + container: corev1.Container{Args: []string{"--host", "0.0.0.0", "--port", "8080", "--filepath", "/config"}}, + wantCommand: []string{"/app", "--host", "0.0.0.0", "--port", "80"}, + wantConfigEnv: true, + wantLibEnv: false, + }, + { + name: "python declarative", + sa: &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "py-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Python}}, + }, + }, + container: corev1.Container{Args: []string{"--host", "0.0.0.0", "--port", "8080", "--filepath", "/config"}}, + wantCommand: []string{"/.kagent/.venv/bin/kagent-adk", "static", "--host", "0.0.0.0", 
"--port", "80"}, + wantConfigEnv: true, + wantLibEnv: true, + }, + { + name: "byo", + sa: &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "byo-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_BYO, BYO: &v1alpha2.BYOAgentSpec{Deployment: &v1alpha2.ByoDeploymentSpec{Image: pinnedImage, Cmd: &cmd}}}, + }, + }, + container: corev1.Container{Command: []string{"/serve"}, Args: []string{"--host", "0.0.0.0", "--port", "80"}}, + wantCommand: []string{"/serve", "--host", "0.0.0.0", "--port", "80"}, + wantConfigEnv: false, + wantLibEnv: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + p := newTestLifecycle(t) + tmpl, err := p.buildSandboxAgentActorTemplate(tc.sa, wpKey, podTemplateFor(tc.container)) + require.NoError(t, err) + + require.Len(t, tmpl.Spec.Containers, 1) + c := tmpl.Spec.Containers[0] + require.Equal(t, pinnedImage, c.Image, "ActorTemplate must use the digest-pinned image") + require.Equal(t, tc.wantCommand, c.Command) + require.Equal(t, wpKey.Name, tmpl.Spec.WorkerPoolRef.Name) + + names := actorEnvNames(c.Env) + require.True(t, names["KAGENT_NAME"], "KAGENT_NAME must be a literal env var") + require.True(t, names["KAGENT_NAMESPACE"], "KAGENT_NAMESPACE must be a literal env var") + require.Equal(t, tc.wantConfigEnv, names["KAGENT_CONFIG_JSON"], "declarative agents materialize config from secret env; BYO does not") + require.Equal(t, tc.wantLibEnv, names["LD_LIBRARY_PATH"], "Python declarative re-supplies the image LD_LIBRARY_PATH that substrate drops") + }) } - require.Equal(t, "my-agent", envByName["KAGENT_NAME"]) - require.Equal(t, "kagent", envByName["KAGENT_NAMESPACE"]) } diff --git a/go/core/pkg/sandboxbackend/substrate/agents_backend.go b/go/core/pkg/sandboxbackend/substrate/agents_backend.go index be06b9cba1..36587eb3b0 100644 --- a/go/core/pkg/sandboxbackend/substrate/agents_backend.go +++ 
b/go/core/pkg/sandboxbackend/substrate/agents_backend.go @@ -6,7 +6,9 @@ import ( atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/consts" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -30,8 +32,15 @@ func (b *AgentsBackend) GetOwnedResourceTypes() []client.Object { return []client.Object{&atev1alpha1.ActorTemplate{}} } +// OwnedResourceTypesFor returns no types: substrate ActorTemplates are intentionally excluded +// from the reconciler's generic prune so a config change does not delete the currently-serving +// template. A config change creates a new config-hashed template; superseded templates and their +// (suspended) goldens are stateful and pin no workers, so they are retained — not retired — and +// removed only when the SandboxAgent is deleted (DeleteAllSandboxAgentActors + +// CleanupSandboxAgentTemplate, plus owner-reference GC of the template objects). ActorTemplate +// remains in GetOwnedResourceTypes for watches. func (b *AgentsBackend) OwnedResourceTypesFor(_ v1alpha2.AgentObject) ([]client.Object, error) { - return b.GetOwnedResourceTypes(), nil + return nil, nil } func (b *AgentsBackend) BuildSandbox(ctx context.Context, in sandboxbackend.BuildInput) ([]client.Object, error) { @@ -54,9 +63,44 @@ func (b *AgentsBackend) BuildSandbox(ctx context.Context, in sandboxbackend.Buil if err != nil { return nil, err } + + // Clone the rendered config into a per-config-hash Secret that the ActorTemplate references + // (see kagentAgentSecretEnv). 
The golden snapshot materializes config.json from this Secret at + // build time; a per-hash name guarantees each distinct config gets its own Secret, so substrate + // cannot hand a stale cached config value to a new golden (which previously froze the wrong + // provider's config into the golden). The Secret is owner-referenced to the SandboxAgent by the + // translator, so it is GC'd with the agent; like the ActorTemplate it is retained across config + // changes (the substrate prune list is empty) and removed only on agent delete. + if configSecret := buildSandboxAgentConfigSecret(sa, in); configSecret != nil { + return []client.Object{configSecret, tmpl}, nil + } return []client.Object{tmpl}, nil } +// buildSandboxAgentConfigSecret clones the rendered config Secret under the per-config-hash name +// the ActorTemplate references. Returns nil when there is no config to clone or no hash (the +// ActorTemplate then falls back to the translator's per-agent Secret). +func buildSandboxAgentConfigSecret(sa *v1alpha2.SandboxAgent, in sandboxbackend.BuildInput) *corev1.Secret { + if in.ConfigSecret == nil { + return nil + } + configHash := shortConfigHash(in.PodTemplate.Annotations[consts.ConfigHashAnnotation]) + if configHash == "" { + return nil + } + return &corev1.Secret{ + TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Secret"}, + ObjectMeta: metav1.ObjectMeta{ + Name: sandboxAgentConfigSecretName(sa, configHash), + Namespace: sa.Namespace, + Labels: sandboxAgentLifecycleLabels(sa), + }, + Type: in.ConfigSecret.Type, + Data: in.ConfigSecret.Data, + StringData: in.ConfigSecret.StringData, + } +} + func (b *AgentsBackend) ComputeReady(ctx context.Context, cl client.Client, nn types.NamespacedName) (metav1.ConditionStatus, string, string) { sa := &v1alpha2.SandboxAgent{} if err := cl.Get(ctx, nn, sa); err != nil { @@ -68,12 +112,14 @@ func (b *AgentsBackend) ComputeReady(ctx context.Context, cl client.Client, nn t if b.Lifecycle == nil { return 
metav1.ConditionUnknown, "SubstrateLifecycleNotConfigured", "substrate lifecycle is not configured" } - tmplKey := types.NamespacedName{Namespace: nn.Namespace, Name: SandboxAgentActorTemplateName(sa)} - ready, err := b.Lifecycle.actorTemplateReady(ctx, tmplKey) + tmpl, err := ResolveCurrentActorTemplate(ctx, cl, nn.Namespace, sa.Name) if err != nil { - return metav1.ConditionUnknown, "ActorTemplateGetFailed", err.Error() + return metav1.ConditionUnknown, "ActorTemplateListFailed", err.Error() + } + if tmpl == nil { + return metav1.ConditionFalse, "ActorTemplateNotFound", "ActorTemplate has not been generated yet" } - if !ready { + if tmpl.Status.Phase != atev1alpha1.PhaseReady { return metav1.ConditionFalse, "ActorTemplateNotReady", "ActorTemplate golden snapshot is not ready" } return metav1.ConditionTrue, "ActorTemplateReady", "ActorTemplate golden snapshot is ready" diff --git a/go/core/pkg/sandboxbackend/substrate/config_hash_test.go b/go/core/pkg/sandboxbackend/substrate/config_hash_test.go new file mode 100644 index 0000000000..e54834ba7f --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/config_hash_test.go @@ -0,0 +1,300 @@ +package substrate + +import ( + "context" + "strings" + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" + "github.com/agent-substrate/substrate/pkg/proto/ateapipb" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/consts" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestShortConfigHash(t *testing.T) { + t.Parallel() + // Matches the translator's decimal uint64 annotation; rendered as hex. 
+ require.Equal(t, "ff", shortConfigHash("255")) + require.Equal(t, "", shortConfigHash("")) + require.Equal(t, "", shortConfigHash("not-a-number")) + require.NotEqual(t, shortConfigHash("100"), shortConfigHash("101")) +} + +func TestSandboxAgentActorTemplateNameWithHash(t *testing.T) { + t.Parallel() + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + + // Distinct configs → distinct template names → distinct golden snapshots. + n1 := sandboxAgentActorTemplateName(sa, "abc123") + n2 := sandboxAgentActorTemplateName(sa, "def456") + require.Equal(t, "my-agent-abc123", n1) + require.NotEqual(t, n1, n2) + require.LessOrEqual(t, len(n1), 63) + + // Empty hash falls back to the stable base name (preserves prior behavior). + require.Equal(t, "my-agent", sandboxAgentActorTemplateName(sa, "")) + + // Long agent names stay within the DNS-1123 budget once the hash suffix is added. + long := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: strings.Repeat("a", 80)}} + require.LessOrEqual(t, len(sandboxAgentActorTemplateName(long, "deadbeefdeadbeef")), 63) +} + +func TestSandboxAgentSessionActorIDVariesWithHash(t *testing.T) { + t.Parallel() + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + + id1 := SandboxAgentSessionActorID(sa, "abc123", "sess-1") + id2 := SandboxAgentSessionActorID(sa, "def456", "sess-1") + require.NotEqual(t, id1, id2, "config change must yield a new actor id so a fresh actor is created") + + // Same hash + session is stable so repeated messages resume the warm actor. + require.Equal(t, id1, SandboxAgentSessionActorID(sa, "abc123", "sess-1")) + + // Keeps the per-agent prefix so DeleteAll / reaping still match by prefix. 
+ prefix := sandboxAgentActorPrefix(sa) + require.True(t, strings.HasPrefix(id1, prefix+"-")) +} + +func TestBuildActorTemplateStampsConfigHash(t *testing.T) { + t.Parallel() + p := newTestLifecycle(t) + sa := &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "py-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{Type: v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Python}}, + }, + } + pod := corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{consts.ConfigHashAnnotation: "255"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{ + Name: defaultKagentContainer, + Image: "registry.example/app@sha256:1111111111111111111111111111111111111111111111111111111111111111", + }}}, + } + wpKey := types.NamespacedName{Namespace: "kagent", Name: "kagent-default"} + tmpl, err := p.buildSandboxAgentActorTemplate(sa, wpKey, pod) + require.NoError(t, err) + require.Equal(t, "py-agent-ff", tmpl.Name, "template name must carry the config-hash suffix") + require.Equal(t, "ff", tmpl.Annotations[consts.ConfigHashAnnotation]) +} + +func TestBuildSandboxClonesConfigSecretPerHash(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + wp := &atev1alpha1.WorkerPool{ObjectMeta: metav1.ObjectMeta{Name: "kagent-default", Namespace: "kagent"}} + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(wp).Build() + p := &Lifecycle{ + Client: cl, + Defaults: LifecycleDefaults{PauseImage: "gcr.io/test/pause@sha256:deadbeef", DefaultWorkerPool: types.NamespacedName{Name: "kagent-default", Namespace: "kagent"}}, + } + b := NewAgentsBackend(p, nil) + sa := &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "py-agent", Namespace: "kagent"}, + Spec: v1alpha2.SandboxAgentSpec{ + AgentSpec: v1alpha2.AgentSpec{Type: 
v1alpha2.AgentType_Declarative, Declarative: &v1alpha2.DeclarativeAgentSpec{Runtime: v1alpha2.DeclarativeRuntime_Python}}, + }, + } + pod := corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{consts.ConfigHashAnnotation: "255"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{ + Name: defaultKagentContainer, + Image: "registry.example/app@sha256:1111111111111111111111111111111111111111111111111111111111111111", + }}}, + } + cfg := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "py-agent", Namespace: "kagent"}, + StringData: map[string]string{"config.json": `{"model":{"type":"gemini"}}`}, + } + + objs, err := b.BuildSandbox(context.Background(), sandboxbackend.BuildInput{Agent: sa, PodTemplate: pod, ConfigSecret: cfg}) + require.NoError(t, err) + require.Len(t, objs, 2, "expect a per-hash config Secret plus the ActorTemplate") + + sec, ok := objs[0].(*corev1.Secret) + require.True(t, ok, "first object must be the cloned config Secret") + require.Equal(t, "py-agent-ff", sec.Name, "config Secret is named per config hash (paired with the template)") + require.Equal(t, `{"model":{"type":"gemini"}}`, sec.StringData["config.json"], "clone carries the rendered config verbatim") + + tmpl, ok := objs[1].(*atev1alpha1.ActorTemplate) + require.True(t, ok) + require.Equal(t, "py-agent-ff", tmpl.Name, "ActorTemplate name matches its per-hash config Secret") + + // No config Secret in the input → no clone (falls back to the per-agent Secret), just the template. + objs, err = b.BuildSandbox(context.Background(), sandboxbackend.BuildInput{Agent: sa, PodTemplate: pod}) + require.NoError(t, err) + require.Len(t, objs, 1) +} + +func TestResolveCurrentActorTemplate(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + // Old template is Ready (serving); newer one is still building. 
Blue-green: serve the old + // Ready golden until the new is Ready, so the resolver must prefer the Ready one even though + // it's older. + oldReady := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "my-agent-old", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "my-agent"}, + CreationTimestamp: metav1.Unix(100, 0), + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseReady}} + newerBuilding := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "my-agent-new", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "my-agent"}, + CreationTimestamp: metav1.Unix(200, 0), + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseResumeGoldenActor}} + other := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "other-agent", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "other-agent"}, + }} + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(oldReady, newerBuilding, other).Build() + + got, err := ResolveCurrentActorTemplate(context.Background(), cl, "kagent", "my-agent") + require.NoError(t, err) + require.NotNil(t, got) + require.Equal(t, "my-agent-old", got.Name, "must prefer the newest READY template (no downtime during rebuild)") + + none, err := ResolveCurrentActorTemplate(context.Background(), cl, "kagent", "absent") + require.NoError(t, err) + require.Nil(t, none) + + // When none is Ready yet (first build), fall back to the newest. 
+ firstBuild := fake.NewClientBuilder().WithScheme(scheme).WithObjects(newerBuilding).Build() + got, err = ResolveCurrentActorTemplate(context.Background(), firstBuild, "kagent", "my-agent") + require.NoError(t, err) + require.NotNil(t, got) + require.Equal(t, "my-agent-new", got.Name) +} + +func TestResolveCurrentActorTemplatePrefersDesiredGeneration(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + // Flip-back scenario: the gemini template was created LATER (higher creationTimestamp) but the + // agent has since flipped back to the openai config, re-applying the older openai template with + // a NEWER generation. The resolver must follow generation (current desired config), not creation + // time — otherwise a flip-back keeps serving the stale (gemini) golden. + openai := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-openai", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{desiredGenerationAnnotation: "6"}, // re-applied on flip-back + CreationTimestamp: metav1.Unix(100, 0), // created earlier + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseReady}} + gemini := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-gemini", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{desiredGenerationAnnotation: "5"}, + CreationTimestamp: metav1.Unix(200, 0), // created later, but no longer desired + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseReady}} + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(openai, gemini).Build() + + got, err := ResolveCurrentActorTemplate(context.Background(), cl, "kagent", "agent") + require.NoError(t, err) + require.NotNil(t, got) + require.Equal(t, "agent-openai", got.Name, "must serve the current desired config (highest generation), not the 
newest-created golden") + + // Forward rollout: desired (gen 7) is still building; serve the previous Ready (gen 6). + building := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-new", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{desiredGenerationAnnotation: "7"}, + CreationTimestamp: metav1.Unix(300, 0), + }, Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseResumeGoldenActor}} + cl2 := fake.NewClientBuilder().WithScheme(scheme).WithObjects(openai, gemini, building).Build() + got, err = ResolveCurrentActorTemplate(context.Background(), cl2, "kagent", "agent") + require.NoError(t, err) + require.Equal(t, "agent-openai", got.Name, "while the desired golden builds, serve the most-recently-desired Ready template") +} + +func TestActorBelongsToSandboxAgent(t *testing.T) { + t.Parallel() + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "my-agent", Namespace: "kagent"}} + prefix := sandboxAgentActorPrefix(sa) + owned := map[string]struct{}{"my-agent-abc123": {}, "my-agent": {}} + + tests := []struct { + name string + actor *ateapipb.Actor + want bool + }{ + { + // The case comment #2 flags: a long agent name / session id forces the prefix-less + // asr- fallback id, which id-prefix matching misses — but the owning template matches. 
+ name: "prefix-less fallback id matched by owning template", + actor: &ateapipb.Actor{ActorId: sandboxAgentIDPrefix + "-deadbeefdeadbeefdeadbeef", ActorTemplateNamespace: "kagent", ActorTemplateName: "my-agent-abc123"}, + want: true, + }, + { + name: "normal session id matched by prefix", + actor: &ateapipb.Actor{ActorId: prefix + "-sess-1", ActorTemplateNamespace: "kagent", ActorTemplateName: "my-agent-abc123"}, + want: true, + }, + { + name: "legacy per-agent id matched exactly", + actor: &ateapipb.Actor{ActorId: SandboxAgentActorID(sa)}, + want: true, + }, + { + name: "orphan actor whose template was already deleted still matched by prefix", + actor: &ateapipb.Actor{ActorId: prefix + "-sess-2", ActorTemplateName: "gone"}, + want: true, + }, + { + name: "unrelated actor not matched", + actor: &ateapipb.Actor{ActorId: "asr-other-ns-other-agent-sess", ActorTemplateNamespace: "kagent", ActorTemplateName: "other-agent"}, + want: false, + }, + { + name: "same template name in a different namespace not matched", + actor: &ateapipb.Actor{ActorId: "asr-xyz", ActorTemplateNamespace: "elsewhere", ActorTemplateName: "my-agent-abc123"}, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.want, actorBelongsToSandboxAgent(sa, tt.actor, prefix, owned)) + }) + } +} + +func TestRetainedSessionConfigHashes(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + tmplA := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-abc123", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{consts.ConfigHashAnnotation: "abc123"}, + }} + tmplB := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "agent-def456", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "agent"}, + Annotations: map[string]string{consts.ConfigHashAnnotation: "def456"}, + }} + 
other := &atev1alpha1.ActorTemplate{ObjectMeta: metav1.ObjectMeta{ + Name: "other", Namespace: "kagent", + Labels: map[string]string{SandboxAgentLabelKey: "other-agent"}, + Annotations: map[string]string{consts.ConfigHashAnnotation: "zzz999"}, + }} + cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tmplA, tmplB, other).Build() + + b := &SandboxAgentActorBackend{kube: cl} + sa := &v1alpha2.SandboxAgent{ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "kagent"}} + hashes, err := b.retainedSessionConfigHashes(context.Background(), sa) + require.NoError(t, err) + // "" is always included (legacy/no-hash actors), plus each retained template's hash; the other + // agent's template hash is excluded. + require.ElementsMatch(t, []string{"", "abc123", "def456"}, hashes) +} diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go index 0c274e6a5c..16f47d8b6b 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go @@ -9,6 +9,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" ) // CleanupGeneratedTemplate removes external Substrate actors that Kubernetes garbage collection cannot see. @@ -72,13 +73,29 @@ func (p *Lifecycle) CleanupSandboxAgentTemplate(ctx context.Context, sa *v1alpha if sa == nil || p == nil || p.Client == nil { return true, nil } - tmplKey := types.NamespacedName{Namespace: sa.Namespace, Name: SandboxAgentActorTemplateName(sa)} - goldenID, err := p.goldenActorID(ctx, tmplKey) - if err != nil { - return false, err + // A SandboxAgent may have multiple generated ActorTemplates in flight (a config change + // creates a new hashed template before the old one is pruned). Clean the golden actor of + // every template carrying the agent's lifecycle label. 
+ list := &atev1alpha1.ActorTemplateList{} + if err := p.Client.List(ctx, list, + client.InNamespace(sa.Namespace), + client.MatchingLabels{SandboxAgentLabelKey: sa.Name}, + ); err != nil { + return false, fmt.Errorf("list ActorTemplates for %s/%s: %w", sa.Namespace, sa.Name, err) } - if goldenID == "" { - return true, nil + allDone := true + for i := range list.Items { + goldenID := strings.TrimSpace(list.Items[i].Status.GoldenActorID) + if goldenID == "" { + continue + } + done, err := deleteGoldenActor(ctx, p.AteClient, goldenID) + if err != nil { + return false, fmt.Errorf("delete golden actor %q: %w", goldenID, err) + } + if !done { + allDone = false + } } - return deleteGoldenActor(ctx, p.AteClient, goldenID) + return allDone, nil } diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go index f3d0fa52da..20835d8791 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go @@ -3,6 +3,7 @@ package substrate import ( "context" "fmt" + "strconv" "strings" atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" @@ -148,13 +149,94 @@ func actorTemplateName(ah *v1alpha2.AgentHarness) string { } func truncateDNS1123(s string) string { + return truncateDNS1123To(s, 63) +} + +func truncateDNS1123To(s string, max int) string { s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) - if len(s) > 63 { - s = strings.TrimRight(s[:63], "-") + if len(s) > max { + s = strings.TrimRight(s[:max], "-") } return s } +// ResolveCurrentActorTemplate returns the ActorTemplate a SandboxAgent should currently serve +// from: the template matching the agent's CURRENT desired config whose golden is Ready, else the +// most-recently-desired Ready template (the previous config) while the desired one is still +// building — the blue-green pivot, with no downtime and an atomic flip once the new golden is +// Ready. 
+// +// "Desired" is tracked by the kagent.dev/desired-generation annotation (the agent generation that +// last applied the template), NOT creationTimestamp. Creation time is wrong for a flip-back to a +// retained older config: that template's golden was built earlier, so by-creation ordering would +// keep serving the newer (now-undesired) config. The desired template is always re-applied with +// the current (highest) generation, so picking the highest-generation Ready template follows the +// current config in both directions. Falls back to the highest-generation template when none is +// Ready yet (first build). Returns (nil, nil) when no template exists. +func ResolveCurrentActorTemplate(ctx context.Context, kube client.Client, namespace, agentName string) (*atev1alpha1.ActorTemplate, error) { + templates, err := listSandboxAgentActorTemplates(ctx, kube, namespace, agentName) + if err != nil { + return nil, err + } + var desiredReady, desired *atev1alpha1.ActorTemplate + for i := range templates { + t := templates[i] + if desired == nil || moreDesiredActorTemplate(t, desired) { + desired = t + } + if t.Status.Phase == atev1alpha1.PhaseReady { + if desiredReady == nil || moreDesiredActorTemplate(t, desiredReady) { + desiredReady = t + } + } + } + if desiredReady != nil { + return desiredReady, nil + } + return desired, nil +} + +// moreDesiredActorTemplate reports whether a is "more desired" than b: a higher desired-generation +// wins (the template applied for the current config), with creationTimestamp as a tiebreaker for +// legacy templates that predate the annotation. +func moreDesiredActorTemplate(a, b *atev1alpha1.ActorTemplate) bool { + ga, gb := actorTemplateDesiredGeneration(a), actorTemplateDesiredGeneration(b) + if ga != gb { + return ga > gb + } + return a.CreationTimestamp.After(b.CreationTimestamp.Time) +} + +// actorTemplateDesiredGeneration parses the desired-generation annotation; absent/invalid is 0. 
+func actorTemplateDesiredGeneration(t *atev1alpha1.ActorTemplate) int64 { + g, err := strconv.ParseInt(t.Annotations[desiredGenerationAnnotation], 10, 64) + if err != nil { + return 0 + } + return g +} + +// listSandboxAgentActorTemplates returns the non-terminating generated ActorTemplates for an agent. +func listSandboxAgentActorTemplates(ctx context.Context, kube client.Client, namespace, agentName string) ([]*atev1alpha1.ActorTemplate, error) { + if kube == nil { + return nil, fmt.Errorf("kubernetes client is required") + } + list := &atev1alpha1.ActorTemplateList{} + if err := kube.List(ctx, list, + client.InNamespace(namespace), + client.MatchingLabels{SandboxAgentLabelKey: agentName}, + ); err != nil { + return nil, fmt.Errorf("list ActorTemplates for %s/%s: %w", namespace, agentName, err) + } + out := make([]*atev1alpha1.ActorTemplate, 0, len(list.Items)) + for i := range list.Items { + if list.Items[i].DeletionTimestamp.IsZero() { + out = append(out, &list.Items[i]) + } + } + return out, nil +} + // pinImageRef ensures image refs satisfy Substrate ActorTemplate validation (must contain "@"). func pinImageRef(image string) (string, error) { image = strings.TrimSpace(image) diff --git a/go/core/test/e2e/agents/kebab/Dockerfile b/go/core/test/e2e/agents/kebab/Dockerfile index e6959afb35..337c1eb554 100644 --- a/go/core/test/e2e/agents/kebab/Dockerfile +++ b/go/core/test/e2e/agents/kebab/Dockerfile @@ -2,7 +2,9 @@ ARG DOCKER_REGISTRY=ghcr.io ARG VERSION=latest ARG DOCKER_REPO=kagent-dev/kagent -FROM $DOCKER_REGISTRY/$DOCKER_REPO/kagent-adk:$VERSION +# Use the "-full" runtime image: it ships uv + a shell, which `uv sync` below needs. +# The default kagent-adk image is now distroless slim and has neither uv nor /bin/sh. 
+FROM $DOCKER_REGISTRY/$DOCKER_REPO/kagent-adk:$VERSION-full WORKDIR /app @@ -12,6 +14,8 @@ COPY README.md README.md COPY .python-version .python-version COPY uv.lock uv.lock -RUN uv sync --locked --refresh +# Install only the kebab package into the inherited ADK venv. The base image already provides +# google-adk/kagent-adk; syncing this child project's lockfile downgrades shared runtime deps. +RUN uv pip install --python /.kagent/.venv/bin/python --no-deps . CMD ["kebab"] \ No newline at end of file diff --git a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml index bff6672d68..6bb5fbbbd2 100644 --- a/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml +++ b/helm/kagent-crds/templates/kagent.dev_sandboxagents.yaml @@ -11328,8 +11328,6 @@ spec: x-kubernetes-validations: - message: spec.skills is not supported for sandbox agents rule: '!has(self.skills)' - - message: BYO agents are not supported for sandbox agents - rule: '!has(self.type) || self.type != ''BYO''' - message: type must be specified rule: has(self.type) - message: type must be either Declarative or BYO diff --git a/python/Dockerfile b/python/Dockerfile index 03b9bf492f..e79ca36019 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -1,140 +1,106 @@ -### STAGE 1: base image -ARG BASE_IMAGE_REGISTRY=cgr.dev +### STAGE 1: uv binary ARG UV_VERSION=0.11.15 FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv-bin -FROM $BASE_IMAGE_REGISTRY/chainguard/wolfi-base:latest AS base-os -# Build arg to control SSL verification (set DISABLE_SSL_VERIFY=1 to skip SSL checks) -ARG DISABLE_SSL_VERIFY=0 +### STAGE 2: builder +# Build the uv-managed standalone Python interpreter and the project venv on a full base +# (debian-slim, digest-pinnable). Nothing from this stage ships except /python and the venv. +# Digest = multi-arch index resolved via `docker buildx imagetools inspect debian:12-slim`. 
+FROM debian:12-slim@sha256:96e378d7e6531ac9a15ad505478fcc2e69f371b10f5cdf87857c4b8188404716 AS builder +ARG TOOLS_PYTHON_VERSION=3.13 ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 - -# Install packages with conditional SSL verification -# When DISABLE_SSL_VERIFY=1, use --no-check-certificate to bypass SSL checks (development only) -RUN --mount=type=cache,target=/var/cache/apk,rw \ - if [ "$DISABLE_SSL_VERIFY" = "1" ]; then \ - echo "WARNING: Disabling SSL verification for apk (development only)"; \ - apk update --no-check-certificate && apk add --no-check-certificate \ - curl openssl bash git ca-certificates libstdc++; \ - else \ - apk update && apk add \ - curl openssl bash git ca-certificates libstdc++; \ - fi - -# Install uv from upstream so we control the version and pick up rkyv fixes -# independently of the Wolfi apk release cadence. -COPY --from=uv-bin /uv /uvx /usr/local/bin/ - -### STAGE 2: python -FROM base-os AS python-os -ARG TOOLS_PYTHON_VERSION=3.13 - ENV PYTHONOPTIMIZE=2 ENV PYTHONUNBUFFERED=1 -# Optimize malloc for containerized Python workloads -# 256KB threshold balances memory efficiency with performance -ENV MALLOC_TRIM_THRESHOLD_=262144 -ENV MALLOC_ARENA_MAX=2 - -ENV GIT_LFS_SKIP_SMUDGE=1 - +# uv configuration: install a managed standalone Python under /python and a copy-mode venv +# (no editable installs) so the final image is self-contained with no source tree dependency. 
ENV UV_LINK_MODE=copy ENV UV_COMPILE_BYTECODE=1 -ENV UV_COMPILE_BYTECODE_TIMEOUT=300 -ENV UV_SYSTEM_PYTHON=1 ENV UV_NO_PROGRESS=1 ENV UV_HTTP_TIMEOUT=60 -ENV UV_CONCURRENT_DOWNLOADS=10 - -# Configure the Python directories ENV UV_CACHE_DIR=/.kagent/cache/packages -ENV UV_TOOL_DIR=/.kagent/cache/tools ENV UV_PYTHON_DOWNLOADS_DIR=/.kagent/cache/downloads ENV UV_PROJECT_ENVIRONMENT=/.kagent/.venv - ENV UV_PYTHON_INSTALL_DIR=/python ENV UV_PYTHON_PREFERENCE=only-managed -RUN addgroup -g 1001 pythongroup && \ - adduser -u 1001 -G pythongroup -s /bin/bash -D python -h /.kagent/ && \ - mkdir -p $UV_PYTHON_DOWNLOADS_DIR && \ - mkdir -p $UV_TOOL_DIR && \ - mkdir -p $UV_CACHE_DIR && \ - mkdir -p /python && \ - chown -vR 1001:1001 /.kagent /python - -# Install anthropic sandbox runtime and dependencies -RUN --mount=type=cache,target=/var/cache/apk,rw \ - apk add \ - nodejs npm node-gyp bubblewrap socat ripgrep - -# Install sandbox runtime from a specific commit of the GitHub repo without using global prefix -# This avoids scope-related rename issues in global node_modules -# Using BuildKit cache for npm to speed up rebuilds -# Keep the pinned sandbox-runtime revision, but replace its vulnerable locked lodash-es version. -RUN --mount=type=cache,target=/root/.npm \ - mkdir -p /opt && \ - cd /opt && \ - git clone --depth 1 --revision=ef4afdef4d711ba21a507d7f7369e305f7d3dbfa https://github.com/anthropic-experimental/sandbox-runtime.git && \ - cd sandbox-runtime && \ - npm install --save-exact lodash-es@4.18.1 @types/lodash-es@4.17.12 && \ - npm install --save-exact brace-expansion@5.0.6 && \ - npm run build && \ - # CVE-2026-26996: all minimatch instances (3.1.2, 9.0.5) are transitive dev - # deps (eslint, typescript-eslint). Prune dev deps after build to remove them. 
- npm prune --omit=dev && \ - npm install -g --ignore-scripts - -# Ensure the sandbox runtime binaries are on PATH -ENV PATH="/opt/sandbox-runtime/node_modules/.bin:$PATH" - -USER python -WORKDIR /.kagent +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates \ + && rm -rf /var/lib/apt/lists/* -### STAGE 3: final -FROM python-os AS builder -ARG TOOLS_PYTHON_VERSION +COPY --from=uv-bin /uv /uvx /usr/local/bin/ WORKDIR /.kagent -ENV PATH=$PATH:/.kagent/bin:/.kagent/.venv/bin - # Copy dependency files first for better layer caching -COPY --chown=python:pythongroup pyproject.toml . -COPY --chown=python:pythongroup .python-version . -COPY --chown=python:pythongroup uv.lock . -COPY --chown=python:pythongroup packages/kagent-adk packages/kagent-adk -COPY --chown=python:pythongroup packages/kagent-core packages/kagent-core -COPY --chown=python:pythongroup packages/kagent-skills packages/kagent-skills -COPY --chown=python:pythongroup packages/agentsts-adk packages/agentsts-adk -COPY --chown=python:pythongroup packages/agentsts-core packages/agentsts-core -COPY --chown=python:pythongroup README.md . +COPY pyproject.toml . +COPY .python-version . +COPY uv.lock . +COPY README.md . +COPY packages/kagent-adk packages/kagent-adk +COPY packages/kagent-core packages/kagent-core +COPY packages/kagent-skills packages/kagent-skills +COPY packages/agentsts-adk packages/agentsts-adk +COPY packages/agentsts-core packages/agentsts-core ARG VERSION -# Install dependencies - make sure /.kagent/.venv/bin in path and not in cache mount -RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ +# Create the venv and install kagent-adk. --no-editable copies the workspace packages into +# site-packages so the runtime does not need the source tree. +RUN --mount=type=cache,target=/.kagent/cache,rw \ echo "Creating virtual environment and installing dependencies..." 
\ && uv venv --python=python$TOOLS_PYTHON_VERSION \ - && uv lock && uv sync --package kagent-adk \ + && uv lock && uv sync --package kagent-adk --no-editable \ && uv cache prune \ && echo "Installation complete." -# Create a separate venv for bash tool commands (sandbox environment) -# This venv does not have pip installed -RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ - echo "Creating bash tool sandbox environment..." \ - && mkdir -p /.kagent/sandbox-venv \ - && uv venv --python=python$TOOLS_PYTHON_VERSION /.kagent/sandbox-venv \ - && echo "Bash tool sandbox environment created." +# Pre-create the config dir owned by the runtime user. On Agent Substrate the config is +# materialized into /config at startup (the env-injected path); distroless runs as nonroot and +# cannot create top-level dirs, so it must exist with the right owner ahead of time. On the +# normal Deployment path /config is overlaid by the mounted Secret volume, so this is harmless. +RUN mkdir -p /staging/config + +### STAGE 3: final (distroless) +# distroless/cc provides glibc + libstdc++ (required by the standalone CPython build) but no +# shell or package manager. Agents that need in-container code execution / bash tools use the +# "full" image (python/Dockerfile.full) instead. +# Pinned by digest so rebuilds can't silently pull a different base. Distroless publishes no +# version-numbered tags (only :nonroot/:latest/:debug), so a digest is the only way to pin tighter +# than "Debian 12". Tag kept for readability; bump the digest via dependency tooling. +# Digest = multi-arch index resolved via `docker buildx imagetools inspect gcr.io/distroless/cc-debian12:nonroot`. 
+FROM gcr.io/distroless/cc-debian12:nonroot@sha256:b0ae8e989418b458e0f25489bc3be523718938a2b70864cc0f6a00af1ddbd985 +ARG VERSION +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PYTHONUNBUFFERED=1 ENV PATH="/.kagent/.venv/bin:$PATH" -ENV UV_PROJECT_ENVIRONMENT=/app/.venv -ENV BASH_VENV_PATH=/.kagent/sandbox-venv ENV VIRTUAL_ENV=/.kagent/.venv +# The standalone interpreter and the venv (the venv's python is linked to /python). +COPY --from=builder /python /python +COPY --from=builder /.kagent/.venv /.kagent/.venv +# Writable config dir for substrate config materialization (see builder stage). +COPY --from=builder --chown=65532:65532 /staging/config /config + +# The standalone CPython build and numpy's C-extensions dynamically link a handful of system +# libraries that distroless/cc does not ship (zlib, bz2, lzma, ffi, sqlite). Copy them from the +# builder into an arch-agnostic dir on LD_LIBRARY_PATH. The *-linux-gnu glob matches the single +# multiarch dir present for the target arch (amd64/arm64). +COPY --from=builder /usr/lib/*-linux-gnu/libz.so.1 /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/libbz2.so.1* /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/liblzma.so.5* /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/libffi.so.8* /usr/lib/kagent-libs/ +COPY --from=builder /usr/lib/*-linux-gnu/libsqlite3.so.0* /usr/lib/kagent-libs/ +ENV LD_LIBRARY_PATH=/usr/lib/kagent-libs + WORKDIR /app +USER 65532:65532 + +LABEL org.opencontainers.image.source=https://github.com/kagent-dev/kagent +LABEL org.opencontainers.image.description="Kagent ADK Python runtime (distroless, no sandbox runtime)." 
+LABEL org.opencontainers.image.version="$VERSION" -ENTRYPOINT ["kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] +ENTRYPOINT ["/.kagent/.venv/bin/kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] diff --git a/python/Dockerfile.app b/python/Dockerfile.app index 8cbfac077c..b60503e419 100644 --- a/python/Dockerfile.app +++ b/python/Dockerfile.app @@ -1,6 +1,9 @@ ARG KAGENT_ADK_VERSION=latest ARG DOCKER_REGISTRY=ghcr.io ARG DOCKER_REPO=kagent-dev/kagent +# The base runtime is selected by tag: KAGENT_ADK_VERSION= for the distroless slim +# base, or -full for the base that includes the sandbox runtime. The app image just +# overrides the entrypoint to serve declarative agents. FROM $DOCKER_REGISTRY/$DOCKER_REPO/kagent-adk:$KAGENT_ADK_VERSION # Offline mode @@ -14,5 +17,5 @@ LABEL org.opencontainers.image.description="Kagent app is the Kagent agent runti LABEL org.opencontainers.image.authors="Kagent Creators 🤖" LABEL org.opencontainers.image.version="$VERSION" -ENTRYPOINT ["kagent-adk", "static"] -CMD ["--host", "0.0.0.0", "--port", "8080"] \ No newline at end of file +ENTRYPOINT ["/.kagent/.venv/bin/kagent-adk", "static"] +CMD ["--host", "0.0.0.0", "--port", "8080"] diff --git a/python/Dockerfile.full b/python/Dockerfile.full new file mode 100644 index 0000000000..f853107948 --- /dev/null +++ b/python/Dockerfile.full @@ -0,0 +1,119 @@ +# Full Python ADK runtime image: includes the Anthropic sandbox-runtime (node, bubblewrap, +# socat, ripgrep) and a bash tool venv for agents that execute code / run shell tools. Unlike +# python/Dockerfile (distroless slim), this image needs a shell and package tooling, so it is +# built on a digest-pinnable debian-slim base rather than distroless. The controller selects +# this image (PythonADKFullImageDigest) for declarative agents that need SRT (skills, +# executeCodeBlocks) and for sandboxed BYO agents. 
+ +### STAGE 1: uv binary +ARG UV_VERSION=0.11.15 +FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv-bin + +### STAGE 2: base os + sandbox runtime +# node:20-bookworm-slim is a digest-pinnable debian-bookworm base that ships Node 20 (the +# sandbox-runtime requires node >= 20; debian's own nodejs package is still on 18). +# Digest = multi-arch index resolved via `docker buildx imagetools inspect node:20-bookworm-slim`. +FROM node:20-bookworm-slim@sha256:2cf067cfed83d5ea958367df9f966191a942351a2df77d6f0193e162b5febfc0 AS python-os +ARG TOOLS_PYTHON_VERSION=3.13 + +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 +ENV PYTHONOPTIMIZE=2 +ENV PYTHONUNBUFFERED=1 + +# Optimize malloc for containerized Python workloads +ENV MALLOC_TRIM_THRESHOLD_=262144 +ENV MALLOC_ARENA_MAX=2 +ENV GIT_LFS_SKIP_SMUDGE=1 + +ENV UV_LINK_MODE=copy +ENV UV_COMPILE_BYTECODE=1 +ENV UV_NO_PROGRESS=1 +ENV UV_HTTP_TIMEOUT=60 +ENV UV_CACHE_DIR=/.kagent/cache/packages +ENV UV_TOOL_DIR=/.kagent/cache/tools +ENV UV_PYTHON_DOWNLOADS_DIR=/.kagent/cache/downloads +ENV UV_PROJECT_ENVIRONMENT=/.kagent/.venv +ENV UV_PYTHON_INSTALL_DIR=/python +ENV UV_PYTHON_PREFERENCE=only-managed + +# node/npm come from the base image; add the remaining runtime tooling. +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl openssl bash git ca-certificates \ + bubblewrap socat ripgrep \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=uv-bin /uv /uvx /usr/local/bin/ + +RUN groupadd -g 1001 pythongroup \ + && useradd -u 1001 -g pythongroup -s /bin/bash -d /.kagent -m python \ + && mkdir -p $UV_PYTHON_DOWNLOADS_DIR $UV_TOOL_DIR $UV_CACHE_DIR /python \ + && chown -R 1001:1001 /.kagent /python + +# Install the Anthropic sandbox runtime from a pinned revision. Replace its vulnerable locked +# transitive deps, build, then prune dev deps (matches the previous Chainguard-based image). +# Fetch the pinned revision via init+fetch (debian's git predates `git clone --revision`). 
+# GitHub allows fetching an arbitrary commit SHA directly. +RUN --mount=type=cache,target=/root/.npm \ + mkdir -p /opt/sandbox-runtime && cd /opt/sandbox-runtime \ + && git init -q \ + && git remote add origin https://github.com/anthropic-experimental/sandbox-runtime.git \ + && git fetch --depth 1 origin ef4afdef4d711ba21a507d7f7369e305f7d3dbfa \ + && git checkout -q FETCH_HEAD \ + && npm pkg delete scripts.prepare \ + && npm install --ignore-scripts --save-exact lodash-es@4.18.1 @types/lodash-es@4.17.12 \ + && npm install --ignore-scripts --save-exact brace-expansion@5.0.6 \ + && npm run build \ + && npm prune --omit=dev \ + && npm install -g --ignore-scripts + +ENV PATH="/opt/sandbox-runtime/node_modules/.bin:$PATH" + +USER python +WORKDIR /.kagent + +### STAGE 3: final (install project) +FROM python-os AS builder +# Default kept here too: ARG defaults do not carry across stages, and this value is used below +# in `uv venv --python=python$TOOLS_PYTHON_VERSION`, so an unset build-arg would break the build. +ARG TOOLS_PYTHON_VERSION=3.13 + +WORKDIR /.kagent +ENV PATH=$PATH:/.kagent/bin:/.kagent/.venv/bin + +COPY --chown=python:pythongroup pyproject.toml . +COPY --chown=python:pythongroup .python-version . +COPY --chown=python:pythongroup uv.lock . +COPY --chown=python:pythongroup packages/kagent-adk packages/kagent-adk +COPY --chown=python:pythongroup packages/kagent-core packages/kagent-core +COPY --chown=python:pythongroup packages/kagent-skills packages/kagent-skills +COPY --chown=python:pythongroup packages/agentsts-adk packages/agentsts-adk +COPY --chown=python:pythongroup packages/agentsts-core packages/agentsts-core +COPY --chown=python:pythongroup README.md . + +ARG VERSION + +RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ + echo "Creating virtual environment and installing dependencies..." 
\ + && uv venv --python=python$TOOLS_PYTHON_VERSION \ + && uv lock && uv sync --package kagent-adk \ + && uv cache prune \ + && echo "Installation complete." + +# Separate venv for bash tool commands (sandbox environment); no pip installed. +RUN --mount=type=cache,target=/.kagent/cache,uid=1001,gid=1001 \ + mkdir -p /.kagent/sandbox-venv \ + && uv venv --python=python$TOOLS_PYTHON_VERSION /.kagent/sandbox-venv + +ENV PATH="/.kagent/.venv/bin:$PATH" +ENV BASH_VENV_PATH=/.kagent/sandbox-venv +ENV VIRTUAL_ENV=/.kagent/.venv + +LABEL org.opencontainers.image.source=https://github.com/kagent-dev/kagent +LABEL org.opencontainers.image.description="Kagent ADK Python runtime (full: includes sandbox runtime)." +LABEL org.opencontainers.image.version="$VERSION" + +WORKDIR /app + +ENTRYPOINT ["/.kagent/.venv/bin/kagent-adk", "run", "--host", "0.0.0.0", "--port", "8080"] diff --git a/python/packages/kagent-adk/src/kagent/adk/_config_materialize.py b/python/packages/kagent-adk/src/kagent/adk/_config_materialize.py new file mode 100644 index 0000000000..66e5406ffd --- /dev/null +++ b/python/packages/kagent-adk/src/kagent/adk/_config_materialize.py @@ -0,0 +1,57 @@ +"""Materialize Agent Substrate secret-backed configuration from environment variables. + +On Agent Substrate the ActorTemplate cannot mount the agent config as files; instead the +config is injected as secret-backed environment variables and the running process must write +them to the on-disk paths the ADK loads from at startup. This mirrors the Go ADK's +``MaterializeFromEnv`` (see ``go/adk/pkg/config/config_materialize.go``): the environment value +is written verbatim (raw, not base64-encoded) to the destination file. + +When the environment variables are absent (the normal Kubernetes Deployment path, where the +config is mounted as a volume) this is a no-op. 
+""" + +import logging +import os + +logger = logging.getLogger(__name__) + +# Environment variables injected by the substrate ActorTemplate, keyed to the file name the +# ADK loads from within the config directory. +_ENV_TO_CONFIG_FILE = { + "KAGENT_CONFIG_JSON": "config.json", + "KAGENT_AGENT_CARD_JSON": "agent-card.json", + "KAGENT_SRT_SETTINGS_JSON": "srt-settings.json", +} + +# The bearer token is materialized to a fixed path outside the config dir, matching the Go ADK. +_KAGENT_TOKEN_ENV = "KAGENT_TOKEN" +_KAGENT_TOKEN_PATH = "/var/run/secrets/tokens/kagent-token" + + +def _materialize_env_to_file(env_key: str, path: str) -> bool: + """Write the raw value of ``env_key`` to ``path`` (0600). Returns True if written.""" + value = os.getenv(env_key, "").strip() + if not value: + return False + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as f: + f.write(value) + os.chmod(path, 0o600) + return True + + +def materialize_from_env(config_dir: str) -> None: + """Write substrate secret-backed env vars to the paths the ADK loads from. + + No-op for any variable that is unset, so the volume-mounted Deployment path is unaffected. + """ + for env_key, filename in _ENV_TO_CONFIG_FILE.items(): + if _materialize_env_to_file(env_key, os.path.join(config_dir, filename)): + logger.info("Materialized %s from %s", filename, env_key) + # Best-effort: the token path (/var/run/secrets/tokens) may not exist or be writable for a + # nonroot runtime. A missing token only degrades authenticated callbacks, so log and continue + # rather than crash startup. 
+ try: + _materialize_env_to_file(_KAGENT_TOKEN_ENV, _KAGENT_TOKEN_PATH) + except OSError as e: + logger.warning("Could not materialize %s to %s: %s", _KAGENT_TOKEN_ENV, _KAGENT_TOKEN_PATH, e) diff --git a/python/packages/kagent-adk/src/kagent/adk/cli.py b/python/packages/kagent-adk/src/kagent/adk/cli.py index e32d0aacbf..9c32d19f2d 100644 --- a/python/packages/kagent-adk/src/kagent/adk/cli.py +++ b/python/packages/kagent-adk/src/kagent/adk/cli.py @@ -14,6 +14,7 @@ from kagent.core import KAgentConfig, configure_logging, configure_tracing from . import AgentConfig, KAgentApp +from ._config_materialize import materialize_from_env from .tools import add_skills_tool_to_agent logger = logging.getLogger(__name__) @@ -60,6 +61,10 @@ def static( ): app_cfg = KAgentConfig() + # On Agent Substrate the config is injected as secret-backed env vars rather than mounted + # files; materialize them into `filepath` before loading. No-op on the Deployment path. + materialize_from_env(filepath) + with open(os.path.join(filepath, "config.json"), "r") as f: config = json.load(f) agent_config = AgentConfig.model_validate(config) diff --git a/python/packages/kagent-adk/tests/unittests/test_config_materialize.py b/python/packages/kagent-adk/tests/unittests/test_config_materialize.py new file mode 100644 index 0000000000..f3e0bd8910 --- /dev/null +++ b/python/packages/kagent-adk/tests/unittests/test_config_materialize.py @@ -0,0 +1,81 @@ +import os + +import pytest + +from kagent.adk._config_materialize import materialize_from_env + + +def test_materializes_present_env_vars(tmp_path, monkeypatch): + monkeypatch.setenv("KAGENT_CONFIG_JSON", '{"model": {"type": "openai"}}') + monkeypatch.setenv("KAGENT_AGENT_CARD_JSON", '{"name": "test"}') + monkeypatch.setenv("KAGENT_SRT_SETTINGS_JSON", '{"network": {}}') + monkeypatch.delenv("KAGENT_TOKEN", raising=False) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) + + assert (config_dir / "config.json").read_text() == 
'{"model": {"type": "openai"}}' + assert (config_dir / "agent-card.json").read_text() == '{"name": "test"}' + assert (config_dir / "srt-settings.json").read_text() == '{"network": {}}' + # Written with 0600 permissions, matching the Go ADK. + assert oct(os.stat(config_dir / "config.json").st_mode & 0o777) == "0o600" + + +def test_noop_when_env_absent(tmp_path, monkeypatch): + for key in ("KAGENT_CONFIG_JSON", "KAGENT_AGENT_CARD_JSON", "KAGENT_SRT_SETTINGS_JSON", "KAGENT_TOKEN"): + monkeypatch.delenv(key, raising=False) + + config_dir = tmp_path / "config" + # Should not raise and should not create the directory/files. + materialize_from_env(str(config_dir)) + + assert not (config_dir / "config.json").exists() + + +def test_blank_env_is_skipped(tmp_path, monkeypatch): + monkeypatch.setenv("KAGENT_CONFIG_JSON", " ") + monkeypatch.delenv("KAGENT_AGENT_CARD_JSON", raising=False) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) + + assert not (config_dir / "config.json").exists() + + +def test_partial_env_only_writes_present(tmp_path, monkeypatch): + monkeypatch.setenv("KAGENT_CONFIG_JSON", "{}") + monkeypatch.delenv("KAGENT_AGENT_CARD_JSON", raising=False) + monkeypatch.delenv("KAGENT_SRT_SETTINGS_JSON", raising=False) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) + + assert (config_dir / "config.json").exists() + assert not (config_dir / "agent-card.json").exists() + assert not (config_dir / "srt-settings.json").exists() + + +def test_unwritable_token_path_does_not_crash(tmp_path, monkeypatch): + # A nonroot runtime may not be able to write the token path; that must degrade gracefully + # (log + continue), not crash startup, and config files must still be materialized. 
+ monkeypatch.setenv("KAGENT_CONFIG_JSON", "{}") + monkeypatch.setenv("KAGENT_TOKEN", "tok") + monkeypatch.setattr( + "kagent.adk._config_materialize._KAGENT_TOKEN_PATH", + str(tmp_path / "ro" / "tokens" / "kagent-token"), + ) + + # Make the token's parent dir creation fail as if on a read-only mount. + real_makedirs = os.makedirs + + def fake_makedirs(path, *args, **kwargs): + if "/ro/" in path or path.endswith("/ro"): + raise PermissionError("read-only file system") + return real_makedirs(path, *args, **kwargs) + + monkeypatch.setattr(os, "makedirs", fake_makedirs) + + config_dir = tmp_path / "config" + materialize_from_env(str(config_dir)) # must not raise + + assert (config_dir / "config.json").exists() diff --git a/scripts/controller-digest-ldflags.sh b/scripts/controller-digest-ldflags.sh index 718abd5ca4..498d069081 100755 --- a/scripts/controller-digest-ldflags.sh +++ b/scripts/controller-digest-ldflags.sh @@ -3,6 +3,7 @@ # # Required environment variables: # APP_IMG Python agent runtime image ref (repo:tag) +# APP_FULL_IMG Python agent full runtime image ref (repo:tag) # GOLANG_ADK_IMG Go agent runtime image ref (repo:tag) # GOLANG_ADK_FULL_IMG Go agent full runtime image ref (repo:tag) # ACP_SANDBOX_OPENCLAW_IMG acp-sandbox openclaw workload image ref (repo:tag) @@ -20,6 +21,7 @@ SUBSTRATE_PKG="github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate MANIFEST_ACCEPT="application/vnd.oci.image.index.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.v2+json" : "${APP_IMG:?APP_IMG is required}" +: "${APP_FULL_IMG:?APP_FULL_IMG is required}" : "${GOLANG_ADK_IMG:?GOLANG_ADK_IMG is required}" : "${GOLANG_ADK_FULL_IMG:?GOLANG_ADK_FULL_IMG is required}" : "${ACP_SANDBOX_OPENCLAW_IMG:?ACP_SANDBOX_OPENCLAW_IMG is required}" @@ -102,6 +104,7 @@ append_digest_ldflag() { } append_digest_ldflag "${TRANSLATOR_PKG}" "PythonADKImageDigest" "${APP_IMG}" 
+append_digest_ldflag "${TRANSLATOR_PKG}" "PythonADKFullImageDigest" "${APP_FULL_IMG}" append_digest_ldflag "${TRANSLATOR_PKG}" "GoADKImageDigest" "${GOLANG_ADK_IMG}" append_digest_ldflag "${TRANSLATOR_PKG}" "GoADKFullImageDigest" "${GOLANG_ADK_FULL_IMG}" append_digest_ldflag "${SUBSTRATE_PKG}" "AcpSandboxOpenClawImageDigest" "${ACP_SANDBOX_OPENCLAW_IMG}" diff --git a/ui/src/app/agents/new/page.tsx b/ui/src/app/agents/new/page.tsx index 91480ec19e..87c386952d 100644 --- a/ui/src/app/agents/new/page.tsx +++ b/ui/src/app/agents/new/page.tsx @@ -150,7 +150,9 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo const useDeclarativeAgentFields = formUsesDeclarativeSections(state.agentType); const substrateSandboxAgent = state.runInSandbox; - const showDeclarativeRuntimeField = useDeclarativeAgentFields && !substrateSandboxAgent; + // Substrate supports both Python and Go declarative runtimes, so the runtime selector is + // shown for declarative agents. + const showDeclarativeRuntimeField = useDeclarativeAgentFields; const showByoFields = formUsesByoSections(state.agentType); const showModelAndBehaviorSection = useDeclarativeAgentFields; const skillsEnabled = useDeclarativeAgentFields && !state.runInSandbox; @@ -225,7 +227,6 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo agentResponse.workloadMode === "sandbox" ? sandboxFieldsFromApiSpec(agent.spec?.substrate) : {}; - const isSubstrateSandbox = agentResponse.workloadMode === "sandbox"; const useDeclarativeForm = agent.spec.type === "Declarative"; if (useDeclarativeForm) { const decl = agent.spec?.declarative; @@ -257,11 +258,8 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo : [newEmptyGitSkillRow()], skillsGitAuthSecretName: agent.spec?.skills?.gitAuthSecretRef?.name || "", stream: decl?.stream ?? false, - declarativeRuntime: isSubstrateSandbox - ? "go" - : decl?.runtime === "go" - ? 
"go" - : "python", + // Honor the persisted runtime for all platforms (substrate supports Python and Go). + declarativeRuntime: decl?.runtime === "go" ? "go" : "python", selectedMemoryModel: memoryModelConfig ? { ref: memoryModelConfig, spec: { model: memorySpec?.modelConfig || "", provider: "" } } : null, @@ -359,6 +357,12 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo const newErrors = validateAgentData(formData); + // BYO agents on substrate must set an explicit command: substrate copies the container + // command verbatim and does not fall back to the image entrypoint (mirrors the backend). + if (state.agentType === "BYO" && substrateSandboxAgent && !state.byoCmd.trim()) { + newErrors.byoCmd = "Command is required for BYO agents on Agent Substrate"; + } + if (useDeclarativeAgentFields && skillsEnabled) { const skillsInput = { skillRefs: state.skillRefs || [], @@ -712,9 +716,10 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo Agent Substrate settings - Agent Substrate runs declarative agents as ate.dev actors using the Go ADK - runtime. Skills are not supported on substrate yet. A new substrate actor is started - for each chat session. + Agent Substrate runs declarative (Python or Go) and BYO agents as ate.dev + actors. BYO images must set an explicit command and serve A2A on port 80. + Skills are not supported on substrate yet. A new substrate actor is started for + each chat session.
@@ -851,6 +856,7 @@ function AgentPageContent({ isEditMode, agentName, agentNamespace }: AgentPageCo > setState((prev) => ({ ...prev, byoImage: v }))} onByoCmdChange={(v) => setState((prev) => ({ ...prev, byoCmd: v }))} diff --git a/ui/src/components/agent-form/ByoDeploymentFields.tsx b/ui/src/components/agent-form/ByoDeploymentFields.tsx index ebcb3c082e..e210474229 100644 --- a/ui/src/components/agent-form/ByoDeploymentFields.tsx +++ b/ui/src/components/agent-form/ByoDeploymentFields.tsx @@ -25,6 +25,7 @@ type EnvPair = { export function ByoDeploymentFields({ byoImage, + commandRequired = false, byoCmd, byoArgs, replicas, @@ -51,6 +52,8 @@ export function ByoDeploymentFields({ serviceAccountInputId = "agent-field-service-account-byo", }: { byoImage: string; + /** When true (BYO on Agent Substrate), the command is required and the label reflects that. */ + commandRequired?: boolean; byoCmd: string; byoArgs: string; replicas: string; @@ -58,7 +61,7 @@ export function ByoDeploymentFields({ imagePullSecrets: string[]; envPairs: EnvPair[]; serviceAccountName: string; - errors: Pick; + errors: Pick; disabled: boolean; onByoImageChange: (v: string) => void; onByoCmdChange: (v: string) => void; @@ -104,13 +107,22 @@ export function ByoDeploymentFields({
- Command (optional) + {commandRequired ? "Command (required)" : "Command (optional)"} + {commandRequired && ( + + Required on Agent Substrate: it copies the command verbatim and does not fall back to + the image entrypoint. + + )} onByoCmdChange(e.target.value)} placeholder="/app/start" disabled={disabled} + className={errors.byoCmd ? "border-destructive" : ""} + aria-invalid={!!errors.byoCmd} /> + {errors.byoCmd} Args (space-separated) diff --git a/ui/src/components/agent-form/agent-form-types.ts b/ui/src/components/agent-form/agent-form-types.ts index 26a8d89271..7df74bd1cc 100644 --- a/ui/src/components/agent-form/agent-form-types.ts +++ b/ui/src/components/agent-form/agent-form-types.ts @@ -14,5 +14,6 @@ export interface AgentFormValidationErrors { memoryTtl?: string; serviceAccountName?: string; promptSources?: string; + byoCmd?: string; agentHarness?: AgentHarnessFormValidationError; } diff --git a/ui/src/lib/__tests__/sandboxAgentForm.test.ts b/ui/src/lib/__tests__/sandboxAgentForm.test.ts index 03eeffd322..562fac6fb6 100644 --- a/ui/src/lib/__tests__/sandboxAgentForm.test.ts +++ b/ui/src/lib/__tests__/sandboxAgentForm.test.ts @@ -83,11 +83,9 @@ describe("substrate sandbox chat helpers", () => { }); describe("substrateSupportedForAgentType", () => { - it("disallows substrate for BYO agents", () => { - expect(substrateSupportedForAgentType("BYO")).toBe(false); - }); - it("allows substrate for declarative agents", () => { + it("allows substrate for declarative and BYO agents", () => { expect(substrateSupportedForAgentType("Declarative")).toBe(true); + expect(substrateSupportedForAgentType("BYO")).toBe(true); expect(substrateSupportedForAgentType(undefined)).toBe(true); }); }); diff --git a/ui/src/lib/sandboxAgentForm.ts b/ui/src/lib/sandboxAgentForm.ts index dda6fc8b04..6ed2934eea 100644 --- a/ui/src/lib/sandboxAgentForm.ts +++ b/ui/src/lib/sandboxAgentForm.ts @@ -29,9 +29,12 @@ export function buildSandboxSubstrateFromForm(agentFormData: AgentFormData): San 
return substrate; } -/** BYO agents cannot run on Agent Substrate; only declarative agents are supported. */ +/** + * Agent Substrate supports declarative (Python/Go) and BYO agents. AgentHarness has its own + * substrate runtime and is configured elsewhere. + */ export function substrateSupportedForAgentType(agentType: string | undefined): boolean { - return agentType !== "BYO"; + return agentType === "Declarative" || agentType === "BYO" || agentType === undefined; } /** Sandbox agents run on Agent Substrate with a dedicated actor per chat session. */