diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index 3b441dd30..e32f94cb0 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -18,11 +18,11 @@ on: jobs: integration-test: runs-on: ubuntu-latest - timeout-minutes: 60 + timeout-minutes: 75 strategy: fail-fast: false # Continue testing other profiles even if one fails matrix: - profile: [ai-gateway, aibrix, routing-strategies] + profile: [ai-gateway, aibrix, routing-strategies, llm-d] steps: - name: Check out the repo @@ -165,4 +165,3 @@ jobs: if: always() run: | make e2e-cleanup || true - diff --git a/e2e/README.md b/e2e/README.md index 6e977e5d6..ec5cbf6fd 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -16,7 +16,7 @@ The framework follows a **separation of concerns** design: - **aibrix**: Tests Semantic Router with vLLM AIBrix integration - **istio**: Tests Semantic Router with Istio Gateway (future) - **production-stack**: Tests vLLM Production Stack configurations (future) -- **llm-d**: Tests with LLM-D (future) +- **llm-d**: Tests Semantic Router with LLM-D distributed inference - **dynamo**: Tests with Nvidia Dynamo (future) ## Directory Structure diff --git a/e2e/cmd/e2e/main.go b/e2e/cmd/e2e/main.go index 3d2e6863b..23ddc3aae 100644 --- a/e2e/cmd/e2e/main.go +++ b/e2e/cmd/e2e/main.go @@ -12,11 +12,13 @@ import ( aigateway "github.com/vllm-project/semantic-router/e2e/profiles/ai-gateway" aibrix "github.com/vllm-project/semantic-router/e2e/profiles/aibrix" dynamicconfig "github.com/vllm-project/semantic-router/e2e/profiles/dynamic-config" + llmd "github.com/vllm-project/semantic-router/e2e/profiles/llm-d" routingstrategies "github.com/vllm-project/semantic-router/e2e/profiles/routing-strategies" // Import profiles to register test cases _ "github.com/vllm-project/semantic-router/e2e/profiles/ai-gateway" _ "github.com/vllm-project/semantic-router/e2e/profiles/aibrix" + _ 
"github.com/vllm-project/semantic-router/e2e/profiles/llm-d" _ "github.com/vllm-project/semantic-router/e2e/profiles/routing-strategies" ) @@ -105,6 +107,8 @@ func getProfile(name string) (framework.Profile, error) { return dynamicconfig.NewProfile(), nil case "aibrix": return aibrix.NewProfile(), nil + case "llm-d": + return llmd.NewProfile(), nil case "routing-strategies": return routingstrategies.NewProfile(), nil // Add more profiles here as they are implemented diff --git a/e2e/profiles/llm-d/manifests/httproute-services.yaml b/e2e/profiles/llm-d/manifests/httproute-services.yaml new file mode 100644 index 000000000..8eed2015b --- /dev/null +++ b/e2e/profiles/llm-d/manifests/httproute-services.yaml @@ -0,0 +1,51 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vsr-llama8b-svc + namespace: default +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: x-selected-model + value: llama3-8b + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vsr-phi4-mini-svc + namespace: default +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-phi4-mini + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: x-selected-model + value: phi4-mini + timeouts: + request: 300s diff --git a/e2e/profiles/llm-d/manifests/inference-sim.yaml b/e2e/profiles/llm-d/manifests/inference-sim.yaml new file mode 100644 index 000000000..91c8e221a --- /dev/null +++ b/e2e/profiles/llm-d/manifests/inference-sim.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: 
vllm-llama3-8b-instruct + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - --model + - llama3-8b + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct + namespace: default + labels: + app: vllm-llama3-8b-instruct +spec: + type: ClusterIP + selector: + app: vllm-llama3-8b-instruct + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi4-mini + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: phi4-mini + template: + metadata: + labels: + app: phi4-mini + spec: + containers: + - name: sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - --model + - phi4-mini + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: phi4-mini + namespace: default + labels: + app: phi4-mini +spec: + type: ClusterIP + selector: + app: phi4-mini + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/e2e/profiles/llm-d/manifests/rbac.yaml b/e2e/profiles/llm-d/manifests/rbac.yaml new file mode 100644 index 000000000..60e4d6774 --- /dev/null +++ b/e2e/profiles/llm-d/manifests/rbac.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: llmd-epp-access +rules: + - apiGroups: ["inference.networking.k8s.io", "inference.networking.x-k8s.io"] + resources: ["inferencepools", 
"inferenceobjectives"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: llmd-epp-access-binding +subjects: + - kind: ServiceAccount + name: vllm-llama3-8b-instruct-epp + namespace: default + - kind: ServiceAccount + name: vllm-phi4-mini-epp + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: llmd-epp-access diff --git a/e2e/profiles/llm-d/profile.go b/e2e/profiles/llm-d/profile.go new file mode 100644 index 000000000..f7d446150 --- /dev/null +++ b/e2e/profiles/llm-d/profile.go @@ -0,0 +1,527 @@ +package llmd + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + "github.com/vllm-project/semantic-router/e2e/pkg/framework" + "github.com/vllm-project/semantic-router/e2e/pkg/helm" + "github.com/vllm-project/semantic-router/e2e/pkg/helpers" + + _ "github.com/vllm-project/semantic-router/e2e/testcases" +) + +const ( + kindNamespace = "default" + semanticNamespace = "vllm-semantic-router-system" + gatewayNamespace = "istio-system" + istioVersion = "1.28.0" + gatewayCRDURL = "https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml" + inferenceCRDURL = "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml" +) + +type Profile struct { + verbose bool +} + +func NewProfile() *Profile { + return &Profile{} +} + +func (p *Profile) Name() string { + return "llm-d" +} + +func (p *Profile) Description() string { + return "Tests Semantic Router with LLM-D distributed inference" +} + +func (p *Profile) Setup(ctx context.Context, opts *framework.SetupOptions) error { + p.verbose = opts.Verbose + + fmt.Printf("[Profile] llm-d setup start 
(istio=%s, gatewayCRD=%s, inferenceCRD=%s)\n", + istioVersion, gatewayCRDURL, inferenceCRDURL) + + rollback := []func(){} + rollbackAll := func() { + for i := len(rollback) - 1; i >= 0; i-- { + rollback[i]() + } + } + + istioctlPath, err := p.ensureIstioctl(ctx) + if err != nil { + return err + } + if p.verbose { + fmt.Printf("[Profile] istioctl ready at %s\n", istioctlPath) + } + + if err := p.kubectlApply(ctx, gatewayCRDURL); err != nil { + return fmt.Errorf("gateway CRDs: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, gatewayCRDURL) }) + if p.verbose { + fmt.Println("[Profile] applied gateway CRDs") + } + if err := p.kubectlApply(ctx, inferenceCRDURL); err != nil { + rollbackAll() + return fmt.Errorf("inference CRDs: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, inferenceCRDURL) }) + if p.verbose { + fmt.Println("[Profile] applied inference CRDs") + } + + if err := p.installIstio(ctx, istioctlPath); err != nil { + rollbackAll() + return fmt.Errorf("install istio: %w", err) + } + rollback = append(rollback, func() { _ = p.uninstallIstio(ctx) }) + if p.verbose { + fmt.Println("[Profile] istio installed") + } + + if err := p.deploySemanticRouter(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("deploy semantic router: %w", err) + } + rollback = append(rollback, func() { + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + _ = deployer.Uninstall(ctx, "semantic-router", semanticNamespace) + }) + if p.verbose { + fmt.Println("[Profile] semantic-router deployed") + } + + if err := p.deployInferenceSim(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("deploy inference sim: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") }) + if p.verbose { + fmt.Println("[Profile] inference simulators deployed") + } + + if err := p.deployLLMD(ctx); err != nil { + rollbackAll() + return fmt.Errorf("deploy llm-d 
resources: %w", err) + } + rollback = append(rollback, func() { + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml") + }) + if p.verbose { + fmt.Println("[Profile] llm-d schedulers and pools deployed") + } + + if err := p.deployGatewayRoutes(ctx); err != nil { + rollbackAll() + return fmt.Errorf("deploy gateway routes: %w", err) + } + rollback = append(rollback, func() { + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/envoyfilter.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/destinationrule.yaml") + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/gateway.yaml") + }) + if p.verbose { + fmt.Println("[Profile] gateway routes deployed") + } + + if err := p.waitHTTPRouteAccepted(ctx, "vsr-llama8b-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteResolvedRefs(ctx, "vsr-llama8b-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteAccepted(ctx, "vsr-phi4-mini-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteResolvedRefs(ctx, "vsr-phi4-mini-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + + if err := p.verifyEnvironment(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("verify environment: %w", err) + } + + if p.verbose { + fmt.Println("[Profile] llm-d setup complete") + } + return nil +} + +func (p *Profile) Teardown(ctx context.Context, opts *framework.TeardownOptions) error { + p.verbose = opts.Verbose + fmt.Println("[Profile] 
llm-d teardown start") + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml") + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/envoyfilter.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/destinationrule.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/gateway.yaml") + + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + deployer.Uninstall(ctx, "semantic-router", semanticNamespace) + + _ = p.uninstallIstio(ctx) + _ = p.kubectlDelete(ctx, gatewayCRDURL) + _ = p.kubectlDelete(ctx, inferenceCRDURL) + fmt.Println("[Profile] llm-d teardown complete") + + return nil +} + +func (p *Profile) GetTestCases() []string { + // Shared router testcases that we also want to validate in the llm-d environment + shared := []string{ + "chat-completions-request", + "chat-completions-stress-request", + "chat-completions-progressive-stress", + "domain-classify", + } + + // For llm-d we currently only reuse shared router testcases. + // llm-d-specific HA/traffic semantics are expected to be covered in LLM-D / infra tests. 
+ return shared +} + +func (p *Profile) GetServiceConfig() framework.ServiceConfig { + return framework.ServiceConfig{ + Name: "inference-gateway-istio", + Namespace: kindNamespace, + PortMapping: "8080:80", + } +} + +func (p *Profile) ensureIstioctl(ctx context.Context) (string, error) { + if path, err := exec.LookPath("istioctl"); err == nil { + return path, nil + } + + osPart := runtime.GOOS + if osPart == "darwin" { + osPart = "osx" + } + arch := runtime.GOARCH + platform := fmt.Sprintf("%s-%s", osPart, arch) + + cacheDir := filepath.Join(os.TempDir(), "istioctl-"+istioVersion+"-"+platform) + bin := filepath.Join(cacheDir, "istioctl") + if _, err := os.Stat(bin); err == nil { + return bin, nil + } + + if err := os.MkdirAll(cacheDir, 0o755); err != nil { + return "", err + } + + url := fmt.Sprintf("https://github.com/istio/istio/releases/download/%s/istioctl-%s-%s.tar.gz", istioVersion, istioVersion, platform) + tgz := filepath.Join(cacheDir, "istioctl.tgz") + + if err := p.runCmd(ctx, "curl", "-fL", "-o", tgz, url); err != nil { + return "", err + } + if err := p.runCmd(ctx, "tar", "-xzf", tgz, "-C", cacheDir); err != nil { + return "", err + } + if err := os.Chmod(bin, 0o755); err != nil { + return "", err + } + return bin, nil +} + +func (p *Profile) installIstio(ctx context.Context, istioctl string) error { + return p.runCmd(ctx, istioctl, "install", "-y", "--set", "profile=minimal", "--set", "values.pilot.env.ENABLE_GATEWAY_API=true", "--set", "values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true") +} + +func (p *Profile) uninstallIstio(ctx context.Context) error { + istioctl, err := exec.LookPath("istioctl") + if err != nil { + return nil + } + return p.runCmd(ctx, istioctl, "x", "uninstall", "--purge", "-y") +} + +func (p *Profile) deploySemanticRouter(ctx context.Context, opts *framework.SetupOptions) error { + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + installOpts := helm.InstallOptions{ + ReleaseName: "semantic-router", + 
Chart: "deploy/helm/semantic-router", + Namespace: semanticNamespace, + ValuesFiles: []string{"e2e/profiles/llm-d/values.yaml"}, + Set: map[string]string{ + "image.repository": "ghcr.io/vllm-project/semantic-router/extproc", + "image.tag": opts.ImageTag, + "image.pullPolicy": "Never", + }, + Wait: true, + Timeout: "20m", + } + if err := deployer.Install(ctx, installOpts); err != nil { + return err + } + return deployer.WaitForDeployment(ctx, semanticNamespace, "semantic-router", 10*time.Minute) +} + +func (p *Profile) deployInferenceSim(ctx context.Context, opts *framework.SetupOptions) error { + return p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") +} + +func (p *Profile) deployLLMD(ctx context.Context) error { + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml"); err != nil { + return err + } + return nil +} + +func (p *Profile) deployGatewayRoutes(ctx context.Context) error { + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/gateway.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/destinationrule.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/envoyfilter.yaml"); err != nil { + return err + } + // Ensure EnvoyFilter ext-proc matches Gateway listener context for this e2e run + _ = p.patchEnvoyFilterForGateway(ctx) + return nil +} + +func 
(p *Profile) verifyEnvironment(ctx context.Context, opts *framework.SetupOptions) error { + config, err := clientcmd.BuildConfigFromFlags("", opts.KubeConfig) + if err != nil { + return err + } + client, err := kubernetes.NewForConfig(config) + if err != nil { + return err + } + + // Verify required CRDs/APIs from Gateway API and Inference Extension are registered. + type apiCheck struct { + groupVersion string + expectedResources []string + optional bool + } + checkAPIGroup := func(c apiCheck) error { + resources, err := client.Discovery().ServerResourcesForGroupVersion(c.groupVersion) + if err != nil { + if c.optional { + if p.verbose { + fmt.Printf("[Verify] API group %s not found (optional): %v\n", c.groupVersion, err) + } + return nil + } + return fmt.Errorf("discover %s: %w", c.groupVersion, err) + } + found := make(map[string]bool, len(resources.APIResources)) + for _, r := range resources.APIResources { + found[r.Name] = true + } + for _, r := range c.expectedResources { + if !found[r] { + if c.optional { + if p.verbose { + fmt.Printf("[Verify] Missing optional resource %s in %s\n", r, c.groupVersion) + } + return nil + } + return fmt.Errorf("missing %s in %s", r, c.groupVersion) + } + } + if p.verbose { + fmt.Printf("[Verify] API group %s present with %v\n", c.groupVersion, c.expectedResources) + } + return nil + } + + for _, c := range []apiCheck{ + {groupVersion: "gateway.networking.k8s.io/v1", expectedResources: []string{"gateways", "httproutes"}}, + {groupVersion: "inference.networking.k8s.io/v1", expectedResources: []string{"inferencepools"}}, + // EndpointPickerConfig CRD is optional in some environments; treat as best-effort. 
+ {groupVersion: "inference.networking.x-k8s.io/v1alpha1", expectedResources: []string{"endpointpickerconfigs"}, optional: true}, + } { + if err := checkAPIGroup(c); err != nil { + return err + } + } + + // endpoints readiness check moved after deployments ready + + // Actively wait for critical deployments to become Available before checking readiness counts. + // This avoids flakiness when resources are still pulling images just after creation. + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + deploymentsToWait := []struct { + ns, name string + }{ + {semanticNamespace, "semantic-router"}, + {gatewayNamespace, "istiod"}, + {"default", "vllm-llama3-8b-instruct"}, + {"default", "phi4-mini"}, + {"default", "llm-d-inference-scheduler-llama3-8b"}, + {"default", "llm-d-inference-scheduler-phi4-mini"}, + {"default", "inference-gateway-istio"}, + } + for _, d := range deploymentsToWait { + if err := deployer.WaitForDeployment(ctx, d.ns, d.name, 10*time.Minute); err != nil { + return fmt.Errorf("wait for deployment %s/%s: %w", d.ns, d.name, err) + } + } + + if err := helpers.CheckDeployment(ctx, client, semanticNamespace, "semantic-router", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, gatewayNamespace, "istiod", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "vllm-llama3-8b-instruct", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "phi4-mini", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "llm-d-inference-scheduler-llama3-8b", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "llm-d-inference-scheduler-phi4-mini", p.verbose); err != nil { + return err + } + if err := helpers.VerifyServicePodsRunning(ctx, client, "default", "inference-gateway-istio", p.verbose); err != nil { + return err + } + if err 
:= p.checkInferencePoolEndpointReady(ctx, client, "default", "vllm-llama3-8b-instruct", 2*time.Minute); err != nil { + return err + } + if err := p.checkInferencePoolEndpointReady(ctx, client, "default", "phi4-mini", 2*time.Minute); err != nil { + return err + } + return nil +} + +// Note: GAIE controller is shipped by some providers (e.g., kgateway, nginx-gateway) or via provider-specific enable flags. +// For Istio-based profile we rely on pilot env ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true instead of a standalone controller manifest. + +func (p *Profile) runCmdOutput(ctx context.Context, name string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, name, args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", err + } + return string(out), nil +} + +func (p *Profile) waitHTTPRouteAccepted(ctx context.Context, name, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := p.runCmdOutput(ctx, "kubectl", "get", "httproute", name, "-n", ns, "-o", "jsonpath={.status.parents[*].conditions[?(@.type==\"Accepted\")].status}") + if err == nil && strings.Contains(out, "True") { + return nil + } + time.Sleep(2 * time.Second) + } + if p.verbose { + _ = p.runCmd(ctx, "kubectl", "-n", "gateway-inference-system", "logs", "deploy/gateway-api-inference-extension-controller", "--tail=100") + _ = p.runCmd(ctx, "kubectl", "-n", "default", "logs", "deploy/inference-gateway-istio", "--tail=100") + } + return fmt.Errorf("HTTPRoute %s/%s not Accepted", ns, name) +} + +func (p *Profile) waitHTTPRouteResolvedRefs(ctx context.Context, name, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := p.runCmdOutput(ctx, "kubectl", "get", "httproute", name, "-n", ns, "-o", "jsonpath={.status.parents[*].conditions[?(@.type==\"ResolvedRefs\")].status}") + if err == nil && strings.Contains(out, "True") { + return 
nil + } + time.Sleep(2 * time.Second) + } + if p.verbose { + _ = p.runCmd(ctx, "kubectl", "-n", "gateway-inference-system", "logs", "deploy/gateway-api-inference-extension-controller", "--tail=100") + } + return fmt.Errorf("HTTPRoute %s/%s not ResolvedRefs", ns, name) +} + +func (p *Profile) checkInferencePoolEndpointReady(ctx context.Context, client *kubernetes.Clientset, ns, name string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + ep, err := client.CoreV1().Endpoints(ns).Get(ctx, name, v1.GetOptions{}) + if err != nil { + return err + } + addrs := 0 + for _, s := range ep.Subsets { + addrs += len(s.Addresses) + } + if addrs > 0 { + return nil + } + time.Sleep(2 * time.Second) + } + return fmt.Errorf("endpoints %s/%s empty", ns, name) +} + +func (p *Profile) runCmd(ctx context.Context, name string, args ...string) error { + cmd := exec.CommandContext(ctx, name, args...) + if p.verbose { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + return cmd.Run() +} + +func (p *Profile) kubectlApply(ctx context.Context, target string) error { + return p.runCmd(ctx, "kubectl", "apply", "-f", target) +} + +func (p *Profile) kubectlDelete(ctx context.Context, target string) error { + return p.runCmd(ctx, "kubectl", "delete", "-f", target, "--ignore-not-found") +} +func (p *Profile) patchEnvoyFilterForGateway(ctx context.Context) error { + // Add match.context=GATEWAY and listener.portNumber=80 to the first configPatch via JSON patch + patch := `[ + {"op":"add","path":"/spec/configPatches/0/match/context","value":"GATEWAY"}, + {"op":"add","path":"/spec/configPatches/0/match/listener/portNumber","value":80} + ]` + return p.runCmd(ctx, "kubectl", "-n", "default", "patch", "envoyfilter", "semantic-router", "--type=json", "-p", patch) +} diff --git a/e2e/profiles/llm-d/values.yaml b/e2e/profiles/llm-d/values.yaml new file mode 100644 index 000000000..21ea100fa --- /dev/null +++ b/e2e/profiles/llm-d/values.yaml @@ 
-0,0 +1,69 @@ +# Profile revision 2025-11-21: math -> phi4-mini; cs/default -> llama3-8b; removed global HTTPRoute catch-all. +config: + # Allow Envoy to re-run route matching after Semantic Router sets x-selected-model. + # Without this, Gateway API routes that depend on that header won't be chosen and return 404. + clear_route_cache: true + default_model: llama3-8b + # Enable domain classification to return the x-vsr-selected-category header + classifier: + category_model: + model_id: models/category_classifier_modernbert-base_model + threshold: 0.6 + use_modernbert: true + category_mapping_path: models/category_classifier_modernbert-base_model/category_mapping.json + pii_model: + model_id: "" + threshold: 1.0 + use_modernbert: false + pii_mapping_path: "" + # Optional domains used by decision rules + categories: + - name: math + description: "Mathematics, arithmetic, calculation" + - name: computer science + description: "Computer networks, programming, systems" + decisions: + - name: math_route + priority: 20 + rules: + operator: OR + conditions: + - type: domain + name: math + modelRefs: + - model: phi4-mini + use_reasoning: false + - name: cs_route + priority: 10 + rules: + operator: OR + conditions: + - type: domain + name: computer science + modelRefs: + - model: llama3-8b + use_reasoning: false + - name: default_route + priority: 1 + rules: + operator: OR + conditions: + - type: domain + name: other + modelRefs: + - model: llama3-8b + use_reasoning: false + semantic_cache: + enabled: false + prompt_guard: + enabled: false + tools: + enabled: false + bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +# Keep consistent with the default chart: initContainer, model downloads, and PVC use chart defaults +image: + pullPolicy: IfNotPresent diff --git a/tools/make/e2e.mk b/tools/make/e2e.mk index cd8981b89..e417053ce 100644 --- a/tools/make/e2e.mk +++ b/tools/make/e2e.mk @@ -96,6 +96,7 @@ e2e-help: ## Show help for E2E testing 
@echo "Available Profiles:" @echo " ai-gateway - Test Semantic Router with Envoy AI Gateway" @echo " aibrix - Test Semantic Router with vLLM AIBrix" + @echo " llm-d - Test Semantic Router with LLM-D" @echo " istio - Test Semantic Router with Istio (coming soon)" @echo "" @echo "Environment Variables:" @@ -127,4 +128,3 @@ e2e-help: ## Show help for E2E testing @echo " 2. make e2e-test-only # Run all tests" @echo " 3. make e2e-test-only E2E_TESTS=\"test1\" # Run specific test" @echo " 4. make e2e-cleanup # Clean up when done" -