diff --git a/api/v1alpha1/llamastackdistribution_types.go b/api/v1alpha1/llamastackdistribution_types.go
index fc3e3e9a..71f7e548 100644
--- a/api/v1alpha1/llamastackdistribution_types.go
+++ b/api/v1alpha1/llamastackdistribution_types.go
@@ -81,7 +81,14 @@ type LlamaStackDistributionSpec struct {
 type ServerSpec struct {
 	Distribution  DistributionType `json:"distribution"`
 	ContainerSpec ContainerSpec    `json:"containerSpec,omitempty"`
-	PodOverrides  *PodOverrides    `json:"podOverrides,omitempty"` // Optional pod-level overrides
+	// Workers configures the number of uvicorn worker processes to run.
+	// When set, the operator will launch llama-stack using uvicorn with the specified worker count.
+	// Ref: https://fastapi.tiangolo.com/deployment/server-workers/
+	// CPU requests are set to the number of workers when set, otherwise 1 full core
+	// +optional
+	// +kubebuilder:validation:Minimum=1
+	Workers *int32 `json:"workers,omitempty"`
+	PodOverrides *PodOverrides `json:"podOverrides,omitempty"` // Optional pod-level overrides
 	// PodDisruptionBudget controls voluntary disruption tolerance for the server pods
 	// +optional
 	PodDisruptionBudget *PodDisruptionBudgetSpec `json:"podDisruptionBudget,omitempty"`
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index ec9b5534..678d9a65 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -344,6 +344,11 @@ func (in *ServerSpec) DeepCopyInto(out *ServerSpec) {
 	*out = *in
 	out.Distribution = in.Distribution
 	in.ContainerSpec.DeepCopyInto(&out.ContainerSpec)
+	if in.Workers != nil {
+		in, out := &in.Workers, &out.Workers
+		*out = new(int32)
+		**out = **in
+	}
 	if in.PodOverrides != nil {
 		in, out := &in.PodOverrides, &out.PodOverrides
 		*out = new(PodOverrides)
diff --git a/config/crd/bases/llamastack.io_llamastackdistributions.yaml b/config/crd/bases/llamastack.io_llamastackdistributions.yaml
index 3dda47da..e51b0926 100644
--- a/config/crd/bases/llamastack.io_llamastackdistributions.yaml
+++ b/config/crd/bases/llamastack.io_llamastackdistributions.yaml
@@ -2568,6 +2568,15 @@ spec:
                     required:
                     - configMapName
                     type: object
+                  workers:
+                    description: |-
+                      Workers configures the number of uvicorn worker processes to run.
+                      When set, the operator will launch llama-stack using uvicorn with the specified worker count.
+                      Ref: https://fastapi.tiangolo.com/deployment/server-workers/
+                      CPU requests are set to the number of workers when set, otherwise 1 full core
+                    format: int32
+                    minimum: 1
+                    type: integer
                 required:
                 - distribution
                 type: object
diff --git a/config/samples/_v1alpha1_llamastackdistribution.yaml b/config/samples/_v1alpha1_llamastackdistribution.yaml
index 646a673b..8e11036f 100644
--- a/config/samples/_v1alpha1_llamastackdistribution.yaml
+++ b/config/samples/_v1alpha1_llamastackdistribution.yaml
@@ -14,6 +14,7 @@ spec:
       name: llama-stack
     distribution:
       name: starter
+    workers: 2
     podDisruptionBudget:
      minAvailable: 1
    topologySpreadConstraints:
diff --git a/controllers/resource_helper.go b/controllers/resource_helper.go
index 654bb16c..064edf0e 100644
--- a/controllers/resource_helper.go
+++ b/controllers/resource_helper.go
@@ -21,14 +21,17 @@ import (
 	"errors"
 	"fmt"
 	"regexp"
+	"strconv"
 	"strings"
 
 	llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	corev1 "k8s.io/api/core/v1"
 	policyv1 "k8s.io/api/policy/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
+	ctrlLog "sigs.k8s.io/controller-runtime/pkg/log"
 )
 
 // Constants for validation limits.
@@ -91,21 +94,27 @@ try:
     print('Using core module path (llama_stack.core.server.server)', file=sys.stderr)
     print(1)
   else:
-    print('Using new CLI command (llama stack run)', file=sys.stderr)
+    print('Using uvicorn CLI command', file=sys.stderr)
     print(2)
 except Exception as e:
   print(f'Version detection failed, defaulting to new CLI: {e}', file=sys.stderr)
   print(2)
 ")
 
+PORT=${LLS_PORT:-8321}
+WORKERS=${LLS_WORKERS:-1}
+
 # Execute the appropriate CLI based on version
 case $VERSION_CODE in
   0) python3 -m llama_stack.distribution.server.server --config /etc/llama-stack/run.yaml ;;
   1) python3 -m llama_stack.core.server.server /etc/llama-stack/run.yaml ;;
-  2) llama stack run /etc/llama-stack/run.yaml ;;
-  *) echo "Invalid version code: $VERSION_CODE, using new CLI"; llama stack run /etc/llama-stack/run.yaml ;;
+  2) exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
+  *) echo "Invalid version code: $VERSION_CODE, using uvicorn CLI command"; \
+     exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
 esac`
+
+const llamaStackConfigPath = "/etc/llama-stack/run.yaml"
+
 // validateConfigMapKeys validates that all ConfigMap keys contain only safe characters.
 // Note: This function validates key names only. PEM content validation is performed
 // separately in the controller's reconcileCABundleConfigMap function.
@@ -152,10 +161,12 @@ func getStartupProbe(instance *llamav1alpha1.LlamaStackDistribution) *corev1.Pro
 
 // buildContainerSpec creates the container specification.
 func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler, instance *llamav1alpha1.LlamaStackDistribution, image string) corev1.Container {
+	workers, workersSet := getEffectiveWorkers(instance)
+
 	container := corev1.Container{
 		Name:         getContainerName(instance),
 		Image:        image,
-		Resources:    resolveContainerResources(instance.Spec.Server.ContainerSpec),
+		Resources:    resolveContainerResources(instance.Spec.Server.ContainerSpec, workers, workersSet),
 		Ports:        []corev1.ContainerPort{{ContainerPort: getContainerPort(instance)}},
 		StartupProbe: getStartupProbe(instance),
 	}
@@ -170,22 +181,59 @@
 
 // resolveContainerResources ensures the container always has CPU and memory
 // requests defined so that HPAs using utilization metrics can function.
-func resolveContainerResources(spec llamav1alpha1.ContainerSpec) corev1.ResourceRequirements {
+func resolveContainerResources(spec llamav1alpha1.ContainerSpec, workers int32, workersSet bool) corev1.ResourceRequirements {
 	resources := spec.Resources
 
+	ensureRequests(&resources, workers)
+
+	if workersSet {
+		ensureLimitsMatchRequests(&resources)
+	}
+
+	cpuReq := resources.Requests[corev1.ResourceCPU]
+	memReq := resources.Requests[corev1.ResourceMemory]
+	cpuLimit := resources.Limits[corev1.ResourceCPU]
+	memLimit := resources.Limits[corev1.ResourceMemory]
+
+	ctrlLog.Log.WithName("resource_helper").WithValues(
+		"workers", workers,
+		"workersEnabled", workersSet,
+	).V(1).Info("Defaulted resource values for llama-stack container",
+		"cpuRequest", cpuReq.String(),
+		"memoryRequest", memReq.String(),
+		"cpuLimit", cpuLimit.String(),
+		"memoryLimit", memLimit.String(),
+	)
+
+	return resources
+}
+
+func ensureRequests(resources *corev1.ResourceRequirements, workers int32) {
 	if resources.Requests == nil {
 		resources.Requests = corev1.ResourceList{}
 	}
 
 	if cpuQty, ok := resources.Requests[corev1.ResourceCPU]; !ok || cpuQty.IsZero() {
-		resources.Requests[corev1.ResourceCPU] = llamav1alpha1.DefaultServerCPURequest
+		// Default to 1 full core per worker unless user overrides.
+		resources.Requests[corev1.ResourceCPU] = resource.MustParse(strconv.Itoa(int(workers)))
 	}
 
 	if memQty, ok := resources.Requests[corev1.ResourceMemory]; !ok || memQty.IsZero() {
 		resources.Requests[corev1.ResourceMemory] = llamav1alpha1.DefaultServerMemoryRequest
 	}
+}
 
-	return resources
+func ensureLimitsMatchRequests(resources *corev1.ResourceRequirements) {
+	if resources.Limits == nil {
+		resources.Limits = corev1.ResourceList{}
+	}
+
+	if cpuLimit, ok := resources.Limits[corev1.ResourceCPU]; !ok || cpuLimit.IsZero() {
+		resources.Limits[corev1.ResourceCPU] = resources.Requests[corev1.ResourceCPU]
+	}
+
+	if memLimit, ok := resources.Limits[corev1.ResourceMemory]; !ok || memLimit.IsZero() {
+		resources.Limits[corev1.ResourceMemory] = resources.Requests[corev1.ResourceMemory]
+	}
 }
 
 // getContainerName returns the container name, using custom name if specified.
@@ -204,9 +252,18 @@ func getContainerPort(instance *llamav1alpha1.LlamaStackDistribution) int32 {
 	return llamav1alpha1.DefaultServerPort
 }
 
+// getEffectiveWorkers returns a positive worker count (defaulting to 1) and whether it was explicitly set.
+func getEffectiveWorkers(instance *llamav1alpha1.LlamaStackDistribution) (int32, bool) {
+	if instance.Spec.Server.Workers != nil && *instance.Spec.Server.Workers > 0 {
+		return *instance.Spec.Server.Workers, true
+	}
+	return 1, false
+}
+
 // configureContainerEnvironment sets up environment variables for the container.
 func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributionReconciler, instance *llamav1alpha1.LlamaStackDistribution, container *corev1.Container) {
 	mountPath := getMountPath(instance)
+	workers, _ := getEffectiveWorkers(instance)
 
 	// Add HF_HOME variable to our mount path so that downloaded models and datasets are stored
 	// on the same volume as the storage. This is not critical but useful if the server is
@@ -227,6 +284,22 @@ func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributio
 		})
 	}
 
+	// Always provide worker/port/config env for uvicorn; workers default to 1 when unspecified.
+	container.Env = append(container.Env,
+		corev1.EnvVar{
+			Name:  "LLS_WORKERS",
+			Value: strconv.Itoa(int(workers)),
+		},
+		corev1.EnvVar{
+			Name:  "LLS_PORT",
+			Value: strconv.Itoa(int(getContainerPort(instance))),
+		},
+		corev1.EnvVar{
+			Name:  "LLAMA_STACK_CONFIG",
+			Value: llamaStackConfigPath,
+		},
+	)
+
 	// Finally, add the user provided env vars
 	container.Env = append(container.Env, instance.Spec.Server.ContainerSpec.Env...)
 }
diff --git a/controllers/resource_helper_test.go b/controllers/resource_helper_test.go
index 69519e15..b5830947 100644
--- a/controllers/resource_helper_test.go
+++ b/controllers/resource_helper_test.go
@@ -32,6 +32,10 @@ import (
 	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
+func int32Ptr(val int32) *int32 {
+	return &val
+}
+
 func TestBuildContainerSpec(t *testing.T) {
 	testCases := []struct {
 		name           string
@@ -54,7 +58,7 @@ func TestBuildContainerSpec(t *testing.T) {
 				Image: "test-image:latest",
 				Resources: corev1.ResourceRequirements{
 					Requests: corev1.ResourceList{
-						corev1.ResourceCPU:    llamav1alpha1.DefaultServerCPURequest,
+						corev1.ResourceCPU:    resource.MustParse("1"),
 						corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
 					},
 				},
@@ -66,6 +70,9 @@ func TestBuildContainerSpec(t *testing.T) {
 				}},
 				Env: []corev1.EnvVar{
 					{Name: "HF_HOME", Value: "/.llama"},
+					{Name: "LLS_WORKERS", Value: "1"},
+					{Name: "LLS_PORT", Value: "8321"},
+					{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
 				},
 			},
 		},
@@ -101,7 +108,7 @@ func TestBuildContainerSpec(t *testing.T) {
 				StartupProbe: newDefaultStartupProbe(9000),
 				Resources: corev1.ResourceRequirements{
 					Requests: corev1.ResourceList{
-						corev1.ResourceCPU:    llamav1alpha1.DefaultServerCPURequest,
+						corev1.ResourceCPU:    resource.MustParse("1"),
 						corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
 					},
 					Limits: corev1.ResourceList{
@@ -111,6 +118,9 @@ func TestBuildContainerSpec(t *testing.T) {
 				},
 				Env: []corev1.EnvVar{
 					{Name: "HF_HOME", Value: "/custom/path"},
+					{Name: "LLS_WORKERS", Value: "1"},
+					{Name: "LLS_PORT", Value: "9000"},
+					{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
 					{Name: "TEST_ENV", Value: "test-value"},
 				},
 				VolumeMounts: []corev1.VolumeMount{{
@@ -138,7 +148,7 @@ func TestBuildContainerSpec(t *testing.T) {
 				Image: "test-image:latest",
 				Resources: corev1.ResourceRequirements{
 					Requests: corev1.ResourceList{
-						corev1.ResourceCPU:    llamav1alpha1.DefaultServerCPURequest,
+						corev1.ResourceCPU:    resource.MustParse("1"),
 						corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
 					},
 				},
@@ -152,7 +162,47 @@ func TestBuildContainerSpec(t *testing.T) {
 				}},
 				Env: []corev1.EnvVar{
 					{Name: "HF_HOME", Value: "/.llama"},
+					{Name: "LLS_WORKERS", Value: "1"},
+					{Name: "LLS_PORT", Value: "8321"},
+					{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
+				},
+			},
+		},
+		{
+			name: "uvicorn workers configured",
+			instance: &llamav1alpha1.LlamaStackDistribution{
+				Spec: llamav1alpha1.LlamaStackDistributionSpec{
+					Server: llamav1alpha1.ServerSpec{
+						Workers: int32Ptr(4),
+					},
+				},
+			},
+			image: "test-image:latest",
+			expectedResult: corev1.Container{
+				Name:  llamav1alpha1.DefaultContainerName,
+				Image: "test-image:latest",
+				Resources: corev1.ResourceRequirements{
+					Requests: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("4"),
+						corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
+					},
+					Limits: corev1.ResourceList{
+						corev1.ResourceCPU:    resource.MustParse("4"),
+						corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
+					},
+				},
+				Ports:        []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+				StartupProbe: newDefaultStartupProbe(llamav1alpha1.DefaultServerPort),
+				Env: []corev1.EnvVar{
+					{Name: "HF_HOME", Value: "/.llama"},
+					{Name: "LLS_WORKERS", Value: "4"},
+					{Name: "LLS_PORT", Value: "8321"},
+					{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
 				},
+				VolumeMounts: []corev1.VolumeMount{{
+					Name:      "lls-storage",
+					MountPath: llamav1alpha1.DefaultMountPath,
+				}},
 			},
 		},
 		{
@@ -177,7 +227,7 @@ func TestBuildContainerSpec(t *testing.T) {
 				ImagePullPolicy: corev1.PullAlways,
 				Resources: corev1.ResourceRequirements{
 					Requests: corev1.ResourceList{
-						corev1.ResourceCPU:    llamav1alpha1.DefaultServerCPURequest,
+						corev1.ResourceCPU:    resource.MustParse("1"),
 						corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
 					},
 				},
@@ -187,6 +237,9 @@ func TestBuildContainerSpec(t *testing.T) {
 				Args: []string{},
 				Env: []corev1.EnvVar{
 					{Name: "HF_HOME", Value: llamav1alpha1.DefaultMountPath},
+					{Name: "LLS_WORKERS", Value: "1"},
+					{Name: "LLS_PORT", Value: "8321"},
+					{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
 				},
 				VolumeMounts: []corev1.VolumeMount{
 					{
diff --git a/docs/api-overview.md b/docs/api-overview.md
index 2c86f317..b24178e5 100644
--- a/docs/api-overview.md
+++ b/docs/api-overview.md
@@ -218,6 +218,7 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `distribution` _[DistributionType](#distributiontype)_ | | | |
 | `containerSpec` _[ContainerSpec](#containerspec)_ | | | |
+| `workers` _integer_ | Workers configures the number of uvicorn worker processes to run.<br />When set, the operator will launch llama-stack using uvicorn with the specified worker count.<br />Ref: https://fastapi.tiangolo.com/deployment/server-workers/<br />CPU requests are set to the number of workers when set, otherwise 1 full core | | Minimum: 1 <br /> |
 | `podOverrides` _[PodOverrides](#podoverrides)_ | | | |
 | `podDisruptionBudget` _[PodDisruptionBudgetSpec](#poddisruptionbudgetspec)_ | PodDisruptionBudget controls voluntary disruption tolerance for the server pods | | |
 | `topologySpreadConstraints` _[TopologySpreadConstraint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#topologyspreadconstraint-v1-core) array_ | TopologySpreadConstraints defines fine-grained spreading rules | | |
diff --git a/release/operator.yaml b/release/operator.yaml
index 919f19ae..12a36bec 100644
--- a/release/operator.yaml
+++ b/release/operator.yaml
@@ -2577,6 +2577,15 @@ spec:
                     required:
                     - configMapName
                     type: object
+                  workers:
+                    description: |-
+                      Workers configures the number of uvicorn worker processes to run.
+                      When set, the operator will launch llama-stack using uvicorn with the specified worker count.
+                      Ref: https://fastapi.tiangolo.com/deployment/server-workers/
+                      CPU requests are set to the number of workers when set, otherwise 1 full core
+                    format: int32
+                    minimum: 1
+                    type: integer
                 required:
                 - distribution
                 type: object
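
Usage note (not part of the patch): a minimal sketch of how spec.server.workers is expected to behave, inferred from the defaulting and env-var logic in this diff. The resource name below is hypothetical; the port, env values, and CPU defaults follow the code and tests above.

# Illustration only: assumes the behavior introduced by this patch.
apiVersion: llamastack.io/v1alpha1
kind: LlamaStackDistribution
metadata:
  name: example-with-workers   # hypothetical name
spec:
  server:
    distribution:
      name: starter
    workers: 4                 # launch uvicorn with 4 worker processes
# For this spec the operator would, per the patch:
#   - set LLS_WORKERS=4, LLS_PORT=8321 (default port) and
#     LLAMA_STACK_CONFIG=/etc/llama-stack/run.yaml on the server container,
#   - on new llama-stack versions run:
#     uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port 8321 --workers 4 --factory
#   - default CPU requests and limits to "4" (one full core per worker)
#     when the user has not set them in containerSpec.resources.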