diff --git a/api/v1alpha1/llamastackdistribution_types.go b/api/v1alpha1/llamastackdistribution_types.go
index fc3e3e9a..71f7e548 100644
--- a/api/v1alpha1/llamastackdistribution_types.go
+++ b/api/v1alpha1/llamastackdistribution_types.go
@@ -81,7 +81,14 @@ type LlamaStackDistributionSpec struct {
type ServerSpec struct {
Distribution DistributionType `json:"distribution"`
ContainerSpec ContainerSpec `json:"containerSpec,omitempty"`
- PodOverrides *PodOverrides `json:"podOverrides,omitempty"` // Optional pod-level overrides
+ // Workers configures the number of uvicorn worker processes to run.
+ // When set, the operator will launch llama-stack using uvicorn with the specified worker count.
+ // Ref: https://fastapi.tiangolo.com/deployment/server-workers/
+ // CPU requests default to one core per worker when workers is set, otherwise to one full core.
+ // +optional
+ // +kubebuilder:validation:Minimum=1
+ Workers *int32 `json:"workers,omitempty"`
+ PodOverrides *PodOverrides `json:"podOverrides,omitempty"` // Optional pod-level overrides
// PodDisruptionBudget controls voluntary disruption tolerance for the server pods
// +optional
PodDisruptionBudget *PodDisruptionBudgetSpec `json:"podDisruptionBudget,omitempty"`
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index ec9b5534..678d9a65 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -344,6 +344,11 @@ func (in *ServerSpec) DeepCopyInto(out *ServerSpec) {
*out = *in
out.Distribution = in.Distribution
in.ContainerSpec.DeepCopyInto(&out.ContainerSpec)
+ if in.Workers != nil {
+ in, out := &in.Workers, &out.Workers
+ *out = new(int32)
+ **out = **in
+ }
if in.PodOverrides != nil {
in, out := &in.PodOverrides, &out.PodOverrides
*out = new(PodOverrides)
diff --git a/config/crd/bases/llamastack.io_llamastackdistributions.yaml b/config/crd/bases/llamastack.io_llamastackdistributions.yaml
index 3dda47da..e51b0926 100644
--- a/config/crd/bases/llamastack.io_llamastackdistributions.yaml
+++ b/config/crd/bases/llamastack.io_llamastackdistributions.yaml
@@ -2568,6 +2568,15 @@ spec:
required:
- configMapName
type: object
+ workers:
+ description: |-
+ Workers configures the number of uvicorn worker processes to run.
+ When set, the operator will launch llama-stack using uvicorn with the specified worker count.
+ Ref: https://fastapi.tiangolo.com/deployment/server-workers/
+ CPU requests default to one core per worker when workers is set, otherwise to one full core.
+ format: int32
+ minimum: 1
+ type: integer
required:
- distribution
type: object
diff --git a/config/samples/_v1alpha1_llamastackdistribution.yaml b/config/samples/_v1alpha1_llamastackdistribution.yaml
index 646a673b..8e11036f 100644
--- a/config/samples/_v1alpha1_llamastackdistribution.yaml
+++ b/config/samples/_v1alpha1_llamastackdistribution.yaml
@@ -14,6 +14,7 @@ spec:
name: llama-stack
distribution:
name: starter
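+ # Run two uvicorn worker processes; the CPU request defaults to one core per worker.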
+ workers: 2
podDisruptionBudget:
minAvailable: 1
topologySpreadConstraints:
diff --git a/controllers/resource_helper.go b/controllers/resource_helper.go
index 654bb16c..064edf0e 100644
--- a/controllers/resource_helper.go
+++ b/controllers/resource_helper.go
@@ -21,14 +21,17 @@ import (
"errors"
"fmt"
"regexp"
+ "strconv"
"strings"
llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
+ ctrlLog "sigs.k8s.io/controller-runtime/pkg/log"
)
// Constants for validation limits.
@@ -91,21 +94,27 @@ try:
print('Using core module path (llama_stack.core.server.server)', file=sys.stderr)
print(1)
else:
- print('Using new CLI command (llama stack run)', file=sys.stderr)
+ print('Using uvicorn CLI command', file=sys.stderr)
print(2)
except Exception as e:
print(f'Version detection failed, defaulting to new CLI: {e}', file=sys.stderr)
print(2)
")
+PORT=${LLS_PORT:-8321}
+WORKERS=${LLS_WORKERS:-1}
+
# Execute the appropriate CLI based on version
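+# The uvicorn branch starts create_app in factory mode; create_app is expected to
+# read LLAMA_STACK_CONFIG (exported by the operator) to locate run.yaml.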
case $VERSION_CODE in
0) python3 -m llama_stack.distribution.server.server --config /etc/llama-stack/run.yaml ;;
1) python3 -m llama_stack.core.server.server /etc/llama-stack/run.yaml ;;
- 2) llama stack run /etc/llama-stack/run.yaml ;;
- *) echo "Invalid version code: $VERSION_CODE, using new CLI"; llama stack run /etc/llama-stack/run.yaml ;;
+ 2) exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
+ *) echo "Invalid version code: $VERSION_CODE, using uvicorn CLI command"; \
+ exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
esac`
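+// llamaStackConfigPath is the in-container path where the distribution's run.yaml is expected to be mounted.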
+const llamaStackConfigPath = "/etc/llama-stack/run.yaml"
+
// validateConfigMapKeys validates that all ConfigMap keys contain only safe characters.
// Note: This function validates key names only. PEM content validation is performed
// separately in the controller's reconcileCABundleConfigMap function.
@@ -152,10 +161,12 @@ func getStartupProbe(instance *llamav1alpha1.LlamaStackDistribution) *corev1.Pro
// buildContainerSpec creates the container specification.
func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler, instance *llamav1alpha1.LlamaStackDistribution, image string) corev1.Container {
+ workers, workersSet := getEffectiveWorkers(instance)
+
container := corev1.Container{
Name: getContainerName(instance),
Image: image,
- Resources: resolveContainerResources(instance.Spec.Server.ContainerSpec),
+ Resources: resolveContainerResources(instance.Spec.Server.ContainerSpec, workers, workersSet),
Ports: []corev1.ContainerPort{{ContainerPort: getContainerPort(instance)}},
StartupProbe: getStartupProbe(instance),
}
@@ -170,22 +181,59 @@ func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler
// resolveContainerResources ensures the container always has CPU and memory
// requests defined so that HPAs using utilization metrics can function.
-func resolveContainerResources(spec llamav1alpha1.ContainerSpec) corev1.ResourceRequirements {
+func resolveContainerResources(spec llamav1alpha1.ContainerSpec, workers int32, workersSet bool) corev1.ResourceRequirements {
resources := spec.Resources
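+ // Requests are always defaulted; limits are only pinned to the requests when
+ // the user explicitly sets workers.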
+ ensureRequests(&resources, workers)
+ if workersSet {
+ ensureLimitsMatchRequests(&resources)
+ }
+
+ cpuReq := resources.Requests[corev1.ResourceCPU]
+ memReq := resources.Requests[corev1.ResourceMemory]
+ cpuLimit := resources.Limits[corev1.ResourceCPU]
+ memLimit := resources.Limits[corev1.ResourceMemory]
+
+ ctrlLog.Log.WithName("resource_helper").WithValues(
+ "workers", workers,
+ "workersEnabled", workersSet,
+ ).V(1).Info("Defaulted resource values for llama-stack container",
+ "cpuRequest", cpuReq.String(),
+ "memoryRequest", memReq.String(),
+ "cpuLimit", cpuLimit.String(),
+ "memoryLimit", memLimit.String(),
+ )
+
+ return resources
+}
+
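+// ensureRequests fills in any missing CPU and memory requests: CPU defaults to
+// one core per worker and memory to the operator-wide default request.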
+func ensureRequests(resources *corev1.ResourceRequirements, workers int32) {
if resources.Requests == nil {
resources.Requests = corev1.ResourceList{}
}
if cpuQty, ok := resources.Requests[corev1.ResourceCPU]; !ok || cpuQty.IsZero() {
- resources.Requests[corev1.ResourceCPU] = llamav1alpha1.DefaultServerCPURequest
+ // Default to 1 full core per worker unless user overrides.
+ resources.Requests[corev1.ResourceCPU] = resource.MustParse(strconv.Itoa(int(workers)))
}
if memQty, ok := resources.Requests[corev1.ResourceMemory]; !ok || memQty.IsZero() {
resources.Requests[corev1.ResourceMemory] = llamav1alpha1.DefaultServerMemoryRequest
}
+}
- return resources
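+// ensureLimitsMatchRequests defaults any unset CPU or memory limit to the
+// corresponding request so the container cannot use more than it was sized for.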
+func ensureLimitsMatchRequests(resources *corev1.ResourceRequirements) {
+ if resources.Limits == nil {
+ resources.Limits = corev1.ResourceList{}
+ }
+
+ if cpuLimit, ok := resources.Limits[corev1.ResourceCPU]; !ok || cpuLimit.IsZero() {
+ resources.Limits[corev1.ResourceCPU] = resources.Requests[corev1.ResourceCPU]
+ }
+
+ if memLimit, ok := resources.Limits[corev1.ResourceMemory]; !ok || memLimit.IsZero() {
+ resources.Limits[corev1.ResourceMemory] = resources.Requests[corev1.ResourceMemory]
+ }
}
// getContainerName returns the container name, using custom name if specified.
@@ -204,9 +252,18 @@ func getContainerPort(instance *llamav1alpha1.LlamaStackDistribution) int32 {
return llamav1alpha1.DefaultServerPort
}
+// getEffectiveWorkers returns a positive worker count, defaulting to 1.
+func getEffectiveWorkers(instance *llamav1alpha1.LlamaStackDistribution) (int32, bool) {
+ if instance.Spec.Server.Workers != nil && *instance.Spec.Server.Workers > 0 {
+ return *instance.Spec.Server.Workers, true
+ }
+ return 1, false
+}
+
// configureContainerEnvironment sets up environment variables for the container.
func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributionReconciler, instance *llamav1alpha1.LlamaStackDistribution, container *corev1.Container) {
mountPath := getMountPath(instance)
+ workers, _ := getEffectiveWorkers(instance)
// Add HF_HOME variable to our mount path so that downloaded models and datasets are stored
// on the same volume as the storage. This is not critical but useful if the server is
@@ -227,6 +284,22 @@ func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributio
})
}
+ // Always provide worker/port/config env for uvicorn; workers default to 1 when unspecified.
+ container.Env = append(container.Env,
+ corev1.EnvVar{
+ Name: "LLS_WORKERS",
+ Value: strconv.Itoa(int(workers)),
+ },
+ corev1.EnvVar{
+ Name: "LLS_PORT",
+ Value: strconv.Itoa(int(getContainerPort(instance))),
+ },
+ corev1.EnvVar{
+ Name: "LLAMA_STACK_CONFIG",
+ Value: llamaStackConfigPath,
+ },
+ )
+
// Finally, add the user provided env vars
container.Env = append(container.Env, instance.Spec.Server.ContainerSpec.Env...)
}
diff --git a/controllers/resource_helper_test.go b/controllers/resource_helper_test.go
index 69519e15..b5830947 100644
--- a/controllers/resource_helper_test.go
+++ b/controllers/resource_helper_test.go
@@ -32,6 +32,10 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
)
+func int32Ptr(val int32) *int32 {
+ return &val
+}
+
func TestBuildContainerSpec(t *testing.T) {
testCases := []struct {
name string
@@ -54,7 +58,7 @@ func TestBuildContainerSpec(t *testing.T) {
Image: "test-image:latest",
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
- corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
+ corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
@@ -66,6 +70,9 @@ func TestBuildContainerSpec(t *testing.T) {
}},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/.llama"},
+ {Name: "LLS_WORKERS", Value: "1"},
+ {Name: "LLS_PORT", Value: "8321"},
+ {Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
},
},
@@ -101,7 +108,7 @@ func TestBuildContainerSpec(t *testing.T) {
StartupProbe: newDefaultStartupProbe(9000),
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
- corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
+ corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
Limits: corev1.ResourceList{
@@ -111,6 +118,9 @@ func TestBuildContainerSpec(t *testing.T) {
},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/custom/path"},
+ {Name: "LLS_WORKERS", Value: "1"},
+ {Name: "LLS_PORT", Value: "9000"},
+ {Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
{Name: "TEST_ENV", Value: "test-value"},
},
VolumeMounts: []corev1.VolumeMount{{
@@ -138,7 +148,7 @@ func TestBuildContainerSpec(t *testing.T) {
Image: "test-image:latest",
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
- corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
+ corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
@@ -152,7 +162,47 @@ func TestBuildContainerSpec(t *testing.T) {
}},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/.llama"},
+ {Name: "LLS_WORKERS", Value: "1"},
+ {Name: "LLS_PORT", Value: "8321"},
+ {Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
+ },
+ },
+ },
+ {
+ name: "uvicorn workers configured",
+ instance: &llamav1alpha1.LlamaStackDistribution{
+ Spec: llamav1alpha1.LlamaStackDistributionSpec{
+ Server: llamav1alpha1.ServerSpec{
+ Workers: int32Ptr(4),
+ },
+ },
+ },
+ image: "test-image:latest",
+ expectedResult: corev1.Container{
+ Name: llamav1alpha1.DefaultContainerName,
+ Image: "test-image:latest",
+ Resources: corev1.ResourceRequirements{
+ Requests: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("4"),
+ corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
+ },
+ Limits: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("4"),
+ corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
+ },
+ },
+ Ports: []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
+ StartupProbe: newDefaultStartupProbe(llamav1alpha1.DefaultServerPort),
+ Env: []corev1.EnvVar{
+ {Name: "HF_HOME", Value: "/.llama"},
+ {Name: "LLS_WORKERS", Value: "4"},
+ {Name: "LLS_PORT", Value: "8321"},
+ {Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
+ VolumeMounts: []corev1.VolumeMount{{
+ Name: "lls-storage",
+ MountPath: llamav1alpha1.DefaultMountPath,
+ }},
},
},
{
@@ -177,7 +227,7 @@ func TestBuildContainerSpec(t *testing.T) {
ImagePullPolicy: corev1.PullAlways,
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
- corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
+ corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
@@ -187,6 +237,9 @@ func TestBuildContainerSpec(t *testing.T) {
Args: []string{},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: llamav1alpha1.DefaultMountPath},
+ {Name: "LLS_WORKERS", Value: "1"},
+ {Name: "LLS_PORT", Value: "8321"},
+ {Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
VolumeMounts: []corev1.VolumeMount{
{
diff --git a/docs/api-overview.md b/docs/api-overview.md
index 2c86f317..b24178e5 100644
--- a/docs/api-overview.md
+++ b/docs/api-overview.md
@@ -218,6 +218,7 @@ _Appears in:_
| --- | --- | --- | --- |
| `distribution` _[DistributionType](#distributiontype)_ | | | |
| `containerSpec` _[ContainerSpec](#containerspec)_ | | | |
+| `workers` _integer_ | Workers configures the number of uvicorn worker processes to run.<br />When set, the operator will launch llama-stack using uvicorn with the specified worker count.<br />Ref: https://fastapi.tiangolo.com/deployment/server-workers/<br />CPU requests default to one core per worker when workers is set, otherwise to one full core. | | Minimum: 1 <br /> |
| `podOverrides` _[PodOverrides](#podoverrides)_ | | | |
| `podDisruptionBudget` _[PodDisruptionBudgetSpec](#poddisruptionbudgetspec)_ | PodDisruptionBudget controls voluntary disruption tolerance for the server pods | | |
| `topologySpreadConstraints` _[TopologySpreadConstraint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#topologyspreadconstraint-v1-core) array_ | TopologySpreadConstraints defines fine-grained spreading rules | | |
diff --git a/release/operator.yaml b/release/operator.yaml
index 919f19ae..12a36bec 100644
--- a/release/operator.yaml
+++ b/release/operator.yaml
@@ -2577,6 +2577,15 @@ spec:
required:
- configMapName
type: object
+ workers:
+ description: |-
+ Workers configures the number of uvicorn worker processes to run.
+ When set, the operator will launch llama-stack using uvicorn with the specified worker count.
+ Ref: https://fastapi.tiangolo.com/deployment/server-workers/
+ CPU requests default to one core per worker when workers is set, otherwise to one full core.
+ format: int32
+ minimum: 1
+ type: integer
required:
- distribution
type: object