9 changes: 8 additions & 1 deletion api/v1alpha1/llamastackdistribution_types.go

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

9 changes: 9 additions & 0 deletions config/crd/bases/llamastack.io_llamastackdistributions.yaml
@@ -2568,6 +2568,15 @@ spec:
required:
- configMapName
type: object
workers:
description: |-
Workers configures the number of uvicorn worker processes to run.
When set, the operator will launch llama-stack using uvicorn with the specified worker count.
Ref: https://fastapi.tiangolo.com/deployment/server-workers/
CPU requests are set to the number of workers when set, otherwise 1 full core
format: int32
minimum: 1
type: integer
required:
- distribution
type: object
1 change: 1 addition & 0 deletions config/samples/_v1alpha1_llamastackdistribution.yaml
@@ -14,6 +14,7 @@ spec:
name: llama-stack
distribution:
name: starter
workers: 2
podDisruptionBudget:
minAvailable: 1
topologySpreadConstraints:
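With workers: 2 and no explicit container resources in the sample, the operator should default both the CPU request and the CPU limit to 2 cores. A quick way to sanity-check the rendered Deployment (a sketch; it assumes the Deployment is named after the CR, here llama-stack, and that the sample sets no resources of its own):

kubectl get deployment llama-stack \
  -o jsonpath='{.spec.template.spec.containers[0].resources}'
# Expected under those assumptions: cpu "2" in both requests and limits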
87 changes: 80 additions & 7 deletions controllers/resource_helper.go
@@ -21,14 +21,17 @@ import (
"errors"
"fmt"
"regexp"
"strconv"
"strings"

llamav1alpha1 "github.com/llamastack/llama-stack-k8s-operator/api/v1alpha1"
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
ctrlLog "sigs.k8s.io/controller-runtime/pkg/log"
)

// Constants for validation limits.
@@ -91,21 +94,27 @@ try:
print('Using core module path (llama_stack.core.server.server)', file=sys.stderr)
print(1)
else:
print('Using new CLI command (llama stack run)', file=sys.stderr)
print('Using uvicorn CLI command', file=sys.stderr)
print(2)
except Exception as e:
print(f'Version detection failed, defaulting to new CLI: {e}', file=sys.stderr)
print(2)
")

PORT=${LLS_PORT:-8321}
WORKERS=${LLS_WORKERS:-1}

# Execute the appropriate CLI based on version
case $VERSION_CODE in
0) python3 -m llama_stack.distribution.server.server --config /etc/llama-stack/run.yaml ;;
1) python3 -m llama_stack.core.server.server /etc/llama-stack/run.yaml ;;
2) llama stack run /etc/llama-stack/run.yaml ;;
*) echo "Invalid version code: $VERSION_CODE, using new CLI"; llama stack run /etc/llama-stack/run.yaml ;;
2) exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
*) echo "Invalid version code: $VERSION_CODE, using uvicorn CLI command"; \
exec uvicorn llama_stack.core.server.server:create_app --host 0.0.0.0 --port "$PORT" --workers "$WORKERS" --factory ;;
esac`

const llamaStackConfigPath = "/etc/llama-stack/run.yaml"

// validateConfigMapKeys validates that all ConfigMap keys contain only safe characters.
// Note: This function validates key names only. PEM content validation is performed
// separately in the controller's reconcileCABundleConfigMap function.
@@ -152,10 +161,12 @@ func getStartupProbe(instance *llamav1alpha1.LlamaStackDistribution) *corev1.Pro

// buildContainerSpec creates the container specification.
func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler, instance *llamav1alpha1.LlamaStackDistribution, image string) corev1.Container {
workers, workersSet := getEffectiveWorkers(instance)

container := corev1.Container{
Name: getContainerName(instance),
Image: image,
Resources: resolveContainerResources(instance.Spec.Server.ContainerSpec),
Resources: resolveContainerResources(instance.Spec.Server.ContainerSpec, workers, workersSet),
Ports: []corev1.ContainerPort{{ContainerPort: getContainerPort(instance)}},
StartupProbe: getStartupProbe(instance),
}
@@ -170,22 +181,59 @@ func buildContainerSpec(ctx context.Context, r *LlamaStackDistributionReconciler

// resolveContainerResources ensures the container always has CPU and memory
// requests defined so that HPAs using utilization metrics can function.
func resolveContainerResources(spec llamav1alpha1.ContainerSpec) corev1.ResourceRequirements {
func resolveContainerResources(spec llamav1alpha1.ContainerSpec, workers int32, workersSet bool) corev1.ResourceRequirements {
Collaborator: I think we need to log somewhere that we are setting the resources based on the value of the workers.

Collaborator (Author): Included it in the API docs as well as added a log.

resources := spec.Resources

ensureRequests(&resources, workers)
if workersSet {
ensureLimitsMatchRequests(&resources)
}

cpuReq := resources.Requests[corev1.ResourceCPU]
memReq := resources.Requests[corev1.ResourceMemory]
cpuLimit := resources.Limits[corev1.ResourceCPU]
memLimit := resources.Limits[corev1.ResourceMemory]

ctrlLog.Log.WithName("resource_helper").WithValues(
"workers", workers,
"workersEnabled", workersSet,
).V(1).Info("Defaulted resource values for llama-stack container",
"cpuRequest", cpuReq.String(),
"memoryRequest", memReq.String(),
"cpuLimit", cpuLimit.String(),
"memoryLimit", memLimit.String(),
)

return resources
}

func ensureRequests(resources *corev1.ResourceRequirements, workers int32) {
if resources.Requests == nil {
resources.Requests = corev1.ResourceList{}
}

if cpuQty, ok := resources.Requests[corev1.ResourceCPU]; !ok || cpuQty.IsZero() {
resources.Requests[corev1.ResourceCPU] = llamav1alpha1.DefaultServerCPURequest
// Default to 1 full core per worker unless user overrides.
resources.Requests[corev1.ResourceCPU] = resource.MustParse(strconv.Itoa(int(workers)))
}

if memQty, ok := resources.Requests[corev1.ResourceMemory]; !ok || memQty.IsZero() {
resources.Requests[corev1.ResourceMemory] = llamav1alpha1.DefaultServerMemoryRequest
}
}

return resources
func ensureLimitsMatchRequests(resources *corev1.ResourceRequirements) {
if resources.Limits == nil {
resources.Limits = corev1.ResourceList{}
}

if cpuLimit, ok := resources.Limits[corev1.ResourceCPU]; !ok || cpuLimit.IsZero() {
resources.Limits[corev1.ResourceCPU] = resources.Requests[corev1.ResourceCPU]
}

if memLimit, ok := resources.Limits[corev1.ResourceMemory]; !ok || memLimit.IsZero() {
resources.Limits[corev1.ResourceMemory] = resources.Requests[corev1.ResourceMemory]
}
}

// getContainerName returns the container name, using custom name if specified.
@@ -204,9 +252,18 @@ func getContainerPort(instance *llamav1alpha1.LlamaStackDistribution) int32 {
return llamav1alpha1.DefaultServerPort
}

// getEffectiveWorkers returns a positive worker count, defaulting to 1.
func getEffectiveWorkers(instance *llamav1alpha1.LlamaStackDistribution) (int32, bool) {
if instance.Spec.Server.Workers != nil && *instance.Spec.Server.Workers > 0 {
return *instance.Spec.Server.Workers, true
}
return 1, false
}

// configureContainerEnvironment sets up environment variables for the container.
func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributionReconciler, instance *llamav1alpha1.LlamaStackDistribution, container *corev1.Container) {
mountPath := getMountPath(instance)
workers, _ := getEffectiveWorkers(instance)

// Add HF_HOME variable to our mount path so that downloaded models and datasets are stored
// on the same volume as the storage. This is not critical but useful if the server is
@@ -227,6 +284,22 @@ func configureContainerEnvironment(ctx context.Context, r *LlamaStackDistributio
})
}

// Always provide worker/port/config env for uvicorn; workers default to 1 when unspecified.
container.Env = append(container.Env,
corev1.EnvVar{
Name: "LLS_WORKERS",
Value: strconv.Itoa(int(workers)),
},
corev1.EnvVar{
Name: "LLS_PORT",
Value: strconv.Itoa(int(getContainerPort(instance))),
},
corev1.EnvVar{
Name: "LLAMA_STACK_CONFIG",
Value: llamaStackConfigPath,
},
)

// Finally, add the user provided env vars
container.Env = append(container.Env, instance.Spec.Server.ContainerSpec.Env...)
}
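Putting the entrypoint script and the injected environment variables together: for the sample CR above (workers: 2, default port 8321), an image that lands on version code 2 ends up exec'ing roughly the following. This is a sketch; it assumes create_app reads LLAMA_STACK_CONFIG (set to /etc/llama-stack/run.yaml by the operator) to locate its config, since the uvicorn invocation itself passes no config path.

exec uvicorn llama_stack.core.server.server:create_app \
  --host 0.0.0.0 --port 8321 --workers 2 --factory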
61 changes: 57 additions & 4 deletions controllers/resource_helper_test.go
@@ -32,6 +32,10 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
)

func int32Ptr(val int32) *int32 {
return &val
}

func TestBuildContainerSpec(t *testing.T) {
testCases := []struct {
name string
@@ -54,7 +58,7 @@ func TestBuildContainerSpec(t *testing.T) {
Image: "test-image:latest",
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
@@ -66,6 +70,9 @@
}},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/.llama"},
{Name: "LLS_WORKERS", Value: "1"},
{Name: "LLS_PORT", Value: "8321"},
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
},
},
@@ -101,7 +108,7 @@ func TestBuildContainerSpec(t *testing.T) {
StartupProbe: newDefaultStartupProbe(9000),
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
Limits: corev1.ResourceList{
@@ -111,6 +118,9 @@
},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/custom/path"},
{Name: "LLS_WORKERS", Value: "1"},
{Name: "LLS_PORT", Value: "9000"},
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
{Name: "TEST_ENV", Value: "test-value"},
},
VolumeMounts: []corev1.VolumeMount{{
@@ -138,7 +148,7 @@ func TestBuildContainerSpec(t *testing.T) {
Image: "test-image:latest",
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
@@ -152,7 +162,47 @@ func TestBuildContainerSpec(t *testing.T) {
}},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/.llama"},
{Name: "LLS_WORKERS", Value: "1"},
{Name: "LLS_PORT", Value: "8321"},
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
},
},
{
name: "uvicorn workers configured",
instance: &llamav1alpha1.LlamaStackDistribution{
Spec: llamav1alpha1.LlamaStackDistributionSpec{
Server: llamav1alpha1.ServerSpec{
Workers: int32Ptr(4),
},
},
},
image: "test-image:latest",
expectedResult: corev1.Container{
Name: llamav1alpha1.DefaultContainerName,
Image: "test-image:latest",
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("4"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("4"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
Ports: []corev1.ContainerPort{{ContainerPort: llamav1alpha1.DefaultServerPort}},
StartupProbe: newDefaultStartupProbe(llamav1alpha1.DefaultServerPort),
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: "/.llama"},
{Name: "LLS_WORKERS", Value: "4"},
{Name: "LLS_PORT", Value: "8321"},
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
VolumeMounts: []corev1.VolumeMount{{
Name: "lls-storage",
MountPath: llamav1alpha1.DefaultMountPath,
}},
},
},
{
Expand All @@ -177,7 +227,7 @@ func TestBuildContainerSpec(t *testing.T) {
ImagePullPolicy: corev1.PullAlways,
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: llamav1alpha1.DefaultServerCPURequest,
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: llamav1alpha1.DefaultServerMemoryRequest,
},
},
@@ -187,6 +237,9 @@
Args: []string{},
Env: []corev1.EnvVar{
{Name: "HF_HOME", Value: llamav1alpha1.DefaultMountPath},
{Name: "LLS_WORKERS", Value: "1"},
{Name: "LLS_PORT", Value: "8321"},
{Name: "LLAMA_STACK_CONFIG", Value: "/etc/llama-stack/run.yaml"},
},
VolumeMounts: []corev1.VolumeMount{
{
1 change: 1 addition & 0 deletions docs/api-overview.md
@@ -218,6 +218,7 @@ _Appears in:_
| --- | --- | --- | --- |
| `distribution` _[DistributionType](#distributiontype)_ | | | |
| `containerSpec` _[ContainerSpec](#containerspec)_ | | | |
| `workers` _integer_ | Workers configures the number of uvicorn worker processes to run.<br />When set, the operator will launch llama-stack using uvicorn with the specified worker count.<br />Ref: https://fastapi.tiangolo.com/deployment/server-workers/<br />CPU requests are set to the number of workers when set, otherwise 1 full core | | Minimum: 1 <br /> |
| `podOverrides` _[PodOverrides](#podoverrides)_ | | | |
| `podDisruptionBudget` _[PodDisruptionBudgetSpec](#poddisruptionbudgetspec)_ | PodDisruptionBudget controls voluntary disruption tolerance for the server pods | | |
| `topologySpreadConstraints` _[TopologySpreadConstraint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#topologyspreadconstraint-v1-core) array_ | TopologySpreadConstraints defines fine-grained spreading rules | | |
9 changes: 9 additions & 0 deletions release/operator.yaml
@@ -2577,6 +2577,15 @@ spec:
required:
- configMapName
type: object
workers:
description: |-
Workers configures the number of uvicorn worker processes to run.
When set, the operator will launch llama-stack using uvicorn with the specified worker count.
Ref: https://fastapi.tiangolo.com/deployment/server-workers/
CPU requests are set to the number of workers when set, otherwise 1 full core
format: int32
minimum: 1
type: integer
required:
- distribution
type: object
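Note that the new defaulting log in resolveContainerResources is emitted at V(1), so it only appears when the operator runs with verbose logging enabled. A sketch for spotting it (the namespace and deployment name are assumptions, not taken from this PR):

kubectl logs -n llama-stack-operator-system deploy/llama-stack-operator-controller-manager \
  | grep 'Defaulted resource values'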