Skip to content

Commit 2ef9bfc

Browse files
authored
fix: schedulingtemplate typo, merge workload profile and workload spec (#189)
* fix: schedulingtemplate typo, merge workload profile and workload spec definition
* fix: add 50 series gpu info
1 parent c3c1f5e commit 2ef9bfc

20 files changed

+402
-86
lines changed

api/v1/schedulingconfigtemplate_types.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,10 @@ type SmartSchedulerModelInput struct {
179179
PredictionPeriod string `json:"predictionPeriod,omitempty"`
180180
}
181181

182+
// Avoid hot GPU devices and continuously balance the workload;
// implemented by triggering a simulation scheduling run and advising better GPU nodes to the scheduler
182183
type ReBalancerConfig struct {
183-
Internal string `json:"internal,omitempty"`
184+
Enable *bool `json:"enable,omitempty"`
185+
Interval string `json:"interval,omitempty"`
184186
ReBalanceCoolDownTime string `json:"reBalanceCoolDownTime,omitempty"`
185187
Threshold ReBalanceThreshold `json:"threshold,omitempty"`
186188
}
@@ -194,8 +196,11 @@ type HypervisorScheduling struct {
194196
}
195197

196198
type MultiProcessQueuing struct {
197-
Enable *bool `json:"enable,omitempty"`
198-
Interval string `json:"interval,omitempty"`
199+
// +optional
200+
Enable *bool `json:"enable,omitempty"`
201+
202+
Interval string `json:"interval,omitempty"`
203+
199204
QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"`
200205
}
201206

api/v1/tensorfusionworkload_types.go

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,6 @@ import (
2020
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2121
)
2222

23-
// TensorFusionWorkloadSpec defines the desired state of TensorFusionWorkload.
24-
type TensorFusionWorkloadSpec struct {
25-
Replicas *int32 `json:"replicas,omitempty"`
26-
PoolName string `json:"poolName"`
27-
// +optional
28-
Resources Resources `json:"resources"`
29-
// +optional
30-
Qos QoSLevel `json:"qos,omitempty"`
31-
// +optional
32-
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
33-
}
34-
3523
type WorkerPhase string
3624

3725
const (
@@ -74,7 +62,7 @@ type TensorFusionWorkload struct {
7462
metav1.TypeMeta `json:",inline"`
7563
metav1.ObjectMeta `json:"metadata,omitempty"`
7664

77-
Spec TensorFusionWorkloadSpec `json:"spec,omitempty"`
65+
Spec WorkloadProfileSpec `json:"spec,omitempty"`
7866
Status TensorFusionWorkloadStatus `json:"status,omitempty"`
7967
}
8068

api/v1/workloadprofile_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ const (
3232

3333
// WorkloadProfileSpec defines the desired state of WorkloadProfile.
3434
type WorkloadProfileSpec struct {
35+
// +optional
36+
Replicas *int32 `json:"replicas,omitempty"`
37+
3538
// +optional
3639
PoolName string `json:"poolName,omitempty"`
3740

api/v1/zz_generated.deepcopy.go

Lines changed: 10 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.19
18+
version: 1.2.20
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,6 @@ spec:
577577
description: GPUPoolStatus defines the observed state of GPUPool.
578578
properties:
579579
allocatedTFlopsPercent:
580-
description: updated with interval
581580
type: string
582581
allocatedVRAMPercent:
583582
type: string
@@ -706,7 +705,6 @@ spec:
706705
format: int32
707706
type: integer
708707
savedCostsPerMonth:
709-
description: aggregated with interval
710708
type: string
711709
totalGPUs:
712710
format: int32
@@ -727,7 +725,6 @@ spec:
727725
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
728726
x-kubernetes-int-or-string: true
729727
utilizedTFlopsPercent:
730-
description: calculated every 5m average
731728
type: string
732729
utilizedVRAMPercent:
733730
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,9 @@ spec:
226226
avoid hot GPU devices and continuously balance the workload
227227
implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler
228228
properties:
229-
internal:
229+
enable:
230+
type: boolean
231+
interval:
230232
type: string
231233
reBalanceCoolDownTime:
232234
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,6 @@ spec:
708708
description: TensorFusionClusterStatus defines the observed state of TensorFusionCluster.
709709
properties:
710710
allocatedTFlopsPercent:
711-
description: updated with interval
712711
type: string
713712
allocatedVRAMPercent:
714713
type: string
@@ -808,7 +807,6 @@ spec:
808807
format: int64
809808
type: integer
810809
savedCostsPerMonth:
811-
description: aggregated with interval
812810
type: string
813811
totalGPUs:
814812
format: int32
@@ -832,7 +830,6 @@ spec:
832830
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
833831
x-kubernetes-int-or-string: true
834832
utilizedTFlopsPercent:
835-
description: calculated every 5m average
836833
type: string
837834
utilizedVRAMPercent:
838835
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 145 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,157 @@ spec:
3838
metadata:
3939
type: object
4040
spec:
41-
description: TensorFusionWorkloadSpec defines the desired state of TensorFusionWorkload.
41+
description: WorkloadProfileSpec defines the desired state of WorkloadProfile.
4242
properties:
43+
autoScalingConfig:
44+
description: |-
45+
AutoScalingConfig configured here will override Pool's schedulingConfig
46+
This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
47+
user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true'
48+
properties:
49+
autoSetLimits:
50+
description: |-
51+
layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly
52+
VPA-like, aggregate metrics data <1m
53+
properties:
54+
enable:
55+
type: boolean
56+
evaluationPeriod:
57+
type: string
58+
extraTFlopsBufferRatio:
59+
type: string
60+
ignoredDeltaRange:
61+
type: string
62+
maxRatioToRequests:
63+
description: the multiplier of requests, to avoid limit set
64+
too high, like 5.0
65+
type: string
66+
prediction:
67+
properties:
68+
enable:
69+
type: boolean
70+
historyDataPeriod:
71+
type: string
72+
model:
73+
type: string
74+
predictionPeriod:
75+
type: string
76+
type: object
77+
scaleUpStep:
78+
type: string
79+
targetResource:
80+
description: target resource to scale limits, such as "tflops",
81+
"vram", or "all" by default
82+
type: string
83+
type: object
84+
autoSetReplicas:
85+
description: |-
86+
layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit
87+
HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works)
88+
properties:
89+
enable:
90+
type: boolean
91+
evaluationPeriod:
92+
type: string
93+
scaleDownCoolDownTime:
94+
type: string
95+
scaleDownStep:
96+
type: string
97+
scaleUpCoolDownTime:
98+
type: string
99+
scaleUpStep:
100+
type: string
101+
targetTFlopsOfLimits:
102+
type: string
103+
type: object
104+
autoSetRequests:
105+
description: |-
106+
layer 3 adjusting, to match the actual usage in the long run
107+
Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks
108+
properties:
109+
aggregationPeriod:
110+
type: string
111+
enable:
112+
type: boolean
113+
evaluationPeriod:
114+
type: string
115+
extraBufferRatio:
116+
description: the request buffer ratio, for example actual
117+
usage is 1.0, 10% buffer will be 1.1 as final preferred
118+
requests
119+
type: string
120+
percentileForAutoRequests:
121+
type: string
122+
prediction:
123+
properties:
124+
enable:
125+
type: boolean
126+
historyDataPeriod:
127+
type: string
128+
model:
129+
type: string
130+
predictionPeriod:
131+
type: string
132+
type: object
133+
targetResource:
134+
description: target resource to scale requests, such as "tflops",
135+
"vram", or "all" by default
136+
type: string
137+
type: object
138+
scaleToZero:
139+
description: |-
140+
additional layer to save VRAM, auto-freeze memory and cool down to RAM and Disk
141+
Hypervisor will monitor and trigger freeze of inactive workers, Operator should mark them as scaled-to-zero and release the GPU pool resources, don't scale down CPU client part, so that they can continue to serve the traffic or scale down by other auto-scaling solutions like KEDA/KNative
142+
properties:
143+
autoFreeze:
144+
items:
145+
properties:
146+
enable:
147+
type: boolean
148+
freezeToDiskTTL:
149+
type: string
150+
freezeToMemTTL:
151+
type: string
152+
qos:
153+
enum:
154+
- low
155+
- medium
156+
- high
157+
- critical
158+
type: string
159+
type: object
160+
type: array
161+
intelligenceWarmup:
162+
properties:
163+
enable:
164+
type: boolean
165+
historyDataPeriod:
166+
type: string
167+
model:
168+
type: string
169+
predictionPeriod:
170+
type: string
171+
type: object
172+
type: object
173+
type: object
174+
gpuCount:
175+
description: The number of GPUs to be used by the workload, default
176+
to 1
177+
type: integer
43178
isLocalGPU:
179+
description: Schedule the workload to the same GPU server that runs
180+
vGPU worker for best performance, default to false
181+
type: boolean
182+
noStandaloneWorkerMode:
183+
description: This mode is only available when `is-local-gpu` set to
184+
true, in this mode, TensorFusion will also inject vGPU worker into
185+
init container, so that to achieve best performance, trade-off is
186+
user might by-pass the vGPU worker and using physical GPU directly
44187
type: boolean
45188
poolName:
46189
type: string
47190
qos:
191+
description: Qos defines the quality of service level for the client.
48192
enum:
49193
- low
50194
- medium
@@ -96,8 +240,6 @@ spec:
96240
- limits
97241
- requests
98242
type: object
99-
required:
100-
- poolName
101243
type: object
102244
status:
103245
description: TensorFusionWorkloadStatus defines the observed state of

charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ spec:
194194
- high
195195
- critical
196196
type: string
197+
replicas:
198+
format: int32
199+
type: integer
197200
resources:
198201
properties:
199202
limits:

0 commit comments

Comments (0)