Skip to content

Commit 2ef9bfc

Browse files
authored
fix: schedulingtemplate typo, merge workload profile and workload spec (#189)
* fix: schedulingtemplate typo, merge workload profile and workload spec definition
* fix: add 50 series gpu info
1 parent c3c1f5e commit 2ef9bfc

20 files changed

+402
-86
lines changed

api/v1/schedulingconfigtemplate_types.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,10 @@ type SmartSchedulerModelInput struct {
179179
PredictionPeriod string `json:"predictionPeriod,omitempty"`
180180
}
181181

182+
// Avoid hot GPU devices and continuously balance the workload;
// implemented by triggering a simulation scheduling run and advising better GPU nodes to the scheduler
182183
type ReBalancerConfig struct {
183-
Internal string `json:"internal,omitempty"`
184+
Enable *bool `json:"enable,omitempty"`
185+
Interval string `json:"interval,omitempty"`
184186
ReBalanceCoolDownTime string `json:"reBalanceCoolDownTime,omitempty"`
185187
Threshold ReBalanceThreshold `json:"threshold,omitempty"`
186188
}
@@ -194,8 +196,11 @@ type HypervisorScheduling struct {
194196
}
195197

196198
type MultiProcessQueuing struct {
197-
Enable *bool `json:"enable,omitempty"`
198-
Interval string `json:"interval,omitempty"`
199+
// +optional
200+
Enable *bool `json:"enable,omitempty"`
201+
202+
Interval string `json:"interval,omitempty"`
203+
199204
QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"`
200205
}
201206

api/v1/tensorfusionworkload_types.go

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,6 @@ import (
2020
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2121
)
2222

23-
// TensorFusionWorkloadSpec defines the desired state of TensorFusionWorkload.
24-
type TensorFusionWorkloadSpec struct {
25-
Replicas *int32 `json:"replicas,omitempty"`
26-
PoolName string `json:"poolName"`
27-
// +optional
28-
Resources Resources `json:"resources"`
29-
// +optional
30-
Qos QoSLevel `json:"qos,omitempty"`
31-
// +optional
32-
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
33-
}
34-
3523
type WorkerPhase string
3624

3725
const (
@@ -74,7 +62,7 @@ type TensorFusionWorkload struct {
7462
metav1.TypeMeta `json:",inline"`
7563
metav1.ObjectMeta `json:"metadata,omitempty"`
7664

77-
Spec TensorFusionWorkloadSpec `json:"spec,omitempty"`
65+
Spec WorkloadProfileSpec `json:"spec,omitempty"`
7866
Status TensorFusionWorkloadStatus `json:"status,omitempty"`
7967
}
8068

api/v1/workloadprofile_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ const (
3232

3333
// WorkloadProfileSpec defines the desired state of WorkloadProfile.
3434
type WorkloadProfileSpec struct {
35+
// +optional
36+
Replicas *int32 `json:"replicas,omitempty"`
37+
3538
// +optional
3639
PoolName string `json:"poolName,omitempty"`
3740

api/v1/zz_generated.deepcopy.go

Lines changed: 10 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.19
18+
version: 1.2.20
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,6 @@ spec:
577577
description: GPUPoolStatus defines the observed state of GPUPool.
578578
properties:
579579
allocatedTFlopsPercent:
580-
description: updated with interval
581580
type: string
582581
allocatedVRAMPercent:
583582
type: string
@@ -706,7 +705,6 @@ spec:
706705
format: int32
707706
type: integer
708707
savedCostsPerMonth:
709-
description: aggregated with interval
710708
type: string
711709
totalGPUs:
712710
format: int32
@@ -727,7 +725,6 @@ spec:
727725
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
728726
x-kubernetes-int-or-string: true
729727
utilizedTFlopsPercent:
730-
description: calculated every 5m average
731728
type: string
732729
utilizedVRAMPercent:
733730
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,9 @@ spec:
226226
avoid hot GPU devices and continuously balance the workload
227227
implemented by trigger a simulation scheduling and advise better GPU nodes for scheduler
228228
properties:
229-
internal:
229+
enable:
230+
type: boolean
231+
interval:
230232
type: string
231233
reBalanceCoolDownTime:
232234
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,6 @@ spec:
708708
description: TensorFusionClusterStatus defines the observed state of TensorFusionCluster.
709709
properties:
710710
allocatedTFlopsPercent:
711-
description: updated with interval
712711
type: string
713712
allocatedVRAMPercent:
714713
type: string
@@ -808,7 +807,6 @@ spec:
808807
format: int64
809808
type: integer
810809
savedCostsPerMonth:
811-
description: aggregated with interval
812810
type: string
813811
totalGPUs:
814812
format: int32
@@ -832,7 +830,6 @@ spec:
832830
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
833831
x-kubernetes-int-or-string: true
834832
utilizedTFlopsPercent:
835-
description: calculated every 5m average
836833
type: string
837834
utilizedVRAMPercent:
838835
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 145 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,157 @@ spec:
3838
metadata:
3939
type: object
4040
spec:
41-
description: TensorFusionWorkloadSpec defines the desired state of TensorFusionWorkload.
41+
description: WorkloadProfileSpec defines the desired state of WorkloadProfile.
4242
properties:
43+
autoScalingConfig:
44+
description: |-
45+
AutoScalingConfig configured here will override Pool's schedulingConfig
46+
This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
47+
user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true'
48+
properties:
49+
autoSetLimits:
50+
description: |-
51+
layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly
52+
VPA-like, aggregate metrics data <1m
53+
properties:
54+
enable:
55+
type: boolean
56+
evaluationPeriod:
57+
type: string
58+
extraTFlopsBufferRatio:
59+
type: string
60+
ignoredDeltaRange:
61+
type: string
62+
maxRatioToRequests:
63+
description: the multiplier of requests, to avoid limit set
64+
too high, like 5.0
65+
type: string
66+
prediction:
67+
properties:
68+
enable:
69+
type: boolean
70+
historyDataPeriod:
71+
type: string
72+
model:
73+
type: string
74+
predictionPeriod:
75+
type: string
76+
type: object
77+
scaleUpStep:
78+
type: string
79+
targetResource:
80+
description: target resource to scale limits, such as "tflops",
81+
"vram", or "all" by default
82+
type: string
83+
type: object
84+
autoSetReplicas:
85+
description: |-
86+
layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit
87+
HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works)
88+
properties:
89+
enable:
90+
type: boolean
91+
evaluationPeriod:
92+
type: string
93+
scaleDownCoolDownTime:
94+
type: string
95+
scaleDownStep:
96+
type: string
97+
scaleUpCoolDownTime:
98+
type: string
99+
scaleUpStep:
100+
type: string
101+
targetTFlopsOfLimits:
102+
type: string
103+
type: object
104+
autoSetRequests:
105+
description: |-
106+
layer 3 adjusting, to match the actual usage in the long run
107+
Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks
108+
properties:
109+
aggregationPeriod:
110+
type: string
111+
enable:
112+
type: boolean
113+
evaluationPeriod:
114+
type: string
115+
extraBufferRatio:
116+
description: the request buffer ratio, for example actual
117+
usage is 1.0, 10% buffer will be 1.1 as final preferred
118+
requests
119+
type: string
120+
percentileForAutoRequests:
121+
type: string
122+
prediction:
123+
properties:
124+
enable:
125+
type: boolean
126+
historyDataPeriod:
127+
type: string
128+
model:
129+
type: string
130+
predictionPeriod:
131+
type: string
132+
type: object
133+
targetResource:
134+
description: target resource to scale requests, such as "tflops",
135+
"vram", or "all" by default
136+
type: string
137+
type: object
138+
scaleToZero:
139+
description: |-
140+
additional layer to save VRAM, auto-freeze memory and cool down to RAM and Disk
141+
Hypervisor will monitor and trigger freeze of inactive workers, Operator should mark them as scaled-to-zero and release the GPU pool resources, don't scale down CPU client part, so that they can continue to serve the traffic or scale down by other auto-scaling solutions like KEDA/KNative
142+
properties:
143+
autoFreeze:
144+
items:
145+
properties:
146+
enable:
147+
type: boolean
148+
freezeToDiskTTL:
149+
type: string
150+
freezeToMemTTL:
151+
type: string
152+
qos:
153+
enum:
154+
- low
155+
- medium
156+
- high
157+
- critical
158+
type: string
159+
type: object
160+
type: array
161+
intelligenceWarmup:
162+
properties:
163+
enable:
164+
type: boolean
165+
historyDataPeriod:
166+
type: string
167+
model:
168+
type: string
169+
predictionPeriod:
170+
type: string
171+
type: object
172+
type: object
173+
type: object
174+
gpuCount:
175+
description: The number of GPUs to be used by the workload, default
176+
to 1
177+
type: integer
43178
isLocalGPU:
179+
description: Schedule the workload to the same GPU server that runs
180+
vGPU worker for best performance, default to false
181+
type: boolean
182+
noStandaloneWorkerMode:
183+
description: This mode is only available when `is-local-gpu` set to
184+
true, in this mode, TensorFusion will also inject vGPU worker into
185+
init container, so that to achieve best performance, trade-off is
186+
user might by-pass the vGPU worker and using physical GPU directly
44187
type: boolean
45188
poolName:
46189
type: string
47190
qos:
191+
description: Qos defines the quality of service level for the client.
48192
enum:
49193
- low
50194
- medium
@@ -96,8 +240,6 @@ spec:
96240
- limits
97241
- requests
98242
type: object
99-
required:
100-
- poolName
101243
type: object
102244
status:
103245
description: TensorFusionWorkloadStatus defines the observed state of

charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ spec:
194194
- high
195195
- critical
196196
type: string
197+
replicas:
198+
format: int32
199+
type: integer
197200
resources:
198201
properties:
199202
limits:

0 commit comments

Comments (0)