Skip to content

feat: add GPUNodeClaim for cloud vendor integration and karpenter integration #282

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jul 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7dc8133
fix: bump kubernetes version
Code2Life Jul 15, 2025
19372e7
feat: add gpunodeclaim resource for cloud provisioning
Code2Life Jul 15, 2025
8f4d930
fix: gpu node claim
Code2Life Jul 15, 2025
a765e80
fix: node claim management
Code2Life Jul 15, 2025
982ee7f
fix: upgrade kubernetes version, add gpu node claim test
Code2Life Jul 16, 2025
0ba02d2
fix: reduce log rotation period
Code2Life Jul 16, 2025
1b8761e
fix: merge karpenter integration code
Code2Life Jul 16, 2025
86df326
fix: node claim provider nil issue
Code2Life Jul 16, 2025
05d1ad9
fix: log lib issue, node claim test
Code2Life Jul 16, 2025
fe52574
fix: add context to cloud providers and improve logging with controll…
Code2Life Jul 16, 2025
a24a40d
fix: add api version for node claim
Code2Life Jul 17, 2025
8b94fea
fix: node pricing map refactor
Code2Life Jul 17, 2025
9ec5a66
fix: node claim owner ref issue
Code2Life Jul 17, 2025
8b4b996
fix: provisioning mode duplicate creation issue
Code2Life Jul 18, 2025
75a5f2a
fix: node compaction and karpenter provision issue
Code2Life Jul 20, 2025
86a39b8
fix: schedule simulation detail
Code2Life Jul 20, 2025
d63ba19
fix: add killer switch for provisioning mode, fix provision bug
Code2Life Jul 20, 2025
7059ba4
fix: ut issue
Code2Life Jul 20, 2025
d0be5ff
fix: global config not loaded bug
Code2Life Jul 20, 2025
159f3f9
fix: ut issue
Code2Life Jul 20, 2025
d4360b5
fix: lint issues
Code2Life Jul 20, 2025
dc25ccb
fix: ut issues, add serial test mode
Code2Life Jul 20, 2025
e1c009d
fix: ut issues, compaction provision bug
Code2Life Jul 20, 2025
0172c51
fix: node filter test case issue
Code2Life Jul 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"Aliyun",
"AMDCDNA",
"AMDRDNA",
"apierrors",
"apimachinery",
"apimachineryruntime",
"apiruntime",
Expand Down Expand Up @@ -37,8 +38,10 @@
"CUDA",
"cycjimmy",
"datanode",
"deepcopy",
"defaultbinder",
"dylib",
"eastus",
"envtest",
"essd",
"Eventf",
Expand All @@ -57,6 +60,7 @@
"gosec",
"gpuallocator",
"gpunode",
"gpunodeclaim",
"gpunodeclaims",
"gpunodeclasses",
"gpunodes",
Expand All @@ -75,6 +79,7 @@
"influxdata",
"jsonpatch",
"karpenter",
"karpv",
"klog",
"Klogr",
"kubebuilder",
Expand All @@ -86,13 +91,16 @@
"libcuda",
"libnvidia",
"lineprotocol",
"mapstructure",
"metav",
"metricsserver",
"Milli",
"mitchellh",
"mito",
"mutatingwebhookconfiguration",
"ngpu",
"nindent",
"nodeclassref",
"noderesources",
"nolint",
"NVML",
Expand Down Expand Up @@ -126,6 +134,7 @@
"statefulsets",
"strategicpatch",
"strategicpatches",
"stretchr",
"subresource",
"Tabler",
"tensorfusion",
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ vet: ## Run go vet against code.
test: manifests generate fmt vet envtest ## Run tests.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e

.PHONY: test-serial
test-serial: manifests generate fmt vet envtest ## Run tests serially (ginkgo without -p and without the coverage profile).
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -timeout 0 -r --skip-file ./test/e2e

# ut runs the controller unit tests for a single focus file: make ut F=<focus-file>.
# The environment assignments must prefix the ginkgo command itself: an assignment
# placed in front of `cd` applies to `cd` only and never reaches the test process.
# Each recipe line runs in its own shell, so no `cd` back is needed afterwards.
.PHONY: ut
ut: manifests generate ## Run unit tests by make ut F=<focus-file>
	cd internal/controller && KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 --focus-file $F
Expand Down
16 changes: 16 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,20 @@ resources:
kind: TensorFusionWorkload
path: github.com/NexusGPU/tensor-fusion/api/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
domain: tensor-fusion.ai
kind: GPUResourceQuota
path: github.com/NexusGPU/tensor-fusion/api/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
domain: tensor-fusion.ai
kind: GPUNodeClaim
path: github.com/NexusGPU/tensor-fusion/api/v1
version: v1
version: "3"
9 changes: 0 additions & 9 deletions api/v1/gpunode_funcs.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package v1

import (
"time"

"k8s.io/apimachinery/pkg/api/resource"
)

Expand All @@ -18,10 +16,3 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
ObservedGeneration: node.Generation,
}
}

// SetAnnotationToTriggerNodeSync stamps the "tensor-fusion.ai/refresh-node-state"
// annotation on the node with the current time, allocating the annotation map
// first when the node has none.
func (node *GPUNode) SetAnnotationToTriggerNodeSync() {
	annotations := node.Annotations
	if annotations == nil {
		// Writing to a nil map panics, so create it and attach it to the node.
		annotations = map[string]string{}
		node.Annotations = annotations
	}
	annotations["tensor-fusion.ai/refresh-node-state"] = time.Now().String()
}
18 changes: 0 additions & 18 deletions api/v1/gpunode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,6 @@ const (

// GPUNodeStatus defines the observed state of GPUNode.
type GPUNodeStatus struct {
// The identifier of the Kubernetes node. In nodeSelector mode the GPUNode name is the same as the Kubernetes node name because the GPUNode is owned by the Kubernetes node; in node provisioning mode the Kubernetes node is owned by the GPUNode, and the Kubernetes node name is uncontrollable.
KubernetesNodeName string `json:"kubernetesNodeName"`

// +kubebuilder:default=Pending
Phase TensorFusionGPUNodePhase `json:"phase"`

Expand Down Expand Up @@ -112,21 +109,6 @@ const (
)

type GPUNodeInfo struct {
// +optional
// only set when node is managed by TensorFusion
InstanceID string `json:"instanceID,omitempty"`
Region string `json:"region,omitempty"`

Hostname string `json:"hostname,omitempty"`
IP string `json:"ip,omitempty"`
KernelVersion string `json:"kernelVersion,omitempty"`
OSImage string `json:"osImage,omitempty"`
GPUDriverVersion string `json:"gpuDriverVersion,omitempty"`
GPUModel string `json:"gpuModel,omitempty"`
GPUCount int32 `json:"gpuCount,omitempty"`
OperatingSystem string `json:"operatingSystem,omitempty"`
Architecture string `json:"architecture,omitempty"`

// Additional space for L1/L2 VRAM buffer
RAMSize resource.Quantity `json:"ramSize,omitempty"`
DataDiskSize resource.Quantity `json:"dataDiskSize,omitempty"`
Expand Down
102 changes: 102 additions & 0 deletions api/v1/gpunodeclaim_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1

import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// GPUNodeClaimStatus defines the observed state of GPUNodeClaim.
type GPUNodeClaimStatus struct {

// Phase is the current phase of the claim; the API default is Pending.
// +kubebuilder:default=Pending
Phase GPUNodeClaimPhase `json:"phase"`

// InstanceID is omitted from JSON when empty.
// NOTE(review): presumably the cloud vendor's ID of the provisioned instance — confirm who sets it.
InstanceID string `json:"instanceID,omitempty"`
}

// GPUNodeClaimPhase is the string type used for GPUNodeClaimStatus.Phase.
type GPUNodeClaimPhase string

// Known GPUNodeClaimPhase values.
const (
	GPUNodeClaimPending  = GPUNodeClaimPhase("Pending")
	GPUNodeClaimCreating = GPUNodeClaimPhase("Creating")
	GPUNodeClaimBound    = GPUNodeClaimPhase("Bound")
)

// GPUNodeClaimKind is the Kind name of the GPUNodeClaim resource.
const (
	GPUNodeClaimKind = "GPUNodeClaim"
)

// NOTE(review): scope=Cluster below makes this CRD cluster-scoped, while the kubebuilder
// PROJECT entry for GPUNodeClaim declares `namespaced: true` — confirm which one is intended.
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Cluster
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"

// GPUNodeClaim is the Schema for the gpunodeclaims API.
type GPUNodeClaim struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata"`

// Spec is the desired state of the claim.
Spec GPUNodeClaimSpec `json:"spec,omitempty"`
// Status is the observed state of the claim; it is exposed as a status subresource.
Status GPUNodeClaimStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true

// GPUNodeClaimList contains a list of GPUNodeClaim.
type GPUNodeClaimList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata"`
// Items holds the GPUNodeClaim objects of this list.
Items []GPUNodeClaim `json:"items"`
}

// init registers GPUNodeClaim and its list type with the package SchemeBuilder.
func init() {
	SchemeBuilder.Register(
		new(GPUNodeClaim),
		new(GPUNodeClaimList),
	)
}

// CapacityTypeEnum is the string type used for GPUNodeClaimSpec.CapacityType.
type CapacityTypeEnum string

// Known CapacityTypeEnum values.
const (
	CapacityTypeOnDemand = CapacityTypeEnum("OnDemand")

	CapacityTypeReserved = CapacityTypeEnum("Reserved")

	// Spot and Preemptive are aliases of each other, used by different providers
	CapacityTypeSpot = CapacityTypeEnum("Spot")
)

// GPUNodeClaimSpec defines the desired state of GPUNodeClaim.
type GPUNodeClaimSpec struct {
// NodeName, Region, Zone and InstanceType are optional strings, omitted from JSON when empty.
NodeName string `json:"nodeName,omitempty"`
Region string `json:"region,omitempty"`
Zone string `json:"zone,omitempty"`
InstanceType string `json:"instanceType,omitempty"`
// NodeClassRef identifies a node class by group, kind, version and name.
NodeClassRef GroupKindName `json:"nodeClassRef,omitempty"`
// CapacityType is one of the CapacityTypeEnum values (OnDemand, Reserved, Spot).
CapacityType CapacityTypeEnum `json:"capacityType,omitempty"`

// TFlopsOffered is always serialized (no omitempty).
// NOTE(review): presumably the TFLOPS capacity the claimed node provides — confirm.
TFlopsOffered resource.Quantity `json:"tflopsOffered"`
// VRAMOffered is always serialized (no omitempty).
VRAMOffered resource.Quantity `json:"vramOffered"`
// GPUDeviceOffered is always serialized (no omitempty).
GPUDeviceOffered int32 `json:"gpuDeviceOffered"`

// ExtraParams holds free-form string key/value pairs.
// NOTE(review): presumably forwarded to the provisioning provider — confirm.
ExtraParams map[string]string `json:"extraParams,omitempty"`
}

// GroupKindName references an object by API group, kind, version and name.
// Despite the type name, it also carries Version. All four fields are always
// serialized (none uses omitempty).
type GroupKindName struct {
Group string `json:"group"`
Kind string `json:"kind"`
Version string `json:"version"`
Name string `json:"name"`
}
4 changes: 4 additions & 0 deletions api/v1/gpunodeclass_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ type GPUNodeClassList struct {
Items []GPUNodeClass `json:"items"`
}

// GPUNodeClassKind is the Kind name of the GPUNodeClass resource.
const GPUNodeClassKind = "GPUNodeClass"

// init registers GPUNodeClass and its list type with the package SchemeBuilder.
func init() {
	SchemeBuilder.Register(
		new(GPUNodeClass),
		new(GPUNodeClassList),
	)
}
33 changes: 29 additions & 4 deletions api/v1/gpupool_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,14 @@ const (
// NodeProvisioner or NodeSelector, they are exclusive.
// NodeSelector is for existing GPUs, NodeProvisioner is for Karpenter-like auto management.
type NodeProvisioner struct {

// TensorFusion GPUNodeClass name
NodeClass string `json:"nodeClass,omitempty"`

// Karpenter NodeClass name
// +optional
KarpenterNodeClassRef *GroupKindName `json:"karpenterNodeClassRef,omitempty"`

// +optional
GPURequirements []Requirement `json:"gpuRequirements,omitempty"`
// +optional
Expand Down Expand Up @@ -167,13 +173,13 @@ type Requirement struct {
Values []string `json:"values,omitempty"`
}

// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-vendor;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
type NodeRequirementKey string

const (
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
NodeRequirementKeyGPUVendor NodeRequirementKey = "tensor-fusion.ai/gpu-vendor"

NodeRequirementKeyOS NodeRequirementKey = "kubernetes.io/os"
NodeRequirementKeyRegion NodeRequirementKey = "topology.kubernetes.io/region"
Expand Down Expand Up @@ -401,6 +407,10 @@ type GPUPoolStatus struct {
// TODO not implemented yet
BudgetExceeded string `json:"budgetExceeded,omitempty"`

// +optional
// +kubebuilder:default="None"
ProvisioningPhase ProvisioningPhase `json:"provisioningPhase,omitempty"`

// +optional
LastCompactionTime *metav1.Time `json:"lastCompactionTime,omitempty"`
}
Expand All @@ -416,6 +426,21 @@ const (
TensorFusionPoolPhaseDestroying = TensorFusionPoolPhase(constants.PhaseDestroying)
)

// ProvisioningPhase is the string type used for GPUPoolStatus.ProvisioningPhase.
// Every value accepted by the enum validation below has a matching constant.
// +kubebuilder:validation:Enum=None;Initializing;Provisioning;Completed
type ProvisioningPhase string

const (
	// None means not in provisioning mode
	ProvisioningPhaseNone = ProvisioningPhase("None")

	// Initializing is accepted by the enum validation on ProvisioningPhase; the
	// constant exists so callers do not need a raw string literal.
	// NOTE(review): the condition that enters this phase is not defined here — confirm.
	ProvisioningPhaseInitializing = ProvisioningPhase("Initializing")

	// Provisioning is the state after NodeClaims were created while pending
	// GPUNodeClaims still exist. It is re-checked until all GPUNodeClaims are bound.
	// NOTE(review): the original comment read "unless next scale up should not
	// happen"; presumably no further scale-up should start before then — confirm.
	ProvisioningPhaseProvisioning = ProvisioningPhase("Provisioning")

	// When all GPUNodeClaims are bound, set to Completed
	ProvisioningPhaseCompleted = ProvisioningPhase("Completed")
)

type PoolProvisioningStatus struct {
InitializingNodes int32 `json:"initializingNodes,omitempty"`
TerminatingNodes int32 `json:"terminatingNodes,omitempty"`
Expand Down
Loading
Loading