Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update kubelet plugin to v1alpha3 API #14

Merged
merged 3 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
20 changes: 15 additions & 5 deletions cmd/nvidia-dra-controller/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,17 @@ func (d driver) GetClaimParameters(ctx context.Context, claim *resourcev1.Resour
return nil, fmt.Errorf("unknown ResourceClaim.ParametersRef.Kind: %v", claim.Spec.ParametersRef.Kind)
}

func (d driver) Allocate(ctx context.Context, claim *resourcev1.ResourceClaim, claimParameters interface{}, class *resourcev1.ResourceClass, classParameters interface{}, selectedNode string) (*resourcev1.AllocationResult, error) {
// Allocate handles a batch of claim allocations for selectedNode. Every entry
// of cas is passed to d.allocate on its own, and the allocation result and
// error of that call are written back onto the same entry. The loop keeps
// going after a failed entry, so each claim gets its own outcome.
func (d driver) Allocate(ctx context.Context, cas []*controller.ClaimAllocation, selectedNode string) {
	// A production driver would hoist the work that is common to every
	// d.allocate call out of this loop and reuse it across iterations,
	// e.g. the selectedNode == "" check, client setup and CRD fetching.
	for i := range cas {
		entry := cas[i]
		result, err := d.allocate(ctx, entry.Claim, entry.ClaimParameters, entry.Class, entry.ClassParameters, selectedNode)
		entry.Allocation, entry.Error = result, err
	}
}

func (d driver) allocate(ctx context.Context, claim *resourcev1.ResourceClaim, claimParameters interface{}, class *resourcev1.ResourceClass, classParameters interface{}, selectedNode string) (*resourcev1.AllocationResult, error) {
if selectedNode == "" {
return nil, fmt.Errorf("TODO: immediate allocations not yet supported")
}
Expand All @@ -126,6 +136,10 @@ func (d driver) Allocate(ctx context.Context, claim *resourcev1.ResourceClaim, c
return nil, fmt.Errorf("error retrieving node specific Gpu CRD: %w", err)
}

if crd.Status != nascrd.NodeAllocationStateStatusReady {
return nil, fmt.Errorf("NodeAllocationStateStatus: %v", crd.Status)
}

if crd.Spec.AllocatedClaims == nil {
crd.Spec.AllocatedClaims = make(map[string]nascrd.AllocatedDevices)
}
Expand All @@ -134,10 +148,6 @@ func (d driver) Allocate(ctx context.Context, claim *resourcev1.ResourceClaim, c
return buildAllocationResult(selectedNode, true), nil
}

if crd.Status != nascrd.NodeAllocationStateStatusReady {
return nil, fmt.Errorf("NodeAllocationStateStatus: %v", crd.Status)
}

var onSuccess OnSuccessCallback
classParams, ok := classParameters.(*gpucrd.DeviceClassParametersSpec)
if !ok {
Expand Down
53 changes: 35 additions & 18 deletions cmd/nvidia-dra-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha2"
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha3"

nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1"
nasclient "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1/client"
Expand Down Expand Up @@ -99,39 +99,56 @@ func (d *driver) Shutdown(ctx context.Context) error {
})
}

func (d *driver) NodePrepareResource(ctx context.Context, req *drapbv1.NodePrepareResourceRequest) (*drapbv1.NodePrepareResourceResponse, error) {
// NodePrepareResources prepares every claim listed in req and returns one
// response entry per claim, keyed by the claim UID. The per-claim response
// produced by d.nodePrepareResource is stored as-is, so a failure for one
// claim is reported inside that claim's entry; the error result of this
// method is always nil.
func (d *driver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrepareResourcesRequest) (*drapbv1.NodePrepareResourcesResponse, error) {
	klog.Infof("NodePrepareResource is called: number of claims: %d", len(req.Claims))

	// The number of entries is known up front, so size the map once instead
	// of letting it grow while the claims are processed.
	preparedResources := &drapbv1.NodePrepareResourcesResponse{
		Claims: make(map[string]*drapbv1.NodePrepareResourceResponse, len(req.Claims)),
	}

	// In a production version some common operations of d.nodePrepareResource
	// should be done outside of the loop, for instance updating the CR could
	// be done once after all HW was prepared.
	for _, claim := range req.Claims {
		preparedResources.Claims[claim.Uid] = d.nodePrepareResource(ctx, claim)
	}

	return preparedResources, nil
}

// nodePrepareResource prepares the devices for a single claim and returns the
// per-claim response. The whole body runs between d.Lock() and the deferred
// d.Unlock().
//
// If d.isPrepared reports the claim as already prepared, the devices it
// returned are handed back as CDIDevices without preparing again. Otherwise
// d.prepare is called and its result is returned as CDIDevices. Failures are
// not returned as a Go error: they are formatted into the Error field of the
// response so the caller can report them per claim.
func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim) *drapbv1.NodePrepareResourceResponse {
	d.Lock()
	defer d.Unlock()

	isPrepared, prepared, err := d.isPrepared(ctx, claim.Uid)
	if err != nil {
		return &drapbv1.NodePrepareResourceResponse{
			Error: fmt.Sprintf("error checking if claim is already prepared: %v", err),
		}
	}

	if isPrepared {
		klog.Infof("Returning cached devices for claim '%v': %s", claim.Uid, prepared)
		return &drapbv1.NodePrepareResourceResponse{CDIDevices: prepared}
	}

	prepared, err = d.prepare(ctx, claim.Uid)
	if err != nil {
		return &drapbv1.NodePrepareResourceResponse{
			Error: fmt.Sprintf("error preparing devices for claim %v: %v", claim.Uid, err),
		}
	}

	klog.Infof("Returning newly prepared devices for claim '%v': %s", claim.Uid, prepared)
	return &drapbv1.NodePrepareResourceResponse{CDIDevices: prepared}
}

func (d *driver) NodeUnprepareResource(ctx context.Context, req *drapbv1.NodeUnprepareResourceRequest) (*drapbv1.NodeUnprepareResourceResponse, error) {
// NodeUnprepareResources acknowledges the unprepare request for every claim in
// req without touching any devices. It returns one empty (error-free) entry
// per requested claim, keyed by claim UID; the error result is always nil.
func (d *driver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUnprepareResourcesRequest) (*drapbv1.NodeUnprepareResourcesResponse, error) {
	// We don't unprepare as part of NodeUnprepareResources; we do it
	// asynchronously when the claims themselves are deleted and the
	// AllocatedClaim has been removed.
	//
	// The v1alpha3 response is still expected to list every claim from the
	// request (same rules as NodePrepareResourcesResponse.Claims), so report
	// each one with an empty per-claim response instead of an empty map.
	unprepared := &drapbv1.NodeUnprepareResourcesResponse{
		Claims: make(map[string]*drapbv1.NodeUnprepareResourceResponse, len(req.Claims)),
	}
	for _, claim := range req.Claims {
		unprepared.Claims[claim.Uid] = &drapbv1.NodeUnprepareResourceResponse{}
	}

	return unprepared, nil
}

func (d *driver) IsPrepared(ctx context.Context, claimUID string) (bool, []string, error) {
func (d *driver) isPrepared(ctx context.Context, claimUID string) (bool, []string, error) {
err := d.nasclient.Get(ctx)
if err != nil {
return false, nil, err
Expand All @@ -142,7 +159,7 @@ func (d *driver) IsPrepared(ctx context.Context, claimUID string) (bool, []strin
return false, nil, nil
}

func (d *driver) Prepare(ctx context.Context, claimUID string) ([]string, error) {
func (d *driver) prepare(ctx context.Context, claimUID string) ([]string, error) {
var err error
var prepared []string
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
Expand All @@ -169,7 +186,7 @@ func (d *driver) Prepare(ctx context.Context, claimUID string) ([]string, error)
return prepared, nil
}

func (d *driver) Unprepare(ctx context.Context, claimUID string) error {
func (d *driver) unprepare(ctx context.Context, claimUID string) error {
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
err := d.nasclient.Get(ctx)
if err != nil {
Expand Down Expand Up @@ -329,7 +346,7 @@ func (d *driver) cleanupClaimAllocations(ctx context.Context, nas *nascrd.NodeAl
go func(claimUID string) {
defer wg.Done()
klog.Infof("Attempting to unprepare resources for claim %v", claimUID)
err := d.Unprepare(ctx, claimUID)
err := d.unprepare(ctx, claimUID)
if err != nil {
errors <- fmt.Errorf("error unpreparing resources for claim %v: %w", claimUID, err)
return
Expand Down
96 changes: 35 additions & 61 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,24 @@ module github.com/NVIDIA/k8s-dra-driver

go 1.20

replace (
k8s.io/api => k8s.io/api v0.27.0-beta.0
k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.27.0-beta.0
k8s.io/apimachinery => k8s.io/apimachinery v0.27.0-beta.0
k8s.io/apiserver => k8s.io/apiserver v0.27.0-beta.0
k8s.io/cli-runtime => k8s.io/cli-runtime v0.27.0-beta.0
k8s.io/client-go => k8s.io/client-go v0.27.0-beta.0
k8s.io/cloud-provider => k8s.io/cloud-provider v0.27.0-beta.0
k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.27.0-beta.0
k8s.io/code-generator => k8s.io/code-generator v0.27.0-beta.0
k8s.io/component-base => k8s.io/component-base v0.27.0-beta.0
k8s.io/component-helpers => k8s.io/component-helpers v0.27.0-beta.0
k8s.io/cri-api => k8s.io/cri-api v0.27.0-beta.0
k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.27.0-beta.0
k8s.io/dynamic-resource-allocation => k8s.io/dynamic-resource-allocation v0.27.0-beta.0
k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.27.0-beta.0
k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.27.0-beta.0
k8s.io/kube-proxy => k8s.io/kube-proxy v0.27.0-beta.0
k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.27.0-beta.0
k8s.io/kubectl => k8s.io/kubectl v0.27.0-beta.0
k8s.io/kubelet => k8s.io/kubelet v0.27.0-beta.0
k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.27.0-beta.0
k8s.io/metrics => k8s.io/metrics v0.27.0-beta.0
k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.27.0-beta.0
)

require (
github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d
github.com/NVIDIA/go-nvml v0.12.0-2
github.com/NVIDIA/nvidia-container-toolkit v1.14.4-0.20231120225202-039d7fd32429
github.com/prometheus/client_golang v1.14.0
github.com/prometheus/client_golang v1.16.0
github.com/sirupsen/logrus v1.9.3
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.8.4
github.com/urfave/cli/v2 v2.27.1
golang.org/x/mod v0.15.0
k8s.io/api v0.27.0-beta.0
k8s.io/apimachinery v0.27.0-beta.0
k8s.io/client-go v0.27.0-beta.0
k8s.io/component-base v0.27.0-beta.0
k8s.io/dynamic-resource-allocation v0.0.0-00010101000000-000000000000
k8s.io/klog/v2 v2.90.1
k8s.io/kubelet v0.27.0-beta.0
k8s.io/mount-utils v0.26.3
k8s.io/api v0.29.2
k8s.io/apimachinery v0.29.2
k8s.io/client-go v0.29.2
k8s.io/component-base v0.29.2
k8s.io/dynamic-resource-allocation v0.29.2
k8s.io/klog/v2 v2.110.1
k8s.io/kubelet v0.29.2
k8s.io/mount-utils v0.29.2
tags.cncf.io/container-device-interface v0.6.2
tags.cncf.io/container-device-interface/specs-go v0.6.0
)
Expand All @@ -56,27 +30,27 @@ require (
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-logr/logr v1.3.0 // indirect
github.com/go-logr/zapr v1.2.3 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.1 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.3 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.1.0 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/imdario/mergo v0.3.6 // indirect
github.com/inconshreveable/mousetrap v1.0.1 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/moby/sys/mountinfo v0.6.2 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
Expand All @@ -85,32 +59,32 @@ require (
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.10.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/spf13/cobra v1.6.1 // indirect
github.com/spf13/cobra v1.7.0 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/multierr v1.8.0 // indirect
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.21.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.7.0 // indirect
golang.org/x/sys v0.13.0 // indirect
golang.org/x/term v0.13.0 // indirect
golang.org/x/text v0.13.0 // indirect
golang.org/x/time v0.1.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/oauth2 v0.10.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/term v0.15.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
google.golang.org/grpc v1.56.3 // indirect
google.golang.org/protobuf v1.30.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect
google.golang.org/grpc v1.58.3 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/kube-openapi v0.0.0-20230308215209-15aac26d736a // indirect
k8s.io/utils v0.0.0-20230209194617-a36077c30491 // indirect
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
Loading