Add configurable options
gjulianm committed Mar 3, 2025
1 parent a0064d5 commit 9334ffc
Showing 2 changed files with 120 additions and 21 deletions.
37 changes: 16 additions & 21 deletions components/kubernetes/nvidia/nvidia.go
@@ -25,11 +25,6 @@ import (
)

const nvkindPackage = "github.com/NVIDIA/nvkind/cmd/nvkind"
-const nvkindVersion = "eeeb9ca30763177fbe7b4d10fb6b7e21725e2295"
-const nvkindRequiredGoVersion = "1.23"
-const kindNodeImageName = "kindest/node"
-
-const gpuOperatorVersion = "v24.9.2"

const nvkindClusterValues = `
image: %s
@@ -53,16 +48,16 @@ type KindCluster struct {
// clusters require a set of patches that aren't trivial. Instead of writing them all down here, we have
// decided to use the nvkind tool to create the cluster. This means that we cannot follow the same code path
// as for regular kind clusters.
-func NewKindCluster(env config.Env, vm *remote.Host, name string, kubeVersion string, opts ...pulumi.ResourceOption) (*KindCluster, error) {
+func NewKindCluster(env config.Env, vm *remote.Host, name string, clusterOpts *KindClusterOptions, opts ...pulumi.ResourceOption) (*KindCluster, error) {
// Configure the nvidia container toolkit
-cmd, err := configureContainerToolkit(env, vm, opts...)
+cmd, err := configureContainerToolkit(env, vm, clusterOpts, opts...)
if err != nil {
return nil, fmt.Errorf("failed to prepare NVIDIA runtime: %w", err)
}
opts = utils.MergeOptions(opts, utils.PulumiDependsOn(cmd))

// Create the cluster
-cluster, err := initNvkindCluster(env, vm, name, kubeVersion, opts...)
+cluster, err := initNvkindCluster(env, vm, name, clusterOpts, opts...)
if err != nil {
return nil, fmt.Errorf("failed to create nvkind cluster: %w", err)
}
@@ -78,7 +73,7 @@ func NewKindCluster(env config.Env, vm *remote.Host, name string, kubeVersion st
opts = append(opts, pulumi.Provider(cluster.KubeProvider), pulumi.Parent(cluster.KubeProvider), pulumi.DeletedWith(cluster.KubeProvider))

// Now install the operator
-operator, err := installGPUOperator(env, opts...)
+operator, err := installGPUOperator(env, clusterOpts, opts...)
if err != nil {
return nil, fmt.Errorf("failed to install GPU operator: %w", err)
}
@@ -89,7 +84,7 @@ func NewKindCluster(env config.Env, vm *remote.Host, name string, kubeVersion st
}, nil
}

-func configureContainerToolkit(env config.Env, vm *remote.Host, opts ...pulumi.ResourceOption) (pulumi.Resource, error) {
+func configureContainerToolkit(env config.Env, vm *remote.Host, clusterOpts *KindClusterOptions, opts ...pulumi.ResourceOption) (pulumi.Resource, error) {
// Ensure we have Docker
dockerManager, err := docker.NewManager(env, vm, opts...)
if err != nil {
@@ -119,14 +114,14 @@ func configureContainerToolkit(env config.Env, vm *remote.Host, opts ...pulumi.R
return vm.OS.Runner().Command(
env.CommonNamer().ResourceName("nvidia-ctk-check"),
&command.Args{
Create: pulumi.String("docker run --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu:20.04 nvidia-smi -L"),
Create: pulumi.Sprintf("docker run --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s nvidia-smi -L", clusterOpts.cudaSanityCheckImage),
},
utils.MergeOptions(opts, utils.PulumiDependsOn(ctkConfigureCmd))...,
)
}

// installNvkind installs the nvkind tool along with all of its prerequisites
-func installNvkind(env config.Env, vm *remote.Host, kindVersion string, kubeVersion string, opts ...pulumi.ResourceOption) (command.Command, error) {
+func installNvkind(env config.Env, vm *remote.Host, kindVersion string, clusterOpts *KindClusterOptions, opts ...pulumi.ResourceOption) (command.Command, error) {
// kind is a prerequisite for nvkind, which calls it under the hood
kindInstall, err := kubernetes.InstallKindBinary(env, vm, kindVersion, opts...)
if err != nil {
@@ -138,7 +133,7 @@ func installNvkind(env config.Env, vm *remote.Host, kindVersion string, kubeVers
env.CommonNamer().ResourceName("kubectl-install"),
&command.Args{
// use the snap installer rather than APT, as snap provides multiple versions
-Create: pulumi.Sprintf("sudo snap install kubectl --classic --channel=%s/stable", kubeVersion),
+Create: pulumi.Sprintf("sudo snap install kubectl --classic --channel=%s/stable", clusterOpts.kubeVersion),
},
opts...,
)
@@ -151,7 +146,7 @@ func installNvkind(env config.Env, vm *remote.Host, kindVersion string, kubeVers
env.CommonNamer().ResourceName("golang-install"),
&command.Args{
// use the snap installer rather than APT, as snap provides multiple versions
-Create: pulumi.Sprintf("sudo snap install --classic go --channel=%s/stable", nvkindRequiredGoVersion),
+Create: pulumi.Sprintf("sudo snap install --classic go --channel=%s/stable", clusterOpts.hostGoVersion),
},
opts...,
)
@@ -164,7 +159,7 @@ func installNvkind(env config.Env, vm *remote.Host, kindVersion string, kubeVers
env.CommonNamer().ResourceName("nvkind-install"),
&command.Args{
// Ensure it gets installed to the global $PATH to avoid having to copy it or change $PATH
Create: pulumi.Sprintf("sudo GOBIN=/usr/local/bin go install %s@%s", nvkindPackage, nvkindVersion),
Create: pulumi.Sprintf("sudo GOBIN=/usr/local/bin go install %s@%s", nvkindPackage, clusterOpts.nvkindVersion),
},
utils.MergeOptions(opts, utils.PulumiDependsOn(golangInstall, kindInstall, kubectlInstall))...,
)
@@ -177,16 +172,16 @@ func installNvkind(env config.Env, vm *remote.Host, kindVersion string, kubeVers

// initNvkindCluster creates a new Kubernetes cluster using nvkind so that nodes can be GPU-enabled, installing
// the necessary components and configuring the cluster.
-func initNvkindCluster(env config.Env, vm *remote.Host, name string, kubeVersion string, opts ...pulumi.ResourceOption) (*kubernetes.Cluster, error) {
+func initNvkindCluster(env config.Env, vm *remote.Host, name string, clusterOpts *KindClusterOptions, opts ...pulumi.ResourceOption) (*kubernetes.Cluster, error) {
return components.NewComponent(env, name, func(clusterComp *kubernetes.Cluster) error {
opts = utils.MergeOptions[pulumi.ResourceOption](opts, pulumi.Parent(clusterComp))
-kindVersionConfig, err := kubernetes.GetKindVersionConfig(kubeVersion)
+kindVersionConfig, err := kubernetes.GetKindVersionConfig(clusterOpts.kubeVersion)
if err != nil {
return err
}

// Install nvkind to create the cluster
-nvkindInstall, err := installNvkind(env, vm, kindVersionConfig.KindVersion, kubeVersion, opts...)
+nvkindInstall, err := installNvkind(env, vm, kindVersionConfig.KindVersion, clusterOpts, opts...)
if err != nil {
return fmt.Errorf("failed to install nvkind: %w", err)
}
@@ -202,7 +197,7 @@ func initNvkindCluster(env config.Env, vm *remote.Host, name string, kubeVersion
return err
}

-nodeImage := fmt.Sprintf("%s/%s:%s", env.InternalDockerhubMirror(), kindNodeImageName, kindVersionConfig.NodeImageVersion)
+nodeImage := fmt.Sprintf("%s/%s:%s", env.InternalDockerhubMirror(), clusterOpts.kindImage, kindVersionConfig.NodeImageVersion)
nvkindValuesPath := "/tmp/nvkind-values.yaml"
nvkindValuesContent := pulumi.Sprintf(nvkindClusterValues, nodeImage)
nvkindValues, err := vm.OS.FileManager().CopyInlineFile(
@@ -255,7 +250,7 @@ func initNvkindCluster(env config.Env, vm *remote.Host, name string, kubeVersion
}

// installGPUOperator installs the GPU operator in the cluster
-func installGPUOperator(env config.Env, opts ...pulumi.ResourceOption) (*helm.Release, error) {
+func installGPUOperator(env config.Env, clusterOpts *KindClusterOptions, opts ...pulumi.ResourceOption) (*helm.Release, error) {
// Create namespace
operatorNs := "gpu-operator"
ns, err := corev1.NewNamespace(env.Ctx(), operatorNs, &corev1.NamespaceArgs{
@@ -274,7 +269,7 @@ func installGPUOperator(env config.Env, opts ...pulumi.ResourceOption) (*helm.Re
},
Chart: pulumi.String("gpu-operator"),
Namespace: pulumi.String(operatorNs),
-Version: pulumi.String(gpuOperatorVersion),
+Version: pulumi.String(clusterOpts.gpuOperatorVersion),
CreateNamespace: pulumi.Bool(true),
DependencyUpdate: pulumi.BoolPtr(true),
}, opts...)
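The signature changes above ripple out to every caller of NewKindCluster, which now receives a *KindClusterOptions instead of a bare kubeVersion string. A minimal migration sketch, assuming the package is imported as nvidia and that env and vm are provisioned by the surrounding code (the variable names and the "1.29" version are illustrative, not taken from this commit):

	// Before: cluster, err := nvidia.NewKindCluster(env, vm, "gpu-kind", "1.29")
	// After: build the options first. Note that NewKindClusterOptions sets no
	// default for kubeVersion, so WithKubeVersion still has to be passed explicitly.
	clusterOpts := nvidia.NewKindClusterOptions(
		nvidia.WithKubeVersion("1.29"),
	)
	cluster, err := nvidia.NewKindCluster(env, vm, "gpu-kind", clusterOpts)
	if err != nil {
		return fmt.Errorf("failed to create nvkind cluster: %w", err)
	}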
104 changes: 104 additions & 0 deletions components/kubernetes/nvidia/options.go
@@ -0,0 +1,104 @@
package nvidia

// defaultGpuOperatorVersion is the default version of the Nvidia GPU operator to install
const defaultGpuOperatorVersion = "v24.9.2"

// defaultNvkindVersion is the default version of the nvkind utility to install
// Must be a valid reference for the github.com/NVIDIA/nvkind repository
const defaultNvkindVersion = "eeeb9ca30763177fbe7b4d10fb6b7e21725e2295"

// defaultHostGoVersion is the default version of Go to install in the host. This version
// must be compatible with the nvkind utility
const defaultHostGoVersion = "1.23"

// defaultKindNodeImage is the default image to use for the kind nodes.
const defaultKindNodeImage = "kindest/node"

// defaultCudaSanityCheckImage is a Docker image that contains a CUDA sample to
// validate the GPU setup with the default CUDA installation. Note that the CUDA
// version in this image must be equal to or less than the one installed in the
// AMI.
const defaultCudaSanityCheckImage = "669783387624.dkr.ecr.us-east-1.amazonaws.com/dockerhub/nvidia/cuda:12.6.3-base-ubuntu22.04"

// KindClusterOptions contains the options for creating a kind cluster with the Nvidia GPU operator
type KindClusterOptions struct {
// kubeVersion is the version of Kubernetes to install in the kind cluster
kubeVersion string

// gpuOperatorVersion is the version of the Nvidia GPU operator to install
gpuOperatorVersion string

// nvkindVersion is the version of the nvkind utility to install
nvkindVersion string

// hostGoVersion is the version of Go to install in the host
hostGoVersion string

// kindImage is the image to use for the kind nodes
kindImage string

// cudaSanityCheckImage is a Docker image to use when performing sanity checks for validation of the GPU setup in containers
cudaSanityCheckImage string
}

// KindClusterOption is a function that modifies a KindClusterOptions
type KindClusterOption func(*KindClusterOptions)

// WithKubeVersion sets the version of Kubernetes to install in the kind cluster
func WithKubeVersion(version string) KindClusterOption {
return func(o *KindClusterOptions) {
o.kubeVersion = version
}
}

// WithGPUOperatorVersion sets the version of the Nvidia GPU operator to install
func WithGPUOperatorVersion(version string) KindClusterOption {
return func(o *KindClusterOptions) {
o.gpuOperatorVersion = version
}
}

// WithNvkindVersion sets the version of the nvkind utility to install
func WithNvkindVersion(version string) KindClusterOption {
return func(o *KindClusterOptions) {
o.nvkindVersion = version
}
}

// WithHostGoVersion sets the version of Go to install in the host
func WithHostGoVersion(version string) KindClusterOption {
return func(o *KindClusterOptions) {
o.hostGoVersion = version
}
}

// WithKindImage sets the image to use for the kind nodes. The tag used for this image will
// be the one defined by kubernetes.GetKindVersionConfig, based on the Kubernetes version in use.
func WithKindImage(image string) KindClusterOption {
return func(o *KindClusterOptions) {
o.kindImage = image
}
}

// WithCudaSanityCheckImage sets the image to use for the CUDA sanity check commands. Note that
// the CUDA version in this image must be equal to or less than the one installed in the AMI.
func WithCudaSanityCheckImage(image string) KindClusterOption {
return func(o *KindClusterOptions) {
o.cudaSanityCheckImage = image
}
}

// NewKindClusterOptions creates a new KindClusterOptions, starting from the defaults and applying the given options
func NewKindClusterOptions(opts ...KindClusterOption) *KindClusterOptions {
o := &KindClusterOptions{
gpuOperatorVersion: defaultGpuOperatorVersion,
nvkindVersion: defaultNvkindVersion,
hostGoVersion: defaultHostGoVersion,
kindImage: defaultKindNodeImage,
cudaSanityCheckImage: defaultCudaSanityCheckImage,
}
for _, opt := range opts {
opt(o)
}
return o
}
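
As a usage sketch of the new constructor (the override values below are illustrative, not defaults shipped by this commit except where noted):

	opts := nvidia.NewKindClusterOptions(
		nvidia.WithKubeVersion("1.29"),           // no default exists for this field
		nvidia.WithGPUOperatorVersion("v24.9.2"), // override; happens to match the default
		nvidia.WithNvkindVersion("main"),         // any valid ref of github.com/NVIDIA/nvkind
	)

Calling NewKindClusterOptions() with no arguments yields the defaults declared at the top of this file; kubeVersion is the one field left empty, which is why callers must set it explicitly. The functional-options pattern keeps the NewKindCluster signature stable as more knobs are added.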
