@@ -24,13 +24,14 @@ import (
2424)
2525
2626var (
27- testenv env.Environment
28- nodeType * string
29- efaEnabled * bool
30- nvidiaTestImage * string
31- nodeCount int
32- gpuPerNode int
33- efaPerNode int
27+ testenv env.Environment
28+ nodeType * string
29+ installDevicePlugin * bool
30+ efaEnabled * bool
31+ nvidiaTestImage * string
32+ nodeCount int
33+ gpuPerNode int
34+ efaPerNode int
3435)
3536
3637var (
@@ -42,10 +43,97 @@ var (
4243 efaDevicePluginManifest []byte
4344)
4445
46+ func deployMPIOperator (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
47+ dep := appsv1.Deployment {
48+ ObjectMeta : metav1.ObjectMeta {Name : "mpi-operator" , Namespace : "mpi-operator" },
49+ }
50+ err := wait .For (conditions .New (config .Client ().Resources ()).DeploymentConditionMatch (& dep , appsv1 .DeploymentAvailable , v1 .ConditionTrue ),
51+ wait .WithContext (ctx ))
52+ if err != nil {
53+ return ctx , fmt .Errorf ("failed to deploy mpi-operator: %v" , err )
54+ }
55+ return ctx , nil
56+ }
57+
58+ func deployNvidiaDevicePlugin (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
59+ ds := appsv1.DaemonSet {
60+ ObjectMeta : metav1.ObjectMeta {Name : "nvidia-device-plugin-daemonset" , Namespace : "kube-system" },
61+ }
62+ err := wait .For (fwext .NewConditionExtension (config .Client ().Resources ()).DaemonSetReady (& ds ),
63+ wait .WithContext (ctx ))
64+ if err != nil {
65+ return ctx , fmt .Errorf ("failed to deploy nvidia-device-plugin: %v" , err )
66+ }
67+ return ctx , nil
68+ }
69+
70+ func deployEFAPlugin (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
71+ err := fwext .ApplyManifests (config .Client ().RESTConfig (), efaDevicePluginManifest )
72+ if err != nil {
73+ return ctx , err
74+ }
75+
76+ ds := appsv1.DaemonSet {
77+ ObjectMeta : metav1.ObjectMeta {Name : "aws-efa-k8s-device-plugin-daemonset" , Namespace : "kube-system" },
78+ }
79+ err = wait .For (fwext .NewConditionExtension (config .Client ().Resources ()).DaemonSetReady (& ds ),
80+ wait .WithContext (ctx ))
81+ if err != nil {
82+ return ctx , fmt .Errorf ("failed to deploy efa-device-plugin: %v" , err )
83+ }
84+
85+ return ctx , nil
86+ }
87+
88+ func checkNodeTypes (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
89+ clientset , err := kubernetes .NewForConfig (config .Client ().RESTConfig ())
90+ if err != nil {
91+ return ctx , err
92+ }
93+
94+ nodes , err := clientset .CoreV1 ().Nodes ().List (ctx , metav1.ListOptions {})
95+ if err != nil {
96+ return ctx , err
97+ }
98+
99+ singleNodeType := true
100+ for i := 1 ; i < len (nodes .Items )- 1 ; i ++ {
101+ if nodes .Items [i ].Labels ["node.kubernetes.io/instance-type" ] != nodes .Items [i - 1 ].Labels ["node.kubernetes.io/instance-type" ] {
102+ singleNodeType = false
103+ }
104+ }
105+ if ! singleNodeType {
106+ return ctx , fmt .Errorf ("Node types are not the same, all node types must be the same in the cluster" )
107+ }
108+
109+ if * nodeType != "" {
110+ for _ , v := range nodes .Items {
111+ if v .Labels ["node.kubernetes.io/instance-type" ] == * nodeType {
112+ nodeCount ++
113+ gpu := v .Status .Capacity ["nvidia.com/gpu" ]
114+ gpuPerNode = int (gpu .Value ())
115+ efa := v .Status .Capacity ["vpc.amazonaws.com/efa" ]
116+ efaPerNode = int (efa .Value ())
117+ }
118+ }
119+ } else {
120+ log .Printf ("No node type specified. Using the node type %s in the node groups." , nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
121+ nodeType = aws .String (nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
122+ nodeCount = len (nodes .Items )
123+ gpu := nodes .Items [0 ].Status .Capacity ["nvidia.com/gpu" ]
124+ gpuPerNode = int (gpu .Value ())
125+ efa := nodes .Items [0 ].Status .Capacity ["vpc.amazonaws.com/efa" ]
126+ efaPerNode = int (efa .Value ())
127+ }
128+
129+ return ctx , nil
130+ }
131+
45132func TestMain (m * testing.M ) {
46133 nodeType = flag .String ("nodeType" , "" , "node type for the tests" )
47134 nvidiaTestImage = flag .String ("nvidiaTestImage" , "" , "nccl test image for nccl tests" )
48135 efaEnabled = flag .Bool ("efaEnabled" , false , "enable efa tests" )
136+ installDevicePlugin = flag .Bool ("installDevicePlugin" , true , "install nvidia device plugin" )
49137 cfg , err := envconf .NewFromFlags ()
50138 if err != nil {
51139 log .Fatalf ("failed to initialize test environment: %v" , err )
@@ -57,95 +145,30 @@ func TestMain(m *testing.M) {
57145
58146 // all NVIDIA tests require the device plugin and MPI operator
59147 manifests := [][]byte {
60- nvidiaDevicePluginManifest ,
61148 mpiOperatorManifest ,
62149 }
63-
64- testenv .Setup (
150+ setUpFunctions := []env.Func {
65151 func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
66152 err := fwext .ApplyManifests (config .Client ().RESTConfig (), manifests ... )
67153 if err != nil {
68154 return ctx , err
69155 }
70156 return ctx , nil
71157 },
72- func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
73- dep := appsv1.Deployment {
74- ObjectMeta : metav1.ObjectMeta {Name : "mpi-operator" , Namespace : "mpi-operator" },
75- }
76- err := wait .For (conditions .New (config .Client ().Resources ()).DeploymentConditionMatch (& dep , appsv1 .DeploymentAvailable , v1 .ConditionTrue ),
77- wait .WithContext (ctx ))
78- if err != nil {
79- return ctx , fmt .Errorf ("failed to deploy mpi-operator: %v" , err )
80- }
81- return ctx , nil
82- },
83- func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
84- ds := appsv1.DaemonSet {
85- ObjectMeta : metav1.ObjectMeta {Name : "nvidia-device-plugin-daemonset" , Namespace : "kube-system" },
86- }
87- err := wait .For (fwext .NewConditionExtension (config .Client ().Resources ()).DaemonSetReady (& ds ),
88- wait .WithContext (ctx ))
89- if err != nil {
90- return ctx , fmt .Errorf ("failed to deploy nvidia-device-plugin: %v" , err )
91- }
92- return ctx , nil
93- },
94- func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
95- clientset , err := kubernetes .NewForConfig (cfg .Client ().RESTConfig ())
96- if err != nil {
97- return ctx , err
98- }
99- if * efaEnabled {
100- err := fwext .ApplyManifests (cfg .Client ().RESTConfig (), efaDevicePluginManifest )
101- if err != nil {
102- return ctx , err
103- }
104- ds := appsv1.DaemonSet {
105- ObjectMeta : metav1.ObjectMeta {Name : "aws-efa-k8s-device-plugin-daemonset" , Namespace : "kube-system" },
106- }
107- err = wait .For (fwext .NewConditionExtension (cfg .Client ().Resources ()).DaemonSetReady (& ds ),
108- wait .WithContext (ctx ))
109- if err != nil {
110- return ctx , fmt .Errorf ("failed to deploy efa-device-plugin: %v" , err )
111- }
112- }
113- nodes , err := clientset .CoreV1 ().Nodes ().List (ctx , metav1.ListOptions {})
114- if err != nil {
115- return ctx , err
116- }
158+ deployMPIOperator ,
159+ checkNodeTypes ,
160+ }
117161
118- singleNodeType := true
119- for i := 1 ; i < len (nodes .Items )- 1 ; i ++ {
120- if nodes .Items [i ].Labels ["node.kubernetes.io/instance-type" ] != nodes .Items [i - 1 ].Labels ["node.kubernetes.io/instance-type" ] {
121- singleNodeType = false
122- }
123- }
124- if ! singleNodeType {
125- return ctx , fmt .Errorf ("Node types are not the same, all node types must be the same in the cluster" )
126- }
127- if * nodeType != "" {
128- for _ , v := range nodes .Items {
129- if v .Labels ["node.kubernetes.io/instance-type" ] == * nodeType {
130- nodeCount ++
131- gpu := v .Status .Capacity ["nvidia.com/gpu" ]
132- gpuPerNode = int (gpu .Value ())
133- efa := v .Status .Capacity ["vpc.amazonaws.com/efa" ]
134- efaPerNode = int (efa .Value ())
135- }
136- }
137- } else {
138- log .Printf ("No node type specified. Using the node type %s in the node groups." , nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
139- nodeType = aws .String (nodes .Items [0 ].Labels ["node.kubernetes.io/instance-type" ])
140- nodeCount = len (nodes .Items )
141- gpu := nodes .Items [0 ].Status .Capacity ["nvidia.com/gpu" ]
142- gpuPerNode = int (gpu .Value ())
143- efa := nodes .Items [0 ].Status .Capacity ["vpc.amazonaws.com/efa" ]
144- efaPerNode = int (efa .Value ())
145- }
146- return ctx , nil
147- },
148- )
162+ if * installDevicePlugin {
163+ manifests = append (manifests , nvidiaDevicePluginManifest )
164+ setUpFunctions = append (setUpFunctions , deployNvidiaDevicePlugin )
165+ }
166+
167+ if * efaEnabled {
168+ setUpFunctions = append (setUpFunctions , deployEFAPlugin )
169+ }
170+
171+ testenv .Setup (setUpFunctions ... )
149172
150173 testenv .Finish (
151174 func (ctx context.Context , config * envconf.Config ) (context.Context , error ) {
0 commit comments