Add the RayJob MultiKueue e2e tests
mszadkow committed Jan 14, 2025
1 parent 3320b1b commit c03cf53
Showing 5 changed files with 103 additions and 18 deletions.
7 changes: 7 additions & 0 deletions charts/kueue/templates/rbac/role.yaml
@@ -298,9 +298,16 @@ rules:
  - rayclusters/finalizers
  - rayclusters/status
  - rayjobs/finalizers
  verbs:
  - get
  - update
- apiGroups:
  - ray.io
  resources:
  - rayjobs/status
  verbs:
  - get
  - patch
  - update
- apiGroups:
  - scheduling.k8s.io
7 changes: 7 additions & 0 deletions config/components/rbac/role.yaml
@@ -297,9 +297,16 @@ rules:
  - rayclusters/finalizers
  - rayclusters/status
  - rayjobs/finalizers
  verbs:
  - get
  - update
- apiGroups:
  - ray.io
  resources:
  - rayjobs/status
  verbs:
  - get
  - patch
  - update
- apiGroups:
  - scheduling.k8s.io
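
Both copies of role.yaml (the Helm chart template and the kustomize component) mirror the RBAC rules that controller-gen derives from kubebuilder markers, so a rule like the new rayjobs/status block above typically originates from a marker on the corresponding controller and a manifest regeneration rather than a hand edit. A hedged sketch of such a marker (the exact source file and placement are assumptions, not shown in this commit):

// +kubebuilder:rbac:groups=ray.io,resources=rayjobs/status,verbs=get;update;patch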
90 changes: 73 additions & 17 deletions test/e2e/multikueue/e2e_test.go
@@ -18,13 +18,16 @@ package mke2e

import (
"fmt"
"os"
"os/exec"
"runtime"

"github.com/google/go-cmp/cmp/cmpopts"
kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apimeta "k8s.io/apimachinery/pkg/api/meta"
@@ -40,11 +43,13 @@ import (
	workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
	workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob"
	workloadmpijob "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob"
	workloadrayjob "sigs.k8s.io/kueue/pkg/controller/jobs/rayjob"
	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
	testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset"
	testingmpijob "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob"
	testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob"
	testingrayjob "sigs.k8s.io/kueue/pkg/util/testingjobs/rayjob"
	"sigs.k8s.io/kueue/pkg/workload"
	"sigs.k8s.io/kueue/test/util"
)
@@ -491,6 +496,58 @@ var _ = ginkgo.Describe("MultiKueue", func() {
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})
		})

		ginkgo.It("Should run a RayJob on worker if admitted", func() {
			var found bool
			E2eKuberayTestImage, found := os.LookupEnv("KUBERAY_RAY_IMAGE")
			gomega.Expect(found).To(gomega.BeTrue())
			if runtime.GOOS == "darwin" {
				E2eKuberayTestImage, found = os.LookupEnv("KUBERAY_RAY_IMAGE_ARM")
				gomega.Expect(found).To(gomega.BeTrue())
			}

			// Since it requires 1.5 CPU in total (1 for the head node plus 0.5 for the worker node), this job can only be admitted in worker 1.
			rayjob := testingrayjob.MakeJob("rayjob1", managerNs.Name).
				Suspend(true).
				Queue(managerLq.Name).
				WithSubmissionMode(rayv1.K8sJobMode).
				Request(rayv1.HeadNode, corev1.ResourceCPU, "1").
				Request(rayv1.WorkerNode, corev1.ResourceCPU, "0.5").
				Entrypoint("python -c \"import ray; ray.init(); print(ray.cluster_resources())\"").
				RayVersion("2.9.0").
				Image(rayv1.HeadNode, E2eKuberayTestImage, []string{}).
				Image(rayv1.WorkerNode, E2eKuberayTestImage, []string{}).
				Obj()

			ginkgo.By("Creating the RayJob", func() {
				gomega.Expect(k8sManagerClient.Create(ctx, rayjob)).Should(gomega.Succeed())
			})

			wlLookupKey := types.NamespacedName{Name: workloadrayjob.GetWorkloadNameForRayJob(rayjob.Name, rayjob.UID), Namespace: managerNs.Name}
			// The execution should be handed off to worker1.
			waitForJobAdmitted(wlLookupKey, multiKueueAc.Name, "worker1")

			ginkgo.By("Waiting for the RayJob to finish", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					createdRayJob := &rayv1.RayJob{}
					g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(rayjob), createdRayJob)).To(gomega.Succeed())
					g.Expect(createdRayJob.Status.JobDeploymentStatus).To(gomega.Equal(rayv1.JobDeploymentStatusComplete))
					finishReasonMessage := "Job finished successfully."
					checkFinishStatusCondition(g, wlLookupKey, finishReasonMessage)
				}, 5*util.LongTimeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("Checking that no objects are left in the worker clusters and that the RayJob is completed", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					workerWl := &kueue.Workload{}
					g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
					g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
					workerRayJob := &rayv1.RayJob{}
					g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(rayjob), workerRayJob)).To(utiltesting.BeNotFoundError())
					g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(rayjob), workerRayJob)).To(utiltesting.BeNotFoundError())
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})
		})
	})
	ginkgo.When("The connection to a worker cluster is unreliable", func() {
		ginkgo.It("Should update the cluster status to reflect the connection state", func() {
@@ -571,23 +628,22 @@ var _ = ginkgo.Describe("MultiKueue", func() {
})

func waitForJobAdmitted(wlLookupKey types.NamespacedName, acName, workerName string) {
	ginkgo.By(fmt.Sprintf("Waiting to be admitted in %s and manager", workerName), func() {
		gomega.Eventually(func(g gomega.Gomega) {
			createdWorkload := &kueue.Workload{}
			g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed())
			g.Expect(apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)).To(gomega.BeComparableTo(&metav1.Condition{
				Type:    kueue.WorkloadAdmitted,
				Status:  metav1.ConditionTrue,
				Reason:  "Admitted",
				Message: "The workload is admitted",
			}, util.IgnoreConditionTimestampsAndObservedGeneration))
			g.Expect(workload.FindAdmissionCheck(createdWorkload.Status.AdmissionChecks, acName)).To(gomega.BeComparableTo(&kueue.AdmissionCheckState{
				Name:    acName,
				State:   kueue.CheckStateReady,
				Message: fmt.Sprintf(`The workload got reservation on "%s"`, workerName),
			}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
		}, util.Timeout, util.Interval).Should(gomega.Succeed())
	})
	ginkgo.By(fmt.Sprintf("Waiting to be admitted in %s and manager", workerName))
	gomega.EventuallyWithOffset(1, func(g gomega.Gomega) {
		createdWorkload := &kueue.Workload{}
		g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdWorkload)).To(gomega.Succeed())
		g.Expect(apimeta.FindStatusCondition(createdWorkload.Status.Conditions, kueue.WorkloadAdmitted)).To(gomega.BeComparableTo(&metav1.Condition{
			Type:    kueue.WorkloadAdmitted,
			Status:  metav1.ConditionTrue,
			Reason:  "Admitted",
			Message: "The workload is admitted",
		}, util.IgnoreConditionTimestampsAndObservedGeneration))
		g.Expect(workload.FindAdmissionCheck(createdWorkload.Status.AdmissionChecks, acName)).To(gomega.BeComparableTo(&kueue.AdmissionCheckState{
			Name:    acName,
			State:   kueue.CheckStateReady,
			Message: fmt.Sprintf(`The workload got reservation on "%s"`, workerName),
		}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
	}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
}
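
The change above drops the ginkgo.By closure in favor of a plain ginkgo.By call followed by gomega.EventuallyWithOffset(1, ...): the offset of 1 makes gomega report failures against the caller of waitForJobAdmitted instead of the helper itself, and the polling window is widened from util.Timeout to util.LongTimeout. A typical call site, as used by the RayJob test above:

	waitForJobAdmitted(wlLookupKey, multiKueueAc.Name, "worker1")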

func checkFinishStatusCondition(g gomega.Gomega, wlLookupKey types.NamespacedName, finishReasonMessage string) {
8 changes: 7 additions & 1 deletion test/e2e/multikueue/suite_test.go
@@ -27,6 +27,7 @@ import (
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	authenticationv1 "k8s.io/api/authentication/v1"
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
@@ -98,6 +99,8 @@ func kubeconfigForMultiKueueSA(ctx context.Context, c client.Client, restConfig
			policyRule(kftraining.SchemeGroupVersion.Group, "xgboostjobs/status", "get"),
			policyRule(kfmpi.SchemeGroupVersion.Group, "mpijobs", resourceVerbs...),
			policyRule(kfmpi.SchemeGroupVersion.Group, "mpijobs/status", "get"),
			policyRule(rayv1.SchemeGroupVersion.Group, "rayjobs", resourceVerbs...),
			policyRule(rayv1.SchemeGroupVersion.Group, "rayjobs/status", "get"),
		},
	}
	err := c.Create(ctx, cr)
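
policyRule is a helper defined elsewhere in this test suite; judging only from how it is called above, a minimal sketch of such a helper (its real implementation is not part of this diff, so treat it as an assumption) could be:

	func policyRule(group, resource string, verbs ...string) rbacv1.PolicyRule {
		return rbacv1.PolicyRule{
			APIGroups: []string{group},
			Resources: []string{resource},
			Verbs:     verbs,
		}
	}

with rbacv1 being the k8s.io/api/rbac/v1 package.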
@@ -273,7 +276,10 @@ var _ = ginkgo.BeforeSuite(func() {
	util.WaitForKubeFlowMPIOperatorAvailability(ctx, k8sWorker1Client)
	util.WaitForKubeFlowMPIOperatorAvailability(ctx, k8sWorker2Client)

	ginkgo.GinkgoLogr.Info("Kueue and JobSet operators are available in all the clusters", "waitingTime", time.Since(waitForAvailableStart))
	util.WaitForKubeRayOperatorAvailability(ctx, k8sWorker1Client)
	util.WaitForKubeRayOperatorAvailability(ctx, k8sWorker2Client)

	ginkgo.GinkgoLogr.Info("Kueue and all integration operators are available in all the clusters", "waitingTime", time.Since(waitForAvailableStart))

	discoveryClient, err := discovery.NewDiscoveryClientForConfig(managerCfg)
	gomega.Expect(err).NotTo(gomega.HaveOccurred())
9 changes: 9 additions & 0 deletions test/util/e2e.go
@@ -9,6 +9,7 @@ import (
	kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	"github.com/onsi/gomega"
	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
@@ -60,6 +61,9 @@ func CreateClientUsingCluster(kContext string) (client.WithWatch, *rest.Config)
	err = kfmpi.AddToScheme(scheme.Scheme)
	gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())

	err = rayv1.AddToScheme(scheme.Scheme)
	gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())

	client, err := client.NewWithWatch(cfg, client.Options{Scheme: scheme.Scheme})
	gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred())
	return client, cfg
@@ -130,3 +134,8 @@ func WaitForKubeFlowMPIOperatorAvailability(ctx context.Context, k8sClient clien
	kftoKey := types.NamespacedName{Namespace: "mpi-operator", Name: "mpi-operator"}
	waitForOperatorAvailability(ctx, k8sClient, kftoKey)
}

func WaitForKubeRayOperatorAvailability(ctx context.Context, k8sClient client.Client) {
	kroKey := types.NamespacedName{Namespace: "ray-system", Name: "kuberay-operator"}
	waitForOperatorAvailability(ctx, k8sClient, kroKey)
}
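
waitForOperatorAvailability is an existing helper in test/util that the new function simply points at the kuberay-operator object in the ray-system namespace (most likely its Deployment). Its implementation is not shown in this diff; a hypothetical, self-contained sketch of how such a wait is commonly written with controller-runtime and gomega (names and timeouts here are assumptions, not the repo's actual code):

	import (
		"context"
		"time"

		"github.com/onsi/gomega"
		appsv1 "k8s.io/api/apps/v1"
		"k8s.io/apimachinery/pkg/types"
		"sigs.k8s.io/controller-runtime/pkg/client"
	)

	// waitForOperatorDeployment polls the named Deployment until at least one
	// replica reports available, failing the spec if that never happens.
	func waitForOperatorDeployment(ctx context.Context, c client.Client, key types.NamespacedName) {
		gomega.EventuallyWithOffset(1, func(g gomega.Gomega) {
			dep := &appsv1.Deployment{}
			g.Expect(c.Get(ctx, key, dep)).To(gomega.Succeed())
			g.Expect(dep.Status.AvailableReplicas).To(gomega.BeNumerically(">", 0))
		}, 2*time.Minute, time.Second).Should(gomega.Succeed())
	}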
