Skip to content

Commit

Permalink
Merge branch 'dev' into release-1.18.3-patch
Browse files Browse the repository at this point in the history
  • Loading branch information
Uburro authored Feb 20, 2025
2 parents dfe10cf + a7de89d commit ed7517d
Show file tree
Hide file tree
Showing 114 changed files with 23,147 additions and 3,419 deletions.
5 changes: 5 additions & 0 deletions .codespellrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# codespell settings: which paths are scanned and which words are tolerated.
[codespell]
# Ref: https://github.com/codespell-project/codespell#using-a-config-file
# Comma-separated glob patterns excluded from spell checking: anything matching
# .git*, SVG files, go.sum, and this config file itself.
skip = .git*,*.svg,go.sum,.codespellrc
# Also scan hidden (dot-prefixed) files and directories.
check-hidden = true
# Comma-separated words that must not be reported as misspellings.
ignore-words-list = notin
4 changes: 2 additions & 2 deletions .github/workflows/gpubench_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:

steps:
- name: Harden Runner
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit

Expand All @@ -43,7 +43,7 @@ jobs:

steps:
- name: Harden Runner
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit

Expand Down
24 changes: 20 additions & 4 deletions .github/workflows/one_job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:

steps:
- name: Harden Runner
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit

Expand All @@ -52,7 +52,7 @@ jobs:

steps:
- name: Harden Runner
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit

Expand All @@ -69,8 +69,19 @@ jobs:
echo "UNSTABLE - is ${{ needs.pre-build.outputs.unstable }}"
make get-version UNSTABLE=${{ needs.pre-build.outputs.unstable }}
- name: Check if version synced
run: make test-version-sync
- name: Run make sync-version-from-scratch
run: |
make kustomize helmify yq
make sync-version-from-scratch
- name: Check for uncommitted changes
run: |
if [[ -n "$(git status --porcelain)" ]]; then
echo "❌ Uncommitted changes detected after make sync-version-from-scratch"
git diff
exit 1
fi
shell: bash

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca # v3.9.0
Expand Down Expand Up @@ -133,6 +144,11 @@ jobs:
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=populate_jail DOCKERFILE=populate_jail/populate_jail.dockerfile
make docker-push UNSTABLE="${UNSTABLE}" IMAGE_NAME=populate_jail
echo "Building image of the soperatorchecks"
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=soperatorchecks DOCKERFILE=soperatorchecks.dockerfile IMAGE_VERSION="$OPERATOR_IMAGE_TAG"
echo "Pushing image of the soperatorchecks"
make docker-push UNSTABLE="${UNSTABLE}" IMAGE_NAME=soperatorchecks IMAGE_VERSION="$OPERATOR_IMAGE_TAG"
echo "Building image of the operator"
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=slurm-operator DOCKERFILE=Dockerfile IMAGE_VERSION="$OPERATOR_IMAGE_TAG"
echo "Pushing image of the operator"
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ bin
cover.out
release_all.sh
upload_to_build_agent.sh
.vscode
49 changes: 49 additions & 0 deletions .golangci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# golangci-lint configuration.
linters:
  # Enable every linter that belongs to one of these presets.
  presets:
    - bugs
    - complexity
    - error
    - format
    - import
    - metalinter
    - module
    - performance
    - style
    - test
    - unused
  # Linters switched off even though a preset above would enable them.
  # NOTE(review): newer golangci-lint releases rename `gomnd` to `mnd` —
  # confirm which name the pinned linter version expects.
  disable:
    - depguard
    - ineffassign
    - funlen
    - forcetypeassert
    - testpackage
    - tagliatelle
    - godot
    - misspell
    - goconst
    - dupl
    - gci
    - whitespace
    - gochecknoinits
    - gocognit
    - nestif
    - gocyclo
    - maintidx
    - godox
    - gofumpt
    - gomnd
    - lll
    - nlreturn
    - nolintlint
    - wsl
    - prealloc
  # Restrict the enabled set to linters that golangci-lint classifies as fast.
  fast: true

output:
  # Report format(s); with no explicit path the report goes to stdout.
  formats:
    - format: colored-line-number

run:
  # Resolve reported file paths relative to the directory holding go.mod.
  relative-path-mode: gomod
  # Permit several golangci-lint processes to run at the same time...
  allow-parallel-runners: true
  # ...or to queue up behind a lock instead of failing.
  allow-serial-runners: true
11 changes: 11 additions & 0 deletions .mockery.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# mockery configuration: which interfaces get generated mocks and where the
# generated files are written.
# Generate the EXPECT() helper API on every mock.
with-expecter: true
# Opt-in behaviour switches of mockery itself.
# NOTE(review): exact semantics depend on the pinned mockery version — confirm
# against its release notes before changing.
issue-845-fix: true
resolve-type-alias: false
packages:
  nebius.ai/slurm-operator/internal/slurmapi:
    interfaces:
      # Only the Client interface of this package is mocked.
      Client:
        config:
          # Output goes to a "fake" directory next to the interface, in a
          # package named "fake", one file per interface.
          dir: "{{.InterfaceDirRelative}}/fake"
          outpkg: "fake"
          filename: "mock_{{ .InterfaceName | camelcase | firstLower }}.go"
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
go build -o slurm_operator ./cmd/

#######################################################################################################################
FROM alpine:latest@sha256:56fa17d2a7e7f168a043a2712e63aed1f8543aeafdcee47c58dcffe38ed51099 AS slurm-operator
FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS slurm-operator

COPY --from=operator_builder /operator/slurm_operator /usr/bin/

Expand Down
42 changes: 33 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec

# Limit the scope of generation otherwise it will try to generate configs for non-controller code
GENPATH = "./api/v1;"
GENPATH = "./api/v1;./api/v1alpha1;"

CHART_PATH = helm
CHART_OPERATOR_PATH = $(CHART_PATH)/soperator
CHART_SOPERATORCHECKS_PATH = $(CHART_PATH)/soperatorchecks
CHART_NODECONFIGURATOR_PATH = $(CHART_PATH)/nodeconfigurator
CHART_OPERATOR_CRDS_PATH = $(CHART_PATH)/soperator-crds
CHART_CLUSTER_PATH = $(CHART_PATH)/slurm-cluster
CHART_STORAGE_PATH = $(CHART_PATH)/slurm-cluster-storage
Expand Down Expand Up @@ -79,8 +81,9 @@ help: ## Display this help.
.PHONY: manifests
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) crd webhook paths=$(GENPATH) output:crd:artifacts:config=config/crd/bases
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./internal/controller/clustercontroller/..." output:artifacts:config=config/rbac/clustercontroller/
$(CONTROLLER_GEN) rbac:roleName=node-configurator-role paths="./internal/rebooter/..." output:artifacts:config=config/rbac/node-configurator/
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./internal/controller/..." output:artifacts:config=config/rbac/clustercontroller/
$(CONTROLLER_GEN) rbac:roleName=nodeconfigurator-role paths="./internal/rebooter/..." output:artifacts:config=config/rbac/nodeconfigurator/
$(CONTROLLER_GEN) rbac:roleName=soperator-checks-role paths="./internal/soperatorchecks/..." output:artifacts:config=config/rbac/soperatorchecks/
.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object paths=$(GENPATH)
Expand All @@ -107,11 +110,19 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes

.PHONY: helm
helm: generate manifests ## Update soperator Helm chart
$(KUSTOMIZE) build config/crd > $(CHART_OPERATOR_PATH)/crds/slurmcluster-crd.yaml
$(KUSTOMIZE) build config/crd > $(CHART_OPERATOR_CRDS_PATH)/templates/slurmcluster-crd.yaml
$(KUSTOMIZE) build config/crd/bases > $(CHART_OPERATOR_PATH)/crds/slurmcluster-crd.yaml
$(KUSTOMIZE) build config/crd/bases > $(CHART_OPERATOR_CRDS_PATH)/templates/slurmcluster-crd.yaml
# helmify overwrites values.yaml, so back up the existing file before running it
mv $(CHART_OPERATOR_PATH)/values.yaml $(CHART_OPERATOR_PATH)/values.yaml.bak
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/soperator-helm | $(HELMIFY) $(CHART_OPERATOR_PATH)
mv $(CHART_NODECONFIGURATOR_PATH)/values.yaml $(CHART_NODECONFIGURATOR_PATH)/values.yaml.bak
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/clustercontroller | $(HELMIFY) $(CHART_OPERATOR_PATH)
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/nodeconfigurator | $(HELMIFY) $(CHART_NODECONFIGURATOR_PATH)
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/soperatorchecks | $(HELMIFY) $(CHART_SOPERATORCHECKS_PATH)
mv $(CHART_OPERATOR_PATH)/values.yaml.bak $(CHART_OPERATOR_PATH)/values.yaml
mv $(CHART_NODECONFIGURATOR_PATH)/values.yaml.bak $(CHART_NODECONFIGURATOR_PATH)/values.yaml
# helmify regenerates the RBAC template, so re-add the enclosing if/end block that it drops
@$(SED_COMMAND) '1s|^|{{- if and .Values.rebooter.generateRBAC .Values.rebooter.enabled }}\n|' $(CHART_NODECONFIGURATOR_PATH)/templates/nodeconfigurator-rbac.yaml
@echo -e "\n{{- end }}" >> $(CHART_NODECONFIGURATOR_PATH)/templates/nodeconfigurator-rbac.yaml

.PHONY: get-version
get-version:
Expand Down Expand Up @@ -151,6 +162,11 @@ sync-version: yq ## Sync versions from file
@$(YQ) -i ".images.[0].newTag = \"$(OPERATOR_IMAGE_TAG)\"" "config/manager/kustomization.yaml"
@# endregion config/manager/kustomization.yaml

@echo 'Syncing config/soperatorchecks/kustomization.yaml'
@$(YQ) -i ".images.[0].newName = \"$(IMAGE_REPO)/soperatorchecks\"" "config/soperatorchecks/kustomization.yaml"
@$(YQ) -i ".images.[0].newTag = \"$(OPERATOR_IMAGE_TAG)\"" "config/soperatorchecks/kustomization.yaml"
@# endregion config/soperatorchecks/kustomization.yaml

@# region config/manager/manager.yaml
@echo 'Syncing config/manager/manager.yaml'
@$(SED_COMMAND) "s/image: controller:[^ ]*/image: controller:$(OPERATOR_IMAGE_TAG)/" config/manager/manager.yaml
Expand All @@ -162,10 +178,14 @@ sync-version: yq ## Sync versions from file
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_OPERATOR_CRDS_PATH)/Chart.yaml"
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_CLUSTER_PATH)/Chart.yaml"
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_STORAGE_PATH)/Chart.yaml"
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_SOPERATORCHECKS_PATH)/Chart.yaml"
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_NODECONFIGURATOR_PATH)/Chart.yaml"
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_OPERATOR_PATH)/Chart.yaml"
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_OPERATOR_CRDS_PATH)/Chart.yaml"
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_CLUSTER_PATH)/Chart.yaml"
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_STORAGE_PATH)/Chart.yaml"
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_SOPERATORCHECKS_PATH)/Chart.yaml"
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_NODECONFIGURATOR_PATH)/Chart.yaml"
@# endregion helm chart versions
#
@# region helm/slurm-cluster/values.yaml
Expand All @@ -181,6 +201,12 @@ sync-version: yq ## Sync versions from file
@$(YQ) -i ".images.exporter = \"$(IMAGE_REPO)/exporter:$(IMAGE_VERSION)\"" "helm/slurm-cluster/values.yaml"
@# endregion helm/slurm-cluster/values.yaml

@# region helm/nodeconfigurator/values.yaml
@echo 'Syncing helm/nodeconfigurator/values.yaml'
@$(YQ) -i ".rebooter.image.repository = \"$(IMAGE_REPO)/rebooter\"" "helm/nodeconfigurator/values.yaml"
@$(YQ) -i ".rebooter.image.tag = \"$(OPERATOR_IMAGE_TAG)\"" "helm/nodeconfigurator/values.yaml"
@# endregion helm/nodeconfigurator/values.yaml

@# region helm/slurm-cluster/templates/_registry_helpers.tpl
@echo "Syncing $(CHART_CLUSTER_PATH)/templates/_registry_helpers.tpl"
@echo '{{/* This file is generated by make sync-version. */}}' > $(CHART_CLUSTER_PATH)/templates/_registry_helpers.tpl
Expand Down Expand Up @@ -228,9 +254,7 @@ endif
ifndef DOCKERFILE
$(error DOCKERFILE is not set, docker image cannot be built)
endif
ifeq (${IMAGE_NAME},slurm-operator)
docker build $(DOCKER_BUILD_ARGS) --tag $(IMAGE_REPO)/${IMAGE_NAME}:${IMAGE_VERSION} --target ${IMAGE_NAME} ${DOCKER_IGNORE_CACHE} ${DOCKER_LOAD} ${DOCKER_BUILD_PLATFORM} -f ${DOCKERFILE} ${DOCKER_OUTPUT} .
else ifeq ($(IMAGE_NAME),rebooter)
ifeq ($(filter ${IMAGE_NAME},slurm-operator rebooter soperatorchecks),${IMAGE_NAME})
docker build $(DOCKER_BUILD_ARGS) --tag $(IMAGE_REPO)/${IMAGE_NAME}:${IMAGE_VERSION} --target ${IMAGE_NAME} ${DOCKER_IGNORE_CACHE} ${DOCKER_LOAD} ${DOCKER_BUILD_PLATFORM} -f ${DOCKERFILE} ${DOCKER_OUTPUT} .
else
cd images && docker build $(DOCKER_BUILD_ARGS) --tag $(IMAGE_REPO)/${IMAGE_NAME}:${IMAGE_VERSION} --target ${IMAGE_NAME} ${DOCKER_IGNORE_CACHE} ${DOCKER_LOAD} ${DOCKER_BUILD_PLATFORM} -f ${DOCKERFILE} ${DOCKER_OUTPUT} .
Expand Down
57 changes: 0 additions & 57 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,63 +724,6 @@ type SlurmNodeWorker struct {
//
// +kubebuilder:validation:Optional
PriorityClass string `json:"priorityClass,omitempty"`
// It's alpha feature and will be moved to separate CRD in the future
// Rebooter defines the configuration for the Slurm worker node rebooter
//
// +kubebuilder:validation:Optional
Rebooter Rebooter `json:"rebooter"`
}

// Rebooter defines the configuration for the Slurm worker node rebooter.
// Every field is marked optional; the defaults and allowed values stated below
// come from the kubebuilder markers attached to each field.
type Rebooter struct {
// enabled defines whether the rebooter is enabled. Defaults to false.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
Enabled bool `json:"enabled"`

// image defines the rebooter container image. No default is declared here.
//
// +kubebuilder:validation:Optional
Image string `json:"image"`

// imagePullPolicy defines the image pull policy.
// Must be one of Always, Never or IfNotPresent; defaults to IfNotPresent.
//
// +kubebuilder:validation:Enum=Always;Never;IfNotPresent
// +kubebuilder:validation:Optional
// +kubebuilder:default="IfNotPresent"
ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"`

// resources holds a [corev1.ResourceList] for the container.
// NOTE(review): the field type is a plain ResourceList, not
// [corev1.ResourceRequirements] — confirm whether consumers apply these
// values as requests, limits, or both.
//
// +kubebuilder:validation:Optional
Resources corev1.ResourceList `json:"resources,omitempty"`

// evictionMethod defines the method of eviction for the Slurm worker node.
// Only "evict" is accepted by the validation below, and it is also the default;
// "drain" is named as an option but is not supported yet.
//
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Enum="evict"
// +kubebuilder:default="evict"
EvictionMethod string `json:"evictionMethod,omitempty"`

// logLevel defines the log level for the rebooter.
// Must be one of debug, info, warn or error; defaults to info.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="info"
// +kubebuilder:validation:Enum="debug";"info";"warn";"error"
LogLevel string `json:"logLevel,omitempty"`

// namespace defines the namespace where the rebooter will be deployed.
// By default, the same namespace as the soperator.
//
// +kubebuilder:validation:Optional
Namespace string `json:"namespace,omitempty"`

// serviceAccountName defines the service account name for the rebooter.
//
// +kubebuilder:validation:Optional
ServiceAccountName string `json:"serviceAccountName,omitempty"`
}

// SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
Expand Down
23 changes: 0 additions & 23 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit ed7517d

Please sign in to comment.