# helm/slurm-cluster/values.yaml

# Name of the cluster
clusterName: "slurm1"
# Extra annotations for the cluster
annotations: {}
# Whether to add the default AppArmor profile to the cluster
useDefaultAppArmorProfile: false
# Maintenance mode of the cluster. Supported values:
#   none                              - no maintenance is performed; the cluster operates normally
#   downscale                         - scales all components down to 0
#   downscaleAndDeletePopulateJail    - scales all components down to 0 and deletes the populateJail Kubernetes Job
#   downscaleAndOverwritePopulateJail - scales all components down to 0 and overwrites populateJail (same as overwrite=true)
#   skipPopulateJail                  - skips the execution of the populateJail job during maintenance
maintenance: "none"
# Slurm cluster type. Currently either "gpu" or "cpu"
clusterType: "gpu"
# partitionConfiguration defines the partition configuration of Slurm worker nodes
# https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
partitionConfiguration:
  # Either "default" or "custom"
  configType: "default"
  # Configuration as a list of strings, each starting with PartitionName.
  # Example for the custom configType (commented out):
  #   rawConfig:
  #     - PartitionName=low_priority Nodes=worker-[0-15] Default=YES MaxTime=INFINITE State=UP PriorityTier=1
  #     - PartitionName=high_priority Nodes=worker-[10-20] Default=NO MaxTime=INFINITE State=UP PriorityTier=2
  rawConfig: []
# K8s node filters used in Slurm node specifications.
# Each filter defines which K8s nodes pods may be scheduled to and is
# referenced from other sections by its name via `k8sNodeFilterName`.
k8sNodeFilters:
  # Filter that additionally tolerates the `nvidia.com/gpu` NoSchedule taint
  - name: "gpu"
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: "nebius.com/node-group-id"
                  operator: "In"
                  values:
                    # NOTE(review): looks like a placeholder for a real node group ID - confirm
                    - "node-group-id-here"
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
  # Filter without any tolerations
  - name: "no-gpu"
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: "nebius.com/node-group-id"
                  operator: "In"
                  values:
                    # NOTE(review): looks like a placeholder for a real node group ID - confirm
                    - "node-group-id-here"
# Sources for the volumes used in Slurm node specifications.
# Entries are referenced from other sections by name via `volumeSourceName`.
volumeSources:
  - name: "controller-spool"
    persistentVolumeClaim:
      claimName: "controller-spool-pvc"
      readOnly: false
  - name: "jail"
    persistentVolumeClaim:
      claimName: "jail-pvc"
      readOnly: false
# Further example entries (commented out; remove the leading `#` to enable):
#  - name: "jail-snapshot"
#    persistentVolumeClaim:
#      claimName: "jail-snapshot-pvc"
#      readOnly: true
#  - name: "mlperf-sd"
#    persistentVolumeClaim:
#      claimName: "jail-submount-mlperf-sd-pvc"
#      readOnly: false

# Secret references needed for Slurm cluster operation
secrets: {}
  # Secret reference required for the login sshd. If the secret name is empty, the operator generates its own secret with keys
  # sshdKeysName: ""
# Job performing the initial population of the jail file system
populateJail:
  imagePullPolicy: "IfNotPresent"
  appArmorProfile: "unconfined"
  # Name of the k8s node filter
  k8sNodeFilterName: "gpu"
  # Configuration of the volume containing the custom initial jail content (the default content is used if not set).
  # Example (commented out):
  #   jailSnapshotVolume:
  #     volumeSourceName: "jail-snapshot"
  jailSnapshotVolume: null
  # NOTE(review): presumably controls whether existing jail content is overwritten - confirm
  overwrite: false
# NCCL settings
ncclSettings:
  # Type of the NCCL GPU topology. One of: auto, custom
  topologyType: "auto"
  # NCCL GPU topology data
  topologyData: ""
# Periodic checks, e.g. GPU health
periodicChecks:
  # NCCL test benchmark
  ncclBenchmark:
    # Whether to enable the benchmark
    enabled: true
    # CronJob schedule. By default, runs every 3 hours
    schedule: "0 */3 * * *"
    # CronJob timeout in seconds. By default, 1800 seconds (30 minutes)
    activeDeadlineSeconds: 1800
    # Number of successfully finished jobs to retain
    successfulJobsHistoryLimit: 3
    # Number of failed finished jobs to retain
    failedJobsHistoryLimit: 24
    # NCCL test settings
    ncclArguments:
      # Minimum memory size to start NCCL with
      minBytes: "512Mb"
      # Maximum memory size to finish NCCL with
      maxBytes: "8Gb"
      # Multiplication factor between two sequential memory sizes
      stepFactor: "2"
      # NCCL timeout in its special format. By default, 20 minutes
      timeout: "20:00"
      # Threshold for the benchmark result that must be guaranteed. The CronJob fails if the result is less than the threshold
      thresholdMoreThan: "0"
      # Controls the use of the NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for the test
      useInfiniband: true
    # Actions performed on benchmark failure
    failureActions:
      # Whether to drain the Slurm node in case of benchmark failure
      setSlurmNodeDrainState: true
    # Name of the k8s node filter
    k8sNodeFilterName: "no-gpu"
    imagePullPolicy: "IfNotPresent"
    appArmorProfile: "unconfined"
# Example slurmConfig keys (commented out):
slurmConfig: {}
  # defMemPerNode: 1228800
  # defCpuPerGPU: 16
  # completeWait: 5
  # debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs"
  # epilog: "/path/to/epilog.sh"
  # prolog: "/path/to/prolog.sh"
  # taskPluginParam: "Verbose"
  # maxJobCount: 10000
  # minJobAge: 86400
slurmNodes:
  # Accounting node settings
  accounting:
    enabled: false
    # Name of the k8s node filter
    k8sNodeFilterName: "no-gpu"
    # Example slurmConfig keys (commented out):
    slurmConfig: {}
      # accountingStorageTRES: "gres/gpu,license/iop1"
      # accountingStoreFlags: job_comment,job_env,job_extra,job_script,no_stdio
      # acctGatherInterconnectType: "acct_gather_interconnect/ofed"
      # acctGatherFilesystemType: "acct_gather_filesystem/lustre"
      # jobAcctGatherType: "jobacct_gather/cgroup"
      # jobAcctGatherFrequency: 30
      # priorityWeightAge: 1
      # priorityWeightFairshare: 1
      # priorityWeightQOS: 1
      # priorityWeightTRES: ""
    # Example slurmdbdConfig keys (commented out):
    slurmdbdConfig: {}
      # archiveEvents: "yes"
      # archiveJobs: "yes"
      # archiveSteps: "yes"
      # archiveSuspend: "yes"
      # archiveResv: "yes"
      # archiveUsage: "yes"
      # archiveTXN: "yes"
      # debugLevel: "info"
      # tcpTimeout: 120
      # purgeEventAfter: "1month"
      # purgeJobAfter: "1month"
      # purgeStepAfter: "1month"
      # purgeSuspendAfter: "12month"
      # purgeResvAfter: "1month"
      # purgeUsageAfter: "1month"
      # debugFlags: "DB_ARCHIVE"
    slurmdbd:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      port: 6819
      resources:
        cpu: "1000m"
        memory: "3Gi"
        ephemeralStorage: "10Gi"
    munge:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "1000m"
        memory: "1Gi"
        ephemeralStorage: "5Gi"
    externalDB:
      enabled: false
      # Example connection keys (commented out):
      # host: ""
      # port: 3306
      # user: ""
      # passwordSecretKeyRef:
      #   name: ""
      #   key: ""
    mariadbOperator:
      enabled: false
      protectedSecret: false
      resources:
        cpu: "1000m"
        memory: "1Gi"
        ephemeralStorage: "5Gi"
      replicas: 1
      # Example replication keys (commented out):
      replication: {}
        # enabled: false
        # primary: {}
        # probesEnabled: false
        # replica: {}
        # syncBinlog: false
      # Example storage keys (commented out):
      storage: {}
        # ephemeral: false
        # resizeInUseVolumes: false
        # size: {}
        # storageClassName: ""
        # volumeClaimTemplate: ""
        # waitForVolumeResize: false
      # Example podSecurityContext keys (commented out):
      podSecurityContext: {}
        # fsGroup: 2000
        # runAsUser: 1000
        # runAsGroup: 3000
        # runAsNonRoot: true
        # supplementalGroups: [1000]
      # Example securityContext keys (commented out):
      securityContext: {}
        # runAsUser: 1000
        # runAsGroup: 3000
        # runAsNonRoot: true
        # capabilities:
        #   add: ["NET_ADMIN", "SYS_TIME"]
      metrics:
        enabled: false
        # Example metrics keys (commented out):
        # username: ""
        # serviceMonitor: {}
        # jobLabel: ""
        # interval: ""
        # scrapeTimeout: ""
        # exporter: {}
        # NOTE(review): name/key nested here by analogy with externalDB.passwordSecretKeyRef - confirm
        # passwordSecretKeyRef:
        #   name: ""
        #   key: ""
      # Further keys (commented out).
      # NOTE(review): `securityContext` and `resources` below repeat keys that are already
      # set above in mariadbOperator, and `resources` is listed twice; uncommenting them
      # as-is would create duplicate keys - confirm the intended nesting.
      # affinity: {}
      # args: []
      # env: []
      # envFrom: []
      # image: ""
      # imagePullPolicy: ""
      # imagePullSecrets: []
      # initContainers: []
      # livenessProbe: {}
      # nodeSelector: {}
      # podMetadata: {}
      # securityContext: {}
      # serviceAccountName: ""
      # sidecarContainers: []
      # tolerations: []
      # resources: {}
      # volumeMounts: []
      # volumes: []
      # resources: {}
      #   limits:
      #     cpu: "100m"
      #     memory: "128Mi"
      #   requests:
      #     cpu: "50m"
      #     memory: "64Mi"
  # Controller node settings
  controller:
    size: 2
    # Name of the k8s node filter
    k8sNodeFilterName: "no-gpu"
    slurmctld:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      port: 6817
      resources:
        cpu: "1000m"
        memory: "3Gi"
        ephemeralStorage: "20Gi"
    munge:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "1000m"
        memory: "1Gi"
        ephemeralStorage: "5Gi"
    # Each volumeSourceName matches the name of an entry in volumeSources
    volumes:
      spool:
        volumeSourceName: "controller-spool"
      jail:
        volumeSourceName: "jail"
  # Worker node settings
  worker:
    size: 2
    # Name of the k8s node filter
    k8sNodeFilterName: "gpu"
    cgroupVersion: "v2"
    enableGDRCopy: false
    priorityClass: ""
    slurmNodeExtra: ""
    supervisordConfigMapRefName: ""
    sshdConfigMapRefName: ""
    slurmd:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      port: 6818
      resources:
        cpu: "156000m"
        memory: "1220Gi"
        ephemeralStorage: "55Gi"
        gpu: 8
      securityLimitsConfig: ""
    munge:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "2000m"
        memory: "4Gi"
        ephemeralStorage: "5Gi"
    volumes:
      # Spool volume described by a volume claim template spec
      spool:
        volumeClaimTemplateSpec:
          storageClassName: "nebius-network-ssd"
          accessModes:
            - "ReadWriteOnce"
          resources:
            requests:
              storage: "128Gi"
      # volumeSourceName matches the name of an entry in volumeSources
      jail:
        volumeSourceName: "jail"
      # Example (commented out):
      #   jailSubMounts:
      #     - name: "mlcommons-sd-bench-data"
      #       mountPath: "/mlperf-sd"
      #       subPath: ""
      #       readOnly: false
      #       volumeSourceName: "mlperf-sd"
      jailSubMounts: []
  # Login node settings
  login:
    size: 2
    # Name of the k8s node filter
    k8sNodeFilterName: "no-gpu"
    sshd:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      port: 22
      resources:
        cpu: "3000m"
        memory: "9Gi"
        ephemeralStorage: "30Gi"
    # Authorized keys required for SSH connection to Slurm login nodes
    # NOTE(review): the entries below look like placeholders (username1/username2) - confirm they are replaced per deployment
    sshRootPublicKeys:
      - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKzxkjzPQ4EyZSjan4MLGFSA18idpZicoKW7Hfff username1"
      - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICL8scMKnwu+Y9S6XDACacZ54+qu+YRo2y4Ieddd username2"
    # Either `LoadBalancer` or `NodePort`
    sshdServiceType: "LoadBalancer"
    # May be needed when sshdServiceType == `LoadBalancer`
    # Example: sshdServiceLoadBalancerIP: "192.168.0.1"
    sshdServiceLoadBalancerIP: ""
    # Required when sshdServiceType == `NodePort`
    sshdServiceNodePort: 30022
    # Annotations to be configured on the login service
    sshdServiceAnnotations: {}
    munge:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "500m"
        memory: "500Mi"
        ephemeralStorage: "5Gi"
    volumes:
      # volumeSourceName matches the name of an entry in volumeSources
      jail:
        volumeSourceName: "jail"
      # Example (commented out):
      #   jailSubMounts:
      #     - name: "mlcommons-sd-bench-data"
      #       mountPath: "/mlperf-sd"
      #       volumeSourceName: "mlperf-sd"
      jailSubMounts: []
  # Exporter settings
  exporter:
    enabled: true
    size: 1
    # Name of the k8s node filter
    k8sNodeFilterName: "no-gpu"
    exporter:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "250m"
        memory: "256Mi"
        ephemeralStorage: "500Mi"
    munge:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "1000m"
        memory: "1Gi"
        ephemeralStorage: "5Gi"
    volumes:
      # volumeSourceName matches the name of an entry in volumeSources
      jail:
        volumeSourceName: "jail"
  # REST settings (disabled here)
  rest:
    enabled: false
    size: 1
    # Name of the k8s node filter
    k8sNodeFilterName: "no-gpu"
    rest:
      imagePullPolicy: "IfNotPresent"
      appArmorProfile: "unconfined"
      resources:
        cpu: "1000m"
        memory: "1Gi"
        ephemeralStorage: "500Mi"
# Example telemetry keys (commented out):
telemetry: {}
  # jobsTelemetry:
  #   otelCollectorHttpHost: vmsingle-slurm.monitoring-system.svc.cluster.local
  #   otelCollectorPath: /opentelemetry/api/v1/push
  #   otelCollectorPort: 8429
  #   sendJobsEvents: true
  #   sendOtelMetrics: true
  # openTelemetryCollector:
  #   enabled: true
  #   replicasOtelCollector: 1
  #   otelCollectorPort: 8429

# Image references, one per component
images:
  slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.18.3-jammy-slurm24.05.5"
  slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.18.3-jammy-slurm24.05.5"
  slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.18.3-jammy-slurm24.05.5"
  sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.18.3-jammy-slurm24.05.5"
  munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.18.3-jammy-slurm24.05.5"
  populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.18.3-jammy-slurm24.05.5"
  ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.18.3-jammy-slurm24.05.5"
  slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.18.3-jammy-slurm24.05.5"
  exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.18.3-jammy-slurm24.05.5"
  mariaDB: "docker-registry1.mariadb.com/library/mariadb:11.4.3"
  # NOTE(review): pinned to 1.17.0 while the other soperator images above use 1.18.3 - confirm this is intentional
  rebooter: "cr.eu-north1.nebius.cloud/soperator/rebooter:1.17.0"