Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated stackhpc lab for v1.157 upgrades #5

Draft
wants to merge 10 commits into
base: vtest-v1.157-upgrade
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions environments/lab/builder.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# compute node build, currently non-cuda
# run using:

# cd packer/
# PACKER_LOG=1 /usr/bin/packer build \
# -only=openstack.openhpc \
# -on-error=ask \
#    -var-file=../environments/lab/builder.pkrvars.hcl \
# openstack.pkr.hcl

flavor = "en1.medium"
networks = ["a28bd8de-2729-434b-a009-732143d17ce5"]
source_image_name = "Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2"
#ssh_username = rocky
ssh_private_key_file = "/home/rocky/.ssh/vsdeployer"
ssh_keypair_name = "vsdeployer"
volume_type = "unencrypted"

inventory_groups = "compute,control,login,update"
21 changes: 0 additions & 21 deletions environments/lab/hooks/build.yml

This file was deleted.

4 changes: 4 additions & 0 deletions environments/lab/hooks/post-bootstrap.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- name: Import parent hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
import_playbook: "{{ appliances_environment_root }}/../nrel/hooks/post-bootstrap.yml"
9 changes: 6 additions & 3 deletions environments/lab/hooks/post.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
- name: Check for ansible slurm user
# rc 0, empty stdout when not found
shell:
cmd: "sacctmgr show user --parsable2 --noheader {{ ansible_user }}"
cmd: "{{ openhpc_bin_dir }}/sacctmgr show user --parsable2 --noheader {{ ansible_user }}"
register: _sacctmgr_ansible_user
changed_when: false

- name: Allow ansible user to run jobs
shell:
cmd: sacctmgr -i create user Name={{ ansible_user }} Cluster={{ openhpc_cluster_name }} Account=root DefaultAccount=root
cmd: "{{ openhpc_bin_dir }}/sacctmgr -i create user Name={{ ansible_user }} Cluster={{ openhpc_cluster_name }} Account=root DefaultAccount=root"
when: "_sacctmgr_ansible_user.stdout == ''"


- name: Import parent hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
import_playbook: "{{ appliances_environment_root }}/../nrel/hooks/post.yml"
63 changes: 27 additions & 36 deletions environments/lab/hooks/pre.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,33 @@
path: /var/scratch
state: directory

- name: Build custom Slurm
hosts: localhost
become: no
gather_facts: no
tags: slurm
tasks:
- include_tasks: build.yml

- name: Copy custom Slurm to storage
- name: Get custom slurm
hosts: control
become: yes
gather_facts: no
tags: slurm
become: true
tasks:
- name: Ensure shared slurm directory exists
file:
state: directory
path: "{{ slurm_build_dir }}" # NB this will be exported by nfs filesystems.yml
owner: root
group: root
mode: u=rwX,go=rX

- name: Copy custom slurm
copy:
src: "{{ item.src }}"
dest: "{{ item.dest }}"
owner: root
group: root
mode: u=rwx,go=rx
loop:
- src: "{{ slurm_local_build_dir }}/sbin/"
dest: "{{ openhpc_sbin_dir }}"
- src: "{{ slurm_local_build_dir }}/lib/"
dest: "{{ openhpc_lib_dir }}"
- src: "{{ slurm_local_build_dir }}/bin/"
dest: "{{ openhpc_bin_dir }}"
vars:
slurm_local_build_dir: "{{ appliances_environment_root }}/slurmbuild/{{ slurm_build_version }}"
- name: Ensure temp directory
file:
path: /tmp/slurmdir
state: directory

- name: Download custom slurm
ansible.builtin.get_url:
url: http://tool.net/nrel-slurm.tgz
dest: /tmp/slurmdir/slurm.tgz

- name: Ensure exports directory
file:
path: /exports
state: directory

- name: Unarchive slurm
ansible.builtin.unarchive:
src: /tmp/slurmdir/slurm.tgz
dest: /exports/
remote_src: yes

- name: Import parent hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
import_playbook: "{{ appliances_environment_root }}/../nrel/hooks/pre.yml"

23 changes: 23 additions & 0 deletions environments/lab/inventory/group_vars/all/cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# `cuda_distro`: Optional. Default `rhel8`.
# `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
# `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
# `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
# `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

# cuda_distro: rhel9
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
cuda_driver_stream: default
cuda_package_version: 'latest'
cuda_packages:
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
- nvidia-gds
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples"
cuda_samples_programs:
- deviceQuery
- bandwidthTest
# cuda_devices: # discovered from deviceQuery run
cuda_persistenced_state: started
21 changes: 21 additions & 0 deletions environments/lab/inventory/group_vars/all/lex.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# sync_slurm_allocations vars
sync_slurm_allocations_clustername: "lab"
slurm_sync_allocations_log_file: "sync_slurm_accounting_lab.log"


#slurm_sync_allocations_log_path: "/var/log"
# slurm_sync_allocations_vault_file: "vault"
# slurm_sync_allocations_vault_secret_file: "sync_slurm_allocations_secret.yml"

#slurm_sync_allocations_log_path: "/var/log"
#slurm_sync_allocations_slurm_path: "/usr/sbin"

#slurm_sync_allocations_install_path: "/nopt/{{sync_slurm_allocations_clustername}}/lex/slurm_sync_allocations"

# sync_slurm_allocations vars
slurm_lex_accounting_dev: False
#slurm_lex_accounting_clustername: "{{ vermilion_cluster_name_prefix }}"

#sync_slurm_allocations_clustername: "{{ vermilion_cluster_name_prefix }}"
#slurm_sync_allocations_log_file: "sync_slurm_accounting_{{ vermilion_cluster_name_prefix }}.log"
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
---

nfs_server: "{{ (hostvars[groups['control'] | first].server_networks)['nrel-lab-storage'][0] }}"
nfs_server: "{{ (hostvars[groups['control'] | first].server_networks)['storage'][0] }}"
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
# Don't mount share on server where it is exported from...
# Could do something like nfs_clients: '"nfs_servers" not in group_names' instead.
# See also constructed inventory: https://docs.ansible.com/ansible/devel/collections/ansible/builtin/constructed_inventory.html
clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}"
clients: "{{ inventory_hostname in groups['cluster'] }}"
nfs_configurations:
- comment: NFS share /home from slurm controller
nfs_export: "/home"
nfs_export: "/exports/home"
nfs_client_mnt_point: "/home"
- comment: NFS share /nopt from slurm controller
nfs_export: /nopt
nfs_export: /exports/nopt
nfs_client_mnt_point: "/nopt"
- comment: NFS share /projects from slurm controller
nfs_export: /project
nfs_export: /exports/project
nfs_client_mnt_point: /project
- comment: NFS share /scratch from slurm controller
nfs_export: /scratch
nfs_export: /exports/scratch
nfs_client_mnt_point: /scratch
2 changes: 1 addition & 1 deletion environments/lab/inventory/group_vars/all/openhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# openhpc_packages_extra: "{{ openhpc_packages_extra_nrel | reject('match', '.*-impi-') }}" # TODO: FIXME

# define paths to slurm on nodes:
slurm_build_version: '23.11.0' # quote to avoid ansible autoconversion weirdness
slurm_build_version: '23.11.1' # quote to avoid ansible autoconversion weirdness
slurm_build_path: /nopt/vtest/slurm
slurm_build_dir: "{{ slurm_build_path }}/{{ slurm_build_version }}"

Expand Down
2 changes: 2 additions & 0 deletions environments/lab/inventory/group_vars/all/overrides.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
vermilion_cluster_name: lab
vermilion_cluster_name_prefix: lab
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ prometheus_scrape_configs_default:
- cpu
- meminfo
- bonding
- infiniband
- cpufreq
scrape_interval: 30s
scrape_timeout: 20s
Expand Down
1 change: 1 addition & 0 deletions environments/lab/inventory/group_vars/all/pulp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
appliances_pulp_url: http://10.0.0.124:8080
2 changes: 0 additions & 2 deletions environments/lab/inventory/group_vars/hpctests/overrides.yml

This file was deleted.

1 change: 0 additions & 1 deletion environments/lab/inventory/group_vars/podman/overrides.yml

This file was deleted.

66 changes: 0 additions & 66 deletions environments/lab/slurmbuild/Dockerfile

This file was deleted.

11 changes: 0 additions & 11 deletions environments/lab/slurmbuild/README.md

This file was deleted.

3 changes: 3 additions & 0 deletions environments/lab/terraform/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cloud-config.tftpl
vtest_plan
.terraform.tfstate.lock.info
6 changes: 6 additions & 0 deletions environments/lab/terraform/compute_names.auto.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
compute_names = {

sm-0001: "stackhpc"
sm-0002: "stackhpc"

}
12 changes: 9 additions & 3 deletions environments/lab/terraform/inventory.tpl
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
[all:vars]
ansible_user=rocky
openhpc_cluster_name=${cluster_slurm_name}
ansible_ssh_common_args='-o ProxyCommand="ssh rocky@${proxy_fip} -W %h:%p"'
openhpc_cluster_name=${cluster_name}
#ansible_ssh_common_args='-o ProxyCommand="ssh rocky@${proxy_fip} -W %h:%p"'
ansible_python_interpreter=/usr/bin/python3

[control]
${control.name} ansible_host=${[for n in control.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in control.network: net.name => [ net.fixed_ip_v4 ] })}'

[control:vars]
appliances_state_dir=/var/lib/state

[admin]
${cluster_name}-vtadmin

[login]
%{ for login in logins ~}
${login.name} ansible_host=${[for n in login.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in login.network: net.name => [ net.fixed_ip_v4 ] })}'
Expand All @@ -19,7 +23,9 @@ ${login.name} ansible_host=${[for n in login.network: n.fixed_ip_v4 if n.access_
${compute.name} ansible_host=${[for n in compute.network: n.fixed_ip_v4 if n.access_network][0]} server_networks='${jsonencode({for net in compute.network: net.name => [ net.fixed_ip_v4 ] })}'
%{ endfor ~}

## Define groups for slurm parititions:
## Define groups for slurm partitions:
#################################################################
# small nodes

[${cluster_slurm_name}_sm]
${cluster_name}-sm-[0001:0002]
Loading