Skip to content

Define cluster images in a single variable #523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ on:
branches:
- main
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- 'ansible/roles/cuda/**'
- 'ansible/roles/lustre/**'
- '.github/workflows/extra.yml'
pull_request:
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- 'ansible/roles/cuda/**'
- 'ansible/roles/lustre/**'
Expand All @@ -30,7 +30,7 @@ jobs:
matrix: # build RL8, RL9
build:
- image_name: openhpc-extra-RL8
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
inventory_groups: doca,cuda,lustre
volume_size: 30 # needed for cuda
- image_name: openhpc-extra-RL9
Expand All @@ -51,7 +51,7 @@ jobs:
run: |
{
echo 'FAT_IMAGES<<EOF'
cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
cat environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
echo EOF
} >> "$GITHUB_ENV"

Expand Down Expand Up @@ -98,7 +98,7 @@ jobs:
PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image_names'][matrix.build.source_image_name_key]['default'] }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
-var "volume_size=${{ matrix.build.volume_size }}" \
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/s3-image-sync.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ on:
branches:
- main
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
env:
S3_BUCKET: openhpc-images-prerelease
IMAGE_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json

jobs:
s3_cleanup:
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,17 +84,17 @@ jobs:
with:
tofu_version: 1.6.2

- name: Initialise terraform
run: terraform init
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform
- name: Initialise OpenTofu
run: tofu init
working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu

- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
shell: bash

- name: Setup environment-specific inventory/terraform inputs
- name: Setup environment-specific inventory/tofu inputs
run: |
. venv/bin/activate
. environments/.stackhpc/activate
Expand All @@ -108,15 +108,15 @@ jobs:
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"

- name: Delete infrastructure if provisioning failed
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'

- name: Configure cluster
Expand Down Expand Up @@ -213,8 +213,8 @@ jobs:
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
cd $APPLIANCES_ENVIRONMENT_ROOT/tofu
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}

# - name: Delete images
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/trivyscan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'

jobs:
scan:
Expand All @@ -18,7 +18,7 @@ jobs:
matrix:
build: ["RL8", "RL9"]
env:
JSON_PATH: environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
JSON_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
OS_CLOUD: openstack
CI_CLOUD: ${{ vars.CI_CLOUD }}

Expand Down Expand Up @@ -72,7 +72,7 @@ jobs:
- name: Parse image name json
id: manifest
run: |
IMAGE_NAME=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.JSON_PATH }}")
IMAGE_NAME=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image_names[$version].default' "${{ env.JSON_PATH }}")
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"

- name: Download image
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,18 @@ Create an OpenTofu variables file to define the required infrastructure, e.g.:
login_nodes = {
login-0: "login_flavor_name"
}
cluster_image_id = "rocky_linux_9_image_uuid"
cluster_image_ids = {
default: "$ROCKYLINUX_9_IMAGE_UUID"
}
compute = {
general = {
nodes: ["compute-0", "compute-1"]
flavor: "compute_flavor_name"
}
}

where `$ROCKYLINUX_9_IMAGE_UUID` should be replaced with the ID of the appropriate image (see above).

Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables and descriptions see `environments/$ENV/tofu/terraform.tfvars`.

To deploy this infrastructure, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/passwords/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
- name: Template k3s token to terraform
template:
src: k3s-token.auto.tfvars.json.j2
dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/terraform/k3s-token.auto.tfvars.json"
dest: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/tofu/k3s-token.auto.tfvars.json"
delegate_to: localhost
run_once: true
22 changes: 12 additions & 10 deletions docs/production.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,28 +41,30 @@ and referenced from the `site` and `production` environments, e.g.:
- OpenTofu configurations should be defined in the `site` environment and used
as a module from the other environments. This can be done with the
cookie-cutter generated configurations:
- Delete the *contents* of the cookie-cutter generated `terraform/` directories
- Delete the *contents* of the cookie-cutter generated `tofu/` directories
from the `production` and `staging` environments.
- Create a `main.tf` in those directories which uses `site/terraform/` as a
- Create a `main.tf` in those directories which uses `site/tofu/` as a
[module](https://opentofu.org/docs/language/modules/), e.g. :

```
...
module "cluster" {
source = "../../site/terraform/"
source = "../../site/tofu/"

cluster_name = "foo"
...
}
```

Note that:
- Environment-specific variables (`cluster_name`) should be hardcoded
into the module block.
- Environment-independent variables (e.g. maybe `cluster_net` if the
same is used for staging and production) should be set as *defaults*
in `environments/site/terraform/variables.tf`, and then don't need to
be passed in to the module.
Note that:
- Environment-specific variables (e.g. `cluster_name`) should be hardcoded
into the module block.
- Environment-independent variables should be set as *defaults*
in `environments/site/tofu/variables.tf`, and then don't need to
be passed in to the module. Examples include `cluster_net` (assuming
staging/production use the same network) and `cluster_image_ids` (because
staging should test the image(s) which will subsequently be deployed
to production after testing on a branch).

- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates
a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`.
Expand Down

This file was deleted.

10 changes: 10 additions & 0 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"cluster_image_names": {
"RL8": {
"default": "openhpc-RL8-250107-1534-b03caaf3"
},
"RL9": {
"default": "openhpc-RL9-250107-1535-b03caaf3"
}
}
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# This terraform configuration uses the "skeleton" terraform, so that is checked by CI.

terraform {
Expand Down Expand Up @@ -25,9 +26,9 @@ variable "os_version" {
default = "RL9"
}

variable "cluster_image" {
description = "single image for all cluster nodes, keyed by os_version - a convenience for CI"
type = map(string)
variable "cluster_image_names" {
description = "image *names* keyed by os_version, for CI"
type = map(map(string))
}

variable "cluster_net" {}
Expand Down Expand Up @@ -58,20 +59,22 @@ variable "k3s_token" {
type = string
}

data "openstack_images_image_v2" "cluster" {
name = var.cluster_image[var.os_version]
data "openstack_images_image_v2" "cluster_images" {
for_each = var.cluster_image_names[var.os_version]

name = each.value
most_recent = true
}

module "cluster" {
source = "../../skeleton/{{cookiecutter.environment}}/terraform/"
source = "../../skeleton/{{cookiecutter.environment}}/tofu/"

cluster_name = var.cluster_name
cluster_net = var.cluster_net
cluster_subnet = var.cluster_subnet
vnic_type = var.vnic_type
key_pair = "slurm-app-ci"
cluster_image_id = data.openstack_images_image_v2.cluster.id
cluster_image_ids = {for key, img in data.openstack_images_image_v2.cluster_images: key => img.id}
control_node_flavor = var.control_node_flavor
k3s_token = var.k3s_token

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ module "compute" {
cluster_subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id

flavor = each.value.flavor
image_id = lookup(each.value, "image_id", var.cluster_image_id)
image_id = lookup(var.cluster_image_ids, each.key, var.cluster_image_ids["default"])
vnic_type = lookup(each.value, "vnic_type", var.vnic_type)
vnic_profile = lookup(each.value, "vnic_profile", var.vnic_profile)
key_pair = var.key_pair
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ resource "openstack_compute_instance_v2" "control" {
for_each = toset(["control"])

name = "${var.cluster_name}-${each.key}"
image_id = var.cluster_image_id
image_id = var.cluster_image_ids["default"]
flavor_name = var.control_node_flavor
key_pair = var.key_pair

# root device:
block_device {
uuid = var.cluster_image_id
uuid = var.cluster_image_ids["default"]
source_type = "image"
destination_type = var.volume_backed_instances ? "volume" : "local"
volume_size = var.volume_backed_instances ? var.root_volume_size : null
Expand Down Expand Up @@ -102,14 +102,14 @@ resource "openstack_compute_instance_v2" "login" {
for_each = var.login_nodes

name = "${var.cluster_name}-${each.key}"
image_id = var.cluster_image_id
image_id = var.cluster_image_ids["default"]
flavor_name = each.value
key_pair = var.key_pair

dynamic "block_device" {
for_each = var.volume_backed_instances ? [1]: []
content {
uuid = var.cluster_image_id
uuid = var.cluster_image_ids["default"]
source_type = "image"
destination_type = "volume"
volume_size = var.root_volume_size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,13 @@ variable "login_nodes" {
description = "Mapping defining login nodes: key -> (str) nodename suffix, value -> (str) flavor name"
}

variable "cluster_image_id" {
type = string
description = "ID of default image for the cluster"
variable "cluster_image_ids" {
type = map(string)
description = <<-EOF
Mapping of UUIDs defining images for the cluster. Valid keys are:
- "default": required, defines default image for cluster
- any key from "compute" variable, to define different images for compute node groups
EOF
}

variable "compute" {
Expand All @@ -49,7 +53,6 @@ variable "compute" {
nodes: List of node names
flavor: String flavor name
Optional:
image_id: Overrides variable cluster_image_id
vnic_type: Overrides variable vnic_type
vnic_profile: Overrides variable vnic_profile
EOF
Expand Down
Loading