From 23d55c794f26538cc42a7f02719344ba698ec7f8 Mon Sep 17 00:00:00 2001
From: Steve Brasier <steveb@stackhpc.com>
Date: Thu, 17 Oct 2024 13:48:32 +0000
Subject: [PATCH 01/11] add sequence docs

---
 docs/sequence.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 docs/sequence.md

diff --git a/docs/sequence.md b/docs/sequence.md
new file mode 100644
index 000000000..7c8b2ddb6
--- /dev/null
+++ b/docs/sequence.md
@@ -0,0 +1,54 @@
+# Slurm Appliance Sequences
+
+
+
+## Image build
+
+This sequence applies to both "fatimage" builds (usually only done in StackHPC CI) and "extra" builds. The differences are:
+- Which image the build VM uses, i.e. the starting image: A genericcloud image for fatimage builds or a fatimages build for an extra build.
+- Which inventory groups the build VM is added to.
+
+Note that ansible-init does not run during an image build. It is disabled via a metadata flag.
+
+```mermaid
+sequenceDiagram
+    participant ansible as Ansible Deploy Host
+    participant cloud as Cloud
+    note over ansible: $ packer build ...
+    ansible->>cloud: Create VM
+    create participant packer as Build VM
+    cloud->>packer: Create VM
+    note over packer: Boot
+    ansible->>packer: Wait for ssh connection
+    ansible->>packer: Run ansible/fatimage.yml playbook
+    ansible->>packer: Shutdown
+    ansible->>cloud: Create image from Build VM root disk
+    destroy packer
+    note over cloud: Image openhpc-... created
+
+```
+
+## Cluster Creation
+
+```mermaid
+sequenceDiagram
+    participant ansible as Ansible Deploy Host
+    participant cloud as Cloud
+    note over ansible: $ ansible-playbook ansible/adhoc/generate-passwords.yml
+    ansible->>ansible: Template secrets to inventory group_vars
+    note over ansible: $ tofu apply ....
+    ansible->>cloud: Create infra
+    create participant nodes as Cluster Instances
+    cloud->>nodes: Create instances
+    note over nodes: Boot
+    rect rgb(204, 232, 252)
+    note over nodes: ansible-init
+    nodes->>cloud: Query metadata
+    cloud->>nodes: Metadata sent
+    nodes->>nodes: Start k3s
+    end
+    note over ansible: $ ansible-playbook ansible/site.yml
+    ansible->>nodes: Wait for ansible-init completion
+    ansible->>nodes: Ansible tasks
+    note over nodes: All services running
+

From 01dcdbaa29602f9f78958c5b0444d53c17ceb57b Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Wed, 30 Oct 2024 13:28:08 +0000
Subject: [PATCH 02/11] Update docs/sequence.md

Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
---
 docs/sequence.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/sequence.md b/docs/sequence.md
index 7c8b2ddb6..2e9d76a4b 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -19,6 +19,9 @@ sequenceDiagram
     create participant packer as Build VM
     cloud->>packer: Create VM
     note over packer: Boot
+    packer->>cloud: Query metadata
+    cloud->>packer: Metadata sent
+    packer->>packer: Skip ansible-init
     ansible->>packer: Wait for ssh connection
     ansible->>packer: Run ansible/fatimage.yml playbook
     ansible->>packer: Shutdown

From 5e6f477a72c024a049e6e01c32b0dcb1954adbd4 Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Wed, 30 Oct 2024 13:28:16 +0000
Subject: [PATCH 03/11] Update docs/sequence.md

Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
---
 docs/sequence.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index 2e9d76a4b..634ad1285 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -38,7 +38,7 @@ sequenceDiagram
     participant ansible as Ansible Deploy Host
     participant cloud as Cloud
     note over ansible: $ ansible-playbook ansible/adhoc/generate-passwords.yml
-    ansible->>ansible: Template secrets to inventory group_vars
+    ansible->>ansible: Template secrets to inventory group_vars and tofu metadata
     note over ansible: $ tofu apply ....
     ansible->>cloud: Create infra
     create participant nodes as Cluster Instances

From b46e9f0762f59f3a4f8c0f1a0af244f2919b7cee Mon Sep 17 00:00:00 2001
From: Steve Brasier <33413598+sjpb@users.noreply.github.com>
Date: Wed, 30 Oct 2024 13:28:23 +0000
Subject: [PATCH 04/11] Update docs/sequence.md

Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
---
 docs/sequence.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index 634ad1285..abf00b07e 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -48,7 +48,7 @@ sequenceDiagram
     note over nodes: ansible-init
     nodes->>cloud: Query metadata
     cloud->>nodes: Metadata sent
-    nodes->>nodes: Start k3s
+    nodes->>nodes: Start k3s and connect to its peers
     end
     note over ansible: $ ansible-playbook ansible/site.yml
     ansible->>nodes: Wait for ansible-init completion

From 4a664a40a0cc3198ecc17dba06b7a6d3ff48b78f Mon Sep 17 00:00:00 2001
From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com>
Date: Thu, 9 Jan 2025 15:37:05 +0000
Subject: [PATCH 05/11] Added release train to sequence diagram

---
 docs/sequence.md | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index abf00b07e..a4e66186f 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -14,6 +14,14 @@ Note that ansible-init does not run during an image build. It is disabled via a
 sequenceDiagram
     participant ansible as Ansible Deploy Host
     participant cloud as Cloud
+    ansible->>cloud: Create VM
+    create participant pulp as Local Pulp Server
+    cloud->>pulp: Create VM
+    ansible->>pulp: Run ansible/adhoc/deploy-pulp.yml
+    note over pulp: Pulp server installed & configured
+    ansible->>pulp: Run ansible/adhoc/sync-pulp.yml # needs to point at ark too somehow
+    participant ark as Ark
+    ark-->>pulp: Sync repos
     note over ansible: $ packer build ...
     ansible->>cloud: Create VM
     create participant packer as Build VM
@@ -23,8 +31,14 @@ sequenceDiagram
     cloud->>packer: Metadata sent
     packer->>packer: Skip ansible-init
     ansible->>packer: Wait for ssh connection
-    ansible->>packer: Run ansible/fatimage.yml playbook
+    rect rgb(204, 232, 252)
+    note right of ansible: fatimage.yml
+    ansible->>packer: Overwrite repo files with Pulp repos and update
+    packer->>pulp: dnf update
+    pulp-->>packer: Package updates
+    ansible->>packer: Perform installation tasks
     ansible->>packer: Shutdown
+    end
     ansible->>cloud: Create image from Build VM root disk
     destroy packer
     note over cloud: Image openhpc-... created

From c8002b9c024f9c3300970df97889187ba791d626 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Fri, 14 Feb 2025 15:59:44 +0000
Subject: [PATCH 06/11] Add sequence for slurm controlled rebuild

---
 docs/sequence.md | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/docs/sequence.md b/docs/sequence.md
index a4e66186f..042178f48 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -69,3 +69,45 @@ sequenceDiagram
     ansible->>nodes: Ansible tasks
     note over nodes: All services running
 
+```
+
+## Slurm Controlled Rebuild WIP
+
+This sequence applies to active clusters, after running ansible/site.yml for the first time, to reimage the cluster while main.tf has set:
+- `ignore_image_changes: true`
+- `compute_init_enable: ['compute',..]`
+
+```mermaid
+sequenceDiagram
+    participant ansible as Ansible Deploy Host
+    participant cloud as Cloud
+    participant nodes as Cluster Instances
+    note over ansible: Update cluster_image.auto.tfvars.json
+    note over ansible: $ tofu apply ....
+    ansible->>ansible: target_image templated to hostvars
+    ansible->>cloud: Update state.tf with new image
+    cloud->>nodes: Reimage login and control nodes
+    note over ansible: $ ansible-playbook ansible/site.yml
+    ansible->>nodes: Hostvars templated to NFS exports directory
+    ansible->>nodes: Ansible tasks
+    note over nodes: $ srun --reboot ...
+    rect rgb(204, 232, 252)
+    note over nodes: RebootProgram
+    nodes->>cloud: Query and compare instance image
+    cloud->>nodes: Reimage if target =/= current
+    rect rgb(252, 200, 100)
+    note over nodes: compute-init
+    nodes->>nodes: Retrieve hostvars from nfs mount
+    note over nodes: Compute nodes rejoin cluster
+
+    end
+    nodes->>nodes: srun task completes
+    end
+
+
+
+
+
+
+
+

From 100fdc823e8b4000cdec4596f2876bfa7f1e867c Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Fri, 14 Feb 2025 16:48:37 +0000
Subject: [PATCH 07/11] Update sequence.md

---
 docs/sequence.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index 042178f48..7b38c3923 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -71,7 +71,7 @@ sequenceDiagram
 
 ```
 
-## Slurm Controlled Rebuild WIP
+## Slurm Controlled Rebuild
 
 This sequence applies to active clusters, after running ansible/site.yml for the first time, to reimage the cluster while main.tf has set:
 - `ignore_image_changes: true`

From 83948b15b9498010a92cc7daeda0812b31fc7cde Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Mon, 17 Feb 2025 16:27:05 +0000
Subject: [PATCH 08/11] Update sequence.md

---
 docs/sequence.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index 7b38c3923..efdb3bcd8 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -99,15 +99,6 @@ sequenceDiagram
     note over nodes: compute-init
     nodes->>nodes: Retrieve hostvars from nfs mount
     note over nodes: Compute nodes rejoin cluster
-
     end
     nodes->>nodes: srun task completes
     end
-
-
-
-
-
-
-
-

From ee201d4305bcbf63e7d85f03cb33fb933c668337 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Mon, 17 Feb 2025 16:33:26 +0000
Subject: [PATCH 09/11] Update sequence.md

---
 docs/sequence.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index efdb3bcd8..778029ff2 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -73,9 +73,11 @@ sequenceDiagram
 
 ## Slurm Controlled Rebuild
 
-This sequence applies to active clusters, after running ansible/site.yml for the first time, to reimage the cluster while main.tf has set:
-- `ignore_image_changes: true`
-- `compute_init_enable: ['compute',..]`
+This sequence applies to active clusters, after running ansible/site.yml for the first time. Slurm controlled rebuild requires:
+- `ignore_image_changes: true` in `main.tf`
+- `compute_init_enable: ['compute',..]` in `main.tf`
+- `rebuild` group is populated with `control` in the inventory
+
 
 ```mermaid
 sequenceDiagram

From 58d79a7d32c1b49f41df531c62e042aad564b4b5 Mon Sep 17 00:00:00 2001
From: Steve Brasier <steveb@stackhpc.com>
Date: Fri, 21 Mar 2025 15:10:14 +0000
Subject: [PATCH 10/11] make Pulp generic for build

---
 docs/sequence.md | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index 778029ff2..a9527f617 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -4,27 +4,29 @@
 
 ## Image build
 
-This sequence applies to both "fatimage" builds (usually only done in StackHPC CI) and "extra" builds. The differences are:
-- Which image the build VM uses, i.e. the starting image: A genericcloud image for fatimage builds or a fatimages build for an extra build.
-- Which inventory groups the build VM is added to.
+This sequence applies to both:
+- "fatimage" builds, starting from GenericCloud images and using
+  `control`, `login` and `compute` inventory groups to install all packages, e.g. StackHPC
+  CI builds
+- "extra" builds, starting from StackHPC images and using selected inventory
+  groups to add specific features for a site-specific image.
 
-Note that ansible-init does not run during an image build. It is disabled via a metadata flag.
+Note that a generic Pulp server is shown in the diagram below. This may be
+StackHPC's Ark server or a local Pulp mirroring Ark. It is assumed a local Pulp
+has already had the relevant snapshots synced from Ark (although it is possible
+to trigger this during an image build).
+
+Note that ansible-init does not run during an image build. It is disabled via
+a metadata flag.
 
 ```mermaid
 sequenceDiagram
     participant ansible as Ansible Deploy Host
     participant cloud as Cloud
-    ansible->>cloud: Create VM
-    create participant pulp as Local Pulp Server
-    cloud->>pulp: Create VM
-    ansible->>pulp: Run ansible/adhoc/deploy-pulp.yml
-    note over pulp: Pulp server installed & configured
-    ansible->>pulp: Run ansible/adhoc/sync-pulp.yml # needs to point at ark too somehow
-    participant ark as Ark
-    ark-->>pulp: Sync repos
     note over ansible: $ packer build ...
     ansible->>cloud: Create VM
     create participant packer as Build VM
+    participant pulp as Pulp
     cloud->>packer: Create VM
     note over packer: Boot
     packer->>cloud: Query metadata
@@ -42,7 +44,6 @@ sequenceDiagram
     ansible->>cloud: Create image from Build VM root disk
     destroy packer
     note over cloud: Image openhpc-... created
-
 ```
 
 ## Cluster Creation

From 435d0848b5b9a77c9e0709d8ab457b7b651ae413 Mon Sep 17 00:00:00 2001
From: Steve Brasier <steveb@stackhpc.com>
Date: Fri, 21 Mar 2025 15:59:23 +0000
Subject: [PATCH 11/11] update for pulp, k3s, add boxes for clarity

---
 docs/sequence.md | 66 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/docs/sequence.md b/docs/sequence.md
index a9527f617..8723674e9 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -29,9 +29,12 @@ sequenceDiagram
     participant pulp as Pulp
     cloud->>packer: Create VM
     note over packer: Boot
+    rect rgb(204, 232, 252)
+    note right of packer: ansible-init
     packer->>cloud: Query metadata
     cloud->>packer: Metadata sent
     packer->>packer: Skip ansible-init
+    end
     ansible->>packer: Wait for ssh connection
     rect rgb(204, 232, 252)
     note right of ansible: fatimage.yml
@@ -43,65 +46,88 @@ sequenceDiagram
     end
     ansible->>cloud: Create image from Build VM root disk
     destroy packer
-    note over cloud: Image openhpc-... created
+    note over cloud: Image created
 ```
 
 ## Cluster Creation
 
+In the diagram below it is assumed that no additional packages are installed
+beyond what is present in the image, i.e. Ark/local Pulp access is not required.
+
 ```mermaid
 sequenceDiagram
     participant ansible as Ansible Deploy Host
     participant cloud as Cloud
+    rect rgb(204, 232, 252)
     note over ansible: $ ansible-playbook ansible/adhoc/generate-passwords.yml
-    ansible->>ansible: Template secrets to inventory group_vars and tofu metadata
-    note over ansible: $ tofu apply ....
+    ansible->>ansible: Template secrets to inventory group_vars
+    end
+    rect rgb(204, 232, 252)
+    note over ansible: $ tofu apply ...
     ansible->>cloud: Create infra
     create participant nodes as Cluster Instances
     cloud->>nodes: Create instances
+    end
     note over nodes: Boot
     rect rgb(204, 232, 252)
-    note over nodes: ansible-init
+    note right of nodes: ansible-init
     nodes->>cloud: Query metadata
     cloud->>nodes: Metadata sent
-    nodes->>nodes: Start k3s and connect to its peers
     end
+    rect rgb(204, 232, 252)
     note over ansible: $ ansible-playbook ansible/site.yml
     ansible->>nodes: Wait for ansible-init completion
     ansible->>nodes: Ansible tasks
     note over nodes: All services running
-
+    end
 ```
 
 ## Slurm Controlled Rebuild
 
-This sequence applies to active clusters, after running ansible/site.yml for the first time. Slurm controlled rebuild requires:
-- `ignore_image_changes: true` in `main.tf`
-- `compute_init_enable: ['compute',..]` in `main.tf`
-- `rebuild` group is populated with `control` in the inventory
+This sequence applies to active clusters, after running the `site.yml` playbook
+for the first time. Slurm controlled rebuild requires that:
+- Compute groups in the OpenTofu `compute` variable have:
+    - `ignore_image_changes: true`
+    - `compute_init_enable: ['compute', ... ]`
+- The Ansible `rebuild` inventory group contains the `control` group.
 
+TODO: should also document how compute-init does NOT run if the `site.yml`
+playbook has not been run.
 
 ```mermaid
 sequenceDiagram
     participant ansible as Ansible Deploy Host
     participant cloud as Cloud
     participant nodes as Cluster Instances
-    note over ansible: Update cluster_image.auto.tfvars.json
+    note over ansible: Update OpenTofu cluster_image variable [1]
+    rect rgb(204, 232, 252)
     note over ansible: $ tofu apply ....
-    ansible->>ansible: target_image templated to hostvars
-    ansible->>cloud: Update state.tf with new image
+    ansible<<->>cloud: Check login/compute current vs desired images
     cloud->>nodes: Reimage login and control nodes
+    ansible->>ansible: Update inventory/hosts.yml for<br>compute node image_id
+    end
+    rect rgb(204, 232, 252)
     note over ansible: $ ansible-playbook ansible/site.yml
-    ansible->>nodes: Hostvars templated to NFS exports directory
+    ansible->>nodes: Hostvars templated to nfs share
     ansible->>nodes: Ansible tasks
+    note over nodes: All services running
+    end
     note over nodes: $ srun --reboot ...
-    rect rgb(204, 232, 252)
-    note over nodes: RebootProgram
-    nodes->>cloud: Query and compare instance image
-    cloud->>nodes: Reimage if target =/= current
+    rect rgb(204, 232, 252)
+    note over nodes: RebootProgram [2]
+    nodes->>cloud: Compare current instance image to target from hostvars
+    cloud->>nodes: Reimage if target != current
     rect rgb(252, 200, 100)
-    note over nodes: compute-init
+    note over nodes: compute-init [3]
     nodes->>nodes: Retrieve hostvars from nfs mount
+    nodes->>nodes: Run ansible tasks
     note over nodes: Compute nodes rejoin cluster
     end
-    nodes->>nodes: srun task completes
     end
+    nodes->>nodes: srun task completes
+```
+Notes:
+1. And/or login/compute group overrides
+2. Running on control node
+3. On hosts targeted by job
+