Skip to content

Commit 905d3a1

Browse files
authored
Update deconfigure scripts (#311)
Add comments to the deconfigure scripts and, during configuration, make sure that the deconfigure scripts are installed on the root file system so that deconfiguration can be done whether the cluster still exists or not. Fix external login node configuration for RHEL 9. Resolves #300
1 parent b04d0d2 commit 905d3a1

File tree

12 files changed

+172
-160
lines changed

12 files changed

+172
-160
lines changed

.vscode/settings.json

+3-2
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@
33
"files.trimTrailingWhitespace": true,
44
"files.watcherExclude": {
55
"**/cdk.out": true
6-
}
7-
}
6+
},
7+
"makefile.configureOnOpen": false
8+
}

source/cdk/cdk_slurm_stack.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3173,7 +3173,7 @@ def create_parallel_cluster_config(self):
31733173
region = self.cluster_region
31743174
cluster_name = self.config['slurm']['ClusterName']
31753175
CfnOutput(self, "Command01_MountHeadNodeNfs",
3176-
value = f"head_ip=head_node.{self.config['slurm']['ClusterName']}.pcluster && sudo mkdir -p /opt/slurm/{cluster_name} && sudo mount $head_ip:/opt/slurm /opt/slurm/{cluster_name}"
3176+
value = f"head_ip=head_node.{self.config['slurm']['ClusterName']}.pcluster && sudo mkdir -p /opt/slurm/{cluster_name} && sudo mount $head_ip:/opt/slurm /opt/slurm/{cluster_name} && sudo systemctl daemon-reload"
31773177
)
31783178
CfnOutput(self, "Command02_CreateUsersGroupsJsonConfigure",
31793179
value = f"sudo /opt/slurm/{cluster_name}/config/bin/create_users_groups_json_configure.sh"
@@ -3182,10 +3182,10 @@ def create_parallel_cluster_config(self):
31823182
value = f"sudo /opt/slurm/{cluster_name}/config/bin/external_login_node_configure.sh"
31833183
)
31843184
CfnOutput(self, "command10_CreateUsersGroupsJsonDeconfigure",
3185-
value = f"sudo /opt/slurm/{cluster_name}/config/bin/create_users_groups_json_deconfigure.sh"
3185+
value = f"sudo /opt/aws-eda-slurm-cluster/{cluster_name}/bin/create_users_groups_json_deconfigure.sh"
31863186
)
31873187
CfnOutput(self, "command11_ExternalLoginNodeDeconfigure",
3188-
value = f"sudo /opt/slurm/{cluster_name}/config/bin/external_login_nodes_deconfigure.sh && sudo umount /opt/slurm/{cluster_name}"
3188+
value = f"sudo /opt/aws-eda-slurm-cluster/{cluster_name}/bin/external_login_node_deconfigure.sh"
31893189
)
31903190

31913191
def create_queue_config(self, queue_name, allocation_strategy, purchase_option):

source/resources/lambdas/DeconfigureExternalLoginNodes/DeconfigureExternalLoginNodes.py

+3-40
Original file line numberDiff line numberDiff line change
@@ -118,46 +118,9 @@ def lambda_handler(event, context):
118118
ssm_script = dedent(f"""
119119
set -ex
120120
121-
mount_dest=/opt/slurm/{cluster_name}
122-
123-
# Make sure that the cluster is still mounted and mount is accessible.
124-
# If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup.
125-
if mount | grep " $mount_dest "; then
126-
echo "$mount_dest is mounted."
127-
if ! timeout 1s ls $mount_dest; then
128-
echo "Mount point ($mount_dest) is hung. Source may have already been deleted."
129-
timeout 5s sudo umount -lf $mount_dest
130-
timeout 1s rm -rf $mount_dest
131-
fi
132-
fi
133-
134-
script="$mount_dest/config/bin/external_login_node_deconfigure.sh"
135-
if ! timeout 1s ls $script; then
136-
echo "$script doesn't exist"
137-
else
138-
sudo $script
139-
fi
140-
141-
# Do manual cleanup just in case something above failed.
142-
143-
sudo rm -f /etc/profile.d/slurm_{cluster_name}_modulefiles.sh
144-
145-
sudo grep -v ' $mount_dest ' /etc/fstab > /etc/fstab.new
146-
if diff -q /etc/fstab /etc/fstab.new; then
147-
sudo rm -f /etc/fstab.new
148-
else
149-
sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~')
150-
sudo mv -f /etc/fstab.new /etc/fstab
151-
fi
152-
153-
if timeout 1s mountpoint $mount_dest; then
154-
echo "$mount_dest is a mountpoint"
155-
sudo umount -lf $mount_dest
156-
fi
157-
158-
if timeout 1s ls $mount_dest; then
159-
sudo rmdir $mount_dest
160-
fi
121+
script="/opt/aws-eda-slurm-cluster/{cluster_name}/bin/external_login_node_deconfigure.sh"
122+
sudo $script
123+
161124
""")
162125

163126
response = ssm_client.send_command(

source/resources/lambdas/DeconfigureUsersGroupsJson/DeconfigureUsersGroupsJson.py

+2-41
Original file line numberDiff line numberDiff line change
@@ -98,47 +98,8 @@ def lambda_handler(event, context):
9898
ssm_client = boto3.client('ssm', region_name=cluster_region)
9999
commands = dedent(f"""set -ex
100100
101-
mount_dest=/opt/slurm/{cluster_name}
102-
103-
# Make sure that the cluster is still mounted and mount is accessible.
104-
# If the cluster has already been deleted then the mount will be hung and we have to do manual cleanup.
105-
# Another failure mechanism is if the cluster didn't deploy in which case the mount may not even exist.
106-
if mount | grep " $mount_dest "; then
107-
echo "$mount_dest is mounted."
108-
if ! timeout 1s ls $mount_dest; then
109-
echo "Mount point ($mount_dest) is hung. Source may have already been deleted."
110-
timeout 5s sudo umount -lf $mount_dest
111-
timeout 1s rm -rf $mount_dest
112-
fi
113-
fi
114-
115-
script="$mount_dest/config/bin/create_users_groups_json_deconfigure.sh"
116-
if ! timeout 1s ls $script; then
117-
echo "$script doesn't exist or isn't accessible."
118-
else
119-
sudo $script
120-
fi
121-
122-
# Do manual cleanup just in case something above failed.
123-
124-
sudo grep -v " $mount_dest " /etc/fstab > /etc/fstab.new
125-
if diff -q /etc/fstab /etc/fstab.new; then
126-
sudo rm -f /etc/fstab.new
127-
else
128-
sudo cp /etc/fstab /etc/fstab.$(date '+%Y-%m-%d@%H:%M:%S~')
129-
sudo mv -f /etc/fstab.new /etc/fstab
130-
fi
131-
132-
if timeout 1s mountpoint $mount_dest; then
133-
echo "$mount_dest is a mountpoint"
134-
sudo umount -lf $mount_dest
135-
fi
136-
137-
if timeout 1s ls $mount_dest; then
138-
sudo rmdir $mount_dest
139-
fi
140-
141-
true
101+
script="/opt/aws-eda-slurm-cluster/{cluster_name}/bin/create_users_groups_json_deconfigure.sh"
102+
sudo $script
142103
""")
143104
logger.info(f"Submitting SSM command")
144105
send_command_response = ssm_client.send_command(

source/resources/parallel-cluster/config/bin/create_users_groups_json_configure.sh

+26-4
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,40 @@
44

55
# This script creates the json file with user and group information.
66
# It also creates a crontab entry to update the json file every hour.
7+
#
8+
# The script and ansible playbooks needed to undo this will be installed at:
9+
#
10+
# /opt/aws-eda-slurm-cluster/{{ cluster_name }}
11+
#
12+
# To deconfigure the instance, run the following script:
13+
#
14+
# /opt/aws-eda-slurm-cluster/{{ cluster_name }}/create_users_groups_json_deconfigure.sh
715

816
full_script=$(realpath $0)
917
script_dir=$(dirname $full_script)
1018
base_script=$(basename $full_script)
1119

12-
date
13-
echo "Started create_users_groups_json_configure.sh: $full_script"
20+
echo "$(date): Started create_users_groups_json_configure.sh: $full_script"
1421

1522
config_dir={{ ExternalLoginNodeSlurmConfigDir }}
1623
config_bin_dir=$config_dir/bin
1724

25+
ErrorSnsTopicArn={{ ErrorSnsTopicArn }}
26+
27+
# Notify user of errors
28+
function on_exit {
29+
rc=$?
30+
set +e
31+
if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then
32+
message_file=$(mktemp)
33+
echo "See log files for more info:
34+
grep ${script_name} /var/log/messages | less" > $message_file
35+
aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file
36+
rm $message_file
37+
fi
38+
}
39+
trap on_exit EXIT
40+
1841
# Configure using ansible
1942
if ! yum list installed ansible &> /dev/null; then
2043
yum install -y ansible || amazon-linux-extras install -y ansible2
@@ -29,7 +52,6 @@ ansible-playbook $PLAYBOOKS_PATH/ParallelClusterCreateUsersGroupsJsonConfigure.y
2952
-e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml
3053
popd
3154

32-
date
33-
echo "Finished create_users_groups_json_configure.sh: $full_script"
55+
echo "$(date): Finished create_users_groups_json_configure.sh: $full_script"
3456

3557
exit 0

source/resources/parallel-cluster/config/bin/create_users_groups_json_deconfigure.sh

+20-19
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,45 @@
22
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
33
# SPDX-License-Identifier: MIT-0
44

5-
# This script creates the json file with user and group information.
6-
# It also creates a crontab entry to update the json file every hour.
5+
# This script deconfigures this instance from creating the json file with user and group information.
76

87
full_script=$(realpath $0)
98
script_dir=$(dirname $full_script)
109
base_script=$(basename $full_script)
10+
ANSIBLE_PATH=$(dirname $script_dir)/ansible
11+
PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks
1112

12-
date
13-
echo "Started create_users_groups_json_deconfigure.sh: $full_script"
13+
echo "$(date): Started create_users_groups_json_deconfigure.sh: $full_script"
1414

15-
config_dir={{ ExternalLoginNodeSlurmConfigDir }}
16-
config_bin_dir=$config_dir/bin
15+
ErrorSnsTopicArn={{ ErrorSnsTopicArn }}
1716

18-
temp_config_dir=/tmp/{{ClusterName}}_config
19-
temp_config_bin_dir=$temp_config_dir/bin
20-
if [[ $script_dir != $temp_config_bin_dir ]]; then
21-
rm -rf $temp_config_dir
22-
cp -r $config_dir $temp_config_dir
23-
exec $temp_config_dir/bin/$base_script
24-
fi
17+
# Notify user of errors
18+
function on_exit {
19+
rc=$?
20+
set +e
21+
if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then
22+
message_file=$(mktemp)
23+
echo "See log files for more info:
24+
grep ${script_name} /var/log/messages | less" > $message_file
25+
aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file
26+
rm $message_file
27+
fi
28+
}
29+
trap on_exit EXIT
2530

2631
# Install ansible
2732
if ! yum list installed ansible &> /dev/null; then
2833
yum install -y ansible || amazon-linux-extras install -y ansible2
2934
fi
3035

31-
ANSIBLE_PATH=$temp_config_dir/ansible
32-
PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks
33-
3436
pushd $PLAYBOOKS_PATH
3537
ansible-playbook $PLAYBOOKS_PATH/ParallelClusterCreateUsersGroupsJsonDeconfigure.yml \
3638
-i inventories/local.yml \
3739
-e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml
3840
popd
3941

40-
rm -rf $temp_config_dir
42+
rm -rf $(dirname $script_dir)
4143

42-
date
43-
echo "Finished create_users_groups_json_deconfigure.sh: $full_script"
44+
echo "$(date): Finished create_users_groups_json_deconfigure.sh: $full_script"
4445

4546
exit 0

source/resources/parallel-cluster/config/bin/external_login_node_configure.sh

+10
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
33
# SPDX-License-Identifier: MIT-0
44

5+
# This script configures an instance as an external login node for a ParallelCluster cluster.
6+
#
7+
# The script and ansible playbooks needed to undo this will be installed at:
8+
#
9+
# /opt/aws-eda-slurm-cluster/{{ cluster_name }}
10+
#
11+
# To deconfigure the instance as a login node run the following script:
12+
#
13+
# /opt/aws-eda-slurm-cluster/{{ cluster_name }}/external_login_node_deconfigure.sh
14+
515
full_script=$(realpath $0)
616
script_dir=$(dirname $full_script)
717
script_name=$(basename $full_script)

source/resources/parallel-cluster/config/bin/external_login_node_deconfigure.sh

+23-18
Original file line numberDiff line numberDiff line change
@@ -2,43 +2,48 @@
22
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
33
# SPDX-License-Identifier: MIT-0
44

5+
# This script deconfigures an instance that has been configured as a ParallelCluster Slurm login node.
6+
#
7+
# This script and its ansible playbook are copied to /opt/aws-eda-slurm-cluster/{{ cluster_name }} so
8+
# that they can be executed whether the cluster still exists or not.
9+
510
full_script=$(realpath $0)
611
script_dir=$(dirname $full_script)
712
base_script=$(basename $full_script)
13+
ANSIBLE_PATH=$(dirname $script_dir)/ansible
14+
PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks
815

9-
date
10-
echo "Started external_login_node_deconfigure.sh: $full_script"
16+
echo "$(date): Started $base_script: $full_script"
1117

1218
ErrorSnsTopicArn={{ ErrorSnsTopicArn }}
1319

14-
config_dir={{ ExternalLoginNodeSlurmConfigDir }}
15-
config_bin_dir=$config_dir/bin
16-
17-
temp_config_dir=/tmp/{{ClusterName}}_config
18-
temp_config_bin_dir=$temp_config_dir/bin
19-
if [[ $script_dir != $temp_config_bin_dir ]]; then
20-
rm -rf $temp_config_dir
21-
cp -r $config_dir $temp_config_dir
22-
exec $temp_config_dir/bin/$base_script
23-
fi
20+
# Notify user of errors
21+
function on_exit {
22+
rc=$?
23+
set +e
24+
if [[ $rc -ne 0 ]] && [[ ":$ErrorSnsTopicArn" != ":" ]]; then
25+
message_file=$(mktemp)
26+
echo "See log files for more info:
27+
grep ${script_name} /var/log/messages | less" > $message_file
28+
aws sns publish --topic-arn $ErrorSnsTopicArn --subject "${ClusterName} ${script_name} failed" --message file://$message_file
29+
rm $message_file
30+
fi
31+
}
32+
trap on_exit EXIT
2433

2534
# Install ansible
2635
if ! yum list installed ansible &> /dev/null; then
2736
yum install -y ansible || amazon-linux-extras install -y ansible2
2837
fi
2938

30-
ANSIBLE_PATH=$temp_config_dir/ansible
31-
PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks
32-
3339
pushd $PLAYBOOKS_PATH
3440
ansible-playbook $PLAYBOOKS_PATH/ParallelClusterExternalLoginNodeDeconfigure.yml \
3541
-i inventories/local.yml \
3642
-e @$ANSIBLE_PATH/ansible_external_login_node_vars.yml
3743
popd
3844

39-
rm -rf $temp_config_dir
45+
rm -rf $(dirname $script_dir)
4046

41-
date
42-
echo "Finished external_login_node_deconfigure.sh: $full_script"
47+
echo "$(date): Finished $base_script: $full_script"
4348

4449
exit 0

source/resources/playbooks/roles/ParallelClusterCreateUsersGroupsJsonConfigure/tasks/main.yml

+37
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,40 @@
3232
group: root
3333
mode: 0600
3434
force: yes
35+
36+
- name: Create /opt/aws-eda-slurm-cluster/{{ cluster_name }}
37+
file:
38+
path: /opt/aws-eda-slurm-cluster/{{ cluster_name }}
39+
owner: root
40+
group: root
41+
mode: 0700
42+
state: directory
43+
44+
- name: Create /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin
45+
file:
46+
path: /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin
47+
owner: root
48+
group: root
49+
mode: 0700
50+
state: directory
51+
52+
- name: Copy {{ slurm_config_dir }}/bin/create_users_groups_json_deconfigure.sh to /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin/
53+
copy:
54+
src: "{{ slurm_config_dir }}/bin/create_users_groups_json_deconfigure.sh"
55+
dest: /opt/aws-eda-slurm-cluster/{{ cluster_name }}/bin/create_users_groups_json_deconfigure.sh
56+
remote_src: true
57+
force: true # Has to be true or won't be copied when they are different.
58+
owner: root
59+
group: root
60+
mode: 0700
61+
62+
- name: Copy {{ slurm_config_dir }}/ansible/ to /opt/aws-eda-slurm-cluster/{{ cluster_name }}/ansible/
63+
copy:
64+
src: "{{ slurm_config_dir }}/ansible"
65+
dest: /opt/aws-eda-slurm-cluster/{{ cluster_name }}/
66+
remote_src: true
67+
force: true # Has to be true or won't be copied when they are different.
68+
owner: root
69+
group: root
70+
directory_mode: 0700
71+
mode: 0600

0 commit comments

Comments
 (0)