Skip to content

Commit edf576d

Browse files
authored
[Develop] Introduce Global Cleanup IAM Role for ParallelCluster Build-Image (#6912)
* feat: Global Cleanup IAM Role for ParallelCluster Build-Image * Introduce ensure_cleanup_role() with bootstrapped tagging and idempotent creation / update logic. * 4-step safe update sequence documented; the bootstrapped tag is now set or bumped only after the inline policy succeeds. * Lambda VPCAccess managed policy is attached only when LambdaFunctionsVpcConfig exists in the config * Modify image_operations_controller to invoke ensure_cleanup_role when Build/Iam/CleanupLambdaRole is not provided and to fail fast on permission errors. * Refactor imagebuilder_stack to remove all per-stack cleanup-role logic and wire Lambda to the global role by default. * Update constants (role prefix / expected revision tag key & value). * IamClient – add create_role, attach_role_policy, put_role_policy, tag_role
1 parent bd4c015 commit edf576d

File tree

13 files changed

+523
-1347
lines changed

13 files changed

+523
-1347
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ CHANGELOG
1010
- Support DCV on Amazon Linux 2023.
1111
- Upgrade Python runtime used by Lambda functions to python3.12 (from python3.9).
1212
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
13+
- The build-image command now deploys a global role that is used to automatically delete the build-image stack after images either succeed or fail the build.
14+
The role is meant to exist even after the stack has been deleted. This is to prevent build-image stack deletion failures, reported in https://github.com/aws/aws-parallelcluster/issues/5914
1315
- Add the configuration parameter `HeadNode/SharedStorageEfsSettings/Encrypted` to enable encryption on the EFS file system used for the head node internal shared storage.
1416

1517
**BUG FIXES**

cli/src/pcluster/api/controllers/image_operations_controller.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import logging
1111
import os as os_lib
1212

13+
import yaml
14+
1315
from pcluster.api.controllers.common import (
1416
assert_supported_operation,
1517
configure_aws_region,
@@ -52,14 +54,15 @@
5254
from pcluster.aws.common import AWSClientError
5355
from pcluster.aws.ec2 import Ec2Client
5456
from pcluster.constants import SUPPORTED_ARCHITECTURES, SUPPORTED_OSES, Operation
57+
from pcluster.imagebuilder_utils import ensure_default_build_image_stack_cleanup_role
5558
from pcluster.models.imagebuilder import (
5659
BadRequestImageBuilderActionError,
5760
ConfigValidationError,
5861
ImageBuilder,
5962
NonExistingImageError,
6063
)
6164
from pcluster.models.imagebuilder_resources import ImageBuilderStack, NonExistingStackError
62-
from pcluster.utils import get_installed_version, to_utc_datetime
65+
from pcluster.utils import get_installed_version, get_partition, to_utc_datetime
6366
from pcluster.validators.common import FailureLevel
6467

6568
LOGGER = logging.getLogger(__name__)
@@ -105,6 +108,29 @@ def build_image(
105108
validation_failure_level = validation_failure_level or ValidationLevel.ERROR
106109
dryrun = dryrun or False
107110

111+
raw_cfg_str = build_image_request_content["imageConfiguration"]
112+
cfg_dict = yaml.safe_load(raw_cfg_str) or {}
113+
# If CleanupLambdaRole exists in the config, skip ensure_default_build_image_stack_cleanup_role
114+
has_custom_cleanup_role = cfg_dict.get("Build", {}).get("Iam", {}).get("CleanupLambdaRole")
115+
116+
if not has_custom_cleanup_role:
117+
try:
118+
# If LambdaFunctionsVpcConfig exists in the config, attach the AWS-managed LambdaVPCAccess policy
119+
has_lambda_functions_vpc_config = cfg_dict.get("DeploymentSettings", {}).get("LambdaFunctionsVpcConfig")
120+
account_id = AWSApi.instance().sts.get_account_id()
121+
ensure_default_build_image_stack_cleanup_role(
122+
account_id, get_partition(), attach_vpc_access_policy=bool(has_lambda_functions_vpc_config)
123+
)
124+
except AWSClientError as e:
125+
if e.error_code in ("AccessDenied", "AccessDeniedException", "UnauthorizedOperation"):
126+
raise BadRequestException(
127+
"Current principal lacks permissions to create or update the ParallelCluster build-image "
128+
"cleanup IAM role. "
129+
"Either pass `Build/Iam/CleanupLambdaRole` or grant the missing permissions to continue. "
130+
"For detailed instructions, please refer to our public documentation."
131+
)
132+
raise
133+
108134
build_image_request_content = BuildImageRequestContent.from_dict(build_image_request_content)
109135

110136
try:

cli/src/pcluster/aws/iam.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,23 @@ def get_role(self, role_name):
3232
def get_instance_profile(self, instance_profile_name):
3333
"""Get instance profile information."""
3434
return self._client.get_instance_profile(InstanceProfileName=instance_profile_name)
35+
36+
@AWSExceptionHandler.handle_client_exception
def create_role(self, **kwargs):
    """
    Create an IAM role.

    Every keyword argument is forwarded unchanged to the underlying client's create_role call,
    and that call's response is returned as is.
    """
    iam_client = self._client
    return iam_client.create_role(**kwargs)
40+
41+
@AWSExceptionHandler.handle_client_exception
def attach_role_policy(self, role_name, policy_arn):
    """
    Attach a managed policy to the given role.

    :param role_name: name of the role, sent as RoleName
    :param policy_arn: ARN of the managed policy, sent as PolicyArn
    :return: the response of the underlying client's attach_role_policy call
    """
    request = {"RoleName": role_name, "PolicyArn": policy_arn}
    return self._client.attach_role_policy(**request)
45+
46+
@AWSExceptionHandler.handle_client_exception
def put_role_policy(self, role_name, policy_name, policy_document):
    """
    Create or replace the specified inline policy on a role.

    :param role_name: name of the role, sent as RoleName
    :param policy_name: name of the inline policy, sent as PolicyName
    :param policy_document: policy document, sent as PolicyDocument
    :return: the response of the underlying client's put_role_policy call
    """
    request = {
        "RoleName": role_name,
        "PolicyName": policy_name,
        "PolicyDocument": policy_document,
    }
    return self._client.put_role_policy(**request)
50+
51+
@AWSExceptionHandler.handle_client_exception
def tag_role(self, role_name, tags):
    """
    Add or overwrite one or more tags for the specified role.

    :param role_name: name of the role, sent as RoleName
    :param tags: tags to apply, sent as Tags
    :return: the response of the underlying client's tag_role call
    """
    request = {"RoleName": role_name, "Tags": tags}
    return self._client.tag_role(**request)

cli/src/pcluster/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,3 +335,8 @@ class Operation(Enum):
335335
PCLUSTER_BUCKET_PROTECTED_FOLDER = "parallelcluster"
336336
PCLUSTER_BUCKET_PROTECTED_PREFIX = f"{PCLUSTER_BUCKET_PROTECTED_FOLDER}/"
337337
PCLUSTER_BUCKET_REQUIRED_BOOTSTRAP_FEATURES = ["basic", "export-logs"]
338+
339+
# Name prefix of the build-image cleanup role
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_PREFIX = "PClusterBuildImageCleanupRole"
# Key of the tag recording that the cleanup role has been bootstrapped
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY = "parallelcluster:build-image-cleanup-role-bootstrapped"
# Expected revision of the cleanup role (increment when policy widens)
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_REVISION = 1

cli/src/pcluster/imagebuilder_utils.py

Lines changed: 181 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,21 @@
88
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
99
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1010
# limitations under the License.
11+
import json
12+
import logging
1113
import os
1214

1315
import yaml
1416

1517
from pcluster.aws.aws_api import AWSApi
16-
from pcluster.utils import get_url_scheme, yaml_load
18+
from pcluster.aws.common import AWSClientError
19+
from pcluster.constants import (
20+
IAM_ROLE_PATH,
21+
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY,
22+
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_PREFIX,
23+
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_REVISION,
24+
)
25+
from pcluster.utils import generate_string_hash, get_url_scheme, yaml_load
1726

1827
ROOT_VOLUME_TYPE = "gp3"
1928
PCLUSTER_RESERVED_VOLUME_SIZE = 37
@@ -65,3 +74,174 @@ def _generate_action(action_name, commands):
6574
"""Generate action in imagebuilder components."""
6675
action = {"name": action_name, "action": "ExecuteBash", "inputs": {"commands": [commands]}}
6776
return action
77+
78+
79+
def get_cleanup_role_name(account_id: str) -> str:
    """
    Build the name of the build-image cleanup role for the given account.

    The name is made of the role prefix, a hash of the account id and a "v<revision>" suffix,
    joined with dashes.
    """
    account_hash = generate_string_hash(account_id)
    return "{}-{}-v{}".format(
        PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_PREFIX,
        account_hash,
        PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_REVISION,
    )
85+
86+
87+
def _expected_inline_policy(account_id: str, partition: str):
88+
"""Return the inline policy document (JSON-serialised string)."""
89+
return json.dumps(
90+
{
91+
"Version": "2012-10-17",
92+
"Statement": [
93+
{
94+
"Action": ["iam:DetachRolePolicy", "iam:DeleteRole", "iam:DeleteRolePolicy"],
95+
"Resource": f"arn:{partition}:iam::{account_id}:role/parallelcluster/*",
96+
"Effect": "Allow",
97+
},
98+
{
99+
"Action": ["iam:DeleteInstanceProfile", "iam:RemoveRoleFromInstanceProfile"],
100+
"Resource": f"arn:{partition}:iam::{account_id}:instance-profile/parallelcluster/*",
101+
"Effect": "Allow",
102+
},
103+
{
104+
"Action": "imagebuilder:DeleteInfrastructureConfiguration",
105+
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:infrastructure-configuration/"
106+
f"parallelclusterimage-*",
107+
"Effect": "Allow",
108+
},
109+
{
110+
"Action": ["imagebuilder:DeleteComponent"],
111+
"Resource": [f"arn:{partition}:imagebuilder:*:{account_id}:component/parallelclusterimage-*/*"],
112+
"Effect": "Allow",
113+
},
114+
{
115+
"Action": "imagebuilder:DeleteImageRecipe",
116+
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:image-recipe/parallelclusterimage-*/*",
117+
"Effect": "Allow",
118+
},
119+
{
120+
"Action": "imagebuilder:DeleteDistributionConfiguration",
121+
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:distribution-configuration/"
122+
f"parallelclusterimage-*",
123+
"Effect": "Allow",
124+
},
125+
{
126+
"Action": ["imagebuilder:DeleteImage", "imagebuilder:GetImage", "imagebuilder:CancelImageCreation"],
127+
"Resource": f"arn:{partition}:imagebuilder:*:{account_id}:image/parallelclusterimage-*/*",
128+
"Effect": "Allow",
129+
},
130+
{
131+
"Action": "cloudformation:DeleteStack",
132+
"Resource": f"arn:{partition}:cloudformation:*:{account_id}:stack/*/*",
133+
"Condition": {
134+
"ForAnyValue:StringLike": {"cloudformation:ResourceTag/parallelcluster:image_id": "*"}
135+
},
136+
"Effect": "Allow",
137+
},
138+
# The below two permissions are required for the DeleteStackFunction Lambda to tag the
139+
# created AMI with 'parallelcluster:build_status' and 'parallelcluster:parent_image' tags
140+
{"Action": "ec2:CreateTags", "Resource": f"arn:{partition}:ec2:*::image/*", "Effect": "Allow"},
141+
{"Action": "tag:TagResources", "Resource": "*", "Effect": "Allow"},
142+
{
143+
"Action": ["lambda:DeleteFunction", "lambda:RemovePermission"],
144+
"Resource": f"arn:{partition}:lambda:*:{account_id}:function:ParallelClusterImage-*",
145+
"Effect": "Allow",
146+
},
147+
{
148+
"Action": "logs:DeleteLogGroup",
149+
"Resource": f"arn:{partition}:logs:*:{account_id}:log-group:/aws/lambda/ParallelClusterImage-*:*",
150+
"Effect": "Allow",
151+
},
152+
{
153+
"Action": [
154+
"SNS:GetTopicAttributes",
155+
"SNS:DeleteTopic",
156+
"SNS:GetSubscriptionAttributes",
157+
"SNS:Unsubscribe",
158+
],
159+
"Resource": f"arn:{partition}:sns:*:{account_id}:ParallelClusterImage-*",
160+
"Effect": "Allow",
161+
},
162+
],
163+
}
164+
)
165+
166+
167+
def _cleanup_role_trust_policy(account_id: str, partition: str) -> dict:
    """Return the trust policy letting ParallelClusterImage-* Lambda functions assume the cleanup role."""
    return {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Service": "lambda.amazonaws.com"},
                "Action": "sts:AssumeRole",
                "Condition": {
                    "ArnLike": {
                        "aws:SourceArn": f"arn:{partition}:lambda:*:{account_id}:function:ParallelClusterImage-*"
                    }
                },
            }
        ],
    }


def ensure_default_build_image_stack_cleanup_role(
    account_id: str, partition: str = "aws", attach_vpc_access_policy: bool = False
) -> str:
    """
    Ensure the global (account-wide) cleanup role exists and is bootstrapped, then return its ARN.

    The function follows a safe order:
    1. If the role does not exist, create it without the bootstrapped tag. If another invocation created
       it in the meantime (EntityAlreadyExists), carry on as if it existed without the bootstrapped tag.
    2. If requested, attach the AWS-managed AWSLambdaVPCAccessExecutionRole policy. This is done on every
       call, even when the role is already bootstrapped.
    3. Attach the AWS-managed AWSLambdaBasicExecutionRole policy.
    4. Update/write the inline policy (least-privilege cleanup policy).
    5. Only after the inline policy succeeds, set the bootstrapped tag.

    Steps 3 to 5 are skipped when the role already carries the bootstrapped tag with value "true".
    This way, if step 2, 3 or 4 fails (e.g., lack of iam:PutRolePolicy permission),
    future invocations will keep retrying.

    :param account_id: AWS account id, used for the role name, the role ARN and the policies
    :param partition: AWS partition used to build ARNs
    :param attach_vpc_access_policy: whether to attach AWSLambdaVPCAccessExecutionRole to the role
    :return: ARN of the cleanup role
    :raises AWSClientError: for any IAM failure other than NoSuchEntity on lookup or
        EntityAlreadyExists on creation
    """
    # Use a module-named logger: the root-level logging.info() implicitly calls basicConfig()
    # when the root logger has no handlers, silently changing the application's logging setup.
    logger = logging.getLogger(__name__)

    iam = AWSApi.instance().iam
    role_name = get_cleanup_role_name(account_id)
    role_arn = f"arn:{partition}:iam::{account_id}:role{IAM_ROLE_PATH}{role_name}"

    # Check whether the role already exists
    try:
        resp = iam.get_role(role_name=role_name)
    except AWSClientError as e:
        if e.error_code != "NoSuchEntity":
            raise
        logger.info("Creating default build-image stack cleanup role %s because it does not exists.", role_name)
        try:
            iam.create_role(
                RoleName=role_name,
                Path=IAM_ROLE_PATH,
                AssumeRolePolicyDocument=json.dumps(_cleanup_role_trust_policy(account_id, partition)),
                Description="AWS ParallelCluster build-image cleanup Lambda execution role. Please do not delete it.",
            )
        except AWSClientError as create_error:
            # A concurrent invocation may have created the role between get_role and create_role.
            # Every following step is a create-or-overwrite call, so it is safe to carry on.
            if create_error.error_code != "EntityAlreadyExists":
                raise
            logger.info("Default build-image stack cleanup role %s has been created concurrently.", role_name)
        already_bootstrapped = False
    else:
        tags = {t["Key"]: t["Value"] for t in resp["Role"].get("Tags", [])}
        already_bootstrapped = tags.get(PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY, "").lower() == "true"

    # Attach AWSLambdaVPCAccessExecutionRole
    if attach_vpc_access_policy:
        iam.attach_role_policy(
            role_name,
            f"arn:{partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole",
        )

    if already_bootstrapped:
        return role_arn

    # Attach AWSLambdaBasicExecutionRole
    cleanup_role_basic_managed_policy = f"arn:{partition}:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
    iam.attach_role_policy(role_name, cleanup_role_basic_managed_policy)

    # Put inline policy
    iam.put_role_policy(
        role_name=role_name,
        policy_name="ParallelClusterCleanupInline",
        policy_document=_expected_inline_policy(account_id, partition),
    )

    # Set bootstrapped tag after policy write succeeds
    iam.tag_role(
        role_name=role_name,
        tags=[{"Key": PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY, "Value": "true"}],
    )
    return role_arn

0 commit comments

Comments
 (0)