From 02928b3b7c9531e0468c822d02e003c8d8762965 Mon Sep 17 00:00:00 2001 From: Cecilia Lau Date: Tue, 26 Aug 2025 08:59:38 -0400 Subject: [PATCH 1/5] slurm: add support for dumping and loading accounts with parents Signed-off-by: Cecilia Lau --- .../commands/add_allocation_defaults.py | 1 + coldfront/plugins/slurm/associations.py | 204 ++++++++++++------ .../slurm/management/commands/slurm_dump.py | 2 +- coldfront/plugins/slurm/tests/__init__.py | 0 coldfront/plugins/slurm/tests/test_parents.py | 197 +++++++++++++++++ coldfront/plugins/slurm/utils.py | 1 + 6 files changed, 339 insertions(+), 66 deletions(-) create mode 100644 coldfront/plugins/slurm/tests/__init__.py create mode 100644 coldfront/plugins/slurm/tests/test_parents.py diff --git a/coldfront/core/allocation/management/commands/add_allocation_defaults.py b/coldfront/core/allocation/management/commands/add_allocation_defaults.py index 9f7dc019ba..e1271a4e37 100644 --- a/coldfront/core/allocation/management/commands/add_allocation_defaults.py +++ b/coldfront/core/allocation/management/commands/add_allocation_defaults.py @@ -61,6 +61,7 @@ def handle(self, *args, **options): ("Purchase Order Number", "Int", False, True), ("send_expiry_email_on_date", "Date", False, True), ("slurm_account_name", "Text", False, False), + ("slurm_children", "Text", False, False), ("slurm_specs", "Attribute Expanded Text", False, True), ("slurm_specs_attriblist", "Text", False, True), ("slurm_user_specs", "Attribute Expanded Text", False, True), diff --git a/coldfront/plugins/slurm/associations.py b/coldfront/plugins/slurm/associations.py index 28b74fde92..e6ca3d8537 100644 --- a/coldfront/plugins/slurm/associations.py +++ b/coldfront/plugins/slurm/associations.py @@ -8,15 +8,21 @@ import re import sys +from django.core.exceptions import ObjectDoesNotExist + +from coldfront.core.allocation.models import Allocation, AllocationAttribute, AllocationAttributeType from coldfront.core.resource.models import Resource from coldfront.plugins.slurm.utils import ( SLURM_ACCOUNT_ATTRIBUTE_NAME, + SLURM_CHILDREN_ATTRIBUTE_NAME, SLURM_CLUSTER_ATTRIBUTE_NAME, SLURM_SPECS_ATTRIBUTE_NAME, SLURM_USER_SPECS_ATTRIBUTE_NAME, SlurmError, ) +SLURM_ACCOUNT_ATTRIBUTE_TYPE = AllocationAttributeType.objects.get(name=SLURM_ACCOUNT_ATTRIBUTE_NAME) + logger = logging.getLogger(__name__) @@ -69,6 +75,8 @@ def new_from_stream(stream): """Create a new SlurmCluster by parsing the output from sacctmgr dump.""" cluster = None parent = None + parent_account = None + no_cluster_error = SlurmParserError("Failed to parse Slurm cluster name. Is this in sacctmgr dump file format?") for line in stream: line = line.strip() if re.match("^#", line): @@ -77,28 +85,36 @@ def new_from_stream(stream): parts = line.split(":") name = re.sub(r"^Cluster - ", "", parts[0]).strip("\n'") if len(name) == 0: - raise (SlurmParserError("Cluster name not found for line: {}".format(line))) + raise SlurmParserError(f"Cluster name not found for line: {line}") cluster = SlurmCluster(name) cluster.specs += parts[1:] elif re.match("^Account - '[^']+'", line): + if not cluster or not cluster.name: + raise no_cluster_error account = SlurmAccount.new_from_sacctmgr(line) - cluster.accounts[account.name] = account + if parent == "root": + cluster.accounts[account.name] = account + elif parent_account: + parent_account.add_child(account) elif re.match("^Parent - '[^']+'", line): + if not cluster or not cluster.name: + raise no_cluster_error parent = re.sub(r"^Parent - ", "", line).strip("\n'") if parent == "root": cluster.accounts["root"] = SlurmAccount("root") if not parent: - raise (SlurmParserError("Parent name not found for line: {}".format(line))) + raise SlurmParserError(f"Parent name not found for line: {line}") + parent_account = cluster.get_account(parent) elif re.match("^User - '[^']+'", line): + if not cluster or not cluster.name: + raise no_cluster_error user = SlurmUser.new_from_sacctmgr(line) - if not parent: - raise (SlurmParserError("Found user record without Parent for line: {}".format(line))) - account = cluster.accounts[parent] - account.add_user(user) - cluster.accounts[parent] = account + if not parent or not parent_account: + raise SlurmParserError(f"Found user record without Parent for line: {line}") + parent_account.add_child(user) if not cluster or not cluster.name: - raise (SlurmParserError("Failed to parse Slurm cluster name. Is this in sacctmgr dump file format?")) + raise no_cluster_error return cluster @@ -109,25 +125,41 @@ def new_from_resource(resource): specs = resource.get_attribute_list(SLURM_SPECS_ATTRIBUTE_NAME) user_specs = resource.get_attribute_list(SLURM_USER_SPECS_ATTRIBUTE_NAME) if not name: - raise (SlurmError("Resource {} missing slurm_cluster".format(resource))) + raise SlurmError(f"Resource {resource} missing slurm_cluster") cluster = SlurmCluster(name, specs) # Process allocations - for allocation in resource.allocation_set.filter(status__name__in=["Active", "Renewal Requested"]): - cluster.add_allocation(allocation, user_specs=user_specs) + allocations = resource.allocation_set.filter(status__name__in=["Active", "Renewal Requested"]) + for allocation in allocations: + cluster.add_allocation(allocation, allocations, user_specs=user_specs) + # remove child accounts cluster accounts + child_accounts = set() + for account in cluster.accounts.values(): + if account.child_type == SlurmAccount: + child_accounts.update(account.children.keys()) + for account_name in child_accounts: + del cluster.accounts[account_name] # Process child resources children = Resource.objects.filter(parent_resource_id=resource.id, resource_type__name="Cluster Partition") for r in children: partition_specs = r.get_attribute_list(SLURM_SPECS_ATTRIBUTE_NAME) partition_user_specs = r.get_attribute_list(SLURM_USER_SPECS_ATTRIBUTE_NAME) - for allocation in r.allocation_set.filter(status__name__in=["Active", "Renewal Requested"]): - cluster.add_allocation(allocation, specs=partition_specs, user_specs=partition_user_specs) + allocations = r.allocation_set.filter(status__name__in=["Active", "Renewal Requested"]) + for allocation in allocations: + cluster.add_allocation(allocation, allocations, specs=partition_specs, user_specs=partition_user_specs) + # remove child accounts cluster accounts + child_accounts = set() + for account in cluster.accounts.values(): + if account.child_type == SlurmAccount: + child_accounts.update(account.children.keys()) + for account_name in child_accounts: + del cluster.accounts[account_name] return cluster - def add_allocation(self, allocation, specs=None, user_specs=None): + def add_allocation(self, allocation, res_allocations, specs=None, user_specs=None): if specs is None: specs = [] @@ -138,54 +170,58 @@ def add_allocation(self, allocation, specs=None, user_specs=None): logger.debug("Adding allocation name=%s specs=%s user_specs=%s", name, specs, user_specs) account = self.accounts.get(name, SlurmAccount(name)) - account.add_allocation(allocation, user_specs=user_specs) + account.add_allocation(allocation, res_allocations, user_specs=user_specs) account.specs += specs self.accounts[name] = account + def get_account(self, account_name): + if account_name in self.accounts.keys(): + return self.accounts[account_name] + for account in self.accounts.values(): + result = account.get_account(account_name) + if result: + return result + return None + def write(self, out): - self._write(out, "# ColdFront Allocation Slurm associations dump {}\n".format(datetime.datetime.now().date())) - self._write( - out, - "Cluster - '{}':{}\n".format( - self.name, - self.format_specs(), - ), - ) + self._write(out, f"# ColdFront Allocation Slurm associations dump {datetime.datetime.now().date()}\n") + self._write(out, f"Cluster - '{self.name}':{self.format_specs()}\n") if "root" in self.accounts: self.accounts["root"].write(out) else: self._write(out, "Parent - 'root'\n") self._write(out, "User - 'root':DefaultAccount='root':AdminLevel='Administrator':Fairshare=1\n") - for name, account in self.accounts.items(): + for account in self.accounts.values(): if account.name == "root": continue account.write(out) for name, account in self.accounts.items(): - account.write_users(out) + account.write_children(out) class SlurmAccount(SlurmBase): def __init__(self, name, specs=None): super().__init__(name, specs=specs) - self.users = {} + self.child_type = None + self.children = {} @staticmethod def new_from_sacctmgr(line): """Create a new SlurmAccount by parsing a line from sacctmgr dump. For example: Account - 'physics':Description='physics group':Organization='cas':Fairshare=100""" if not re.match("^Account - '[^']+'", line): - raise (SlurmParserError('Invalid format. Must start with "Account" for line: {}'.format(line))) + raise SlurmParserError(f'Invalid format. Must start with "Account" for line: {line}') parts = line.split(":") name = re.sub(r"^Account - ", "", parts[0]).strip("\n'") if len(name) == 0: - raise (SlurmParserError("Cluster name not found for line: {}".format(line))) + raise SlurmParserError(f"Cluster name not found for line: {line}") return SlurmAccount(name, specs=parts[1:]) - def add_allocation(self, allocation, user_specs=None): + def add_allocation(self, allocation: Allocation, res_allocations, user_specs=None): """Add users from a ColdFront Allocation model to SlurmAccount""" if user_specs is None: user_specs = [] @@ -195,39 +231,83 @@ def add_allocation(self, allocation, user_specs=None): name = "root" if name != self.name: - raise (SlurmError("Allocation {} slurm_account_name does not match {}".format(allocation, self.name))) - - self.specs += allocation.get_attribute_list(SLURM_SPECS_ATTRIBUTE_NAME) + raise SlurmError( + f"Allocation {allocation} {SLURM_ACCOUNT_ATTRIBUTE_NAME} {name} does not match {self.name}" + ) - allocation_user_specs = allocation.get_attribute_list(SLURM_USER_SPECS_ATTRIBUTE_NAME) - for u in allocation.allocationuser_set.filter(status__name="Active"): - user = SlurmUser(u.user.username) - user.specs += allocation_user_specs - user.specs += user_specs - self.add_user(user) + child_accounts = set(allocation.get_attribute_list(SLURM_CHILDREN_ATTRIBUTE_NAME)) + if len(child_accounts) > 0 and allocation.allocationuser_set.count() > 0: + raise SlurmError( + f"Allocation {allocation} cannot be a parent and have users!" + f" Please remove users or all {SLURM_CHILDREN_ATTRIBUTE_NAME} attributes." + ) - def add_user(self, user): - if user.name not in self.users: - self.users[user.name] = user + self.specs += allocation.get_attribute_list(SLURM_SPECS_ATTRIBUTE_NAME) - rec = self.users[user.name] - rec.specs += user.specs - self.users[user.name] = rec + if len(child_accounts) > 0: + self.child_type = SlurmAccount + for account_name in child_accounts: + account = self.children.get(account_name, SlurmAccount(account_name)) + try: + child_allocation = res_allocations.get( + pk=AllocationAttribute.objects.get( + allocation_attribute_type=SLURM_ACCOUNT_ATTRIBUTE_TYPE, value=account_name + ).allocation.pk + ) + account.add_allocation(child_allocation, res_allocations, user_specs=user_specs) + except ObjectDoesNotExist: + raise SlurmError( + f"No allocation with {SLURM_ACCOUNT_ATTRIBUTE_TYPE}={account_name} in correct resource" # Don't have an easy way to get the resource here + ) + + self.add_child(account) + else: + self.child_type = SlurmUser + allocation_user_specs = allocation.get_attribute_list(SLURM_USER_SPECS_ATTRIBUTE_NAME) + for u in allocation.allocationuser_set.filter(status__name="Active"): + user = SlurmUser(u.user.username) + user.specs += allocation_user_specs + user.specs += user_specs + self.add_child(user) + + def add_child(self, child): + if not self.child_type: + self.child_type = type(child) + else: + if type(child) is not self.child_type: + raise SlurmError( + f"Cannot assign child of type {type(child)} to parent with child_type {self.child_type}" + ) + if child.name not in self.children: + self.children[child.name] = child + + ch = self.children[child.name] + ch.specs += child.specs + self.children[child.name] = ch + + def get_account(self, account_name): + if self.child_type != SlurmAccount: + return None + if account_name in self.children.keys(): + return self.children[account_name] + for account in self.children.values(): + result = account.get_account(account_name) + if result: + return result + return None def write(self, out): if self.name != "root": - self._write( - out, - "Account - '{}':{}\n".format( - self.name, - self.format_specs(), - ), - ) + self._write(out, f"Account - '{self.name}':{self.format_specs()}\n") - def write_users(self, out): - self._write(out, "Parent - '{}'\n".format(self.name)) - for uid, user in self.users.items(): - user.write(out) + def write_children(self, out): + self._write(out, f"Parent - '{self.name}'\n") + for child in self.children.values(): + child.write(out) + if self.child_type == SlurmUser: + return + for child in self.children.values(): + child.write_children(out) class SlurmUser(SlurmBase): @@ -236,20 +316,14 @@ def new_from_sacctmgr(line): """Create a new SlurmUser by parsing a line from sacctmgr dump. For example: User - 'jane':DefaultAccount='physics':Fairshare=Parent:QOS='general-compute'""" if not re.match("^User - '[^']+'", line): - raise (SlurmParserError('Invalid format. Must start with "User" for line: {}'.format(line))) + raise SlurmParserError(f'Invalid format. Must start with "User" for line: {line}') parts = line.split(":") name = re.sub(r"^User - ", "", parts[0]).strip("\n'") if len(name) == 0: - raise (SlurmParserError("User name not found for line: {}".format(line))) + raise SlurmParserError("User name not found for line: {line}") return SlurmUser(name, specs=parts[1:]) def write(self, out): - self._write( - out, - "User - '{}':{}\n".format( - self.name, - self.format_specs(), - ), - ) + self._write(out, f"User - '{self.name}':{self.format_specs()}\n") diff --git a/coldfront/plugins/slurm/management/commands/slurm_dump.py b/coldfront/plugins/slurm/management/commands/slurm_dump.py index 5e590f8628..a9d5ee58a5 100644 --- a/coldfront/plugins/slurm/management/commands/slurm_dump.py +++ b/coldfront/plugins/slurm/management/commands/slurm_dump.py @@ -53,5 +53,5 @@ def handle(self, *args, **options): cluster.write(self.stdout) continue - with open(os.path.join(out_dir, "{}.cfg".format(cluster.name)), "w") as fh: + with open(os.path.join(out_dir, f"{cluster.name}.cfg"), "w") as fh: cluster.write(fh) diff --git a/coldfront/plugins/slurm/tests/__init__.py b/coldfront/plugins/slurm/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/coldfront/plugins/slurm/tests/test_parents.py b/coldfront/plugins/slurm/tests/test_parents.py new file mode 100644 index 0000000000..17093facff --- /dev/null +++ b/coldfront/plugins/slurm/tests/test_parents.py @@ -0,0 +1,197 @@ +# SPDX-FileCopyrightText: (C) ColdFront Authors +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +from io import StringIO +import sys + +from django.core.management import call_command +from django.test import TestCase + +from coldfront.core.allocation.models import AllocationAttributeType, Allocation +from coldfront.core.resource.models import Resource, ResourceAttribute, ResourceAttributeType, ResourceType +from coldfront.core.test_helpers.factories import ( + AllocationAttributeFactory, + AllocationAttributeTypeFactory, + AllocationFactory, + AllocationStatusChoiceFactory, + AllocationUserFactory, + ProjectFactory, + ProjectUserFactory, + ResourceFactory, + ResourceTypeFactory, + UserFactory, +) +from coldfront.plugins.slurm.associations import SlurmCluster + +# Building this account structure in slurm/coldfront +# 'a' represents an account and 'u' represents a user +# +# a1 a7 +# / \ / \ +# a2 a3 u1 u3 +# / / \ +# u1 a4 a5 +# / \ | +# u1 u2 a6 +# | +# u3 + + +class AssociationTest(TestCase): + @classmethod + def setUpClass(cls): + # call_command("import_field_of_science_data") + # call_command("add_default_grant_options") + call_command("add_default_project_choices") + call_command("add_allocation_defaults") + call_command("add_resource_defaults") + + super(AssociationTest, cls).setUpClass() + + @classmethod + def setUpTestData(cls): + # create cluster resource + cls.resource = ResourceFactory(resource_type=ResourceType.objects.get(name="Cluster")) + ResourceAttribute.objects.create( + resource=cls.resource, + resource_attribute_type=ResourceAttributeType.objects.get(name="slurm_cluster"), + value="test_cluster", + ) + # create users + cls.u1 = UserFactory(username="u1") + cls.u2 = UserFactory(username="u2") + cls.u3 = UserFactory(username="u3") + # create project + cls.project = ProjectFactory(title="test_proj") + ProjectUserFactory(project=cls.project, user=cls.u1) + ProjectUserFactory(project=cls.project, user=cls.u2) + ProjectUserFactory(project=cls.project, user=cls.u3) + # create allocations + alloc_kwargs = {"project": cls.project, "status": AllocationStatusChoiceFactory(name="Active")} + san_aat = AllocationAttributeType.objects.get(name="slurm_account_name") + sc_aat = AllocationAttributeType.objects.get(name="slurm_children") + cls.a1 = Allocation.objects.create(**alloc_kwargs) + cls.a2 = Allocation.objects.create(**alloc_kwargs) + cls.a3 = Allocation.objects.create(**alloc_kwargs) + cls.a4 = Allocation.objects.create(**alloc_kwargs) + cls.a5 = Allocation.objects.create(**alloc_kwargs) + cls.a6 = Allocation.objects.create(**alloc_kwargs) + cls.a7 = Allocation.objects.create(**alloc_kwargs) + cls.a1.resources.add(cls.resource) + cls.a2.resources.add(cls.resource) + cls.a3.resources.add(cls.resource) + cls.a4.resources.add(cls.resource) + cls.a5.resources.add(cls.resource) + cls.a6.resources.add(cls.resource) + cls.a7.resources.add(cls.resource) + # slurm account names + aat_kwargs = {"allocation_attribute_type": san_aat} + AllocationAttributeFactory(allocation=cls.a1, value="a1", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a2, value="a2", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a3, value="a3", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a4, value="a4", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a5, value="a5", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a6, value="a6", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a7, value="a7", **aat_kwargs) + # slurm children + aat_kwargs = {"allocation_attribute_type": sc_aat} + AllocationAttributeFactory(allocation=cls.a1, value="a2", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a1, value="a3", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a3, value="a4", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a3, value="a5", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a5, value="a6", **aat_kwargs) + # add users to allocations + AllocationUserFactory(allocation=cls.a2, user=cls.u1) + AllocationUserFactory(allocation=cls.a4, user=cls.u1) + AllocationUserFactory(allocation=cls.a4, user=cls.u2) + AllocationUserFactory(allocation=cls.a6, user=cls.u3) + AllocationUserFactory(allocation=cls.a7, user=cls.u1) + AllocationUserFactory(allocation=cls.a7, user=cls.u3) + + def test_allocations_to_slurm(self): + """non-exhaustive, should make better""" + cluster = SlurmCluster.new_from_resource(self.resource) + self.assertEqual(cluster.name, "test_cluster") + self.assertEqual(len(cluster.accounts), 2) + self.assertIn("a1", cluster.accounts) + self.assertIn("a7", cluster.accounts) + self.assertEqual(len(cluster.accounts["a7"].children), 2) + + def test_slurm_dump_roundtrip(self): + """todo""" + dump = StringIO(""" +# To edit this file start with a cluster line for the new cluster +# Cluster - 'cluster_name':MaxNodesPerJob=50 +# Followed by Accounts you want in this fashion (root is created by default)... +# Parent - 'root' +# Account - 'cs':MaxNodesPerJob=5:MaxJobs=4:MaxTRESMins=cpu=20:FairShare=399:MaxWallDuration=40:Description='Computer Science':Organization='LC' +# Any of the options after a ':' can be left out and they can be in any order. +# If you want to add any sub accounts just list the Parent THAT HAS ALREADY +# BEEN CREATED before the account line in this fashion... +# Parent - 'cs' +# Account - 'test':MaxNodesPerJob=1:MaxJobs=1:MaxTRESMins=cpu=1:FairShare=1:MaxWallDuration=1:Description='Test Account':Organization='Test' +# To add users to a account add a line like this after a Parent - 'line' +# User - 'lipari':MaxNodesPerJob=2:MaxJobs=3:MaxTRESMins=cpu=4:FairShare=1:MaxWallDurationPerJob=1 +Cluster - 'test_cluster' +Parent - 'root' +User - 'root':DefaultAccount='root':AdminLevel='Administrator':Fairshare=1 +Account - 'a1':Description='a1':Organization='a1' +Account - 'a7':Description='a7':Organization='a7' +Parent - 'a1' +Account - 'a2':Description='a2':Organization='a2' +Account - 'a3':Description='a3':Organization='a3' +Parent - 'a2' +User - 'u1' +Parent - 'a3' +Account - 'a4':Description='a4':Organization='a4' +Account - 'a5':Description='a5':Organization='a5' +Parent - 'a4' +User - 'u1' +User - 'u2' +Parent - 'a5' +Account - 'a6':Description='a6':Organization='a6' +Parent - 'a6' +User - 'u3' +Parent - 'a7' +User - 'u1' +User - 'u3' + """) + + expected = """Cluster - 'test_cluster': +Parent - 'root' +User - 'root':DefaultAccount='root':AdminLevel='Administrator':Fairshare=1 +Account - 'a1': +Account - 'a7': +Parent - 'a1' +Account - 'a3': +Account - 'a2': +Parent - 'a3' +Account - 'a4': +Account - 'a5': +Parent - 'a4' +User - 'u1': +User - 'u2': +Parent - 'a5' +Account - 'a6': +Parent - 'a6' +User - 'u3': +Parent - 'a2' +User - 'u1': +Parent - 'a7' +User - 'u1': +User - 'u3':""".strip() + + cluster = SlurmCluster.new_from_resource(self.resource) + out = StringIO("") + cluster.write(out) + cluster.write(sys.stdout) + out.seek(0) + + cluster = SlurmCluster.new_from_stream(out) + + self.assertEqual(cluster.name, "test_cluster") + self.assertEqual(len(cluster.accounts), 3) + self.assertIn("a1", cluster.accounts) + self.assertIn("a7", cluster.accounts) + self.assertEqual(len(cluster.accounts["a7"].children), 2) diff --git a/coldfront/plugins/slurm/utils.py b/coldfront/plugins/slurm/utils.py index 7f8bf9db48..6edb077a8b 100644 --- a/coldfront/plugins/slurm/utils.py +++ b/coldfront/plugins/slurm/utils.py @@ -12,6 +12,7 @@ SLURM_CLUSTER_ATTRIBUTE_NAME = import_from_settings("SLURM_CLUSTER_ATTRIBUTE_NAME", "slurm_cluster") SLURM_ACCOUNT_ATTRIBUTE_NAME = import_from_settings("SLURM_ACCOUNT_ATTRIBUTE_NAME", "slurm_account_name") +SLURM_CHILDREN_ATTRIBUTE_NAME = import_from_settings("SLURM_CHILDREN_ATTRIBUTE_NAME", "slurm_children") SLURM_SPECS_ATTRIBUTE_NAME = import_from_settings("SLURM_SPECS_ATTRIBUTE_NAME", "slurm_specs") SLURM_USER_SPECS_ATTRIBUTE_NAME = import_from_settings("SLURM_USER_SPECS_ATTRIBUTE_NAME", "slurm_user_specs") SLURM_SACCTMGR_PATH = import_from_settings("SLURM_SACCTMGR_PATH", "/usr/bin/sacctmgr") From 8218d31e003dc5390e713a86430e108568106d50 Mon Sep 17 00:00:00 2001 From: Cecilia Lau Date: Tue, 26 Aug 2025 14:05:23 -0400 Subject: [PATCH 2/5] update logging to use fstrings Signed-off-by: Cecilia Lau --- .../slurm/management/commands/slurm_check.py | 54 +++++++---------- coldfront/plugins/slurm/tests/test_parents.py | 60 ++++++------------- 2 files changed, 42 insertions(+), 72 deletions(-) diff --git a/coldfront/plugins/slurm/management/commands/slurm_check.py b/coldfront/plugins/slurm/management/commands/slurm_check.py index ded64a6ce5..dea9bf3ff3 100644 --- a/coldfront/plugins/slurm/management/commands/slurm_check.py +++ b/coldfront/plugins/slurm/management/commands/slurm_check.py @@ -53,11 +53,11 @@ def write(self, data): def _skip_user(self, user, account): if user in SLURM_IGNORE_USERS: - logger.debug("Ignoring user %s", user) + logger.debug(f"Ignoring user {user}") return True if account in SLURM_IGNORE_ACCOUNTS: - logger.debug("Ignoring account %s", account) + logger.debug(f"Ignoring account {account}") return True if self.filter_account and account != self.filter_account: @@ -70,7 +70,7 @@ def _skip_user(self, user, account): def _skip_account(self, account): if account in SLURM_IGNORE_ACCOUNTS: - logger.debug("Ignoring account %s", account) + logger.debug(f"Ignoring account {account}") return True if self.filter_user: @@ -89,13 +89,9 @@ def remove_user(self, user, account, cluster): try: slurm_remove_assoc(user, cluster, account, noop=self.noop) except SlurmError as e: - logger.error( - "Failed removing Slurm association user %s account %s cluster %s: %s", user, account, cluster, e - ) + logger.error(f"Failed removing Slurm association user {user} account {account} cluster {cluster}: {e}") else: - logger.error( - "Removed Slurm association user %s account %s cluster %s successfully", user, account, cluster - ) + logger.error(f"Removed Slurm association user {user} account {account} cluster {cluster} successfully") row = [ user, @@ -114,9 +110,9 @@ def remove_account(self, account, cluster): try: slurm_remove_account(cluster, account, noop=self.noop) except SlurmError as e: - logger.error("Failed removing Slurm account %s cluster %s: %s", account, cluster, e) + logger.error(f"Failed removing Slurm account {account} cluster {cluster}: {e}") else: - logger.error("Removed Slurm account %s cluster %s successfully", account, cluster) + logger.error(f"Removed Slurm account {account} cluster {cluster} successfully") row = [ "", @@ -137,11 +133,11 @@ def remove_qos(self, user, account, cluster, qos): pass except SlurmError as e: logger.error( - "Failed removing Slurm qos %s for user %s account %s cluster %s: %s", qos, user, account, cluster, e + f"Failed removing Slurm qos {qos} for user {user} account {account} cluster {cluster}: {e}" ) else: logger.error( - "Removed Slurm qos %s for user %s account %s cluster %s successfully", qos, user, account, cluster + f"Removed Slurm qos {qos} for user {user} account {account} cluster {cluster} successfully" ) row = [user, account, cluster, "Remove", qos] @@ -166,12 +162,11 @@ def _parse_qos(self, qos): def _diff_qos(self, account_name, cluster_name, user_a, user_b): logger.debug( - "diff qos: cluster=%s account=%s uid=%s a=%s b=%s", - cluster_name, - account_name, - user_a.name, - user_a.spec_list(), - user_b.spec_list(), + f"diff qos: cluster={cluster_name}" + f" account={account_name}" + f" uid={user_a.name}" + f" a={user_a.spec_list()}" + f" b={user_b.spec_list()}" ) specs_a = [] @@ -189,13 +184,12 @@ def _diff_qos(self, account_name, cluster_name, user_a, user_b): diff = specs_set_a.difference(specs_set_b) logger.debug( - "diff qos: cluster=%s account=%s uid=%s a=%s b=%s diff=%s", - cluster_name, - account_name, - user_a.name, - specs_set_a, - specs_set_b, - diff, + f"diff qos: cluster={cluster_name}" + f" account={account_name}" + f" uid={user_a.name}" + f" a={user_a.spec_list()}" + f" b={user_b.spec_list()}" + f" diff={diff}" ) if len(diff) > 0: @@ -238,7 +232,7 @@ def _cluster_from_dump(self, cluster): with open(fname) as fh: slurm_cluster = SlurmCluster.new_from_stream(fh) except SlurmError as e: - logger.error("Failed to dump Slurm cluster %s: %s", cluster, e) + logger.error(f"Failed to dump Slurm cluster {cluster}: {e}") return slurm_cluster @@ -277,7 +271,7 @@ def handle(self, *args, **options): sys.exit(1) if slurm_cluster.name in SLURM_IGNORE_CLUSTERS: - logger.warning("Ignoring cluster %s. Nothing to do.", slurm_cluster.name) + logger.warning(f"Ignoring cluster {slurm_cluster.name}. Nothing to do.") sys.exit(0) try: @@ -286,9 +280,7 @@ def handle(self, *args, **options): ).resource except ResourceAttribute.DoesNotExist: logger.error( - "No Slurm '%s' cluster resource found in ColdFront using '%s' attribute", - slurm_cluster.name, - SLURM_CLUSTER_ATTRIBUTE_NAME, + f"No Slurm '{slurm_cluster.name}' cluster resource found in ColdFront using '{SLURM_CLUSTER_ATTRIBUTE_NAME}' attribute" ) sys.exit(1) diff --git a/coldfront/plugins/slurm/tests/test_parents.py b/coldfront/plugins/slurm/tests/test_parents.py index 17093facff..c2b6b908d4 100644 --- a/coldfront/plugins/slurm/tests/test_parents.py +++ b/coldfront/plugins/slurm/tests/test_parents.py @@ -3,23 +3,19 @@ # SPDX-License-Identifier: AGPL-3.0-or-later from io import StringIO -import sys from django.core.management import call_command from django.test import TestCase -from coldfront.core.allocation.models import AllocationAttributeType, Allocation -from coldfront.core.resource.models import Resource, ResourceAttribute, ResourceAttributeType, ResourceType +from coldfront.core.allocation.models import Allocation, AllocationAttributeType +from coldfront.core.resource.models import ResourceAttribute, ResourceAttributeType, ResourceType from coldfront.core.test_helpers.factories import ( AllocationAttributeFactory, - AllocationAttributeTypeFactory, - AllocationFactory, AllocationStatusChoiceFactory, AllocationUserFactory, ProjectFactory, ProjectUserFactory, ResourceFactory, - ResourceTypeFactory, UserFactory, ) from coldfront.plugins.slurm.associations import SlurmCluster @@ -109,7 +105,7 @@ def setUpTestData(cls): AllocationUserFactory(allocation=cls.a7, user=cls.u1) AllocationUserFactory(allocation=cls.a7, user=cls.u3) - def test_allocations_to_slurm(self): + def test_slurm_from_resource(self): """non-exhaustive, should make better""" cluster = SlurmCluster.new_from_resource(self.resource) self.assertEqual(cluster.name, "test_cluster") @@ -119,7 +115,21 @@ def test_allocations_to_slurm(self): self.assertEqual(len(cluster.accounts["a7"].children), 2) def test_slurm_dump_roundtrip(self): - """todo""" + """create from resource, dump, and load dump""" + cluster = SlurmCluster.new_from_resource(self.resource) + out = StringIO("") + cluster.write(out) + out.seek(0) + + cluster = SlurmCluster.new_from_stream(out) + + self.assertEqual(cluster.name, "test_cluster") + self.assertEqual(len(cluster.accounts), 3) # root account is added when dumped + self.assertIn("a1", cluster.accounts) + self.assertIn("a7", cluster.accounts) + self.assertEqual(len(cluster.accounts["a7"].children), 2) + + def test_slurm_from_stream(self): dump = StringIO(""" # To edit this file start with a cluster line for the new cluster # Cluster - 'cluster_name':MaxNodesPerJob=50 @@ -157,39 +167,7 @@ def test_slurm_dump_roundtrip(self): User - 'u1' User - 'u3' """) - - expected = """Cluster - 'test_cluster': -Parent - 'root' -User - 'root':DefaultAccount='root':AdminLevel='Administrator':Fairshare=1 -Account - 'a1': -Account - 'a7': -Parent - 'a1' -Account - 'a3': -Account - 'a2': -Parent - 'a3' -Account - 'a4': -Account - 'a5': -Parent - 'a4' -User - 'u1': -User - 'u2': -Parent - 'a5' -Account - 'a6': -Parent - 'a6' -User - 'u3': -Parent - 'a2' -User - 'u1': -Parent - 'a7' -User - 'u1': -User - 'u3':""".strip() - - cluster = SlurmCluster.new_from_resource(self.resource) - out = StringIO("") - cluster.write(out) - cluster.write(sys.stdout) - out.seek(0) - - cluster = SlurmCluster.new_from_stream(out) - + cluster = SlurmCluster.new_from_stream(dump) self.assertEqual(cluster.name, "test_cluster") self.assertEqual(len(cluster.accounts), 3) self.assertIn("a1", cluster.accounts) From dc840ea435a9f7cd4006ff1ee7c6e7cbc4c59c13 Mon Sep 17 00:00:00 2001 From: Cecilia Lau Date: Thu, 28 Aug 2025 16:05:28 -0400 Subject: [PATCH 3/5] slurm: update slurm_check to consider parents Signed-off-by: Cecilia Lau --- coldfront/plugins/slurm/associations.py | 109 +++++++++++++++++- .../slurm/management/commands/slurm_check.py | 55 ++------- coldfront/plugins/slurm/utils.py | 18 +++ 3 files changed, 136 insertions(+), 46 deletions(-) diff --git a/coldfront/plugins/slurm/associations.py b/coldfront/plugins/slurm/associations.py index e6ca3d8537..0d975ac187 100644 --- a/coldfront/plugins/slurm/associations.py +++ b/coldfront/plugins/slurm/associations.py @@ -7,6 +7,7 @@ import os import re import sys +from typing import Self from django.core.exceptions import ObjectDoesNotExist @@ -19,6 +20,7 @@ SLURM_SPECS_ATTRIBUTE_NAME, SLURM_USER_SPECS_ATTRIBUTE_NAME, SlurmError, + parse_qos, ) SLURM_ACCOUNT_ATTRIBUTE_TYPE = AllocationAttributeType.objects.get(name=SLURM_ACCOUNT_ATTRIBUTE_NAME) @@ -68,7 +70,7 @@ def _write(self, out, data): class SlurmCluster(SlurmBase): def __init__(self, name, specs=None): super().__init__(name, specs=specs) - self.accounts = {} + self.accounts: dict[str, SlurmAccount] = {} @staticmethod def new_from_stream(stream): @@ -200,12 +202,27 @@ def write(self, out): for name, account in self.accounts.items(): account.write_children(out) + def get_objects_to_remove(self, expected: Self) -> dict[str, list[dict]]: + """Get the objects to remove from this cluster based on the expected cluster""" + objects_to_remove = { + "users": [], + "accounts": [], + "qoses": [], + } + for account_name, account in self.accounts.items(): + if account_name == "root": + continue + child_objects_to_remove = account.get_objects_to_remove(expected.accounts.get(account_name)) + for key, value in child_objects_to_remove.items(): + objects_to_remove[key].extend(value) + return objects_to_remove + class SlurmAccount(SlurmBase): def __init__(self, name, specs=None): super().__init__(name, specs=specs) self.child_type = None - self.children = {} + self.children: dict[str, SlurmAccount | SlurmUser] = {} @staticmethod def new_from_sacctmgr(line): @@ -309,6 +326,57 @@ def write_children(self, out): for child in self.children.values(): child.write_children(out) + def get_objects_to_remove(self, expected: Self | None = None) -> dict[str, list[dict]]: + """Get the objects to remove from this account based on the expected account. + If expected is None, remove the entire account. + """ + objects_to_remove = { + "users": [], + "accounts": [], + "qoses": [], + } + + if expected is None: + if self.child_type == SlurmAccount: + for account in self.children.values(): + child_objects_to_remove = account.get_objects_to_remove() + for key, value in child_objects_to_remove.items(): + objects_to_remove[key].extend(value) + elif self.child_type == SlurmUser: + for uid in self.children.keys(): + objects_to_remove["users"].append({"user": uid, "account": self.name}) + objects_to_remove["accounts"].append({"account": self.name}) + return objects_to_remove + + if self.child_type != expected.child_type: + # remove this entire account + child_objects_to_remove = self.get_objects_to_remove() + for key, value in child_objects_to_remove.items(): + objects_to_remove[key].extend(value) + elif self.child_type == expected.child_type: + children_removed = 0 + if self.child_type == SlurmAccount: + for account_name, account in self.children.items(): + child_objects_to_remove = self.get_objects_to_remove(expected.children.get(account_name)) + for key, value in child_objects_to_remove.items(): + objects_to_remove[key].extend(value) + elif self.child_type == SlurmUser: + for uid, user in self.children.items(): + if uid == "root": + continue + if uid not in expected.children: + objects_to_remove["users"].append({"user": uid, "account": self.name}) + children_removed += 1 + else: + qoses_to_remove = user.get_qoses_to_remove(self.name, self.name, expected.children[uid]) + if len(qoses_to_remove) > 0: + objects_to_remove["qoses"].append( + {"user": uid, "account": self.name, "qos": "QOS-=" + ",".join(list(qoses_to_remove))} + ) + if children_removed == len(self.children): + objects_to_remove["accounts"].append({"account": self.name}) + return objects_to_remove + class SlurmUser(SlurmBase): @staticmethod @@ -327,3 +395,40 @@ def new_from_sacctmgr(line): def write(self, out): self._write(out, f"User - '{self.name}':{self.format_specs()}\n") + + def get_qoses_to_remove(self, account_name: str, cluster_name: str, expected: Self) -> set[str]: + """Get the set of QOSes to remove from this user based on the expected user + Returns: set of QOS names to remove + """ + logger.debug( + f"diff qos: cluster={cluster_name}" + f" account={account_name}" + f" uid={self.name}" + f" self={self.spec_list()}" + f" expected={expected.spec_list()}" + ) + + specs_a = [] + for s in self.spec_list(): + if s.startswith("QOS"): + specs_a += parse_qos(s) + + specs_b = [] + for s in expected.spec_list(): + if s.startswith("QOS"): + specs_b += parse_qos(s) + + specs_set_a = set(specs_a) + specs_set_b = set(specs_b) + + diff = specs_set_a.difference(specs_set_b) + logger.debug( + f"diff qos: cluster={cluster_name}" + f" account={account_name}" + f" uid={self.name}" + f" self={self.spec_list()}" + f" expected={expected.spec_list()}" + f" diff={diff}" + ) + + return diff diff --git a/coldfront/plugins/slurm/management/commands/slurm_check.py b/coldfront/plugins/slurm/management/commands/slurm_check.py index dea9bf3ff3..fe750dadfc 100644 --- a/coldfront/plugins/slurm/management/commands/slurm_check.py +++ b/coldfront/plugins/slurm/management/commands/slurm_check.py @@ -15,6 +15,7 @@ from coldfront.plugins.slurm.utils import ( SLURM_CLUSTER_ATTRIBUTE_NAME, SlurmError, + parse_qos, slurm_dump_cluster, slurm_remove_account, slurm_remove_assoc, @@ -144,22 +145,6 @@ def remove_qos(self, user, account, cluster, qos): self.write("\t".join(row)) - def _parse_qos(self, qos): - if qos.startswith("QOS+="): - qos = qos.replace("QOS+=", "") - qos = qos.replace("'", "") - return qos.split(",") - elif qos.startswith("QOS="): - qos = qos.replace("QOS=", "") - qos = qos.replace("'", "") - lst = [] - for q in qos.split(","): - if q.startswith("+"): - lst.append(q.replace("+", "")) - return lst - - return [] - def _diff_qos(self, account_name, cluster_name, user_a, user_b): logger.debug( f"diff qos: cluster={cluster_name}" @@ -172,12 +157,12 @@ def _diff_qos(self, account_name, cluster_name, user_a, user_b): specs_a = [] for s in user_a.spec_list(): if s.startswith("QOS"): - specs_a += self._parse_qos(s) + specs_a += parse_qos(s) specs_b = [] for s in user_b.spec_list(): if s.startswith("QOS"): - specs_b += self._parse_qos(s) + specs_b += parse_qos(s) specs_set_a = set(specs_a) specs_set_b = set(specs_b) @@ -195,33 +180,15 @@ def _diff_qos(self, account_name, cluster_name, user_a, user_b): if len(diff) > 0: self.remove_qos(user_a.name, account_name, cluster_name, "QOS-=" + ",".join([x for x in list(diff)])) - def _diff(self, cluster_a, cluster_b): - for name, account in cluster_a.accounts.items(): - if name == "root": - continue - - if name in cluster_b.accounts: - total = 0 - for uid, user in account.users.items(): - if uid == "root": - continue - if uid not in cluster_b.accounts[name].users: - self.remove_user(uid, name, cluster_a.name) - total += 1 - else: - self._diff_qos(name, cluster_a.name, user, cluster_b.accounts[name].users[uid]) - - if total == len(account.users): - self.remove_account(name, cluster_a.name) - else: - for uid, user in account.users.items(): - self.remove_user(uid, name, cluster_a.name) - - self.remove_account(name, cluster_a.name) - - def check_consistency(self, slurm_cluster, coldfront_cluster): + def check_consistency(self, slurm_cluster: SlurmCluster, coldfront_cluster: SlurmCluster): # Check for accounts in Slurm NOT in ColdFront - self._diff(slurm_cluster, coldfront_cluster) + objects_to_remove = slurm_cluster.get_objects_to_remove(coldfront_cluster) + for qos_kwargs in objects_to_remove["qoses"]: + self.remove_qos(cluster=slurm_cluster.name, **qos_kwargs) + for user_kwargs in objects_to_remove["users"]: + self.remove_user(cluster=slurm_cluster.name, **user_kwargs) + for account_kwargs in objects_to_remove["accounts"]: + self.remove_account(cluster=slurm_cluster.name, **account_kwargs) def _cluster_from_dump(self, cluster): slurm_cluster = None diff --git a/coldfront/plugins/slurm/utils.py b/coldfront/plugins/slurm/utils.py index 6edb077a8b..3478fee573 100644 --- a/coldfront/plugins/slurm/utils.py +++ b/coldfront/plugins/slurm/utils.py @@ -152,3 +152,21 @@ def slurm_check_assoc(user, cluster, account): def slurm_dump_cluster(cluster, fname, noop=False): cmd = SLURM_CMD_DUMP_CLUSTER.format(shlex.quote(cluster), shlex.quote(fname)) _run_slurm_cmd(cmd, noop=noop) + + +def parse_qos(qos: str) -> list[str]: + """Parses QOS spec string into a list of QOSes""" + if qos.startswith("QOS+="): + qos = qos.replace("QOS+=", "") + qos = qos.replace("'", "") + return qos.split(",") + elif qos.startswith("QOS="): + qos = qos.replace("QOS=", "") + qos = qos.replace("'", "") + lst = [] + for q in qos.split(","): + if q.startswith("+"): + lst.append(q.replace("+", "")) + return lst + + return [] From a63dac9e08123c2ff536ab823bcc4208b3f2b2d7 Mon Sep 17 00:00:00 2001 From: Cecilia Lau Date: Tue, 2 Sep 2025 11:14:17 -0400 Subject: [PATCH 4/5] slurm: add support for parents to both account and user children and update readme Signed-off-by: Cecilia Lau --- coldfront/plugins/slurm/README.md | 28 ++- coldfront/plugins/slurm/associations.py | 182 ++++++++---------- coldfront/plugins/slurm/tests/test_parents.py | 50 ++++- 3 files changed, 140 insertions(+), 120 deletions(-) diff --git a/coldfront/plugins/slurm/README.md b/coldfront/plugins/slurm/README.md index 61aa1b12db..66a19baaee 100644 --- a/coldfront/plugins/slurm/README.md +++ b/coldfront/plugins/slurm/README.md @@ -3,7 +3,7 @@ ColdFront django plugin providing Slurm integration for ColdFront. Allocations in ColdFront are marshalled out to Slurm associations in the Slurm flat file format and can be loaded with sacctmgr. For more information on -the Slurm flat file format see [here](https://slurm.schedmd.com/sacctmgr.html). +the Slurm flat file format, see [here](https://slurm.schedmd.com/sacctmgr.html). A command line tool is also provided with this app that allows an administrator to check the consistency between ColdFront and Slurm and optionally remove any @@ -13,23 +13,31 @@ associations that should not be in Slurm according to ColdFront. Resources in ColdFront map to Clusters (or partitions within a cluster) in Slurm. The name of the Slurm cluster is taken from a resource attribute in -ColdFront named "slurm\_cluster". You can optionally provide Slurm -specifications for a cluster using a resource attribute named "slurm\_specs". +ColdFront named `slurm_cluster`. You can optionally provide Slurm +specifications for a cluster using a resource attribute named `slurm_specs`. The value of this attribute must conform to the Slurm specification format and are colon separated. Allocations in ColdFront map to Accounts in Slurm. The name of the Slurm account is taken from a allocation attribute in ColdFront named -"slurm\_account\_name" . You can optionally provide Slurm specifications for -the account using a allocation attribute named "slurm\_specs". The value of -this attribute must conform to the Slurm specification format and are colon -separated. +`slurm_account_name`. You can optionally provide Slurm specifications for +the account using a allocation attribute named `slurm_specs`. The value of +this attribute must conform to the Slurm specification format. This attribute +can either be colon-separated or multiple instances of this attribute can be +specified. For example, specifying both +`slurm_specs`: `QOS='+qos_interactive,-free'` +and `slurm_specs`: `DefaultAccount='my-account'` is equivalent to specifying +only `slurm_specs`: `QOS='+qos_interactive,-free':DefaultAccount='my-account'`. +You can also optionally provide child accounts with the `slurm_children` +allocation attribute - specify the account name. Allocation users in ColdFront map to Users in Slurm. You can optionally provide Slurm specifications for each user in a allocation using a -allocation attribute named "slurm\_user\_specs". The value of this attribute -must conform to the Slurm specification format and are colon separated. Setting -specifications on an individual user basis is not currently supported. +allocation attribute named `slurm_user_specs`. The value of this attribute +must conform to the Slurm specification format This attribute can either be +colon-separated or multiple instances of this attribute can be specified. +Setting specifications on an individual user basis is not currently supported. + ## Usage diff --git a/coldfront/plugins/slurm/associations.py b/coldfront/plugins/slurm/associations.py index 0d975ac187..e9341a29c5 100644 --- a/coldfront/plugins/slurm/associations.py +++ b/coldfront/plugins/slurm/associations.py @@ -1,15 +1,17 @@ # SPDX-FileCopyrightText: (C) ColdFront Authors # # SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations import datetime import logging import os import re import sys -from typing import Self +from typing import Optional, Self from django.core.exceptions import ObjectDoesNotExist +from django.db.models.query import QuerySet from coldfront.core.allocation.models import Allocation, AllocationAttribute, AllocationAttributeType from coldfront.core.resource.models import Resource @@ -97,7 +99,7 @@ def new_from_stream(stream): if parent == "root": cluster.accounts[account.name] = account elif parent_account: - parent_account.add_child(account) + parent_account.add_account(account) elif re.match("^Parent - '[^']+'", line): if not cluster or not cluster.name: raise no_cluster_error @@ -113,7 +115,7 @@ def new_from_stream(stream): user = SlurmUser.new_from_sacctmgr(line) if not parent or not parent_account: raise SlurmParserError(f"Found user record without Parent for line: {line}") - parent_account.add_child(user) + parent_account.add_user(user) if not cluster or not cluster.name: raise no_cluster_error @@ -135,11 +137,10 @@ def new_from_resource(resource): allocations = resource.allocation_set.filter(status__name__in=["Active", "Renewal Requested"]) for allocation in allocations: cluster.add_allocation(allocation, allocations, user_specs=user_specs) - # remove child accounts cluster accounts + # remove child accounts from cluster accounts child_accounts = set() for account in cluster.accounts.values(): - if account.child_type == SlurmAccount: - child_accounts.update(account.children.keys()) + child_accounts.update(account.accounts.keys()) for account_name in child_accounts: del cluster.accounts[account_name] @@ -154,8 +155,7 @@ def new_from_resource(resource): # remove child accounts cluster accounts child_accounts = set() for account in cluster.accounts.values(): - if account.child_type == SlurmAccount: - child_accounts.update(account.children.keys()) + child_accounts.update(account.accounts.keys()) for account_name in child_accounts: del cluster.accounts[account_name] @@ -221,8 +221,8 @@ def get_objects_to_remove(self, expected: Self) -> dict[str, list[dict]]: class SlurmAccount(SlurmBase): def __init__(self, name, specs=None): super().__init__(name, specs=specs) - self.child_type = None - self.children: dict[str, SlurmAccount | SlurmUser] = {} + self.users: dict[str, SlurmUser] = {} + self.accounts: dict[str, SlurmAccount] = {} @staticmethod def new_from_sacctmgr(line): @@ -238,7 +238,7 @@ def new_from_sacctmgr(line): return SlurmAccount(name, specs=parts[1:]) - def add_allocation(self, allocation: Allocation, res_allocations, user_specs=None): + def add_allocation(self, allocation: Allocation, res_allocations: QuerySet[Allocation], user_specs=None): """Add users from a ColdFront Allocation model to SlurmAccount""" if user_specs is None: user_specs = [] @@ -253,61 +253,48 @@ def add_allocation(self, allocation: Allocation, res_allocations, user_specs=Non ) child_accounts = set(allocation.get_attribute_list(SLURM_CHILDREN_ATTRIBUTE_NAME)) - if len(child_accounts) > 0 and allocation.allocationuser_set.count() > 0: - raise SlurmError( - f"Allocation {allocation} cannot be a parent and have users!" - f" Please remove users or all {SLURM_CHILDREN_ATTRIBUTE_NAME} attributes." - ) - self.specs += allocation.get_attribute_list(SLURM_SPECS_ATTRIBUTE_NAME) - if len(child_accounts) > 0: - self.child_type = SlurmAccount - for account_name in child_accounts: - account = self.children.get(account_name, SlurmAccount(account_name)) - try: - child_allocation = res_allocations.get( - pk=AllocationAttribute.objects.get( - allocation_attribute_type=SLURM_ACCOUNT_ATTRIBUTE_TYPE, value=account_name - ).allocation.pk - ) - account.add_allocation(child_allocation, res_allocations, user_specs=user_specs) - except ObjectDoesNotExist: - raise SlurmError( - f"No allocation with {SLURM_ACCOUNT_ATTRIBUTE_TYPE}={account_name} in correct resource" # Don't have an easy way to get the resource here - ) - - self.add_child(account) - else: - self.child_type = SlurmUser - allocation_user_specs = allocation.get_attribute_list(SLURM_USER_SPECS_ATTRIBUTE_NAME) - for u in allocation.allocationuser_set.filter(status__name="Active"): - user = SlurmUser(u.user.username) - user.specs += allocation_user_specs - user.specs += user_specs - self.add_child(user) - - def add_child(self, child): - if not self.child_type: - self.child_type = type(child) - else: - if type(child) is not self.child_type: + for account_name in child_accounts: + account = self.accounts.get(account_name, SlurmAccount(account_name)) + try: + child_allocation = res_allocations.get( + pk=AllocationAttribute.objects.get( + allocation_attribute_type=SLURM_ACCOUNT_ATTRIBUTE_TYPE, value=account_name + ).allocation.pk + ) + account.add_allocation(child_allocation, res_allocations, user_specs=user_specs) + except ObjectDoesNotExist: raise SlurmError( - f"Cannot assign child of type {type(child)} to parent with child_type {self.child_type}" + f"No allocation with {SLURM_ACCOUNT_ATTRIBUTE_TYPE}={account_name} in correct resource" # Don't have an easy way to get the resource here ) - if child.name not in self.children: - self.children[child.name] = child - ch = self.children[child.name] - ch.specs += child.specs - self.children[child.name] = ch + self.add_account(account) - def get_account(self, account_name): - if self.child_type != SlurmAccount: - return None - if account_name in self.children.keys(): - return self.children[account_name] - for account in self.children.values(): + allocation_user_specs = allocation.get_attribute_list(SLURM_USER_SPECS_ATTRIBUTE_NAME) + for u in allocation.allocationuser_set.filter(status__name="Active"): + user = SlurmUser(u.user.username) + user.specs += allocation_user_specs + user.specs += user_specs + self.add_user(user) + + def add_account(self, account: SlurmAccount) -> None: + if account.name not in self.accounts: + self.accounts[account.name] = account + return + self.accounts[account.name].specs += account.specs + + def add_user(self, user: SlurmUser) -> None: + if user.name not in self.users: + self.users[user.name] = user + return + self.users[user.name].specs += user.specs + + def get_account(self, account_name: str) -> Optional[SlurmAccount]: + """Gets an account, traversing through child accounts""" + if account_name in self.accounts.keys(): + return self.accounts[account_name] + for account in self.accounts.values(): result = account.get_account(account_name) if result: return result @@ -319,14 +306,14 @@ def write(self, out): def write_children(self, out): self._write(out, f"Parent - '{self.name}'\n") - for child in self.children.values(): - child.write(out) - if self.child_type == SlurmUser: - return - for child in self.children.values(): - child.write_children(out) + for user in self.users.values(): + user.write(out) + for account in self.accounts.values(): + account.write(out) + for account in self.accounts.values(): + account.write_children(out) - def get_objects_to_remove(self, expected: Self | None = None) -> dict[str, list[dict]]: + def get_objects_to_remove(self, expected: Optional[Self] = None) -> dict[str, list[dict]]: """Get the objects to remove from this account based on the expected account. If expected is None, remove the entire account. """ @@ -337,44 +324,39 @@ def get_objects_to_remove(self, expected: Self | None = None) -> dict[str, list[ } if expected is None: - if self.child_type == SlurmAccount: - for account in self.children.values(): - child_objects_to_remove = account.get_objects_to_remove() - for key, value in child_objects_to_remove.items(): - objects_to_remove[key].extend(value) - elif self.child_type == SlurmUser: - for uid in self.children.keys(): - objects_to_remove["users"].append({"user": uid, "account": self.name}) + for account in self.accounts.values(): + child_objects_to_remove = account.get_objects_to_remove() + for key, value in child_objects_to_remove.items(): + objects_to_remove[key].extend(value) + for uid in self.users.keys(): + objects_to_remove["users"].append({"user": uid, "account": self.name}) objects_to_remove["accounts"].append({"account": self.name}) return objects_to_remove - if self.child_type != expected.child_type: - # remove this entire account - child_objects_to_remove = self.get_objects_to_remove() + accounts_removed = 0 + for account_name, account in self.accounts.items(): + if account_name not in expected.accounts: + accounts_removed += 1 + child_objects_to_remove = self.get_objects_to_remove(expected.accounts.get(account_name)) for key, value in child_objects_to_remove.items(): objects_to_remove[key].extend(value) - elif self.child_type == expected.child_type: - children_removed = 0 - if self.child_type == SlurmAccount: - for account_name, account in self.children.items(): - child_objects_to_remove = self.get_objects_to_remove(expected.children.get(account_name)) - for key, value in child_objects_to_remove.items(): - objects_to_remove[key].extend(value) - elif self.child_type == SlurmUser: - for uid, user in self.children.items(): - if uid == "root": - continue - if uid not in expected.children: - objects_to_remove["users"].append({"user": uid, "account": self.name}) - children_removed += 1 - else: - qoses_to_remove = user.get_qoses_to_remove(self.name, self.name, expected.children[uid]) - if len(qoses_to_remove) > 0: - objects_to_remove["qoses"].append( - {"user": uid, "account": self.name, "qos": "QOS-=" + ",".join(list(qoses_to_remove))} - ) - if children_removed == len(self.children): - objects_to_remove["accounts"].append({"account": self.name}) + + users_removed = 0 + for uid, user in self.users.items(): + if uid == "root": + continue + if uid not in expected.users: + objects_to_remove["users"].append({"user": uid, "account": self.name}) + users_removed += 1 + else: + qoses_to_remove = user.get_qoses_to_remove(self.name, self.name, expected.users[uid]) + if len(qoses_to_remove) > 0: + objects_to_remove["qoses"].append( + {"user": uid, "account": self.name, "qos": "QOS-=" + ",".join(list(qoses_to_remove))} + ) + + if accounts_removed == len(self.accounts) and users_removed == len(self.users): + objects_to_remove["accounts"].append({"account": self.name}) return objects_to_remove diff --git a/coldfront/plugins/slurm/tests/test_parents.py b/coldfront/plugins/slurm/tests/test_parents.py index c2b6b908d4..39c819cac1 100644 --- a/coldfront/plugins/slurm/tests/test_parents.py +++ b/coldfront/plugins/slurm/tests/test_parents.py @@ -23,11 +23,11 @@ # Building this account structure in slurm/coldfront # 'a' represents an account and 'u' represents a user # -# a1 a7 -# / \ / \ -# a2 a3 u1 u3 -# / / \ -# u1 a4 a5 +# a1 a7 +# / \ / | \ +# a2 a3 a8 u3 a7 <-- a user, to test users and +# / / \ / \ accounts having the same name +# u1 a4 a5 u1 u2 # / \ | # u1 u2 a6 # | @@ -58,11 +58,13 @@ def setUpTestData(cls): cls.u1 = UserFactory(username="u1") cls.u2 = UserFactory(username="u2") cls.u3 = UserFactory(username="u3") + cls.u4 = UserFactory(username="a7") # create project cls.project = ProjectFactory(title="test_proj") ProjectUserFactory(project=cls.project, user=cls.u1) ProjectUserFactory(project=cls.project, user=cls.u2) ProjectUserFactory(project=cls.project, user=cls.u3) + ProjectUserFactory(project=cls.project, user=cls.u4) # create allocations alloc_kwargs = {"project": cls.project, "status": AllocationStatusChoiceFactory(name="Active")} san_aat = AllocationAttributeType.objects.get(name="slurm_account_name") @@ -74,6 +76,7 @@ def setUpTestData(cls): cls.a5 = Allocation.objects.create(**alloc_kwargs) cls.a6 = Allocation.objects.create(**alloc_kwargs) cls.a7 = Allocation.objects.create(**alloc_kwargs) + cls.a8 = Allocation.objects.create(**alloc_kwargs) cls.a1.resources.add(cls.resource) cls.a2.resources.add(cls.resource) cls.a3.resources.add(cls.resource) @@ -81,6 +84,7 @@ def setUpTestData(cls): cls.a5.resources.add(cls.resource) cls.a6.resources.add(cls.resource) cls.a7.resources.add(cls.resource) + cls.a8.resources.add(cls.resource) # slurm account names aat_kwargs = {"allocation_attribute_type": san_aat} AllocationAttributeFactory(allocation=cls.a1, value="a1", **aat_kwargs) @@ -90,6 +94,7 @@ def setUpTestData(cls): AllocationAttributeFactory(allocation=cls.a5, value="a5", **aat_kwargs) AllocationAttributeFactory(allocation=cls.a6, value="a6", **aat_kwargs) AllocationAttributeFactory(allocation=cls.a7, value="a7", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a8, value="a8", **aat_kwargs) # slurm children aat_kwargs = {"allocation_attribute_type": sc_aat} AllocationAttributeFactory(allocation=cls.a1, value="a2", **aat_kwargs) @@ -97,13 +102,16 @@ def setUpTestData(cls): AllocationAttributeFactory(allocation=cls.a3, value="a4", **aat_kwargs) AllocationAttributeFactory(allocation=cls.a3, value="a5", **aat_kwargs) AllocationAttributeFactory(allocation=cls.a5, value="a6", **aat_kwargs) + AllocationAttributeFactory(allocation=cls.a7, value="a8", **aat_kwargs) # add users to allocations AllocationUserFactory(allocation=cls.a2, user=cls.u1) AllocationUserFactory(allocation=cls.a4, user=cls.u1) AllocationUserFactory(allocation=cls.a4, user=cls.u2) AllocationUserFactory(allocation=cls.a6, user=cls.u3) - AllocationUserFactory(allocation=cls.a7, user=cls.u1) AllocationUserFactory(allocation=cls.a7, user=cls.u3) + AllocationUserFactory(allocation=cls.a7, user=cls.u4) + AllocationUserFactory(allocation=cls.a8, user=cls.u1) + AllocationUserFactory(allocation=cls.a8, user=cls.u2) def test_slurm_from_resource(self): """non-exhaustive, should make better""" @@ -112,7 +120,13 @@ def test_slurm_from_resource(self): self.assertEqual(len(cluster.accounts), 2) self.assertIn("a1", cluster.accounts) self.assertIn("a7", cluster.accounts) - self.assertEqual(len(cluster.accounts["a7"].children), 2) + a7_accounts = cluster.accounts["a7"].accounts + a7_users = cluster.accounts["a7"].users + self.assertEqual(len(a7_accounts), 1) + self.assertEqual(len(a7_users), 2) + self.assertIn("a8", a7_accounts) + self.assertIn("u3", a7_users) + self.assertIn("a7", a7_users) def test_slurm_dump_roundtrip(self): """create from resource, dump, and load dump""" @@ -127,7 +141,13 @@ def test_slurm_dump_roundtrip(self): self.assertEqual(len(cluster.accounts), 3) # root account is added when dumped self.assertIn("a1", cluster.accounts) self.assertIn("a7", cluster.accounts) - self.assertEqual(len(cluster.accounts["a7"].children), 2) + a7_accounts = cluster.accounts["a7"].accounts + a7_users = cluster.accounts["a7"].users + self.assertEqual(len(a7_accounts), 1) + self.assertEqual(len(a7_users), 2) + self.assertIn("a8", a7_accounts) + self.assertIn("u3", a7_users) + self.assertIn("a7", a7_users) def test_slurm_from_stream(self): dump = StringIO(""" @@ -164,12 +184,22 @@ def test_slurm_from_stream(self): Parent - 'a6' User - 'u3' Parent - 'a7' -User - 'u1' +Account - 'a8':Description='a8':Organization='a8' User - 'u3' +User - 'a7' +Parent - 'a8' +User - 'u1' +User - 'u2' """) cluster = SlurmCluster.new_from_stream(dump) self.assertEqual(cluster.name, "test_cluster") self.assertEqual(len(cluster.accounts), 3) self.assertIn("a1", cluster.accounts) self.assertIn("a7", cluster.accounts) - self.assertEqual(len(cluster.accounts["a7"].children), 2) + a7_accounts = cluster.accounts["a7"].accounts + a7_users = cluster.accounts["a7"].users + self.assertEqual(len(a7_accounts), 1) + self.assertEqual(len(a7_users), 2) + self.assertIn("a8", a7_accounts) + self.assertIn("u3", a7_users) + self.assertIn("a7", a7_users) From af1cee8a6319c230248814e5ed26c9ec663e46d2 Mon Sep 17 00:00:00 2001 From: Cecilia Lau Date: Tue, 2 Sep 2025 11:43:55 -0400 Subject: [PATCH 5/5] slurm: fix tests Signed-off-by: Cecilia Lau --- coldfront/plugins/slurm/associations.py | 9 +++++---- .../{test_associations.py => _test_associations.py} | 0 coldfront/plugins/slurm/tests/test_parents.py | 10 +++++----- 3 files changed, 10 insertions(+), 9 deletions(-) rename coldfront/plugins/slurm/tests/{test_associations.py => _test_associations.py} (100%) diff --git a/coldfront/plugins/slurm/associations.py b/coldfront/plugins/slurm/associations.py index e9341a29c5..6954548e19 100644 --- a/coldfront/plugins/slurm/associations.py +++ b/coldfront/plugins/slurm/associations.py @@ -25,8 +25,6 @@ parse_qos, ) -SLURM_ACCOUNT_ATTRIBUTE_TYPE = AllocationAttributeType.objects.get(name=SLURM_ACCOUNT_ATTRIBUTE_NAME) - logger = logging.getLogger(__name__) @@ -260,13 +258,16 @@ def add_allocation(self, allocation: Allocation, res_allocations: QuerySet[Alloc try: child_allocation = res_allocations.get( pk=AllocationAttribute.objects.get( - allocation_attribute_type=SLURM_ACCOUNT_ATTRIBUTE_TYPE, value=account_name + allocation_attribute_type=AllocationAttributeType.objects.get( + name=SLURM_ACCOUNT_ATTRIBUTE_NAME + ), + value=account_name, ).allocation.pk ) account.add_allocation(child_allocation, res_allocations, user_specs=user_specs) except ObjectDoesNotExist: raise SlurmError( - f"No allocation with {SLURM_ACCOUNT_ATTRIBUTE_TYPE}={account_name} in correct resource" # Don't have an easy way to get the resource here + f"No allocation with {SLURM_ACCOUNT_ATTRIBUTE_NAME}={account_name} in correct resource" # Don't have an easy way to get the resource here ) self.add_account(account) diff --git a/coldfront/plugins/slurm/tests/test_associations.py b/coldfront/plugins/slurm/tests/_test_associations.py similarity index 100% rename from coldfront/plugins/slurm/tests/test_associations.py rename to coldfront/plugins/slurm/tests/_test_associations.py diff --git a/coldfront/plugins/slurm/tests/test_parents.py b/coldfront/plugins/slurm/tests/test_parents.py index 39c819cac1..711e67acc7 100644 --- a/coldfront/plugins/slurm/tests/test_parents.py +++ b/coldfront/plugins/slurm/tests/test_parents.py @@ -7,10 +7,11 @@ from django.core.management import call_command from django.test import TestCase -from coldfront.core.allocation.models import Allocation, AllocationAttributeType +from coldfront.core.allocation.models import Allocation from coldfront.core.resource.models import ResourceAttribute, ResourceAttributeType, ResourceType from coldfront.core.test_helpers.factories import ( AllocationAttributeFactory, + AllocationAttributeTypeFactory, AllocationStatusChoiceFactory, AllocationUserFactory, ProjectFactory, @@ -22,6 +23,7 @@ # Building this account structure in slurm/coldfront # 'a' represents an account and 'u' represents a user +# unless otherwise stated # # a1 a7 # / \ / | \ @@ -37,8 +39,6 @@ class AssociationTest(TestCase): @classmethod def setUpClass(cls): - # call_command("import_field_of_science_data") - # call_command("add_default_grant_options") call_command("add_default_project_choices") call_command("add_allocation_defaults") call_command("add_resource_defaults") @@ -67,8 +67,8 @@ def setUpTestData(cls): ProjectUserFactory(project=cls.project, user=cls.u4) # create allocations alloc_kwargs = {"project": cls.project, "status": AllocationStatusChoiceFactory(name="Active")} - san_aat = AllocationAttributeType.objects.get(name="slurm_account_name") - sc_aat = AllocationAttributeType.objects.get(name="slurm_children") + san_aat = AllocationAttributeTypeFactory(name="slurm_account_name") + sc_aat = AllocationAttributeTypeFactory(name="slurm_children") cls.a1 = Allocation.objects.create(**alloc_kwargs) cls.a2 = Allocation.objects.create(**alloc_kwargs) cls.a3 = Allocation.objects.create(**alloc_kwargs)