clusterscope/cli.py (31 changes: 30 additions & 1 deletion)
@@ -195,7 +195,33 @@ def task():
     default="json",
     help="Format to output the job requirements in",
 )
-def slurm(num_gpus: int, num_tasks_per_node: int, output_format: str, partition: str):
+@click.option(
+    "--account",
+    type=str,
+    default=None,
+    help="SLURM account to charge resources to (optional)",
+)
+@click.option(
+    "--qos",
+    type=str,
+    default=None,
+    help="Quality of Service (QoS) specification for the job (optional)",
+)
+@click.option(
+    "--time",
+    type=str,
+    default=None,
+    help="Time limit for the job (format: HH:MM:SS or days-HH:MM:SS, optional)",
+)
+def slurm(
+    num_gpus: int,
+    num_tasks_per_node: int,
+    output_format: str,
+    partition: str,
+    account: str,
+    qos: str,
+    time: str,
+):
     """Generate job requirements for a task of a Slurm job."""
     partitions = get_partition_info()
     partition_names = [p.name for p in partitions]
@@ -210,6 +236,9 @@ def slurm(num_gpus: int, num_tasks_per_node: int, output_format: str, partition:
         partition=partition,
         num_gpus=num_gpus,
         num_tasks_per_node=num_tasks_per_node,
+        account=account,
+        qos=qos,
+        time=time,
     )
 
     # Route to the correct format method based on CLI option
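The three new options flow straight into the expanded slurm() signature. A quick way to exercise them without submitting anything is click's test runner. This sketch is not part of the PR: only --account, --qos, and --time are confirmed by the diff, the --partition spelling is an assumption, and the command still needs a host where get_partition_info() can talk to Slurm.

# Smoke-test sketch for the new flags (assumptions noted above).
from click.testing import CliRunner

from clusterscope.cli import slurm

runner = CliRunner()
result = runner.invoke(
    slurm,
    [
        "--partition", "gpu",        # assumed existing option name
        "--account", "my-account",   # new in this PR
        "--qos", "high",             # new in this PR
        "--time", "01:30:00",        # new in this PR
    ],
)
print(result.output)  # job requirements, JSON by default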
clusterscope/cluster_info.py (43 changes: 30 additions & 13 deletions)
@@ -27,23 +27,18 @@ class ResourceShape(NamedTuple):
     tasks_per_node: int
     gpus_per_node: int
     slurm_partition: str
+    account: Optional[str] = None
+    qos: Optional[str] = None
+    time: Optional[str] = None
 
     def to_json(self) -> str:
         """Convert ResourceShape to JSON format.
 
         Returns:
             str: JSON representation of the resource requirements
         """
-        mem_gb = parse_memory_to_gb(self.memory)
-
-        data = {
-            "cpu_cores": self.cpu_cores,
-            "memory": self.memory,
-            "tasks_per_node": self.tasks_per_node,
-            "mem_gb": mem_gb,
-            "gpus_per_node": self.gpus_per_node,
-            "slurm_partition": self.slurm_partition,
-        }
+        data = {k: v for k, v in self._asdict().items() if v is not None}
+        data["mem_gb"] = parse_memory_to_gb(data["memory"])
         return json.dumps(data, indent=2)
 
     def to_sbatch(self) -> str:
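The rewritten to_json() builds its payload from NamedTuple._asdict() and drops None-valued entries, so the three new optional fields appear in the JSON only when they are actually set, and mem_gb is derived from memory rather than stored. A small illustration (all constructor values are made up):

from clusterscope.cluster_info import ResourceShape

shape = ResourceShape(
    cpu_cores=8,
    memory="64G",  # illustrative; any format parse_memory_to_gb accepts
    tasks_per_node=1,
    gpus_per_node=1,
    slurm_partition="gpu",
    qos="normal",  # account and time stay at their None defaults
)
print(shape.to_json())
# Output includes "qos": "normal" plus a derived "mem_gb",
# but no "account" or "time" keys, since those are None.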
@@ -60,6 +55,10 @@ def to_sbatch(self) -> str:
             f"#SBATCH --gres=gpu:{self.gpus_per_node}",
             f"#SBATCH --partition={self.slurm_partition}",
         ]
+        for attr_name in ["account", "qos", "time"]:
+            value = getattr(self, attr_name)
+            if value is not None:
+                lines.append(f"#SBATCH --{attr_name}={value}")
         return "\n".join(lines)
 
     def to_srun(self) -> str:
@@ -76,6 +75,10 @@ def to_srun(self) -> str:
             f"--gres=gpu:{self.gpus_per_node}",
             f"--partition={self.slurm_partition}",
         ]
+        for attr_name in ["account", "qos", "time"]:
+            value = getattr(self, attr_name)
+            if value is not None:
+                cmd_parts.append(f"--{attr_name}={value}")
         return " ".join(cmd_parts)
 
     def to_submitit(self) -> str:
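to_sbatch() and to_srun() share one pattern: emit the unconditional entries first, then one --account/--qos/--time entry per field that is not None. A sketch of the effect; the expected output shows only the lines visible in this diff, and the field values are illustrative:

from clusterscope.cluster_info import ResourceShape

shape = ResourceShape(
    cpu_cores=16,
    memory="128G",
    tasks_per_node=1,
    gpus_per_node=2,
    slurm_partition="gpu",
    account="my-account",
    time="04:00:00",  # qos left as None
)
print(shape.to_sbatch())
# ...
# #SBATCH --gres=gpu:2
# #SBATCH --partition=gpu
# #SBATCH --account=my-account
# #SBATCH --time=04:00:00
# (no --qos line, since qos is None)
print(shape.to_srun())
# ... --gres=gpu:2 --partition=gpu --account=my-account --time=04:00:00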
@@ -90,9 +93,18 @@ def to_submitit(self) -> str:
             "slurm_partition": self.slurm_partition,
             "cpus_per_task": self.cpu_cores,
             "mem_gb": mem_gb,
-            "tasks_per_node": self.tasks_per_node,
-            "gpus_per_node": self.gpus_per_node,
         }
+        for attr_name in [
+            "slurm_partition",
+            "tasks_per_node",
+            "gpus_per_node",
+            "account",
+            "qos",
+            "time",
+        ]:
+            value = getattr(self, attr_name)
+            if value is not None:
+                params[attr_name] = value
         return json.dumps(params, indent=2)
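to_submitit() keeps cpus_per_task and mem_gb unconditional and moves everything else behind the same None check. Note that slurm_partition is both retained in the params literal and listed in the loop; the field has no default, so it is never None and the loop simply re-assigns the same value. Expected shape of the output (made-up values):

from clusterscope.cluster_info import ResourceShape

shape = ResourceShape(
    cpu_cores=8,
    memory="64G",
    tasks_per_node=1,
    gpus_per_node=1,
    slurm_partition="gpu",
    account="my-account",  # qos and time left as None
)
print(shape.to_submitit())
# {
#   "slurm_partition": "gpu",
#   "cpus_per_task": 8,
#   "mem_gb": ...,        <- whatever parse_memory_to_gb("64G") returns
#   "tasks_per_node": 1,
#   "gpus_per_node": 1,
#   "account": "my-account"
# }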


@@ -232,7 +244,11 @@ def get_total_gpus_per_node(self) -> int:
         return max(total_gpus, 1)  # Ensure at least 1 to avoid division by zero
 
     def get_task_resource_requirements(
-        self, partition: str, num_gpus: int, num_tasks_per_node: int = 1
+        self,
+        partition: str,
+        num_gpus: int,
+        num_tasks_per_node: int = 1,
+        **kwargs,
     ) -> ResourceShape:
         """Calculate resource requirements for better GPU packing based on node's GPU configuration.

@@ -300,6 +316,7 @@ def get_task_resource_requirements(
             memory=sbatch_memory,
             tasks_per_node=num_tasks_per_node,
             gpus_per_node=num_gpus,
+            **kwargs,
         )
 
     def get_array_job_requirements(
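End to end, the CLI's new flag values ride through get_task_resource_requirements(**kwargs) into the ResourceShape constructor, so cluster_info.py never has to name them individually. A usage sketch; ClusterInfo is a placeholder, since the diff does not show which class in clusterscope.cluster_info defines this method:

# "ClusterInfo" is a placeholder class name (assumption, not shown in the diff).
info = ClusterInfo()
shape = info.get_task_resource_requirements(
    partition="gpu",
    num_gpus=2,
    num_tasks_per_node=1,
    # Extra keywords are forwarded verbatim to ResourceShape:
    account="my-account",
    qos="high",
    time="01:30:00",
)
print(shape.to_sbatch())  # now includes --account, --qos, and --time directives

One consequence of the blind **kwargs forwarding: ResourceShape is a NamedTuple, so a misspelled keyword (say, acount) fails with a TypeError at construction time rather than being silently dropped.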