@@ -54,69 +54,69 @@ def generate_topology_config_file(output_file: str, input_file: str, block_sizes
54
54
BlockName=block2 Nodes=queue-1-st-compute-resource-0-[1-18] #### 18 nodes
55
55
BlockSizes=9,18
56
56
"""
57
-
58
- min_block_size_list = min (list (map (int , block_sizes .split ("," ))))
59
- max_block_size_list = max (list (map (int , block_sizes .split ("," ))))
60
-
61
- cluster_config = _load_cluster_config (input_file )
62
- queue_name , compute_resource_name = None , None
63
- try :
64
- topology_config = CONFIG_HEADER + "\n "
65
- block_count = 0
66
- for queue_config in cluster_config ["Scheduling" ]["SlurmQueues" ]:
67
- queue_name = queue_config ["Name" ]
68
-
69
- # Retrieve capacity info from the queue_name, if there
70
- # queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
71
- # if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
72
- # log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
73
- # continue
74
-
75
- queue_capacity_reservation_target = queue_config .get ("CapacityReservationTarget" , {})
76
- queue_capacity_reservation = (
77
- queue_capacity_reservation_target .get ("CapacityReservationId" )
78
- if queue_capacity_reservation_target
79
- else None
80
- )
81
-
82
- for compute_resource_config in queue_config ["ComputeResources" ]:
83
- compute_resource_name = compute_resource_config ["Name" ]
84
- compute_min_count = compute_resource_config ["MinCount" ]
85
- compute_max_count = compute_resource_config ["MaxCount" ]
86
- if compute_min_count == compute_max_count :
87
- node_type = "st"
88
- else :
89
- continue
90
-
91
- capacity_reservation_target = compute_resource_config .get ("CapacityReservationTarget" , {})
92
- capacity_reservation = (
93
- capacity_reservation_target .get ("CapacityReservationId" , queue_capacity_reservation )
94
- if capacity_reservation_target
95
- else queue_capacity_reservation
57
+ if block_sizes :
58
+ min_block_size_list = min (list (map (int , block_sizes .split ("," ))))
59
+ max_block_size_list = max (list (map (int , block_sizes .split ("," ))))
60
+
61
+ cluster_config = _load_cluster_config (input_file )
62
+ queue_name , compute_resource_name = None , None
63
+ try :
64
+ topology_config = CONFIG_HEADER + "\n "
65
+ block_count = 0
66
+ for queue_config in cluster_config ["Scheduling" ]["SlurmQueues" ]:
67
+ queue_name = queue_config ["Name" ]
68
+
69
+ # Retrieve capacity info from the queue_name, if there
70
+ # queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
71
+ # if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
72
+ # log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
73
+ # continue
74
+
75
+ queue_capacity_reservation_target = queue_config .get ("CapacityReservationTarget" , {})
76
+ queue_capacity_reservation = (
77
+ queue_capacity_reservation_target .get ("CapacityReservationId" )
78
+ if queue_capacity_reservation_target
79
+ else None
96
80
)
97
- ### Check for if reservation is for NVLink and size matches min_block_size_list
98
- # if compute_resource_config.get('InstanceType') == 'p6e-gb200.36xlarge':
99
- if min_block_size_list == compute_min_count or max_block_size_list == compute_max_count :
100
- block_count += 1
101
- ### Each Capacity Reservation ID is a Capacity Block and we associate each slurm block with a single capacity Block
102
- topology_config += "BlockName=Block" + str (block_count )+ " Nodes=" + str (queue_name ) + "-" + str (node_type ) + "-" + str (compute_resource_name ) + "-[1-" + str (compute_max_count ) + "]\n "
103
-
104
- topology_config += "BlockSizes=" + str (block_sizes )+ "\n "
105
- except (KeyError , AttributeError ) as e :
106
- if isinstance (e , KeyError ):
107
- message = f"Unable to find key { e } in the configuration file."
108
- else :
109
- message = f"Error parsing configuration file. { e } . { traceback .format_exc ()} ."
110
- message += f" Queue: { queue_name } " if queue_name else ""
111
- log .error (message )
112
- raise CriticalError (message )
113
-
114
- log .info ("Writing Info %s" , topology_config )
115
- log .info ("Generating %s" , output_file )
116
- with open (output_file , "w" , encoding = "utf-8" ) as output :
117
- output .write (topology_config )
118
81
119
- log .info ("Finished." )
82
+ for compute_resource_config in queue_config ["ComputeResources" ]:
83
+ compute_resource_name = compute_resource_config ["Name" ]
84
+ compute_min_count = compute_resource_config ["MinCount" ]
85
+ compute_max_count = compute_resource_config ["MaxCount" ]
86
+ if compute_min_count == compute_max_count :
87
+ node_type = "st"
88
+ else :
89
+ continue
90
+
91
+ capacity_reservation_target = compute_resource_config .get ("CapacityReservationTarget" , {})
92
+ capacity_reservation = (
93
+ capacity_reservation_target .get ("CapacityReservationId" , queue_capacity_reservation )
94
+ if capacity_reservation_target
95
+ else queue_capacity_reservation
96
+ )
97
+ ### Check for if reservation is for NVLink and size matches min_block_size_list
98
+ # if compute_resource_config.get('InstanceType') == 'p6e-gb200.36xlarge':
99
+ if min_block_size_list == compute_min_count or max_block_size_list == compute_max_count :
100
+ block_count += 1
101
+ ### Each Capacity Reservation ID is a Capacity Block and we associate each slurm block with a single capacity Block
102
+ topology_config += "BlockName=Block" + str (block_count )+ " Nodes=" + str (queue_name ) + "-" + str (node_type ) + "-" + str (compute_resource_name ) + "-[1-" + str (compute_max_count ) + "]\n "
103
+
104
+ topology_config += "BlockSizes=" + str (block_sizes )+ "\n "
105
+ except (KeyError , AttributeError ) as e :
106
+ if isinstance (e , KeyError ):
107
+ message = f"Unable to find key { e } in the configuration file."
108
+ else :
109
+ message = f"Error parsing configuration file. { e } . { traceback .format_exc ()} ."
110
+ message += f" Queue: { queue_name } " if queue_name else ""
111
+ log .error (message )
112
+ raise CriticalError (message )
113
+
114
+ log .info ("Writing Info %s" , topology_config )
115
+ log .info ("Generating %s" , output_file )
116
+ with open (output_file , "w" , encoding = "utf-8" ) as output :
117
+ output .write (topology_config )
118
+
119
+ log .info ("Finished." )
120
120
121
121
122
122
def cleanup_topology_config_file (file_path ):
0 commit comments