
Commit ac598fc

authored Mar 6, 2025
Updated wrap_rrdesi to fix multiple use cases (PR #2429)
Updated wrap_rrdesi to fix multiple use cases.
2 parents 2019a32 + 44459f8 commit ac598fc

File tree: 1 file changed, +29 −2 lines changed
 

bin/wrap_rrdesi: +29 −2
@@ -17,14 +17,15 @@ from desispec.scripts import qsoqn, qsomgii, emlinefit
 # MPI environment availability
 have_mpi = None
 if nersc_login_node():
-    have_mpi = False
+    print ("wrap_rrdesi should not be run on a login node.")
+    sys.exit(0)
 else:
     have_mpi = True
     try:
         import mpi4py.MPI as MPI
     except ImportError:
         have_mpi = False
-        print ("MPI not available")
+        print ("MPI not available - required to run wrap_rrdesi")
         sys.exit(0)

 parser = argparse.ArgumentParser(allow_abbrev=False)
@@ -61,6 +62,18 @@ afterburners = args.afterburners
 comm = MPI.COMM_WORLD
 comm_rank = comm.rank

+#print ("COMM", comm.size, comm.rank)
+env = os.environ
+if not 'SLURM_STEP_RESV_PORTS' in os.environ and comm.rank == 0:
+    print ("WARNING: Detected that wrap_rrdesi is not being run with srun command.")
+    print ("WARNING: Calling directly can lead to under-utilizing resources.")
+    print ("Recommended syntax: srun -N nodes -n tasks -c 2 --gpu-bind=map_gpu:3,2,1,0 ./wrap_rrdesi [options]")
+    print ("\tEx: 8 tasks each with GPU support on 2 nodes:")
+    print ("\t\tsrun -N 2 -n 8 -c 2 --gpu-bind=map_gpu:3,2,1,0 wrap_rrdesi ...")
+    print ("\tEx: 64 tasks on 1 node and 4 GPUs - this will run on both GPU and non-GPU nodes at once:")
+    print ("\t\tsrun -N 1 -n 64 -c 2 --gpu-bind=map_gpu:3,2,1,0 wrap_rrdesi ...")
+
+
 #Get number of nodes
 nhosts = os.getenv('SLURM_NNODES')
 if nhosts is None:
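For context on the new warning: the check keys off a step-level Slurm variable. Below is a minimal, hypothetical sketch of the same heuristic, assuming (as the diff does) that srun exports SLURM_STEP_RESV_PORTS for a job step while a direct invocation inside an interactive allocation typically does not; the helper name is invented for illustration.

import os

def looks_like_srun_step():
    # Heuristic only: srun normally exports step-level variables such as
    # SLURM_STEP_RESV_PORTS (the variable checked in the diff above); a bare
    # `wrap_rrdesi ...` call inside an salloc shell usually does not.
    return 'SLURM_STEP_RESV_PORTS' in os.environ

if not looks_like_srun_step():
    print("WARNING: run this under srun to use all allocated resources.")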
@@ -84,11 +97,21 @@ if args.gpu:
     gpu_per_node = int(gpu_per_node)
     ngpu = gpu_per_node*nhosts

+    if ngpu > comm.size:
+        if comm.rank == 0:
+            print (f"WARNING: wrap_rrdesi was called with {ngpu} GPUs but only {comm.size} MPI ranks.")
+            print (f"WARNING: Will only use {comm.size} GPUs.")
+        ngpu = comm.size
+
 #Set GPU nodes
 #We want the first gpu_per_node ranks of each host
 ranks_per_host = comm.size // nhosts
 use_gpu = (comm_rank % ranks_per_host) < gpu_per_node
 ncpu_ranks = (comm.size - ngpu -1) // cpu_per_task + 1
+#if comm.rank == 0:
+#    print (f'{ngpu=}, {gpu_per_node=}, {nhosts=}')
+#    print (f'{ranks_per_host=}, {use_gpu=}, {ncpu_ranks=}')
+#    print (f'{comm.size=}, {comm_rank=}, {cpu_per_task=}')
 if args.gpuonly:
     ncpu_ranks = 0
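To make the bookkeeping in this hunk concrete, here is a standalone sketch (not part of wrap_rrdesi) that plugs in the first srun example from the warning text: 2 nodes, 8 MPI ranks, 4 GPUs per node; cpu_per_task=2 is an assumption, matching the `-c 2` in the recommended command.

# Standalone illustration of the rank/GPU bookkeeping above.
comm_size = 8
nhosts = 2
gpu_per_node = 4
cpu_per_task = 2          # assumed; set elsewhere in wrap_rrdesi

ngpu = gpu_per_node * nhosts
if ngpu > comm_size:      # the clamp added by this commit
    ngpu = comm_size

ranks_per_host = comm_size // nhosts                       # 4 ranks per node
use_gpu = [(r % ranks_per_host) < gpu_per_node for r in range(comm_size)]
ncpu_ranks = (comm_size - ngpu - 1) // cpu_per_task + 1    # 0 here: every rank drives a GPU

print(ngpu, ranks_per_host, use_gpu, ncpu_ranks)
# -> 8 4 [True, True, True, True, True, True, True, True] 0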

@@ -119,6 +142,7 @@ if use_gpu:
 else:
     myhost = ngpu + (comm.rank - gpu_per_node*(comm.rank // ranks_per_host)) // cpu_per_task
 subcomm = comm.Split(myhost)
+#print (f'{comm.rank=}, {ncomm=}, {myhost=}, {subcomm.size=}')

 if comm.rank == 0:
     print("Running "+str(len(inputfiles))+" input files on "+str(ngpu)+" GPUs and "+str(ncomm)+" total procs...")
@@ -127,6 +151,8 @@ if comm.rank == 0:
 # In --gpuonly mode, CPU procs will not enter this block
 if myhost < ncomm:
     myfiles = np.array_split(inputfiles, ncomm)[myhost]
+    nfiles = len(myfiles)
+    #print (f'DEBUG: {myhost=} {ncomm=} {nfiles=} {myfiles=}, {comm.rank=}')
     for infile in myfiles:
         redrockfile = os.path.join(outdir, os.path.basename(infile).replace('coadd-', 'redrock-'))
         if os.path.isfile(redrockfile) and not overwrite:
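The file distribution above relies on np.array_split, which, unlike np.split, accepts a group count that does not divide the input list evenly. A small standalone example with hypothetical file names:

import numpy as np

# Hypothetical inputs: 5 coadd files split across 3 subcommunicator groups.
inputfiles = [f"coadd-{i}.fits" for i in range(5)]
ncomm = 3
for myhost in range(ncomm):
    myfiles = np.array_split(inputfiles, ncomm)[myhost]
    print(myhost, myfiles.tolist())
# 0 ['coadd-0.fits', 'coadd-1.fits']
# 1 ['coadd-2.fits', 'coadd-3.fits']
# 2 ['coadd-4.fits']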
@@ -145,6 +171,7 @@ if myhost < ncomm:
             opts.extend(args_to_pass)
             if use_gpu:
                 opts.append('--gpu')
+            print (f'Running rrdesi on {myhost=} {subcomm.rank=} with options {opts=}')
             desi.rrdesi(opts, comm=subcomm)

 # optionally run all the afterburners
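For reference, the context lines in the previous hunk also show the output naming convention: each coadd-*.fits input maps to a redrock-*.fits file in the output directory, and an existing output is skipped unless overwriting was requested. A self-contained sketch of that mapping with hypothetical paths:

import os

outdir = "/tmp/rrout"                       # hypothetical output directory
infile = "/data/tiles/coadd-1234.fits"      # hypothetical input coadd file
overwrite = False

redrockfile = os.path.join(outdir, os.path.basename(infile).replace('coadd-', 'redrock-'))
print(redrockfile)                          # /tmp/rrout/redrock-1234.fits
if os.path.isfile(redrockfile) and not overwrite:
    print(f"Skipping existing {redrockfile}")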
