Skip to content

Commit

Permalink
Merge pull request #162 from ChevronETC/robustup
Browse files Browse the repository at this point in the history
attempt to make worker startup a bit more robust
  • Loading branch information
samtkaplan authored Sep 18, 2024
2 parents ae1233f + 192a56f commit b4dc6f3
Showing 1 changed file with 32 additions and 3 deletions.
35 changes: 32 additions & 3 deletions src/AzManagers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,7 @@ end
nworkers_provisioned([service=false])
Count of the number of scale-set machines that are provisioned
regardless if their status within the Julia cluster. If `service=true`,
regardless of their status within the Julia cluster. If `service=true`,
then we use the Azure scale-set service to make the count, otherwise
we use client-side book-keeping. The latter is useful to avoid making
too many requests to the Azure scale-set service, causing it to throttle
Expand Down Expand Up @@ -1859,7 +1859,21 @@ function buildstartupscript_cluster(manager::AzManager, spot::Bool, ppi::Int, mp
export JULIA_WORKER_TIMEOUT=$(get(ENV, "JULIA_WORKER_TIMEOUT", "720"))
export OMP_NUM_THREADS=$omp_num_threads
$envstring
$exename $_exeflags -e '$(juliaenvstring)try using AzManagers; catch; using Pkg; Pkg.instantiate(); using AzManagers; end; AzManagers.nvidia_gpucheck($nvidia_enable_ecc, $nvidia_enable_mig); AzManagers.mount_datadisks(); AzManagers.azure_worker("$cookie", "$master_address", $master_port, $ppi, "$_exeflags")'
attempt_number=1
maximum_attempts=5
exit_code=0
while [ \$attempt_number -le \$maximum_attempts ]; do
$exename $_exeflags -e '$(juliaenvstring)try using AzManagers; catch; using Pkg; Pkg.instantiate(); using AzManagers; end; AzManagers.nvidia_gpucheck($nvidia_enable_ecc, $nvidia_enable_mig); AzManagers.mount_datadisks(); AzManagers.azure_worker("$cookie", "$master_address", $master_port, $ppi, "$_exeflags")'
exit_code=\$?
echo "attempt \$attempt_number is done with exit code \$exit_code. trying again after sleeping for 5 seconds..."
sleep 5
attempt_number=\$(( attempt_number + 1 ))
echo "the worker startup was tried \$attempt_number times."
done
echo "the worker has finished running with exit code \$exit_code."
EOF
"""
else
Expand All @@ -1869,8 +1883,23 @@ function buildstartupscript_cluster(manager::AzManager, spot::Bool, ppi::Int, mp
export JULIA_WORKER_TIMEOUT=$(get(ENV, "JULIA_WORKER_TIMEOUT", "720"))
export OMP_NUM_THREADS=$omp_num_threads
$envstring
$exename -e '$(juliaenvstring)try using AzManagers; catch; using Pkg; Pkg.instantiate(); using AzManagers; end; AzManagers.nvidia_gpucheck($nvidia_enable_ecc, $nvidia_enable_mig); AzManagers.mount_datadisks()'
mpirun -n $mpi_ranks_per_worker $mpi_flags $exename $_exeflags -e '$(juliaenvstring)using AzManagers, MPI; AzManagers.azure_worker_mpi("$cookie", "$master_address", $master_port, $ppi, "$_exeflags")'
attempt_number=1
maximum_attempts=5
exit_code=0
while [ \$attempt_number -le \$maximum_attempts ]; do
mpirun -n $mpi_ranks_per_worker $mpi_flags $exename $_exeflags -e '$(juliaenvstring)using AzManagers, MPI; AzManagers.azure_worker_mpi("$cookie", "$master_address", $master_port, $ppi, "$_exeflags")'
exit_code=\$?
echo "attempt \$attempt_number is done with exit code \$exit_code. trying again after sleeping for 5 seconds..."
sleep 5
attempt_number=\$(( attempt_number + 1 ))
echo "the worker startup was tried \$attempt_number times."
done
echo "the worker has finished running with exit code \$exit_code."
EOF
"""
end
Expand Down

0 comments on commit b4dc6f3

Please sign in to comment.