diff --git a/HeterogeneousCore/SonicTriton/README.md b/HeterogeneousCore/SonicTriton/README.md index 4050f36beaf0a..488566c937caf 100644 --- a/HeterogeneousCore/SonicTriton/README.md +++ b/HeterogeneousCore/SonicTriton/README.md @@ -132,11 +132,11 @@ The script has three operations (`start`, `stop`, `check`) and the following opt * `-c`: don't cleanup temporary dir (for debugging) * `-C [dir]`: directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available) * `-D`: dry run: print container commands rather than executing them -* `-d`: use Docker instead of Apptainer +* `-d [exe]`: container choice: apptainer, docker, podman, podman-hpc (default: apptainer) * `-E [path]`: include extra path(s) for executables (default: /cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin) * `-f`: force reuse of (possibly) existing container instance -* `-g`: use GPU instead of CPU -* `-i` [name]`: server image name (default: fastml/triton-torchgeo:22.07-py3-geometric) +* `-g [device]`: device choice: auto (try to detect GPU), CPU, GPU (default: auto) +* `-i [name]`: server image name (default: fastml/triton-torchgeo:22.07-py3-geometric) * `-I [num]`: number of model instances (default: 0 -> means no local editing of config files) * `-M [dir]`: model repository (can be given more than once) * `-m [dir]`: specific model directory (can be given more than one) @@ -144,7 +144,7 @@ The script has three operations (`start`, `stop`, `check`) and the following opt * `-P [port]`: base port number for services (-1: automatically find an unused port range) (default: 8000) * `-p [pid]`: automatically shut down server when process w/ specified PID ends (-1: use parent process PID) * `-r [num]`: number of retries when starting container (default: 3) -* `-s [dir]`: Apptainer sandbox directory (default: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/fastml/triton-torchgeo:22.07-py3-geometric) +* `-s [dir]`: apptainer sandbox directory (default: 
/cvmfs/unpacked.cern.ch/registry.hub.docker.com/fastml/triton-torchgeo:22.07-py3-geometric) * `-t [dir]`: non-default hidden temporary dir * `-v`: (verbose) start: activate server debugging info; stop: keep server logs * `-w [time]`: maximum time to wait for server to start (default: 300 seconds) @@ -200,8 +200,8 @@ The fallback server has a separate set of options, mostly related to the invocat * `enable`: enable the fallback server * `debug`: enable debugging (equivalent to `-c` in `cmsTriton`) * `verbose`: enable verbose output in logs (equivalent to `-v` in `cmsTriton`) -* `useDocker`: use Docker instead of Apptainer (equivalent to `-d` in `cmsTriton`) -* `useGPU`: run on local GPU (equivalent to `-g` in `cmsTriton`) +* `container`: container choice (equivalent to `-d` in `cmsTriton`) +* `device`: device choice (equivalent to `-g` in `cmsTriton`) * `retries`: number of retries when starting container (passed to `-r [num]` in `cmsTriton` if >= 0; default: -1) * `wait`: maximum time to wait for server to start (passed to `-w time` in `cmsTriton` if >= 0; default: -1) * `instanceBaseName`: base name for server instance if random names are enabled (default: triton_server_instance) diff --git a/HeterogeneousCore/SonicTriton/interface/TritonService.h b/HeterogeneousCore/SonicTriton/interface/TritonService.h index f4d6093695dad..470c6ad76b436 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonService.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonService.h @@ -36,8 +36,8 @@ class TritonService { : enable(pset.getUntrackedParameter("enable")), debug(pset.getUntrackedParameter("debug")), verbose(pset.getUntrackedParameter("verbose")), - useDocker(pset.getUntrackedParameter("useDocker")), - useGPU(pset.getUntrackedParameter("useGPU")), + container(pset.getUntrackedParameter("container")), + device(pset.getUntrackedParameter("device")), retries(pset.getUntrackedParameter("retries")), wait(pset.getUntrackedParameter("wait")), 
instanceName(pset.getUntrackedParameter("instanceName")), @@ -54,8 +54,8 @@ class TritonService { bool enable; bool debug; bool verbose; - bool useDocker; - bool useGPU; + std::string container; + std::string device; int retries; int wait; std::string instanceName; @@ -89,6 +89,7 @@ class TritonService { std::unordered_set models; static const std::string fallbackName; static const std::string fallbackAddress; + static const std::string siteconfName; }; struct Model { Model(const std::string& path_ = "") : path(path_) {} diff --git a/HeterogeneousCore/SonicTriton/python/TritonService_cff.py b/HeterogeneousCore/SonicTriton/python/TritonService_cff.py index e991d54e6f72f..6916c51e4308a 100644 --- a/HeterogeneousCore/SonicTriton/python/TritonService_cff.py +++ b/HeterogeneousCore/SonicTriton/python/TritonService_cff.py @@ -2,18 +2,8 @@ from Configuration.ProcessModifiers.enableSonicTriton_cff import enableSonicTriton -_gpu_available_cached = None - -def _gpu_available(): - global _gpu_available_cached - if _gpu_available_cached is None: - import os - _gpu_available_cached = (os.system("nvidia-smi -L") == 0) - return _gpu_available_cached - enableSonicTriton.toModify(TritonService, fallback = dict( enable = True, - useGPU = _gpu_available(), ), ) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index 3949d6f21826b..de83f20b51847 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -1,8 +1,7 @@ #!/bin/bash # defaults -USEDOCKER="" -GPU="" +CONTAINER=apptainer VERBOSE="" VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1" WTIME=600 @@ -24,6 +23,13 @@ IMAGE=fastml/triton-torchgeo:22.07-py3-geometric SANDBOX="" COMPAT_USR="" EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin +OSVERSION=$(sed -nr 's/[^0-9]*([0-9]+).*/\1/p' /etc/redhat-release 2>/dev/null) +if [ "$OSVERSION" = 7 ]; then + # this is the latest version 
with guaranteed sl7 support + EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/1.2.5/bin +fi +DEVICE=auto +THREADCONTROL="" get_sandbox(){ if [ -z "$SANDBOX" ]; then @@ -41,10 +47,10 @@ usage() { $ECHO "-c \t don't cleanup temporary dir (for debugging)" $ECHO "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)" $ECHO "-D \t dry run: print container commands rather than executing them" - $ECHO "-d \t use Docker instead of Apptainer" + $ECHO "-d [exe] \t container choice: apptainer, docker, podman, podman-hpc (default: ${CONTAINER})" $ECHO "-E [path] \t include extra path(s) for executables (default: ${EXTRAPATH})" $ECHO "-f \t force reuse of (possibly) existing container instance" - $ECHO "-g \t use GPU instead of CPU" + $ECHO "-g [device] \t device choice: auto (try to detect GPU), CPU, GPU (default: ${DEVICE})" $ECHO "-i [name] \t server image name (default: ${IMAGE})" $ECHO "-I [num] \t number of model instances (default: ${INSTANCES} -> means no local editing of config files)" $ECHO "-M [dir] \t model repository (can be given more than once)" @@ -53,7 +59,7 @@ usage() { $ECHO "-P [port] \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})" $ECHO "-p [pid] \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)" $ECHO "-r [num] \t number of retries when starting container (default: ${RETRIES})" - $ECHO "-s [dir] \t Apptainer sandbox directory (default: $(get_sandbox))" + $ECHO "-s [dir] \t apptainer sandbox directory (default: $(get_sandbox))" $ECHO "-t [dir] \t non-default hidden temporary dir" $ECHO "-v \t (verbose) start: activate server debugging info; stop: keep server logs" $ECHO "-w [time] \t maximum time to wait for server to start (default: ${WTIME} seconds)" @@ -72,7 +78,7 @@ if [ -e /run/shm ]; then SHM=/run/shm fi -while getopts "cC:Ddfgi:I:M:m:n:P:p:r:s:t:vw:h" opt; do +while getopts 
"cC:Dd:fg:i:I:M:m:n:P:p:r:s:t:vw:h" opt; do case "$opt" in c) CLEANUP="" ;; @@ -80,11 +86,11 @@ while getopts "cC:Ddfgi:I:M:m:n:P:p:r:s:t:vw:h" opt; do ;; D) DRYRUN=echo ;; - d) USEDOCKER=true + d) CONTAINER="$OPTARG" ;; f) FORCE=true ;; - g) GPU=true + g) DEVICE="$OPTARG" ;; i) IMAGE="$OPTARG" ;; @@ -122,6 +128,20 @@ if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != check ]; then usage 1 fi +# check acceptable values for device choice +DEVICE="${DEVICE,,}" +if [[ ! " auto cpu gpu " =~ " $DEVICE " ]]; then + echo "Unsupported device value: $DEVICE" + exit 1 +fi + +# check acceptable values for container choice +CONTAINER="${CONTAINER,,}" +if [[ ! " apptainer docker podman podman-hpc " =~ " $CONTAINER " ]]; then + echo "Unsupported container value: $CONTAINER" + exit 1 +fi + if [ "$RETRIES" -le 0 ]; then RETRIES=1 fi @@ -139,9 +159,17 @@ if [ -n "$EXTRAPATH" ]; then fi # find executables -if [ -n "$USEDOCKER" ]; then +if [ "$CONTAINER" == "docker" ]; then if [ -z "$DOCKER" ]; then - DOCKER="sudo docker" + DOCKER="docker" + fi +elif [ "$CONTAINER" == "podman" ]; then + if [ -z "$PODMAN" ]; then + PODMAN="podman" + fi +elif [ "$CONTAINER" == "podman-hpc" ]; then + if [ -z "$PODMAN" ]; then + PODMAN="podman-hpc" fi else if [ -z "$APPTAINER" ]; then @@ -166,9 +194,8 @@ SEGFAULT_INDICATOR="Address already in use" EXTRA="" COMPAT_SCRIPT=/etc/shinit_v2 -THREADCONTROL="" -# do not apply thread control settings if GPU use is requested -if [ "$INSTANCES" -gt 0 ] && [ -z "$GPU" ]; then +# this will be reset later if chosen device is gpu +if [ "$INSTANCES" -gt 0 ]; then THREADCONTROL=true fi @@ -239,6 +266,29 @@ start_docker(){ ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE } +start_podman(){ + # mount all model repositories + MOUNTARGS="" + REPOARGS="" + for REPO in ${REPOS[@]}; do + MOUNTARGS="$MOUNTARGS --volume $REPO:$REPO" + REPOARGS="$REPOARGS --model-repository=${REPO}" + done + + # compatibility driver environment + if [ -n "$COMPAT" ]; then + 
MOUNTARGS="$MOUNTARGS --volume $COMPAT" + if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then + MOUNTARGS="$MOUNTARGS --volume $COMPAT_SCRIPT_MOUNT" + fi + fi + + $DRYRUN $PODMAN run -d --name ${SERVER} \ + --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ + -p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \ + ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE +} + start_apptainer(){ # triton server image may need to modify contents of opt/tritonserver/lib/ # but cvmfs is read-only @@ -263,20 +313,11 @@ start_apptainer(){ fi fi - # workaround for nvidia libs w/ singularity-in-singularity - # from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970 - if [ -d /.singularity.d/libs ]; then - TMPD=`mktemp -d` - (echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > $TMPD/ldconfig - chmod +x $TMPD/ldconfig - PATH=$TMPD:$PATH - # this does not work with LD_LIBRARY_PATH from cmsenv - ldconfig /.singularity.d/libs - fi - # start instance # need to bind /cvmfs for above symlinks to work inside container + # --underlay: workaround for https://github.com/apptainer/apptainer/issues/2167 $DRYRUN $APPTAINER instance start \ + --underlay \ -B ${SHM}:/run/shm -B ${LIB}:/opt/tritonserver/lib -B ${SANDBOX} $MOUNTARGS $EXTRA \ ${SANDBOX} ${SERVER} @@ -307,6 +348,16 @@ stop_docker(){ $DRYRUN $DOCKER rm ${SERVER} } +stop_podman(){ + # keep log + if [ -z "$DRYRUN" ]; then + if [ -n "$VERBOSE" ]; then $PODMAN logs ${SERVER} >& "$LOG"; fi + fi + + $DRYRUN $PODMAN stop ${SERVER} + $DRYRUN $PODMAN rm ${SERVER} +} + stop_apptainer(){ $DRYRUN $APPTAINER instance stop ${SERVER} } @@ -316,6 +367,11 @@ test_docker(){ ${DOCKER} logs ${SERVER} |& grep "$1" } +test_podman(){ + # podman logs print to stdout + ${PODMAN} logs ${SERVER} |& grep "$1" +} + test_apptainer(){ grep "$1" $LOG } @@ -464,10 +520,14 @@ driver_docker(){ $DOCKER run --rm --entrypoint env ${IMAGE} | grep "CUDA_DRIVER_VERSION=" } 
+driver_podman(){ + $PODMAN run --rm --entrypoint env ${IMAGE} | grep "CUDA_DRIVER_VERSION=" +} + driver_apptainer(){ D2S=${SANDBOX}/.singularity.d/env/10-docker2singularity.sh if [ -f "$D2S" ]; then - source $D2S && echo $CUDA_DRIVER_VERSION + source $D2S && echo "CUDA_DRIVER_VERSION=$CUDA_DRIVER_VERSION" fi } @@ -481,9 +541,10 @@ compat_apptainer(){ check_drivers(){ # get sandbox env vars in subshell - CUDA_DRIVER_VERSION=$($DRIVER_FN) + eval "$($DRIVER_FN)" # copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh - DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + # regex generalized to handle SUSE + DRIVER_VERSION=$(sed -nr 's/^NVRM.*Kernel Module[^.]* ([0-9.]*).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then return 0 fi @@ -533,26 +594,60 @@ check_drivers(){ fi } -if [ -n "$USEDOCKER" ]; then - if [ -n "$GPU" ]; then +extra_docker(){ + if [ "$DEVICE" == gpu ]; then EXTRA="--gpus all" fi +} +extra_podman(){ + if [ "$DEVICE" == gpu ]; then + EXTRA="--device nvidia.com/gpu=all" + fi +} +extra_podman_hpc(){ + if [ "$DEVICE" == gpu ]; then + EXTRA="--gpu" + fi + EXTRA="$EXTRA --cvmfs --log-driver=json-file" +} +extra_apptainer(){ + if [ "$DEVICE" == gpu ]; then + EXTRA="--nv" + fi +} + +if [ "$CONTAINER" == "docker" ]; then START_FN=start_docker + EXTRA_FN=extra_docker TEST_FN=test_docker STOP_FN=stop_docker DRIVER_FN=driver_docker COMPAT_FN=compat_docker - PROG_NAME=Docker + PROG_NAME=docker +elif [ "$CONTAINER" == "podman" ]; then + START_FN=start_podman + EXTRA_FN=extra_podman + TEST_FN=test_podman + STOP_FN=stop_podman + DRIVER_FN=driver_podman + COMPAT_FN=compat_podman + PROG_NAME=podman +elif [ "$CONTAINER" == "podman-hpc" ]; then + START_FN=start_podman + EXTRA_FN=extra_podman_hpc + TEST_FN=test_podman + STOP_FN=stop_podman + DRIVER_FN=driver_podman + 
COMPAT_FN=compat_podman + PROG_NAME=podman-hpc else - if [ -n "$GPU" ]; then - EXTRA="--nv" - fi START_FN=start_apptainer + EXTRA_FN=extra_apptainer TEST_FN=test_apptainer STOP_FN=stop_apptainer DRIVER_FN=driver_apptainer COMPAT_FN=compat_apptainer - PROG_NAME=Apptainer + PROG_NAME=apptainer fi if [ "$OP" == check ]; then @@ -567,6 +662,22 @@ elif [ "$OP" == start ]; then exit 1 fi + # auto GPU check + if [ "$DEVICE" == auto ]; then + if nvidia-smi -L >& /dev/null; then + DEVICE=gpu + else + DEVICE=cpu + fi + fi + echo "CMS_TRITON_CHOSEN_DEVICE: $DEVICE" + $EXTRA_FN + + # do not apply thread control settings if GPU use is requested + if [ "$DEVICE" == gpu ]; then + THREADCONTROL="" + fi + handle_ports PORT_EXIT=$? if [ "$PORT_EXIT" -ne 0 ]; then exit $PORT_EXIT; fi @@ -585,7 +696,7 @@ elif [ "$OP" == start ]; then if [ "$counter" -eq 0 ] || [ -n "$THREADCONTROL" ]; then list_models; fi # only need to check drivers if using GPU - if [ -n "$GPU" ]; then + if [ "$DEVICE" == gpu ]; then check_drivers DRIVER_EXIT=$? 
if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTritonConfigTool b/HeterogeneousCore/SonicTriton/scripts/cmsTritonConfigTool index 00c08742dd5f9..a313ebfdc9ab5 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTritonConfigTool +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTritonConfigTool @@ -381,7 +381,7 @@ def cfg_threadcontrol(args): for key,val in thread_control_parameters.items(): if key in platform: # partial matching for param in val: - item = args.model.parameters.get_or_create(key) + item = args.model.parameters.get_or_create(param) item.string_value = "1" found_params = True break diff --git a/HeterogeneousCore/SonicTriton/src/TritonClient.cc b/HeterogeneousCore/SonicTriton/src/TritonClient.cc index 76fd670bb66bc..54a0205a1f0be 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonClient.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonClient.cc @@ -66,8 +66,7 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d const auto& server = ts->serverInfo(options_[0].model_name_, params.getUntrackedParameter("preferredServer")); serverType_ = server.type; - if (verbose_) - edm::LogInfo(fullDebugName_) << "Using server: " << server.url; + edm::LogInfo("TritonDiscovery") << debugName_ << " assigned server: " << server.url; //enforce sync mode for fallback CPU server to avoid contention //todo: could enforce async mode otherwise (unless mode was specified by user?) if (serverType_ == TritonServerType::LocalCPU) @@ -264,8 +263,8 @@ unsigned TritonClient::batchSize() const { return batchMode_ == TritonBatchMode: bool TritonClient::setBatchSize(unsigned bsize) { if (batchMode_ == TritonBatchMode::Rectangular) { if (bsize > maxOuterDim_) { - edm::LogWarning(fullDebugName_) << "Requested batch size " << bsize << " exceeds server-specified max batch size " - << maxOuterDim_ << ". 
Batch size will remain as " << outerDim_; + throw TritonException("LocalFailure") + << "Requested batch size " << bsize << " exceeds server-specified max batch size " << maxOuterDim_ << "."; return false; } else { outerDim_ = bsize; diff --git a/HeterogeneousCore/SonicTriton/src/TritonService.cc b/HeterogeneousCore/SonicTriton/src/TritonService.cc index 53b94f767062b..ca5aa9c7c65e7 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonService.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonService.cc @@ -3,16 +3,20 @@ #include "DataFormats/Provenance/interface/ModuleDescription.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/allowedValues.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" #include "FWCore/ServiceRegistry/interface/SystemBounds.h" #include "FWCore/ServiceRegistry/interface/ProcessContext.h" #include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/Utilities/interface/GetEnvironmentVariable.h" #include "grpc_client.h" #include "grpc_service.pb.h" +#include +#include #include #include #include @@ -25,6 +29,7 @@ namespace tc = triton::client; const std::string TritonService::Server::fallbackName{"fallback"}; const std::string TritonService::Server::fallbackAddress{"0.0.0.0"}; +const std::string TritonService::Server::siteconfName{"SONIC_LOCAL_BALANCER"}; namespace { std::pair execSys(const std::string& cmd) { @@ -53,6 +58,18 @@ namespace { int rv = pclose(pipe); return std::make_pair(result, rv); } + + //extract specific info from log + std::string extractFromLog(const std::string& output, const std::string& indicator) { + //find last instance in log (in case of multiple) + auto pos = output.rfind(indicator); + if (pos != std::string::npos) { + auto pos2 = pos + indicator.size(); + auto pos3 = output.find('\n', pos2); + return 
output.substr(pos2, pos3 - pos2); + } else + return ""; + } } // namespace TritonService::TritonService(const edm::ParameterSet& pset, edm::ActivityRegistry& areg) @@ -74,25 +91,25 @@ TritonService::TritonService(const edm::ParameterSet& pset, edm::ActivityRegistr areg.watchPreBeginJob(this, &TritonService::preBeginJob); areg.watchPostEndJob(this, &TritonService::postEndJob); - //include fallback server in set if enabled - if (fallbackOpts_.enable) { - auto serverType = TritonServerType::Remote; - if (!fallbackOpts_.useGPU) - serverType = TritonServerType::LocalCPU; -#ifdef TRITON_ENABLE_GPU - else - serverType = TritonServerType::LocalGPU; -#endif - - servers_.emplace(std::piecewise_construct, - std::forward_as_tuple(Server::fallbackName), - std::forward_as_tuple(Server::fallbackName, Server::fallbackAddress, serverType)); - } + //check for server specified in SITECONF + //(temporary solution, to be replaced with entry in site-local-config.xml or similar) + std::string siteconf_address(edm::getEnvironmentVariable(Server::siteconfName + "_HOST")); + std::string siteconf_port(edm::getEnvironmentVariable(Server::siteconfName + "_PORT")); + if (!siteconf_address.empty() and !siteconf_port.empty()) { + servers_.emplace( + std::piecewise_construct, + std::forward_as_tuple(Server::siteconfName), + std::forward_as_tuple(Server::siteconfName, siteconf_address + ":" + siteconf_port, TritonServerType::Remote)); + if (verbose_) + edm::LogInfo("TritonDiscovery") << "Obtained server from SITECONF: " + << servers_.find(Server::siteconfName)->second.url; + } else if (siteconf_address.empty() != siteconf_port.empty()) { //xor + edm::LogWarning("TritonDiscovery") << "Incomplete server information from SITECONF: HOST = " << siteconf_address + << ", PORT = " << siteconf_port; + } else + edm::LogWarning("TritonDiscovery") << "No server information from SITECONF"; - //loop over input servers: check which models they have - std::string msg; - if (verbose_) - msg = "List of models for 
each server:\n"; + //finally, populate list of servers from config input for (const auto& serverPset : pset.getUntrackedParameterSetVector("servers")) { const std::string& serverName(serverPset.getUntrackedParameter("name")); //ensure uniqueness @@ -100,8 +117,13 @@ TritonService::TritonService(const edm::ParameterSet& pset, edm::ActivityRegistr if (!unique) throw cms::Exception("DuplicateServer") << "TritonService: Not allowed to specify more than one server with same name (" << serverName << ")"; - auto& server(sit->second); + } + //loop over all servers: check which models they have + std::string msg; + if (verbose_) + msg = "List of models for each server:\n"; + for (auto& [serverName, server] : servers_) { std::unique_ptr client; TRITON_THROW_IF_ERROR( tc::InferenceServerGrpcClient::Create(&client, server.url, false, server.useSsl, server.sslOptions), @@ -110,37 +132,46 @@ TritonService::TritonService(const edm::ParameterSet& pset, edm::ActivityRegistr if (verbose_) { inference::ServerMetadataResponse serverMetaResponse; - TRITON_THROW_IF_ERROR(client->ServerMetadata(&serverMetaResponse), - "TritonService(): unable to get metadata for " + serverName + " (" + server.url + ")", - false); - edm::LogInfo("TritonService") << "Server " << serverName << ": url = " << server.url - << ", version = " << serverMetaResponse.version(); + auto err = client->ServerMetadata(&serverMetaResponse); + if (err.IsOk()) + edm::LogInfo("TritonService") << "Server " << serverName << ": url = " << server.url + << ", version = " << serverMetaResponse.version(); + else + edm::LogInfo("TritonService") << "unable to get metadata for " + serverName + " (" + server.url + ")"; } + //if this query fails, it indicates that the server is nonresponsive or saturated + //in which case it should just be skipped inference::RepositoryIndexResponse repoIndexResponse; - TRITON_THROW_IF_ERROR(client->ModelRepositoryIndex(&repoIndexResponse), - "TritonService(): unable to get repository index for " + 
serverName + " (" + server.url + ")", - false); + auto err = client->ModelRepositoryIndex(&repoIndexResponse); //servers keep track of models and vice versa if (verbose_) msg += serverName + ": "; - for (const auto& modelIndex : repoIndexResponse.models()) { - const auto& modelName = modelIndex.name(); - auto mit = models_.find(modelName); - if (mit == models_.end()) - mit = models_.emplace(modelName, "").first; - auto& modelInfo(mit->second); - modelInfo.servers.insert(serverName); - server.models.insert(modelName); + if (err.IsOk()) { + for (const auto& modelIndex : repoIndexResponse.models()) { + const auto& modelName = modelIndex.name(); + auto mit = models_.find(modelName); + if (mit == models_.end()) + mit = models_.emplace(modelName, "").first; + auto& modelInfo(mit->second); + modelInfo.servers.insert(serverName); + server.models.insert(modelName); + if (verbose_) + msg += modelName + ", "; + } + } else { if (verbose_) - msg += modelName + ", "; + msg += "unable to get repository index"; + else + edm::LogWarning("TritonFailure") << "TritonService(): unable to get repository index for " + serverName + " (" + + server.url + ")"; } if (verbose_) msg += "\n"; } if (verbose_) - edm::LogInfo("TritonService") << msg; + edm::LogInfo("TritonDiscovery") << msg; } void TritonService::preallocate(edm::service::SystemBounds const& bounds) { @@ -217,6 +248,14 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: if (!fallbackOpts_.enable or unservedModels_.empty()) return; + //include fallback server in set + auto serverType = TritonServerType::LocalCPU; + if (fallbackOpts_.device == "gpu") + serverType = TritonServerType::LocalGPU; + servers_.emplace(std::piecewise_construct, + std::forward_as_tuple(Server::fallbackName), + std::forward_as_tuple(Server::fallbackName, Server::fallbackAddress, serverType)); + std::string msg; if (verbose_) msg = "List of models for fallback server: "; @@ -230,18 +269,16 @@ void 
TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: msg += modelName + ", "; } if (verbose_) - edm::LogInfo("TritonService") << msg; + edm::LogInfo("TritonDiscovery") << msg; //assemble server start command fallbackOpts_.command = "cmsTriton -P -1 -p " + pid_; + fallbackOpts_.command += " -g " + fallbackOpts_.device; + fallbackOpts_.command += " -d " + fallbackOpts_.container; if (fallbackOpts_.debug) fallbackOpts_.command += " -c"; if (fallbackOpts_.verbose) fallbackOpts_.command += " -v"; - if (fallbackOpts_.useDocker) - fallbackOpts_.command += " -d"; - if (fallbackOpts_.useGPU) - fallbackOpts_.command += " -g"; if (!fallbackOpts_.instanceName.empty()) fallbackOpts_.command += " -n " + fallbackOpts_.instanceName; if (fallbackOpts_.retries >= 0) @@ -282,22 +319,42 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: if (rv != 0) { edm::LogError("TritonService") << output; printFallbackServerLog(); - throw cms::Exception("FallbackFailed") + throw edm::Exception(edm::errors::ExternalFailure) << "TritonService: Starting the fallback server failed with exit code " << rv; } else if (verbose_) edm::LogInfo("TritonService") << output; + + //get the chosen device + std::string chosenDevice(fallbackOpts_.device); + if (chosenDevice == "auto") { + chosenDevice = extractFromLog(output, "CMS_TRITON_CHOSEN_DEVICE: "); + if (!chosenDevice.empty()) { + if (chosenDevice == "cpu") + server.type = TritonServerType::LocalCPU; + else if (chosenDevice == "gpu") + server.type = TritonServerType::LocalGPU; + else + throw edm::Exception(edm::errors::ExternalFailure) + << "TritonService: unsupported device choice " << chosenDevice << " for fallback server, log follows:\n" + << output; + } else + throw edm::Exception(edm::errors::ExternalFailure) + << "TritonService: unknown device choice for fallback server, log follows:\n" + << output; + } + //print server info + std::transform(chosenDevice.begin(), chosenDevice.end(), 
chosenDevice.begin(), toupper); + if (verbose_) + edm::LogInfo("TritonDiscovery") << "Fallback server started: " << chosenDevice; + //get the port - const std::string& portIndicator("CMS_TRITON_GRPC_PORT: "); - //find last instance in log in case multiple ports were tried - auto pos = output.rfind(portIndicator); - if (pos != std::string::npos) { - auto pos2 = pos + portIndicator.size(); - auto pos3 = output.find('\n', pos2); - const auto& portNum = output.substr(pos2, pos3 - pos2); + const auto& portNum = extractFromLog(output, "CMS_TRITON_GRPC_PORT: "); + if (!portNum.empty()) server.url += ":" + portNum; - } else - throw cms::Exception("FallbackFailed") << "TritonService: Unknown port for fallback server, log follows:\n" - << output; + else + throw edm::Exception(edm::errors::ExternalFailure) + << "TritonService: Unknown port for fallback server, log follows:\n" + << output; } void TritonService::notifyCallStatus(bool status) const { @@ -378,8 +435,10 @@ void TritonService::fillDescriptions(edm::ConfigurationDescriptions& description fallbackDesc.addUntracked("enable", false); fallbackDesc.addUntracked("debug", false); fallbackDesc.addUntracked("verbose", false); - fallbackDesc.addUntracked("useDocker", false); - fallbackDesc.addUntracked("useGPU", false); + fallbackDesc.ifValue(edm::ParameterDescription("container", "apptainer", false), + edm::allowedValues("apptainer", "docker", "podman", "podman-hpc")); + fallbackDesc.ifValue(edm::ParameterDescription("device", "auto", false), + edm::allowedValues("auto", "cpu", "gpu")); fallbackDesc.addUntracked("retries", -1); fallbackDesc.addUntracked("wait", -1); fallbackDesc.addUntracked("instanceBaseName", "triton_server_instance"); diff --git a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py index fa891adb88721..9cede0e496706 100644 --- a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py +++ b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py @@ -15,6 +15,7 @@ 
allowed_modes = ["Async","PseudoAsync","Sync"] allowed_compression = ["none","deflate","gzip"] allowed_devices = ["auto","cpu","gpu"] +allowed_containers = ["apptainer","docker","podman","podman-hpc"] parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument("--maxEvents", default=-1, type=int, help="Number of events to process (-1 for all)") @@ -33,6 +34,7 @@ parser.add_argument("--verboseClient", default=False, action="store_true", help="enable verbose output for clients") parser.add_argument("--verboseServer", default=False, action="store_true", help="enable verbose output for server") parser.add_argument("--verboseService", default=False, action="store_true", help="enable verbose output for TritonService") +parser.add_argument("--verboseDiscovery", default=False, action="store_true", help="enable verbose output just for server discovery in TritonService") parser.add_argument("--brief", default=False, action="store_true", help="briefer output for graph modules") parser.add_argument("--fallbackName", default="", type=str, help="name for fallback server") parser.add_argument("--unittest", default=False, action="store_true", help="unit test mode: reduce input sizes") @@ -41,7 +43,7 @@ parser.add_argument("--compression", default="", type=str, choices=allowed_compression, help="enable I/O compression") parser.add_argument("--ssl", default=False, action="store_true", help="enable SSL authentication for server communication") parser.add_argument("--device", default="auto", type=str.lower, choices=allowed_devices, help="specify device for fallback server") -parser.add_argument("--docker", default=False, action="store_true", help="use Docker for fallback server") +parser.add_argument("--container", default="apptainer", type=str.lower, choices=allowed_containers, help="specify container for fallback server") parser.add_argument("--tries", default=0, type=int, help="number of retries for failed request") options = parser.parse_args() @@ 
-71,13 +73,12 @@ process.source = cms.Source("EmptySource") -process.TritonService.verbose = options.verbose or options.verboseService +process.TritonService.verbose = options.verbose or options.verboseService or options.verboseDiscovery process.TritonService.fallback.verbose = options.verbose or options.verboseServer -process.TritonService.fallback.useDocker = options.docker +process.TritonService.fallback.container = options.container +process.TritonService.fallback.device = options.device if len(options.fallbackName)>0: process.TritonService.fallback.instanceBaseName = options.fallbackName -if options.device != "auto": - process.TritonService.fallback.useGPU = options.device=="gpu" if len(options.address)>0: process.TritonService.servers.append( cms.PSet( @@ -100,7 +101,13 @@ "Analyzer": cms.EDAnalyzer, } -keepMsgs = ['TritonClient','TritonService'] +keepMsgs = [] +if options.verbose or options.verboseDiscovery: + keepMsgs.append('TritonDiscovery') +if options.verbose or options.verboseClient: + keepMsgs.append('TritonClient') +if options.verbose or options.verboseService: + keepMsgs.append('TritonService') for im,module in enumerate(options.modules): model = options.models[im] @@ -141,7 +148,8 @@ processModule.edgeMax = cms.uint32(15000) processModule.brief = cms.bool(options.brief) process.p += processModule - keepMsgs.extend([module,module+':TritonClient']) + if options.verbose or options.verboseClient: + keepMsgs.extend([module,module+':TritonClient']) if options.testother: # clone modules to test both gRPC and shared memory _module2 = module+"GRPC" if processModule.Client.useSharedMemory else "SHM" @@ -152,7 +160,8 @@ ) processModule2 = getattr(process, _module2) process.p += processModule2 - keepMsgs.extend([_module2,_module2+':TritonClient']) + if options.verbose or options.verboseClient: + keepMsgs.extend([_module2,_module2+':TritonClient']) process.load('FWCore/MessageService/MessageLogger_cfi') process.MessageLogger.cerr.FwkReport.reportEvery = 500 
diff --git a/RecoEcal/EgammaClusterProducers/test/DRNTest_cfg.py b/RecoEcal/EgammaClusterProducers/test/DRNTest_cfg.py index 52734b58dbb09..f777e379362f7 100644 --- a/RecoEcal/EgammaClusterProducers/test/DRNTest_cfg.py +++ b/RecoEcal/EgammaClusterProducers/test/DRNTest_cfg.py @@ -15,8 +15,8 @@ process.TritonService.verbose = False process.TritonService.fallback.verbose = False -process.TritonService.fallback.useDocker = False -process.TritonService.fallback.useGPU = False +process.TritonService.fallback.container = "apptainer" +process.TritonService.fallback.device = "cpu" process.MessageLogger.cerr.FwkReport.reportEvery = cms.untracked.int32(100) #process.MessageLogger.suppressWarning = cms.untracked.vstring('DRNProducerEB', 'DRNProducerEE')