From f1b99f61cd685e06445b05685e1465f506979323 Mon Sep 17 00:00:00 2001
From: Tongxuan Liu
Date: Wed, 3 Jul 2024 02:09:08 +0000
Subject: [PATCH] bugfix: fix multiple definition issue.

This is related to commit d711c5553f7746ef7efbd8dac429c1e711250ee8.
---
 scalellm/csrc/vlm_handler.cpp    |  15 ++++
 src/common/metrics.h             |  38 +++++-----
 src/engine/CMakeLists.txt        |   2 +
 src/engine/engine_metrics.cpp    |  15 ++++
 src/engine/engine_metrics.h      |   9 +++
 src/engine/llm_engine.cpp        |   4 +-
 src/engine/vlm_engine.cpp        |   6 +-
 src/engine/vlm_worker.cpp        |  20 ++----
 src/engine/worker.cpp            |  14 +---
 src/handlers/CMakeLists.txt      |   2 +
 src/handlers/handler_metrics.cpp |  42 +++++++++++
 src/handlers/handler_metrics.h   |  19 +++++
 src/handlers/llm_handler.cpp     |  42 +----------
 src/handlers/vlm_handler.cpp     | 116 ++++++++++---------------
 14 files changed, 172 insertions(+), 172 deletions(-)
 create mode 100644 src/engine/engine_metrics.cpp
 create mode 100644 src/engine/engine_metrics.h
 create mode 100644 src/handlers/handler_metrics.cpp
 create mode 100644 src/handlers/handler_metrics.h

diff --git a/scalellm/csrc/vlm_handler.cpp b/scalellm/csrc/vlm_handler.cpp
index 4602ced1..ab699420 100644
--- a/scalellm/csrc/vlm_handler.cpp
+++ b/scalellm/csrc/vlm_handler.cpp
@@ -10,6 +10,21 @@ namespace py = pybind11;
 using namespace pybind11::literals;
 
 void init_vlm_handler(py::module_& m) {
+  py::enum_<Priority>(m, "Priority")
+      .value("DEFAULT", Priority::NORMAL)
+      .value("LOW", Priority::LOW)
+      .value("NORMAL", Priority::NORMAL)
+      .value("HIGH", Priority::HIGH)
+      .export_values();
+
+  py::class_<std::future<bool>>(m, "Future")
+      .def("wait",
+           &std::future<bool>::wait,
+           py::call_guard<py::gil_scoped_release>())
+      .def("get",
+           &std::future<bool>::get,
+           py::call_guard<py::gil_scoped_release>());
+
   auto vlm_handler =
       py::class_<VLMHandler>(m, "VLMHandler")
           .def(py::init<const VLMHandler::Options&>(), py::arg("options"))
diff --git a/src/common/metrics.h b/src/common/metrics.h
index cfc88490..5ec26ad2 100644
--- a/src/common/metrics.h
+++ b/src/common/metrics.h
@@ -93,14 +93,16 @@ class AutoCounter final {
 // define gauge
 // a gauge is a metric that represents a single numerical value that can
 // arbitrarily go up and down.
-#define DEFINE_GAUGE(name, desc) \
-  auto& GAUGE_##name = llm::Metrics::Instance().BuildGauge(#name, desc).Add({});
+#define DEFINE_GAUGE(name, desc)    \
+  prometheus::Gauge& GAUGE_##name = \
+      llm::Metrics::Instance().BuildGauge(#name, desc).Add({});
 
-#define DEFINE_GAUGE_FAMILY(name, desc) \
-  auto& name##_family = llm::Metrics::Instance().BuildGauge(#name, desc);
+#define DEFINE_GAUGE_FAMILY(name, desc)                  \
+  prometheus::Family<prometheus::Gauge>& name##_family = \
+      llm::Metrics::Instance().BuildGauge(#name, desc);
 
 #define DEFINE_GAUGE_INSTANCE(alias, name, ...) \
-  auto& GAUGE_##alias = name##_family.Add(__VA_ARGS__);
+  prometheus::Gauge& GAUGE_##alias = name##_family.Add(__VA_ARGS__);
 
 #define GAUGE_SET(name, value) GAUGE_##name.Set(value);
 
@@ -111,15 +113,16 @@ class AutoCounter final {
 // define counter
 // a counter is a monotonically increasing counter whose value can only increase
 // or be reset to zero on restart.
-#define DEFINE_COUNTER(name, desc) \
-  auto& COUNTER_##name = \
+#define DEFINE_COUNTER(name, desc)      \
+  prometheus::Counter& COUNTER_##name = \
       llm::Metrics::Instance().BuildCounter(#name, desc).Add({});
 
-#define DEFINE_COUNTER_FAMILY(name, desc) \
-  auto& name##_family = llm::Metrics::Instance().BuildCounter(#name, desc);
+#define DEFINE_COUNTER_FAMILY(name, desc)                  \
+  prometheus::Family<prometheus::Counter>& name##_family = \
+      llm::Metrics::Instance().BuildCounter(#name, desc);
 
 #define DEFINE_COUNTER_INSTANCE(alias, name, ...) \
-  auto& COUNTER_##alias = name##_family.Add(__VA_ARGS__);
+  prometheus::Counter& COUNTER_##alias = name##_family.Add(__VA_ARGS__);
 
 #define COUNTER_ADD(name, value) COUNTER_##name.Increment(value);
 
@@ -133,16 +136,17 @@ class AutoCounter final {
 // a histogram samples observations (usually things like request durations or
 // response sizes) and counts them in configurable buckets. It also provides a
 // sum of all observed values.
-#define DEFINE_HISTOGRAM(name, desc, ...) \
-  auto& HISTOGRAM_##name = llm::Metrics::Instance() \
-                               .BuildHistogram(#name, desc) \
-                               .Add({}, __VA_ARGS__);
+#define DEFINE_HISTOGRAM(name, desc, ...)                            \
+  prometheus::Histogram& HISTOGRAM_##name = llm::Metrics::Instance() \
+                                                .BuildHistogram(#name, desc) \
+                                                .Add({}, __VA_ARGS__);
 
-#define DEFINE_HISTOGRAM_FAMILY(name, desc) \
-  auto& name##_family = llm::Metrics::Instance().BuildHistogram(#name, desc);
+#define DEFINE_HISTOGRAM_FAMILY(name, desc)                  \
+  prometheus::Family<prometheus::Histogram>& name##_family = \
+      llm::Metrics::Instance().BuildHistogram(#name, desc);
 
 #define DEFINE_HISTOGRAM_INSTANCE(alias, name, ...) \
-  auto& HISTOGRAM_##alias = name##_family.Add(__VA_ARGS__);
+  prometheus::Histogram& HISTOGRAM_##alias = name##_family.Add(__VA_ARGS__);
 
 #define HISTOGRAM_OBSERVE(name, value) HISTOGRAM_##name.Observe(value);
diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt
index 8f1483e4..2bf01acf 100644
--- a/src/engine/CMakeLists.txt
+++ b/src/engine/CMakeLists.txt
@@ -14,6 +14,7 @@ cc_library(
     engine.h
     llm_engine.h
     vlm_engine.h
+    engine_metrics.h
   SRCS
     utils.cpp
     batch.cpp
@@ -22,6 +23,7 @@ cc_library(
     vlm_worker.cpp
     llm_engine.cpp
     vlm_engine.cpp
+    engine_metrics.cpp
   DEPS
     torch
     :common
diff --git a/src/engine/engine_metrics.cpp b/src/engine/engine_metrics.cpp
new file mode 100644
index 00000000..34d70204
--- /dev/null
+++ b/src/engine/engine_metrics.cpp
@@ -0,0 +1,15 @@
+#include "engine_metrics.h"
+
+DEFINE_COUNTER(prepare_input_latency_seconds,
+               "Latency of preparing input in seconds");
+DEFINE_COUNTER_FAMILY(execution_latency_seconds,
+                      "Execution latency in seconds");
+DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
+                        execution_latency_seconds,
+                        {{"stage", "model"}});
+DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
+                        execution_latency_seconds,
+                        {{"stage", "logits_processing"}});
+DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
+                        execution_latency_seconds,
+                        {{"stage", "sampling"}});
diff --git a/src/engine/engine_metrics.h b/src/engine/engine_metrics.h
new file mode 100644
index 00000000..4b381dbb
--- /dev/null
+++ b/src/engine/engine_metrics.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "common/metrics.h"
+
+DECLARE_COUNTER(prepare_input_latency_seconds)
+DECLARE_COUNTER_FAMILY(execution_latency_seconds)
+DECLARE_COUNTER_INSTANCE(model_execution_latency_seconds)
+DECLARE_COUNTER_INSTANCE(logits_processing_latency_seconds)
+DECLARE_COUNTER_INSTANCE(sampling_latency_seconds)
diff --git a/src/engine/llm_engine.cpp b/src/engine/llm_engine.cpp
index c8f9d75c..df845218 100644
--- a/src/engine/llm_engine.cpp
+++ b/src/engine/llm_engine.cpp
@@ -10,14 +10,12 @@
 #include "common/metrics.h"
 #include "common/pretty_print.h"
+#include "engine_metrics.h"
 #include "model_loader/model_loader.h"
 #include "model_parallel/parallel_args.h"
 #include "models/model_args.h"
 #include "worker.h"
 
-DEFINE_COUNTER(prepare_input_latency_seconds,
-               "Latency of preparing input in seconds");
-
 namespace llm {
 namespace {
 const std::vector<uint32_t> kDefaultBatchSizesForCudaGraph =
diff --git a/src/engine/vlm_engine.cpp b/src/engine/vlm_engine.cpp
index 17f2a2cd..f32dbfdc 100644
--- a/src/engine/vlm_engine.cpp
+++ b/src/engine/vlm_engine.cpp
@@ -9,14 +9,12 @@
 #include "common/metrics.h"
 #include "common/pretty_print.h"
+#include "engine_metrics.h"
 #include "model_loader/model_loader.h"
 #include "model_parallel/parallel_args.h"
 #include "models/model_args.h"
 #include "vlm_worker.h"
 
-// DEFINE_COUNTER(prepare_input_latency_seconds,
-//                "Latency of preparing input in seconds");
-
 namespace llm {
 namespace {
 // clang-format off
@@ -270,7 +268,7 @@ ModelOutput VLMEngine::execute_model(Batch& batch) {
   Timer timer;
   auto model_inputs = batch.prepare_model_input(options_.num_decoding_tokens(),
                                                 adjusted_batch_size);
-  // COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());
+  COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());
 
   if (!model_inputs.token_ids.defined()) {
     // empty input, just return
diff --git a/src/engine/vlm_worker.cpp b/src/engine/vlm_worker.cpp
index 7a6bf02a..cbc37a2c 100644
--- a/src/engine/vlm_worker.cpp
+++ b/src/engine/vlm_worker.cpp
@@ -15,6 +15,7 @@
 #include "common/metrics.h"
 #include "common/threadpool.h"
 #include "common/timer.h"
+#include "engine_metrics.h"
 #include "memory/kv_cache.h"
 #include "memory/memory.h"
 #include "model_loader/state_dict.h"
@@ -23,19 +24,6 @@
 #include "sampling/logits_processor.h"
 #include "sampling/sampler.h"
 
-// latency metrics
-// DEFINE_COUNTER_FAMILY(execution_latency_seconds,
-//                       "Execution latency in seconds");
-// DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
-//                         execution_latency_seconds,
-//                         {{"stage", "model"}});
-// DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
-//                         execution_latency_seconds,
-//                         {{"stage", "logits_processing"}});
-// DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
-//                         execution_latency_seconds,
-//                         {{"stage", "sampling"}});
-
 namespace llm {
 
 VLMWorker::VLMWorker(const ParallelArgs& parallel_args,
@@ -149,7 +137,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
   }
   at::cuda::getCurrentCUDAStream().synchronize();
-  // COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());
+  COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());
 
   if (!driver_) {
     return std::nullopt;
@@ -166,7 +154,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
                                       sampling_params.unique_token_ids,
                                       sampling_params.unique_token_counts,
                                       sampling_params.unique_token_ids_lens);
-  // COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());
+  COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());
 
   // set logits to output
   output.logits = logits;
@@ -179,7 +167,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
     auto sample_logits =
         logits.index_select(/*dim=*/0, sampling_params.sample_idxes);
     auto sample_output = sampler->forward(sample_logits);
-    // COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());
+    COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());
 
     // set sample output to output
     output.sample_output = sample_output;
diff --git a/src/engine/worker.cpp b/src/engine/worker.cpp
index af95386f..655c3162 100644
--- a/src/engine/worker.cpp
+++ b/src/engine/worker.cpp
@@ -15,6 +15,7 @@
 #include "common/metrics.h"
 #include "common/threadpool.h"
 #include "common/timer.h"
+#include "engine_metrics.h"
 #include "memory/kv_cache.h"
 #include "memory/memory.h"
 #include "model_loader/state_dict.h"
@@ -23,19 +24,6 @@
 #include "sampling/logits_processor.h"
 #include "sampling/sampler.h"
 
-// 
latency metrics -DEFINE_COUNTER_FAMILY(execution_latency_seconds, - "Execution latency in seconds"); -DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds, - execution_latency_seconds, - {{"stage", "model"}}); -DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds, - execution_latency_seconds, - {{"stage", "logits_processing"}}); -DEFINE_COUNTER_INSTANCE(sampling_latency_seconds, - execution_latency_seconds, - {{"stage", "sampling"}}); - namespace llm { Worker::Worker(const ParallelArgs& parallel_args, diff --git a/src/handlers/CMakeLists.txt b/src/handlers/CMakeLists.txt index e23e4fa8..4595cc2a 100644 --- a/src/handlers/CMakeLists.txt +++ b/src/handlers/CMakeLists.txt @@ -7,9 +7,11 @@ cc_library( sampling_params.h llm_handler.h vlm_handler.h + handler_metrics.h SRCS llm_handler.cpp vlm_handler.cpp + handler_metrics.cpp DEPS :common :scheduler diff --git a/src/handlers/handler_metrics.cpp b/src/handlers/handler_metrics.cpp new file mode 100644 index 00000000..38d7ae9f --- /dev/null +++ b/src/handlers/handler_metrics.cpp @@ -0,0 +1,42 @@ +#include "handler_metrics.h" + +DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status"); +DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}}); +DEFINE_COUNTER_INSTANCE(request_cancelled, + request_status_total, + {{"code", "CANCELLED"}}); +DEFINE_COUNTER_INSTANCE(request_unknown, + request_status_total, + {{"code", "UNKNOWN"}}); +DEFINE_COUNTER_INSTANCE(request_invalid_argument, + request_status_total, + {{"code", "INVALID_ARGUMENT"}}); +DEFINE_COUNTER_INSTANCE(request_deadline_exceeded, + request_status_total, + {{"code", "DEADLINE_EXCEEDED"}}); +DEFINE_COUNTER_INSTANCE(request_resource_exhausted, + request_status_total, + {{"code", "RESOURCE_EXHAUSTED"}}); +DEFINE_COUNTER_INSTANCE(request_unauthenticated, + request_status_total, + {{"code", "UNAUTHENTICATED"}}); +DEFINE_COUNTER_INSTANCE(request_unavailable, + request_status_total, + {{"code", "UNAVAILABLE"}}); +DEFINE_COUNTER_INSTANCE(request_unimplemented, + request_status_total, + {{"code", "UNIMPLEMENTED"}}); + +DEFINE_COUNTER_FAMILY(request_handling_latency_seconds, + "Request handling latency in seconds"); +DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds, + request_handling_latency_seconds, + {{"type", "chat"}}); +DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds, + request_handling_latency_seconds, + {{"type", "completion"}}); + +DEFINE_COUNTER(tokenization_latency_seconds, + "Prompt tokenization latency in seconds"); +DEFINE_COUNTER(chat_template_latency_seconds, + "Chat template latency in seconds"); diff --git a/src/handlers/handler_metrics.h b/src/handlers/handler_metrics.h new file mode 100644 index 00000000..06bef77c --- /dev/null +++ b/src/handlers/handler_metrics.h @@ -0,0 +1,19 @@ +#pragma once + +#include "common/metrics.h" + +DECLARE_COUNTER_FAMILY(request_status_total) +DECLARE_COUNTER_INSTANCE(request_ok) +DECLARE_COUNTER_INSTANCE(request_cancelled) +DECLARE_COUNTER_INSTANCE(request_unknown) +DECLARE_COUNTER_INSTANCE(request_invalid_argument) +DECLARE_COUNTER_INSTANCE(request_deadline_exceeded) +DECLARE_COUNTER_INSTANCE(request_resource_exhausted) +DECLARE_COUNTER_INSTANCE(request_unauthenticated) +DECLARE_COUNTER_INSTANCE(request_unavailable) +DECLARE_COUNTER_INSTANCE(request_unimplemented) +DECLARE_COUNTER_FAMILY(request_handling_latency_seconds) +DECLARE_COUNTER_INSTANCE(chat_handling_latency_seconds) +DECLARE_COUNTER_INSTANCE(completion_handling_latency_seconds) +DECLARE_COUNTER(tokenization_latency_seconds) 
+DECLARE_COUNTER(chat_template_latency_seconds) diff --git a/src/handlers/llm_handler.cpp b/src/handlers/llm_handler.cpp index 47d9e0e0..f4768291 100644 --- a/src/handlers/llm_handler.cpp +++ b/src/handlers/llm_handler.cpp @@ -13,53 +13,13 @@ #include "common/scope_guard.h" #include "common/timer.h" #include "engine/utils.h" +#include "handler_metrics.h" #include "models/model_args.h" #include "models/model_registry.h" #include "request/output.h" #include "request/request.h" #include "speculative/speculative_engine.h" -DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status"); -DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}}); -DEFINE_COUNTER_INSTANCE(request_cancelled, - request_status_total, - {{"code", "CANCELLED"}}); -DEFINE_COUNTER_INSTANCE(request_unknown, - request_status_total, - {{"code", "UNKNOWN"}}); -DEFINE_COUNTER_INSTANCE(request_invalid_argument, - request_status_total, - {{"code", "INVALID_ARGUMENT"}}); -DEFINE_COUNTER_INSTANCE(request_deadline_exceeded, - request_status_total, - {{"code", "DEADLINE_EXCEEDED"}}); -DEFINE_COUNTER_INSTANCE(request_resource_exhausted, - request_status_total, - {{"code", "RESOURCE_EXHAUSTED"}}); -DEFINE_COUNTER_INSTANCE(request_unauthenticated, - request_status_total, - {{"code", "UNAUTHENTICATED"}}); -DEFINE_COUNTER_INSTANCE(request_unavailable, - request_status_total, - {{"code", "UNAVAILABLE"}}); -DEFINE_COUNTER_INSTANCE(request_unimplemented, - request_status_total, - {{"code", "UNIMPLEMENTED"}}); - -DEFINE_COUNTER_FAMILY(request_handling_latency_seconds, - "Request handling latency in seconds"); -DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds, - request_handling_latency_seconds, - {{"type", "chat"}}); -DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds, - request_handling_latency_seconds, - {{"type", "completion"}}); - -DEFINE_COUNTER(tokenization_latency_seconds, - "Prompt tokenization latency in seconds"); -DEFINE_COUNTER(chat_template_latency_seconds, - "Chat template latency in seconds"); - namespace llm { namespace { diff --git a/src/handlers/vlm_handler.cpp b/src/handlers/vlm_handler.cpp index 80def6a7..b415c831 100644 --- a/src/handlers/vlm_handler.cpp +++ b/src/handlers/vlm_handler.cpp @@ -14,92 +14,52 @@ #include "common/timer.h" #include "engine/utils.h" #include "engine/vlm_engine.h" +#include "handler_metrics.h" #include "models/model_args.h" #include "models/model_registry.h" #include "request/output.h" #include "request/request.h" #include "speculative/speculative_engine.h" -// DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request -// status"); DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", -// "OK"}}); DEFINE_COUNTER_INSTANCE(request_cancelled, -// request_status_total, -// {{"code", "CANCELLED"}}); -// DEFINE_COUNTER_INSTANCE(request_unknown, -// request_status_total, -// {{"code", "UNKNOWN"}}); -// DEFINE_COUNTER_INSTANCE(request_invalid_argument, -// request_status_total, -// {{"code", "INVALID_ARGUMENT"}}); -// DEFINE_COUNTER_INSTANCE(request_deadline_exceeded, -// request_status_total, -// {{"code", "DEADLINE_EXCEEDED"}}); -// DEFINE_COUNTER_INSTANCE(request_resource_exhausted, -// request_status_total, -// {{"code", "RESOURCE_EXHAUSTED"}}); -// DEFINE_COUNTER_INSTANCE(request_unauthenticated, -// request_status_total, -// {{"code", "UNAUTHENTICATED"}}); -// DEFINE_COUNTER_INSTANCE(request_unavailable, -// request_status_total, -// {{"code", "UNAVAILABLE"}}); -// DEFINE_COUNTER_INSTANCE(request_unimplemented, -// 
request_status_total,
-//                         {{"code", "UNIMPLEMENTED"}});
-
-// DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
-//                       "Request handling latency in seconds");
-// DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
-//                         request_handling_latency_seconds,
-//                         {{"type", "chat"}});
-// DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
-//                         request_handling_latency_seconds,
-//                         {{"type", "completion"}});
-
-// DEFINE_COUNTER(tokenization_latency_seconds,
-//                "Prompt tokenization latency in seconds");
-// DEFINE_COUNTER(chat_template_latency_seconds,
-//                "Chat template latency in seconds");
-
 namespace llm {
 namespace {
 
 #define CALLBACK_WITH_ERROR(CODE, MSG) callback(Status{CODE, MSG});
 
-// void log_request_status(StatusCode code) {
-//   switch (code) {
-//     case StatusCode::OK:
-//       COUNTER_INC(request_ok);
-//       break;
-//     case StatusCode::CANCELLED:
-//       COUNTER_INC(request_cancelled);
-//       break;
-//     case StatusCode::UNKNOWN:
-//       COUNTER_INC(request_unknown);
-//       break;
-//     case StatusCode::INVALID_ARGUMENT:
-//       COUNTER_INC(request_invalid_argument);
-//       break;
-//     case StatusCode::DEADLINE_EXCEEDED:
-//       COUNTER_INC(request_deadline_exceeded);
-//       break;
-//     case StatusCode::RESOURCE_EXHAUSTED:
-//       COUNTER_INC(request_resource_exhausted);
-//       break;
-//     case StatusCode::UNAUTHENTICATED:
-//       COUNTER_INC(request_unauthenticated);
-//       break;
-//     case StatusCode::UNAVAILABLE:
-//       COUNTER_INC(request_unavailable);
-//       break;
-//     case StatusCode::UNIMPLEMENTED:
-//       COUNTER_INC(request_unimplemented);
-//       break;
-//     default:
-//       COUNTER_INC(request_unknown);
-//       break;
-//   }
-// }
+void log_request_status(StatusCode code) {
+  switch (code) {
+    case StatusCode::OK:
+      COUNTER_INC(request_ok);
+      break;
+    case StatusCode::CANCELLED:
+      COUNTER_INC(request_cancelled);
+      break;
+    case StatusCode::UNKNOWN:
+      COUNTER_INC(request_unknown);
+      break;
+    case StatusCode::INVALID_ARGUMENT:
+      COUNTER_INC(request_invalid_argument);
+      break;
+    case StatusCode::DEADLINE_EXCEEDED:
+      COUNTER_INC(request_deadline_exceeded);
+      break;
+    case StatusCode::RESOURCE_EXHAUSTED:
+      COUNTER_INC(request_resource_exhausted);
+      break;
+    case StatusCode::UNAUTHENTICATED:
+      COUNTER_INC(request_unauthenticated);
+      break;
+    case StatusCode::UNAVAILABLE:
+      COUNTER_INC(request_unavailable);
+      break;
+    case StatusCode::UNIMPLEMENTED:
+      COUNTER_INC(request_unimplemented);
+      break;
+    default:
+      COUNTER_INC(request_unknown);
+      break;
+  }
+}
 
 bool verify_params(const SamplingParams& sp, OutputCallback callback) {
   if (sp.n == 0) {
@@ -220,7 +180,7 @@ std::future<bool> VLMHandler::schedule_async(torch::Tensor image,
       stream,
       [callback = std::move(callback)](const RequestOutput& output) {
         if (output.status.has_value()) {
-          // log_request_status(output.status.value().code());
+          log_request_status(output.status.value().code());
         }
         return callback(output);
       });
@@ -243,7 +203,7 @@ std::future<bool> VLMHandler::schedule(torch::Tensor image,
       priority,
       stream,
       callback = std::move(callback)](size_t tid) mutable {
-    // AUTO_COUNTER(completion_handling_latency_seconds);
+    AUTO_COUNTER(completion_handling_latency_seconds);
 
     // remove the pending request after scheduling
     SCOPE_GUARD([this] { scheduler_->dec_pending_requests(); });
@@ -343,7 +303,7 @@ std::unique_ptr<Request> VLMHandler::create_request(size_t tid,
                         "Failed to encode prompt");
     return nullptr;
   }
-  // COUNTER_ADD(tokenization_latency_seconds, timer.elapsed_seconds());
+  COUNTER_ADD(tokenization_latency_seconds, timer.elapsed_seconds());
 
   // encode the image, encode & projector
   auto vision_engine =
      dynamic_cast<VLMEngine*>(engine_.get());