Skip to content

bugfix: fix multiple definition issue. #261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions scalellm/csrc/vlm_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,21 @@ namespace py = pybind11;
using namespace pybind11::literals;

void init_vlm_handler(py::module_& m) {
py::enum_<Priority>(m, "Priority")
.value("DEFAULT", Priority::NORMAL)
.value("LOW", Priority::LOW)
.value("NORMAL", Priority::NORMAL)
.value("HIGH", Priority::HIGH)
.export_values();

py::class_<std::future<bool>>(m, "Future")
.def("wait",
&std::future<bool>::wait,
py::call_guard<py::gil_scoped_release>())
.def("get",
&std::future<bool>::get,
py::call_guard<py::gil_scoped_release>());

auto vlm_handler =
py::class_<VLMHandler>(m, "VLMHandler")
.def(py::init<const VLMHandler::Options&>(), py::arg("options"))
Expand Down
38 changes: 21 additions & 17 deletions src/common/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,16 @@ class AutoCounter final {
// define gauge
// a gauge is a metric that represents a single numerical value that can
// arbitrarily go up and down.
// NOTE(review): each DEFINE_GAUGE* macro below is shown with two bodies, an
// `auto&` form followed by an explicitly typed `prometheus::...&` form. This
// looks like the before/after sides of a diff rendered without +/- markers --
// confirm which form is current before copying from here.
//
// DEFINE_GAUGE(name, desc): defines a reference named GAUGE_<name> bound to
// the result of llm::Metrics::Instance().BuildGauge(#name, desc).Add({}), i.e.
// the metric name is the stringified `name` and the label set is empty.
// The expansion already ends in ';'.
#define DEFINE_GAUGE(name, desc) \
auto& GAUGE_##name = llm::Metrics::Instance().BuildGauge(#name, desc).Add({});
#define DEFINE_GAUGE(name, desc) \
prometheus::Gauge& GAUGE_##name = \
llm::Metrics::Instance().BuildGauge(#name, desc).Add({});

// DEFINE_GAUGE_FAMILY(name, desc): defines a reference named <name>_family
// bound to the family returned by BuildGauge(#name, desc). No instance is
// added here; use DEFINE_GAUGE_INSTANCE for that.
#define DEFINE_GAUGE_FAMILY(name, desc) \
auto& name##_family = llm::Metrics::Instance().BuildGauge(#name, desc);
#define DEFINE_GAUGE_FAMILY(name, desc) \
prometheus::Family<prometheus::Gauge>& name##_family = \
llm::Metrics::Instance().BuildGauge(#name, desc);

// DEFINE_GAUGE_INSTANCE(alias, name, ...): defines GAUGE_<alias> by calling
// <name>_family.Add(...) with the variadic arguments forwarded unchanged.
// It names <name>_family directly, so that identifier must be visible at the
// point of use (e.g. from an earlier DEFINE_GAUGE_FAMILY(name, ...)).
#define DEFINE_GAUGE_INSTANCE(alias, name, ...) \
auto& GAUGE_##alias = name##_family.Add(__VA_ARGS__);
prometheus::Gauge& GAUGE_##alias = name##_family.Add(__VA_ARGS__);

// GAUGE_SET(name, value): calls Set(value) on GAUGE_<name>. The expansion
// includes the trailing ';'.
#define GAUGE_SET(name, value) GAUGE_##name.Set(value);

Expand All @@ -111,15 +113,16 @@ class AutoCounter final {
// define counter
// a counter is a monotonically increasing counter whose value can only increase
// or be reset to zero on restart.
// NOTE(review): each DEFINE_COUNTER* macro below is shown with two bodies, an
// `auto&` form followed by an explicitly typed `prometheus::...&` form. This
// looks like the before/after sides of a diff rendered without +/- markers --
// confirm which form is current before copying from here.
//
// DEFINE_COUNTER(name, desc): defines a reference named COUNTER_<name> bound
// to llm::Metrics::Instance().BuildCounter(#name, desc).Add({}), i.e. the
// metric name is the stringified `name` and the label set is empty.
// The expansion already ends in ';'.
#define DEFINE_COUNTER(name, desc) \
auto& COUNTER_##name = \
#define DEFINE_COUNTER(name, desc) \
prometheus::Counter& COUNTER_##name = \
llm::Metrics::Instance().BuildCounter(#name, desc).Add({});

// DEFINE_COUNTER_FAMILY(name, desc): defines a reference named <name>_family
// bound to the family returned by BuildCounter(#name, desc). No instance is
// added here; use DEFINE_COUNTER_INSTANCE for that.
#define DEFINE_COUNTER_FAMILY(name, desc) \
auto& name##_family = llm::Metrics::Instance().BuildCounter(#name, desc);
#define DEFINE_COUNTER_FAMILY(name, desc) \
prometheus::Family<prometheus::Counter>& name##_family = \
llm::Metrics::Instance().BuildCounter(#name, desc);

// DEFINE_COUNTER_INSTANCE(alias, name, ...): defines COUNTER_<alias> by
// calling <name>_family.Add(...) with the variadic arguments forwarded
// unchanged. It names <name>_family directly, so that identifier must be
// visible at the point of use (e.g. from an earlier DEFINE_COUNTER_FAMILY).
#define DEFINE_COUNTER_INSTANCE(alias, name, ...) \
auto& COUNTER_##alias = name##_family.Add(__VA_ARGS__);
prometheus::Counter& COUNTER_##alias = name##_family.Add(__VA_ARGS__);

// COUNTER_ADD(name, value): calls Increment(value) on COUNTER_<name>. The
// expansion includes the trailing ';'.
#define COUNTER_ADD(name, value) COUNTER_##name.Increment(value);

Expand All @@ -133,16 +136,17 @@ class AutoCounter final {
// a histogram samples observations (usually things like request durations or
// response sizes) and counts them in configurable buckets. It also provides a
// sum of all observed values.
// NOTE(review): each DEFINE_HISTOGRAM* macro below is shown with two bodies,
// an `auto&` form followed by an explicitly typed `prometheus::...&` form.
// This looks like the before/after sides of a diff rendered without +/-
// markers -- confirm which form is current before copying from here.
//
// DEFINE_HISTOGRAM(name, desc, ...): defines a reference named
// HISTOGRAM_<name> bound to
// llm::Metrics::Instance().BuildHistogram(#name, desc).Add({}, ...), i.e. an
// empty label set followed by the variadic arguments (presumably the bucket
// boundaries -- confirm against the histogram API). The expansion already
// ends in ';'.
#define DEFINE_HISTOGRAM(name, desc, ...) \
auto& HISTOGRAM_##name = llm::Metrics::Instance() \
.BuildHistogram(#name, desc) \
.Add({}, __VA_ARGS__);
#define DEFINE_HISTOGRAM(name, desc, ...) \
prometheus::Histogram& HISTOGRAM_##name = llm::Metrics::Instance() \
.BuildHistogram(#name, desc) \
.Add({}, __VA_ARGS__);

// DEFINE_HISTOGRAM_FAMILY(name, desc): defines a reference named
// <name>_family bound to the family returned by BuildHistogram(#name, desc).
// No instance is added here; use DEFINE_HISTOGRAM_INSTANCE for that.
#define DEFINE_HISTOGRAM_FAMILY(name, desc) \
auto& name##_family = llm::Metrics::Instance().BuildHistogram(#name, desc);
#define DEFINE_HISTOGRAM_FAMILY(name, desc) \
prometheus::Family<prometheus::Histogram>& name##_family = \
llm::Metrics::Instance().BuildHistogram(#name, desc);

// DEFINE_HISTOGRAM_INSTANCE(alias, name, ...): defines HISTOGRAM_<alias> by
// calling <name>_family.Add(...) with the variadic arguments forwarded
// unchanged. It names <name>_family directly, so that identifier must be
// visible at the point of use (e.g. from an earlier DEFINE_HISTOGRAM_FAMILY).
#define DEFINE_HISTOGRAM_INSTANCE(alias, name, ...) \
auto& HISTOGRAM_##alias = name##_family.Add(__VA_ARGS__);
prometheus::Histogram& HISTOGRAM_##alias = name##_family.Add(__VA_ARGS__);

// HISTOGRAM_OBSERVE(name, value): calls Observe(value) on HISTOGRAM_<name>.
// The expansion includes the trailing ';'.
#define HISTOGRAM_OBSERVE(name, value) HISTOGRAM_##name.Observe(value);

Expand Down
2 changes: 2 additions & 0 deletions src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ cc_library(
engine.h
llm_engine.h
vlm_engine.h
engine_metrics.h
SRCS
utils.cpp
batch.cpp
Expand All @@ -22,6 +23,7 @@ cc_library(
vlm_worker.cpp
llm_engine.cpp
vlm_engine.cpp
engine_metrics.cpp
DEPS
torch
:common
Expand Down
15 changes: 15 additions & 0 deletions src/engine/engine_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "engine_metrics.h"

// Definitions of the engine latency counters. The matching DECLARE_COUNTER*
// lines live in engine_metrics.h, which the engine/worker sources include in
// order to update these counters with COUNTER_ADD.

// Unlabelled counter registered under the name "prepare_input_latency_seconds".
DEFINE_COUNTER(prepare_input_latency_seconds,
"Latency of preparing input in seconds");
// Counter family "execution_latency_seconds". It must be defined before the
// instances below, because DEFINE_COUNTER_INSTANCE refers to
// execution_latency_seconds_family by name.
DEFINE_COUNTER_FAMILY(execution_latency_seconds,
"Execution latency in seconds");
// One counter per execution stage, distinguished by the "stage" label.
DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
execution_latency_seconds,
{{"stage", "model"}});
DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
execution_latency_seconds,
{{"stage", "logits_processing"}});
DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
execution_latency_seconds,
{{"stage", "sampling"}});
9 changes: 9 additions & 0 deletions src/engine/engine_metrics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include "common/metrics.h"

// Declarations for the engine latency counters; the corresponding
// DEFINE_COUNTER* invocations are in engine_metrics.cpp. Sources that update
// these counters via COUNTER_ADD include this header.
// NOTE(review): the DECLARE_* macros are not visible here. Presumably they
// expand to extern reference declarations that already end in ';' (the
// invocations below carry no trailing semicolon) -- confirm in
// common/metrics.h.
DECLARE_COUNTER(prepare_input_latency_seconds)
DECLARE_COUNTER_FAMILY(execution_latency_seconds)
DECLARE_COUNTER_INSTANCE(model_execution_latency_seconds)
DECLARE_COUNTER_INSTANCE(logits_processing_latency_seconds)
DECLARE_COUNTER_INSTANCE(sampling_latency_seconds)
4 changes: 1 addition & 3 deletions src/engine/llm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,12 @@

#include "common/metrics.h"
#include "common/pretty_print.h"
#include "engine_metrics.h"
#include "model_loader/model_loader.h"
#include "model_parallel/parallel_args.h"
#include "models/model_args.h"
#include "worker.h"

// Unlabelled counter registered under the name "prepare_input_latency_seconds".
// NOTE(review): the diff header for src/engine/llm_engine.cpp reports
// 3 deletions and the same definition appears in src/engine/engine_metrics.cpp,
// so this looks like the removed side of the change -- confirm before relying
// on it.
DEFINE_COUNTER(prepare_input_latency_seconds,
"Latency of preparing input in seconds");

namespace llm {
namespace {
const std::vector<uint32_t> kDefaultBatchSizesForCudaGraph =
Expand Down
6 changes: 2 additions & 4 deletions src/engine/vlm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@

#include "common/metrics.h"
#include "common/pretty_print.h"
#include "engine_metrics.h"
#include "model_loader/model_loader.h"
#include "model_parallel/parallel_args.h"
#include "models/model_args.h"
#include "vlm_worker.h"

// DEFINE_COUNTER(prepare_input_latency_seconds,
// "Latency of preparing input in seconds");

namespace llm {
namespace {
// clang-format off
Expand Down Expand Up @@ -270,7 +268,7 @@ ModelOutput VLMEngine::execute_model(Batch& batch) {
Timer timer;
auto model_inputs = batch.prepare_model_input(options_.num_decoding_tokens(),
adjusted_batch_size);
// COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());

if (!model_inputs.token_ids.defined()) {
// empty input, just return
Expand Down
20 changes: 4 additions & 16 deletions src/engine/vlm_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "common/metrics.h"
#include "common/threadpool.h"
#include "common/timer.h"
#include "engine_metrics.h"
#include "memory/kv_cache.h"
#include "memory/memory.h"
#include "model_loader/state_dict.h"
Expand All @@ -23,19 +24,6 @@
#include "sampling/logits_processor.h"
#include "sampling/sampler.h"

// latency metrics
// DEFINE_COUNTER_FAMILY(execution_latency_seconds,
// "Execution latency in seconds");
// DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
// execution_latency_seconds,
// {{"stage", "model"}});
// DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
// execution_latency_seconds,
// {{"stage", "logits_processing"}});
// DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
// execution_latency_seconds,
// {{"stage", "sampling"}});

namespace llm {

VLMWorker::VLMWorker(const ParallelArgs& parallel_args,
Expand Down Expand Up @@ -149,7 +137,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
}

at::cuda::getCurrentCUDAStream().synchronize();
// COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());

if (!driver_) {
return std::nullopt;
Expand All @@ -166,7 +154,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
sampling_params.unique_token_ids,
sampling_params.unique_token_counts,
sampling_params.unique_token_ids_lens);
// COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());

// set logits to output
output.logits = logits;
Expand All @@ -179,7 +167,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
auto sample_logits =
logits.index_select(/*dim=*/0, sampling_params.sample_idxes);
auto sample_output = sampler->forward(sample_logits);
// COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());

// set sample output to output
output.sample_output = sample_output;
Expand Down
14 changes: 1 addition & 13 deletions src/engine/worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "common/metrics.h"
#include "common/threadpool.h"
#include "common/timer.h"
#include "engine_metrics.h"
#include "memory/kv_cache.h"
#include "memory/memory.h"
#include "model_loader/state_dict.h"
Expand All @@ -23,19 +24,6 @@
#include "sampling/logits_processor.h"
#include "sampling/sampler.h"

// latency metrics
// Counter family "execution_latency_seconds" plus one instance per stage,
// distinguished by the "stage" label. The family is defined first because
// DEFINE_COUNTER_INSTANCE refers to execution_latency_seconds_family by name.
// NOTE(review): the diff header for src/engine/worker.cpp reports
// 13 deletions and the same definitions appear in
// src/engine/engine_metrics.cpp, so this group looks like the removed side of
// the change -- confirm before relying on it.
DEFINE_COUNTER_FAMILY(execution_latency_seconds,
"Execution latency in seconds");
DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
execution_latency_seconds,
{{"stage", "model"}});
DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
execution_latency_seconds,
{{"stage", "logits_processing"}});
DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
execution_latency_seconds,
{{"stage", "sampling"}});

namespace llm {

Worker::Worker(const ParallelArgs& parallel_args,
Expand Down
2 changes: 2 additions & 0 deletions src/handlers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ cc_library(
sampling_params.h
llm_handler.h
vlm_handler.h
handler_metrics.h
SRCS
llm_handler.cpp
vlm_handler.cpp
handler_metrics.cpp
DEPS
:common
:scheduler
Expand Down
42 changes: 42 additions & 0 deletions src/handlers/handler_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "handler_metrics.h"

// Definitions of the request-handler counters. The matching DECLARE_COUNTER*
// lines live in handler_metrics.h.

// Counter family "request_status_total" with one instance per status code,
// distinguished by the "code" label. The family is defined first because
// DEFINE_COUNTER_INSTANCE refers to request_status_total_family by name.
DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status");
DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}});
DEFINE_COUNTER_INSTANCE(request_cancelled,
request_status_total,
{{"code", "CANCELLED"}});
DEFINE_COUNTER_INSTANCE(request_unknown,
request_status_total,
{{"code", "UNKNOWN"}});
DEFINE_COUNTER_INSTANCE(request_invalid_argument,
request_status_total,
{{"code", "INVALID_ARGUMENT"}});
DEFINE_COUNTER_INSTANCE(request_deadline_exceeded,
request_status_total,
{{"code", "DEADLINE_EXCEEDED"}});
DEFINE_COUNTER_INSTANCE(request_resource_exhausted,
request_status_total,
{{"code", "RESOURCE_EXHAUSTED"}});
DEFINE_COUNTER_INSTANCE(request_unauthenticated,
request_status_total,
{{"code", "UNAUTHENTICATED"}});
DEFINE_COUNTER_INSTANCE(request_unavailable,
request_status_total,
{{"code", "UNAVAILABLE"}});
DEFINE_COUNTER_INSTANCE(request_unimplemented,
request_status_total,
{{"code", "UNIMPLEMENTED"}});

// Counter family "request_handling_latency_seconds" with one instance per
// request type, distinguished by the "type" label.
DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
"Request handling latency in seconds");
DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "chat"}});
DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "completion"}});

// Unlabelled counters for prompt tokenization and chat-template latency.
DEFINE_COUNTER(tokenization_latency_seconds,
"Prompt tokenization latency in seconds");
DEFINE_COUNTER(chat_template_latency_seconds,
"Chat template latency in seconds");
19 changes: 19 additions & 0 deletions src/handlers/handler_metrics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once

#include "common/metrics.h"

// Declarations for the request-handler counters; the corresponding
// DEFINE_COUNTER* invocations are in handler_metrics.cpp.
// NOTE(review): the DECLARE_* macros are not visible here. Presumably they
// expand to extern reference declarations that already end in ';' (the
// invocations below carry no trailing semicolon) -- confirm in
// common/metrics.h.

// Request status counters (family plus one instance per status code).
DECLARE_COUNTER_FAMILY(request_status_total)
DECLARE_COUNTER_INSTANCE(request_ok)
DECLARE_COUNTER_INSTANCE(request_cancelled)
DECLARE_COUNTER_INSTANCE(request_unknown)
DECLARE_COUNTER_INSTANCE(request_invalid_argument)
DECLARE_COUNTER_INSTANCE(request_deadline_exceeded)
DECLARE_COUNTER_INSTANCE(request_resource_exhausted)
DECLARE_COUNTER_INSTANCE(request_unauthenticated)
DECLARE_COUNTER_INSTANCE(request_unavailable)
DECLARE_COUNTER_INSTANCE(request_unimplemented)
// Request handling latency counters (family plus chat/completion instances).
DECLARE_COUNTER_FAMILY(request_handling_latency_seconds)
DECLARE_COUNTER_INSTANCE(chat_handling_latency_seconds)
DECLARE_COUNTER_INSTANCE(completion_handling_latency_seconds)
// Unlabelled tokenization and chat-template latency counters.
DECLARE_COUNTER(tokenization_latency_seconds)
DECLARE_COUNTER(chat_template_latency_seconds)
42 changes: 1 addition & 41 deletions src/handlers/llm_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,53 +13,13 @@
#include "common/scope_guard.h"
#include "common/timer.h"
#include "engine/utils.h"
#include "handler_metrics.h"
#include "models/model_args.h"
#include "models/model_registry.h"
#include "request/output.h"
#include "request/request.h"
#include "speculative/speculative_engine.h"

// NOTE(review): the diff header for src/handlers/llm_handler.cpp reports
// 41 deletions and the same definitions appear in
// src/handlers/handler_metrics.cpp, so this group looks like the removed side
// of the change -- confirm before relying on it.

// Counter family "request_status_total" with one instance per status code,
// distinguished by the "code" label. The family is defined first because
// DEFINE_COUNTER_INSTANCE refers to request_status_total_family by name.
DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status");
DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}});
DEFINE_COUNTER_INSTANCE(request_cancelled,
request_status_total,
{{"code", "CANCELLED"}});
DEFINE_COUNTER_INSTANCE(request_unknown,
request_status_total,
{{"code", "UNKNOWN"}});
DEFINE_COUNTER_INSTANCE(request_invalid_argument,
request_status_total,
{{"code", "INVALID_ARGUMENT"}});
DEFINE_COUNTER_INSTANCE(request_deadline_exceeded,
request_status_total,
{{"code", "DEADLINE_EXCEEDED"}});
DEFINE_COUNTER_INSTANCE(request_resource_exhausted,
request_status_total,
{{"code", "RESOURCE_EXHAUSTED"}});
DEFINE_COUNTER_INSTANCE(request_unauthenticated,
request_status_total,
{{"code", "UNAUTHENTICATED"}});
DEFINE_COUNTER_INSTANCE(request_unavailable,
request_status_total,
{{"code", "UNAVAILABLE"}});
DEFINE_COUNTER_INSTANCE(request_unimplemented,
request_status_total,
{{"code", "UNIMPLEMENTED"}});

// Counter family "request_handling_latency_seconds" with one instance per
// request type, distinguished by the "type" label.
DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
"Request handling latency in seconds");
DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "chat"}});
DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "completion"}});

// Unlabelled counters for prompt tokenization and chat-template latency.
DEFINE_COUNTER(tokenization_latency_seconds,
"Prompt tokenization latency in seconds");
DEFINE_COUNTER(chat_template_latency_seconds,
"Chat template latency in seconds");

namespace llm {
namespace {

Expand Down
Loading