5 changes: 4 additions & 1 deletion crates/grpc_client/build.rs
@@ -2,6 +2,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// Rebuild triggers
println!("cargo:rerun-if-changed=proto/common.proto");
println!("cargo:rerun-if-changed=proto/sglang_scheduler.proto");
println!("cargo:rerun-if-changed=proto/tokenspeed_scheduler.proto");
println!("cargo:rerun-if-changed=proto/vllm_engine.proto");
println!("cargo:rerun-if-changed=proto/trtllm_service.proto");
println!("cargo:rerun-if-changed=proto/mlx_engine.proto");
@@ -20,7 +21,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.extern_path(".smg.grpc.common", "crate::common_proto")
.type_attribute("GetModelInfoResponse", "#[derive(serde::Serialize)]")
// vllm + trtllm ServerInfo have only primitive fields.
// sglang's contains prost_types::{Struct,Timestamp} so it's handled separately.
// sglang's and tokenspeed's contain prost_types::{Struct,Timestamp};
// those are handled separately at the wrapper layer.
.type_attribute(
"vllm.grpc.engine.GetServerInfoResponse",
"#[derive(serde::Serialize)]",
@@ -40,6 +42,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
"proto/vllm_engine.proto",
"proto/trtllm_service.proto",
"proto/mlx_engine.proto",
"proto/tokenspeed_scheduler.proto",
],
&["proto"],
)?;
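For context on the "handled separately at the wrapper layer" note above: `prost_types::Struct` and `prost_types::Timestamp` do not implement `serde::Serialize`, so responses carrying them cannot simply receive a `#[derive(serde::Serialize)]` type attribute from the build script. A minimal sketch of the kind of manual conversion the wrapper layer needs; the function names (`struct_to_json`, `value_to_json`) are illustrative, not the crate's actual API:

```rust
// Sketch only: convert a prost well-known Struct into serde_json so the
// wrapper can serialize GetServerInfoResponse by hand.
use prost_types::{value::Kind, ListValue, Struct, Value};

fn struct_to_json(s: &Struct) -> serde_json::Value {
    serde_json::Value::Object(
        s.fields
            .iter()
            .map(|(k, v)| (k.clone(), value_to_json(v)))
            .collect(),
    )
}

fn value_to_json(v: &Value) -> serde_json::Value {
    match &v.kind {
        None | Some(Kind::NullValue(_)) => serde_json::Value::Null,
        Some(Kind::NumberValue(n)) => serde_json::Value::from(*n),
        Some(Kind::StringValue(s)) => serde_json::Value::String(s.clone()),
        Some(Kind::BoolValue(b)) => serde_json::Value::Bool(*b),
        Some(Kind::StructValue(s)) => struct_to_json(s),
        Some(Kind::ListValue(ListValue { values })) => {
            serde_json::Value::Array(values.iter().map(value_to_json).collect())
        }
    }
}
```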
300 changes: 300 additions & 0 deletions crates/grpc_client/proto/tokenspeed_scheduler.proto
@@ -0,0 +1,300 @@
syntax = "proto3";

package tokenspeed.grpc.scheduler;

import "google/protobuf/timestamp.proto";
import "google/protobuf/struct.proto";

// Service definition for TokenSpeed scheduler communication.
//
// TokenSpeed has its own service identity AND its own message shapes; the
// wire definition is fully self-contained, with zero dependencies on
// ``sglang_scheduler.proto``. The message catalog is intentionally minimal:
// it covers what TokenSpeed's top-tier LLMs (Kimi K2, MiniMax M2, Qwen 3,
// gpt-oss, DeepSeek V4) actually need today, and nothing more. Anything
// SGLang-specific (PD-disaggregated serving, LoRA hot-swap, multimodal,
// classifier outputs, hidden-state forwarding, embeddings) is deliberately
// out of scope and lands here only when an explicit TokenSpeed use case
// shows up.
service TokenSpeedScheduler {
// Submit a generation request (server-streaming for token-by-token).
rpc Generate(GenerateRequest) returns (stream GenerateResponse);

// Liveness + readiness probe.
rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);

// Cancel a running request.
rpc Abort(AbortRequest) returns (AbortResponse);

// Static info about the loaded model.
rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);

// Runtime info about the server.
rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);

// Per-DP-rank load metrics (used by router for least-load).
rpc GetLoads(GetLoadsRequest) returns (GetLoadsResponse);
}

// =====================
// Sampling
// =====================

// IMPORTANT: proto3 numeric defaults (0) do NOT match semantic defaults
// (temperature=1.0, top_p=1.0, top_k=-1). All sampling scalars are
// declared ``optional`` so presence is preserved on the wire — the
// servicer uses ``HasField()`` to distinguish "client explicitly set 0"
// from "client didn't send anything." Without this, ``temperature=0``
// (a valid request for greedy decoding) is indistinguishable from the
// proto3 default and would be silently dropped by truthy-check guards.
//
// ``min_new_tokens`` is left non-optional because 0 is its semantic
// "no minimum" sentinel.
message SamplingParams {
optional float temperature = 1;
optional float top_p = 2;
optional int32 top_k = 3;
optional float min_p = 4;
optional float frequency_penalty = 5;
optional float presence_penalty = 6;
optional float repetition_penalty = 7;

optional uint32 max_new_tokens = 8;
uint32 min_new_tokens = 9;

repeated string stop = 10;
repeated uint32 stop_token_ids = 11;
bool ignore_eos = 12;

bool skip_special_tokens = 13;
bool spaces_between_special_tokens = 14;

// Number of samples (n in OpenAI API).
uint32 n = 15;

// Per-token logit bias.
map<string, float> logit_bias = 16;

// Structured generation. Currently xfailed in e2e (tokenspeed#361),
// but the wire shape stays so wiring it later doesn't bump the proto.
oneof constraint {
string regex = 17;
string json_schema = 18;
string ebnf_grammar = 19;
string structural_tag = 20;
}

// When true, generation does not strip the trailing matched stop token
// from ``output_ids`` (matches SGLang's ``no_stop_trim``). Combined with
// ``skip_special_tokens=False`` it lets the gateway-side detokenizer
// render the EOS marker in the visible response — required for the
// ``test_no_stop_trim_with_skip_special_false`` e2e check and for any
// downstream logic that needs the raw stop token in the output stream.
bool no_stop_trim = 22;

// Escape hatch for backend-specific knobs without bumping the proto.
google.protobuf.Struct custom_params = 21;
}

// =====================
// Generate
// =====================

message GenerateRequest {
string request_id = 1;

// Tokenized input (router does its own tokenization).
TokenizedInput tokenized = 2;

SamplingParams sampling_params = 3;

// Logprob options.
bool return_logprob = 4;
// Optional so the servicer can distinguish "client omitted" (use SGLang's
// ``-1`` default = no input logprobs) from an explicit value like 0.
optional int32 logprob_start_len = 5;
int32 top_logprobs_num = 6;
repeated uint32 token_ids_logprob = 7;

// Whether the client wants stream chunks (otherwise: complete-only).
bool stream = 8;
}

message TokenizedInput {
repeated uint32 input_ids = 1;
// Original text — purely cosmetic; the tokenizer pass is skipped because
// input_ids is set. Used in worker logs for traceability.
string original_text = 2;
}

message GenerateResponse {
string request_id = 1;

oneof response {
GenerateStreamChunk chunk = 2;
GenerateComplete complete = 3;
}
}

message GenerateStreamChunk {
// Generated tokens since the previous chunk.
repeated uint32 token_ids = 1;

uint32 prompt_tokens = 2;
uint32 completion_tokens = 3;
uint32 cached_tokens = 4;

OutputLogProbs output_logprobs = 5;

// For ordering when n>1.
uint32 index = 6;
}

message GenerateComplete {
repeated uint32 output_ids = 1;

// OpenAI-compatible: "stop", "length", "abort", "tool_calls".
string finish_reason = 2;

uint32 prompt_tokens = 3;
uint32 completion_tokens = 4;
uint32 cached_tokens = 5;

OutputLogProbs output_logprobs = 6;

// Which stop matched (for clients that care which `stop` triggered).
oneof matched_stop {
uint32 matched_token_id = 7;
string matched_stop_str = 8;
}

uint32 index = 9;
}

message OutputLogProbs {
repeated float token_logprobs = 1;
repeated uint32 token_ids = 2;
repeated TopLogProbs top_logprobs = 3;
}

message TopLogProbs {
repeated float values = 1;
repeated uint32 token_ids = 2;
}

// =====================
// Management
// =====================

message HealthCheckRequest {}
message HealthCheckResponse {
bool healthy = 1;
string message = 2;
}

message AbortRequest {
string request_id = 1;
string reason = 2;
}
message AbortResponse {
bool success = 1;
string message = 2;
}

// =====================
// Model & Server Info
// =====================

message GetModelInfoRequest {}
message GetModelInfoResponse {
string model_path = 1;
string tokenizer_path = 2;
string served_model_name = 3;
string model_type = 4;
repeated string architectures = 5;

int32 max_context_length = 6;
int32 max_req_input_len = 7;
int32 vocab_size = 8;

repeated int32 eos_token_ids = 9;
int32 pad_token_id = 10;
int32 bos_token_id = 11;

string weight_version = 12;
string preferred_sampling_params = 13; // JSON string or empty
}

message GetServerInfoRequest {}
message GetServerInfoResponse {
google.protobuf.Struct server_args = 1;
google.protobuf.Struct scheduler_info = 2;

int32 active_requests = 3;
bool is_paused = 4;
double uptime_seconds = 5;
int32 max_total_num_tokens = 6;

string tokenspeed_version = 7;
google.protobuf.Timestamp start_time = 8;
}

// =====================
// Loads
// =====================

message GetLoadsRequest {
optional int32 dp_rank = 1;
// Sections: "core" (default), "memory", "queues". Pass "all" for everything.
repeated string include = 2;
}

message GetLoadsResponse {
string timestamp = 1;
string version = 2;
int32 dp_rank_count = 3;
repeated SchedulerLoad loads = 4;
AggregateMetrics aggregate = 5;
}

message SchedulerLoad {
int32 dp_rank = 1;

int32 num_running_reqs = 2;
int32 num_waiting_reqs = 3;
int32 num_total_reqs = 4;
int32 num_used_tokens = 5;
int32 max_total_num_tokens = 6;
int32 max_running_requests = 7;

double token_usage = 8;
double gen_throughput = 9;
double cache_hit_rate = 10;
double utilization = 11;

optional MemoryMetrics memory = 12;
optional QueueMetrics queues = 13;
}

message MemoryMetrics {
double weight_gb = 1;
double kv_cache_gb = 2;
double graph_gb = 3;
int32 token_capacity = 4;
}

message QueueMetrics {
int32 waiting = 1;
int32 grammar = 2;
int32 paused = 3;
int32 retracted = 4;
}

message AggregateMetrics {
int32 total_running_reqs = 1;
int32 total_waiting_reqs = 2;
int32 total_reqs = 3;
double avg_token_usage = 4;
double avg_throughput = 5;
double avg_utilization = 6;
}
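The `optional` markers in `SamplingParams` are the load-bearing detail of this file. prost generates each proto3 `optional` scalar as an `Option<T>`, so explicit zeros survive on the wire exactly as the comment above describes. A sketch, assuming the crate is named `grpc_client` and re-exports `tokenspeed_proto` as shown in `lib.rs` below:

```rust
// Sketch only: field and type names follow the proto above.
use grpc_client::tokenspeed_proto::SamplingParams;

fn main() {
    let greedy = SamplingParams {
        temperature: Some(0.0),    // explicit zero = greedy decoding, preserved on the wire
        top_k: Some(-1),           // explicit "disabled" sentinel, also preserved
        max_new_tokens: Some(128),
        min_new_tokens: 0,         // non-optional: 0 already means "no minimum"
        ..Default::default()       // unset fields encode as absent -> engine defaults apply
    };
    // Distinguishable from a request that never set temperature at all (None).
    assert!(greedy.temperature.is_some());
}
```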
6 changes: 5 additions & 1 deletion crates/grpc_client/python/smg_grpc_proto/__init__.py
@@ -1,4 +1,4 @@
"""SMG gRPC Proto - Protocol definitions for SGLang, vLLM, TRT-LLM, and MLX."""
"""SMG gRPC Proto - Protocol definitions for SGLang, TokenSpeed, vLLM, TRT-LLM, and MLX."""

from importlib.metadata import version

@@ -14,6 +14,8 @@
sglang_encoder_pb2_grpc,
sglang_scheduler_pb2,
sglang_scheduler_pb2_grpc,
tokenspeed_scheduler_pb2,
tokenspeed_scheduler_pb2_grpc,
trtllm_service_pb2,
trtllm_service_pb2_grpc,
vllm_engine_pb2,
@@ -25,6 +27,8 @@
"sglang_scheduler_pb2_grpc",
"sglang_encoder_pb2",
"sglang_encoder_pb2_grpc",
"tokenspeed_scheduler_pb2",
"tokenspeed_scheduler_pb2_grpc",
"vllm_engine_pb2",
"vllm_engine_pb2_grpc",
"trtllm_service_pb2",
8 changes: 8 additions & 0 deletions crates/grpc_client/src/lib.rs
@@ -8,8 +8,10 @@ pub mod common_proto {
tonic::include_proto!("smg.grpc.common");
}
pub mod mlx_engine;
pub mod sampling_params;
pub mod sglang_scheduler;
pub mod tokenizer_bundle;
pub mod tokenspeed_scheduler;
pub mod trtllm_service;
pub mod vllm_engine;

@@ -18,6 +20,12 @@

pub use mlx_engine::{proto as mlx_proto, MlxEngineClient};
pub use sglang_scheduler::{proto as sglang_proto, SglangSchedulerClient};
// TokenSpeed has a fully independent wire definition (see
// ``proto/tokenspeed_scheduler.proto``) — distinct service, distinct
// messages with intentionally trimmed field sets aimed at top-tier LLM
// workloads. The client wraps that wire and translates to/from SGLang-shaped
// types at the boundary so the router's dispatch enums don't proliferate.
pub use tokenspeed_scheduler::{tokenspeed_proto, TokenSpeedSchedulerClient};
use tonic::metadata::MetadataMap;
pub use trtllm_service::{proto as trtllm_proto, TrtllmServiceClient};
pub use vllm_engine::{proto as vllm_proto, VllmEngineClient};
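As a sketch of what `GetLoads` enables on the router side (it returns the per-DP-rank `SchedulerLoad` records defined in the proto above), here is one possible least-load selection. The scoring heuristic is illustrative, not the router's actual policy:

```rust
// Sketch only: pick the dp_rank with the fewest queued + running requests,
// breaking ties on token_usage.
use grpc_client::tokenspeed_proto::GetLoadsResponse;

fn least_loaded_rank(loads: &GetLoadsResponse) -> Option<i32> {
    loads
        .loads
        .iter()
        .min_by(|a, b| {
            let qa = a.num_running_reqs + a.num_waiting_reqs;
            let qb = b.num_running_reqs + b.num_waiting_reqs;
            qa.cmp(&qb).then(a.token_usage.total_cmp(&b.token_usage))
        })
        .map(|l| l.dp_rank)
}
```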