llm-d · github-actions · Nov 19, 2025 · Nov 18, 2025 · Nov 19, 2025
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -576,14 +576,16 @@ func (s *VllmSimulator) createCompletionResponse(logprobs *int, isChatCompletion
 		time.Now().Unix(), modelName, usageData)
 
 	if doRemoteDecode {
+		baseResp.KVParams = &openaiserverapi.KVTransferParams{}
 		// add special fields related to the prefill pod special behavior
-		baseResp.DoRemoteDecode = true
-		baseResp.DoRemotePrefill = false
+		baseResp.KVParams.DoRemoteDecode = false
+		baseResp.KVParams.DoRemotePrefill = true
 		// currently remote prefill information is hard-coded
-		baseResp.RemoteBlockIds = []string{"DUMMY_ID"}
-		baseResp.RemoteEngineId = "DUMMY_ID"
-		baseResp.RemoteHost = "DUMMY"
-		baseResp.RemotePort = 1234
+		baseResp.KVParams.RemoteBlockIds = []string{"DUMMY_ID"}
+		baseResp.KVParams.RemoteEngineId = "DUMMY_ID"
+		baseResp.KVParams.RemoteHost = "DUMMY"
+		baseResp.KVParams.RemotePort = 1234
+		baseResp.KVParams.TPSize = 1
 	}
 
 	baseChoice := openaiserverapi.CreateBaseResponseChoice(0, finishReason)

diff --git a/pkg/openai-server-api/request.go b/pkg/openai-server-api/request.go
@@ -91,22 +91,28 @@ type baseCompletionRequest struct {
 	StreamOptions StreamOptions `json:"stream_options"`
 	// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
 	Model string `json:"model"`
+	// KVParams holds the KV-transfer related fields (JSON key "kv_transfer_params")
+	KVParams *KVTransferParams `json:"kv_transfer_params"`
+	// The number of tokens in the prompt that are in the local KV Cache
+	cachedPromptTokens int
+	// IgnoreEOS is a boolean value, true when the model should ignore end-of-sequence tokens
+	IgnoreEOS bool `json:"ignore_eos"`
+}
+
+type KVTransferParams struct { // KVTransferParams groups the KV-transfer fields exchanged under the "kv_transfer_params" JSON key.
 	// DoRemoteDecode boolean value, true when request's decode will be done on remote pod
 	DoRemoteDecode bool `json:"do_remote_decode"`
 	// DoRemotePrefill boolean value, true when request's prefill was done on remote pod
 	DoRemotePrefill bool `json:"do_remote_prefill"`
-	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
-	RemoteBlockIds []string `json:"remote_block_ids"`
 	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
 	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
 	// RemoteHost is a hostname or IP address of the remote server handling prefill
 	RemoteHost string `json:"remote_host"`
 	// RemotePort is a port of the remote server handling prefill
 	RemotePort int `json:"remote_port"`
-	// The number of tokens in the prompt that are in the local KV Cache
-	cachedPromptTokens int
-	// IgnoreEOS is a boolean value, true when the model should ignore end-of-sequence tokens
-	IgnoreEOS bool `json:"ignore_eos"`
+	TPSize     int `json:"tp_size" default:"1"` // TPSize is serialized as "tp_size"; presumably the tensor-parallel size — TODO confirm. NOTE(review): encoding/json does not interpret a `default` tag, so an absent tp_size would decode to 0 unless other code applies the default — verify.
 }
 
 // StreamOptions defines streaming options for streaming requests
@@ -132,11 +138,11 @@ func (b *baseCompletionRequest) IncludeUsage() bool {
 }
 
 func (b *baseCompletionRequest) IsDoRemoteDecode() bool {
-	return b.DoRemoteDecode
+	return b.KVParams != nil && b.KVParams.DoRemoteDecode // nil check short-circuits, so a nil KVParams yields false instead of a nil-pointer dereference
 }
 
 func (b *baseCompletionRequest) IsDoRemotePrefill() bool {
-	return b.DoRemotePrefill
+	return b.KVParams != nil && b.KVParams.DoRemotePrefill // nil check short-circuits, so a nil KVParams yields false instead of a nil-pointer dereference
 }
 
 // GetNumberOfCachedPromptTokens returns the number of tokens in the prompt that are

diff --git a/pkg/openai-server-api/response.go b/pkg/openai-server-api/response.go
@@ -40,18 +40,8 @@ type baseCompletionResponse struct {
 	Usage *Usage `json:"usage"`
 	// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
 	Object string `json:"object"`
-	// DoRemoteDecode boolean value, true when request's decode will be done on remote pod
-	DoRemoteDecode bool `json:"do_remote_decode"`
-	// DoRemotePrefill boolean value, true when request's prefill was done on remote pod
-	DoRemotePrefill bool `json:"do_remote_prefill"`
-	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
-	RemoteBlockIds []string `json:"remote_block_ids"`
-	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
-	RemoteEngineId string `json:"remote_engine_id"`
-	// RemoteHost is a hostname or IP address of the remote server handling prefill
-	RemoteHost string `json:"remote_host"`
-	// RemotePort is a port of the remote server handling prefill
-	RemotePort int `json:"remote_port"`
+	// KVParams holds the KV-transfer related fields (JSON key "kv_transfer_params")
+	KVParams *KVTransferParams `json:"kv_transfer_params"`
 }
 
 // Usage contains token Usage statistics