Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions pkg/llm-d-inference-sim/simulator.go
Original file line number Diff line number Diff line change
Expand Up @@ -576,14 +576,16 @@ func (s *VllmSimulator) createCompletionResponse(logprobs *int, isChatCompletion
time.Now().Unix(), modelName, usageData)

if doRemoteDecode {
baseResp.KVParams = &openaiserverapi.KVTransferParams{}
// add the special fields related to the prefill pod's special behavior
baseResp.DoRemoteDecode = true
baseResp.DoRemotePrefill = false
baseResp.KVParams.DoRemoteDecode = false
baseResp.KVParams.DoRemotePrefill = true
// currently remote prefill information is hard-coded
baseResp.RemoteBlockIds = []string{"DUMMY_ID"}
baseResp.RemoteEngineId = "DUMMY_ID"
baseResp.RemoteHost = "DUMMY"
baseResp.RemotePort = 1234
baseResp.KVParams.RemoteBlockIds = []string{"DUMMY_ID"}
baseResp.KVParams.RemoteEngineId = "DUMMY_ID"
baseResp.KVParams.RemoteHost = "DUMMY"
baseResp.KVParams.RemotePort = 1234
baseResp.KVParams.TPSize = 1
}

baseChoice := openaiserverapi.CreateBaseResponseChoice(0, finishReason)
Expand Down
22 changes: 14 additions & 8 deletions pkg/openai-server-api/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,22 +91,28 @@ type baseCompletionRequest struct {
StreamOptions StreamOptions `json:"stream_options"`
// Model defines the model name to use for inference; it can be the base model name or one of the available LoRA adapters
Model string `json:"model"`
// KVParams holds the KV transfer related fields
KVParams *KVTransferParams `json:"kv_transfer_params"`
// The number of tokens in the prompt that are in the local KV Cache
cachedPromptTokens int
// IgnoreEOS is a boolean value, true when the model should ignore end-of-sequence tokens
IgnoreEOS bool `json:"ignore_eos"`
}

type KVTransferParams struct {
// DoRemoteDecode is a boolean value, true when the request's decode will be done on a remote pod
DoRemoteDecode bool `json:"do_remote_decode"`
// DoRemotePrefill is a boolean value, true when the request's prefill was done on a remote pod
DoRemotePrefill bool `json:"do_remote_prefill"`
// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
RemoteBlockIds []string `json:"remote_block_ids"`
// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
RemoteEngineId string `json:"remote_engine_id"`
// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
RemoteBlockIds []string `json:"remote_block_ids"`
// RemoteHost is a hostname or IP address of the remote server handling prefill
RemoteHost string `json:"remote_host"`
// RemotePort is a port of the remote server handling prefill
RemotePort int `json:"remote_port"`
// The number of tokens in the prompt that are in the local KV Cache
cachedPromptTokens int
// IgnoreEOS is a boolean value, true when the model should ignore end-of-sequence tokens
IgnoreEOS bool `json:"ignore_eos"`
TPSize int `json:"tp_size" default:"1"`
}

// StreamOptions defines streaming options for streaming requests
Expand All @@ -132,11 +138,11 @@ func (b *baseCompletionRequest) IncludeUsage() bool {
}

func (b *baseCompletionRequest) IsDoRemoteDecode() bool {
return b.DoRemoteDecode
return b.KVParams != nil && b.KVParams.DoRemoteDecode
}

func (b *baseCompletionRequest) IsDoRemotePrefill() bool {
return b.DoRemotePrefill
return b.KVParams != nil && b.KVParams.DoRemotePrefill
}

// GetNumberOfCachedPromptTokens returns the number of tokens in the prompt that are
Expand Down
14 changes: 2 additions & 12 deletions pkg/openai-server-api/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,8 @@ type baseCompletionResponse struct {
Usage *Usage `json:"usage"`
// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
Object string `json:"object"`
// DoRemoteDecode boolean value, true when request's decode will be done on remote pod
DoRemoteDecode bool `json:"do_remote_decode"`
// DoRemotePrefill boolean value, true when request's prefill was done on remote pod
DoRemotePrefill bool `json:"do_remote_prefill"`
// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
RemoteBlockIds []string `json:"remote_block_ids"`
// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
RemoteEngineId string `json:"remote_engine_id"`
// RemoteHost is a hostname or IP address of the remote server handling prefill
RemoteHost string `json:"remote_host"`
// RemotePort is a port of the remote server handling prefill
RemotePort int `json:"remote_port"`
// KVParams holds the KV transfer related fields
KVParams *KVTransferParams `json:"kv_transfer_params"`
}

// Usage contains token Usage statistics
Expand Down
Loading