Skip to content

Commit e49ea01

Browse files
authored
feat(llama.cpp): add flash_attention and no_kv_offloading (#2310)
feat(llama.cpp): add flash_attn and no_kv_offload

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 7123d07 commit e49ea01

File tree

4 files changed

+11
-0
lines changed

4 files changed

+11
-0
lines changed

backend/backend.proto

+3
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,9 @@ message ModelOptions {
212212
float YarnBetaSlow = 47;
213213

214214
string Type = 49;
215+
216+
bool FlashAttention = 56;
217+
bool NoKVOffload = 57;
215218
}
216219

217220
message Result {

backend/cpp/llama/grpc-server.cpp

+3
Original file line number | Diff line number | Diff line change
@@ -2254,6 +2254,9 @@ static void params_parse(const backend::ModelOptions* request,
22542254
}
22552255
params.use_mlock = request->mlock();
22562256
params.use_mmap = request->mmap();
2257+
params.flash_attn = request->flashattention();
2258+
params.no_kv_offload = request->nokvoffload();
2259+
22572260
params.embedding = request->embeddings();
22582261

22592262
if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }

core/backend/options.go

+2
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
7777
MaxModelLen: int32(c.MaxModelLen),
7878
TensorParallelSize: int32(c.TensorParallelSize),
7979
MMProj: c.MMProj,
80+
FlashAttention: c.FlashAttention,
81+
NoKVOffload: c.NoKVOffloading,
8082
YarnExtFactor: c.YarnExtFactor,
8183
YarnAttnFactor: c.YarnAttnFactor,
8284
YarnBetaFast: c.YarnBetaFast,

core/config/backend_config.go

+3
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,9 @@ type LLMConfig struct {
132132
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
133133
MMProj string `yaml:"mmproj"`
134134

135+
FlashAttention bool `yaml:"flash_attention"`
136+
NoKVOffloading bool `yaml:"no_kv_offloading"`
137+
135138
RopeScaling string `yaml:"rope_scaling"`
136139
ModelType string `yaml:"type"`
137140

0 commit comments

Comments
 (0)