
Commit f227e91

feat(llama.cpp): Bump llama.cpp, adapt grpc server (#1211)
* feat(llama.cpp): Bump llama.cpp, adapt grpc server

Signed-off-by: Ettore Di Giacinto <[email protected]>

* ci: fixups

Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent c132dba commit f227e91
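
Summary: the llama.cpp bump moves the special-token getters (llama_token_bos, llama_token_eos, llama_token_prefix, llama_token_middle, llama_token_suffix) from taking a llama_context to taking a llama_model, and the gRPC server call sites are adapted accordingly, as the grpc-server.cpp diff below shows. The CI fixup gives the GPU e2e workflow a per-branch test directory that is created before the run and removed on teardown.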

File tree

3 files changed, +13 -10 lines changed

.github/workflows/test-gpu.yml (+4 -1)

@@ -40,6 +40,8 @@ jobs:
           if [ ! -e /run/systemd/system ]; then
             sudo mkdir /run/systemd/system
           fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
           make \
             TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
             BUILD_TYPE=cublas \
@@ -57,4 +59,5 @@ jobs:
           make \
             TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
             teardown-e2e || true
-          docker system prune -f -a --volumes || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }}
+          docker system prune -f -a --volumes || true
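
In short: the test directory under /host/tests/ that make already receives via TEST_DIR (keyed on the branch ref) is now created world-writable before the run and deleted during teardown, ahead of the existing Docker prune.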

Makefile (+1 -1)

@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
 
-CPPLLAMA_VERSION?=96981f37b1e3f450d9e63e571514217bf60f0a7f
+CPPLLAMA_VERSION?=9d02956443e5c1ded29b7b5ed8a21bc01ba6f563
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
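
CPPLLAMA_VERSION is the pinned llama.cpp commit; bumping it is what pulls in the API change handled in grpc-server.cpp below. (That the variable is consumed as a git checkout ref when building the C++ backend is an assumption from the Makefile's conventions for the other *_VERSION pins.)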

backend/cpp/llama/grpc-server.cpp (+8 -8)

@@ -275,11 +275,11 @@ struct llama_server_context
         if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
             suffix_tokens.erase(suffix_tokens.begin());
         }
-        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
-        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
-        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
         prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
-        prefix_tokens.push_back(llama_token_middle(ctx));
+        prefix_tokens.push_back(llama_token_middle(model));
 
         auto prompt_tokens = prefix_tokens;
 
@@ -419,7 +419,7 @@ struct llama_server_context
         if (params.n_predict == 0)
         {
             has_next_token = false;
-            result.tok = llama_token_eos(ctx);
+            result.tok = llama_token_eos(model);
             return result;
         }
 
@@ -453,7 +453,7 @@ struct llama_server_context
         // decrement remaining sampling budget
         --n_remain;
 
-        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
+        if (!embd.empty() && embd.back() == llama_token_eos(model))
         {
             // stopping_word = llama_token_to_piece(ctx, embd.back());
             has_next_token = false;
@@ -594,7 +594,7 @@ static void parse_options_completion(bool streaming,const backend::PredictOption
 
     if (predict->ignoreeos())
     {
-        llama.params.sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+        llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
     }
 
     // const auto &logit_bias = body.find("logit_bias");
@@ -676,7 +676,7 @@ static void params_parse(const backend::ModelOptions* request,
 }
 
 static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.model);
 }
 
 // Function matching type llama_beam_search_callback_fn_t.
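
For context, a minimal sketch of the call-site adaptation, assuming llama.h from the pinned llama.cpp commit; the declaration is reconstructed from the call sites above, not copied from the header:

    #include "llama.h"

    // Sketch only: fetch the end-of-sentence token id under the new API.
    llama_token eos_for(const llama_model * model) {
        // Before the bump this was llama_token_eos(ctx); the special-token
        // getters now take the model, which owns the vocabulary, so they
        // can be queried without a live inference context.
        return llama_token_eos(model);
    }

Every call site in grpc-server.cpp makes the same one-argument swap, including the logit-bias entry that suppresses EOS when ignoreeos is set.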
