mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-03-12 02:00:41 +01:00
server: moved self-call into speculative.cpp
This commit is contained in:
parent
1fb2658b0d
commit
1faeb628db
@ -187,6 +187,18 @@ llama_tokens common_speculative_gen_draft(
|
|||||||
struct common_speculative_params params,
|
struct common_speculative_params params,
|
||||||
const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
|
const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
|
||||||
llama_token id_last) {
|
llama_token id_last) {
|
||||||
|
if (params.self_mode == 1) {
|
||||||
|
// Look in the current context for an n-gram and return the following tokens as the draft.
|
||||||
|
llama_tokens draft_self = common_speculative_gen_self_draft(prompt_tgt_main_model, id_last,
|
||||||
|
params.self_ngram_size, params.n_draft);
|
||||||
|
if (!draft_self.empty()) {
|
||||||
|
return draft_self;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (spec == nullptr) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
auto & batch = spec->batch;
|
auto & batch = spec->batch;
|
||||||
auto & ctx_tgt = spec->ctx_tgt;
|
auto & ctx_tgt = spec->ctx_tgt;
|
||||||
auto & ctx_dft = spec->ctx_dft;
|
auto & ctx_dft = spec->ctx_dft;
|
||||||
|
|||||||
@ -10,6 +10,9 @@ struct common_speculative_params {
|
|||||||
int n_reuse = 256;
|
int n_reuse = 256;
|
||||||
|
|
||||||
float p_min = 0.75f; // min probability required to accept a token in the draft
|
float p_min = 0.75f; // min probability required to accept a token in the draft
|
||||||
|
|
||||||
|
int self_mode = 0; // 0: off, 1: self speculative lookup
|
||||||
|
int self_ngram_size = 12; // length of pattern to search for in self mode
|
||||||
};
|
};
|
||||||
|
|
||||||
struct common_speculative * common_speculative_init(
|
struct common_speculative * common_speculative_init(
|
||||||
|
|||||||
@ -2066,23 +2066,14 @@ private:
|
|||||||
GGML_ABORT("not supported by multimodal");
|
GGML_ABORT("not supported by multimodal");
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_tokens draft = {};
|
|
||||||
|
|
||||||
if (slot.task->params.speculative.use_self) {
|
|
||||||
// we search at least 5 tokens in history to try a self-speculative draft
|
|
||||||
const int n_draft_min = std::max(5, slot.task->params.speculative.n_min);
|
|
||||||
const llama_tokens & tokens = slot.prompt.tokens.get_text_tokens();
|
|
||||||
llama_token id = slot.sampled;
|
|
||||||
draft = common_speculative_gen_self_draft(tokens, id, n_draft_min, n_draft_max);
|
|
||||||
}
|
|
||||||
if (draft.empty() && slot.can_speculate()) {
|
|
||||||
struct common_speculative_params params_spec;
|
struct common_speculative_params params_spec;
|
||||||
params_spec.n_draft = n_draft_max;
|
params_spec.n_draft = n_draft_max;
|
||||||
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max;
|
params_spec.n_reuse = slot.ctx_dft ? (llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max) : 0;
|
||||||
params_spec.p_min = slot.task->params.speculative.p_min;
|
params_spec.p_min = slot.task->params.speculative.p_min;
|
||||||
|
params_spec.self_mode = slot.task->params.speculative.use_self;
|
||||||
|
params_spec.self_ngram_size = std::max(5, slot.task->params.speculative.n_min);
|
||||||
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
|
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
|
||||||
draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
|
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
|
||||||
}
|
|
||||||
|
|
||||||
// add the sampled token to the batch
|
// add the sampled token to the batch
|
||||||
slot.i_batch_dft.push_back(batch.n_tokens);
|
slot.i_batch_dft.push_back(batch.n_tokens);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user