mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-03-13 02:30:36 +01:00
common : better names
This commit is contained in:
parent
f895bca71a
commit
a3300937e5
@ -3415,42 +3415,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-ngram-size-n"}, "N",
|
||||
string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.spec_ngram_size_n),
|
||||
string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),
|
||||
[](common_params & params, int value) {
|
||||
if (value < 1 || value > 1024) {
|
||||
throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
|
||||
}
|
||||
params.speculative.spec_ngram_size_n = value;
|
||||
params.speculative.ngram_size_n = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-ngram-size-m"}, "N",
|
||||
string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.spec_ngram_size_m),
|
||||
string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m),
|
||||
[](common_params & params, int value) {
|
||||
if (value < 1 || value > 1024) {
|
||||
throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
|
||||
}
|
||||
params.speculative.spec_ngram_size_m = value;
|
||||
params.speculative.ngram_size_m = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-ngram-check-rate"}, "N",
|
||||
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_check_rate),
|
||||
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
|
||||
[](common_params & params, int value) {
|
||||
if (value < 1) {
|
||||
throw std::invalid_argument("ngram check rate must be at least 1");
|
||||
}
|
||||
params.speculative.spec_ngram_check_rate = value;
|
||||
params.speculative.ngram_check_rate = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-ngram-min-hits"}, "N",
|
||||
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_min_hits),
|
||||
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
|
||||
[](common_params & params, int value) {
|
||||
if (value < 1) {
|
||||
throw std::invalid_argument("ngram min hits must be at least 1");
|
||||
}
|
||||
params.speculative.spec_ngram_min_hits = value;
|
||||
params.speculative.ngram_min_hits = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
|
||||
@ -261,6 +261,7 @@ struct common_params_speculative {
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
|
||||
@ -276,10 +277,10 @@ struct common_params_speculative {
|
||||
|
||||
common_speculative_type draftless_type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding without a draft model
|
||||
|
||||
uint16_t spec_ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t spec_ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t spec_ngram_check_rate = 1; // check rate for ngram lookup
|
||||
uint16_t spec_ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
|
||||
@ -281,11 +281,12 @@ struct common_speculative {
|
||||
};
|
||||
|
||||
static common_ngram_map get_common_ngram_map(const common_speculative_config & config) {
|
||||
uint16_t size_key = config.params.spec_ngram_size_n;
|
||||
uint16_t size_value = config.params.spec_ngram_size_m;
|
||||
uint16_t size_key = config.params.ngram_size_n;
|
||||
uint16_t size_value = config.params.ngram_size_m;
|
||||
bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
|
||||
uint16_t check_rate = config.params.spec_ngram_check_rate;
|
||||
uint16_t min_hits = config.params.spec_ngram_min_hits;
|
||||
uint16_t check_rate = config.params.ngram_check_rate;
|
||||
uint16_t min_hits = config.params.ngram_min_hits;
|
||||
|
||||
return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits);
|
||||
}
|
||||
|
||||
@ -303,7 +304,6 @@ static struct common_speculative_state_ngram_cache create_state_ngram_cache(
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
std::string common_speculative_type_name_str() {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < common_speculative_types.size(); i++) {
|
||||
|
||||
@ -78,10 +78,10 @@ json task_params::to_json(bool only_metrics) const {
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
{"speculative.p_min", speculative.p_min},
|
||||
{"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)},
|
||||
{"speculative.ngram_size_n", speculative.spec_ngram_size_n},
|
||||
{"speculative.ngram_size_m", speculative.spec_ngram_size_m},
|
||||
{"speculative.ngram_c_rate", speculative.spec_ngram_check_rate},
|
||||
{"speculative.ngram_m_hits", speculative.spec_ngram_min_hits},
|
||||
{"speculative.ngram_size_n", speculative.ngram_size_n},
|
||||
{"speculative.ngram_size_m", speculative.ngram_size_m},
|
||||
{"speculative.ngram_c_rate", speculative.ngram_check_rate},
|
||||
{"speculative.ngram_m_hits", speculative.ngram_min_hits},
|
||||
{"timings_per_token", timings_per_token},
|
||||
{"post_sampling_probs", post_sampling_probs},
|
||||
{"backend_sampling", sampling.backend_sampling},
|
||||
@ -142,10 +142,10 @@ json task_params::to_json(bool only_metrics) const {
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
{"speculative.p_min", speculative.p_min},
|
||||
{"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)},
|
||||
{"speculative.ngram_size_n", speculative.spec_ngram_size_n},
|
||||
{"speculative.ngram_size_m", speculative.spec_ngram_size_m},
|
||||
{"speculative.ngram_c_rate", speculative.spec_ngram_check_rate},
|
||||
{"speculative.ngram_m_hits", speculative.spec_ngram_min_hits},
|
||||
{"speculative.ngram_size_n", speculative.ngram_size_n},
|
||||
{"speculative.ngram_size_m", speculative.ngram_size_m},
|
||||
{"speculative.ngram_c_rate", speculative.ngram_check_rate},
|
||||
{"speculative.ngram_m_hits", speculative.ngram_min_hits},
|
||||
{"timings_per_token", timings_per_token},
|
||||
{"post_sampling_probs", post_sampling_probs},
|
||||
{"backend_sampling", sampling.backend_sampling},
|
||||
@ -254,15 +254,15 @@ task_params server_task::params_from_json_cmpl(
|
||||
params.speculative.n_max = std::max(params.speculative.n_max, 0);
|
||||
|
||||
params.speculative.draftless_type = common_speculative_type_from_name(json_value(data, "speculative.draftless_t", common_speculative_type_to_str(defaults.speculative.draftless_type)));
|
||||
params.speculative.spec_ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.spec_ngram_size_n);
|
||||
params.speculative.spec_ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.spec_ngram_size_m);
|
||||
params.speculative.spec_ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.spec_ngram_check_rate);
|
||||
params.speculative.spec_ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.spec_ngram_min_hits);
|
||||
params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n);
|
||||
params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m);
|
||||
params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate);
|
||||
params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits);
|
||||
|
||||
params.speculative.spec_ngram_size_n = std::max(std::min(1, (int) params.speculative.spec_ngram_size_n), 1024);
|
||||
params.speculative.spec_ngram_size_m = std::max(std::min(1, (int) params.speculative.spec_ngram_size_m), 1024);
|
||||
params.speculative.spec_ngram_check_rate = std::max(std::min(1, (int) params.speculative.spec_ngram_check_rate), 1024);
|
||||
params.speculative.spec_ngram_min_hits = std::max(std::min(1, (int) params.speculative.spec_ngram_min_hits), 1024);
|
||||
params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024);
|
||||
params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024);
|
||||
params.speculative.ngram_check_rate = std::max(std::min(1, (int) params.speculative.ngram_check_rate), 1024);
|
||||
params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024);
|
||||
|
||||
// Use OpenAI API logprobs only if n_probs wasn't provided
|
||||
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
|
||||
|
||||
Loading…
Reference in New Issue
Block a user