From a3300937e510e676f1c536eaca2105a954f4e143 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jan 2026 13:59:08 +0200 Subject: [PATCH] common : better names --- common/arg.cpp | 16 ++++++++-------- common/common.h | 9 +++++---- common/speculative.cpp | 10 +++++----- tools/server/server-task.cpp | 32 ++++++++++++++++---------------- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index ae052f7a5a..3cac36118b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3415,42 +3415,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-size-n"}, "N", - string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.spec_ngram_size_n), + string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive"); } - params.speculative.spec_ngram_size_n = value; + params.speculative.ngram_size_n = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-size-m"}, "N", - string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.spec_ngram_size_m), + string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); } - params.speculative.spec_ngram_size_m = value; + params.speculative.ngram_size_m = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( 
{"--spec-ngram-check-rate"}, "N", - string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_check_rate), + string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate), [](common_params & params, int value) { if (value < 1) { throw std::invalid_argument("ngram check rate must be at least 1"); } - params.speculative.spec_ngram_check_rate = value; + params.speculative.ngram_check_rate = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-min-hits"}, "N", - string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_min_hits), + string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits), [](common_params & params, int value) { if (value < 1) { throw std::invalid_argument("ngram min hits must be at least 1"); } - params.speculative.spec_ngram_min_hits = value; + params.speculative.ngram_min_hits = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index dbd4a08062..9d3f880b84 100644 --- a/common/common.h +++ b/common/common.h @@ -261,6 +261,7 @@ struct common_params_speculative { int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) float p_split = 0.1f; // speculative decoding split probability float p_min = 0.75f; // minimum speculative decoding probability (greedy) + std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements std::vector<llama_model_tensor_buft_override> tensor_buft_overrides; @@ -276,10 +277,10 @@ struct common_params_speculative { common_speculative_type draftless_type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding without a draft model - uint16_t spec_ngram_size_n = 12; // ngram size for lookup - uint16_t spec_ngram_size_m = 48; // mgram size for speculative tokens - 
uint16_t spec_ngram_check_rate = 1; // check rate for ngram lookup - uint16_t spec_ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed + uint16_t ngram_size_n = 12; // ngram size for lookup + uint16_t ngram_size_m = 48; // mgram size for speculative tokens + uint16_t ngram_check_rate = 1; // check rate for ngram lookup + uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT diff --git a/common/speculative.cpp b/common/speculative.cpp index 7092ea27d9..8aeeb6ba2f 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -281,11 +281,12 @@ struct common_speculative { }; static common_ngram_map get_common_ngram_map(const common_speculative_config & config) { - uint16_t size_key = config.params.spec_ngram_size_n; - uint16_t size_value = config.params.spec_ngram_size_m; + uint16_t size_key = config.params.ngram_size_n; + uint16_t size_value = config.params.ngram_size_m; bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); - uint16_t check_rate = config.params.spec_ngram_check_rate; - uint16_t min_hits = config.params.spec_ngram_min_hits; + uint16_t check_rate = config.params.ngram_check_rate; + uint16_t min_hits = config.params.ngram_min_hits; + return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); } @@ -303,7 +304,6 @@ static struct common_speculative_state_ngram_cache create_state_ngram_cache( return state; } - std::string common_speculative_type_name_str() { std::string result; for (size_t i = 0; i < common_speculative_types.size(); i++) { diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 86cb13f85f..b9a7602292 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -78,10 +78,10 @@ json 
task_params::to_json(bool only_metrics) const { {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, {"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)}, - {"speculative.ngram_size_n", speculative.spec_ngram_size_n}, - {"speculative.ngram_size_m", speculative.spec_ngram_size_m}, - {"speculative.ngram_c_rate", speculative.spec_ngram_check_rate}, - {"speculative.ngram_m_hits", speculative.spec_ngram_min_hits}, + {"speculative.ngram_size_n", speculative.ngram_size_n}, + {"speculative.ngram_size_m", speculative.ngram_size_m}, + {"speculative.ngram_c_rate", speculative.ngram_check_rate}, + {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, {"backend_sampling", sampling.backend_sampling}, @@ -142,10 +142,10 @@ json task_params::to_json(bool only_metrics) const { {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, {"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)}, - {"speculative.ngram_size_n", speculative.spec_ngram_size_n}, - {"speculative.ngram_size_m", speculative.spec_ngram_size_m}, - {"speculative.ngram_c_rate", speculative.spec_ngram_check_rate}, - {"speculative.ngram_m_hits", speculative.spec_ngram_min_hits}, + {"speculative.ngram_size_n", speculative.ngram_size_n}, + {"speculative.ngram_size_m", speculative.ngram_size_m}, + {"speculative.ngram_c_rate", speculative.ngram_check_rate}, + {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, {"backend_sampling", sampling.backend_sampling}, @@ -254,15 +254,15 @@ task_params server_task::params_from_json_cmpl( params.speculative.n_max = std::max(params.speculative.n_max, 0); params.speculative.draftless_type = common_speculative_type_from_name(json_value(data, "speculative.draftless_t", 
common_speculative_type_to_str(defaults.speculative.draftless_type))); - params.speculative.spec_ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.spec_ngram_size_n); - params.speculative.spec_ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.spec_ngram_size_m); - params.speculative.spec_ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.spec_ngram_check_rate); - params.speculative.spec_ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.spec_ngram_min_hits); + params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); + params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); + params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate); + params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); - params.speculative.spec_ngram_size_n = std::max(std::min(1, (int) params.speculative.spec_ngram_size_n), 1024); - params.speculative.spec_ngram_size_m = std::max(std::min(1, (int) params.speculative.spec_ngram_size_m), 1024); - params.speculative.spec_ngram_check_rate = std::max(std::min(1, (int) params.speculative.spec_ngram_check_rate), 1024); - params.speculative.spec_ngram_min_hits = std::max(std::min(1, (int) params.speculative.spec_ngram_min_hits), 1024); + params.speculative.ngram_size_n = std::min(std::max(1, (int) params.speculative.ngram_size_n), 1024); + params.speculative.ngram_size_m = std::min(std::max(1, (int) params.speculative.ngram_size_m), 1024); + params.speculative.ngram_check_rate = std::min(std::max(1, (int) params.speculative.ngram_check_rate), 1024); + params.speculative.ngram_min_hits = std::min(std::max(1, (int) params.speculative.ngram_min_hits), 1024); // Use OpenAI API logprobs only 
if n_probs wasn't provided if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){