server: rename --clear-idle to --cache-idle-slots (#21741)

This commit is contained in:
Yes You Can Have Your Own 2026-04-20 08:30:24 +03:00 committed by GitHub
parent e365e658f0
commit 9d49acb2a7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 16 additions and 16 deletions

View File

@@ -1316,13 +1316,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"--clear-idle"},
{"--no-clear-idle"},
{"--cache-idle-slots"},
{"--no-cache-idle-slots"},
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
params.clear_idle = value;
params.cache_idle_slots = value;
}
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},

View File

@@ -567,7 +567,7 @@ struct common_params {
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
bool clear_idle = true; // save and clear idle slots upon starting a new task
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

View File

@@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |

View File

@@ -987,13 +987,13 @@ private:
metrics.init();
if (params_base.clear_idle) {
if (params_base.cache_idle_slots) {
if (!params_base.kv_unified) {
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
params_base.clear_idle = false;
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
params_base.cache_idle_slots = false;
} else if (params_base.cache_ram_mib == 0) {
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
params_base.clear_idle = false;
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
@@ -1886,7 +1886,7 @@ private:
break; // drop the task
}
if (params_base.clear_idle) {
if (params_base.cache_idle_slots) {
for (auto & s : slots) {
if (!s.is_processing()) {
slot_save_and_clear(s);

View File

@@ -91,7 +91,7 @@ def test_clear_and_restore():
def test_disabled_with_flag():
global server
server.no_clear_idle = True
server.no_cache_idle_slots = True
server.start()
log = LogReader(server.log_path)

View File

@@ -103,7 +103,7 @@ class ServerProcess:
media_path: str | None = None
sleep_idle_seconds: int | None = None
cache_ram: int | None = None
no_clear_idle: bool = False
no_cache_idle_slots: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False
@@ -242,8 +242,8 @@ class ServerProcess:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.cache_ram is not None:
server_args.extend(["--cache-ram", self.cache_ram])
if self.no_clear_idle:
server_args.append("--no-clear-idle")
if self.no_cache_idle_slots:
server_args.append("--no-cache-idle-slots")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")