llama: fit ctx size for CPU only (#21568)

Johannes Gäßler, 2026-04-18 08:16:04 +02:00, committed by GitHub
parent 45cac7ca70
commit fd1c0ec3f0

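The change below sizes the device-memory vector as nd + 1 and reserves the trailing slot for host (CPU) memory, so that a CPU-only configuration (nd == 0) still has one entry to fit against. A minimal standalone sketch of that layout, using hypothetical stand-in types rather than the actual llama.cpp definitions:

// Minimal sketch of the new layout, with hypothetical stand-in types
// (memory_breakdown / device_memory_data are NOT the llama.cpp names):
// one entry per dedicated device plus a trailing entry for host memory,
// so a CPU-only run (nd == 0) still has exactly one slot at ret.back().
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct memory_breakdown {
    int64_t model = 0, context = 0, compute = 0;
    int64_t total() const { return model + context + compute; }
};

struct device_memory_data {
    memory_breakdown mb;
    int64_t free = 0, total = 0;
};

int main() {
    const std::size_t nd = 0;                    // CPU-only: no dedicated devices
    std::vector<device_memory_data> ret(nd + 1); // ret.back() is the host slot
    ret.back().mb.model = 4096;                  // host-buffer usage accumulates here (MiB)
    ret.back().total    = 16384;
    std::printf("host slot: %lld MiB projected of %lld MiB total\n",
                (long long) ret.back().mb.total(), (long long) ret.back().total);
}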

@@ -91,12 +91,16 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         throw std::runtime_error("failed to create llama_context from model");
     }
-    std::vector<llama_device_memory_data> ret(model->devices.size());
+    const size_t nd = model->n_devices();
+    std::vector<llama_device_memory_data> ret(nd + 1);
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
     for (const auto & [buft, mb] : memory_breakdown) {
         if (ggml_backend_buft_is_host(buft)) {
+            ret.back().mb.model += mb.model;
+            ret.back().mb.context += mb.context;
+            ret.back().mb.compute += mb.compute;
             continue;
         }
@@ -104,7 +108,7 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         if (!dev) {
             continue;
         }
-        for (size_t i = 0; i < ret.size(); i++) {
+        for (size_t i = 0; i < nd; i++) {
             if (model->devices[i].dev == dev) {
                 ret[i].mb.model += mb.model;
                 ret[i].mb.context += mb.context;
@@ -113,7 +117,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
             }
         }
     }
-    for (size_t i = 0; i < ret.size(); i++) {
+    {
+        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
+        size_t free;
+        size_t total;
+        ggml_backend_dev_memory(cpu_dev, &free, &total);
+        ret.back().free = free;
+        ret.back().total = total;
+    }
+    for (size_t i = 0; i < nd; i++) {
         size_t free;
         size_t total;
         ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
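The block added above queries the CPU backend once up front and caches free/total host memory in the trailing slot. ggml_backend_dev_by_type() and ggml_backend_dev_memory() are existing ggml backend API calls; the standalone program around them is illustrative only (it would need to link against ggml):

// Standalone sketch of the host-memory query pattern hoisted above.
#include <cstdio>
#include <stdexcept>
#include "ggml-backend.h"

int main() {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        throw std::runtime_error("no CPU backend found");
    }
    size_t free, total;
    ggml_backend_dev_memory(cpu_dev, &free, &total); // bytes of free/total host memory
    std::printf("host memory: %zu MiB free of %zu MiB total\n",
                free / (1024u * 1024u), total / (1024u * 1024u));
}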
@@ -122,11 +138,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         // have any to report. in this case, we will use the host memory as a fallback
         // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
         if (free == 0 && total == 0) {
-            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (cpu_dev == nullptr) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-            ggml_backend_dev_memory(cpu_dev, &free, &total);
+            free = ret.back().free;
+            total = ret.back().total;
         }
         ret[i].free = free;
         ret[i].total = total;
@@ -180,15 +193,15 @@ static void llama_params_fit_impl(
     LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
-    if (nd == 0) {
-        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
-        return;
-    }
     std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     margins.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        margins.push_back(margins_s[id]);
+    if (nd == 0) {
+        margins.push_back(margins_s[0]);
+    } else {
+        for (size_t id = 0; id < nd; id++) {
+            margins.push_back(margins_s[id]);
+        }
     }
     std::vector<std::string> dev_names;
@@ -215,46 +228,59 @@ static void llama_params_fit_impl(
     std::vector<int64_t> projected_free_per_device;
     projected_free_per_device.reserve(nd);
-    if (nd > 1) {
-        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-    }
-    for (size_t id = 0; id < nd; id++) {
-        const llama_device_memory_data & dmd = dmds_full[id];
-        const int64_t projected_used = dmd.mb.total();
-        const int64_t projected_free = dmd.free - projected_used;
-        projected_free_per_device.push_back(projected_free);
-        sum_free += dmd.free;
-        sum_projected_used += projected_used;
-        sum_projected_free += projected_free;
-        sum_projected_model += dmd.mb.model;
-        if (nd > 1) {
-            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-        }
-    }
-    assert(sum_free >= 0 && sum_projected_used >= 0);
-    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (nd == 1) {
-        if (projected_free_per_device[0] >= margins[0]) {
-            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+    if (nd == 0) {
+        sum_projected_used = dmds_full.back().mb.total();
+        sum_free = dmds_full.back().total;
+        sum_projected_free = sum_free - sum_projected_used;
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (sum_projected_free >= margins[0]) {
+            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+                __func__, sum_projected_free/MiB, margins[0]/MiB);
             return;
         }
     } else {
-        bool changes_needed = false;
+        if (nd > 1) {
+            LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+        }
         for (size_t id = 0; id < nd; id++) {
-            if (projected_free_per_device[id] < margins[id]) {
-                changes_needed = true;
-                break;
+            const llama_device_memory_data & dmd = dmds_full[id];
+            const int64_t projected_used = dmd.mb.total();
+            const int64_t projected_free = dmd.free - projected_used;
+            projected_free_per_device.push_back(projected_free);
+            sum_free += dmd.free;
+            sum_projected_used += projected_used;
+            sum_projected_free += projected_free;
+            sum_projected_model += dmd.mb.model;
+            if (nd > 1) {
+                LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
-        if (!changes_needed) {
-            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-            return;
+        assert(sum_free >= 0 && sum_projected_used >= 0);
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (nd == 1) {
+            if (projected_free_per_device[0] >= margins[0]) {
+                LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+                return;
+            }
+        } else {
+            bool changes_needed = false;
+            for (size_t id = 0; id < nd; id++) {
+                if (projected_free_per_device[id] < margins[id]) {
+                    changes_needed = true;
+                    break;
+                }
+            }
+            if (!changes_needed) {
+                LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                return;
+            }
        }
    }
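In the new nd == 0 branch above, the projection reduces to a single comparison: projected free host memory (total host memory minus projected use) against margins[0]. A toy version of that arithmetic, with made-up numbers:

// Toy version of the nd == 0 check, with made-up numbers (MiB).
// Note: the commit compares against *total* host memory here,
// mirroring sum_free = dmds_full.back().total in the diff.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t total_host     = 32768; // dmds_full.back().total
    const int64_t projected_used = 30000; // dmds_full.back().mb.total()
    const int64_t margin         = 4096;  // margins[0]
    const int64_t projected_free = total_host - projected_used;
    if (projected_free >= margin) {
        std::printf("will leave %lld >= %lld MiB, no changes needed\n",
                    (long long) projected_free, (long long) margin);
    } else {
        std::printf("short by %lld MiB, context must shrink\n",
                    (long long) (margin - projected_free));
    }
}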
@@ -262,11 +288,15 @@ static void llama_params_fit_impl(
     {
         int64_t global_surplus = sum_projected_free;
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus -= margins[id];
+        if (nd == 0) {
+            global_surplus -= margins[0];
+        } else {
+            for (size_t id = 0; id < nd; id++) {
+                global_surplus -= margins[id];
+            }
         }
         if (global_surplus < 0) {
-            if (nd == 1) {
+            if (nd <= 1) {
                 LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                     __func__, margins[0]/MiB, -global_surplus/MiB);
             } else {
@@ -277,8 +307,12 @@ static void llama_params_fit_impl(
     if (cparams->n_ctx == 0) {
         if (hp_nct > n_ctx_min) {
             int64_t sum_used_target = sum_free;
-            for (size_t id = 0; id < nd; id++) {
-                sum_used_target -= margins[id];
+            if (nd == 0) {
+                sum_used_target -= margins[0];
+            } else {
+                for (size_t id = 0; id < nd; id++) {
+                    sum_used_target -= margins[id];
+                }
             }
             if (nd > 1) {
                 // for multiple devices we need to be more conservative in terms of how much context we think can fit:
@@ -293,8 +327,12 @@ static void llama_params_fit_impl(
             int64_t sum_projected_used_min_ctx = 0;
             cparams->n_ctx = n_ctx_min;
             const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-            for (const auto & dmd : dmds_min_ctx) {
-                sum_projected_used_min_ctx += dmd.mb.total();
+            if (nd == 0) {
+                sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
+            } else {
+                for (size_t id = 0; id < nd; id++) {
+                    sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
+                }
             }
             if (sum_used_target > sum_projected_used_min_ctx) {
                 // linear interpolation between minimum and maximum context size:
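The interpolation line itself falls just outside the displayed hunk. A sketch of what such a linear interpolation could look like, assuming memory use grows linearly with context size between n_ctx_min and hp_nct (the variable names mirror the surrounding code, but the exact formula here is an assumption, not the llama.cpp implementation):

// Hypothetical sketch of interpolating the context size.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_ctx_min                  = 4096;        // minimum acceptable context
    const uint32_t hp_nct                     = 131072;      // model's maximum context
    const int64_t  sum_projected_used         = 40000000000; // projected bytes used at hp_nct
    const int64_t  sum_projected_used_min_ctx = 12000000000; // projected bytes used at n_ctx_min
    const int64_t  sum_used_target            = 24000000000; // free memory minus margins

    // memory cost per unit of context, from the two measured endpoints:
    const int64_t bytes_per_ctx =
        (sum_projected_used - sum_projected_used_min_ctx) / (int64_t)(hp_nct - n_ctx_min);
    // largest context whose projected use stays within the target:
    const uint32_t n_ctx = n_ctx_min +
        (uint32_t)((sum_used_target - sum_projected_used_min_ctx) / bytes_per_ctx);
    std::printf("bytes_per_ctx = %lld, fitted n_ctx = %u\n", (long long) bytes_per_ctx, n_ctx);
}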
@@ -306,7 +344,7 @@ static void llama_params_fit_impl(
                 const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                 LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                     __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                if (nd == 1) {
+                if (nd <= 1) {
                     LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                     return;
                 }
@@ -329,6 +367,9 @@ static void llama_params_fit_impl(
             }
         }
     }
+    if (nd == 0) {
+        throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
+    }
 
     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
         throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
@@ -476,8 +517,8 @@ static void llama_params_fit_impl(
         std::vector<int64_t> ret;
         ret.reserve(nd);
-        for (const llama_device_memory_data & dmd : dmd_nl) {
-            ret.push_back(dmd.mb.total());
+        for (size_t id = 0; id < nd; id++) {
+            ret.push_back(dmd_nl[id].mb.total());
         }
         return ret;
     };