From fd1c0ec3f037f47f8ed76d99d0fd3cacdcc0baab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 18 Apr 2026 08:16:04 +0200 Subject: [PATCH] llama: fit ctx size for CPU only (#21568) --- src/llama.cpp | 157 +++++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 58 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 484372d8d1..7d3a34e98a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -91,12 +91,16 @@ static std::vector llama_get_device_memory_data( throw std::runtime_error("failed to create llama_context from model"); } - std::vector ret(model->devices.size()); + const size_t nd = model->n_devices(); + std::vector ret(nd + 1); std::map memory_breakdown = ctx->memory_breakdown(); for (const auto & [buft, mb] : memory_breakdown) { if (ggml_backend_buft_is_host(buft)) { + ret.back().mb.model += mb.model; + ret.back().mb.context += mb.context; + ret.back().mb.compute += mb.compute; continue; } @@ -104,7 +108,7 @@ static std::vector llama_get_device_memory_data( if (!dev) { continue; } - for (size_t i = 0; i < ret.size(); i++) { + for (size_t i = 0; i < nd; i++) { if (model->devices[i].dev == dev) { ret[i].mb.model += mb.model; ret[i].mb.context += mb.context; @@ -113,7 +117,19 @@ static std::vector llama_get_device_memory_data( } } } - for (size_t i = 0; i < ret.size(); i++) { + + { + ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (cpu_dev == nullptr) { + throw std::runtime_error(format("%s: no CPU backend found", __func__)); + } + size_t free; + size_t total; + ggml_backend_dev_memory(cpu_dev, &free, &total); + ret.back().free = free; + ret.back().total = total; + } + for (size_t i = 0; i < nd; i++) { size_t free; size_t total; ggml_backend_dev_memory(model->devices[i].dev, &free, &total); @@ -122,11 +138,8 @@ static std::vector llama_get_device_memory_data( // have any to report. in this case, we will use the host memory as a fallback // fixes: https://github.com/ggml-org/llama.cpp/issues/18577 if (free == 0 && total == 0) { - ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); - if (cpu_dev == nullptr) { - throw std::runtime_error(format("%s: no CPU backend found", __func__)); - } - ggml_backend_dev_memory(cpu_dev, &free, &total); + free = ret.back().free; + total = ret.back().total; } ret[i].free = free; ret[i].total = total; @@ -180,15 +193,15 @@ static void llama_params_fit_impl( LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__); const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); const size_t nd = devs.size(); // number of devices - if (nd == 0) { - LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__); - return; - } std::vector margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits margins.reserve(nd); - for (size_t id = 0; id < nd; id++) { - margins.push_back(margins_s[id]); + if (nd == 0) { + margins.push_back(margins_s[0]); + } else { + for (size_t id = 0; id < nd; id++) { + margins.push_back(margins_s[id]); + } } std::vector dev_names; @@ -215,46 +228,59 @@ static void llama_params_fit_impl( std::vector projected_free_per_device; projected_free_per_device.reserve(nd); - if (nd > 1) { - LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); - } - for (size_t id = 0; id < nd; id++) { - const llama_device_memory_data & dmd = dmds_full[id]; - - const int64_t projected_used = dmd.mb.total(); - const int64_t projected_free = dmd.free - projected_used; - projected_free_per_device.push_back(projected_free); - - sum_free += dmd.free; - sum_projected_used += projected_used; - sum_projected_free += projected_free; - sum_projected_model += dmd.mb.model; - - if (nd > 1) { - LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", - __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); - } - } - assert(sum_free >= 0 && sum_projected_used >= 0); - LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", - __func__, sum_projected_used/MiB, sum_free/MiB); - if (nd == 1) { - if (projected_free_per_device[0] >= margins[0]) { - LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", - __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); + if (nd == 0) { + sum_projected_used = dmds_full.back().mb.total(); + sum_free = dmds_full.back().total; + sum_projected_free = sum_free - sum_projected_used; + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n", + __func__, sum_projected_used/MiB, sum_free/MiB); + if (sum_projected_free >= margins[0]) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n", + __func__, sum_projected_free/MiB, margins[0]/MiB); return; } } else { - bool changes_needed = false; + if (nd > 1) { + LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); + } for (size_t id = 0; id < nd; id++) { - if (projected_free_per_device[id] < margins[id]) { - changes_needed = true; - break; + const llama_device_memory_data & dmd = dmds_full[id]; + + const int64_t projected_used = dmd.mb.total(); + const int64_t projected_free = dmd.free - projected_used; + projected_free_per_device.push_back(projected_free); + + sum_free += dmd.free; + sum_projected_used += projected_used; + sum_projected_free += projected_free; + sum_projected_model += dmd.mb.model; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", + __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); } } - if (!changes_needed) { - LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); - return; + assert(sum_free >= 0 && sum_projected_used >= 0); + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", + __func__, sum_projected_used/MiB, sum_free/MiB); + if (nd == 1) { + if (projected_free_per_device[0] >= margins[0]) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", + __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); + return; + } + } else { + bool changes_needed = false; + for (size_t id = 0; id < nd; id++) { + if (projected_free_per_device[id] < margins[id]) { + changes_needed = true; + break; + } + } + if (!changes_needed) { + LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); + return; + } } } @@ -262,11 +288,15 @@ static void llama_params_fit_impl( { int64_t global_surplus = sum_projected_free; - for (size_t id = 0; id < nd; id++) { - global_surplus -= margins[id]; + if (nd == 0) { + global_surplus -= margins[0]; + } else { + for (size_t id = 0; id < nd; id++) { + global_surplus -= margins[id]; + } } if (global_surplus < 0) { - if (nd == 1) { + if (nd <= 1) { LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", __func__, margins[0]/MiB, -global_surplus/MiB); } else { @@ -277,8 +307,12 @@ static void llama_params_fit_impl( if (cparams->n_ctx == 0) { if (hp_nct > n_ctx_min) { int64_t sum_used_target = sum_free; - for (size_t id = 0; id < nd; id++) { - sum_used_target -= margins[id]; + if (nd == 0) { + sum_used_target -= margins[0]; + } else { + for (size_t id = 0; id < nd; id++) { + sum_used_target -= margins[id]; + } } if (nd > 1) { // for multiple devices we need to be more conservative in terms of how much context we think can fit: @@ -293,8 +327,12 @@ static void llama_params_fit_impl( int64_t sum_projected_used_min_ctx = 0; cparams->n_ctx = n_ctx_min; const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); - for (const auto & dmd : dmds_min_ctx) { - sum_projected_used_min_ctx += dmd.mb.total(); + if (nd == 0) { + sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total(); + } else { + for (size_t id = 0; id < nd; id++) { + sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total(); + } } if (sum_used_target > sum_projected_used_min_ctx) { // linear interpolation between minimum and maximum context size: @@ -306,7 +344,7 @@ static void llama_params_fit_impl( const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx; LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); - if (nd == 1) { + if (nd <= 1) { LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); return; } @@ -329,6 +367,9 @@ static void llama_params_fit_impl( } } } + if (nd == 0) { + throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort"); + } if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); @@ -476,8 +517,8 @@ static void llama_params_fit_impl( std::vector ret; ret.reserve(nd); - for (const llama_device_memory_data & dmd : dmd_nl) { - ret.push_back(dmd.mb.total()); + for (size_t id = 0; id < nd; id++) { + ret.push_back(dmd_nl[id].mb.total()); } return ret; };