mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-04-18 21:26:07 +02:00
CUDA: use LRU based eviction for cuda graphs (#21611)
* CUDA: use a ring-buffer for cuda graphs * bump limit to 128 * use LRU eviction * better naming * do periodic clean-up
This commit is contained in:
parent
a279d0f0f4
commit
b94050e896
@ -1187,6 +1187,7 @@ struct ggml_cuda_graph {
|
||||
bool disable_due_to_gpu_arch = false;
|
||||
bool warmup_complete = false;
|
||||
uint64_t uid = 0;
|
||||
int64_t last_used_time = 0;
|
||||
struct node_properties {
|
||||
ggml_tensor node;
|
||||
void * node_src_data_ptrs[GGML_MAX_SRC];
|
||||
@ -1368,12 +1369,28 @@ struct ggml_backend_cuda_context {
|
||||
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
|
||||
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
|
||||
|
||||
int64_t last_graph_eviction_sweep = 0;
|
||||
|
||||
// Return the cached CUDA graph for the computation graph whose first node
// lives at `first_node_ptr`, creating a new entry if none exists yet.
// Every call refreshes the entry's LRU timestamp (`last_used_time`).
//
// Eviction policy: at most once every 5 seconds, sweep the whole cache and
// drop any graph that has not been used for >= 10 seconds. The timestamp is
// written on the common exit path — including for a freshly created entry —
// so a new graph can never be evicted by the very next sweep.
//
// NOTE(review): not thread-safe; assumes the backend context is used from a
// single thread at a time — confirm against the caller.
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
    const int64_t time_now = ggml_time_us();

    // sweep every 5s, evicting cuda graphs unused for >=10s
    if (time_now - last_graph_eviction_sweep >= 5'000'000) {
        last_graph_eviction_sweep = time_now;
        for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
            if (time_now - it->second->last_used_time >= 10'000'000) {
                it = cuda_graphs.erase(it); // erase returns the next valid iterator
            } else {
                ++it;
            }
        }
    }

    auto it = cuda_graphs.find(first_node_ptr);
    if (it == cuda_graphs.end()) {
        // Not cached yet: create the entry and fall through, so the LRU
        // timestamp below is also set for brand-new graphs. (An early
        // return here would leave last_used_time == 0 and the entry would
        // be evicted by the next sweep.)
        it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
    }

    it->second->last_used_time = time_now;
    return it->second.get();
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user