CUDA: use LRU based eviction for cuda graphs (#21611)

* CUDA: use a ring-buffer for cuda graphs

* bump limit to 128

* use LRU eviction

* better naming

* do periodic clean-up
This commit is contained in:
Aman Gupta 2026-04-17 23:24:21 +08:00 committed by GitHub
parent a279d0f0f4
commit b94050e896
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1187,6 +1187,7 @@ struct ggml_cuda_graph {
bool disable_due_to_gpu_arch = false;
bool warmup_complete = false;
uint64_t uid = 0;
int64_t last_used_time = 0;
struct node_properties {
ggml_tensor node;
void * node_src_data_ptrs[GGML_MAX_SRC];
@ -1368,12 +1369,28 @@ struct ggml_backend_cuda_context {
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
int64_t last_graph_eviction_sweep = 0;
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
const int64_t time_now = ggml_time_us();
// sweep every 5s, evicting cuda graphs unused for >=10s
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
last_graph_eviction_sweep = time_now;
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
if (time_now - it->second->last_used_time >= 10'000'000) {
it = cuda_graphs.erase(it);
} else {
++it;
}
}
}
auto it = cuda_graphs.find(first_node_ptr);
if (it == cuda_graphs.end()) {
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
return cuda_graphs[first_node_ptr].get();
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
}
it->second->last_used_time = time_now;
return it->second.get();
}