kv-cache : support V-less cache (llama/19067)

* kv-cache : support V-less cache

* cuda : better check for V_is_K_view

* cuda : improve V_is_K_view check

* graph : add comments

* hparams : refactor
This commit is contained in:
Georgi Gerganov 2026-01-25 15:48:56 +02:00
parent f53eafd745
commit d2b51404e4
2 changed files with 2 additions and 2 deletions

View File

@ -782,7 +782,7 @@ void launch_fattn(
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data;
const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
const ggml_tensor * mask = dst->src[3];
const ggml_tensor * sinks = dst->src[4];

View File

@ -247,7 +247,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
}
const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data;
const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src);
const int cc = ggml_cuda_info().devices[device].cc;