mirror of
https://github.com/ggerganov/llama.cpp
synced 2026-04-21 22:54:31 +02:00
ggml-cuda: flush legacy pool on OOM and retry (#22155)
* ggml-cuda: flush legacy pool on OOM and retry

Signed-off-by: 梁厚宏 <2695316095@qq.com>

* Address review comments: add explicit sync, update destructor, clean up MUSA macros

Signed-off-by: 梁厚宏 <2695316095@qq.com>

---------

Signed-off-by: 梁厚宏 <2695316095@qq.com>
This commit is contained in:
parent
86f8daacfe
commit
97895129e5
@ -368,15 +368,21 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
}
|
||||
|
||||
// Destructor: return all cached device buffers to the driver.
// clear_pool() already asserts emptiness; the second assert here is a
// cheap belt-and-braces check that no allocation escaped the pool.
~ggml_cuda_pool_leg() {
    clear_pool();
    GGML_ASSERT(pool_size == 0);
}
|
||||
|
||||
// Free every cached buffer in this pool and reset its bookkeeping.
// Switches to this pool's device first, since cudaFree must run on the
// device that owns the allocation. Leaves the pool empty (pool_size == 0).
// NOTE(review): callers are expected to synchronize first if any of the
// cached buffers may still be in use by in-flight kernels — confirm.
void clear_pool() {
    ggml_cuda_set_device(device);
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        ggml_cuda_buffer & buf = buffer_pool[i];
        if (buf.ptr == nullptr) {
            continue; // empty slot — nothing cached here
        }
        CUDA_CHECK(cudaFree(buf.ptr));
        pool_size -= buf.size;
        buf.ptr  = nullptr;
        buf.size = 0;
    }
    // Every byte ever added to pool_size must have been subtracted above.
    GGML_ASSERT(pool_size == 0);
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
@ -421,7 +427,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaErrorMemoryAllocation) {
|
||||
(void)cudaGetLastError();
|
||||
const size_t cached_bytes = pool_size;
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
|
||||
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
clear_pool();
|
||||
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
|
||||
if (err == cudaSuccess) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(err);
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
|
||||
1
ggml/src/ggml-cuda/vendors/hip.h
vendored
1
ggml/src/ggml-cuda/vendors/hip.h
vendored
@ -58,6 +58,7 @@
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
|
||||
1
ggml/src/ggml-cuda/vendors/musa.h
vendored
1
ggml/src/ggml-cuda/vendors/musa.h
vendored
@ -42,6 +42,7 @@
|
||||
#define cudaDeviceProp musaDeviceProp
|
||||
#define cudaDeviceSynchronize musaDeviceSynchronize
|
||||
#define cudaError_t musaError_t
|
||||
#define cudaErrorMemoryAllocation musaErrorMemoryAllocation
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
|
||||
Loading…
Reference in New Issue
Block a user