Option to enable/disable the IQK CPU FA kernels (#429)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
c35a383bcd
commit
b3036a872f
@ -131,6 +131,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
|||||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||||
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
|
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
|
||||||
|
|
||||||
|
option(GGML_IQK_FLASH_ATTENTION "ggml: enable the IQK FlashAttention CPU kernels" ON)
|
||||||
option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF)
|
option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF)
|
||||||
|
|
||||||
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
|
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
|
||||||
|
|||||||
@ -260,9 +260,15 @@ if (GGML_IQK_MUL_MAT)
|
|||||||
add_compile_definitions(GGML_USE_IQK_MULMAT)
|
add_compile_definitions(GGML_USE_IQK_MULMAT)
|
||||||
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp iqk/iqk_flash_attn.cpp)
|
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp iqk/iqk_flash_attn.cpp)
|
||||||
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h iqk/iqk_flash_impl.h)
|
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h iqk/iqk_flash_impl.h)
|
||||||
if (GGML_IQK_FA_ALL_QUANTS)
|
if (GGML_IQK_FLASH_ATTENTION)
|
||||||
message(STATUS "Including all IQK FA kernels")
|
message(STATUS "Enabling IQK Flash Attention kernels")
|
||||||
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
|
add_compile_definitions(GGML_IQK_FLASH_ATTENTION)
|
||||||
|
if (GGML_IQK_FA_ALL_QUANTS)
|
||||||
|
message(STATUS "Including all IQK FA kernels")
|
||||||
|
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
message(STATUS "Disabling IQK Flash Attention kernels")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@
|
|||||||
#include "iqk_mul_mat.h"
|
#include "iqk_mul_mat.h"
|
||||||
#include "iqk_flash_impl.h"
|
#include "iqk_flash_impl.h"
|
||||||
|
|
||||||
#ifdef IQK_IMPLEMENT
|
#if defined IQK_IMPLEMENT && defined GGML_IQK_FLASH_ATTENTION
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
|||||||
@ -15875,6 +15875,7 @@ void MulMat::relu(int n, const float * x, float * y) {
|
|||||||
#endif
|
#endif
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
#ifdef GGML_IQK_FLASH_ATTENTION
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
template <int k_step>
|
template <int k_step>
|
||||||
@ -18663,6 +18664,7 @@ bool iqk_flash_attn_impl(int int_type_k, // type of k
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#else // IQK_IMPLEMENT
|
#else // IQK_IMPLEMENT
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user