mirror of
https://github.com/ggerganov/whisper.cpp
synced 2026-03-07 15:39:20 +01:00
* tests : update VAD tests to use Silero V6.2.0
This commit updates the VAD tests to use Silero V6.2.0 instead of
V5.1.2. I was not sure whether we needed to keep testing both versions,
but opted to just update to the latest version for simplicity.
* wasm : use C++17 for emscripten builds
This commit updates the CMakeLists.txt file to explicitly set the C++
standard to C++17 when building with Emscripten.
The motivation for this change is that building with Emscripten
will currently fail locally and on CI with the following error:
```console
[ 75%] Building CXX object examples/CMakeFiles/common.dir/common-ggml.cpp.o
In file included from /home/danbev/work/ai/whisper.cpp/examples/stream.wasm/emscripten.cpp:5:
/home/danbev/work/utils/emsdk/upstream/emscripten/cache/sysroot/include/emscripten/bind.h:11:2: error:
"embind requires -std=c++17 or newer"
11 | #error "embind requires -std=c++17 or newer"
| ^
In file included from /home/danbev/work/ai/whisper.cpp/examples/whisper.wasm/emscripten.cpp:4:
/home/danbev/work/utils/emsdk/upstream/emscripten/cache/sysroot/include/emscripten/bind.h:11:2: error:
"embind requires -std=c++17 or newer"
11 | #error "embind requires -std=c++17 or newer"
| ^
```
84 lines
2.6 KiB
C++
84 lines
2.6 KiB
C++
#include "whisper.h"
|
|
#include "common-whisper.h"
|
|
|
|
#include <cstdio>
|
|
#include <string>
|
|
|
|
#ifdef NDEBUG
|
|
#undef NDEBUG
|
|
#endif
|
|
#include <cassert>
|
|
|
|
// Checks that a VAD parameter struct holds the default values this test
// expects. Any mismatch aborts the program through assert(); NDEBUG is
// undefined before <cassert> is included in this file, so the checks are
// always active.
void assert_default_params(const struct whisper_vad_params & params) {
    // Detection threshold and overlap are compared exactly; both expected
    // values are written as float literals.
    assert(params.threshold       == 0.5f);
    assert(params.samples_overlap == 0.1f);

    // Duration limits, in milliseconds (per the field names).
    assert(params.min_speech_duration_ms  == 250);
    assert(params.min_silence_duration_ms == 100);
}
|
|
|
|
// Checks that a VAD context parameter struct holds the default values this
// test expects: 4 threads, GPU disabled, GPU device index 0. Any mismatch
// aborts the program through assert().
void assert_default_context_params(const struct whisper_vad_context_params & params) {
    assert(params.n_threads == 4);
    assert(!params.use_gpu);
    assert(params.gpu_device == 0);
}
|
|
|
|
void test_detect_speech(
|
|
struct whisper_vad_context * vctx,
|
|
struct whisper_vad_params params,
|
|
const float * pcmf32,
|
|
int n_samples) {
|
|
assert(whisper_vad_detect_speech(vctx, pcmf32, n_samples));
|
|
assert(whisper_vad_n_probs(vctx) == 344);
|
|
assert(whisper_vad_probs(vctx) != nullptr);
|
|
}
|
|
|
|
struct whisper_vad_segments * test_detect_timestamps(
|
|
struct whisper_vad_context * vctx,
|
|
struct whisper_vad_params params) {
|
|
struct whisper_vad_segments * timestamps = whisper_vad_segments_from_probs(vctx, params);
|
|
assert(whisper_vad_segments_n_segments(timestamps) == 4);
|
|
|
|
for (int i = 0; i < whisper_vad_segments_n_segments(timestamps); ++i) {
|
|
printf("VAD segment %d: start = %.2f, end = %.2f\n", i,
|
|
whisper_vad_segments_get_segment_t0(timestamps, i),
|
|
whisper_vad_segments_get_segment_t1(timestamps, i));
|
|
}
|
|
|
|
return timestamps;
|
|
}
|
|
|
|
int main() {
|
|
std::string vad_model_path = VAD_MODEL_PATH;
|
|
std::string sample_path = SAMPLE_PATH;
|
|
|
|
// Load the sample audio file
|
|
std::vector<float> pcmf32;
|
|
std::vector<std::vector<float>> pcmf32s;
|
|
assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
|
|
assert(pcmf32.size() > 0);
|
|
assert(pcmf32s.size() == 0); // no stereo vector
|
|
|
|
// Load the VAD model
|
|
struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
|
|
assert_default_context_params(ctx_params);
|
|
|
|
struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
|
|
vad_model_path.c_str(),
|
|
ctx_params);
|
|
assert(vctx != nullptr);
|
|
|
|
struct whisper_vad_params params = whisper_vad_default_params();
|
|
assert_default_params(params);
|
|
|
|
// Test speech probabilites
|
|
test_detect_speech(vctx, params, pcmf32.data(), pcmf32.size());
|
|
|
|
// Test speech timestamps (uses speech probabilities from above)
|
|
struct whisper_vad_segments * timestamps = test_detect_timestamps(vctx, params);
|
|
|
|
whisper_vad_free_segments(timestamps);
|
|
whisper_vad_free(vctx);
|
|
|
|
return 0;
|
|
}
|