whisper.cpp/tests/test-vad.cpp
Daniel Bevenius d566358a1d
tests : update VAD tests to use Silero V6.2.0 (#3534)
* tests : update VAD tests to use Silero V6.2.0

This commit updates the VAD tests to use Silero V6.2.0 instead of
V5.1.2. I was not sure if we needed to keep testing both versions,
but opted to just update to the latest version for simplicity.

* wasm : use C++17 for emscripten builds

This commit updates the CMakeLists.txt file to explicitly set the C++
standard to C++17 when building with Emscripten.

The motivation for this change is that building with Emscripten
will currently fail locally and on CI with the following error:
```console
[ 75%] Building CXX object examples/CMakeFiles/common.dir/common-ggml.cpp.o
In file included from /home/danbev/work/ai/whisper.cpp/examples/stream.wasm/emscripten.cpp:5:
/home/danbev/work/utils/emsdk/upstream/emscripten/cache/sysroot/include/emscripten/bind.h:11:2: error:
      "embind requires -std=c++17 or newer"
   11 | #error "embind requires -std=c++17 or newer"
      |  ^
In file included from /home/danbev/work/ai/whisper.cpp/examples/whisper.wasm/emscripten.cpp:4:
/home/danbev/work/utils/emsdk/upstream/emscripten/cache/sysroot/include/emscripten/bind.h:11:2: error:
      "embind requires -std=c++17 or newer"
   11 | #error "embind requires -std=c++17 or newer"
      |  ^
```
2025-12-06 10:58:58 +01:00

84 lines
2.6 KiB
C++

#include "whisper.h"
#include "common-whisper.h"
#include <cstdio>
#include <string>
#include <vector>
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// Checks that `params` holds the default VAD parameter values this test
// expects. Aborts via assert() on the first field that differs.
void assert_default_params(const struct whisper_vad_params & params) {
    // Expected defaults, named so a failing assert reads clearly.
    constexpr float expected_threshold       = 0.5f;
    constexpr int   expected_min_speech_ms   = 250;
    constexpr int   expected_min_silence_ms  = 100;
    constexpr float expected_samples_overlap = 0.1f;

    assert(params.threshold == expected_threshold);
    assert(params.min_speech_duration_ms == expected_min_speech_ms);
    assert(params.min_silence_duration_ms == expected_min_silence_ms);
    assert(params.samples_overlap == expected_samples_overlap);
}
// Checks that `params` holds the default VAD context parameter values this
// test expects: four threads, GPU disabled, GPU device index zero.
void assert_default_context_params(const struct whisper_vad_context_params & params) {
    constexpr int expected_threads    = 4;
    constexpr int expected_gpu_device = 0;

    assert(params.n_threads == expected_threads);
    assert(!params.use_gpu);
    assert(params.gpu_device == expected_gpu_device);
}
// Runs speech detection on the given samples and checks the results.
//
// vctx      - VAD context to run detection with
// params    - unused here; kept so the signature stays stable for callers
// pcmf32    - pointer to the mono float samples
// n_samples - number of samples pointed to by pcmf32
//
// Asserts that detection succeeds, that exactly 344 probabilities were
// produced, and that the probability buffer is available.
void test_detect_speech(
        struct whisper_vad_context * vctx,
        struct whisper_vad_params    params,
        const float * pcmf32,
        int n_samples) {
    (void) params; // not needed for detection; avoids an unused-parameter warning

    // Keep the call outside assert(): it has side effects (it fills the
    // probabilities inside vctx) and must run regardless of how assert expands.
    const bool detected = whisper_vad_detect_speech(vctx, pcmf32, n_samples);
    assert(detected);

    const int n_probs = whisper_vad_n_probs(vctx);
    assert(n_probs == 344);

    const auto probs = whisper_vad_probs(vctx);
    assert(probs != nullptr);
}
// Builds speech segments from the probabilities already stored in `vctx`
// and checks that exactly 4 segments were produced, printing each one.
//
// vctx   - VAD context that has already run speech detection
// params - VAD parameters used to turn probabilities into segments
//
// Returns the segments object; the caller releases it (main() does so with
// whisper_vad_free_segments).
struct whisper_vad_segments * test_detect_timestamps(
        struct whisper_vad_context * vctx,
        struct whisper_vad_params    params) {
    struct whisper_vad_segments * timestamps = whisper_vad_segments_from_probs(vctx, params);
    // Fail with a clear assertion instead of passing a null pointer to the
    // accessor functions below.
    assert(timestamps != nullptr);

    const int n_segments = whisper_vad_segments_n_segments(timestamps);
    assert(n_segments == 4);

    for (int i = 0; i < n_segments; ++i) {
        printf("VAD segment %d: start = %.2f, end = %.2f\n", i,
               whisper_vad_segments_get_segment_t0(timestamps, i),
               whisper_vad_segments_get_segment_t1(timestamps, i));
    }

    return timestamps;
}
// Test entry point: loads the sample audio and the VAD model, checks the
// default parameters, then runs the speech-detection and segment tests.
// Returns 0 on success; any failed check aborts through assert().
int main() {
    const std::string vad_model_path = VAD_MODEL_PATH;
    const std::string sample_path    = SAMPLE_PATH;

    // Load the sample audio file
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    // Call kept outside assert(): it fills the buffers and must always run.
    const bool audio_loaded = read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false);
    assert(audio_loaded);
    assert(pcmf32.size() > 0);
    assert(pcmf32s.size() == 0); // no stereo vector

    // Load the VAD model
    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
    assert_default_context_params(ctx_params);

    struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
            vad_model_path.c_str(),
            ctx_params);
    assert(vctx != nullptr);

    struct whisper_vad_params params = whisper_vad_default_params();
    assert_default_params(params);

    // Test speech probabilities
    // The test helper takes an int sample count, so make the size_t -> int
    // conversion explicit instead of relying on implicit narrowing.
    const int n_samples = static_cast<int>(pcmf32.size());
    test_detect_speech(vctx, params, pcmf32.data(), n_samples);

    // Test speech timestamps (uses speech probabilities from above)
    struct whisper_vad_segments * timestamps = test_detect_timestamps(vctx, params);

    whisper_vad_free_segments(timestamps);
    whisper_vad_free(vctx);

    return 0;
}