From dc9611662265870df22a7230b7586176a99c1955 Mon Sep 17 00:00:00 2001 From: lohopupa <87423657+lohopupa@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:19:08 +0600 Subject: [PATCH] fix: VAD time mapping timestamp drift caused by overlap samples (#3711) * whisper : fix VAD segment overlap boundary handling - Use original segment length (pre-overlap) for vad_end in the time mapping table, so segment boundaries are preserved accurately Claude Sonnet 4.6 (Low) * whisper : remove intermediate VAD time mapping points Now that segment boundaries are mapped accurately, the intermediate point interpolation is no longer necessary. --------- Co-authored-by: Lohopupa --- src/whisper.cpp | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 796bccfb4..86bfafeaa 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -6701,12 +6701,13 @@ static bool whisper_vad( int segment_start_samples = cs_to_samples(vad_segments->data[i].start); int segment_end_samples = cs_to_samples(vad_segments->data[i].end); - if (i < (int)vad_segments->data.size() - 1) { - segment_end_samples += overlap_samples; - } - segment_start_samples = std::min(segment_start_samples, n_samples - 1); segment_end_samples = std::min(segment_end_samples, n_samples - 1); + int original_segment_length = segment_end_samples - segment_start_samples; + + if (i < (int)vad_segments->data.size() - 1) { + segment_end_samples = std::min(segment_end_samples + overlap_samples, n_samples - 1); + } int segment_length = segment_end_samples - segment_start_samples; if (segment_length > 0) { whisper_state::vad_segment_info segment; @@ -6715,7 +6716,7 @@ static bool whisper_vad( segment.orig_end = vad_segments->data[i].end; segment.vad_start = samples_to_cs(offset); - segment.vad_end = samples_to_cs(offset + segment_length); + segment.vad_end = samples_to_cs(offset + original_segment_length); // Add segment boundaries to mapping table vad_time_mapping start_mapping = {segment.vad_start, segment.orig_start}; @@ -6724,29 +6725,6 @@ static bool whisper_vad( state->vad_mapping_table.push_back(start_mapping); state->vad_mapping_table.push_back(end_mapping); - // Add intermediate points for longer segments to improve interpolation accuracy - const int64_t min_segment_length = 100; // 1 second - const int64_t point_interval = 20; // Add a point every 200ms - - if (segment.vad_end - segment.vad_start > min_segment_length) { - int64_t segment_duration = segment.vad_end - segment.vad_start; - int num_points = (int)(segment_duration / point_interval) - 1; - - for (int j = 1; j <= num_points; j++) { - int64_t vad_time = segment.vad_start + j * point_interval; - - if (vad_time >= segment.vad_end) continue; - - int64_t vad_elapsed = vad_time - segment.vad_start; - int64_t vad_total = segment.vad_end - segment.vad_start; - int64_t orig_total = segment.orig_end - segment.orig_start; - int64_t orig_time = segment.orig_start + (vad_elapsed * orig_total) / vad_total; - - vad_time_mapping intermediate_mapping = {vad_time, orig_time}; - state->vad_mapping_table.push_back(intermediate_mapping); - } - } - WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n", __func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0); ctx->state->vad_segments.push_back(segment);