From 5aed513c47ab0002d11fd513c46ecc9674cfc419 Mon Sep 17 00:00:00 2001
From: Mathieu Virbel <mat@meltingrocks.com>
Date: Fri, 22 Aug 2025 11:03:26 -0600
Subject: [PATCH] fix: remove downscale from silero vad and use upstream
 processor

---
 .../processors/audio_chunker_silero.py        | 47 +++++++++----------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/server/reflector/processors/audio_chunker_silero.py b/server/reflector/processors/audio_chunker_silero.py
index c9719ed36..f5919ebb7 100644
--- a/server/reflector/processors/audio_chunker_silero.py
+++ b/server/reflector/processors/audio_chunker_silero.py
@@ -11,7 +11,10 @@
 
 class AudioChunkerSileroProcessor(AudioChunkerProcessor):
     """
-    Assemble audio frames into chunks with VAD-based speech detection using Silero VAD
+    Assemble audio frames into chunks with VAD-based speech detection using Silero VAD.
+
+    Expects input audio to be already downscaled to 16kHz mono s16 format
+    (handled by AudioDownscaleProcessor in the pipeline).
     """
 
     def __init__(
@@ -31,12 +34,13 @@ def __init__(
         self._init_vad(use_onnx)
 
     def _init_vad(self, use_onnx=False):
-        """Initialize Silero VAD model"""
+        """Initialize Silero VAD model for 16kHz audio"""
         try:
             torch.set_num_threads(1)
             self.vad_model = load_silero_vad(onnx=use_onnx)
+            # VAD expects 16kHz audio (guaranteed by AudioDownscaleProcessor)
             self.vad_iterator = VADIterator(self.vad_model, sampling_rate=16000)
-            self.logger.info("Silero VAD initialized successfully")
+            self.logger.info("Silero VAD initialized for 16kHz audio")
 
         except Exception as e:
             self.logger.error(f"Failed to initialize Silero VAD: {e}")
@@ -75,7 +79,7 @@ async def _process_block(self) -> Optional[list[av.AudioFrame]]:
             return None
 
         # Processing block with current buffer size
-        print(f"Processing block: {len(self.frames)} frames in buffer")
+        # print(f"Processing block: {len(self.frames)} frames in buffer")
 
         try:
             # Convert frames to numpy array for VAD
@@ -189,38 +193,29 @@ def format_time(seconds):
         return None
 
     def _frames_to_numpy(self, frames: list[av.AudioFrame]) -> Optional[np.ndarray]:
-        """Convert av.AudioFrame list to numpy array for VAD processing"""
+        """Convert av.AudioFrame list to numpy array for VAD processing
+
+        Input frames are already 16kHz mono s16 format from AudioDownscaleProcessor.
+        Only need to convert s16 to float32 for Silero VAD.
+        """
         if not frames:
             return None
 
         try:
-            audio_data = []
-            for frame in frames:
-                frame_array = frame.to_ndarray()
-
-                if len(frame_array.shape) == 2:
-                    frame_array = frame_array.flatten()
-
-                audio_data.append(frame_array)
-
-            if not audio_data:
+            # Concatenate all frame arrays
+            audio_arrays = [frame.to_ndarray().flatten() for frame in frames]
+            if not audio_arrays:
                 return None
 
-            combined_audio = np.concatenate(audio_data)
+            combined_audio = np.concatenate(audio_arrays)
 
-            # Ensure float32 format
-            if combined_audio.dtype == np.int16:
-                # Normalize int16 audio to float32 in range [-1.0, 1.0]
-                combined_audio = combined_audio.astype(np.float32) / 32768.0
-            elif combined_audio.dtype != np.float32:
-                combined_audio = combined_audio.astype(np.float32)
-
-            return combined_audio
+            # Convert s16 to float32 (Silero VAD requires float32 in range [-1.0, 1.0])
+            # Input is guaranteed to be s16 from AudioDownscaleProcessor
+            return combined_audio.astype(np.float32) / 32768.0
 
         except Exception as e:
             self.logger.error(f"Error converting frames to numpy: {e}")
-
-        return None
+            return None
 
     def _find_speech_segment_end(self, audio_array: np.ndarray) -> Optional[int]:
         """Find complete speech segments and return frame index at segment end"""