Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VAD library to detect no-speech tokens, not deepgram #81

Merged
merged 3 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions screenpipe-audio/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ bytemuck = "1.16.1"
# Async
tokio = { workspace = true }

# Detect speech/silence
webrtc-vad = "0.4.0"

screenpipe-core = { path = "../screenpipe-core" }

[dev-dependencies]
Expand Down
6 changes: 3 additions & 3 deletions screenpipe-audio/src/multilingual.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ pub fn detect_language(
let probs = probs.to_vec1::<f32>()?;
let mut probs = LANGUAGES.iter().zip(probs.iter()).collect::<Vec<_>>();
probs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for ((_, language), p) in probs.iter().take(5) {
info!("{language}: {p}")
for ((_lang_code, _language), _p) in probs.iter().take(5) {
// info!("{language}: {p}")
}
let language = super::stt::token_id(tokenizer, &format!("<|{}|>", probs[0].0 .0))?;
info!("detected language: {:?}", probs[0].0);
Ok(language)
}
}
44 changes: 43 additions & 1 deletion screenpipe-audio/src/stt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ use rubato::{

use crate::{multilingual, pcm_decode::pcm_decode};

use webrtc_vad::{Vad, VadMode};

#[derive(Clone)]
pub struct WhisperModel {
pub model: Model,
Expand Down Expand Up @@ -423,8 +425,48 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
pcm_data = resample(pcm_data, sample_rate, m::SAMPLE_RATE as u32)?;
}

// Initialize VAD
debug!("VAD: Initializing VAD");
let mut vad = Vad::new();
vad.set_mode(VadMode::VeryAggressive); // Set mode to very aggressive

// Filter out non-speech segments
// debug!("VAD: Filtering out non-speech segments");
let frame_size = 160; // 10ms frame size for 16kHz audio
let mut speech_frames = Vec::new();
for (frame_index, chunk) in pcm_data.chunks(frame_size).enumerate() {
// Convert f32 to i16
let i16_chunk: Vec<i16> = chunk.iter().map(|&x| (x * 32767.0) as i16).collect();
match vad.is_voice_segment(&i16_chunk) {
Ok(is_voice) => {
if is_voice {
debug!("VAD: Speech detected in frame {}", frame_index);
speech_frames.extend_from_slice(chunk);
} else {
// debug!("VAD: Non-speech frame {} filtered out", frame_index);
}
},
Err(e) => {
error!("VAD failed for frame {}: {:?}", frame_index, e);
// Optionally, you can choose to include the frame if VAD fails
// speech_frames.extend_from_slice(chunk);
}
}
}

debug!("Total frames processed: {}, Speech frames: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);

// If no speech frames detected, skip processing
if speech_frames.is_empty() {
debug!("No speech detected using VAD, skipping audio processing");
return Ok("".to_string()); // Return an empty string or consider a more specific "no speech" indicator
}

debug!("Using {} speech frames out of {} total frames", speech_frames.len() / frame_size, pcm_data.len() / frame_size);

debug!("Converting PCM to mel spectrogram");
let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
// let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
let mel_len = mel.len();
debug!("Creating tensor from mel spectrogram");
let mel = Tensor::from_vec(
Expand Down
8 changes: 4 additions & 4 deletions screenpipe-server/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ async fn record_audio(
loop {
// Non-blocking check for new device controls
while let Some((audio_device, device_control)) = audio_devices_control.pop() {
info!("Received audio device: {}", &audio_device);
debug!("Received audio device: {}", &audio_device);
let device_id = audio_device.to_string();

if !device_control.is_running {
Expand All @@ -196,15 +196,15 @@ async fn record_audio(
let handle = tokio::spawn(async move {
let audio_device_clone = Arc::clone(&audio_device);
let device_control_clone = Arc::clone(&device_control);
info!(
debug!(
"Starting audio capture thread for device: {}",
&audio_device
);

let mut iteration = 0;
loop {
iteration += 1;
info!(
debug!(
"Starting iteration {} for device {}",
iteration, audio_device_clone
);
Expand All @@ -221,7 +221,7 @@ async fn record_audio(
.to_str()
.expect("Failed to create valid path")
.to_string();
info!(
debug!(
"Starting record_and_transcribe for device {} (iteration {})",
audio_device_clone, iteration
);
Expand Down
Loading