From 08284fb8c60480a77a2a0045b0a3a8755f45fa7f Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 15:59:37 -0700
Subject: [PATCH 1/3] new logic to filter out no speech tokens using VAD
 library

---
 screenpipe-audio/src/stt.rs | 44 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/screenpipe-audio/src/stt.rs b/screenpipe-audio/src/stt.rs
index 7351ccf3..5021a546 100644
--- a/screenpipe-audio/src/stt.rs
+++ b/screenpipe-audio/src/stt.rs
@@ -16,6 +16,8 @@ use rubato::{
 
 use crate::{multilingual, pcm_decode::pcm_decode};
 
+use webrtc_vad::{Vad, VadMode};
+
 #[derive(Clone)]
 pub struct WhisperModel {
     pub model: Model,
@@ -423,8 +425,48 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
         pcm_data = resample(pcm_data, sample_rate, m::SAMPLE_RATE as u32)?;
     }
 
+    // Initialize VAD
+    debug!("VAD: Initializing VAD");
+    let mut vad = Vad::new();
+    vad.set_mode(VadMode::VeryAggressive); // Set mode to very aggressive
+
+    // Filter out non-speech segments
+    // debug!("VAD: Filtering out non-speech segments");
+    let frame_size = 160; // 10ms frame size for 16kHz audio
+    let mut speech_frames = Vec::new();
+    for (frame_index, chunk) in pcm_data.chunks(frame_size).enumerate() {
+        // Convert f32 to i16
+        let i16_chunk: Vec<i16> = chunk.iter().map(|&x| (x * 32767.0) as i16).collect();
+        match vad.is_voice_segment(&i16_chunk) {
+            Ok(is_voice) => {
+                if is_voice {
+                    debug!("VAD: Speech detected in frame {}", frame_index);
+                    speech_frames.extend_from_slice(chunk);
+                } else {
+                    // debug!("VAD: Non-speech frame {} filtered out", frame_index);
+                }
+            },
+            Err(e) => {
+                error!("VAD failed for frame {}: {:?}", frame_index, e);
+                // Optionally, you can choose to include the frame if VAD fails
+                // speech_frames.extend_from_slice(chunk);
+            }
+        }
+    }
+
+    debug!("Total frames processed: {}, Speech frames: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);
+
+    // If no speech frames detected, skip processing
+    if speech_frames.is_empty() {
+        debug!("No speech detected using VAD, skipping audio processing");
+        return Ok("".to_string()); // Return an empty string or consider a more specific "no speech" indicator
+    }
+
+    debug!("Using {} speech frames out of {} total frames", speech_frames.len() / frame_size, pcm_data.len() / frame_size);
+
     debug!("Converting PCM to mel spectrogram");
-    let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
+    // let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
+    let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
     let mel_len = mel.len();
     debug!("Creating tensor from mel spectrogram");
     let mel = Tensor::from_vec(

From 47f617c8cccbd9026becf8cc2a8f67b159d293a2 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 16:00:04 -0700
Subject: [PATCH 2/3] dependency

---
 screenpipe-audio/Cargo.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/screenpipe-audio/Cargo.toml b/screenpipe-audio/Cargo.toml
index 4dfdf434..5c19e2ca 100644
--- a/screenpipe-audio/Cargo.toml
+++ b/screenpipe-audio/Cargo.toml
@@ -60,6 +60,9 @@ bytemuck = "1.16.1"
 # Async
 tokio = { workspace = true }
 
+# Detect speech/silence
+webrtc-vad = "0.4.0"
+
 screenpipe-core = { path = "../screenpipe-core" }
 
 [dev-dependencies]
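[Reviewer note — not part of the patch series]

Patch 1 gates Whisper behind a frame-level voice activity detector: the 16 kHz mono f32 PCM is cut into 160-sample (10 ms) frames, rescaled to i16 (WebRTC VAD only consumes 16-bit samples), and only frames the detector flags as voiced are concatenated and fed to the mel stage. Below is a minimal standalone sketch of that approach, using only the webrtc-vad calls the patch itself relies on (Vad::new, set_mode, is_voice_segment); filter_speech is a hypothetical helper name and the input signal is synthetic, neither is in the tree:

// Sketch of the frame-based VAD filtering introduced in patch 1.
use webrtc_vad::{Vad, VadMode};

/// Keep only the 10 ms frames the VAD classifies as speech.
fn filter_speech(pcm_data: &[f32]) -> Vec<f32> {
    let mut vad = Vad::new();
    vad.set_mode(VadMode::VeryAggressive);
    let frame_size = 160; // 10 ms at 16 kHz, a frame length WebRTC VAD accepts
    let mut speech = Vec::new();
    // chunks_exact drops a trailing partial frame, which the VAD would reject.
    for chunk in pcm_data.chunks_exact(frame_size) {
        // Scale f32 samples in [-1.0, 1.0] to the i16 range the VAD expects.
        let i16_chunk: Vec<i16> = chunk.iter().map(|&x| (x * 32767.0) as i16).collect();
        if vad.is_voice_segment(&i16_chunk).unwrap_or(false) {
            speech.extend_from_slice(chunk);
        }
    }
    speech
}

fn main() {
    // Synthetic 1 s clip at 16 kHz: 0.5 s of silence, then a 440 Hz tone.
    let pcm: Vec<f32> = (0..16_000)
        .map(|i| {
            if i < 8_000 {
                0.0
            } else {
                (i as f32 * 440.0 * 2.0 * std::f32::consts::PI / 16_000.0).sin() * 0.5
            }
        })
        .collect();
    let speech = filter_speech(&pcm);
    println!("kept {} of {} samples", speech.len(), pcm.len());
}

Two details worth a look before merging: the patch iterates with chunks(), so whenever the clip length is not a multiple of 160 the final partial frame will hit the Err arm; and Vad::new() is never told the stream is 16 kHz, so it is worth checking whether the crate's default sample rate matches the frames being fed in.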
From cb6a1e3de7c73b77020c50a7e530615dacb47427 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 16:00:37 -0700
Subject: [PATCH 3/3] info turned into debug for some audio logs

---
 screenpipe-audio/src/multilingual.rs | 6 +++---
 screenpipe-server/src/core.rs        | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/screenpipe-audio/src/multilingual.rs b/screenpipe-audio/src/multilingual.rs
index cdd73b21..8d54469f 100644
--- a/screenpipe-audio/src/multilingual.rs
+++ b/screenpipe-audio/src/multilingual.rs
@@ -135,10 +135,10 @@ pub fn detect_language(
     let probs = probs.to_vec1::<f32>()?;
     let mut probs = LANGUAGES.iter().zip(probs.iter()).collect::<Vec<_>>();
     probs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
-    for ((_, language), p) in probs.iter().take(5) {
-        info!("{language}: {p}")
+    for ((_lang_code, _language), _p) in probs.iter().take(5) {
+        // info!("{language}: {p}")
     }
     let language = super::stt::token_id(tokenizer, &format!("<|{}|>", probs[0].0 .0))?;
     info!("detected language: {:?}", probs[0].0);
     Ok(language)
-}
+}
\ No newline at end of file

diff --git a/screenpipe-server/src/core.rs b/screenpipe-server/src/core.rs
index 501f7b92..426a51e1 100644
--- a/screenpipe-server/src/core.rs
+++ b/screenpipe-server/src/core.rs
@@ -175,7 +175,7 @@ async fn record_audio(
     loop {
         // Non-blocking check for new device controls
         while let Some((audio_device, device_control)) = audio_devices_control.pop() {
-            info!("Received audio device: {}", &audio_device);
+            debug!("Received audio device: {}", &audio_device);
             let device_id = audio_device.to_string();
 
             if !device_control.is_running {
@@ -196,7 +196,7 @@
             let handle = tokio::spawn(async move {
                 let audio_device_clone = Arc::clone(&audio_device);
                 let device_control_clone = Arc::clone(&device_control);
-                info!(
+                debug!(
                     "Starting audio capture thread for device: {}",
                     &audio_device
                 );
@@ -204,7 +204,7 @@
                 let mut iteration = 0;
                 loop {
                     iteration += 1;
-                    info!(
+                    debug!(
                         "Starting iteration {} for device {}",
                         iteration, audio_device_clone
                     );
@@ -221,7 +221,7 @@
                         .to_str()
                         .expect("Failed to create valid path")
                         .to_string();
-                    info!(
+                    debug!(
                         "Starting record_and_transcribe for device {} (iteration {})",
                         audio_device_clone, iteration
                     );
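[Reviewer note — not part of the patch series]

Patch 3 demotes the chatty per-device and per-iteration audio logs from info! to debug!, so they disappear at the default level but stay reachable when needed. A small sketch of how a consumer would surface them again, assuming an env_logger-style backend behind the `log` facade (the series does not show which logger screenpipe actually installs):

// Sketch: how the demoted logs behave under a `log`-facade backend.
// env_logger here is an assumption; screenpipe's real logger setup is
// not part of this series.
use log::{debug, info};

fn main() {
    // Respects RUST_LOG, e.g.:
    //   RUST_LOG=info  ./app   -> prints only the info! line
    //   RUST_LOG=debug ./app   -> prints both lines
    env_logger::init();
    info!("detected language: ...");      // still emitted at info, as in patch 3
    debug!("Received audio device: ..."); // now requires debug level
}

Two nits in the multilingual.rs hunk: the rewritten top-5 loop now has an empty body (it iterates but does nothing once the info! is commented out), and the file loses its trailing newline (the "\ No newline at end of file" marker). Both are trivial cleanups before merge.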