Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VAD library to detect no-speech tokens, not deepgram #81

Merged
merged 3 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions screenpipe-audio/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ bytemuck = "1.16.1"
# Async
tokio = { workspace = true }

# Detect speech/silence
webrtc-vad = "0.4.0"

screenpipe-core = { path = "../screenpipe-core" }

[dev-dependencies]
Expand Down
6 changes: 3 additions & 3 deletions screenpipe-audio/src/multilingual.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ pub fn detect_language(
let probs = probs.to_vec1::<f32>()?;
let mut probs = LANGUAGES.iter().zip(probs.iter()).collect::<Vec<_>>();
probs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
for ((_, language), p) in probs.iter().take(5) {
info!("{language}: {p}")
for ((_lang_code, _language), _p) in probs.iter().take(5) {
// info!("{language}: {p}")
}
let language = super::stt::token_id(tokenizer, &format!("<|{}|>", probs[0].0 .0))?;
info!("detected language: {:?}", probs[0].0);
Ok(language)
}
}
44 changes: 43 additions & 1 deletion screenpipe-audio/src/stt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ use rubato::{

use crate::{multilingual, pcm_decode::pcm_decode};

use webrtc_vad::{Vad, VadMode};

#[derive(Clone)]
pub struct WhisperModel {
pub model: Model,
Expand Down Expand Up @@ -423,8 +425,48 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
pcm_data = resample(pcm_data, sample_rate, m::SAMPLE_RATE as u32)?;
}

// Initialize VAD
debug!("VAD: Initializing VAD");
let mut vad = Vad::new();
vad.set_mode(VadMode::VeryAggressive); // Set mode to very aggressive

// Filter out non-speech segments
// debug!("VAD: Filtering out non-speech segments");
let frame_size = 160; // 10ms frame size for 16kHz audio
let mut speech_frames = Vec::new();
for (frame_index, chunk) in pcm_data.chunks(frame_size).enumerate() {
// Convert f32 to i16
let i16_chunk: Vec<i16> = chunk.iter().map(|&x| (x * 32767.0) as i16).collect();
match vad.is_voice_segment(&i16_chunk) {
Ok(is_voice) => {
if is_voice {
debug!("VAD: Speech detected in frame {}", frame_index);
speech_frames.extend_from_slice(chunk);
} else {
// debug!("VAD: Non-speech frame {} filtered out", frame_index);
}
},
Err(e) => {
error!("VAD failed for frame {}: {:?}", frame_index, e);
// Optionally, you can choose to include the frame if VAD fails
// speech_frames.extend_from_slice(chunk);
}
}
}

debug!("Total frames processed: {}, Speech frames: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);

// If no speech frames detected, skip processing
if speech_frames.is_empty() {
debug!("No speech detected using VAD, skipping audio processing");
return Ok("".to_string()); // Return an empty string or consider a more specific "no speech" indicator
}

debug!("Using {} speech frames out of {} total frames", speech_frames.len() / frame_size, pcm_data.len() / frame_size);

debug!("Converting PCM to mel spectrogram");
let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
// let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
let mel_len = mel.len();
debug!("Creating tensor from mel spectrogram");
let mel = Tensor::from_vec(
Expand Down
8 changes: 4 additions & 4 deletions screenpipe-server/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ async fn record_audio(
loop {
// Non-blocking check for new device controls
while let Some((audio_device, device_control)) = audio_devices_control.pop() {
info!("Received audio device: {}", &audio_device);
debug!("Received audio device: {}", &audio_device);
let device_id = audio_device.to_string();

if !device_control.is_running {
Expand All @@ -196,15 +196,15 @@ async fn record_audio(
let handle = tokio::spawn(async move {
let audio_device_clone = Arc::clone(&audio_device);
let device_control_clone = Arc::clone(&device_control);
info!(
debug!(
"Starting audio capture thread for device: {}",
&audio_device
);

let mut iteration = 0;
loop {
iteration += 1;
info!(
debug!(
"Starting iteration {} for device {}",
iteration, audio_device_clone
);
Expand All @@ -221,7 +221,7 @@ async fn record_audio(
.to_str()
.expect("Failed to create valid path")
.to_string();
info!(
debug!(
"Starting record_and_transcribe for device {} (iteration {})",
audio_device_clone, iteration
);
Expand Down
Loading