Commit
Merge pull request #81 from louis030195/deepgram-audio
VAD library to detect no-speech tokens, not deepgram
m13v authored Jul 31, 2024
2 parents 6830073 + cb6a1e3 commit 8a09c27
Showing 4 changed files with 53 additions and 8 deletions.
3 changes: 3 additions & 0 deletions screenpipe-audio/Cargo.toml
@@ -60,6 +60,9 @@ bytemuck = "1.16.1"

 # Async
 tokio = { workspace = true }

+# Detect speech/silence
+webrtc-vad = "0.4.0"
+
 screenpipe-core = { path = "../screenpipe-core" }

 [dev-dependencies]
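
Note: the new webrtc-vad dependency has a small surface — build a Vad, choose a VadMode, and classify fixed-size frames of i16 PCM. A minimal standalone sketch of the call pattern (not part of the commit; WebRTC's VAD accepts only 10/20/30 ms frames, so other lengths come back as Err):

use webrtc_vad::{Vad, VadMode};

fn main() {
    // 160 samples of silence: one 10 ms frame at 16 kHz (the frame size used in stt.rs below).
    let frame: Vec<i16> = vec![0; 160];
    let mut vad = Vad::new();
    vad.set_mode(VadMode::VeryAggressive); // strictest setting: fewest frames classified as speech
    match vad.is_voice_segment(&frame) {
        Ok(is_voice) => println!("speech: {}", is_voice),
        Err(_) => eprintln!("unsupported frame length"),
    }
}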
6 changes: 3 additions & 3 deletions screenpipe-audio/src/multilingual.rs
@@ -135,10 +135,10 @@ pub fn detect_language(
     let probs = probs.to_vec1::<f32>()?;
     let mut probs = LANGUAGES.iter().zip(probs.iter()).collect::<Vec<_>>();
     probs.sort_by(|(_, p1), (_, p2)| p2.total_cmp(p1));
-    for ((_, language), p) in probs.iter().take(5) {
-        info!("{language}: {p}")
+    for ((_lang_code, _language), _p) in probs.iter().take(5) {
+        // info!("{language}: {p}")
     }
     let language = super::stt::token_id(tokenizer, &format!("<|{}|>", probs[0].0 .0))?;
     info!("detected language: {:?}", probs[0].0);
     Ok(language)
 }
44 changes: 43 additions & 1 deletion screenpipe-audio/src/stt.rs
@@ -16,6 +16,8 @@ use rubato::{

 use crate::{multilingual, pcm_decode::pcm_decode};

+use webrtc_vad::{Vad, VadMode};
+
 #[derive(Clone)]
 pub struct WhisperModel {
     pub model: Model,
@@ -423,8 +425,48 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
         pcm_data = resample(pcm_data, sample_rate, m::SAMPLE_RATE as u32)?;
     }

+    // Initialize VAD
+    debug!("VAD: Initializing VAD");
+    let mut vad = Vad::new();
+    vad.set_mode(VadMode::VeryAggressive); // Set mode to very aggressive
+
+    // Filter out non-speech segments
+    // debug!("VAD: Filtering out non-speech segments");
+    let frame_size = 160; // 10ms frame size for 16kHz audio
+    let mut speech_frames = Vec::new();
+    for (frame_index, chunk) in pcm_data.chunks(frame_size).enumerate() {
+        // Convert f32 to i16
+        let i16_chunk: Vec<i16> = chunk.iter().map(|&x| (x * 32767.0) as i16).collect();
+        match vad.is_voice_segment(&i16_chunk) {
+            Ok(is_voice) => {
+                if is_voice {
+                    debug!("VAD: Speech detected in frame {}", frame_index);
+                    speech_frames.extend_from_slice(chunk);
+                } else {
+                    // debug!("VAD: Non-speech frame {} filtered out", frame_index);
+                }
+            },
+            Err(e) => {
+                error!("VAD failed for frame {}: {:?}", frame_index, e);
+                // Optionally, you can choose to include the frame if VAD fails
+                // speech_frames.extend_from_slice(chunk);
+            }
+        }
+    }
+
+    debug!("Total frames processed: {}, Speech frames: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);
+
+    // If no speech frames detected, skip processing
+    if speech_frames.is_empty() {
+        debug!("No speech detected using VAD, skipping audio processing");
+        return Ok("".to_string()); // Return an empty string or consider a more specific "no speech" indicator
+    }
+
+    debug!("Using {} speech frames out of {} total frames", speech_frames.len() / frame_size, pcm_data.len() / frame_size);
+
     debug!("Converting PCM to mel spectrogram");
-    let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
+    // let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
+    let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
     let mel_len = mel.len();
     debug!("Creating tensor from mel spectrogram");
     let mel = Tensor::from_vec(
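
For reference, the gating added above can be read as a standalone helper. This is a sketch under the same assumptions as the diff (16 kHz mono f32 samples in [-1.0, 1.0]); filter_speech is an illustrative name, not a function in the commit:

use webrtc_vad::{Vad, VadMode};

/// Keep only the frames the VAD classifies as speech.
fn filter_speech(pcm: &[f32]) -> Vec<f32> {
    const FRAME_SIZE: usize = 160; // 10 ms at 16 kHz
    let mut vad = Vad::new();
    vad.set_mode(VadMode::VeryAggressive);
    let mut speech = Vec::new();
    for chunk in pcm.chunks(FRAME_SIZE) {
        // The VAD consumes i16 PCM; `as` saturates out-of-range values in Rust.
        let i16_chunk: Vec<i16> = chunk.iter().map(|&x| (x * 32767.0) as i16).collect();
        // A short tail chunk returns Err; treating Err as non-speech drops it,
        // matching the diff, which only logs VAD errors.
        if vad.is_voice_segment(&i16_chunk).unwrap_or(false) {
            speech.extend_from_slice(chunk);
        }
    }
    speech
}

The net effect in stt(): pcm_to_mel now receives speech_frames rather than the full pcm_data, so silent stretches never reach the Whisper model.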
8 changes: 4 additions & 4 deletions screenpipe-server/src/core.rs
@@ -175,7 +175,7 @@ async fn record_audio(
     loop {
         // Non-blocking check for new device controls
         while let Some((audio_device, device_control)) = audio_devices_control.pop() {
-            info!("Received audio device: {}", &audio_device);
+            debug!("Received audio device: {}", &audio_device);
             let device_id = audio_device.to_string();

             if !device_control.is_running {
@@ -196,15 +196,15 @@
             let handle = tokio::spawn(async move {
                 let audio_device_clone = Arc::clone(&audio_device);
                 let device_control_clone = Arc::clone(&device_control);
-                info!(
+                debug!(
                     "Starting audio capture thread for device: {}",
                     &audio_device
                 );

                 let mut iteration = 0;
                 loop {
                     iteration += 1;
-                    info!(
+                    debug!(
                         "Starting iteration {} for device {}",
                         iteration, audio_device_clone
                     );
@@ -221,7 +221,7 @@
                         .to_str()
                         .expect("Failed to create valid path")
                         .to_string();
-                    info!(
+                    debug!(
                         "Starting record_and_transcribe for device {} (iteration {})",
                         audio_device_clone, iteration
                     );
