From f7cabe8c8cc345baff4646252d2fa7048e9fcb3a Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 19:59:34 -0700
Subject: [PATCH 1/9] new flag for cloud audio

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index b543b5fa..11a1f31d 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,10 @@ if you want to run screenpipe in debug mode to show more logs in terminal:
 ```bash
 screenpipe --debug
 ```
+by default screenpipe uses the deepgram nova-2 speech-to-text model via the cloud api. to use whisper-tiny, which runs locally, add this flag:
+```bash
+screenpipe --cloud-audio-off
+```
 
 you can combine multiple flags if needed

From 282dca7969cb7637d794a9c84bac85a635e66d55 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 19:59:59 -0700
Subject: [PATCH 2/9] add reqwest dependency for deepgram api calls

---
 screenpipe-audio/Cargo.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/screenpipe-audio/Cargo.toml b/screenpipe-audio/Cargo.toml
index 5c19e2ca..9077ede2 100644
--- a/screenpipe-audio/Cargo.toml
+++ b/screenpipe-audio/Cargo.toml
@@ -63,6 +63,9 @@ tokio = { workspace = true }
 # Detect speech/silence
 webrtc-vad = "0.4.0"
 
+# Deepgram
+reqwest = { version = "0.12.5", features = ["json", "blocking"] }
+
 screenpipe-core = { path = "../screenpipe-core" }
 
 [dev-dependencies]

From 12f75bbfe24e63e65ad4cb13f5ce379a8079813a Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:06:33 -0700
Subject: [PATCH 3/9] passing cli flag

---
 screenpipe-audio/src/bin/screenpipe-audio.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/screenpipe-audio/src/bin/screenpipe-audio.rs b/screenpipe-audio/src/bin/screenpipe-audio.rs
index 15729358..7c1fcb47 100644
--- a/screenpipe-audio/src/bin/screenpipe-audio.rs
+++ b/screenpipe-audio/src/bin/screenpipe-audio.rs
@@ -25,6 +25,9 @@ struct Args {
 
     #[clap(long, help = "List available audio devices")]
     list_audio_devices: bool,
+
+    #[clap(long, help = "Disable cloud audio processing")]
+    cloud_audio_off: bool,
 }
 
 fn print_devices(devices: &[AudioDevice]) {
@@ -74,7 +77,8 @@ async fn main() -> Result<()> {
     let chunk_duration = Duration::from_secs(5);
     let output_path = PathBuf::from("output.mp4");
 
-    let (whisper_sender, mut whisper_receiver) = create_whisper_channel().await?;
+    let cloud_audio = !args.cloud_audio_off;
+    let (whisper_sender, mut whisper_receiver) = create_whisper_channel(cloud_audio).await?;
     // Spawn threads for each device
     let recording_threads: Vec<_> = devices
         .into_iter()
@@ -128,4 +132,4 @@
     }
 
     Ok(())
-}
+}
\ No newline at end of file

From 3090ae562e43b9bbff3bddbbea7f94913e824fcf Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:06:47 -0700
Subject: [PATCH 4/9] passing cli flag

---
 screenpipe-server/src/bin/screenpipe-server.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/screenpipe-server/src/bin/screenpipe-server.rs b/screenpipe-server/src/bin/screenpipe-server.rs
index ac479803..e64cdb09 100644
--- a/screenpipe-server/src/bin/screenpipe-server.rs
+++ b/screenpipe-server/src/bin/screenpipe-server.rs
@@ -83,6 +83,10 @@ struct Cli {
     /// Save text files
     #[arg(long, default_value_t = false)]
     save_text_files: bool,
+
+    /// Disable cloud audio processing
+    #[arg(long, default_value_t = false)]
+    cloud_audio_off: bool,
 }
 
 fn get_base_dir(custom_path: Option<String>) -> anyhow::Result<PathBuf> {
@@ -292,6 +296,7 @@ async fn main() -> anyhow::Result<()> {
         vision_control,
         audio_devices_control,
         cli.save_text_files,
+        !cli.cloud_audio_off, // Pass the cloud_audio flag
     )
     .await;
 
@@ -344,4 +349,4 @@ async fn main() -> anyhow::Result<()> {
     loop {
         tokio::time::sleep(Duration::from_secs(1)).await;
     }
-}
+}
\ No newline at end of file

From aa5be47ffbc5aae392199256e7fd2bb014d4179f Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:07:22 -0700
Subject: [PATCH 5/9] passing cli flag

---
 screenpipe-server/src/core.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/screenpipe-server/src/core.rs b/screenpipe-server/src/core.rs
index 426a51e1..7f2a4082 100644
--- a/screenpipe-server/src/core.rs
+++ b/screenpipe-server/src/core.rs
@@ -49,10 +49,11 @@ pub async fn start_continuous_recording(
     vision_control: Arc<AtomicBool>,
     audio_devices_control: Arc<SegQueue<(AudioDevice, DeviceControl)>>,
     save_text_files: bool,
+    cloud_audio: bool, // Added cloud_audio parameter
 ) -> Result<()> {
     info!("Recording now");
 
-    let (whisper_sender, whisper_receiver) = create_whisper_channel().await?;
+    let (whisper_sender, whisper_receiver) = create_whisper_channel(cloud_audio).await?; // Pass cloud_audio
 
     let db_manager_video = Arc::clone(&db);
     let db_manager_audio = Arc::clone(&db);
@@ -320,4 +321,4 @@ async fn process_audio_result(db: &DatabaseManager, result: TranscriptionResult)
             result.input.device, e
         ),
     }
-}
+}
\ No newline at end of file

From 80845e048da9ab41f686989aa3f2cbb56917567f Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:07:56 -0700
Subject: [PATCH 6/9] deepgram cloud audio transcription integration with fallback to whisper

---
 screenpipe-audio/src/stt.rs | 224 ++++++++++++++++++++++++++++--------
 1 file changed, 176 insertions(+), 48 deletions(-)

diff --git a/screenpipe-audio/src/stt.rs b/screenpipe-audio/src/stt.rs
index 5021a546..7cdd977e 100644
--- a/screenpipe-audio/src/stt.rs
+++ b/screenpipe-audio/src/stt.rs
@@ -18,6 +18,9 @@ use crate::{multilingual, pcm_decode::pcm_decode};
 
 use webrtc_vad::{Vad, VadMode};
 
+use hound::{WavSpec, WavWriter};
+use std::io::Cursor;
+
 #[derive(Clone)]
 pub struct WhisperModel {
     pub model: Model,
@@ -399,7 +402,79 @@ enum Task {
     Translate,
 }
 
-pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
+use reqwest::blocking::Client;
+use serde_json::Value;
+
+// NOTE: hardcoded API key; this should be loaded from config or an environment variable
+fn get_deepgram_api_key() -> String {
+    "7ed2a159a094337b01fd8178b914b7ae0e77822d".to_string()
+}
+
+fn transcribe_with_deepgram(api_key: &str, audio_data: &[f32]) -> Result<String> {
+    debug!("Starting Deepgram transcription");
+    let client = Client::new();
+
+    // Create a WAV file in memory
+    let mut cursor = Cursor::new(Vec::new());
+    {
+        let spec = WavSpec {
+            channels: 1,
+            sample_rate: 16000,
+            bits_per_sample: 32,
+            sample_format: hound::SampleFormat::Float,
+        };
+        let mut writer = WavWriter::new(&mut cursor, spec)?;
+        for &sample in audio_data {
+            writer.write_sample(sample)?;
+        }
+        writer.finalize()?;
+    }
+
+    // Get the WAV data from the cursor
+    let wav_data = cursor.into_inner();
+
+    let response = client.post("https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true")
+        .header("Content-Type", "audio/wav")
+        .header("Authorization", format!("Token {}", api_key))
+        .body(wav_data)
+        .send();
+
+    match response {
+        Ok(resp) => {
+            debug!("Received response from Deepgram API");
+            match resp.json::<Value>() {
+                Ok(result) => {
+                    debug!("Successfully parsed JSON response");
+                    if let Some(err_code) = result.get("err_code") {
+                        error!("Deepgram API error code: {:?}, result: {:?}", err_code, result);
+                        return Err(anyhow::anyhow!("Deepgram API error: {:?}", result));
+                    }
+                    let transcription = result["results"]["channels"][0]["alternatives"][0]["transcript"]
+                        .as_str()
+                        .unwrap_or("");
+
+                    if transcription.is_empty() {
+                        info!("Transcription is empty. Full response: {:?}", result);
+                    } else {
+                        info!("Transcription successful. Length: {} characters", transcription.len());
+                    }
+
+                    Ok(transcription.to_string())
+                },
+                Err(e) => {
+                    error!("Failed to parse JSON response: {:?}", e);
+                    Err(anyhow::anyhow!("Failed to parse JSON response: {:?}", e))
+                }
+            }
+        },
+        Err(e) => {
+            error!("Failed to send request to Deepgram API: {:?}", e);
+            Err(anyhow::anyhow!("Failed to send request to Deepgram API: {:?}", e))
+        }
+    }
+}
+
+pub fn stt(file_path: &str, whisper_model: &WhisperModel, cloud_audio: bool) -> Result<String> {
     debug!("Starting speech to text for file: {}", file_path);
     let model = &whisper_model.model;
     let tokenizer = &whisper_model.tokenizer;
@@ -431,7 +506,7 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
     vad.set_mode(VadMode::VeryAggressive); // Set mode to very aggressive
 
     // Filter out non-speech segments
-    // debug!("VAD: Filtering out non-speech segments");
+    debug!("VAD: Filtering out non-speech segments");
     let frame_size = 160; // 10ms frame size for 16kHz audio
     let mut speech_frames = Vec::new();
     for (frame_index, chunk) in pcm_data.chunks(frame_size).enumerate() {
@@ -440,21 +515,21 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
         match vad.is_voice_segment(&i16_chunk) {
             Ok(is_voice) => {
                 if is_voice {
-                    debug!("VAD: Speech detected in frame {}", frame_index);
+                    // debug!("VAD: Speech detected in frame {}", frame_index);
                     speech_frames.extend_from_slice(chunk);
                 } else {
                     // debug!("VAD: Non-speech frame {} filtered out", frame_index);
                 }
             },
             Err(e) => {
-                error!("VAD failed for frame {}: {:?}", frame_index, e);
+                debug!("VAD failed for frame {}: {:?}", frame_index, e);
                 // Optionally, you can choose to include the frame if VAD fails
                 // speech_frames.extend_from_slice(chunk);
             }
         }
     }
 
-    debug!("Total frames processed: {}, Speech frames: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);
+    info!("Total audio_frames processed: {}, frames that include speech: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);
 
     // If no speech frames detected, skip processing
     if speech_frames.is_empty() {
@@ -464,47 +539,100 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
 
     debug!("Using {} speech frames out of {} total frames", speech_frames.len() / frame_size, pcm_data.len() / frame_size);
 
-    debug!("Converting PCM to mel spectrogram");
-    // let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
-    let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
-    let mel_len = mel.len();
-    debug!("Creating tensor from mel spectrogram");
-    let mel = Tensor::from_vec(
-        mel,
-        (
-            1,
-            model.config().num_mel_bins,
-            mel_len / model.config().num_mel_bins,
-        ),
-        &device,
-    )?;
-
-    debug!("Detecting language");
-    let language_token = Some(multilingual::detect_language(
-        &mut model.clone(),
-        &tokenizer,
-        &mel,
-    )?);
-    let mut model = model.clone();
-    debug!("Initializing decoder");
-    let mut dc = Decoder::new(
-        &mut model,
-        tokenizer,
-        42,
-        &device,
-        language_token,
-        Some(Task::Transcribe),
-        true,
-        false,
-    )?;
-    debug!("Starting decoding process");
-    let segments = dc.run(&mel)?;
-    debug!("Decoding complete");
-    Ok(segments
-        .iter()
-        .map(|s| s.dr.text.clone())
-        .collect::<Vec<String>>()
-        .join("\n"))
+    if cloud_audio {
+        // Deepgram implementation
+        let api_key = get_deepgram_api_key();
+        match transcribe_with_deepgram(&api_key, &speech_frames) {
+            Ok(transcription) => Ok(transcription),
+            Err(e) => {
+                error!("Deepgram transcription failed, falling back to Whisper: {:?}", e);
+                // Existing Whisper implementation
+                debug!("Converting PCM to mel spectrogram");
+                let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
+                let mel_len = mel.len();
+                debug!("Creating tensor from mel spectrogram");
+                let mel = Tensor::from_vec(
+                    mel,
+                    (
+                        1,
+                        model.config().num_mel_bins,
+                        mel_len / model.config().num_mel_bins,
+                    ),
+                    &device,
+                )?;
+
+                debug!("Detecting language");
+                let language_token = Some(multilingual::detect_language(
+                    &mut model.clone(),
+                    &tokenizer,
+                    &mel,
+                )?);
+                let mut model = model.clone();
+                debug!("Initializing decoder");
+                let mut dc = Decoder::new(
+                    &mut model,
+                    tokenizer,
+                    42,
+                    &device,
+                    language_token,
+                    Some(Task::Transcribe),
+                    true,
+                    false,
+                )?;
+                debug!("Starting decoding process");
+                let segments = dc.run(&mel)?;
+                debug!("Decoding complete");
+                Ok(segments
+                    .iter()
+                    .map(|s| s.dr.text.clone())
+                    .collect::<Vec<String>>()
+                    .join("\n"))
+            }
+        }
+    } else {
+        // Existing Whisper implementation
+        debug!("Starting Whisper transcription");
+        debug!("Converting PCM to mel spectrogram");
+        let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
+        let mel_len = mel.len();
+        debug!("Creating tensor from mel spectrogram");
+        let mel = Tensor::from_vec(
+            mel,
+            (
+                1,
+                model.config().num_mel_bins,
+                mel_len / model.config().num_mel_bins,
+            ),
+            &device,
+        )?;
+
+        debug!("Detecting language");
+        let language_token = Some(multilingual::detect_language(
+            &mut model.clone(),
+            &tokenizer,
+            &mel,
+        )?);
+        let mut model = model.clone();
+        debug!("Initializing decoder");
+        let mut dc = Decoder::new(
+            &mut model,
+            tokenizer,
+            42,
+            &device,
+            language_token,
+            Some(Task::Transcribe),
+            true,
+            false,
+        )?;
+        debug!("Starting decoding process");
+        let segments = dc.run(&mel)?;
+        debug!("Decoding complete");
+        Ok(segments
+            .iter()
+            .map(|s| s.dr.text.clone())
+            .collect::<Vec<String>>()
+            .join("\n"))
+    }
 }
 
 fn resample(input: Vec<f32>, from_sample_rate: u32, to_sample_rate: u32) -> Result<Vec<f32>> {
@@ -545,7 +673,7 @@ pub struct TranscriptionResult {
     pub timestamp: u64,
     pub error: Option<String>,
 }
-pub async fn create_whisper_channel() -> Result<(
+pub async fn create_whisper_channel(cloud_audio: bool) -> Result<(
     UnboundedSender<AudioInput>,
     UnboundedReceiver<TranscriptionResult>,
 )> {
@@ -568,7 +696,7 @@ pub async fn create_whisper_channel() -> Result<(
                 .expect("Time went backwards")
                 .as_secs();
 
-            let result = stt(&input.path, &whisper_model);
+            let result = stt(&input.path, &whisper_model, cloud_audio);
 
             let transcription_result = match result {
                 Ok(transcription) => TranscriptionResult {

From 54c7149464c1479fad9d281fc54298e175d4d3f8 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:19:09 -0700
Subject: [PATCH 7/9] tests

---
 screenpipe-audio/tests/core_tests.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/screenpipe-audio/tests/core_tests.rs b/screenpipe-audio/tests/core_tests.rs
index 5e50c80e..070db7d2 100644
--- a/screenpipe-audio/tests/core_tests.rs
+++ b/screenpipe-audio/tests/core_tests.rs
@@ -43,12 +43,13 @@ mod tests {
     fn test_speech_to_text() {
         setup();
         println!("Starting speech to text test");
-
+a
         println!("Loading audio file");
         let start = std::time::Instant::now();
         let whisper_model = WhisperModel::new().unwrap();
+        let cloud_audio = true; // Set this based on your test requirements
 
-        let text = stt("./test_data/selah.mp4", &whisper_model).unwrap();
+        let text = stt("./test_data/selah.mp4", &whisper_model, cloud_audio).unwrap();
 
         let duration = start.elapsed();
         println!("Speech to text completed in {:?}", duration);

From da4d608a9d878c9bb25a4860708285fe5e844f8e Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:25:51 -0700
Subject: [PATCH 8/9] tests2

---
 screenpipe-audio/tests/core_tests.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/screenpipe-audio/tests/core_tests.rs b/screenpipe-audio/tests/core_tests.rs
index 070db7d2..de81b69d 100644
--- a/screenpipe-audio/tests/core_tests.rs
+++ b/screenpipe-audio/tests/core_tests.rs
@@ -43,7 +43,6 @@ mod tests {
     fn test_speech_to_text() {
         setup();
         println!("Starting speech to text test");
-a
         println!("Loading audio file");
         let start = std::time::Instant::now();
         let whisper_model = WhisperModel::new().unwrap();
@@ -224,7 +223,7 @@ mod tests {
         let output_path = PathBuf::from(format!("test_output_{}.mp4", Utc::now().timestamp_millis()));
         let output_path_2 = output_path.clone();
 
-        let (whisper_sender, mut whisper_receiver) = create_whisper_channel().await.unwrap();
+        let (whisper_sender, mut whisper_receiver) = create_whisper_channel(cloud_audio).await.unwrap();
         let is_running = Arc::new(AtomicBool::new(true));
 
         // Start recording in a separate thread
         let recording_thread = tokio::spawn(async move {

From f5cecb5e7a1f75c80b3c186f9c99ae01b80b2f35 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:47:30 -0700
Subject: [PATCH 9/9] tests3

---
 screenpipe-audio/tests/core_tests.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/screenpipe-audio/tests/core_tests.rs b/screenpipe-audio/tests/core_tests.rs
index de81b69d..a1e39488 100644
--- a/screenpipe-audio/tests/core_tests.rs
+++ b/screenpipe-audio/tests/core_tests.rs
@@ -3,7 +3,7 @@ mod tests {
     use chrono::Utc;
     use log::{debug, LevelFilter};
     use screenpipe_audio::{default_output_device, list_audio_devices, stt, WhisperModel};
-    use screenpipe_audio::{parse_audio_device, record_and_transcribe};
+    use screenpipe_audio::{parse_audio_device, record_and_transcribe, create_whisper_channel};
     use std::path::PathBuf;
     use std::process::Command;
     use std::str::FromStr;
@@ -11,6 +11,7 @@ mod tests {
     use std::sync::Arc;
     use std::time::{Duration, Instant};
     use tokio::sync::mpsc::unbounded_channel;
+    use tokio::time::timeout;
 
     fn setup() {
         // Initialize the logger with an info level filter
@@ -224,6 +224,7 @@ mod tests {
         let output_path = PathBuf::from(format!("test_output_{}.mp4", Utc::now().timestamp_millis()));
         let output_path_2 = output_path.clone();
+        let cloud_audio = true; // Set this based on your test requirements
 
         let (whisper_sender, mut whisper_receiver) = create_whisper_channel(cloud_audio).await.unwrap();
         let is_running = Arc::new(AtomicBool::new(true));
 
         // Start recording in a separate thread
         let recording_thread = tokio::spawn(async move {
@@ -285,4 +287,4 @@ mod tests {
         let _ = recording_thread.abort();
         std::fs::remove_file(output_path_2).unwrap_or_default();
     }
-}
+}
\ No newline at end of file
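
usage sketch for the flag added in this series (for reference only: `--cloud-audio-off` comes from PATCH 1 and 3-4, `--debug` is the existing flag shown earlier in the README, the fallback behavior is the one wired up in PATCH 6, and the exact invocations are illustrative):

```bash
# default: audio is transcribed in the cloud with deepgram nova-2,
# falling back to local whisper if the api call fails
screenpipe

# opt out of cloud transcription and run whisper-tiny locally
screenpipe --cloud-audio-off

# flags can be combined, e.g. local transcription plus debug logging
screenpipe --debug --cloud-audio-off
```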