From f7cabe8c8cc345baff4646252d2fa7048e9fcb3a Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 19:59:34 -0700
Subject: [PATCH 1/9] new flag for cloud audio

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index b543b5fa..11a1f31d 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,10 @@ if you want to run screenpipe in debug mode to show more logs in terminal:
 ```bash
 screenpipe --debug
 ```
+by default screenpipe uses the deepgram nova-2 speech-to-text model via the cloud api. to use whisper-tiny, which runs locally, add this flag:
+```bash
+screenpipe --cloud-audio-off
+```
 
 you can combine multiple flags if needed

From 282dca7969cb7637d794a9c84bac85a635e66d55 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 19:59:59 -0700
Subject: [PATCH 2/9] add reqwest dependency for deepgram api calls

---
 screenpipe-audio/Cargo.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/screenpipe-audio/Cargo.toml b/screenpipe-audio/Cargo.toml
index 5c19e2ca..9077ede2 100644
--- a/screenpipe-audio/Cargo.toml
+++ b/screenpipe-audio/Cargo.toml
@@ -63,6 +63,9 @@ tokio = { workspace = true }
 # Detect speech/silence
 webrtc-vad = "0.4.0"
 
+# Deepgram
+reqwest = { version = "0.12.5", features = ["json", "blocking"] }
+
 screenpipe-core = { path = "../screenpipe-core" }
 
 [dev-dependencies]

From 12f75bbfe24e63e65ad4cb13f5ce379a8079813a Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:06:33 -0700
Subject: [PATCH 3/9] passing cli flag

---
 screenpipe-audio/src/bin/screenpipe-audio.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/screenpipe-audio/src/bin/screenpipe-audio.rs b/screenpipe-audio/src/bin/screenpipe-audio.rs
index 15729358..7c1fcb47 100644
--- a/screenpipe-audio/src/bin/screenpipe-audio.rs
+++ b/screenpipe-audio/src/bin/screenpipe-audio.rs
@@ -25,6 +25,9 @@ struct Args {
 
     #[clap(long, help = "List available audio devices")]
     list_audio_devices: bool,
+
+    #[clap(long, help = "Disable cloud audio processing")]
+    cloud_audio_off: bool,
 }
 
 fn print_devices(devices: &[AudioDevice]) {
@@ -74,7 +77,8 @@ async fn main() -> Result<()> {
     let chunk_duration = Duration::from_secs(5);
     let output_path = PathBuf::from("output.mp4");
 
-    let (whisper_sender, mut whisper_receiver) = create_whisper_channel().await?;
+    let cloud_audio = !args.cloud_audio_off;
+    let (whisper_sender, mut whisper_receiver) = create_whisper_channel(cloud_audio).await?;
     // Spawn threads for each device
     let recording_threads: Vec<_> = devices
         .into_iter()
@@ -128,4 +132,4 @@
     }
 
     Ok(())
-}
+}
\ No newline at end of file

From 3090ae562e43b9bbff3bddbbea7f94913e824fcf Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:06:47 -0700
Subject: [PATCH 4/9] passing cli flag

---
 screenpipe-server/src/bin/screenpipe-server.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/screenpipe-server/src/bin/screenpipe-server.rs b/screenpipe-server/src/bin/screenpipe-server.rs
index ac479803..e64cdb09 100644
--- a/screenpipe-server/src/bin/screenpipe-server.rs
+++ b/screenpipe-server/src/bin/screenpipe-server.rs
@@ -83,6 +83,10 @@ struct Cli {
     /// Save text files
     #[arg(long, default_value_t = false)]
     save_text_files: bool,
+
+    /// Disable cloud audio processing
+    #[arg(long, default_value_t = false)]
+    cloud_audio_off: bool,
 }
 
 fn get_base_dir(custom_path: Option<String>) -> anyhow::Result<PathBuf> {
@@ -292,6 +296,7 @@ async fn main() -> anyhow::Result<()> {
         vision_control,
         audio_devices_control,
         cli.save_text_files,
+        !cli.cloud_audio_off, // Pass the cloud_audio flag
     )
     .await;
 
@@ -344,4 +349,4 @@ async fn main() -> anyhow::Result<()> {
     loop {
         tokio::time::sleep(Duration::from_secs(1)).await;
     }
-}
+}
\ No newline at end of file

From aa5be47ffbc5aae392199256e7fd2bb014d4179f Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:07:22 -0700
Subject: [PATCH 5/9] passing cli flag

---
 screenpipe-server/src/core.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/screenpipe-server/src/core.rs b/screenpipe-server/src/core.rs
index 426a51e1..7f2a4082 100644
--- a/screenpipe-server/src/core.rs
+++ b/screenpipe-server/src/core.rs
@@ -49,10 +49,11 @@ pub async fn start_continuous_recording(
     vision_control: Arc<AtomicBool>,
     audio_devices_control: Arc<SegQueue<(AudioDevice, DeviceControl)>>,
     save_text_files: bool,
+    cloud_audio: bool, // Added cloud_audio parameter
 ) -> Result<()> {
     info!("Recording now");
 
-    let (whisper_sender, whisper_receiver) = create_whisper_channel().await?;
+    let (whisper_sender, whisper_receiver) = create_whisper_channel(cloud_audio).await?; // Pass cloud_audio
 
     let db_manager_video = Arc::clone(&db);
     let db_manager_audio = Arc::clone(&db);
@@ -320,4 +321,4 @@ async fn process_audio_result(db: &DatabaseManager, result: TranscriptionResult)
             result.input.device, e
         ),
     }
-}
+}
\ No newline at end of file

From 80845e048da9ab41f686989aa3f2cbb56917567f Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:07:56 -0700
Subject: [PATCH 6/9] deepgram cloud audio transcription integration with fallback to whisper

---
 screenpipe-audio/src/stt.rs | 224 ++++++++++++++++++++++++++++--------
 1 file changed, 176 insertions(+), 48 deletions(-)

diff --git a/screenpipe-audio/src/stt.rs b/screenpipe-audio/src/stt.rs
index 5021a546..7cdd977e 100644
--- a/screenpipe-audio/src/stt.rs
+++ b/screenpipe-audio/src/stt.rs
@@ -18,6 +18,9 @@ use crate::{multilingual, pcm_decode::pcm_decode};
 
 use webrtc_vad::{Vad, VadMode};
 
+use hound::{WavSpec, WavWriter};
+use std::io::Cursor;
+
 #[derive(Clone)]
 pub struct WhisperModel {
     pub model: Model,
@@ -399,7 +402,79 @@ enum Task {
     Translate,
 }
 
-pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
+use reqwest::blocking::Client;
+use serde_json::Value;
+
+// NOTE: hardcoded API key; this should be loaded from config or an environment variable
+fn get_deepgram_api_key() -> String {
+    "7ed2a159a094337b01fd8178b914b7ae0e77822d".to_string()
+}
+
+fn transcribe_with_deepgram(api_key: &str, audio_data: &[f32]) -> Result<String> {
+    debug!("Starting Deepgram transcription");
+    let client = Client::new();
+
+    // Create a WAV file in memory
+    let mut cursor = Cursor::new(Vec::new());
+    {
+        let spec = WavSpec {
+            channels: 1,
+            sample_rate: 16000,
+            bits_per_sample: 32,
+            sample_format: hound::SampleFormat::Float,
+        };
+        let mut writer = WavWriter::new(&mut cursor, spec)?;
+        for &sample in audio_data {
+            writer.write_sample(sample)?;
+        }
+        writer.finalize()?;
+    }
+
+    // Get the WAV data from the cursor
+    let wav_data = cursor.into_inner();
+
+    let response = client.post("https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true")
+        .header("Content-Type", "audio/wav")
+        .header("Authorization", format!("Token {}", api_key))
+        .body(wav_data)
+        .send();
+
+    match response {
+        Ok(resp) => {
+            debug!("Received response from Deepgram API");
+            match resp.json::<Value>() {
+                Ok(result) => {
+                    debug!("Successfully parsed JSON response");
+                    if let Some(err_code) = result.get("err_code") {
+                        error!("Deepgram API error code: {:?}, result: {:?}", err_code, result);
+                        return Err(anyhow::anyhow!("Deepgram API error: {:?}", result));
+                    }
+                    let transcription = result["results"]["channels"][0]["alternatives"][0]["transcript"]
+                        .as_str()
+                        .unwrap_or("");
+
+                    if transcription.is_empty() {
+                        info!("Transcription is empty. Full response: {:?}", result);
+                    } else {
+                        info!("Transcription successful. Length: {} characters", transcription.len());
+                    }
+
+                    Ok(transcription.to_string())
+                },
+                Err(e) => {
+                    error!("Failed to parse JSON response: {:?}", e);
+                    Err(anyhow::anyhow!("Failed to parse JSON response: {:?}", e))
+                }
+            }
+        },
+        Err(e) => {
+            error!("Failed to send request to Deepgram API: {:?}", e);
+            Err(anyhow::anyhow!("Failed to send request to Deepgram API: {:?}", e))
+        }
+    }
+}
+
+pub fn stt(file_path: &str, whisper_model: &WhisperModel, cloud_audio: bool) -> Result<String> {
     debug!("Starting speech to text for file: {}", file_path);
     let model = &whisper_model.model;
     let tokenizer = &whisper_model.tokenizer;
@@ -431,7 +506,7 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
     vad.set_mode(VadMode::VeryAggressive); // Set mode to very aggressive
 
     // Filter out non-speech segments
-    // debug!("VAD: Filtering out non-speech segments");
+    debug!("VAD: Filtering out non-speech segments");
     let frame_size = 160; // 10ms frame size for 16kHz audio
     let mut speech_frames = Vec::new();
     for (frame_index, chunk) in pcm_data.chunks(frame_size).enumerate() {
@@ -440,21 +515,21 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
         match vad.is_voice_segment(&i16_chunk) {
             Ok(is_voice) => {
                 if is_voice {
-                    debug!("VAD: Speech detected in frame {}", frame_index);
+                    // debug!("VAD: Speech detected in frame {}", frame_index);
                     speech_frames.extend_from_slice(chunk);
                 } else {
                     // debug!("VAD: Non-speech frame {} filtered out", frame_index);
                 }
             },
             Err(e) => {
-                error!("VAD failed for frame {}: {:?}", frame_index, e);
+                debug!("VAD failed for frame {}: {:?}", frame_index, e);
                 // Optionally, you can choose to include the frame if VAD fails
                 // speech_frames.extend_from_slice(chunk);
             }
         }
     }
 
-    debug!("Total frames processed: {}, Speech frames: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);
+    info!("Total audio_frames processed: {}, frames that include speech: {}", pcm_data.len() / frame_size, speech_frames.len() / frame_size);
 
     // If no speech frames detected, skip processing
     if speech_frames.is_empty() {
@@ -464,47 +539,100 @@ pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
 
     debug!("Using {} speech frames out of {} total frames", speech_frames.len() / frame_size, pcm_data.len() / frame_size);
 
-    debug!("Converting PCM to mel spectrogram");
-    // let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
-    let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
-    let mel_len = mel.len();
-    debug!("Creating tensor from mel spectrogram");
-    let mel = Tensor::from_vec(
-        mel,
-        (
-            1,
-            model.config().num_mel_bins,
-            mel_len / model.config().num_mel_bins,
-        ),
-        &device,
-    )?;
-
-    debug!("Detecting language");
-    let language_token = Some(multilingual::detect_language(
-        &mut model.clone(),
-        &tokenizer,
-        &mel,
-    )?);
-    let mut model = model.clone();
-    debug!("Initializing decoder");
-    let mut dc = Decoder::new(
-        &mut model,
-        tokenizer,
-        42,
-        &device,
-        language_token,
-        Some(Task::Transcribe),
-        true,
-        false,
-    )?;
-    debug!("Starting decoding process");
-    let segments = dc.run(&mel)?;
-    debug!("Decoding complete");
-    Ok(segments
-        .iter()
-        .map(|s| s.dr.text.clone())
-        .collect::<Vec<String>>()
-        .join("\n"))
+    if cloud_audio {
+        // Deepgram implementation
+        let api_key = get_deepgram_api_key();
+        match transcribe_with_deepgram(&api_key, &speech_frames) {
+            Ok(transcription) => Ok(transcription),
+            Err(e) => {
+                error!("Deepgram transcription failed, falling back to Whisper: {:?}", e);
+                // Existing Whisper implementation
+                debug!("Converting PCM to mel spectrogram");
+                let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
+                let mel_len = mel.len();
+                debug!("Creating tensor from mel spectrogram");
+                let mel = Tensor::from_vec(
+                    mel,
+                    (
+                        1,
+                        model.config().num_mel_bins,
+                        mel_len / model.config().num_mel_bins,
+                    ),
+                    &device,
+                )?;
+
+                debug!("Detecting language");
+                let language_token = Some(multilingual::detect_language(
+                    &mut model.clone(),
+                    &tokenizer,
+                    &mel,
+                )?);
+                let mut model = model.clone();
+                debug!("Initializing decoder");
+                let mut dc = Decoder::new(
+                    &mut model,
+                    tokenizer,
+                    42,
+                    &device,
+                    language_token,
+                    Some(Task::Transcribe),
+                    true,
+                    false,
+                )?;
+                debug!("Starting decoding process");
+                let segments = dc.run(&mel)?;
+                debug!("Decoding complete");
+                Ok(segments
+                    .iter()
+                    .map(|s| s.dr.text.clone())
+                    .collect::<Vec<String>>()
+                    .join("\n"))
+            }
+        }
+    } else {
+        // Existing Whisper implementation
+        debug!("Starting Whisper transcription");
+        debug!("Converting PCM to mel spectrogram");
+        let mel = audio::pcm_to_mel(&model.config(), &speech_frames, &mel_filters);
+        let mel_len = mel.len();
+        debug!("Creating tensor from mel spectrogram");
+        let mel = Tensor::from_vec(
+            mel,
+            (
+                1,
+                model.config().num_mel_bins,
+                mel_len / model.config().num_mel_bins,
+            ),
+            &device,
+        )?;
+
+        debug!("Detecting language");
+        let language_token = Some(multilingual::detect_language(
+            &mut model.clone(),
+            &tokenizer,
+            &mel,
+        )?);
+        let mut model = model.clone();
+        debug!("Initializing decoder");
+        let mut dc = Decoder::new(
+            &mut model,
+            tokenizer,
+            42,
+            &device,
+            language_token,
+            Some(Task::Transcribe),
+            true,
+            false,
+        )?;
+        debug!("Starting decoding process");
+        let segments = dc.run(&mel)?;
+        debug!("Decoding complete");
+        Ok(segments
+            .iter()
+            .map(|s| s.dr.text.clone())
+            .collect::<Vec<String>>()
+            .join("\n"))
+    }
 }
 
 fn resample(input: Vec<f32>, from_sample_rate: u32, to_sample_rate: u32) -> Result<Vec<f32>> {
@@ -545,7 +673,7 @@ pub struct TranscriptionResult {
     pub timestamp: u64,
     pub error: Option<String>,
 }
-pub async fn create_whisper_channel() -> Result<(
+pub async fn create_whisper_channel(cloud_audio: bool) -> Result<(
     UnboundedSender<AudioInput>,
     UnboundedReceiver<TranscriptionResult>,
 )> {
@@ -568,7 +696,7 @@ pub async fn create_whisper_channel() -> Result<(
                 .expect("Time went backwards")
                 .as_secs();
 
-            let result = stt(&input.path, &whisper_model);
+            let result = stt(&input.path, &whisper_model, cloud_audio);
 
             let transcription_result = match result {
                 Ok(transcription) => TranscriptionResult {

From 54c7149464c1479fad9d281fc54298e175d4d3f8 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:19:09 -0700
Subject: [PATCH 7/9] tests

---
 screenpipe-audio/tests/core_tests.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/screenpipe-audio/tests/core_tests.rs b/screenpipe-audio/tests/core_tests.rs
index 5e50c80e..070db7d2 100644
--- a/screenpipe-audio/tests/core_tests.rs
+++ b/screenpipe-audio/tests/core_tests.rs
@@ -43,12 +43,13 @@ mod tests {
     fn test_speech_to_text() {
         setup();
         println!("Starting speech to text test");
-
+a
         println!("Loading audio file");
         let start = std::time::Instant::now();
         let whisper_model = WhisperModel::new().unwrap();
+        let cloud_audio = true; // Set this based on your test requirements
 
-        let text = stt("./test_data/selah.mp4", &whisper_model).unwrap();
+        let text = stt("./test_data/selah.mp4", &whisper_model, cloud_audio).unwrap();
 
         let duration = start.elapsed();
         println!("Speech to text completed in {:?}", duration);

From da4d608a9d878c9bb25a4860708285fe5e844f8e Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:25:51 -0700
Subject: [PATCH 8/9] tests2

---
 screenpipe-audio/tests/core_tests.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/screenpipe-audio/tests/core_tests.rs b/screenpipe-audio/tests/core_tests.rs
index 070db7d2..de81b69d 100644
--- a/screenpipe-audio/tests/core_tests.rs
+++ b/screenpipe-audio/tests/core_tests.rs
@@ -43,7 +43,6 @@ mod tests {
     fn test_speech_to_text() {
         setup();
         println!("Starting speech to text test");
-a
         println!("Loading audio file");
         let start = std::time::Instant::now();
         let whisper_model = WhisperModel::new().unwrap();
@@ -224,7 +223,7 @@ mod tests {
         let output_path = PathBuf::from(format!("test_output_{}.mp4", Utc::now().timestamp_millis()));
         let output_path_2 = output_path.clone();
 
-        let (whisper_sender, mut whisper_receiver) = create_whisper_channel().await.unwrap();
+        let (whisper_sender, mut whisper_receiver) = create_whisper_channel(cloud_audio).await.unwrap();
         let is_running = Arc::new(AtomicBool::new(true));
 
         // Start recording in a separate thread
         let recording_thread = tokio::spawn(async move {

From f5cecb5e7a1f75c80b3c186f9c99ae01b80b2f35 Mon Sep 17 00:00:00 2001
From: matthew-heartful
Date: Wed, 31 Jul 2024 20:47:30 -0700
Subject: [PATCH 9/9] tests3

---
 screenpipe-audio/tests/core_tests.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/screenpipe-audio/tests/core_tests.rs b/screenpipe-audio/tests/core_tests.rs
index de81b69d..a1e39488 100644
--- a/screenpipe-audio/tests/core_tests.rs
+++ b/screenpipe-audio/tests/core_tests.rs
@@ -3,7 +3,7 @@ mod tests {
     use chrono::Utc;
     use log::{debug, LevelFilter};
     use screenpipe_audio::{default_output_device, list_audio_devices, stt, WhisperModel};
-    use screenpipe_audio::{parse_audio_device, record_and_transcribe};
+    use screenpipe_audio::{parse_audio_device, record_and_transcribe, create_whisper_channel};
     use std::path::PathBuf;
     use std::process::Command;
     use std::str::FromStr;
@@ -11,6 +11,7 @@ mod tests {
     use std::sync::Arc;
     use std::time::{Duration, Instant};
     use tokio::sync::mpsc::unbounded_channel;
+    use tokio::time::timeout;
 
     fn setup() {
         // Initialize the logger with an info level filter
@@ -224,6 +224,7 @@ mod tests {
         let output_path = PathBuf::from(format!("test_output_{}.mp4", Utc::now().timestamp_millis()));
         let output_path_2 = output_path.clone();
+        let cloud_audio = true; // Set this based on your test requirements
 
         let (whisper_sender, mut whisper_receiver) = create_whisper_channel(cloud_audio).await.unwrap();
         let is_running = Arc::new(AtomicBool::new(true));
 
         // Start recording in a separate thread
         let recording_thread = tokio::spawn(async move {
@@ -285,4 +287,4 @@ mod tests {
         let _ = recording_thread.abort();
         std::fs::remove_file(output_path_2).unwrap_or_default();
     }
-}
+}
\ No newline at end of file
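
usage sketch for the flag added in this series (for reference only: `--cloud-audio-off` comes from PATCH 1 and 3-4, `--debug` is the existing flag shown earlier in the README, the fallback behavior is the one wired up in PATCH 6, and the exact invocations are illustrative):

```bash
# default: audio is transcribed in the cloud with deepgram nova-2,
# falling back to local whisper if the api call fails
screenpipe

# opt out of cloud transcription and run whisper-tiny locally
screenpipe --cloud-audio-off

# flags can be combined, e.g. local transcription plus debug logging
screenpipe --debug --cloud-audio-off
```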