diff --git a/Cargo.toml b/Cargo.toml
index 8940212b..74a7c067 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,4 +17,4 @@ candle = { package = "candle-core", version = "0.6.0" }
 candle-nn = { package = "candle-nn", version = "0.6.0" }
 candle-transformers = { package = "candle-transformers", version = "0.6.0" }
 tokenizers = "0.19.1"
-
+tracing = "0.1.37"
diff --git a/screenpipe-audio/Cargo.toml b/screenpipe-audio/Cargo.toml
index 105c2886..ea7b1c14 100644
--- a/screenpipe-audio/Cargo.toml
+++ b/screenpipe-audio/Cargo.toml
@@ -50,6 +50,12 @@ env_logger = "0.10"
 
 # File
 tempfile = "3"
 
+# Tracing
+tracing = { workspace = true }
+
+# Concurrency
+crossbeam = "0.8"
+
 [dev-dependencies]
 tempfile = "3.3.0"
diff --git a/screenpipe-audio/src/bin/screenpipe-audio.rs b/screenpipe-audio/src/bin/screenpipe-audio.rs
index c0186d33..7465e263 100644
--- a/screenpipe-audio/src/bin/screenpipe-audio.rs
+++ b/screenpipe-audio/src/bin/screenpipe-audio.rs
@@ -1,6 +1,9 @@
 use anyhow::{anyhow, Result};
 use clap::Parser;
 use log::info;
+use screenpipe_audio::create_whisper_channel;
+use screenpipe_audio::default_input_device;
+use screenpipe_audio::default_output_device;
 use screenpipe_audio::list_audio_devices;
 use screenpipe_audio::parse_device_spec;
 use screenpipe_audio::record_and_transcribe;
@@ -12,8 +15,12 @@ use std::time::Duration;
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
 struct Args {
-    #[clap(short, long, help = "Audio device name")]
-    audio_device: Option<String>,
+    #[clap(
+        short,
+        long,
+        help = "Audio device name (can be specified multiple times)"
+    )]
+    audio_device: Vec<String>,
 
     #[clap(long, help = "List available audio devices")]
     list_audio_devices: bool,
@@ -45,51 +52,58 @@ fn main() -> Result<()> {
         return Ok(());
     }
 
-    let device = match args.audio_device {
-        Some(d) => parse_device_spec(&d).unwrap(),
-        None => {
-            if devices.is_empty() {
-                return Err(anyhow!("No audio input devices found"));
-            }
-            eprintln!("No audio device specified. Available devices are:");
-            print_devices(&devices);
-            eprintln!("\nPlease specify one or more devices with:");
-            eprintln!(
-                "  {} --audio-device \"Device Name (input)\" [--audio-device \"Another Device (output)\"]",
-                std::env::args().next().unwrap()
-            );
-            return Err(anyhow!("No device specified"));
-        }
+    let devices = if args.audio_device.is_empty() {
+        vec![default_input_device()?, default_output_device()?]
+    } else {
+        args.audio_device
+            .iter()
+            .map(|d| parse_device_spec(d))
+            .collect::<Result<Vec<_>>>()?
     };
 
-    let (result_tx, result_rx) = mpsc::channel();
+    if devices.is_empty() {
+        return Err(anyhow!("No audio input devices found"));
+    }
+
     let chunk_duration = Duration::from_secs(30);
     let output_path = PathBuf::from("output.wav");
 
-    // Spawn a thread to handle the recording and transcription
-    let recording_thread = thread::spawn(move || {
-        record_and_transcribe(&device, chunk_duration, result_tx, output_path)
-    });
+    let (whisper_sender, whisper_receiver) = create_whisper_channel()?;
+
+    // Spawn threads for each device
+    let recording_threads: Vec<_> = devices
+        .into_iter()
+        .enumerate()
+        .map(|(i, device)| {
+            let whisper_sender = whisper_sender.clone();
+            let output_path = output_path.with_file_name(format!("output_{}.wav", i));
+            thread::spawn(move || {
+                record_and_transcribe(&device, chunk_duration, output_path, whisper_sender)
+            })
+        })
+        .collect();
 
     // Main loop to receive and print transcriptions
     loop {
-        match result_rx.recv_timeout(Duration::from_secs(5)) {
+        match whisper_receiver.recv_timeout(Duration::from_secs(5)) {
             Ok(result) => {
-                info!("Transcription: {}", result.text);
+                info!("Transcription: {:?}", result);
             }
-            Err(mpsc::RecvTimeoutError::Timeout) => {
+            Err(crossbeam::channel::RecvTimeoutError::Timeout) => {
                 // No transcription received in 5 seconds, continue waiting
                 continue;
             }
-            Err(mpsc::RecvTimeoutError::Disconnected) => {
-                // Sender has been dropped, recording is complete
+            Err(crossbeam::channel::RecvTimeoutError::Disconnected) => {
+                // All senders have been dropped, recording is complete
                 break;
            }
         }
     }
 
-    // Wait for the recording thread to finish
-    let file_path = recording_thread.join().unwrap()?;
-    println!("Recording complete: {:?}", file_path);
+    // Wait for all recording threads to finish
+    for (i, thread) in recording_threads.into_iter().enumerate() {
+        let file_path = thread.join().unwrap()?;
+        println!("Recording {} complete: {:?}", i, file_path);
+    }
 
     Ok(())
 }
diff --git a/screenpipe-audio/src/core.rs b/screenpipe-audio/src/core.rs
index 464c7355..8431abd3 100644
--- a/screenpipe-audio/src/core.rs
+++ b/screenpipe-audio/src/core.rs
@@ -1,18 +1,18 @@
 use anyhow::{anyhow, Result};
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
 use cpal::{FromSample, Sample};
+use crossbeam::channel::Sender;
 use hound::WavSpec;
 use log::{error, info};
 use serde::Serialize;
 use std::fmt;
 use std::fs::File;
 use std::path::PathBuf;
-use std::sync::mpsc::Sender;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 use std::{io::BufWriter, thread};
 
-use crate::stt::stt;
+use crate::AudioInput;
 
 pub struct AudioCaptureResult {
     pub text: String,
@@ -83,8 +83,8 @@ pub fn parse_device_spec(name: &str) -> Result<DeviceSpec> {
 pub fn record_and_transcribe(
     device_spec: &DeviceSpec,
     duration: Duration,
-    result_tx: Sender<AudioCaptureResult>,
     output_path: PathBuf,
+    whisper_sender: Sender<AudioInput>,
 ) -> Result<PathBuf> {
     let host = match device_spec {
         #[cfg(target_os = "macos")]
@@ -94,7 +94,7 @@ pub fn record_and_transcribe(
 
     info!("device: {:?}", device_spec.to_string());
 
-    let device = if device_spec.to_string() == "default" {
+    let audio_device = if device_spec.to_string() == "default" {
         host.default_input_device()
     } else {
         host.input_devices()?.find(|x| {
@@ -108,10 +108,14 @@ pub fn record_and_transcribe(
             .unwrap_or(false)
         })
     }
-    .ok_or_else(|| anyhow!("Device not found"))?;
+    .ok_or_else(|| anyhow!("Audio device not found"))?;
 
-    let config = device.default_input_config()?;
-    info!("Recording device: {}, Config: {:?}", device.name()?, config);
+    let config = audio_device.default_input_config()?;
info!( + "Recording audio device: {}, Config: {:?}", + audio_device.name()?, + config + ); let spec = wav_spec_from_config(&config); let writer = hound::WavWriter::create(&output_path, spec)?; @@ -120,25 +124,25 @@ pub fn record_and_transcribe( let err_fn = |err| error!("An error occurred on the audio stream: {}", err); let stream = match config.sample_format() { - cpal::SampleFormat::I8 => device.build_input_stream( + cpal::SampleFormat::I8 => audio_device.build_input_stream( &config.into(), move |data, _: &_| write_input_data::(data, &writer_2), err_fn, None, )?, - cpal::SampleFormat::I16 => device.build_input_stream( + cpal::SampleFormat::I16 => audio_device.build_input_stream( &config.into(), move |data, _: &_| write_input_data::(data, &writer_2), err_fn, None, )?, - cpal::SampleFormat::I32 => device.build_input_stream( + cpal::SampleFormat::I32 => audio_device.build_input_stream( &config.into(), move |data, _: &_| write_input_data::(data, &writer_2), err_fn, None, )?, - cpal::SampleFormat::F32 => device.build_input_stream( + cpal::SampleFormat::F32 => audio_device.build_input_stream( &config.into(), move |data, _: &_| write_input_data::(data, &writer_2), err_fn, @@ -169,15 +173,11 @@ pub fn record_and_transcribe( if let Some(writer) = writer_guard.as_mut() { writer.flush()?; - // Transcribe the current audio chunk - match stt(output_path.to_str().unwrap()) { - Ok(transcription) => { - result_tx.send(AudioCaptureResult { - text: transcription, - })?; - } - Err(e) => error!("Transcription failed: {}", e), - } + // Send the file path to the whisper channel + whisper_sender.send(AudioInput { + path: output_path.to_str().unwrap().to_string(), + device: device_spec.to_string(), + })?; } } @@ -195,14 +195,11 @@ pub fn record_and_transcribe( if let Some(writer) = writer_guard.as_mut() { writer.flush()?; - match stt(output_path.to_str().unwrap()) { - Ok(transcription) => { - result_tx.send(AudioCaptureResult { - text: transcription, - })?; - } - Err(e) => error!("Final transcription failed: {}", e), - } + // Send the file path to the whisper channel + whisper_sender.send(AudioInput { + path: output_path.to_str().unwrap().to_string(), + device: device_spec.to_string(), + })?; } } diff --git a/screenpipe-audio/src/lib.rs b/screenpipe-audio/src/lib.rs index 04729f80..d5f6907c 100644 --- a/screenpipe-audio/src/lib.rs +++ b/screenpipe-audio/src/lib.rs @@ -2,8 +2,8 @@ mod core; mod multilingual; mod pcm_decode; mod stt; - pub use core::{ default_input_device, default_output_device, list_audio_devices, parse_device_spec, record_and_transcribe, AudioCaptureResult, AudioDevice, DeviceSpec, }; +pub use stt::{create_whisper_channel, AudioInput, TranscriptionResult}; diff --git a/screenpipe-audio/src/multilingual.rs b/screenpipe-audio/src/multilingual.rs index 78f3c517..cdd73b21 100644 --- a/screenpipe-audio/src/multilingual.rs +++ b/screenpipe-audio/src/multilingual.rs @@ -3,6 +3,8 @@ use candle_transformers::models::whisper::SOT_TOKEN; use log::info; use tokenizers::Tokenizer; +use crate::stt::Model; + const LANGUAGES: [(&str, &str); 99] = [ ("en", "english"), ("zh", "chinese"), @@ -107,7 +109,7 @@ const LANGUAGES: [(&str, &str); 99] = [ /// Returns the token id for the selected language. 
 pub fn detect_language(
-    model: &mut super::stt::Model,
+    model: &mut Model,
     tokenizer: &Tokenizer,
     mel: &Tensor,
 ) -> Result<u32> {
diff --git a/screenpipe-audio/src/stt.rs b/screenpipe-audio/src/stt.rs
index 4ee7988b..35b7b9e8 100644
--- a/screenpipe-audio/src/stt.rs
+++ b/screenpipe-audio/src/stt.rs
@@ -1,18 +1,67 @@
+use std::{
+    thread,
+    time::{SystemTime, UNIX_EPOCH},
+};
+
 use anyhow::{Error as E, Result};
 use candle::{Device, IndexOp, Tensor};
 use candle_nn::ops::softmax;
+use crossbeam::channel::{self, Receiver, Sender};
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use log::{error, info};
 use rand::{distributions::Distribution, SeedableRng};
 use tokenizers::Tokenizer;
 
-use candle_transformers::models::whisper::{self as m, audio, Config};
+use candle_transformers::{
+    models::whisper::{self as m, audio, Config},
+    quantized_var_builder::VarBuilder,
+};
 use rubato::{
     Resampler, SincFixedIn, SincInterpolationParameters, SincInterpolationType, WindowFunction,
 };
 
 use crate::{multilingual, pcm_decode::pcm_decode};
 
+#[derive(Clone)]
+pub struct WhisperModel {
+    pub model: Model,
+    pub tokenizer: Tokenizer,
+    pub device: Device,
+}
+
+impl WhisperModel {
+    pub fn new() -> Result<Self> {
+        let device = Device::new_metal(0).unwrap_or(Device::new_cuda(0).unwrap_or(Device::Cpu));
+        info!("device = {:?}", device);
+
+        let (config_filename, tokenizer_filename, weights_filename) = {
+            let api = Api::new()?;
+            let repo = api.repo(Repo::with_revision(
+                "lmz/candle-whisper".to_string(),
+                RepoType::Model,
+                "main".to_string(),
+            ));
+            let config = repo.get("config-tiny.json")?;
+            let tokenizer = repo.get("tokenizer-tiny.json")?;
+            let model = repo.get("model-tiny-q80.gguf")?;
+            (config, tokenizer, model)
+        };
+
+        let config: Config = serde_json::from_str(&std::fs::read_to_string(config_filename)?)?;
+        let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
+
+        let vb = VarBuilder::from_gguf(weights_filename, &device)?;
+        let model = Model::Quantized(m::quantized_model::Whisper::load(&vb, config)?);
+
+        Ok(Self {
+            model,
+            tokenizer,
+            device,
+        })
+    }
+}
+
+#[derive(Debug, Clone)]
 pub enum Model {
     Normal(m::model::Whisper),
     Quantized(m::quantized_model::Whisper),
@@ -70,13 +119,13 @@ struct Segment {
     dr: DecodingResult,
 }
 
-struct Decoder {
-    model: Model,
+struct Decoder<'a> {
+    model: &'a mut Model,
     rng: rand::rngs::StdRng,
     task: Option<Task>,
     timestamps: bool,
     verbose: bool,
-    tokenizer: Tokenizer,
+    tokenizer: &'a Tokenizer,
     suppress_tokens: Tensor,
     sot_token: u32,
     transcribe_token: u32,
@@ -87,11 +136,11 @@
     language_token: Option<u32>,
 }
 
-impl Decoder {
+impl<'a> Decoder<'a> {
     #[allow(clippy::too_many_arguments)]
     fn new(
-        model: Model,
-        tokenizer: Tokenizer,
+        model: &'a mut Model,
+        tokenizer: &'a Tokenizer,
         seed: u64,
         device: &Device,
         language_token: Option<u32>,
@@ -142,12 +191,11 @@
     }
 
     fn decode(&mut self, mel: &Tensor, t: f64) -> Result<DecodingResult> {
-        let model = &mut self.model;
-        let audio_features = model.encoder_forward(mel, true)?;
+        let audio_features = self.model.encoder_forward(mel, true)?;
         if self.verbose {
             info!("audio features: {:?}", audio_features.dims());
         }
-        let sample_len = model.config().max_target_positions / 2;
+        let sample_len = self.model.config().max_target_positions / 2;
         let mut no_speech_prob = f64::NAN;
         let mut tokens = vec![self.sot_token];
         if let Some(language_token) = self.language_token {
@@ -167,17 +215,20 @@
         for i in 0..sample_len {
             let tokens_t = Tensor::new(tokens.as_slice(), mel.device())?;
             let tokens_t = tokens_t.unsqueeze(0)?;
-            let ys = model.decoder_forward(&tokens_t, &audio_features, i == 0)?;
+            let ys = self
+                .model
+                .decoder_forward(&tokens_t, &audio_features, i == 0)?;
             if i == 0 {
-                let logits = model.decoder_final_linear(&ys.i(..1)?)?.i(0)?.i(0)?;
+                let logits = self.model.decoder_final_linear(&ys.i(..1)?)?.i(0)?.i(0)?;
                 no_speech_prob = softmax(&logits, 0)?
                     .i(self.no_speech_token as usize)?
                     .to_scalar::<f32>()? as f64;
             }
             let (_, seq_len, _) = ys.dims3()?;
-            let logits = model
+            let logits = self
+                .model
                 .decoder_final_linear(&ys.i((..1, seq_len - 1..))?)?
                 .i(0)?
                 .i(0)?;
@@ -213,7 +264,9 @@
 
             sum_logprob += prob.ln();
 
-            if next_token == self.eot_token || tokens.len() > model.config().max_target_positions {
+            if next_token == self.eot_token
+                || tokens.len() > self.model.config().max_target_positions
+            {
                 break;
             }
 
@@ -343,31 +396,13 @@ enum Task {
     Translate,
 }
 
-pub fn stt(input: &str) -> Result<String> {
+pub fn stt(file_path: &str, whisper_model: &WhisperModel) -> Result<String> {
     info!("Starting speech to text");
-    let device = Device::new_metal(0).unwrap_or(Device::new_cuda(0).unwrap_or(Device::Cpu));
-    info!("device = {:?}", device);
-    let (default_model, default_revision) = ("lmz/candle-whisper", "main");
-    let default_model = default_model.to_string();
-    let default_revision = default_revision.to_string();
-
-    let (config_filename, tokenizer_filename, weights_filename, input) = {
-        let api = Api::new()?;
-        let repo = api.repo(Repo::with_revision(
-            default_model,
-            RepoType::Model,
-            default_revision,
-        ));
-        let sample = std::path::PathBuf::from(input);
-        let config = repo.get("config-tiny.json")?;
-        let tokenizer = repo.get("tokenizer-tiny.json")?;
-        let model = repo.get("model-tiny-q80.gguf")?;
-        (config, tokenizer, model, sample)
-    };
-    let config: Config = serde_json::from_str(&std::fs::read_to_string(config_filename)?)?;
-    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
+    let mut model = &whisper_model.model;
+    let tokenizer = &whisper_model.tokenizer;
+    let device = &whisper_model.device;
 
-    let mel_bytes = match config.num_mel_bins {
+    let mel_bytes = match model.config().num_mel_bins {
         80 => include_bytes!("../models/whisper/melfilters.bytes").as_slice(),
         128 => include_bytes!("../models/whisper/melfilters128.bytes").as_slice(),
         nmel => anyhow::bail!("unexpected num_mel_bins {nmel}"),
@@ -375,7 +410,7 @@
     let mut mel_filters = vec![0f32; mel_bytes.len() / 4];
     <byteorder::LittleEndian as byteorder::ByteOrder>::read_f32_into(mel_bytes, &mut mel_filters);
 
-    let (mut pcm_data, sample_rate) = pcm_decode(input)?;
+    let (mut pcm_data, sample_rate) = pcm_decode(file_path)?;
     if sample_rate != m::SAMPLE_RATE as u32 {
         info!(
             "Resampling from {} Hz to {} Hz",
@@ -386,24 +421,26 @@
     }
 
     // info!("pcm data loaded {}", pcm_data.len());
-    let mel = audio::pcm_to_mel(&config, &pcm_data, &mel_filters);
+    let mel = audio::pcm_to_mel(&model.config(), &pcm_data, &mel_filters);
     let mel_len = mel.len();
     let mel = Tensor::from_vec(
         mel,
-        (1, config.num_mel_bins, mel_len / config.num_mel_bins),
+        (
+            1,
+            model.config().num_mel_bins,
+            mel_len / model.config().num_mel_bins,
+        ),
         &device,
     )?;
-    // info!("loaded mel: {:?}", mel.dims());
-    let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
-        weights_filename,
-        &device,
-    )?;
-    let mut model = Model::Quantized(m::quantized_model::Whisper::load(&vb, config)?);
-    let language_token = Some(multilingual::detect_language(&mut model, &tokenizer, &mel)?);
-    // info!("Creating decoder");
+    let language_token = Some(multilingual::detect_language(
+        &mut model.clone(),
+        &tokenizer,
+        &mel,
+    )?);
+    let mut model = model.clone();
     let mut dc = Decoder::new(
-        model,
+        &mut model,
         tokenizer,
         42,
         &device,
@@ -445,6 +482,61 @@ fn resample(input: Vec<f32>, from_sample_rate: u32, to_sample_rate: u32) -> Result<Vec<f32>> {
     Ok(waves_out.into_iter().next().unwrap())
 }
 
+#[derive(Debug, Clone)]
+pub struct AudioInput {
+    pub path: String,
+    pub device: String,
+}
+
+#[derive(Debug, Clone)]
+pub struct TranscriptionResult {
+    pub input: AudioInput,
+    pub transcription: Option<String>,
+    pub timestamp: u64,
+    pub error: Option<String>,
+}
+pub fn create_whisper_channel() -> Result<(Sender<AudioInput>, Receiver<TranscriptionResult>)> {
+    let whisper_model = WhisperModel::new()?;
+    let (input_sender, input_receiver): (Sender<AudioInput>, Receiver<AudioInput>) =
+        channel::unbounded();
+    let (output_sender, output_receiver): (
+        Sender<TranscriptionResult>,
+        Receiver<TranscriptionResult>,
+    ) = channel::unbounded();
+
+    thread::spawn(move || {
+        while let Ok(input) = input_receiver.recv() {
+            let timestamp = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .expect("Time went backwards")
+                .as_secs();
+
+            let result = stt(&input.path, &whisper_model);
+
+            let transcription_result = match result {
+                Ok(transcription) => TranscriptionResult {
+                    input: input.clone(),
+                    transcription: Some(transcription),
+                    timestamp,
+                    error: None,
+                },
+                Err(e) => TranscriptionResult {
+                    input: input.clone(),
+                    transcription: None,
+                    timestamp,
+                    error: Some(e.to_string()),
+                },
+            };
+
+            if output_sender.send(transcription_result).is_err() {
+                break;
+            }
+        }
+    });
+
+    Ok((input_sender, output_receiver))
+}
+
 #[test]
 #[ignore]
 fn test_speech_to_text() {
@@ -452,7 +544,9 @@ fn test_speech_to_text() {
     println!("Loading audio file");
     let start = std::time::Instant::now();
-    let text = stt("./test_data/poetic_kapil_gupta.wav").unwrap();
+    let whisper_model = WhisperModel::new().unwrap();
+
+    let text = stt("./test_data/poetic_kapil_gupta.wav", &whisper_model).unwrap();
     let duration = start.elapsed();
     println!("Speech to text completed in {:?}", duration);
diff --git a/screenpipe-server/Cargo.toml b/screenpipe-server/Cargo.toml
index edcc9829..e197ea88 100644
--- a/screenpipe-server/Cargo.toml
+++ b/screenpipe-server/Cargo.toml
@@ -68,6 +68,12 @@ sysinfo = "0.29.0"
 
 # Color
 colored = "2.0"
 
+# Tracing
+tracing = { workspace = true }
+
+# Concurrency
+crossbeam = "0.8"
+
 [dev-dependencies]
 tempfile = "3.3.0"
diff --git a/screenpipe-server/src/core.rs b/screenpipe-server/src/core.rs
index f7a50262..d3bfcd7a 100644
--- a/screenpipe-server/src/core.rs
+++ b/screenpipe-server/src/core.rs
@@ -1,14 +1,16 @@
 use crate::{DatabaseManager, VideoCapture};
 use anyhow::Result;
 use chrono::Utc;
+use crossbeam::channel::{Receiver, Sender};
 use log::{debug, error, info};
-use screenpipe_audio::{record_and_transcribe, AudioCaptureResult, DeviceSpec};
-use screenpipe_vision::CaptureResult;
+use screenpipe_audio::{
+    create_whisper_channel, record_and_transcribe, AudioCaptureResult, AudioInput, DeviceSpec,
+    TranscriptionResult,
+};
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::mpsc::{self, Receiver};
 use std::sync::Arc;
+use std::thread;
 use std::time::Duration;
-use std::{fs, thread};
 use tokio::runtime::Runtime;
 pub enum RecorderControl {
     Pause,
@@ -21,7 +23,7 @@ pub async fn start_continuous_recording(
     output_path: Arc<String>,
     fps: f64,
     audio_chunk_duration: Duration,
-    control_rx: Receiver<RecorderControl>,
+    control_rx: std::sync::mpsc::Receiver<RecorderControl>,
     enable_audio: bool,
     audio_devices: Vec<Arc<DeviceSpec>>,
 ) -> Result<()> {
@@ -30,6 +32,8 @@
info!("Audio recording disabled"); } + let (whisper_sender, whisper_receiver) = create_whisper_channel()?; + let db_manager_video = Arc::clone(&db); let db_manager_audio = Arc::clone(&db); @@ -52,6 +56,8 @@ pub async fn start_continuous_recording( audio_chunk_duration, is_running_audio, audio_devices, + whisper_sender, + whisper_receiver, ) .await }); @@ -95,7 +101,6 @@ async fn record_video( error!("Failed to insert new video chunk: {}", e); } }; - let video_capture = VideoCapture::new(&output_path, fps, new_chunk_callback); while is_running.load(Ordering::SeqCst) { @@ -125,6 +130,8 @@ async fn record_audio( chunk_duration: Duration, is_running: Arc, devices: Vec>, + whisper_sender: Sender, + whisper_receiver: Receiver, ) -> Result<()> { let mut handles = vec![]; @@ -133,10 +140,10 @@ async fn record_audio( let output_path_clone = Arc::clone(&output_path); let is_running_clone = Arc::clone(&is_running); let device_spec_clone = Arc::clone(&device_spec); + let whisper_sender_clone = whisper_sender.clone(); + let whisper_receiver_clone = whisper_receiver.clone(); let handle = tokio::spawn(async move { - let (result_tx, result_rx) = mpsc::channel(); - info!( "Starting audio capture thread for device: {}", &device_spec_clone @@ -145,8 +152,9 @@ async fn record_audio( while is_running_clone.load(Ordering::SeqCst) { let recording_thread = thread::spawn({ let device_spec_clone = Arc::clone(&device_spec_clone); - let result_tx = result_tx.clone(); let output_path_clone = Arc::clone(&output_path_clone); + let whisper_sender = whisper_sender_clone.clone(); + move || { let new_file_name = Utc::now().format("%Y-%m-%d_%H-%M-%S").to_string(); let file_path = format!( @@ -156,27 +164,24 @@ async fn record_audio( record_and_transcribe( device_spec_clone.as_ref(), chunk_duration, - result_tx, file_path.into(), + whisper_sender, ) } }); + // Handle the recording thread result match recording_thread.join() { Ok(Ok(file_path)) => { info!( "Recording complete for device {}: {:?}", device_spec_clone, file_path ); + let whisper_receiver = whisper_receiver_clone.clone(); + // Process the recorded chunk - while let Ok(result) = result_rx.try_recv() { - process_audio_result( - &db_clone, - &file_path.to_str().unwrap(), - &device_spec_clone, - result, - ) - .await; + if let Ok(transcription) = whisper_receiver.recv() { + process_audio_result(&db_clone, transcription).await; } } Ok(Err(e)) => error!("Error in record_and_transcribe: {}", e), @@ -196,34 +201,36 @@ async fn record_audio( Ok(()) } - -async fn process_audio_result( - db: &DatabaseManager, - output_path: &str, - device_spec: &DeviceSpec, - result: AudioCaptureResult, -) { - info!("Inserting audio chunk: {:?}", result.text); - match db.insert_audio_chunk(&output_path).await { +async fn process_audio_result(db: &DatabaseManager, result: TranscriptionResult) { + info!("Inserting audio chunk: {:?}", result.transcription); + if result.error.is_some() || result.transcription.is_none() { + error!( + "Error in audio recording: {}", + result.error.unwrap_or_default() + ); + return; + } + let transcription = result.transcription.unwrap(); + match db.insert_audio_chunk(&result.input.path).await { Ok(audio_chunk_id) => { if let Err(e) = db - .insert_audio_transcription(audio_chunk_id, &result.text, 0) // TODO index is in the text atm + .insert_audio_transcription(audio_chunk_id, &transcription, 0) // TODO index is in the text atm .await { error!( "Failed to insert audio transcription for device {}: {}", - device_spec, e + result.input.device, e ); } else { debug!( 
"Inserted audio transcription for chunk {} from device {}", - audio_chunk_id, device_spec + audio_chunk_id, result.input.device ); } } Err(e) => error!( "Failed to insert audio chunk for device {}: {}", - device_spec, e + result.input.device, e ), } }