diff --git a/README.md b/README.md index d4f57f046..82852d95f 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,32 @@ The following are recommendations for running Handy on your own machine. If you We're actively working on several features and improvements. Contributions and feedback are welcome! +### Experimental + +**Post Processing:** + +Post-processing allows you to refine transcriptions using an LLM after transcribing the initial text. You can create custom prompt templates with the following variables: + +| Variable | Description | +|----------|-------------| +| `${output}` | The transcription text | +| `${current_app}` | Name of the frontmost application when recording started (on Windows and Linux/X11 this is the title of the active window; empty if it cannot be determined) | +| `${short_prev_transcript}` | Recent transcript from the same app (last 200 words, expires after 5 min) | +| `${time_local}` | Current local time (e.g., "Tuesday, February 3, 2026 10:33:39 AM") | +| `${language}` | Selected transcription language for Whisper models (e.g., "en", "zh-Hans"), or "auto" for other models (Parakeet, Moonshine) | + +Example prompt template: + +``` +Fix grammar and punctuation in this transcription, using the language given by 'Language code' below. Keep the original meaning. 
+Current app: ${current_app} +Time: ${time_local} +Language code: ${language} +Previous transcript (for context): ${short_prev_transcript} + +Transcription: ${output} +``` + ### In Progress **Debug Logging:** diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 6a4388f96..72931ada1 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -2385,6 +2385,7 @@ dependencies = [ "hound", "log", "natural", + "objc", "once_cell", "rdev 0.5.0-2", "regex", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index e44a8e7f8..80a8ad874 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -95,6 +95,7 @@ windows = { version = "0.61.3", features = [ [target.'cfg(target_os = "macos")'.dependencies] tauri-nspanel = { git = "https://github.com/ahkohd/tauri-nspanel", branch = "v2.1" } +objc = "0.2" [profile.release] lto = true diff --git a/src-tauri/src/actions.rs b/src-tauri/src/actions.rs index 6dfa52d31..d8372a0bd 100644 --- a/src-tauri/src/actions.rs +++ b/src-tauri/src/actions.rs @@ -3,21 +3,26 @@ use crate::apple_intelligence; use crate::audio_feedback::{play_feedback_sound, play_feedback_sound_blocking, SoundType}; use crate::managers::audio::AudioRecordingManager; use crate::managers::history::HistoryManager; +use crate::managers::model::{EngineType, ModelManager}; use crate::managers::transcription::TranscriptionManager; use crate::settings::{get_settings, AppSettings, APPLE_INTELLIGENCE_PROVIDER_ID}; use crate::shortcut; use crate::tray::{change_tray_icon, TrayIconState}; use crate::utils::{self, show_recording_overlay, show_transcribing_overlay}; -use crate::ManagedToggleState; +use crate::{active_app, transcript_context, ManagedToggleState}; use ferrous_opencc::{config::BuiltinConfig, OpenCC}; use log::{debug, error}; use once_cell::sync::Lazy; use std::collections::HashMap; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::Instant; use tauri::AppHandle; use tauri::Manager; +/// Tracks the frontmost application captured at 
recording start, keyed by binding_id +static RECORDING_APP_CONTEXT: Lazy>> = + Lazy::new(|| Mutex::new(HashMap::new())); + // Shortcut Action Trait pub trait ShortcutAction: Send + Sync { fn start(&self, app: &AppHandle, binding_id: &str, shortcut_str: &str); @@ -27,9 +32,24 @@ pub trait ShortcutAction: Send + Sync { // Transcribe Action struct TranscribeAction; +/// Context information for LLM post-processing prompts. +/// These fields are available as variables in the prompt template. +#[derive(Clone, Debug, Default)] +pub struct PostProcessContext { + /// The name of the frontmost application when transcription started + pub current_app: String, + /// Short excerpt from previous transcript in the same app (last 200 words, expires after 5 min) + pub short_prev_transcript: String, + /// Current local time formatted as "Tuesday, February 3, 2026 10:33:39 AM" + pub time_local: String, + /// The selected language for transcription (e.g., "en", "zh-Hans"), or "auto" if not specified + pub language: String, +} + async fn maybe_post_process_transcription( settings: &AppSettings, transcription: &str, + context: &PostProcessContext, ) -> Option { if !settings.post_process_enabled { return None; @@ -90,9 +110,20 @@ async fn maybe_post_process_transcription( provider.id, model ); - // Replace ${output} variable in the prompt with the actual text - let processed_prompt = prompt.replace("${output}", transcription); - debug!("Processed prompt length: {} chars", processed_prompt.len()); + // Replace variables in the prompt with actual values + // Available variables: + // ${output} - The transcription text + // ${current_app} - Name of the frontmost application + // ${short_prev_transcript} - Recent transcript from same app (last 200 words, 5 min expiry) + // ${time_local} - Current local time (e.g., "Tuesday, February 3, 2026 10:33:39 AM") + // ${language} - Selected transcription language (e.g., "en", "zh-Hans") or "auto" + let processed_prompt = prompt + 
.replace("${output}", transcription) + .replace("${current_app}", &context.current_app) + .replace("${short_prev_transcript}", &context.short_prev_transcript) + .replace("${time_local}", &context.time_local) + .replace("${language}", &context.language); + debug!("Processed prompt : {}", processed_prompt); if provider.id == APPLE_INTELLIGENCE_PROVIDER_ID { #[cfg(all(target_os = "macos", target_arch = "aarch64"))] @@ -218,6 +249,16 @@ impl ShortcutAction for TranscribeAction { let start_time = Instant::now(); debug!("TranscribeAction::start called for binding: {}", binding_id); + // Capture the frontmost application name for LLM context + // This is done early before any UI changes that might affect focus + let frontmost_app = active_app::get_frontmost_app_name().unwrap_or_default(); + debug!("Captured frontmost app: '{}'", frontmost_app); + + // Store the captured app name for use when transcription completes + if let Ok(mut context) = RECORDING_APP_CONTEXT.lock() { + context.insert(binding_id.to_string(), frontmost_app); + } + // Load model in the background let tm = app.state::>(); tm.initiate_model_load(); @@ -290,6 +331,17 @@ impl ShortcutAction for TranscribeAction { let stop_time = Instant::now(); debug!("TranscribeAction::stop called for binding: {}", binding_id); + // Retrieve the captured frontmost app name from recording start + let current_app = RECORDING_APP_CONTEXT + .lock() + .ok() + .and_then(|mut ctx| ctx.remove(binding_id)) + .unwrap_or_default(); + debug!( + "Retrieved frontmost app for binding '{}': '{}'", + binding_id, current_app + ); + let ah = app.clone(); let rm = Arc::clone(&app.state::>()); let tm = Arc::clone(&app.state::>()); @@ -343,10 +395,51 @@ impl ShortcutAction for TranscribeAction { final_text = converted_text; } + // Build context for LLM post-processing + // Get previous transcript from same app (last 200 words, 5 min expiry) + let short_prev_transcript = + transcript_context::get_short_prev_transcript(¤t_app); + + // Generate 
formatted local time: "Tuesday, February 3, 2026 10:33:39 AM" + let time_local = chrono::Local::now() + .format("%A, %B %-d, %Y %-I:%M:%S %p") + .to_string(); + + let pp_context = PostProcessContext { + current_app: current_app.clone(), + short_prev_transcript, + time_local, + language: { + // Only use selected_language for Whisper models, 'auto' for other models (Parakeet, Moonshine) + let mm = ah.state::>(); + let is_whisper = mm + .get_model_info(&settings.selected_model) + .map(|m| matches!(m.engine_type, EngineType::Whisper)) + .unwrap_or(false); + + if is_whisper && !settings.selected_language.is_empty() { + settings.selected_language.clone() + } else { + "auto".to_string() + } + }, + }; + debug!( + "Post-process context: app='{}', prev_transcript_len={}, time='{}', language='{}'", + pp_context.current_app, + pp_context.short_prev_transcript.len(), + pp_context.time_local, + pp_context.language + ); + // Then apply regular post-processing if enabled // Uses final_text which may already have Chinese conversion applied - if let Some(processed_text) = - maybe_post_process_transcription(&settings, &final_text).await + if let Some(processed_text) = maybe_post_process_transcription( + &settings, + &final_text, + &pp_context, + ) + .await { post_processed_text = Some(processed_text.clone()); final_text = processed_text; @@ -366,6 +459,13 @@ impl ShortcutAction for TranscribeAction { post_processed_text = Some(final_text.clone()); } + // Update the transcript context for this app + // Use the original transcription (before post-processing) for context + transcript_context::update_transcript_context( + ¤t_app, + &transcription, + ); + // Save to history with post-processed text and prompt let hm_clone = Arc::clone(&hm); let transcription_for_history = transcription.clone(); diff --git a/src-tauri/src/active_app.rs b/src-tauri/src/active_app.rs new file mode 100644 index 000000000..39b04bf3a --- /dev/null +++ b/src-tauri/src/active_app.rs @@ -0,0 +1,124 @@ +/// Module 
/// for getting the frontmost/active application name.
/// This is platform-specific and returns the name of the application
/// that has keyboard focus when the user starts transcribing.
///
/// macOS implementation: asks `NSWorkspace` for its `frontmostApplication`
/// and returns that application's `localizedName`.
///
/// Returns `None` when any of the Objective-C calls yields a null object,
/// when the name is not valid UTF-8, or when the name is empty.
#[cfg(target_os = "macos")]
pub fn get_frontmost_app_name() -> Option<String> {
    use objc::runtime::Object;
    use objc::{msg_send, sel, sel_impl};
    use std::ffi::CStr;
    use std::os::raw::c_char;

    // The Cocoa getters below may return autoreleased objects. Running them
    // inside a local pool keeps them from accumulating when this function is
    // called from a thread that has no pool of its own.
    objc::rc::autoreleasepool(|| {
        // SAFETY: every receiver is null-checked before it is messaged, and the
        // C string returned by `UTF8String` is copied into an owned `String`
        // before the surrounding autorelease pool is drained.
        unsafe {
            // Get NSWorkspace shared instance
            let workspace: *mut Object =
                msg_send![objc::class!(NSWorkspace), sharedWorkspace];
            if workspace.is_null() {
                return None;
            }

            // Get frontmost application (NSRunningApplication)
            let frontmost_app: *mut Object = msg_send![workspace, frontmostApplication];
            if frontmost_app.is_null() {
                return None;
            }

            // Get localized name of the application (NSString)
            let name: *mut Object = msg_send![frontmost_app, localizedName];
            if name.is_null() {
                return None;
            }

            // Convert NSString to Rust String
            let utf8_ptr: *const c_char = msg_send![name, UTF8String];
            if utf8_ptr.is_null() {
                return None;
            }

            CStr::from_ptr(utf8_ptr)
                .to_str()
                .ok()
                .filter(|s| !s.is_empty())
                .map(str::to_owned)
        }
    })
}

/// Windows implementation: returns the title text of the current foreground
/// window, read with `GetWindowTextW`.
///
/// Note that this is a window *title*, not an executable or application name,
/// so it can change with the document or tab that is open in that window.
///
/// Returns `None` when there is no foreground window or its title is empty.
#[cfg(target_os = "windows")]
pub fn get_frontmost_app_name() -> Option<String> {
    use std::ffi::OsString;
    use std::os::windows::ffi::OsStringExt;
    use windows::Win32::Foundation::HWND;
    use windows::Win32::UI::WindowsAndMessaging::{
        GetForegroundWindow, GetWindowTextLengthW, GetWindowTextW,
    };

    // SAFETY: the window handle is checked for null before use, and the buffer
    // handed to `GetWindowTextW` is a live, correctly sized `u16` slice.
    unsafe {
        let hwnd: HWND = GetForegroundWindow();
        if hwnd.0.is_null() {
            return None;
        }

        // A non-positive length means "no title"; rejecting it here also keeps
        // the `+ 1` / cast below from ever seeing a negative value.
        let length = GetWindowTextLengthW(hwnd);
        if length <= 0 {
            return None;
        }

        // One extra slot for the terminating NUL that the API writes.
        let mut buffer: Vec<u16> = vec![0; (length + 1) as usize];
        let chars_copied = GetWindowTextW(hwnd, &mut buffer);

        if chars_copied > 0 {
            buffer.truncate(chars_copied as usize);
            let title = OsString::from_wide(&buffer)
                .to_string_lossy()
                .into_owned();
            if !title.is_empty() {
                return Some(title);
            }
        }
    }

    None
}

+#[cfg(target_os = "linux")] +pub fn get_frontmost_app_name() -> Option { + use std::process::Command; + + // Try xdotool first (X11) + if let Ok(output) = Command::new("xdotool") + .args(["getactivewindow", "getwindowname"]) + .output() + { + if output.status.success() { + let name = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if !name.is_empty() { + return Some(name); + } + } + } + + // Fallback for Wayland - try to get from environment or use a generic name + // Most Wayland compositors don't expose window info to external tools + if std::env::var("WAYLAND_DISPLAY").is_ok() { + // On Wayland, we can't easily get the active window name + // Return None and let the caller handle it + return None; + } + + None +} + +#[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] +pub fn get_frontmost_app_name() -> Option { + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_frontmost_app_returns_something_or_none() { + // This test just ensures the function doesn't panic + let _result = get_frontmost_app_name(); + } +} diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index c80a8b98f..39a059cbf 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -1,4 +1,5 @@ mod actions; +mod active_app; #[cfg(all(target_os = "macos", target_arch = "aarch64"))] mod apple_intelligence; mod audio_feedback; @@ -13,6 +14,7 @@ mod overlay; mod settings; mod shortcut; mod signal_handle; +mod transcript_context; mod tray; mod tray_i18n; mod utils; diff --git a/src-tauri/src/transcript_context.rs b/src-tauri/src/transcript_context.rs new file mode 100644 index 000000000..103cbb0d2 --- /dev/null +++ b/src-tauri/src/transcript_context.rs @@ -0,0 +1,185 @@ +/// Manager for tracking recent transcription context per application. +/// This allows the LLM post-processing to have context about previous +/// transcriptions in the same application. 
+use log::debug; +use once_cell::sync::Lazy; +use std::collections::HashMap; +use std::sync::Mutex; +use std::time::{Duration, Instant}; + +/// Maximum number of words to keep in the short previous transcript +const MAX_PREV_WORDS: usize = 200; + +/// How long before previous transcript expires (5 minutes) +const EXPIRY_DURATION: Duration = Duration::from_secs(5 * 60); + +/// Entry for tracking transcript history per application +#[derive(Clone, Debug)] +struct TranscriptEntry { + /// The transcript text (trimmed to last MAX_PREV_WORDS words) + text: String, + /// When this entry was last updated + last_updated: Instant, +} + +/// Global state for tracking transcripts per application +static TRANSCRIPT_CONTEXT: Lazy>> = + Lazy::new(|| Mutex::new(HashMap::new())); + +/// Get the short previous transcript for an application. +/// Returns up to the last 200 words of the most recent transcript +/// for this application, if it was within the last 5 minutes. +/// Returns an empty string if no recent transcript exists or if it has expired. +pub fn get_short_prev_transcript(app_name: &str) -> String { + let context = match TRANSCRIPT_CONTEXT.lock() { + Ok(guard) => guard, + Err(e) => { + debug!("Failed to lock transcript context: {}", e); + return String::new(); + } + }; + + if let Some(entry) = context.get(app_name) { + // Check if the entry has expired + if entry.last_updated.elapsed() < EXPIRY_DURATION { + debug!( + "Found previous transcript for '{}': {} chars", + app_name, + entry.text.len() + ); + return entry.text.clone(); + } else { + debug!( + "Previous transcript for '{}' has expired ({:?} ago)", + app_name, + entry.last_updated.elapsed() + ); + } + } + + String::new() +} + +/// Update the transcript context for an application. +/// The text is trimmed to the last MAX_PREV_WORDS words. 
+pub fn update_transcript_context(app_name: &str, transcript: &str) { + if app_name.is_empty() { + debug!("Skipping transcript context update: empty app name"); + return; + } + + let trimmed_text = trim_to_last_words(transcript, MAX_PREV_WORDS); + + let mut context = match TRANSCRIPT_CONTEXT.lock() { + Ok(guard) => guard, + Err(e) => { + debug!("Failed to lock transcript context for update: {}", e); + return; + } + }; + + // Update or insert the entry + let entry = context.entry(app_name.to_string()).or_insert_with(|| { + TranscriptEntry { + text: String::new(), + last_updated: Instant::now(), + } + }); + + // Append the new transcript to existing text, then trim + if !entry.text.is_empty() && entry.last_updated.elapsed() < EXPIRY_DURATION { + // Combine with previous text if not expired + let combined = format!("{} {}", entry.text, trimmed_text); + entry.text = trim_to_last_words(&combined, MAX_PREV_WORDS); + } else { + // Start fresh if expired or empty + entry.text = trimmed_text; + } + entry.last_updated = Instant::now(); + + debug!( + "Updated transcript context for '{}': {} chars", + app_name, + entry.text.len() + ); + + // Clean up expired entries periodically + cleanup_expired_entries(&mut context); +} + +/// Trim text to the last N words +fn trim_to_last_words(text: &str, max_words: usize) -> String { + let words: Vec<&str> = text.split_whitespace().collect(); + if words.len() <= max_words { + words.join(" ") + } else { + words[words.len() - max_words..].join(" ") + } +} + +/// Remove expired entries from the context map +fn cleanup_expired_entries(context: &mut HashMap) { + let expired_keys: Vec = context + .iter() + .filter(|(_, entry)| entry.last_updated.elapsed() >= EXPIRY_DURATION) + .map(|(key, _)| key.clone()) + .collect(); + + for key in expired_keys { + context.remove(&key); + debug!("Removed expired transcript context for '{}'", key); + } +} + +/// Clear all transcript context (useful for testing or reset) +#[allow(dead_code)] +pub fn 
clear_all_context() { + if let Ok(mut context) = TRANSCRIPT_CONTEXT.lock() { + context.clear(); + debug!("Cleared all transcript context"); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_trim_to_last_words() { + assert_eq!(trim_to_last_words("hello world", 10), "hello world"); + assert_eq!(trim_to_last_words("a b c d e", 3), "c d e"); + assert_eq!(trim_to_last_words("one", 5), "one"); + assert_eq!(trim_to_last_words("", 5), ""); + } + + #[test] + fn test_get_and_update_context() { + clear_all_context(); + + // Initially empty + assert_eq!(get_short_prev_transcript("TestApp"), ""); + + // Update with some text + update_transcript_context("TestApp", "Hello world this is a test"); + + // Should get the text back + let result = get_short_prev_transcript("TestApp"); + assert_eq!(result, "Hello world this is a test"); + + // Update with more text - should combine + update_transcript_context("TestApp", "Another sentence here"); + + let result = get_short_prev_transcript("TestApp"); + assert!(result.contains("Another sentence here")); + + clear_all_context(); + } + + #[test] + fn test_empty_app_name_ignored() { + clear_all_context(); + update_transcript_context("", "Some text"); + assert_eq!(get_short_prev_transcript(""), ""); + clear_all_context(); + } +}