From 5f970855c421739b37dc0752b6d93579e4ff3bac Mon Sep 17 00:00:00 2001 From: Yorick <2524964538@qq.com> Date: Fri, 2 Jan 2026 22:05:17 +0800 Subject: [PATCH 1/2] feat: Implement local API server for audio transcription with detailed segment output --- LOCAL_API.md | 121 ++++++++++ src-tauri/Cargo.toml | 4 + src-tauri/src/audio_toolkit/audio/mod.rs | 2 +- .../src/audio_toolkit/audio/resampler.rs | 73 ++++++ src-tauri/src/audio_toolkit/mod.rs | 3 +- src-tauri/src/lib.rs | 10 + src-tauri/src/managers/transcription.rs | 74 +++++- src-tauri/src/server.rs | 216 ++++++++++++++++++ src-tauri/src/settings.rs | 14 ++ src-tauri/src/shortcut/mod.rs | 51 +++++ src/components/settings/LocalApiService.tsx | 85 +++++++ .../settings/advanced/AdvancedSettings.tsx | 2 + src/i18n/locales/de/translation.json | 8 + src/i18n/locales/en/translation.json | 8 + src/i18n/locales/es/translation.json | 8 + src/i18n/locales/fr/translation.json | 8 + src/i18n/locales/it/translation.json | 8 + src/i18n/locales/ja/translation.json | 8 + src/i18n/locales/pl/translation.json | 8 + src/i18n/locales/ru/translation.json | 8 + src/i18n/locales/vi/translation.json | 8 + src/i18n/locales/zh/translation.json | 8 + src/stores/settingsStore.ts | 36 +-- 23 files changed, 746 insertions(+), 25 deletions(-) create mode 100644 LOCAL_API.md create mode 100644 src-tauri/src/server.rs create mode 100644 src/components/settings/LocalApiService.tsx diff --git a/LOCAL_API.md b/LOCAL_API.md new file mode 100644 index 000000000..70d2c7ddd --- /dev/null +++ b/LOCAL_API.md @@ -0,0 +1,121 @@ +# Local STT API Documentation + +The Local Speech-to-Text (STT) API allows you to expose Handy's transcription capabilities as a local web service. It is designed to be compatible with the OpenAI and Groq transcription API formats. + +## Configuration + +You can manage the Local API in the application settings: + +1. Open **Settings**. +2. Navigate to the **Advanced** tab. +3. Find the **Local API** section. +4. **Enable Local API**: Toggle the switch to start the server. +5. **Local API Port**: Specify the port you want the server to listen on (default is `5500`). + +> [!NOTE] +> The server listens on `0.0.0.0`, making it accessible from other devices in your local network if your firewall allows it. + +## API Endpoint + +### Transcribe Audio + +`POST /v1/audio/transcriptions` + +Transcribes the uploaded audio file using the currently active model in Handy. + +#### Request Headers + +- `Content-Type: multipart/form-data` + +#### Request Body (Multipart) + +| Field | Type | Required | Status | Description | +| :-------------------------- | :----- | :------- | :-------------- | :--------------------------------------------------------------------------------------------- | +| `file` | file | Yes | **Functional** | The audio file to transcribe (currently only supports **.mp3**). | +| `model` | string | No | _Compatibility_ | Ignored. Handy always uses its currently active model selected in UI. | +| `response_format` | string | No | **Functional** | Can be `json` (default) or `verbose_json`. | +| `timestamp_granularities[]` | string | No | **Functional** | Set to `segment` to include segment-level timestamps when `response_format` is `verbose_json`. | + +#### Response Format + +**Standard JSON (`response_format: json`)** + +```json +{ + "text": "The transcribed text content." 
+} +``` + +**Verbose JSON (`response_format: verbose_json`)** + +| Field | Type | Status | Description | +| :--------- | :----- | :------------- | :----------------------------------------------------------------------------- | +| `text` | string | **Functional** | The full transcribed text content. | +| `segments` | array | **Functional** | List of transcription segments (requires `timestamp_granularities[]=segment`). | + +**Segment Object Fields:** + +| Field | Type | Status | Description | +| :------------------ | :------ | :------------- | :------------------------------------------------------------------ | +| `start` | float | **Functional** | Start time of the segment in seconds (rounded to 2 decimal places). | +| `end` | float | **Functional** | End time of the segment in seconds (rounded to 2 decimal places). | +| `text` | string | **Functional** | Text content of the segment. | +| `id` | integer | **Functional** | Auto-incrementing index starting from 0. | +| `seek` | integer | _Fixed (0)_ | Compatibility placeholder. | +| `tokens` | array | _Fixed ([])_ | Compatibility placeholder. | +| `temperature` | float | _Fixed (0.0)_ | Compatibility placeholder. | +| `avg_logprob` | float | _Fixed (0.0)_ | Compatibility placeholder. | +| `compression_ratio` | float | _Fixed (0.0)_ | Compatibility placeholder. | +| `no_speech_prob` | float | _Fixed (0.0)_ | Compatibility placeholder. | + +Example verbose response: + +```json +{ + "text": "The transcribed text content.", + "segments": [ + { + "start": 0.0, + "end": 2.5, + "text": "The transcribed text", + "id": 0, + "seek": 0, + "tokens": [], + "temperature": 0.0, + "avg_logprob": 0.0, + "compression_ratio": 0.0, + "no_speech_prob": 0.0 + } + ] +} +``` + +## Usage Example (cURL) + +```bash +curl http://localhost:5500/v1/audio/transcriptions \ + -H "Content-Type: multipart/form-data" \ + -F file="@/path/to/your/audio.mp3" \ + -F response_format="verbose_json" \ + -F "timestamp_granularities[]=segment" +``` + +## Model Recommendations & Known Issues + +### Known Issues + +- **Audio Format Limitation**: Currently, the API only supports **.mp3** files. Support for additional formats (e.g., .m4a, .wav, .flac) is planned for the future. **Pull Requests (PRs) from the community are highly welcome to help implement broader format support!** + +- **Hallucinations**: When using smaller models (like Whisper `small`), the model may occasionally append non-existent phrases at the end of the transcription, such as "谢谢大家" (Thank you everyone) or "字幕组" (Subtitle group). This typically happens during silent segments or at the very end of an audio file. + +### Recommended Models + +For the best balance of speed and accuracy, we recommend: + +- **Chinese (中文)**: **Whisper Turbo** is currently the optimal choice for high-quality Chinese transcription. +- **English**: **Parakeet V3** is recommended for English transcription due to its exceptional processing speed. + +## Troubleshooting + +- **Address already in use**: If you see an error in the logs saying the address is in use, try changing the port in settings. +- **Firewall**: Ensure your system's firewall allows incoming connections on the chosen port if you plan to access the API from other devices. 
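+
+## Usage Example (TypeScript / Node.js)
+
+In addition to cURL, the endpoint can be called from any HTTP client. Below is a minimal client sketch for Node.js 18+ (which ships global `fetch`, `FormData`, and `Blob`); the file path is a placeholder and the port assumes the default `5500`:
+
+```typescript
+// Send an .mp3 to the local server and print the transcription with segments.
+import { readFile } from "node:fs/promises";
+
+async function transcribe(path: string): Promise<void> {
+  const form = new FormData();
+  // The API currently only accepts .mp3 input (see Known Issues above).
+  form.append("file", new Blob([await readFile(path)], { type: "audio/mpeg" }), "audio.mp3");
+  form.append("response_format", "verbose_json");
+  form.append("timestamp_granularities[]", "segment");
+
+  const res = await fetch("http://localhost:5500/v1/audio/transcriptions", {
+    method: "POST",
+    body: form, // fetch sets the multipart boundary header automatically
+  });
+  if (!res.ok) {
+    throw new Error(`Request failed: ${res.status} ${await res.text()}`);
+  }
+
+  const data = await res.json();
+  console.log(data.text);
+  for (const s of data.segments ?? []) {
+    console.log(`[${s.start}s - ${s.end}s] ${s.text}`);
+  }
+}
+
+transcribe("/path/to/your/audio.mp3").catch(console.error);
+```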
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index e2f26501c..44e14acb0 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -74,6 +74,10 @@ ferrous-opencc = "0.2.3" specta = "=2.0.0-rc.22" specta-typescript = "0.0.9" tauri-specta = { version = "=2.0.0-rc.21", features = ["derive", "typescript"] } +axum = { version = "0.7", features = ["multipart"] } +tower-http = { version = "0.6", features = ["cors", "trace", "limit"] } +tokio-util = { version = "0.7", features = ["io"] } +hyper = "1.0" [target.'cfg(unix)'.dependencies] signal-hook = "0.3" diff --git a/src-tauri/src/audio_toolkit/audio/mod.rs b/src-tauri/src/audio_toolkit/audio/mod.rs index c0abd7e52..6eee777d2 100644 --- a/src-tauri/src/audio_toolkit/audio/mod.rs +++ b/src-tauri/src/audio_toolkit/audio/mod.rs @@ -7,6 +7,6 @@ mod visualizer; pub use device::{list_input_devices, list_output_devices, CpalDeviceInfo}; pub use recorder::AudioRecorder; -pub use resampler::FrameResampler; +pub use resampler::{resample_audio, FrameResampler}; pub use utils::save_wav_file; pub use visualizer::AudioVisualiser; diff --git a/src-tauri/src/audio_toolkit/audio/resampler.rs b/src-tauri/src/audio_toolkit/audio/resampler.rs index 149d99ba9..458705dff 100644 --- a/src-tauri/src/audio_toolkit/audio/resampler.rs +++ b/src-tauri/src/audio_toolkit/audio/resampler.rs @@ -1,3 +1,5 @@ +use anyhow::{anyhow, Result}; +use rodio::Source; use rubato::{FftFixedIn, Resampler}; use std::time::Duration; @@ -97,3 +99,74 @@ impl FrameResampler { } } } + +pub fn resample_audio(source: S) -> Result> +where + S: Source + Send + 'static, +{ + let target_sample_rate = 16000; + let source_rate = source.sample_rate(); + let channels = source.channels(); + + if channels == 0 { + return Err(anyhow!("Audio has no channels")); + } + + // 1. Convert to mono and collect all samples + let mut mono_samples = Vec::new(); + let mut channel_sum = 0.0; + let mut channel_count = 0; + + for sample in source { + channel_sum += sample; + channel_count += 1; + if channel_count == channels { + mono_samples.push(channel_sum / channels as f32); + channel_sum = 0.0; + channel_count = 0; + } + } + + if source_rate == target_sample_rate { + return Ok(mono_samples); + } + + // 2. 
High-quality resampling using rubato + // We use a fixed chunk size for the resampler + let chunk_size = 1024; + let mut resampler = FftFixedIn::::new( + source_rate as usize, + target_sample_rate as usize, + chunk_size, + 1, + 1, + ) + .map_err(|e| anyhow!("Failed to create resampler: {}", e))?; + + let mut output = Vec::new(); + let mut input_pos = 0; + + while input_pos + chunk_size <= mono_samples.len() { + let chunk = &mono_samples[input_pos..input_pos + chunk_size]; + if let Ok(resampled_chunk) = resampler.process(&[chunk], None) { + output.extend_from_slice(&resampled_chunk[0]); + } + input_pos += chunk_size; + } + + // Handle remaining samples by padding with zeros + if input_pos < mono_samples.len() { + let mut last_chunk = vec![0.0; chunk_size]; + let remaining = mono_samples.len() - input_pos; + last_chunk[..remaining].copy_from_slice(&mono_samples[input_pos..]); + if let Ok(resampled_chunk) = resampler.process(&[last_chunk], None) { + // Only take the relevant part of the output to avoid too much padding + // (Though for transcription a bit of silence at the end is fine) + let out_len = + (remaining as f32 * (target_sample_rate as f32 / source_rate as f32)) as usize; + output.extend_from_slice(&resampled_chunk[0][..out_len.min(resampled_chunk[0].len())]); + } + } + + Ok(output) +} diff --git a/src-tauri/src/audio_toolkit/mod.rs b/src-tauri/src/audio_toolkit/mod.rs index 5aaa3ffc6..54352bf58 100644 --- a/src-tauri/src/audio_toolkit/mod.rs +++ b/src-tauri/src/audio_toolkit/mod.rs @@ -5,7 +5,8 @@ pub mod utils; pub mod vad; pub use audio::{ - list_input_devices, list_output_devices, save_wav_file, AudioRecorder, CpalDeviceInfo, + list_input_devices, list_output_devices, resample_audio, save_wav_file, AudioRecorder, + CpalDeviceInfo, }; pub use text::{apply_custom_words, filter_transcription_output}; pub use utils::get_cpal_host; diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 556dd3a12..72077368e 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -10,6 +10,7 @@ mod input; mod llm_client; mod managers; mod overlay; +mod server; mod settings; mod shortcut; mod signal_handle; @@ -215,6 +216,13 @@ fn initialize_core_logic(app_handle: &AppHandle) { // Create the recording overlay window (hidden by default) utils::create_recording_overlay(app_handle); + + // Initialize Local API Server + let mut api_server = server::ApiServer::new(settings.local_api_port); + if settings.local_api_enabled { + api_server.start(transcription_manager.clone()); + } + app_handle.manage(Mutex::new(api_server)); } #[tauri::command] @@ -265,6 +273,8 @@ pub fn run() { shortcut::update_custom_words, shortcut::suspend_binding, shortcut::resume_binding, + shortcut::change_local_api_setting, + shortcut::change_local_api_port_setting, shortcut::change_mute_while_recording_setting, shortcut::change_append_trailing_space_setting, shortcut::change_app_language_setting, diff --git a/src-tauri/src/managers/transcription.rs b/src-tauri/src/managers/transcription.rs index 4287533dd..cf0f1d68b 100644 --- a/src-tauri/src/managers/transcription.rs +++ b/src-tauri/src/managers/transcription.rs @@ -28,6 +28,26 @@ pub struct ModelStateEvent { pub error: Option, } +#[derive(Debug, Clone, Serialize)] +pub struct TranscriptionSegment { + pub id: u32, + pub seek: u32, + pub start: f64, + pub end: f64, + pub text: String, + pub tokens: Vec, + pub temperature: f32, + pub avg_logprob: f32, + pub compression_ratio: f32, + pub no_speech_prob: f32, +} + +#[derive(Debug, Clone, Serialize)] +pub struct 
TranscriptionResult { + pub text: String, + pub segments: Vec, +} + enum LoadedEngine { Whisper(WhisperEngine), Parakeet(ParakeetEngine), @@ -342,6 +362,16 @@ impl TranscriptionManager { } pub fn transcribe(&self, audio: Vec) -> Result { + let result = self.transcribe_internal(audio, false, false)?; + Ok(result.text) + } + + pub fn transcribe_internal( + &self, + audio: Vec, + verbose: bool, + enable_segments: bool, + ) -> Result { // Update last activity timestamp self.last_activity.store( SystemTime::now() @@ -358,7 +388,10 @@ impl TranscriptionManager { if audio.is_empty() { debug!("Empty audio vector"); self.maybe_unload_immediately("empty audio"); - return Ok(String::new()); + return Ok(TranscriptionResult { + text: String::new(), + segments: Vec::new(), + }); } // Check if model is loaded, if not try to load it @@ -430,7 +463,7 @@ impl TranscriptionManager { }; // Apply word correction if custom words are configured - let corrected_result = if !settings.custom_words.is_empty() { + let corrected_text = if !settings.custom_words.is_empty() { apply_custom_words( &result.text, &settings.custom_words, @@ -441,7 +474,7 @@ impl TranscriptionManager { }; // Filter out filler words and hallucinations - let filtered_result = filter_transcription_output(&corrected_result); + let filtered_result = filter_transcription_output(&corrected_text); let et = std::time::Instant::now(); let translation_note = if settings.translate_to_english { @@ -455,17 +488,44 @@ impl TranscriptionManager { translation_note ); - let final_result = filtered_result; + let final_text = filtered_result.trim().to_string(); - if final_result.is_empty() { + if final_text.is_empty() { info!("Transcription result is empty"); } else { - info!("Transcription result: {}", final_result); + info!("Transcription result: {}", final_text); } self.maybe_unload_immediately("transcription"); - Ok(final_result) + // Convert segments only if requested + let segments = if verbose || enable_segments { + result + .segments + .unwrap_or_default() + .into_iter() + .enumerate() + .map(|(i, s)| TranscriptionSegment { + id: i as u32, + seek: 0, + start: (s.start as f64 * 100.0).round() / 100.0, + end: (s.end as f64 * 100.0).round() / 100.0, + text: s.text, + tokens: vec![], + temperature: 0.0, + avg_logprob: 0.0, + compression_ratio: 0.0, + no_speech_prob: 0.0, + }) + .collect() + } else { + Vec::new() + }; + + Ok(TranscriptionResult { + text: final_text, + segments, + }) } } diff --git a/src-tauri/src/server.rs b/src-tauri/src/server.rs new file mode 100644 index 000000000..1c580ac44 --- /dev/null +++ b/src-tauri/src/server.rs @@ -0,0 +1,216 @@ +use crate::managers::transcription::TranscriptionManager; +use axum::{ + extract::{Multipart, State}, + http::StatusCode, + response::{IntoResponse, Json}, + routing::post, + Router, +}; +use log::{error, info}; +use serde::Serialize; +use std::io::Cursor; +use std::net::SocketAddr; +use std::sync::Arc; +use tokio::net::TcpListener; +use tokio::sync::oneshot; + +pub struct ServerState { + pub transcription_manager: Arc, +} + +#[derive(Clone)] +pub struct AppState { + pub state: Arc, +} + +#[derive(Debug, Serialize)] +struct Segment { + id: u32, + seek: u32, + start: f64, + end: f64, + text: String, + tokens: Vec, + temperature: f32, + avg_logprob: f32, + compression_ratio: f32, + no_speech_prob: f32, +} + +#[derive(Debug, Serialize)] +pub struct GroqTranscriptionResponse { + text: String, + #[serde(skip_serializing_if = "Option::is_none")] + segments: Option>, + // We could add words here if we support 
word-level timestamps in the future +} + +pub struct ApiServer { + shutdown_tx: Option>, + pub port: u16, +} + +impl ApiServer { + pub fn new(port: u16) -> Self { + Self { + shutdown_tx: None, + port, + } + } + + pub fn start(&mut self, tm: Arc) { + let port = self.port; + let (tx, rx) = oneshot::channel(); + self.shutdown_tx = Some(tx); + + let state = AppState { + state: Arc::new(ServerState { + transcription_manager: tm, + }), + }; + + let app = Router::new() + .route("/v1/audio/transcriptions", post(transcribe_audio)) + .with_state(state); + + tauri::async_runtime::spawn(async move { + let addr = SocketAddr::from(([0, 0, 0, 0], port)); + info!("Starting local API server on {}", addr); + + match TcpListener::bind(addr).await { + Ok(listener) => { + if let Err(e) = axum::serve(listener, app) + .with_graceful_shutdown(async { + rx.await.ok(); + }) + .await + { + error!("Server error: {}", e); + } + } + Err(e) => { + error!("Failed to bind server to {}: {}", addr, e); + } + } + info!("Local API server on port {} stopped", port); + }); + } + + pub fn stop(&mut self) { + if let Some(tx) = self.shutdown_tx.take() { + let _ = tx.send(()); + } + } +} + +async fn transcribe_audio( + State(state): State, + mut multipart: Multipart, +) -> impl IntoResponse { + let mut file_bytes = None; + let mut _model = "whisper-large-v3-turbo".to_string(); + let mut response_format = "json".to_string(); + let mut timestamp_granularities = Vec::new(); + + while let Ok(Some(field)) = multipart.next_field().await { + let name = field.name().unwrap_or("").to_string(); + + if name == "file" { + if let Ok(bytes) = field.bytes().await { + file_bytes = Some(bytes); + } + } else if name == "model" { + if let Ok(text) = field.text().await { + _model = text; + } + } else if name == "response_format" { + if let Ok(text) = field.text().await { + response_format = text; + } + } else if name == "timestamp_granularities[]" || name == "timestamp_granularities" { + if let Ok(text) = field.text().await { + timestamp_granularities.push(text); + } + } + } + + if let Some(bytes) = file_bytes { + // Decode audio + let cursor = Cursor::new(bytes); + let decoder = match rodio::Decoder::new(cursor) { + Ok(d) => d, + Err(e) => { + return ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({"error": format!("Failed to decode audio: {}", e)})), + ); + } + }; + + // Resample and collect samples + let samples: Vec = match crate::audio_toolkit::resample_audio(decoder) { + Ok(s) => s, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({"error": format!("Failed to resample audio: {}", e)})), + ); + } + }; + + // Transcribe + let verbose = response_format == "verbose_json"; + state.state.transcription_manager.initiate_model_load(); + match state.state.transcription_manager.transcribe_internal( + samples, + verbose, + timestamp_granularities.contains(&"segment".to_string()), + ) { + Ok(result) => { + let segments = if verbose { + Some( + result + .segments + .into_iter() + .map(|s| Segment { + id: s.id, + seek: s.seek, + start: s.start, + end: s.end, + text: s.text, + tokens: s.tokens, + temperature: s.temperature, + avg_logprob: s.avg_logprob, + compression_ratio: s.compression_ratio, + no_speech_prob: s.no_speech_prob, + }) + .collect(), + ) + } else { + None + }; + + let response = GroqTranscriptionResponse { + text: result.text, + segments, + }; + + return ( + StatusCode::OK, + Json(serde_json::to_value(response).unwrap()), + ); + } + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + 
Json(serde_json::json!({"error": format!("Transcription failed: {}", e)})), + ); + } + } + } + + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({"error": "No file provided"})), + ) +} diff --git a/src-tauri/src/settings.rs b/src-tauri/src/settings.rs index 8feb07f4d..a414bd559 100644 --- a/src-tauri/src/settings.rs +++ b/src-tauri/src/settings.rs @@ -315,6 +315,10 @@ pub struct AppSettings { pub experimental_enabled: bool, #[serde(default)] pub keyboard_implementation: KeyboardImplementation, + #[serde(default = "default_local_api_enabled")] + pub local_api_enabled: bool, + #[serde(default = "default_local_api_port")] + pub local_api_port: u16, } fn default_model() -> String { @@ -394,6 +398,14 @@ fn default_post_process_provider_id() -> String { "openai".to_string() } +fn default_local_api_enabled() -> bool { + false +} + +fn default_local_api_port() -> u16 { + 5500 +} + fn default_post_process_providers() -> Vec { let mut providers = vec![ PostProcessProvider { @@ -605,6 +617,8 @@ pub fn get_default_settings() -> AppSettings { app_language: default_app_language(), experimental_enabled: false, keyboard_implementation: KeyboardImplementation::default(), + local_api_enabled: default_local_api_enabled(), + local_api_port: default_local_api_port(), } } diff --git a/src-tauri/src/shortcut/mod.rs b/src-tauri/src/shortcut/mod.rs index da0e16091..c47cea865 100644 --- a/src-tauri/src/shortcut/mod.rs +++ b/src-tauri/src/shortcut/mod.rs @@ -920,6 +920,57 @@ pub fn change_mute_while_recording_setting(app: AppHandle, enabled: bool) -> Res Ok(()) } +#[tauri::command] +#[specta::specta] +pub fn change_local_api_setting(app: AppHandle, enabled: bool) -> Result<(), String> { + let mut settings = settings::get_settings(&app); + + // Only take action if the setting actually changes + if settings.local_api_enabled != enabled { + settings.local_api_enabled = enabled; + settings::write_settings(&app, settings); + + // Manage server state + let server_mutex = app.state::>(); + let mut server = server_mutex.lock().map_err(|e| e.to_string())?; + + if enabled { + let tm = app.state::>(); + server.start(tm.inner().clone()); + } else { + server.stop(); + } + } + + Ok(()) +} + +#[tauri::command] +#[specta::specta] +pub fn change_local_api_port_setting(app: AppHandle, port: u16) -> Result<(), String> { + let mut settings = settings::get_settings(&app); + + // Only take action if the setting actually changes + if settings.local_api_port != port { + settings.local_api_port = port; + settings::write_settings(&app, settings.clone()); + + // Restart server if enabled + if settings.local_api_enabled { + let server_mutex = app.state::>(); + let mut server = server_mutex.lock().map_err(|e| e.to_string())?; + server.stop(); + server.port = port; // Ensure field is pub or add setter method. + + // Wait a bit for port release? Usually instant. 
+ let tm = app.state::>(); + server.start(tm.inner().clone()); + } + } + + Ok(()) +} + #[tauri::command] #[specta::specta] pub fn change_append_trailing_space_setting(app: AppHandle, enabled: bool) -> Result<(), String> { diff --git a/src/components/settings/LocalApiService.tsx b/src/components/settings/LocalApiService.tsx new file mode 100644 index 000000000..49b67cfdc --- /dev/null +++ b/src/components/settings/LocalApiService.tsx @@ -0,0 +1,85 @@ +import React from "react"; +import { useTranslation } from "react-i18next"; +import { useSettings } from "../../hooks/useSettings"; + +import { ToggleSwitch } from "../ui/ToggleSwitch"; +import { SettingContainer } from "../ui/SettingContainer"; +import { Input } from "../ui/Input"; + +interface LocalApiServiceProps { + descriptionMode?: "inline" | "tooltip"; + grouped?: boolean; +} + +export const LocalApiService: React.FC = React.memo( + ({ grouped = false, descriptionMode = "inline" }) => { + const { t } = useTranslation(); + const { getSetting, updateSetting, isUpdating } = useSettings(); + + const enabled = (getSetting("local_api_enabled") as boolean) ?? false; + const port = (getSetting("local_api_port") as number) ?? 5500; + const [localPort, setLocalPort] = React.useState(port.toString()); + + React.useEffect(() => { + setLocalPort(port.toString()); + }, [port]); + + const handleEnabledChange = (checked: boolean) => { + // This calls updateSetting in useSettingsStore, which we updated to handle 'local_api_enabled' + // via commands.changeLocalApiSetting(checked) + updateSetting("local_api_enabled", checked); + }; + + const handlePortChange = (e: React.ChangeEvent) => { + const stringVal = e.target.value; + setLocalPort(stringVal); + }; + + React.useEffect(() => { + const timer = setTimeout(() => { + const val = parseInt(localPort, 10); + if (!isNaN(val) && val >= 1 && val <= 65535 && val !== port) { + updateSetting("local_api_port", val); + } + }, 500); + + return () => clearTimeout(timer); + }, [localPort, port, updateSetting]); + + return ( + <> + + + +
+ +
+
+ + ); + }, +); diff --git a/src/components/settings/advanced/AdvancedSettings.tsx b/src/components/settings/advanced/AdvancedSettings.tsx index a6104aa74..8823e597f 100644 --- a/src/components/settings/advanced/AdvancedSettings.tsx +++ b/src/components/settings/advanced/AdvancedSettings.tsx @@ -17,6 +17,7 @@ import { RecordingRetentionPeriodSelector } from "../RecordingRetentionPeriod"; import { ExperimentalToggle } from "../ExperimentalToggle"; import { useSettings } from "../../../hooks/useSettings"; import { KeyboardImplementationSelector } from "../debug/KeyboardImplementationSelector"; +import { LocalApiService } from "../LocalApiService"; export const AdvancedSettings: React.FC = () => { const { t } = useTranslation(); @@ -40,6 +41,7 @@ export const AdvancedSettings: React.FC = () => { + diff --git a/src/i18n/locales/de/translation.json b/src/i18n/locales/de/translation.json index 1c2be1809..d8e582ef1 100644 --- a/src/i18n/locales/de/translation.json +++ b/src/i18n/locales/de/translation.json @@ -233,6 +233,14 @@ "placeholder": "Wort hinzufügen", "add": "Hinzufügen", "remove": "{{word}} entfernen" + }, + "local_api": { + "label": "Lokale API", + "description": "Stellen Sie eine lokale OpenAI-kompatible Speech-to-Text-API auf Port {{port}} bereit.", + "port": { + "label": "Lokaler API-Port", + "description": "Port, auf dem der Server lauscht. Das Ändern des Ports startet den Server neu." + } } }, "postProcessing": { diff --git a/src/i18n/locales/en/translation.json b/src/i18n/locales/en/translation.json index 059bec64a..035c8cf81 100644 --- a/src/i18n/locales/en/translation.json +++ b/src/i18n/locales/en/translation.json @@ -237,6 +237,14 @@ "placeholder": "Add a word", "add": "Add", "remove": "Remove {{word}}" + }, + "local_api": { + "label": "Local API", + "description": "Expose a local OpenAI-compatible Speech-to-Text API on port {{port}}.", + "port": { + "label": "Local API Port", + "description": "Port to listen on. Changing the port will restart the server." + } } }, "postProcessing": { diff --git a/src/i18n/locales/es/translation.json b/src/i18n/locales/es/translation.json index 4c03ec37d..14a6f92ce 100644 --- a/src/i18n/locales/es/translation.json +++ b/src/i18n/locales/es/translation.json @@ -233,6 +233,14 @@ "placeholder": "Agregar una palabra", "add": "Agregar", "remove": "Eliminar {{word}}" + }, + "local_api": { + "label": "API Local", + "description": "Expone una API de voz a texto local compatible con OpenAI en el puerto {{port}}.", + "port": { + "label": "Puerto de API Local", + "description": "Puerto para escuchar. Cambiar el puerto reiniciará el servidor." + } } }, "postProcessing": { diff --git a/src/i18n/locales/fr/translation.json b/src/i18n/locales/fr/translation.json index dce93f31b..de94853c9 100644 --- a/src/i18n/locales/fr/translation.json +++ b/src/i18n/locales/fr/translation.json @@ -234,6 +234,14 @@ "placeholder": "Ajouter un mot", "add": "Ajouter", "remove": "Supprimer {{word}}" + }, + "local_api": { + "label": "API locale", + "description": "Expose une API locale de reconnaissance vocale compatible OpenAI sur le port {{port}}.", + "port": { + "label": "Port de l'API locale", + "description": "Port d'écoute. Changer le port redémarrera le serveur." 
+ } } }, "postProcessing": { diff --git a/src/i18n/locales/it/translation.json b/src/i18n/locales/it/translation.json index 3d43bd333..a3772972c 100644 --- a/src/i18n/locales/it/translation.json +++ b/src/i18n/locales/it/translation.json @@ -233,6 +233,14 @@ "placeholder": "Aggiungi una parola", "add": "Aggiungi", "remove": "Rimuovi {{word}}" + }, + "local_api": { + "label": "API Locale", + "description": "Esponi un'API di riconoscimento vocale locale compatibile con OpenAI sulla porta {{port}}.", + "port": { + "label": "Porta API Locale", + "description": "Porta su cui ascoltare. Modificare la porta riavvierà il server." + } } }, "postProcessing": { diff --git a/src/i18n/locales/ja/translation.json b/src/i18n/locales/ja/translation.json index 545e85edb..4293c2ae4 100644 --- a/src/i18n/locales/ja/translation.json +++ b/src/i18n/locales/ja/translation.json @@ -233,6 +233,14 @@ "placeholder": "単語を追加", "add": "追加", "remove": "{{word}}を削除" + }, + "local_api": { + "label": "ローカルAPI", + "description": "ポート{{port}}でローカルのOpenAI互換音声文字起こしAPIを公開します。", + "port": { + "label": "ローカルAPIポート", + "description": "リスンするポート。ポートを変更するとサーバーが再起動します。" + } } }, "postProcessing": { diff --git a/src/i18n/locales/pl/translation.json b/src/i18n/locales/pl/translation.json index 5a7ce1751..3a0783001 100644 --- a/src/i18n/locales/pl/translation.json +++ b/src/i18n/locales/pl/translation.json @@ -233,6 +233,14 @@ "placeholder": "Dodaj słowo", "add": "Dodaj", "remove": "Usuń {{word}}" + }, + "local_api": { + "label": "Lokalne API", + "description": "Udostępnij lokalne API mowy na tekst zgodne z OpenAI na porcie {{port}}.", + "port": { + "label": "Port lokalnego API", + "description": "Port, na którym nasłuchuje serwer. Zmiana portu spowoduje restart serwera." + } } }, "postProcessing": { diff --git a/src/i18n/locales/ru/translation.json b/src/i18n/locales/ru/translation.json index 664020fe1..e1a5a7d8b 100644 --- a/src/i18n/locales/ru/translation.json +++ b/src/i18n/locales/ru/translation.json @@ -233,6 +233,14 @@ "placeholder": "Добавить слово", "add": "Добавлять", "remove": "Удалить {{word}}" + }, + "local_api": { + "label": "Локальный API", + "description": "Запустить локальный OpenAI-совместимый API преобразования речи в текст на порту {{port}}.", + "port": { + "label": "Порт локального API", + "description": "Порт для входящих соединений. Изменение порта приведет к перезапуску сервера." + } } }, "postProcessing": { diff --git a/src/i18n/locales/vi/translation.json b/src/i18n/locales/vi/translation.json index bea9688eb..0068e4e39 100644 --- a/src/i18n/locales/vi/translation.json +++ b/src/i18n/locales/vi/translation.json @@ -234,6 +234,14 @@ "placeholder": "Thêm một từ", "add": "Thêm", "remove": "Xóa {{word}}" + }, + "local_api": { + "label": "API cục bộ", + "description": "Cung cấp API chuyển đổi giọng nói thành văn bản tương thích với OpenAI trên cổng {{port}}.", + "port": { + "label": "Cổng API cục bộ", + "description": "Cổng để lắng nghe. Thay đổi cổng sẽ khởi động lại máy chủ." 
+ } } }, "postProcessing": { diff --git a/src/i18n/locales/zh/translation.json b/src/i18n/locales/zh/translation.json index e77621c9f..310dcc1dc 100644 --- a/src/i18n/locales/zh/translation.json +++ b/src/i18n/locales/zh/translation.json @@ -233,6 +233,14 @@ "placeholder": "添加词汇", "add": "添加", "remove": "删除 {{word}}" + }, + "local_api": { + "label": "本地API", + "description": "在端口{{port}}上暴露本地OpenAI兼容的语音转文字API。", + "port": { + "label": "本地API端口", + "description": "本地API服务器监听的端口。更改端口将重启服务器。" + } } }, "postProcessing": { diff --git a/src/stores/settingsStore.ts b/src/stores/settingsStore.ts index 620ab7053..501cca565 100644 --- a/src/stores/settingsStore.ts +++ b/src/stores/settingsStore.ts @@ -127,6 +127,10 @@ const settingUpdaters: { app_language: (value) => commands.changeAppLanguageSetting(value as string), experimental_enabled: (value) => commands.changeExperimentalEnabledSetting(value as boolean), + local_api_enabled: (value) => + commands.changeLocalApiSetting(value as boolean), + local_api_port: (value) => + commands.changeLocalApiPortSetting(value as number), }; export const useSettingsStore = create()( @@ -297,15 +301,15 @@ export const useSettingsStore = create()( set((state) => ({ settings: state.settings ? { - ...state.settings, - bindings: { - ...state.settings.bindings, - [id]: { - ...state.settings.bindings[id]!, - current_binding: binding, - }, + ...state.settings, + bindings: { + ...state.settings.bindings, + [id]: { + ...state.settings.bindings[id]!, + current_binding: binding, }, - } + }, + } : null, })); @@ -328,15 +332,15 @@ export const useSettingsStore = create()( set((state) => ({ settings: state.settings ? { - ...state.settings, - bindings: { - ...state.settings.bindings, - [id]: { - ...state.settings.bindings[id]!, - current_binding: originalBinding, - }, + ...state.settings, + bindings: { + ...state.settings.bindings, + [id]: { + ...state.settings.bindings[id]!, + current_binding: originalBinding, }, - } + }, + } : null, })); } From 87fb4df3efd439a8e10cabcb4226dbf5cb909188 Mon Sep 17 00:00:00 2001 From: Yorick <2524964538@qq.com> Date: Wed, 28 Jan 2026 17:25:39 +0800 Subject: [PATCH 2/2] feat: Add CORS to the API server and introduce local API enablement and port settings. 
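
With `allow_origin`, `allow_methods`, and `allow_headers` all set to `Any`, the transcription
endpoint becomes callable directly from web pages on any origin. A minimal browser-side sketch
of what this enables (`audioBlob` is a placeholder for an .mp3 Blob, e.g. taken from a file
input; the port assumes the default 5500):

```typescript
// Browser-side sketch: POST an .mp3 Blob to the local API and return the text.
async function transcribeFromBrowser(audioBlob: Blob): Promise<string> {
  const form = new FormData();
  form.append("file", audioBlob, "audio.mp3");
  form.append("response_format", "json");

  const res = await fetch("http://localhost:5500/v1/audio/transcriptions", {
    method: "POST",
    body: form,
  });
  const data = await res.json();
  return data.text;
}
```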
--- src-tauri/Cargo.lock | 111 ++++++++++++++++++++++++++++++++++ src-tauri/src/server.rs | 7 +++ src-tauri/src/shortcut/mod.rs | 1 + src/bindings.ts | 18 +++++- 4 files changed, 136 insertions(+), 1 deletion(-) diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index e5c502166..1accb7b94 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -366,6 +366,62 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "multer", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "base64" version = "0.21.7" @@ -2374,6 +2430,7 @@ name = "handy" version = "0.7.0" dependencies = [ "anyhow", + "axum", "chrono", "cpal", "enigo", @@ -2383,6 +2440,7 @@ dependencies = [ "futures-util", "handy-keys", "hound", + "hyper", "log", "natural", "once_cell", @@ -2418,6 +2476,8 @@ dependencies = [ "tauri-plugin-updater", "tauri-specta", "tokio", + "tokio-util", + "tower-http", "transcribe-rs", "vad-rs", "windows 0.61.3", @@ -2572,6 +2632,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.3.0" @@ -2592,6 +2658,7 @@ dependencies = [ "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "pin-utils", @@ -3247,6 +3314,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "matrixmultiply" version = "0.3.10" @@ -3373,6 +3446,23 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http", + "httparse", + "memchr", + "mime", + "spin", + "version_check", +] + [[package]] name = "native-tls" version = "0.2.14" @@ -5388,6 +5478,17 @@ dependencies = [ "serde_core", ] 
+[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -5683,6 +5784,12 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -6877,6 +6984,7 @@ dependencies = [ "tokio", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -6890,11 +6998,13 @@ dependencies = [ "futures-util", "http", "http-body", + "http-body-util", "iri-string", "pin-project-lite", "tower", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -6915,6 +7025,7 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", diff --git a/src-tauri/src/server.rs b/src-tauri/src/server.rs index 1c580ac44..eab67c000 100644 --- a/src-tauri/src/server.rs +++ b/src-tauri/src/server.rs @@ -13,6 +13,7 @@ use std::net::SocketAddr; use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::oneshot; +use tower_http::cors::{Any, CorsLayer}; pub struct ServerState { pub transcription_manager: Arc, @@ -69,8 +70,14 @@ impl ApiServer { }), }; + let cors = CorsLayer::new() + .allow_origin(Any) + .allow_methods(Any) + .allow_headers(Any); + let app = Router::new() .route("/v1/audio/transcriptions", post(transcribe_audio)) + .layer(cors) .with_state(state); tauri::async_runtime::spawn(async move { diff --git a/src-tauri/src/shortcut/mod.rs b/src-tauri/src/shortcut/mod.rs index c47cea865..c3048ce11 100644 --- a/src-tauri/src/shortcut/mod.rs +++ b/src-tauri/src/shortcut/mod.rs @@ -16,6 +16,7 @@ mod tauri_impl; use log::{error, info, warn}; use serde::Serialize; use specta::Type; +use std::sync::Arc; use tauri::{AppHandle, Emitter, Manager}; use tauri_plugin_autostart::ManagerExt; diff --git a/src/bindings.ts b/src/bindings.ts index 6b50126ab..7cf74aa86 100644 --- a/src/bindings.ts +++ b/src/bindings.ts @@ -244,6 +244,22 @@ async resumeBinding(id: string) : Promise> { else return { status: "error", error: e as any }; } }, +async changeLocalApiSetting(enabled: boolean) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("change_local_api_setting", { enabled }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} +}, +async changeLocalApiPortSetting(port: number) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("change_local_api_port_setting", { port }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} +}, async changeMuteWhileRecordingSetting(enabled: boolean) : Promise> { try { return { status: "ok", data: await TAURI_INVOKE("change_mute_while_recording_setting", { enabled }) }; @@ -690,7 +706,7 @@ async isLaptop() : Promise> { /** user-defined types **/ -export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: 
boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: Partial<{ [key in string]: string }>; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; keyboard_implementation?: KeyboardImplementation } +export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: Partial<{ [key in string]: string }>; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; keyboard_implementation?: KeyboardImplementation; local_api_enabled?: boolean; local_api_port?: number } export type AudioDevice = { index: string; name: string; is_default: boolean } export type BindingResponse = { success: boolean; binding: ShortcutBinding | null; error: string | null } export type ClipboardHandling = "dont_modify" | "copy_to_clipboard"