From e0733623e241c094e8a5d57ef945b66441a3969e Mon Sep 17 00:00:00 2001 From: Yash Agarwal Date: Fri, 19 Sep 2025 19:46:01 +0800 Subject: [PATCH 1/2] fix: fall back to pcm in case local stt models aren't enough --- .../src/services/session/MicrophoneManager.ts | 99 +++++++++++-------- .../services/session/SubscriptionManager.ts | 57 ++++++----- .../cloud/src/services/session/UserSession.ts | 11 ++- .../src/services/session/session.service.ts | 11 ++- 4 files changed, 101 insertions(+), 77 deletions(-) diff --git a/cloud/packages/cloud/src/services/session/MicrophoneManager.ts b/cloud/packages/cloud/src/services/session/MicrophoneManager.ts index 6fea3a7ddf..b377bf3fc6 100644 --- a/cloud/packages/cloud/src/services/session/MicrophoneManager.ts +++ b/cloud/packages/cloud/src/services/session/MicrophoneManager.ts @@ -54,13 +54,15 @@ export class MicrophoneManager { // Cached subscription state to avoid expensive repeated lookups private cachedSubscriptionState: { - hasPCM: boolean; - hasTranscription: boolean; - hasMedia: boolean; + needsPcm: boolean; + needsTranslation: boolean; + transcriptionLanguages: string[]; + needsMedia: boolean; } = { - hasPCM: false, - hasTranscription: false, - hasMedia: false, + needsPcm: false, + needsTranslation: false, + transcriptionLanguages: [], + needsMedia: false, }; constructor(session: UserSession) { @@ -221,11 +223,12 @@ export class MicrophoneManager { */ private updateCachedSubscriptionState(): void { const state = - this.session.subscriptionManager.hasPCMTranscriptionSubscriptions(); + this.session.subscriptionManager.getMediaSubscriptionDetails(); this.cachedSubscriptionState = { - hasPCM: state.hasPCM, - hasTranscription: state.hasTranscription, - hasMedia: state.hasMedia, + needsPcm: state.needsPcm, + needsTranslation: state.needsTranslation, + transcriptionLanguages: state.transcriptionLanguages, + needsMedia: state.needsMedia, }; this.logger.debug( "Updated cached subscription state", @@ -239,28 +242,40 @@ export class 
MicrophoneManager { */ private shouldBypassVadForPCM(): boolean { // Use cached state instead of calling service - return this.cachedSubscriptionState.hasPCM; + return this.cachedSubscriptionState.needsPcm; } calculateRequiredData( - hasPCM: boolean, - hasTranscription: boolean, + needsPcm: boolean, + needsTranslation: boolean, + transcriptionLanguages: string[] ): Array<"pcm" | "transcription" | "pcm_or_transcription"> { const requiredData: Array< "pcm" | "transcription" | "pcm_or_transcription" > = []; + const localTranscriptionLanguage = "en-US"; const isCloudSttDown = this.session.transcriptionManager.isCloudSTTDown(); - if (hasPCM) { + const transcriptionLanguageSubscriptions = transcriptionLanguages.length; + let isLocalTranscriptionLangugeSame = false; + if (transcriptionLanguageSubscriptions == 1) { + isLocalTranscriptionLangugeSame = transcriptionLanguages[0] == localTranscriptionLanguage; + } + + if ( + needsPcm || + transcriptionLanguageSubscriptions > 1 || + needsTranslation || + (transcriptionLanguageSubscriptions == 1 && !isLocalTranscriptionLangugeSame) + ) { requiredData.push("pcm"); - if (hasTranscription && isCloudSttDown) { - requiredData.push("transcription"); - } - } else { - if (hasTranscription && isCloudSttDown) { - requiredData.push("transcription"); - } else { - requiredData.push("pcm_or_transcription"); - } + } + + if (transcriptionLanguageSubscriptions == 1 && isLocalTranscriptionLangugeSame && isCloudSttDown) { + requiredData.push("transcription"); + } + + if (!needsPcm && (transcriptionLanguageSubscriptions == 1 && isLocalTranscriptionLangugeSame && !isCloudSttDown)) { + requiredData.push("pcm_or_transcription"); } return requiredData; @@ -302,12 +317,13 @@ export class MicrophoneManager { // Update cache before using it this.updateCachedSubscriptionState(); - const hasMediaSubscriptions = this.cachedSubscriptionState.hasMedia; + const needsMediaSubscriptions = this.cachedSubscriptionState.needsMedia; const requiredData = 
this.calculateRequiredData( - this.cachedSubscriptionState.hasPCM, - this.cachedSubscriptionState.hasTranscription, + this.cachedSubscriptionState.needsPcm, + this.cachedSubscriptionState.needsTranslation, + this.cachedSubscriptionState.transcriptionLanguages, ); - this.updateState(hasMediaSubscriptions, requiredData); + this.updateState(needsMediaSubscriptions, requiredData); } } @@ -326,16 +342,17 @@ export class MicrophoneManager { // Update cache when subscriptions change this.updateCachedSubscriptionState(); - const hasMediaSubscriptions = this.cachedSubscriptionState.hasMedia; + const needsMediaSubscriptions = this.cachedSubscriptionState.needsMedia; const requiredData = this.calculateRequiredData( - this.cachedSubscriptionState.hasPCM, - this.cachedSubscriptionState.hasTranscription, + this.cachedSubscriptionState.needsPcm, + this.cachedSubscriptionState.needsTranslation, + this.cachedSubscriptionState.transcriptionLanguages, ); this.logger.info( - `Subscription changed, media subscriptions: ${hasMediaSubscriptions}`, + `Subscription changed, media subscriptions: ${needsMediaSubscriptions}`, ); // Apply holddown when turning mic off to avoid flapping - if (hasMediaSubscriptions) { + if (needsMediaSubscriptions) { // Cancel any pending mic-off holddown if (this.micOffHolddownTimer) { clearTimeout(this.micOffHolddownTimer); @@ -349,10 +366,11 @@ export class MicrophoneManager { this.micOffHolddownTimer = setTimeout(() => { // Re-evaluate before actually turning off this.updateCachedSubscriptionState(); - const stillNoMedia = !this.cachedSubscriptionState.hasMedia; + const stillNoMedia = !this.cachedSubscriptionState.needsMedia; const finalRequiredData = this.calculateRequiredData( - this.cachedSubscriptionState.hasPCM, - this.cachedSubscriptionState.hasTranscription, + this.cachedSubscriptionState.needsPcm, + this.cachedSubscriptionState.needsTranslation, + this.cachedSubscriptionState.transcriptionLanguages, ); if (stillNoMedia) { this.updateState(false, 
finalRequiredData); @@ -382,7 +400,7 @@ export class MicrophoneManager { private updateKeepAliveTimer(): void { // Check if we should have a keep-alive timer running using cached state const shouldHaveKeepAlive = - this.enabled && this.cachedSubscriptionState.hasMedia; + this.enabled && this.cachedSubscriptionState.needsMedia; if (shouldHaveKeepAlive && !this.keepAliveTimer) { // Start keep-alive timer @@ -394,7 +412,7 @@ export class MicrophoneManager { this.session.websocket.readyState === WebSocket.OPEN ) { // Use cached state for the check - if (this.cachedSubscriptionState.hasMedia && this.enabled) { + if (this.cachedSubscriptionState.needsMedia && this.enabled) { this.logger.debug("Sending microphone keep-alive"); this.sendStateChangeToGlasses( this.lastSentState, @@ -436,7 +454,7 @@ export class MicrophoneManager { // Check if we should NOT be receiving audio using cached state const shouldMicBeOff = - !this.enabled || !this.cachedSubscriptionState.hasMedia; + !this.enabled || !this.cachedSubscriptionState.needsMedia; if (shouldMicBeOff) { // We're receiving audio when we shouldn't be @@ -446,8 +464,9 @@ export class MicrophoneManager { // Send mic off immediately const requiredData = this.calculateRequiredData( - this.cachedSubscriptionState.hasPCM, - this.cachedSubscriptionState.hasTranscription, + this.cachedSubscriptionState.needsPcm, + this.cachedSubscriptionState.needsTranslation, + this.cachedSubscriptionState.transcriptionLanguages, ); this.sendStateChangeToGlasses(false, requiredData); diff --git a/cloud/packages/cloud/src/services/session/SubscriptionManager.ts b/cloud/packages/cloud/src/services/session/SubscriptionManager.ts index 9b86389009..ea279c7fb7 100644 --- a/cloud/packages/cloud/src/services/session/SubscriptionManager.ts +++ b/cloud/packages/cloud/src/services/session/SubscriptionManager.ts @@ -42,7 +42,8 @@ export class SubscriptionManager { // Cached aggregates for O(1) reads private pcmSubscriptionCount: number = 0; - private 
transcriptionLikeSubscriptionCount: number = 0; // transcription/translation incl. language streams + private translationSubscriptionCount: number = 0; + private transcriptionLanguagesSet: Set = new Set(); private languageStreamCounts: Map = new Map(); constructor(userSession: UserSession) { @@ -136,15 +137,17 @@ export class SubscriptionManager { return result; } - hasPCMTranscriptionSubscriptions(): { - hasMedia: boolean; - hasPCM: boolean; - hasTranscription: boolean; + getMediaSubscriptionDetails(): { + needsMedia: boolean, + needsPcm: boolean, + needsTranslation: boolean, + transcriptionLanguages: string[], } { - const hasPCM = this.pcmSubscriptionCount > 0; - const hasTranscription = this.transcriptionLikeSubscriptionCount > 0; - const hasMedia = hasPCM || hasTranscription; - return { hasMedia, hasPCM, hasTranscription }; + const needsPcm = this.pcmSubscriptionCount > 0; + const needsTranslation = this.translationSubscriptionCount > 0; + const transcriptionLanguages: string[] = Array.from(this.transcriptionLanguagesSet); + const needsMedia = needsPcm || needsTranslation || transcriptionLanguages.length > 0; + return { needsMedia, needsPcm, needsTranslation, transcriptionLanguages }; } cacheCalendarEvent(event: any): void { @@ -452,26 +455,26 @@ export class SubscriptionManager { return; } - // Direct transcription/translation - if (sub === StreamType.TRANSCRIPTION || sub === StreamType.TRANSLATION) { - this.transcriptionLikeSubscriptionCount += isAdd ? 1 : -1; - if (this.transcriptionLikeSubscriptionCount < 0) - this.transcriptionLikeSubscriptionCount = 0; - return; + const languageStreamInfo = parseLanguageStream(sub as string); + const isLangStream = languageStreamInfo !== null; + + if (sub === StreamType.TRANSLATION || (isLangStream && languageStreamInfo.baseType === StreamType.TRANSLATION)) { + this.translationSubscriptionCount += isAdd ? 
1 : -1; + if (this.translationSubscriptionCount < 0) this.translationSubscriptionCount = 0; } - // Language-specific streams - if (isLanguageStream(sub)) { - const langInfo = parseLanguageStream(sub as string); - if ( - langInfo && - (langInfo.type === StreamType.TRANSCRIPTION || - langInfo.type === StreamType.TRANSLATION) - ) { - this.transcriptionLikeSubscriptionCount += isAdd ? 1 : -1; - if (this.transcriptionLikeSubscriptionCount < 0) - this.transcriptionLikeSubscriptionCount = 0; + if (sub === StreamType.TRANSCRIPTION || (isLangStream && languageStreamInfo.baseType === StreamType.TRANSCRIPTION)) { + // in transcriptionLanguagesArray push languageStreamInfo.language + const transcriptionLanguage = languageStreamInfo?.transcribeLanguage || "en-US"; + if (isAdd) { + this.transcriptionLanguagesSet.add(transcriptionLanguage); + } else { + this.transcriptionLanguagesSet.delete(transcriptionLanguage); } + } + + // Language-specific streams + if (isLangStream) { const prev = this.languageStreamCounts.get(sub) || 0; const next = prev + (isAdd ? 
1 : -1); if (next <= 0) this.languageStreamCounts.delete(sub); @@ -481,4 +484,4 @@ export class SubscriptionManager { } } -export default SubscriptionManager; +export default SubscriptionManager; \ No newline at end of file diff --git a/cloud/packages/cloud/src/services/session/UserSession.ts b/cloud/packages/cloud/src/services/session/UserSession.ts index bc8e4103ba..b48f758547 100644 --- a/cloud/packages/cloud/src/services/session/UserSession.ts +++ b/cloud/packages/cloud/src/services/session/UserSession.ts @@ -522,12 +522,13 @@ export class UserSession { this.subscriptionManager.getAppSubscriptions(packageName); } - const hasPCMTranscriptionSubscriptions = - this.subscriptionManager.hasPCMTranscriptionSubscriptions(); - const requiresAudio = hasPCMTranscriptionSubscriptions.hasMedia; + const mediaSubscriptionDetails = + this.subscriptionManager.getMediaSubscriptionDetails(); + const requiresAudio = mediaSubscriptionDetails.needsMedia; const requiredData = this.microphoneManager.calculateRequiredData( - hasPCMTranscriptionSubscriptions.hasPCM, - hasPCMTranscriptionSubscriptions.hasTranscription, + mediaSubscriptionDetails.needsPcm, + mediaSubscriptionDetails.needsTranslation, + mediaSubscriptionDetails.transcriptionLanguages, ); // Side-effect: update mic state to reflect current needs this.microphoneManager.updateState(requiresAudio, requiredData); diff --git a/cloud/packages/cloud/src/services/session/session.service.ts b/cloud/packages/cloud/src/services/session/session.service.ts index 125fd66c62..7a19851c78 100644 --- a/cloud/packages/cloud/src/services/session/session.service.ts +++ b/cloud/packages/cloud/src/services/session/session.service.ts @@ -146,12 +146,13 @@ export class SessionService { // Calculate streams that need to be active // const requiresAudio = subscriptionService.hasMediaSubscriptions(userId); - const hasPCMTranscriptionSubscriptions = - userSession.subscriptionManager.hasPCMTranscriptionSubscriptions(); - const requiresAudio = 
hasPCMTranscriptionSubscriptions.hasMedia; + const mediaSubscriptionDetails = + userSession.subscriptionManager.getMediaSubscriptionDetails(); + const requiresAudio = mediaSubscriptionDetails.needsMedia; const requiredData = userSession.microphoneManager.calculateRequiredData( - hasPCMTranscriptionSubscriptions.hasPCM, - hasPCMTranscriptionSubscriptions.hasTranscription, + mediaSubscriptionDetails.needsPcm, + mediaSubscriptionDetails.needsTranslation, + mediaSubscriptionDetails.transcriptionLanguages, ); userSession.microphoneManager.updateState(requiresAudio, requiredData); // TODO(isaiah): Feels like an odd place to put it, but it works for now. From 4d0610900863b9e50ceb0295e927ab6e9fc217d7 Mon Sep 17 00:00:00 2001 From: Yash Agarwal Date: Fri, 19 Sep 2025 20:03:29 +0800 Subject: [PATCH 2/2] chore: send local stt language code to cloud --- cloud/packages/cloud/src/models/user.model.ts | 3 +++ .../cloud/src/services/websocket/websocket-glasses.service.ts | 1 + mobile/src/managers/Settings.tsx | 3 +++ mobile/src/services/STTModelManager.ts | 2 ++ 4 files changed, 9 insertions(+) diff --git a/cloud/packages/cloud/src/models/user.model.ts b/cloud/packages/cloud/src/models/user.model.ts index be553b1270..ac250a50fe 100644 --- a/cloud/packages/cloud/src/models/user.model.ts +++ b/cloud/packages/cloud/src/models/user.model.ts @@ -36,6 +36,7 @@ export interface UserI extends Document { bypassAudioEncoding: boolean; metricSystemEnabled: boolean; enforceLocalTranscription: boolean; + localSttLanguage: string; }; location?: Location; locationSubscriptions?: Map; @@ -181,6 +182,7 @@ const UserSchema = new Schema( bypassVad: { type: Boolean, default: false }, bypassAudioEncoding: { type: Boolean, default: false }, enforceLocalTranscription: { type: Boolean, default: false }, + localSttLanguage: { type: String, default: null }, }, default: function () { return { @@ -197,6 +199,7 @@ const UserSchema = new Schema( bypassVad: false, bypassAudioEncoding: false, 
enforceLocalTranscription: false, + localSttLanguage: null, }; }, }, diff --git a/cloud/packages/cloud/src/services/websocket/websocket-glasses.service.ts b/cloud/packages/cloud/src/services/websocket/websocket-glasses.service.ts index 6364ed2b1e..b2168eb00c 100644 --- a/cloud/packages/cloud/src/services/websocket/websocket-glasses.service.ts +++ b/cloud/packages/cloud/src/services/websocket/websocket-glasses.service.ts @@ -353,6 +353,7 @@ export class GlassesWebSocketService { bypassVad: coreInfo.bypass_vad_for_debugging, bypassAudioEncoding: coreInfo.bypass_audio_encoding_for_debugging, enforceLocalTranscription: coreInfo.enforce_local_transcription, + localSttLanguage: coreInfo.local_stt_language, }; logger.debug({ newSettings }, "🔥🔥🔥: newSettings:"); diff --git a/mobile/src/managers/Settings.tsx b/mobile/src/managers/Settings.tsx index f1911f326c..9d9af98647 100644 --- a/mobile/src/managers/Settings.tsx +++ b/mobile/src/managers/Settings.tsx @@ -24,6 +24,7 @@ export const SETTINGS_KEYS = { bypass_audio_encoding_for_debugging: "bypass_audio_encoding_for_debugging", metric_system_enabled: "metric_system_enabled", enforce_local_transcription: "enforce_local_transcription", + local_stt_language: "local_stt_language", button_press_mode: "button_press_mode", default_wearable: "default_wearable", device_name: "device_name", @@ -76,6 +77,8 @@ const DEFAULT_SETTINGS = { [SETTINGS_KEYS.bypass_audio_encoding_for_debugging]: false, [SETTINGS_KEYS.metric_system_enabled]: false, [SETTINGS_KEYS.enforce_local_transcription]: false, + [SETTINGS_KEYS.local_stt_language]: null, + [SETTINGS_KEYS.button_press_mode]: "photo", [SETTINGS_KEYS.default_wearable]: null, [SETTINGS_KEYS.device_name]: "", diff --git a/mobile/src/services/STTModelManager.ts b/mobile/src/services/STTModelManager.ts index fbb9a2b973..e2188e04ca 100644 --- a/mobile/src/services/STTModelManager.ts +++ b/mobile/src/services/STTModelManager.ts @@ -3,6 +3,7 @@ import {Platform} from "react-native" import 
{NativeModules} from "react-native" import {TarBz2Extractor} from "./TarBz2Extractor" import bridge from "@/bridge/MantleBridge" +import settings, { SETTINGS_KEYS } from "@/managers/Settings" const {BridgeModule, FileProviderModule} = NativeModules @@ -389,6 +390,7 @@ class STTModelManager { this.currentModelId = modelId const modelPath = this.getModelPath(modelId) + await settings.set(SETTINGS_KEYS.local_stt_language, model.languageCode) await this.setNativeModelPath(modelPath, model.languageCode) }