11import Foundation
2+ import OSLog
23
34/// A single custom vocabulary entry.
45public struct CustomVocabularyTerm : Codable , Sendable {
@@ -21,6 +22,11 @@ public struct CustomVocabularyConfig: Codable, Sendable {
2122 public let depthScaling : Float ?
2223 public let scorePerPhrase : Float ?
2324 public let terms : [ CustomVocabularyTerm ]
25+
26+ // CTC keyword boosting confidence thresholds
27+ public let minCtcScore : Float ?
28+ public let minSimilarity : Float ?
29+ public let minCombinedConfidence : Float ?
2430}
2531
2632/// Runtime context used by the decoder biasing system.
@@ -31,35 +37,105 @@ public struct CustomVocabularyContext: Sendable {
3137 public let depthScaling : Float
3238 public let scorePerPhrase : Float
3339
40+ // CTC keyword boosting confidence thresholds
41+ public let minCtcScore : Float
42+ public let minSimilarity : Float
43+ public let minCombinedConfidence : Float
44+
/// Creates a context from explicit values; every argument is stored unchanged.
///
/// - Parameters:
///   - terms: Vocabulary entries to keep in the context.
///   - alpha: Stored as `alpha` (defaults to 0.5).
///   - contextScore: Stored as `contextScore` (defaults to 1.2).
///   - depthScaling: Stored as `depthScaling` (defaults to 2.0).
///   - scorePerPhrase: Stored as `scorePerPhrase` (defaults to 0.0).
///   - minCtcScore: CTC keyword boosting threshold (defaults to -10.0).
///   - minSimilarity: CTC keyword boosting threshold (defaults to 0.50).
///   - minCombinedConfidence: CTC keyword boosting threshold (defaults to 0.54).
public init(
    terms: [CustomVocabularyTerm],
    alpha: Float = 0.5,
    contextScore: Float = 1.2,
    depthScaling: Float = 2.0,
    scorePerPhrase: Float = 0.0,
    minCtcScore: Float = -10.0,
    minSimilarity: Float = 0.50,
    minCombinedConfidence: Float = 0.54
) {
    // CTC keyword boosting confidence thresholds.
    self.minCombinedConfidence = minCombinedConfidence
    self.minSimilarity = minSimilarity
    self.minCtcScore = minCtcScore

    // Biasing parameters.
    self.scorePerPhrase = scorePerPhrase
    self.depthScaling = depthScaling
    self.contextScore = contextScore
    self.alpha = alpha

    // Vocabulary entries.
    self.terms = terms
}
4764
/// Load a custom vocabulary JSON file produced by the analysis tooling.
///
/// The file is decoded as a `CustomVocabularyConfig`. Any tuning field missing
/// from the JSON falls back to the same default value used by `init`. Each
/// term's `text` is run through `sanitizeVocabularyTerm`; warnings are logged,
/// and terms that are empty or whitespace-only after sanitization are dropped.
///
/// - Parameter url: Location of the JSON file to read.
/// - Returns: A context holding the terms that passed validation.
/// - Throws: Any error raised while reading the file or decoding the JSON.
public static func load(from url: URL) throws -> CustomVocabularyContext {
    let logger = Logger(subsystem: "com.fluidaudio", category: "CustomVocabulary")
    let data = try Data(contentsOf: url)
    let config = try JSONDecoder().decode(CustomVocabularyConfig.self, from: data)

    // Validate vocabulary terms.
    var validatedTerms: [CustomVocabularyTerm] = []
    validatedTerms.reserveCapacity(config.terms.count)

    for term in config.terms {
        let (sanitized, warnings) = sanitizeVocabularyTerm(term.text)

        if !warnings.isEmpty {
            logger.warning("Term '\(term.text)': \(warnings.joined(separator: ", "))")
        }

        // Skip terms with no usable content. Trimming first means a term made
        // only of spaces is treated as empty instead of slipping through.
        guard !sanitized.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            logger.warning("Term '\(term.text)' is empty after sanitization, skipping")
            continue
        }

        // NOTE(review): the original term is stored, not the sanitized text, so
        // sanitization currently only drives warnings and the empty check.
        // Confirm whether the cleaned text should replace `term.text`.
        validatedTerms.append(term)
    }

    return CustomVocabularyContext(
        terms: validatedTerms,
        alpha: config.alpha ?? 0.5,
        contextScore: config.contextScore ?? 1.2,
        depthScaling: config.depthScaling ?? 2.0,
        scorePerPhrase: config.scorePerPhrase ?? 0.0,
        minCtcScore: config.minCtcScore ?? -10.0,
        minSimilarity: config.minSimilarity ?? 0.50,
        minCombinedConfidence: config.minCombinedConfidence ?? 0.54
    )
}
108+
/// Sanitize a vocabulary term and return warnings about potential issues.
///
/// Only control characters are removed from the text; every other check is
/// informational and leaves the text untouched.
///
/// - Parameter text: The raw term text.
/// - Returns: The text with control characters stripped, plus one warning per
///   check that fired (control characters, diacritics, digits, and characters
///   outside letters / whitespace / hyphen / apostrophe), in that order.
private static func sanitizeVocabularyTerm(_ text: String) -> (sanitized: String, warnings: [String]) {
    var warnings: [String] = []
    var result = text

    // 1. Strip control characters. Work on Unicode scalars and remove exactly
    //    the members of `.controlCharacters`, so non-whitespace controls
    //    (e.g. U+0007) are really removed and ordinary whitespace is preserved.
    let controlSet = CharacterSet.controlCharacters
    if result.rangeOfCharacter(from: controlSet) != nil {
        warnings.append("contains control characters")
        var cleaned = String.UnicodeScalarView()
        for scalar in result.unicodeScalars where !controlSet.contains(scalar) {
            cleaned.append(scalar)
        }
        result = String(cleaned)
    }

    // 2. Check for diacritics (informational, not blocking).
    if result.folding(options: .diacriticInsensitive, locale: nil) != result {
        warnings.append("contains diacritics - consider adding ASCII alias")
    }

    // 3. Check for numbers (informational).
    if result.rangeOfCharacter(from: .decimalDigits) != nil {
        warnings.append("contains numbers")
    }

    // 4. Check for unusual characters (not letters, spaces, hyphens, apostrophes).
    //    Digits are outside this set, so a term with numbers also triggers this.
    let allowedChars = CharacterSet.letters
        .union(.whitespaces)
        .union(CharacterSet(charactersIn: "-'"))

    if result.rangeOfCharacter(from: allowedChars.inverted) != nil {
        warnings.append("contains unusual characters")
    }

    return (result, warnings)
}
65141}
0 commit comments