MacPaw · nezhyborets · Jan 17, 2025 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift b/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
@@ -88,7 +88,7 @@ extension AudioTranscriptionQuery: MultipartFormDataBodyEncodable {
             .string(paramName: "prompt", value: prompt),
             .string(paramName: "temperature", value: temperature),
             .string(paramName: "language", value: language),
-            .string(paramName: "response_format", value: responseFormat)
+            .string(paramName: "response_format", value: responseFormat?.rawValue)
         ])
         return bodyBuilder.build()
     }

diff --git a/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift b/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
@@ -8,7 +8,41 @@
 import Foundation
 
 public struct AudioTranscriptionResult: Codable, Equatable {
-
-    /// The transcribed text.
+    /// The task type reported in the response (e.g. "transcribe"); `nil` when the field is absent.
+    public let task: String?
+    /// The detected language; `nil` when the field is absent.
+    public let language: String?
+    /// The duration of the audio in seconds; `nil` when the field is absent.
+    public let duration: Double?
+    /// The transcribed text. This is the only non-optional property of the result.
     public let text: String
+    /// Per-segment details (only present in verbose_json format); `nil` otherwise.
+    public let segments: [Segment]?
+
+    public init(
+        task: String? = nil,
+        language: String? = nil,
+        duration: Double? = nil,
+        text: String,
+        segments: [Segment]? = nil
+    ) { // plain memberwise assignment; every parameter except `text` defaults to nil
+        self.task = task
+        self.language = language
+        self.duration = duration
+        self.text = text
+        self.segments = segments
+    }
+
+    public struct Segment: Codable, Equatable { // element type of `segments`; no CodingKeys declared here, so property names serve as the coding keys
+        public let id: Int // NOTE(review): presumably the segment identifier/index — confirm against the API reference
+        public let seek: Int // NOTE(review): presumably the seek offset of the segment — confirm
+        public let start: Double // NOTE(review): presumably the start time in seconds, same unit as `duration` — confirm
+        public let end: Double // NOTE(review): presumably the end time in seconds — confirm
+        public let text: String // text of this segment
+        public let tokens: [Int] // NOTE(review): presumably token IDs for `text` — confirm
+        public let temperature: Double // NOTE(review): presumably the sampling temperature used for this segment — confirm
+        public let avg_logprob: Double // snake_case name doubles as the coding key
+        public let compression_ratio: Double // snake_case name doubles as the coding key
+        public let no_speech_prob: Double // snake_case name doubles as the coding key
+    }
 }
diff --git a/Tests/OpenAITests/OpenAITests.swift b/Tests/OpenAITests/OpenAITests.swift
@@ -320,7 +320,38 @@ class OpenAITests: XCTestCase {
         let result = try await openAI.audioTranscriptions(query: query)
         XCTAssertEqual(result, transcriptionResult)
     }
-
+
+    func testVerboseJsonAudioTranscriptions() async throws { // expects audioTranscriptions to return the stubbed verbose result unchanged
+        let data = Data() // empty audio payload
+        let query = AudioTranscriptionQuery(file: data, fileType: .m4a, model: .whisper_1, responseFormat: .verboseJson)
+
+        let transcriptionResult = AudioTranscriptionResult( // expected value; also handed to stub(result:) below
+            task: "transcribe",
+            language: "english",
+            duration: 3.759999990463257,
+            text: "This is a test.",
+            segments: [
+                AudioTranscriptionResult.Segment(
+                    id: 0,
+                    seek: 0,
+                    start: 0,
+                    end: 3.759999990463257, // same value as the top-level duration: one segment spans the whole fixture
+                    text: " This is a test.", // note the leading space, unlike the top-level text
+                    tokens: [50364, 639, 307, 257, 1500, 13, 50552],
+                    temperature: 0,
+                    avg_logprob: -0.5153926610946655,
+                    compression_ratio: 0.7142857313156128,
+                    no_speech_prob: 0.08552933484315872
+                )
+            ]
+        )
+
+        try self.stub(result: transcriptionResult) // NOTE(review): presumably makes the mocked transport return this value — confirm in stub(result:)
+
+        let result = try await openAI.audioTranscriptions(query: query)
+        XCTAssertEqual(result, transcriptionResult) // compares every field, segments included, assuming the synthesized Equatable conformance
+    }
+
     func testAudioTranscriptionsError() async throws {
         let data = Data()
         let query = AudioTranscriptionQuery(file: data, fileType: .m4a, model: .whisper_1)