AIX: OpenAI: wire support for Audio-input, Audio-output (non-streaming only), and Prediction (rewrite acceleration) modes.
enricoros · Dec 25, 2024 · 1bd327e · 1bd327e
1 parent 557886f
commit 1bd327e
Showing 1 changed file with 70 additions and 11 deletions.
diff --git a/src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts b/src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts
@@ -3,7 +3,8 @@ import { z } from 'zod';
 
 //
 // Implementation notes (see https://platform.openai.com/docs/changelog for upstream changes):
-// - 2024-11-05: "Predicted Outputs" - not fully added yet - TBA
+// - 2024-12-17: "Reasoning Effort" - added reasoning_effort and the 'developer' message role
+// - 2024-11-05: "Predicted Outputs"
 // - 2024-10-17: "gpt-4o-audio-preview" - not fully added: "Audio inputs and outputs are now available in the Chat Completions API" - TBA
 // - 2024-10-01: "DevDay" - added prompt_tokens_details, audio_tokens, and refusal messages
 // - 2024-09-12: "o1" - max_tokens is deprecated in favor of max_completion_tokens, added completion_tokens_details
@@ -34,9 +35,21 @@ export namespace OpenAIWire_ContentParts {
     }),
   });
 
+  const OpenAI_AudioContentPart_schema = z.object({
+    // [OpenAI, 2024-10-17] Audio input content part; 'input_audio' is the tag used by the ContentPart_schema union.
+    type: z.literal('input_audio'),
+    input_audio: z.object({
+      // Base64 encoded audio data (validated only as a string here).
+      data: z.string(),
+      // The format of the encoded audio data; this schema accepts only 'wav' and 'mp3'.
+      format: z.enum(['wav', 'mp3']),
+    }),
+  });
+
   export const ContentPart_schema = z.discriminatedUnion('type', [
     TextContentPart_schema,
     ImageContentPart_schema,
+    OpenAI_AudioContentPart_schema, // [OpenAI, 2024-10-17] audio input part (type: 'input_audio')
   ]);
 
   export function TextContentPart(text: string): z.infer<typeof TextContentPart_schema> {
@@ -104,7 +117,13 @@ export namespace OpenAIWire_Messages {
 
   const AssistantMessage_schema = z.object({
     role: z.literal('assistant'),
-    /** The contents of the assistant message. Required unless tool_calls or function_call is specified. */
+    /**
+     * The contents of the assistant message. Required unless tool_calls or function_call is specified.
+     *
+     * NOTE: upstream also allows the assistant content to be an array of content parts, but as of 2024-12-24 it's
+     *       not important enough to require array support here. The upstream documentation of the array[] form is:
+     *       "An array of content parts with a defined type. Can be one or more of type text, or exactly one of type refusal."
+     */
     content: z.string().nullable(),
     /**
      * The tool calls generated by the model, such as function calls.
@@ -115,12 +134,36 @@ export namespace OpenAIWire_Messages {
      * [OpenAI, 2024-10-01] The refusal message generated by the model.
      */
     refusal: z.string().nullable().optional(),
+    /**
+     * [OpenAI, 2024-10-17] Data about a previous audio response from the model. Usage depends on the context:
+     * - request (this schema): has an id, if present
+     * - non-streaming response: has the generated audio and some metadata
+     * - streaming response: NO audio fields
+     */
+    audio: z.object({
+      id: z.string(),
+    }).nullable().optional(),
     // name: _optionalParticipantName,
   });
 
   export const AssistantMessage_NS_schema = AssistantMessage_schema.extend({
+    //
+    // IMPORTANT - this schema *extends* AssistantMessage_schema: it inherits every field and overrides only the ones redefined below
+    //
+
     // .optional: when parsing a non-streaming message with just a FC, the content can be missing
-    content: z.string().optional().nullable(),
+    content: z.string().nullable().optional(),
+
+    /**
+     * [OpenAI, 2024-10-17] Audio output (non-streaming only) - replaces the id-only `audio` object of the base schema.
+     * If the audio output modality is requested, this object contains data about the audio response from the model
+     */
+    audio: z.object({
+      id: z.string(),
+      data: z.string(), // Base64 encoded audio data
+      expires_at: z.number(), // Unix timestamp
+      transcript: z.string().optional(),
+    }).nullable().optional(),
   });
 
   const ToolMessage_schema = z.object({
@@ -227,13 +270,27 @@ export namespace OpenAIWire_API_Chat_Completions {
     temperature: z.number().min(0).max(2).optional(),
     top_p: z.number().min(0).max(1).optional(),
 
+    // new output modalities
+    modalities: z.array(z.enum(['text', 'audio'])).optional(), // defaults to ['text']
+    audio: z.object({  // Parameters for audio output. Required when audio output is requested with `modalities: ["audio"]`
+      voice: z.enum([
+        'ash', 'ballad', 'coral', 'sage', 'verse', // recommended
+        'alloy', 'echo', 'shimmer', // discouraged
+      ]),
+      format: z.enum(['wav', 'mp3', 'flac', 'opus', 'pcm16']),
+    }).optional(),
+
     // API configuration
     n: z.number().int().positive().optional(), // Defaults to 1, as the derived-ecosystem does not support it
     stream: z.boolean().optional(), // If set, partial message deltas will be sent, with the stream terminated by a `data: [DONE]` message.
     stream_options: z.object({
       include_usage: z.boolean().optional(), // If set, an additional chunk will be streamed with a 'usage' field on the entire request.
     }).optional(),
     reasoning_effort: z.enum(['low', 'medium', 'high']).optional(), // [OpenAI, 2024-12-17] reasoning effort, o1 models only for now
+    prediction: z.object({ // [OpenAI, 2024-11-05] Predicted Outputs - for regenerating a file with only minor changes to most of the content.
+      type: z.literal('content'),
+      content: z.union([z.string(), z.array(OpenAIWire_ContentParts.ContentPart_schema)]),
+    }).optional(),
     response_format: z.discriminatedUnion('type', [
       z.object({
         type: z.literal('text'), // Default
@@ -272,16 +329,16 @@ export namespace OpenAIWire_API_Chat_Completions {
     stop: z.array(z.string()).optional(), // Up to 4 sequences where the API will stop generating further tokens.
     user: z.string().optional(),
 
-    // (deprecated upstream, we decide to omit this): function_call and functions
+    // (deprecated upstream, OMITTED BY CHOICE): function_call and functions
 
-    // (disabled) advanced model configuration
+    // (OMITTED BY CHOICE) advanced model configuration
     // frequency_penalty: z.number().min(-2).max(2).optional(), // Defaults to 0
     // presence_penalty: z.number().min(-2).max(2).optional(),  // Defaults to 0
     // logit_bias: z.record(z.number()).optional(),
-    // logprobs: z.boolean().optional(),                        // Defaults to false
+    // logprobs: z.boolean().optional(), // Defaults to false
     // top_logprobs: z.number().int().min(0).max(20).optional(),
 
-    // (disabled) advanced API configuration
+    // (OMITTED BY CHOICE) advanced API configuration
     // store: z.boolean().optional(), // Defaults to false. Whether or not to store the output of this chat completion request for use in our model distillation or evals products.
     // metadata: z.record(z.any()).optional(), // Developer-defined tags and values used for filtering completions in [the dashboard](https://platform.openai.com/completions)
     // service_tier: z.string().optional(),
@@ -296,7 +353,7 @@ export namespace OpenAIWire_API_Chat_Completions {
     'tool_calls', // the model called a tool
     'content_filter', // upstream content was omitted due to a flag from content filters
 
-    // Disabling Function Call, we decide to not support this obsoleted api
+    // Disabling Function Call, OMITTED BY CHOICE
     // 'function_call', // (deprecated) the model called a function
 
     // Extensions // disabled: we now use a string union to accept any value without breaking
@@ -339,7 +396,7 @@ export namespace OpenAIWire_API_Chat_Completions {
     finish_reason: z.union([FinishReason_Enum, z.string()])
       .nullable(),
 
-    // (we decide to omit this) We will not support logprobs for now, so it's disabled here and in the request
+    // (OMITTED BY CHOICE) We will not support logprobs for now, so it's disabled here and in the request
     // logprobs: z.any().nullable().optional() // Log probability information for the choice.
   });
 
@@ -358,7 +415,7 @@ export namespace OpenAIWire_API_Chat_Completions {
     created: z.number(), // The Unix timestamp (in seconds) of when the chat completion was created.
     system_fingerprint: z.string().optional() // The backend configuration that the model runs with.
       .nullable(), // [Groq, undocumented OpenAI] fingerprint is null on some OpenAI examples too
-    // service_tier: z.string().optional().nullable(),
+    // service_tier: z.string().optional().nullable(), // OMITTED BY CHOICE
 
     // undocumented messages that are not part of the official schema, but can be found when the server sends and error
     error: z.any().optional(),
@@ -416,7 +473,9 @@ export namespace OpenAIWire_API_Chat_Completions {
   const ChunkDelta_schema = z.object({
     role: z.literal('assistant').optional()
       .nullable(), // [Deepseek] added .nullable()
+    // delta-text content
     content: z.string().nullable().optional(),
+    // delta-tool-calls content
     tool_calls: z.array(ChunkDeltaToolCalls_schema).optional()
       .nullable(), // [TogetherAI] added .nullable(), see https://github.com/togethercomputer/together-python/issues/160
     refusal: z.string().nullable().optional(), // [OpenAI, 2024-10-01] refusal message
@@ -433,7 +492,7 @@ export namespace OpenAIWire_API_Chat_Completions {
       .nullable()   // very common, e.g. Azure
       .optional(),  // [OpenRouter] added .optional() which only has the delta field in the whole chunk choice
 
-    // (we decide to omit this) We will not support logprobs for now, so it's disabled here and in the request
+    // (OMITTED BY CHOICE) We will not support logprobs for now, so it's disabled here and in the request
     // logprobs: z.any().nullable().optional() // Log probability information for the choice.
   });