Skip to content

Commit

Permalink
AIX: OpenAI: wire support for Audio-input, Audio-output (non-streamin…
Browse files Browse the repository at this point in the history
…g only), and Prediction (rewrite acceleration) modes.
  • Loading branch information
enricoros committed Dec 25, 2024
1 parent 557886f commit 1bd327e
Showing 1 changed file with 70 additions and 11 deletions.
81 changes: 70 additions & 11 deletions src/modules/aix/server/dispatch/wiretypes/openai.wiretypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import { z } from 'zod';

//
// Implementation notes (see https://platform.openai.com/docs/changelog for upstream changes):
// - 2024-11-05: "Predicted Outputs" - not fully added yet - TBA
// - 2024-12-17: "Reasoning Effort" - added reasoning_effort and the 'developer' message role
// - 2024-11-05: "Predicted Outputs"
// - 2024-10-17: "gpt-4o-audio-preview" - not fully added: "Audio inputs and outputs are now available in the Chat Completions API" - TBA
// - 2024-10-01: "DevDay" - added prompt_tokens_details, audio_tokens, and refusal messages
// - 2024-09-12: "o1" - max_tokens is deprecated in favor of max_completion_tokens, added completion_tokens_details
Expand Down Expand Up @@ -34,9 +35,21 @@ export namespace OpenAIWire_ContentParts {
}),
});

// [OpenAI, 2024-10-17] Payload carried by an audio input part.
const OpenAI_AudioContentPart_payload_schema = z.object({
  // Audio bytes, Base64 encoded.
  data: z.string(),
  // Container/encoding of `data`; this schema accepts only "wav" and "mp3".
  format: z.enum(['wav', 'mp3']),
});

// [OpenAI, 2024-10-17] input content: audio
// Content part tagged with type 'input_audio', wrapping the payload above.
const OpenAI_AudioContentPart_schema = z.object({
  type: z.literal('input_audio'),
  input_audio: OpenAI_AudioContentPart_payload_schema,
});

/**
 * Union of every content part accepted by this module, discriminated on the `type` field.
 * The audio variant is the one tagged with type 'input_audio'.
 */
export const ContentPart_schema = z.discriminatedUnion('type', [
TextContentPart_schema,
ImageContentPart_schema,
OpenAI_AudioContentPart_schema,
]);

export function TextContentPart(text: string): z.infer<typeof TextContentPart_schema> {
Expand Down Expand Up @@ -104,7 +117,13 @@ export namespace OpenAIWire_Messages {

const AssistantMessage_schema = z.object({
role: z.literal('assistant'),
/** The contents of the assistant message. Required unless tool_calls or function_call is specified. */
/**
* The contents of the assistant message. Required unless tool_calls or function_call is specified.
*
* NOTE: upstream now also allows the assistant message content to be an array, but as of 2024-12-24 this is
* not important enough to require array support here. The documentation of the array[] behavior of the field is:
* "An array of content parts with a defined type. Can be one or more of type text, or exactly one of type refusal."
*/
content: z.string().nullable(),
/**
* The tool calls generated by the model, such as function calls.
Expand All @@ -115,12 +134,36 @@ export namespace OpenAIWire_Messages {
* [OpenAI, 2024-10-01] The refusal message generated by the model.
*/
refusal: z.string().nullable().optional(),
/**
* [OpenAI, 2024-10-17] Data about a previous audio response from the model. Usage depends on the context:
* - request (this schema): has an id, if present
* - non-streaming response: has the generated audio and some metadata
* - streaming response: NO audio fields
*/
audio: z.object({
id: z.string(),
}).nullable().optional(),
// name: _optionalParticipantName,
});

/**
 * Assistant message as parsed from a response (presumably "NS" = non-streaming, per the notes below).
 *
 * Built by extending AssistantMessage_schema, so every field of the base schema is inherited and only
 * the fields listed here are overridden:
 *  - content: additionally allowed to be missing
 *  - audio: carries the generated audio and its metadata, rather than only an id
 */
export const AssistantMessage_NS_schema = AssistantMessage_schema.extend({
  //
  // IMPORTANT - this message *extends* the AssistantMessage_schema, to inherit all fields while performing any other change
  //

  // .optional: when parsing a non-streaming message with just a FC, the content can be missing
  // NOTE: declared exactly once - a duplicated key in an object literal is a TypeScript error (and only the last one would apply)
  content: z.string().nullable().optional(),

  /**
   * [OpenAI, 2024-10-17] Audio output (non-streaming only)
   * If the audio output modality is requested, this object contains data about the audio response from the model
   */
  audio: z.object({
    id: z.string(),
    data: z.string(), // Base64 encoded audio data
    expires_at: z.number(), // Unix timestamp
    transcript: z.string().optional(),
  }).nullable().optional(),
});

const ToolMessage_schema = z.object({
Expand Down Expand Up @@ -227,13 +270,27 @@ export namespace OpenAIWire_API_Chat_Completions {
temperature: z.number().min(0).max(2).optional(),
top_p: z.number().min(0).max(1).optional(),

// new output modalities
modalities: z.array(z.enum(['text', 'audio'])).optional(), // defaults to ['text']
audio: z.object({ // Parameters for audio output. Required when audio output is requested with `modalities: ["audio"]`
voice: z.enum([
'ash', 'ballad', 'coral', 'sage', 'verse', // recommended
'alloy', 'echo', 'shimmer', // discouraged
]),
format: z.enum(['wav', 'mp3', 'flac', 'opus', 'pcm16']),
}).optional(),

// API configuration
n: z.number().int().positive().optional(), // Defaults to 1, as the derived-ecosystem does not support it
stream: z.boolean().optional(), // If set, partial message deltas will be sent, with the stream terminated by a `data: [DONE]` message.
stream_options: z.object({
include_usage: z.boolean().optional(), // If set, an additional chunk will be streamed with a 'usage' field on the entire request.
}).optional(),
reasoning_effort: z.enum(['low', 'medium', 'high']).optional(), // [OpenAI, 2024-12-17] reasoning effort, o1 models only for now
prediction: z.object({ // [OpenAI, 2024-11-05] Predicted Outputs - for regenerating a file with only minor changes to most of the content.
type: z.literal('content'),
content: z.union([z.string(), z.array(OpenAIWire_ContentParts.ContentPart_schema)]),
}).optional(),
response_format: z.discriminatedUnion('type', [
z.object({
type: z.literal('text'), // Default
Expand Down Expand Up @@ -272,16 +329,16 @@ export namespace OpenAIWire_API_Chat_Completions {
stop: z.array(z.string()).optional(), // Up to 4 sequences where the API will stop generating further tokens.
user: z.string().optional(),

// (deprecated upstream, we decide to omit this): function_call and functions
// (deprecated upstream, OMITTED BY CHOICE): function_call and functions

// (disabled) advanced model configuration
// (OMITTED BY CHOICE) advanced model configuration
// frequency_penalty: z.number().min(-2).max(2).optional(), // Defaults to 0
// presence_penalty: z.number().min(-2).max(2).optional(), // Defaults to 0
// logit_bias: z.record(z.number()).optional(),
// logprobs: z.boolean().optional(), // Defaults to false
// logprobs: z.boolean().optional(), // Defaults to false
// top_logprobs: z.number().int().min(0).max(20).optional(),

// (disabled) advanced API configuration
// (OMITTED BY CHOICE) advanced API configuration
// store: z.boolean().optional(), // Defaults to false. Whether or not to store the output of this chat completion request for use in our model distillation or evals products.
// metadata: z.record(z.any()).optional(), // Developer-defined tags and values used for filtering completions in [the dashboard](https://platform.openai.com/completions)
// service_tier: z.string().optional(),
Expand All @@ -296,7 +353,7 @@ export namespace OpenAIWire_API_Chat_Completions {
'tool_calls', // the model called a tool
'content_filter', // upstream content was omitted due to a flag from content filters

// Disabling Function Call, we decide to not support this obsoleted api
// Disabling Function Call, OMITTED BY CHOICE
// 'function_call', // (deprecated) the model called a function

// Extensions // disabled: we now use a string union to accept any value without breaking
Expand Down Expand Up @@ -339,7 +396,7 @@ export namespace OpenAIWire_API_Chat_Completions {
finish_reason: z.union([FinishReason_Enum, z.string()])
.nullable(),

// (we decide to omit this) We will not support logprobs for now, so it's disabled here and in the request
// (OMITTED BY CHOICE) We will not support logprobs for now, so it's disabled here and in the request
// logprobs: z.any().nullable().optional() // Log probability information for the choice.
});

Expand All @@ -358,7 +415,7 @@ export namespace OpenAIWire_API_Chat_Completions {
created: z.number(), // The Unix timestamp (in seconds) of when the chat completion was created.
system_fingerprint: z.string().optional() // The backend configuration that the model runs with.
.nullable(), // [Groq, undocumented OpenAI] fingerprint is null on some OpenAI examples too
// service_tier: z.string().optional().nullable(),
// service_tier: z.string().optional().nullable(), // OMITTED BY CHOICE

// undocumented messages that are not part of the official schema, but can be found when the server sends an error
error: z.any().optional(),
Expand Down Expand Up @@ -416,7 +473,9 @@ export namespace OpenAIWire_API_Chat_Completions {
const ChunkDelta_schema = z.object({
role: z.literal('assistant').optional()
.nullable(), // [Deepseek] added .nullable()
// delta-text content
content: z.string().nullable().optional(),
// delta-tool-calls content
tool_calls: z.array(ChunkDeltaToolCalls_schema).optional()
.nullable(), // [TogetherAI] added .nullable(), see https://github.com/togethercomputer/together-python/issues/160
refusal: z.string().nullable().optional(), // [OpenAI, 2024-10-01] refusal message
Expand All @@ -433,7 +492,7 @@ export namespace OpenAIWire_API_Chat_Completions {
.nullable() // very common, e.g. Azure
.optional(), // [OpenRouter] added .optional() which only has the delta field in the whole chunk choice

// (we decide to omit this) We will not support logprobs for now, so it's disabled here and in the request
// (OMITTED BY CHOICE) We will not support logprobs for now, so it's disabled here and in the request
// logprobs: z.any().nullable().optional() // Log probability information for the choice.
});

Expand Down

0 comments on commit 1bd327e

Please sign in to comment.