Skip to content

Commit 21a3965

Browse files
authored
[cookbook image-captioning] Add caption streaming and perf metrics (#94)
* Improve image-captioning with batch processing
* Use custom NDJSON hook
* Pretty print timing metrics
* Setup workspace level catalog for dependencies
* Improve hydration warning for theme toggle
* Update use of button styles
* Update README and recipe doc comments
1 parent bba2227 commit 21a3965

File tree

12 files changed

+548
-310
lines changed

12 files changed

+548
-310
lines changed

genai-cookbook/README.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,16 +79,20 @@ The Agentic Cookbook is a collection of recipes demonstrating how to build modern
7979
Build a streaming chat interface that maintains conversation context across multiple exchanges. This recipe demonstrates:
8080

8181
- Real-time token streaming using the Vercel AI SDK
82-
- Auto-scrolling message display using Mantine
82+
- Markdown rendering with syntax-highlighted code blocks using Streamdown
83+
- Auto-scrolling message display with smart scroll detection
84+
- Seamless compatibility with Modular MAX and OpenAI-compatible endpoints
8385

8486
### 2. **Image Captioning**
8587

86-
Create an intelligent image captioning system that generates natural language descriptions for uploaded images. Features include:
88+
Create an intelligent image captioning system that generates natural language descriptions for uploaded images with progressive streaming and performance tracking. Features include:
8789

90+
- **NDJSON streaming**: Custom useNDJSON hook for progressive results—captions appear as they're generated
91+
- **Parallel processing**: Multiple images processed simultaneously for maximum speed
92+
- **Performance metrics**: TTFT (time to first token) and duration tracking with human-readable formatting via pretty-ms
8893
- Drag-and-drop image upload with Mantine Dropzone
89-
- Base64 image encoding for API transport
9094
- Customizable prompt for caption generation
91-
- Gallery view with loading states and progress indicators
95+
- Gallery view with loading states and real-time updates
9296
9397
## Architecture
9498

genai-cookbook/apps/cookbook/components/CodeToggle.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export function CodeToggle() {
2424
icon?: React.ReactNode
2525
}) => {
2626
return (
27-
<Button size="compact-sm" radius="xl">
27+
<Button variant="light" size="compact-sm" radius="xl">
2828
<>
2929
{icon}
3030
<Text size="sm" fw="500" p="6px">

genai-cookbook/apps/cookbook/components/ThemeToggle.tsx

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
11
'use client'
22

3+
import { useEffect, useState } from 'react'
34
import { ActionIcon, Tooltip, useMantineColorScheme } from '@mantine/core'
45
import { IconMoon, IconSun } from '@tabler/icons-react'
56

67
export function ThemeToggle({ stroke }: { stroke: number }): JSX.Element {
78
const { setColorScheme, colorScheme } = useMantineColorScheme()
9+
const [mounted, setMounted] = useState(false)
10+
11+
useEffect(() => {
12+
setMounted(true)
13+
}, [])
814

915
function toggleColorScheme() {
1016
const result = colorScheme === 'dark' ? 'light' : 'dark'
1117
return setColorScheme(result)
1218
}
1319

14-
const label = colorScheme === 'dark' ? 'Switch to light' : 'Switch to dark'
20+
const resolvedScheme = mounted ? colorScheme : 'light'
21+
const label =
22+
resolvedScheme === 'dark' ? 'Switch to light' : 'Switch to dark'
1523

1624
return (
1725
<Tooltip label={label}>
@@ -20,7 +28,7 @@ export function ThemeToggle({ stroke }: { stroke: number }): JSX.Element {
2028
aria-label={label}
2129
variant="transparent"
2230
>
23-
{colorScheme === 'dark' ? (
31+
{resolvedScheme === 'dark' ? (
2432
<IconMoon stroke={stroke} />
2533
) : (
2634
<IconSun stroke={stroke} />

genai-cookbook/apps/cookbook/package.json

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,33 +11,34 @@
1111
"format:check": "prettier --check ."
1212
},
1313
"dependencies": {
14-
"@ai-sdk/openai": "^2.0.23",
15-
"@ai-sdk/react": "^2.0.30",
16-
"@mantine/core": "^7.17.8",
17-
"@mantine/dropzone": "^7.17.8",
18-
"@mantine/hooks": "^7.17.8",
14+
"@ai-sdk/openai": "catalog:",
15+
"@ai-sdk/react": "catalog:",
16+
"@mantine/core": "catalog:",
17+
"@mantine/dropzone": "catalog:",
18+
"@mantine/hooks": "catalog:",
1919
"@modular/recipes": "workspace:*",
20-
"@tabler/icons-react": "^3.34.1",
21-
"ai": "^5.0.28",
22-
"nanoid": "^5.1.5",
20+
"@tabler/icons-react": "catalog:",
21+
"ai": "catalog:",
22+
"nanoid": "catalog:",
2323
"next": "^14",
24-
"openai": "^5.20.2",
24+
"openai": "catalog:",
25+
"pretty-ms": "catalog:",
2526
"react": "^18",
2627
"react-dom": "^18",
2728
"react-syntax-highlighter": "^15.6.6",
28-
"streamdown": "^1.3.0",
29-
"sass": "^1.93.2"
29+
"sass": "^1.93.2",
30+
"streamdown": "catalog:"
3031
},
3132
"devDependencies": {
32-
"@types/node": "^20",
33-
"@types/react": "^18",
34-
"@types/react-dom": "^18",
33+
"@types/node": "catalog:",
34+
"@types/react": "catalog:",
35+
"@types/react-dom": "catalog:",
3536
"@types/react-syntax-highlighter": "^15.5.13",
3637
"eslint": "^8",
3738
"eslint-config-next": "14.2.31",
3839
"postcss": "^8",
3940
"tailwindcss": "^3.4.1",
40-
"typescript": "^5"
41+
"typescript": "catalog:"
4142
},
4243
"packageManager": "[email protected]+sha512.77a884a165cbba2d8d1c19e3b4880eee6d2fcabd0d879121e282196b80042351d5eb3ca0935fa599da1dc51265cc68816ad2bddd2a2de5ea9fdf92adbec7cd34"
4344
}

genai-cookbook/packages/recipes/package.json

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,25 +11,26 @@
1111
"./*": "./src/*"
1212
},
1313
"dependencies": {
14-
"@ai-sdk/openai": "^2.0.23",
15-
"@ai-sdk/react": "^2.0.30",
16-
"@mantine/core": "^7.17.8",
17-
"@mantine/dropzone": "^7.17.8",
18-
"@mantine/hooks": "^7.17.8",
19-
"@tabler/icons-react": "^3.34.1",
20-
"ai": "^5.0.28",
21-
"nanoid": "^5.1.5",
22-
"openai": "^5.20.2",
23-
"streamdown": "^1.3.0"
14+
"@ai-sdk/openai": "catalog:",
15+
"@ai-sdk/react": "catalog:",
16+
"@mantine/core": "catalog:",
17+
"@mantine/dropzone": "catalog:",
18+
"@mantine/hooks": "catalog:",
19+
"@tabler/icons-react": "catalog:",
20+
"ai": "catalog:",
21+
"nanoid": "catalog:",
22+
"openai": "catalog:",
23+
"pretty-ms": "catalog:",
24+
"streamdown": "catalog:"
2425
},
2526
"peerDependencies": {
2627
"react": "^18",
2728
"react-dom": "^18"
2829
},
2930
"devDependencies": {
30-
"@types/node": "^20",
31-
"@types/react": "^18",
32-
"@types/react-dom": "^18",
33-
"typescript": "^5"
31+
"@types/node": "catalog:",
32+
"@types/react": "catalog:",
33+
"@types/react-dom": "catalog:",
34+
"typescript": "catalog:"
3435
}
3536
}

genai-cookbook/packages/recipes/src/image-captioning/api.ts

Lines changed: 105 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,123 @@
1-
import { generateText } from 'ai'
1+
import { streamText } from 'ai'
22
import { RecipeContext } from '../types'
33
import { createOpenAI } from '@ai-sdk/openai'
44

55
/*
6-
* The captioning API mirrors our multi-turn chat route but returns a single
7-
* string instead of a streaming response. Because Modular MAX speaks the
8-
* OpenAI-compatible protocol, the Vercel AI SDK can works with Modular MAX
9-
* out of the box.
6+
* Image Captioning API with NDJSON Streaming and Performance Metrics
7+
*
8+
* This API demonstrates progressive response streaming using NDJSON (newline-delimited JSON).
9+
* Instead of waiting for all captions to complete, we stream each result as it's generated,
10+
* providing immediate feedback to users along with detailed performance metrics.
11+
*
12+
* Key concepts:
13+
* - NDJSON format: One JSON object per line, easy to parse progressively
14+
* - Parallel processing: All images caption simultaneously for speed
15+
* - Stream-as-you-go: Results appear in the UI the moment they're ready
16+
* - Performance tracking: TTFT (time to first token) and duration (generation time) per image
17+
* - OpenAI-compatible: Works with Modular MAX or any OpenAI-compatible server
18+
*
19+
* Timing metrics explained:
20+
* - TTFT: Time from request start to first token (measures latency)
21+
* - Duration: Time from first token to completion (measures generation speed)
1022
*/
1123

12-
// ============================================================================
13-
// POST /api — generates an image caption
14-
// ============================================================================
1524
export default async function POST(req: Request, context: RecipeContext) {
1625
const { apiKey, baseUrl, modelName } = context
17-
const { messages } = await req.json()
18-
if (!messages) {
19-
return new Response('Client did not provide messages', { status: 400 })
26+
const body = await req.json()
27+
28+
const isBatch = Array.isArray(body.batch)
29+
30+
if (!isBatch && !body.messages) {
31+
return new Response('Client did not provide messages or batch', { status: 400 })
2032
}
2133

22-
// Use the the Vercel AI SDK to connect to the MAX endpoint
2334
try {
24-
// createOpenAI returns an OpenAI-compatible client
35+
// The Vercel AI SDK's createOpenAI works with any OpenAI-compatible endpoint
2536
const client = createOpenAI({ baseURL: baseUrl, apiKey })
26-
27-
// chat(modelName) works with LLM servers like MAX that
28-
// implement the chat-completions format
2937
const model = client.chat(modelName)
3038

31-
// Finally, we call generateText to get a caption for our images
32-
const { text } = await generateText({
33-
// The recipe UI creates messages in the ModelMessage format,
34-
// so converting from UIMessage to ModelMessage is unnecessary
35-
model: model,
36-
messages: messages,
37-
})
39+
if (isBatch) {
40+
// NDJSON streaming: send results progressively as they complete
41+
const encoder = new TextEncoder()
42+
const stream = new ReadableStream({
43+
async start(controller) {
44+
try {
45+
// Process all images in parallel using Promise.all
46+
// As each caption completes, we immediately stream it to the client
47+
await Promise.all(
48+
body.batch.map(async (item: { imageId: string; messages: any }) => {
49+
try {
50+
const startTime = Date.now()
51+
let firstTokenTime: number | null = null
52+
let ttft: number | null = null
53+
let textChunks: string[] = []
54+
55+
// Use streamText (not generateText) to capture timing metrics
56+
const result = streamText({
57+
model: model,
58+
messages: item.messages,
59+
})
60+
61+
// Consume the stream chunk-by-chunk to collect text and timing
62+
for await (const chunk of result.textStream) {
63+
// Capture TTFT: time from request start to first token
64+
if (ttft === null) {
65+
firstTokenTime = Date.now()
66+
ttft = firstTokenTime - startTime
67+
}
68+
textChunks.push(chunk)
69+
}
70+
71+
// Duration: time from first token to completion (not total time)
72+
const duration = firstTokenTime ? Date.now() - firstTokenTime : null
73+
const text = textChunks.join('')
74+
75+
// Stream result as NDJSON: one JSON object per line with metrics
76+
const line = JSON.stringify({
77+
imageId: item.imageId,
78+
text,
79+
ttft,
80+
duration
81+
}) + '\n'
82+
controller.enqueue(encoder.encode(line))
83+
} catch (error) {
84+
// Send errors per-image so UI can show partial results
85+
const errorMessage = error instanceof Error ? error.message : 'Unknown error'
86+
const line = JSON.stringify({
87+
imageId: item.imageId,
88+
error: errorMessage
89+
}) + '\n'
90+
controller.enqueue(encoder.encode(line))
91+
}
92+
})
93+
)
94+
95+
controller.close()
96+
} catch (error) {
97+
controller.error(error)
98+
}
99+
},
100+
})
101+
102+
return new Response(stream, {
103+
headers: {
104+
'Content-Type': 'application/x-ndjson',
105+
},
106+
})
107+
} else {
108+
// Single caption request: stream and collect text
109+
const result = streamText({
110+
model: model,
111+
messages: body.messages,
112+
})
113+
114+
let textChunks: string[] = []
115+
for await (const chunk of result.textStream) {
116+
textChunks.push(chunk)
117+
}
38118

39-
return Response.json({ text })
119+
return Response.json({ text: textChunks.join('') })
120+
}
40121
} catch (error) {
41122
const errorMessage = error instanceof Error ? `(${error.message})` : ''
42123
return new Response(`Failed to generate caption ${errorMessage}`, {

0 commit comments

Comments
 (0)