diff --git a/.gitignore b/.gitignore
index db1c917..891c924 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ dist
 .env
 .env.production
 api/lib/*
-deploy.sh
\ No newline at end of file
+deploy.sh
+MOTIA_DOCS.md
\ No newline at end of file
diff --git a/api/motia-workbench.json b/api/motia-workbench.json
index 0fd0ed7..e75cc4d 100644
--- a/api/motia-workbench.json
+++ b/api/motia-workbench.json
@@ -20,6 +20,22 @@
   {
     "id": "chess",
     "config": {
+      "steps/chess/12-play-vs-ai.step.ts": {
+        "x": 350,
+        "y": 1056
+      },
+      "steps/chess/11-get-game-history-detail.step.ts": {
+        "x": 0,
+        "y": 0
+      },
+      "steps/chess/10b-export-game-history.step.ts": {
+        "x": 6,
+        "y": 224
+      },
+      "steps/chess/10-get-game-history.step.ts": {
+        "x": 26,
+        "y": 428
+      },
       "steps/chess/09-purge-stuck-games.step.ts": {
         "x": -237,
         "y": 580
@@ -50,8 +66,8 @@
         "y": 112
       },
       "steps/chess/02-get-game.step.ts": {
-        "x": 1045,
-        "y": 114
+        "x": 1019,
+        "y": 92
       },
       "steps/chess/01-create-game.step.ts": {
         "x": 432,
@@ -59,8 +75,8 @@
         "targetHandlePosition": "left"
       },
       "steps/chess/00-available-models-api.step.ts": {
-        "x": -209,
-        "y": 86,
+        "x": -460,
+        "y": 111,
         "sourceHandlePosition": "right"
       },
       "steps/chess/access/request-access.step.ts": {
@@ -76,5 +92,62 @@
         "y": 680
       }
     }
+  },
+  {
+    "id": "benchmark",
+    "config": {
+      "steps/benchmark/11-stockfish-leaderboard.step.ts": {
+        "x": 0,
+        "y": 2444
+      },
+      "steps/benchmark/10-run-stockfish-benchmark.step.ts": {
+        "x": 0,
+        "y": 2220
+      },
+      "steps/benchmark/09-run-all-benchmarks.step.ts": {
+        "x": -282,
+        "y": 337
+      },
+      "steps/benchmark/08-get-puzzle-sets.step.ts": {
+        "x": 435,
+        "y": 30
+      },
+      "steps/benchmark/07-get-puzzle-leaderboard.step.ts": {
+        "x": 3,
+        "y": 204
+      },
+      "steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts": {
+        "x": 0,
+        "y": 1120
+      },
+      "steps/benchmark/06-run-puzzle-benchmark.step.ts": {
+        "x": 465,
+        "y": 360
+      },
+      "steps/benchmark/05-fetch-puzzle-set.step.ts": {
+        "x": 0,
+        "y": 612
+      },
+      "steps/benchmark/04-get-benchmark-leaderboard.step.ts": {
+        "x": 0,
+        "y": 816
+      },
+      "steps/benchmark/03-get-benchmark-run-detail.step.ts": {
+        "x": 0,
+        "y": 1060
+      },
+      "steps/benchmark/02-get-benchmark-runs.step.ts": {
+        "x": 0,
+        "y": 1304
+      },
+      "steps/benchmark/01-run-legal-move-benchmark.step.ts": {
+        "x": 0,
+        "y": 1528
+      },
+      "steps/benchmark/00-generate-position-set.step.ts": {
+        "x": 0,
+        "y": 1752
+      }
+    }
   }
 ]
\ No newline at end of file
diff --git a/api/services/ai/claude.ts b/api/services/ai/claude.ts
index bc469bd..15dbc51 100644
--- a/api/services/ai/claude.ts
+++ b/api/services/ai/claude.ts
@@ -2,6 +2,7 @@ import { streamObject } from 'ai'
 import { createAnthropic } from '@ai-sdk/anthropic'
 import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
 import { models } from './models'
+import { getMaxReasoningProviderOptions } from './provider-options'
 import { Handler } from './types'
 
 export const claude: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => {
@@ -9,12 +10,15 @@ export const claude: Handler = async ({ prompt, logger, model, onThoughtUpdate }
     apiKey: process.env.ANTHROPIC_API_KEY,
   })
 
+  const modelId = model ?? models.claude
   const { partialObjectStream, object } = streamObject({
-    model: anthropic(model ?? models.claude),
+    model: anthropic(modelId),
     prompt,
     schema: AiPlayerPromptSchema,
+    mode: 'json',
     maxRetries: 0,
     abortSignal: AbortSignal.timeout(180000),
+    providerOptions: getMaxReasoningProviderOptions('claude', modelId),
   })
 
   for await (const partialObject of partialObjectStream) {
diff --git a/api/services/ai/gemini.ts b/api/services/ai/gemini.ts
index 2a198d1..79b34b1 100644
--- a/api/services/ai/gemini.ts
+++ b/api/services/ai/gemini.ts
@@ -2,6 +2,7 @@ import { streamObject } from 'ai'
 import { createGoogleGenerativeAI } from '@ai-sdk/google'
 import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
 import { models } from './models'
+import { getMaxReasoningProviderOptions } from './provider-options'
 import { Handler } from './types'
 
 export const gemini: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => {
@@ -9,12 +10,14 @@ export const gemini: Handler = async ({ prompt, logger, model, onThoughtUpdate }
     apiKey: process.env.GEMINI_API_KEY,
   })
 
+  const modelId = model ?? models.gemini
   const { partialObjectStream, object } = streamObject({
-    model: googleAI(model ?? models.gemini),
+    model: googleAI(modelId),
     prompt,
     schema: AiPlayerPromptSchema,
     maxRetries: 0,
     abortSignal: AbortSignal.timeout(180000),
+    providerOptions: getMaxReasoningProviderOptions('gemini', modelId),
   })
 
   for await (const partialObject of partialObjectStream) {
diff --git a/api/services/ai/grok.ts b/api/services/ai/grok.ts
index 9a46d16..cdc75ec 100644
--- a/api/services/ai/grok.ts
+++ b/api/services/ai/grok.ts
@@ -2,6 +2,7 @@ import { streamObject } from 'ai'
 import { createXai } from '@ai-sdk/xai'
 import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
 import { models } from './models'
+import { getMaxReasoningProviderOptions } from './provider-options'
 import { Handler } from './types'
 
 export const grok: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => {
@@ -9,12 +10,15 @@ export const grok: Handler = async ({ prompt, logger, model, onThoughtUpdate })
     apiKey: process.env.XAI_API_KEY,
   })
 
+  const modelId = model ?? models.grok
   const { partialObjectStream, object } = streamObject({
-    model: xai(model ?? models.grok),
+    model: xai(modelId),
     prompt,
     schema: AiPlayerPromptSchema,
+    mode: 'json',
     maxRetries: 0,
     abortSignal: AbortSignal.timeout(180000),
+    providerOptions: getMaxReasoningProviderOptions('grok', modelId),
   })
 
   for await (const partialObject of partialObjectStream) {
diff --git a/api/services/ai/models.ts b/api/services/ai/models.ts
index 3bffecb..b2804e2 100644
--- a/api/services/ai/models.ts
+++ b/api/services/ai/models.ts
@@ -1,47 +1,84 @@
-import { AiModels, AiProviderDefaultModel } from '@chessarena/types/ai-models'
+import { AiModels, AiModelProvider, AiProviderDefaultModel } from '@chessarena/types/ai-models'
 
 // NOTE: these are the models used for AI vs AI games, it is also used for backwards compatibility for existing games that don't have a model assigned to a player
+// IMPORTANT: These must match model names in supportedModelsByProvider below!
 export const models: AiProviderDefaultModel = {
-  openai: 'gpt-5-2025-08-07',
+  openai: 'gpt-5.2',
   gemini: 'gemini-2.5-flash',
-  claude: 'claude-sonnet-4-5-20250929',
-  grok: 'grok-4-fast',
+  claude: 'claude-sonnet-4-5',
+  grok: 'grok-4-fast-non-reasoning',
 }
 
-// NOTE: these are all the models supported by provider that users can pick in order to play human vs AI games
+/**
+ * ============================================
+ * BENCHMARK MODELS - Add new models here!
+ * ============================================
+ *
+ * To add a new model for benchmarking:
+ * 1. Add it to the appropriate provider array below
+ * 2. Restart the dev server
+ * 3. Run the benchmark: POST /benchmark/legal-moves/run-all
+ *
+ * To run benchmark for a single model:
+ * POST /benchmark/legal-moves/run { "provider": "claude", "model": "claude-3-5-haiku-latest" }
+ *
+ * Provider documentation:
+ * - OpenAI: https://platform.openai.com/docs/models
+ * - Gemini: https://ai.google.dev/gemini-api/docs/models
+ * - Claude: https://docs.anthropic.com/en/docs/about-claude/models/overview
+ * - Grok: https://docs.x.ai/docs/models
+ */
 export const supportedModelsByProvider: AiModels = {
+  // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/openai
   openai: [
-    // https://platform.openai.com/docs/models
-    'gpt-5-2025-08-07',
-    'gpt-5-mini-2025-08-07',
-    'gpt-5-nano-2025-08-07',
-    'gpt-4.1-nano-2025-04-14',
-    'gpt-4.1-mini-2025-04-14',
-    'gpt-4o-mini-2024-07-18',
-    'o4-mini-2025-04-16',
+    'gpt-5.2', // Latest
+    'gpt-5.1', // Previous flagship
+    'gpt-5', // GPT-5
+    'gpt-5-mini', // Fast
+    'gpt-4.1', // GPT-4.1
+    'gpt-4.1-mini', // Fast GPT-4.1
+    'gpt-4o', // GPT-4o
+    'gpt-4o-mini', // Fast GPT-4o
   ],
+  // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/google-generative-ai
   gemini: [
-    // https://ai.google.dev/gemini-api/docs/models
-    'gemini-2.5-flash',
-    'gemini-2.5-flash-lite',
-    'gemini-2.0-flash-001',
-    'gemini-2.0-flash-lite-001',
+    'gemini-3-pro-preview', // Latest preview
+    'gemini-2.5-pro', // Latest pro
+    'gemini-2.5-flash', // Fast
+    'gemini-2.5-flash-lite', // Ultra fast
+    'gemini-2.0-flash', // Stable flash
   ],
+  // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/anthropic
   claude: [
-    // https://docs.anthropic.com/en/docs/about-claude/models/overview
-    'claude-opus-4-1-20250805',
-    'claude-opus-4-20250514',
-    'claude-sonnet-4-5-20250929',
-    'claude-sonnet-4-20250514',
-    'claude-3-7-sonnet-20250219',
-    'claude-haiku-4-5-20251001',
-    'claude-3-5-haiku-20241022',
-  ],
-  grok: [
-    // https://docs.x.ai/docs/models
-    'grok-4-fast',
-    'grok-4-fast-non-reasoning',
-    'grok-3-mini',
-    'grok-3',
+    'claude-opus-4-5', // Latest opus (no dot!)
+    'claude-sonnet-4-5', // Latest sonnet (no dot!)
+    'claude-haiku-4-5', // Latest haiku (no dot!)
+    'claude-opus-4-0', // Opus 4.0
+    'claude-sonnet-4-0', // Sonnet 4.0
+    'claude-3-7-sonnet-latest', // Claude 3.7
+    'claude-3-5-haiku-latest', // Claude 3.5 Haiku
   ],
+  // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/xai
+  grok: ['grok-4-fast-non-reasoning', 'grok-4-fast-reasoning', 'grok-3-fast'],
+}
+
+/**
+ * Flattens `supportedModelsByProvider` into a single list of
+ * `{ provider, model }` pairs (used by the benchmarks).
+ * Entries follow the map's key order, then each provider's array order.
+ * A new array is built on every call.
+ *
+ * @returns One entry per supported model, across every provider.
+ */
+export const getAllModels = (): { provider: AiModelProvider; model: string }[] =>
+  Object.entries(supportedModelsByProvider).flatMap(([providerKey, providerModels]) =>
+    // Object.entries widens keys to string, so narrow back to AiModelProvider.
+    providerModels.map((model) => ({ provider: providerKey as AiModelProvider, model })),
+  )
+
+/**
+ * Lists the model ids configured for `provider` (empty array when none are registered).
+ */
+export const getModelsForProvider = (provider: AiModelProvider): string[] => {
+  return supportedModelsByProvider[provider] ?? []
 }
diff --git a/api/services/ai/openai.ts b/api/services/ai/openai.ts
index f8d4aad..d8c8828 100644
--- a/api/services/ai/openai.ts
+++ b/api/services/ai/openai.ts
@@ -2,6 +2,7 @@ import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
 import { createOpenAI } from '@ai-sdk/openai'
 import { streamObject } from 'ai'
 import { models } from './models'
+import { getMaxReasoningProviderOptions } from './provider-options'
 import { Handler } from './types'
 
 export const openai: Handler = async ({ model, logger, prompt, onThoughtUpdate }) => {
@@ -9,12 +10,14 @@ export const openai: Handler = async ({ model, logger, prompt, onThoughtUpdate }
     apiKey: process.env.OPENAI_API_KEY,
   })
 
+  const modelId = model ?? models.openai
   const { partialObjectStream, object } = streamObject({
-    model: openai(model ?? models.openai),
+    model: openai(modelId),
     prompt,
     schema: AiPlayerPromptSchema,
     maxRetries: 0,
     abortSignal: AbortSignal.timeout(180000),
+    providerOptions: getMaxReasoningProviderOptions('openai', modelId),
   })
 
   for await (const partialObject of partialObjectStream) {
diff --git a/api/services/ai/provider-options.ts b/api/services/ai/provider-options.ts
new file mode 100644
index 0000000..7bd4d65
--- /dev/null
+++ b/api/services/ai/provider-options.ts
@@ -0,0 +1,87 @@
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import type { JSONValue } from 'ai'
+
+type OpenAIReasoningEffort = 'minimal' | 'low' | 'medium' | 'high'
+type GrokReasoningEffort = 'low' | 'high'
+/** Clamps `value` to the inclusive range [min, max]; no rounding is applied. NOTE(review): not referenced in this module. */
+const clampInt = (value: number, min: number, max: number) => Math.max(min, Math.min(max, value))
+
+/**
+ * Maps a raw string to an OpenAI reasoning effort (case-insensitive, trimmed);
+ * returns undefined when the value is missing or not a known level.
+ */
+const parseOpenAiReasoningEffort = (value: string | undefined): OpenAIReasoningEffort | undefined => {
+  if (!value) return undefined
+  const candidate = value.toLowerCase().trim()
+  return (['minimal', 'low', 'medium', 'high'] as const).find((effort) => effort === candidate)
+}
+
+/** Maps a raw string (case-insensitive, trimmed) to a Grok reasoning effort; undefined if empty or unknown. */
+const parseGrokReasoningEffort = (value: string | undefined): GrokReasoningEffort | undefined => {
+  if (!value) return undefined
+  const candidate = value.toLowerCase().trim()
+  // GrokReasoningEffort has exactly two levels.
+  return candidate === 'low' || candidate === 'high' ? candidate : undefined
+}
+
+/**
+ * Builds the `providerOptions` that request the highest reasoning/thinking
+ * setting this module defines for the given provider and model.
+ *
+ * @param provider - Provider key; selects which option namespace is produced.
+ * @param model - Model id, matched by prefix/substring to decide support.
+ * @returns Options keyed by SDK provider name, or `{}` when the model is not matched.
+ */
+export const getMaxReasoningProviderOptions = (
+  provider: AiModelProvider,
+  model: string,
+): Record<string, Record<string, JSONValue>> => {
+  if (provider === 'openai') {
+    // Only gpt-5* ids are given a reasoning effort.
+    return model.startsWith('gpt-5') ? { openai: { reasoningEffort: 'high' } } : {}
+  }
+
+  if (provider === 'claude') {
+    // Ids containing "-4" (which also covers "-4-") get thinking enabled.
+    if (!model.includes('-4')) return {}
+    return { anthropic: { thinking: { type: 'enabled', budgetTokens: 16000 } } }
+  }
+
+  if (provider === 'gemini') {
+    const isThinkingModel = ['gemini-2.5', 'gemini-3'].some((family) => model.includes(family))
+    // Gemini thinkingBudget is model-dependent; clamp to a safe max across the series.
+    return isThinkingModel ? { google: { thinkingConfig: { thinkingBudget: 24576 } } } : {}
+  }
+
+  if (provider === 'grok') {
+    // xAI only supports reasoningEffort on specific models (e.g. grok-3-mini).
+    return model === 'grok-3-mini' ? { xai: { reasoningEffort: 'high' } } : {}
+  }
+
+  // Any other provider value: no extra options.
+  return {}
+}
+
+/**
+ * Provider options used for benchmark runs.
+ * Intended to keep results reproducible and avoid long-tail timeouts by using a
+ * consistent, capped reasoning budget.
+ *
+ * Only OpenAI `gpt-5*` models receive an option: a reasoning effort read from
+ * `BENCHMARK_OPENAI_REASONING_EFFORT` (`'low'` when unset or unrecognized).
+ * Every other provider/model combination gets `{}`.
+ *
+ * @param provider - Provider the benchmark request targets.
+ * @param model - Model id; only a `gpt-5` prefix is inspected.
+ * @returns Options keyed by SDK provider name, possibly empty.
+ */
+export const getBenchmarkProviderOptions = (
+  provider: AiModelProvider,
+  model: string,
+): Record<string, Record<string, JSONValue>> => {
+  // claude, gemini, grok and non gpt-5 OpenAI models get no extra options.
+  if (provider !== 'openai' || !model.startsWith('gpt-5')) return {}
+
+  const requestedEffort = parseOpenAiReasoningEffort(process.env.BENCHMARK_OPENAI_REASONING_EFFORT)
+  return { openai: { reasoningEffort: requestedEffort ?? 'low' } }
+}
diff --git a/api/services/ai/random-ai-selection.ts b/api/services/ai/random-ai-selection.ts
new file mode 100644
index 0000000..8744a36
--- /dev/null
+++ b/api/services/ai/random-ai-selection.ts
@@ -0,0 +1,98 @@
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { supportedModelsByProvider } from './models'
+
+type ModelWithWeight = {
+  provider: AiModelProvider
+  model: string
+  weight: number // Higher weight = higher chance of selection
+  tier: 'cheap' | 'mid' | 'expensive'
+}
+
+// Define model tiers and weights (cheaper = higher weight)
+// IMPORTANT: Model names must match exactly with supportedModelsByProvider in models.ts
+const MODEL_WEIGHTS: ModelWithWeight[] = [
+  // Cheap tier (weight: 10) - highest chance
+  { provider: 'gemini', model: 'gemini-2.5-flash-lite', weight: 10, tier: 'cheap' },
+  { provider: 'gemini', model: 'gemini-2.5-flash', weight: 10, tier: 'cheap' },
+  { provider: 'gemini', model: 'gemini-2.0-flash', weight: 10, tier: 'cheap' },
+  { provider: 'claude', model: 'claude-3-5-haiku-latest', weight: 10, tier: 'cheap' },
+  { provider: 'claude', model: 'claude-haiku-4-5', weight: 10, tier: 'cheap' },
+  { provider: 'openai', model: 'gpt-4o-mini', weight: 10, tier: 'cheap' },
+  { provider: 'openai', model: 'gpt-4.1-mini', weight: 10, tier: 'cheap' },
+  { provider: 'grok', model: 'grok-3-fast', weight: 10, tier: 'cheap' },
+
+  // Mid tier (weight: 5)
+  { provider: 'gemini', model: 'gemini-2.5-pro', weight: 5, tier: 'mid' },
+  { provider: 'claude', model: 'claude-sonnet-4-0', weight: 5, tier: 'mid' },
+  { provider: 'claude', model: 'claude-3-7-sonnet-latest', weight: 5, tier: 'mid' },
+  { provider: 'grok', model: 'grok-4-fast-non-reasoning', weight: 5, tier: 'mid' },
+  { provider: 'openai', model: 'gpt-5-mini', weight: 5, tier: 'mid' },
+  { provider: 'openai', model: 'gpt-4o', weight: 5, tier: 'mid' },
+  { provider: 'openai', model: 'gpt-4.1', weight: 5, tier: 'mid' },
+
+  // Expensive tier (weight: 2; 1 for claude-opus-4-5 and gpt-5.2) - lowest chance
+  { provider: 'gemini', model: 'gemini-3-pro-preview', weight: 2, tier: 'expensive' },
+  { provider: 'claude', model: 'claude-sonnet-4-5', weight: 2, tier: 'expensive' },
+  { provider: 'claude', model: 'claude-opus-4-0', weight: 2, tier: 'expensive' },
+  { provider: 'claude', model: 'claude-opus-4-5', weight: 1, tier: 'expensive' },
+  { provider: 'grok', model: 'grok-4-fast-reasoning', weight: 2, tier: 'expensive' },
+  { provider: 'openai', model: 'gpt-5', weight: 2, tier: 'expensive' },
+  { provider: 'openai', model: 'gpt-5.1', weight: 2, tier: 'expensive' },
+  { provider: 'openai', model: 'gpt-5.2', weight: 1, tier: 'expensive' },
+]
+
+// Keeps only the weighted entries whose model id is listed under its provider
+// in supportedModelsByProvider.
+const getAvailableModels = (): ModelWithWeight[] =>
+  MODEL_WEIGHTS.filter(
+    // Optional chaining: a provider key missing from the map filters the entry out.
+    (entry) => supportedModelsByProvider[entry.provider]?.includes(entry.model),
+  )
+
+/**
+ * Picks one AI model at random, with probability proportional to its weight
+ * in MODEL_WEIGHTS (cheaper tiers carry higher weights).
+ *
+ * @returns The chosen provider, model id and tier. When no weighted entry is
+ * supported, the first model of the first provider is returned with tier 'unknown'.
+ */
+export const selectRandomAI = (): { provider: AiModelProvider; model: string; tier: string } => {
+  const candidates = getAvailableModels()
+
+  if (candidates.length === 0) {
+    // Nothing in the weight table is supported: take the first configured model.
+    const [firstProvider] = Object.keys(supportedModelsByProvider) as AiModelProvider[]
+    return {
+      provider: firstProvider,
+      model: supportedModelsByProvider[firstProvider][0],
+      tier: 'unknown',
+    }
+  }
+
+  const weightSum = candidates.reduce((acc, candidate) => acc + candidate.weight, 0)
+
+  // Weighted pick: subtract each weight from a roll in [0, weightSum) and stop
+  // at the first entry that brings the remainder to zero or below.
+  let remainder = Math.random() * weightSum
+  const winner = candidates.find((candidate) => {
+    remainder -= candidate.weight
+    return remainder <= 0
+  })
+
+  // The search is expected to succeed; the first candidate is a defensive fallback.
+  const { provider, model, tier } = winner ?? candidates[0]
+  return { provider, model, tier }
+}
+
+/**
+ * Lists each selectable model with its weight, tier and selection probability (percent of total weight).
+ */
+export const getAvailableModelsWithWeights = () => {
+  const available = getAvailableModels()
+  // Hoisted so the table is filtered and summed once instead of once per row.
+  const totalWeight = available.reduce((sum, m) => sum + m.weight, 0)
+  return available.map((entry) => ({
+    ...entry,
+    probability: (entry.weight / totalWeight) * 100,
+  }))
+}
diff --git a/api/services/benchmark/benchmark-config.ts b/api/services/benchmark/benchmark-config.ts
new file mode 100644
index 0000000..dbbcc72
--- /dev/null
+++ b/api/services/benchmark/benchmark-config.ts
@@ -0,0 +1,25 @@
+import { parsePositiveInt } from './concurrency'
+
+export type BenchmarkConfig = {
+  perItemTimeoutMs: number
+  maxOutputTokens: number
+  transientRetries: number
+  retryBaseBackoffMs: number
+  itemConcurrency: number
+}
+
+/** Reads the benchmark knobs from BENCHMARK_* env vars; unset or invalid values fall back to the defaults below. */
+export const getBenchmarkConfig = (): BenchmarkConfig => {
+  // 0 retries is a meaningful setting (retries disabled), so it must not fall through to the default.
+  const parseNonNegativeInt = (raw: string | undefined, fallback: number): number => {
+    const parsed = Number.parseInt(raw ?? '', 10)
+    return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback
+  }
+  return {
+    perItemTimeoutMs: parsePositiveInt(process.env.BENCHMARK_PER_ITEM_TIMEOUT_MS, 10_000),
+    maxOutputTokens: parsePositiveInt(process.env.BENCHMARK_MAX_OUTPUT_TOKENS, 192),
+    transientRetries: parseNonNegativeInt(process.env.BENCHMARK_TRANSIENT_RETRIES, 1),
+    retryBaseBackoffMs: parsePositiveInt(process.env.BENCHMARK_RETRY_BASE_BACKOFF_MS, 200),
+    itemConcurrency: parsePositiveInt(process.env.BENCHMARK_ITEM_CONCURRENCY, 1),
+  }
+}
diff --git a/api/services/benchmark/benchmark-prompt.ts b/api/services/benchmark/benchmark-prompt.ts
new file mode 100644
index 0000000..89f31f2
--- /dev/null
+++ b/api/services/benchmark/benchmark-prompt.ts
@@ -0,0 +1,146 @@
+import { z } from 'zod'
+import { generateText } from 'ai'
+import { Logger } from 'motia'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { getBenchmarkProviderOptions } from '../ai/provider-options'
+import { getBenchmarkConfig } from './benchmark-config'
+import { withRetries, withRetriesNoTimeout } from './retry'
+import { createProviderModel, shouldDisableTimeout, getApiKeyEnvVar } from './shared-utils'
+
+const LegalMovesResponseSchema = z.object({
+  moves: z.array(z.string()).describe('Array of legal moves in Standard Algebraic Notation'),
+})
+
+type BenchmarkPromptInput = {
+  prompt: string
+  provider: AiModelProvider
+  model: string
+  logger: Logger
+}
+
+type BenchmarkPromptResult = {
+  moves: string[]
+  rawResponse: string
+  error?: string
+}
+
+type ProviderOptions = Record<string, unknown>
+/** Sends `prompt` to `provider`/`model` via generateText and returns the parsed `moves`; failures while calling the model or parsing come back as `{ moves: [], rawResponse, error }`. */
+export const makeBenchmarkPrompt = async (input: BenchmarkPromptInput): Promise<BenchmarkPromptResult> => {
+  const { prompt, provider, model, logger } = input
+  // Benchmark settings, the start time used for the deadline and error timing, and a log prefix.
+  const cfg = getBenchmarkConfig()
+  const startTime = Date.now()
+  const label = `${provider}/${model}`
+
+  // Check API key
+  const apiKeyEnvVar = getApiKeyEnvVar(provider)
+  const apiKey = process.env[apiKeyEnvVar]
+  if (!apiKey) {
+    logger.error(`[${label}] Missing ${apiKeyEnvVar}`)
+    return { moves: [], rawResponse: `Missing ${apiKeyEnvVar}`, error: `Missing ${apiKeyEnvVar}` }
+  }
+  // Errors thrown below are caught at the bottom and reported in the result instead of propagating.
+  try {
+    const providerModel = createProviderModel(provider, model)
+    const disableTimeout = shouldDisableTimeout(provider, model)
+    const providerOptionsBase = getBenchmarkProviderOptions(provider, model)
+
+    // Add JSON response format for Gemini
+    const providerOptions: ProviderOptions =
+      provider === 'gemini'
+        ? {
+            ...providerOptionsBase,
+            google: {
+              ...(providerOptionsBase?.google as Record<string, unknown> | undefined),
+              responseMimeType: 'application/json',
+            },
+          }
+        : providerOptionsBase
+    // Runs one generateText call through the retry helpers; only the timed path passes an abort signal.
+    const runGenerateText = async (opts: { providerOptions: ProviderOptions }) => {
+      if (disableTimeout) {
+        return await withRetriesNoTimeout(label, cfg.transientRetries, cfg.retryBaseBackoffMs, async () => {
+          return await generateText({
+            model: providerModel,
+            prompt,
+            maxRetries: 0,
+            maxOutputTokens: cfg.maxOutputTokens,
+            providerOptions: opts.providerOptions,
+          })
+        })
+      }
+      // The deadline is measured from startTime, so the Grok fallback below shares the same per-item budget.
+      const deadlineMs = startTime + cfg.perItemTimeoutMs
+      return await withRetries(label, deadlineMs, cfg.transientRetries, cfg.retryBaseBackoffMs, async (abortSignal) => {
+        return await generateText({
+          model: providerModel,
+          prompt,
+          maxRetries: 0,
+          abortSignal,
+          maxOutputTokens: cfg.maxOutputTokens,
+          providerOptions: opts.providerOptions,
+        })
+      })
+    }
+
+    let text: string
+    try {
+      ;({ text } = await runGenerateText({ providerOptions }))
+    } catch (e) {
+      // Retry Grok with empty provider options if initial request fails
+      // This is a workaround for Grok API compatibility issues
+      if (provider === 'grok') {
+        logger.warn(`[${label}] Initial request failed, retrying with empty provider options`)
+        ;({ text } = await runGenerateText({ providerOptions: {} }))
+      } else {
+        throw e
+      }
+    }
+    // Strict JSON.parse first; otherwise fall back to the span between the first '{' and the last '}'.
+    let parsed: unknown
+    try {
+      parsed = JSON.parse(text)
+    } catch {
+      // Try to extract JSON from the response
+      const start = text.indexOf('{')
+      const end = text.lastIndexOf('}')
+      if (start !== -1 && end !== -1 && end > start) {
+        try {
+          parsed = JSON.parse(text.slice(start, end + 1))
+        } catch {
+          logger.warn(`[${label}] Could not parse extracted JSON from response`)
+          return { moves: [], rawResponse: text, error: 'Could not parse JSON response' }
+        }
+      } else {
+        logger.warn(`[${label}] No JSON found in response`)
+        return { moves: [], rawResponse: text, error: 'Could not parse JSON response' }
+      }
+    }
+    // Validate the parsed value against LegalMovesResponseSchema ({ moves: string[] }).
+    const validated = LegalMovesResponseSchema.safeParse(parsed)
+    if (!validated.success) {
+      logger.warn(`[${label}] Response did not match schema: ${validated.error.message}`)
+      return { moves: [], rawResponse: text, error: 'Response did not match schema' }
+    }
+
+    return { moves: validated.data.moves, rawResponse: text }
+  } catch (error) {
+    const elapsed = Date.now() - startTime
+    const errorWithStatus = error as { statusCode?: number }
+    const statusCode = typeof errorWithStatus?.statusCode === 'number' ? errorWithStatus.statusCode : undefined
+    const errorMsgBase = error instanceof Error ? error.message : 'Unknown error'
+    const errorMsg = statusCode != null ? `${errorMsgBase} (status ${statusCode})` : errorMsgBase
+    const errorName = error instanceof Error ? error.name : 'Error'
+
+    logger.error(`[${label}] FAILED after ${elapsed}ms`)
+    logger.error(`[${label}] Error type: ${errorName}`)
+    logger.error(`[${label}] Error message: ${errorMsg}`)
+
+    return {
+      moves: [],
+      rawResponse: errorMsg,
+      error: errorMsg,
+    }
+  }
+}
diff --git a/api/services/benchmark/concurrency.ts b/api/services/benchmark/concurrency.ts
new file mode 100644
index 0000000..0bf2436
--- /dev/null
+++ b/api/services/benchmark/concurrency.ts
@@ -0,0 +1,36 @@
+/** Parses `value` as a base-10 integer and returns it only when finite and > 0; otherwise returns `fallback`. */
+export const parsePositiveInt = (value: string | undefined, fallback: number) => {
+  const parsed = Number.parseInt(value ?? '', 10)
+  return !Number.isFinite(parsed) || parsed <= 0 ? fallback : parsed
+}
+/**
+ * Maps `items` through async `mapper` with at most `concurrency` calls in flight;
+ * results keep input order. Workers claim the next index synchronously (no await
+ * between read and increment), so no item is processed twice.
+ * `concurrency` is clamped to [1, items.length] (NaN counts as 1); `onComplete`
+ * runs after each item resolves. Rejects with the first mapper/onComplete error.
+ */
+export const mapWithConcurrency = async <T, R>(
+  items: T[],
+  concurrency: number,
+  mapper: (item: T, index: number) => Promise<R>,
+  onComplete?: (index: number, result: R) => void,
+): Promise<R[]> => {
+  const results = new Array<R>(items.length)
+  // A NaN limit would make Array.from create zero workers and silently skip every item.
+  const requested = Number.isNaN(concurrency) ? 1 : concurrency
+  const limit = Math.max(1, Math.min(requested, items.length))
+
+  let nextIndex = 0 // shared cursor over the input; no per-item queue shifting
+  const worker = async () => {
+    while (nextIndex < results.length) {
+      const index = nextIndex++
+      const result = await mapper(items[index], index)
+      results[index] = result
+      onComplete?.(index, result)
+    }
+  }
+
+  await Promise.all(Array.from({ length: limit }, worker))
+  return results
+}
diff --git a/api/services/benchmark/fetch-lichess-puzzles.ts b/api/services/benchmark/fetch-lichess-puzzles.ts
new file mode 100644
index 0000000..5c74ace
--- /dev/null
+++ b/api/services/benchmark/fetch-lichess-puzzles.ts
@@ -0,0 +1,217 @@
+import { Chess } from 'chess.js'
+import { Logger } from 'motia'
+import { LichessPuzzle, PuzzleTheme } from '@chessarena/types/puzzle-benchmark'
+
+const LICHESS_BASE_URL = 'https://lichess.org'
+const MAX_BATCH_SIZE = 50
+const REQUEST_TIMEOUT_MS = 30000
+
+type LichessBatchPuzzle = {
+  game: {
+    id: string
+    pgn: string
+  }
+  puzzle: {
+    id: string
+    rating: number
+    themes: string[]
+    solution: string[]
+    initialPly: number
+  }
+}
+
+type LichessBatchResponse = {
+  puzzles: LichessBatchPuzzle[]
+}
+/** Resolves after a `setTimeout` of `ms` milliseconds. */
+const sleep = (ms: number) => new Promise((wake) => setTimeout(wake, ms))
+
+/** Parses a Retry-After header (delta-seconds or date) into a non-negative delay in ms; null if absent or unparseable. */
+const parseRetryAfterMs = (value: string | null): number | null => {
+  if (!value) return null
+  const seconds = Number.parseInt(value, 10)
+  if (Number.isFinite(seconds)) return Math.max(0, seconds * 1000) // clamp: never a negative wait
+  const dateMs = Date.parse(value)
+  return Number.isFinite(dateMs) ? Math.max(0, dateMs - Date.now()) : null
+}
+
+/**
+ * GETs `url` and returns the parsed JSON body, retrying HTTP 408, 429 and 5xx
+ * responses up to `maxRetries` times (at most `maxRetries + 1` requests).
+ *
+ * The wait before a retry is the server's `retry-after` value when present, otherwise
+ * an exponential backoff starting at 1s and capped at 30s, plus under 250ms of jitter.
+ * `LICHESS_TOKEN`, when set, is sent as a Bearer token.
+ *
+ * @throws Error `Lichess API error (<status>)` for a non-retryable status or once retries
+ * are exhausted. Errors thrown by `fetch` itself are not caught here, so not retried.
+ */
+const fetchJsonWithRetry = async <T>(url: string, logger: Logger, label: string, maxRetries = 6): Promise<T> => {
+  const bearer = process.env.LICHESS_TOKEN
+  const headers: Record<string, string> = bearer
+    ? { Accept: 'application/json', Authorization: `Bearer ${bearer}` }
+    : { Accept: 'application/json' }
+  const isRetryableStatus = (status: number) =>
+    status === 429 || status === 408 || (status >= 500 && status <= 599)
+
+  let backoffMs = 1000
+  for (let attempt = 1; ; attempt++) {
+    const response = await fetch(url, { headers, signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS) })
+    if (response.ok) return (await response.json()) as T
+
+    const { status } = response
+    const canRetry = attempt <= maxRetries && isRetryableStatus(status)
+    if (!canRetry) {
+      logger.error('Lichess API request failed', { label, url, status })
+      throw new Error(`Lichess API error (${status})`)
+    }
+
+    const waitMs = parseRetryAfterMs(response.headers.get('retry-after')) ?? backoffMs
+    logger.warn('Lichess API rate limited / transient error, retrying', {
+      label,
+      url,
+      status,
+      attempt,
+      waitMs,
+    })
+    await sleep(waitMs + Math.floor(Math.random() * 250))
+    backoffMs = Math.min(backoffMs * 2, 30000)
+  }
+}
+
+/**
+ * Converts a UCI move ("e2e4", or "e7e8q" with a promotion piece) to SAN for the
+ * position in `chess`. The move is played and immediately undone, so `chess` is left
+ * as it was. Returns null when the move is rejected or `chess.move` throws.
+ */
+const uciToSan = (chess: Chess, uci: string): string | null => {
+  try {
+    const played = chess.move({
+      from: uci.slice(0, 2),
+      to: uci.slice(2, 4),
+      promotion: uci.length > 4 ? uci[4] : undefined,
+    })
+    if (!played) return null
+    chess.undo()
+    return played.san
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Builds a LichessPuzzle from one Lichess batch entry by replaying the game's
+ * moves to reach the puzzle position.
+ *
+ * @param data - Batch entry holding the game PGN and the puzzle metadata.
+ * @param logger - Receives a warning or error explaining why an entry was dropped.
+ * @returns The puzzle with its FEN, sorted legal moves and first solution move in SAN,
+ * or null when the setup move or solution cannot be applied or an error is thrown.
+ */
+const parsePuzzle = (data: LichessBatchPuzzle, logger: Logger): LichessPuzzle | null => {
+  try {
+    const { game, puzzle } = data
+    const board = new Chess()
+    // Tokens containing '.' and empty tokens are discarded before replay.
+    const sanMoves = game.pgn.split(' ').filter((token) => token.length > 0 && !token.includes('.'))
+
+    // Replay the first `initialPly` moves; a move that throws is skipped.
+    const replayCount = Math.min(puzzle.initialPly, sanMoves.length)
+    for (let ply = 0; ply < replayCount; ply++) {
+      try {
+        board.move(sanMoves[ply])
+      } catch {
+        // Deliberately ignored: replay continues with the next move.
+      }
+    }
+
+    // One more move (index `initialPly`): the opponent's last move before the puzzle.
+    if (sanMoves.length > puzzle.initialPly) {
+      try {
+        board.move(sanMoves[puzzle.initialPly])
+      } catch {
+        logger.warn('Could not play setup move', { puzzleId: puzzle.id })
+        return null
+      }
+    }
+
+    const fen = board.fen()
+    const legalMoves = board.moves().sort()
+
+    // Only the first solution move is converted to SAN.
+    const solutionSan = uciToSan(board, puzzle.solution[0])
+    if (!solutionSan) {
+      logger.warn('Could not convert solution to SAN', { puzzleId: puzzle.id })
+      return null
+    }
+
+    const { id, rating, themes, solution, initialPly } = puzzle
+    return { id, rating, themes, solution, initialPly, pgn: game.pgn, fen, legalMoves, solutionSan }
+  } catch (error) {
+    logger.error('Failed to parse puzzle', { error, puzzleId: data.puzzle.id })
+    return null
+  }
+}
+
+/**
+ * Fetches up to `count` unique, parseable puzzles for `theme` from the Lichess batch API.
+ *
+ * Each round requests `/api/puzzle/batch/<theme>` and falls back to `/api/puzzle/batch/mix`
+ * when that fails; if both fail, fetching stops and what was collected is returned.
+ * Duplicates, puzzles tagged with themes that exclude `theme`, and entries
+ * `parsePuzzle` rejects are dropped. `count` values below 1 are treated as 1.
+ * Rounds are capped (see `maxRequests`), so fewer than `count` puzzles may be returned.
+ */
+export const fetchPuzzles = async (theme: PuzzleTheme, count: number, logger: Logger): Promise<LichessPuzzle[]> => {
+  const target = Math.max(1, count)
+  // Batch size: at least 15, at most MAX_BATCH_SIZE.
+  const nb = Math.min(MAX_BATCH_SIZE, Math.max(15, count))
+  // Request budget: six times the ideal number of batches, never fewer than 3.
+  const maxRequests = Math.max(3, Math.ceil((target / nb) * 6))
+
+  logger.info('Fetching puzzles from Lichess', {
+    theme,
+    count: target,
+    nb,
+    maxRequests,
+    authenticated: Boolean(process.env.LICHESS_TOKEN),
+  })
+
+  const themedUrl = `${LICHESS_BASE_URL}/api/puzzle/batch/${theme}?nb=${nb}`
+  const mixUrl = `${LICHESS_BASE_URL}/api/puzzle/batch/mix?nb=${nb}`
+  const seenIds = new Set<string>()
+  const results: LichessPuzzle[] = []
+
+  for (let req = 1; req <= maxRequests && results.length < target; req++) {
+    let data: LichessBatchResponse
+    try {
+      data = await fetchJsonWithRetry<LichessBatchResponse>(themedUrl, logger, 'puzzle-batch-themed')
+    } catch (error) {
+      logger.warn('Themed puzzle batch failed, falling back to mix', { theme, error })
+      try {
+        data = await fetchJsonWithRetry<LichessBatchResponse>(mixUrl, logger, 'puzzle-batch-mix')
+      } catch (error2) {
+        logger.error('Failed to fetch puzzles batch', { error: error2 })
+        break
+      }
+    }
+
+    for (const item of data.puzzles) {
+      const { id, themes } = item.puzzle
+      if (seenIds.has(id)) continue
+      seenIds.add(id)
+
+      // Entries without theme tags are kept; tagged ones must include the requested theme.
+      if (themes && themes.length > 0 && !themes.includes(theme)) continue
+
+      const parsed = parsePuzzle(item, logger)
+      if (!parsed) continue
+      results.push(parsed)
+      if (results.length >= target) break
+    }
+
+    logger.info('Fetched puzzles batch', { theme, req, got: results.length, target })
+  }
+
+  return results.slice(0, target)
+}
diff --git a/api/services/benchmark/retry.ts b/api/services/benchmark/retry.ts
new file mode 100644
index 0000000..cf3bc7b
--- /dev/null
+++ b/api/services/benchmark/retry.ts
@@ -0,0 +1,95 @@
+/** Resolve after `ms` milliseconds; used to pause between retry attempts. */
+const sleep = async (ms: number) => {
+  await new Promise((wake) => {
+    setTimeout(wake, ms)
+  })
+}
+
+/**
+ * Convert a `Retry-After` header value into a wait time in milliseconds.
+ *
+ * The header has two legal forms: a non-negative integer number of seconds,
+ * or an HTTP date. Anything else yields `undefined` so callers fall back to
+ * their own backoff.
+ *
+ * @param value Raw header value (anything other than a string is ignored).
+ * @returns Milliseconds to wait (never negative), or `undefined` if unparseable.
+ */
+const parseRetryAfterMs = (value: unknown): number | undefined => {
+  if (typeof value !== 'string') return undefined
+  const trimmed = value.trim()
+
+  // Seconds form: require digits only. `parseInt` would accept a leading
+  // numeric prefix and misread dates like "2015-10-21T07:28:00Z" as 2015 seconds.
+  if (/^\d+$/.test(trimmed)) {
+    const seconds = Number(trimmed)
+    if (Number.isFinite(seconds)) return seconds * 1000
+  }
+
+  // Date form: wait until that instant; a date in the past means "retry now".
+  const dateMs = Date.parse(trimmed)
+  if (!Number.isNaN(dateMs)) return Math.max(0, dateMs - Date.now())
+
+  return undefined
+}
+
+/**
+ * Pull a numeric HTTP status off an error-like value, checking the error
+ * itself first and then its `cause`.
+ */
+const getStatusCode = (e: unknown): number | undefined => {
+  const source = e as any
+  const direct = source?.statusCode
+  if (typeof direct === 'number') return direct
+  const nested = source?.cause?.statusCode
+  return typeof nested === 'number' ? nested : undefined
+}
+
+/**
+ * Return the `responseHeaders` object attached to an error (or to its
+ * `cause`), or `undefined` when none is present or it is not an object.
+ */
+const getResponseHeaders = (e: unknown): Record<string, string> | undefined => {
+  const source = e as any
+  const found = source?.responseHeaders ?? source?.cause?.responseHeaders
+  const usable = Boolean(found) && typeof found === 'object'
+  return usable ? (found as Record<string, string>) : undefined
+}
+
+/**
+ * Decide whether an error is worth retrying: either its status code is one of
+ * the retryable HTTP statuses, or its message contains a known transient marker.
+ */
+export const isTransientError = (e: unknown): boolean => {
+  const retryableStatuses = [408, 425, 429, 500, 502, 503, 504, 529]
+  const status = getStatusCode(e)
+  if (status && retryableStatuses.includes(status)) return true
+
+  const message = e instanceof Error ? e.message : ''
+  return ['Headers Timeout', 'Cannot connect to API'].some((marker) => message.includes(marker))
+}
+
+/**
+ * Run `fn` until it succeeds, retrying transient failures with exponential
+ * backoff (or the server's `Retry-After`) while an absolute deadline allows.
+ *
+ * @param label Identifier for the call site (currently unused).
+ * @param deadlineMs Absolute epoch-millisecond deadline shared by all attempts.
+ * @param transientRetries Maximum number of retries after the first attempt.
+ * @param retryBaseBackoffMs Backoff for the first retry; doubles each retry, capped at 30s.
+ * @param fn Operation to run; receives an abort signal that fires at the deadline.
+ * @throws The last error when it is not transient, retries are exhausted, or
+ *   the deadline has passed; a timeout `Error` if no time is left before an attempt.
+ */
+export const withRetries = async <T>(
+  label: string,
+  deadlineMs: number,
+  transientRetries: number,
+  retryBaseBackoffMs: number,
+  fn: (abortSignal: AbortSignal) => Promise<T>,
+): Promise<T> => {
+  for (let failures = 1; ; failures++) {
+    const budgetMs = deadlineMs - Date.now()
+    if (budgetMs <= 0) throw new Error('Timed out before request could start')
+
+    try {
+      return await fn(AbortSignal.timeout(budgetMs))
+    } catch (e) {
+      const retriesExhausted = failures > transientRetries
+      if (retriesExhausted || !isTransientError(e)) throw e
+
+      const headers = getResponseHeaders(e)
+      const serverDelayMs = parseRetryAfterMs(headers?.['retry-after'] ?? headers?.['Retry-After'])
+      const exponentialMs = Math.min(30_000, retryBaseBackoffMs * 2 ** (failures - 1))
+      const jitterMs = Math.floor(Math.random() * 250)
+      const plannedWaitMs = (serverDelayMs ?? exponentialMs) + jitterMs
+
+      // Never sleep past the deadline; keep ~100ms in hand for the next attempt.
+      const timeLeftMs = deadlineMs - Date.now()
+      if (timeLeftMs <= 0) throw e
+      await sleep(Math.min(plannedWaitMs, Math.max(0, timeLeftMs - 100)))
+    }
+  }
+}
+
+/**
+ * Deadline-free variant of the retry loop: run `fn` until it succeeds,
+ * retrying transient failures with exponential backoff or `Retry-After`.
+ *
+ * @param label Identifier for the call site (currently unused).
+ * @param transientRetries Maximum number of retries after the first attempt.
+ * @param retryBaseBackoffMs Backoff for the first retry; doubles each retry, capped at 30s.
+ * @param fn Operation to run.
+ * @throws The last error when it is not transient or retries are exhausted.
+ */
+export const withRetriesNoTimeout = async <T>(
+  label: string,
+  transientRetries: number,
+  retryBaseBackoffMs: number,
+  fn: () => Promise<T>,
+): Promise<T> => {
+  for (let failures = 1; ; failures++) {
+    try {
+      return await fn()
+    } catch (e) {
+      const retriesExhausted = failures > transientRetries
+      if (retriesExhausted || !isTransientError(e)) throw e
+
+      const headers = getResponseHeaders(e)
+      const serverDelayMs = parseRetryAfterMs(headers?.['retry-after'] ?? headers?.['Retry-After'])
+      const exponentialMs = Math.min(30_000, retryBaseBackoffMs * 2 ** (failures - 1))
+      const jitterMs = Math.floor(Math.random() * 250)
+
+      await sleep((serverDelayMs ?? exponentialMs) + jitterMs)
+    }
+  }
+}
diff --git a/api/services/benchmark/run-legal-move-benchmark.ts b/api/services/benchmark/run-legal-move-benchmark.ts
new file mode 100644
index 0000000..a6320b4
--- /dev/null
+++ b/api/services/benchmark/run-legal-move-benchmark.ts
@@ -0,0 +1,257 @@
+import fs from 'fs'
+import path from 'path'
+import mustache from 'mustache'
+import { Chess } from 'chess.js'
+import { Logger } from 'motia'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { TestPosition, ModelBenchmarkResult, LegalMoveBenchmarkRun } from '@chessarena/types/legal-move-benchmark'
+import { makeBenchmarkPrompt } from './benchmark-prompt'
+import { getBenchmarkConfig } from './benchmark-config'
+import { mapWithConcurrency, parsePositiveInt } from './concurrency'
+
+// Mustache template for the legal-move prompt; read synchronously once when this module loads.
+const promptTemplate = fs.readFileSync(path.join(__dirname, '../../steps/chess/legal-move-benchmark.mustache'), 'utf8')
+
+/** Bounds that control random test-position generation. */
+type GeneratePositionsOptions = {
+  /** Number of unique positions to produce. */
+  count: number
+  /** Positions with fewer legal moves than this are rejected. */
+  minLegalMoves: number
+  /** Positions with more legal moves than this are rejected. */
+  maxLegalMoves: number
+  /** Lower bound on the number of random half-moves played from the start position. */
+  minMoveNumber: number
+  /** Upper bound on the number of random half-moves played from the start position. */
+  maxMoveNumber: number
+}
+
+// Defaults applied to any option the caller does not supply.
+const DEFAULT_OPTIONS: GeneratePositionsOptions = {
+  count: 20,
+  minLegalMoves: 5,
+  maxLegalMoves: 25,
+  minMoveNumber: 8,
+  maxMoveNumber: 60,
+}
+
+/**
+ * Build one candidate test position by playing a random number of random
+ * moves from the starting position.
+ *
+ * Returns `null` when the random game ends early, the final position is game
+ * over, or its legal-move count falls outside the configured bounds.
+ */
+const generateRandomPosition = (options: GeneratePositionsOptions): TestPosition | null => {
+  const board = new Chess()
+  const { minMoveNumber, maxMoveNumber, minLegalMoves, maxLegalMoves } = options
+
+  const pliesToPlay = Math.floor(Math.random() * (maxMoveNumber - minMoveNumber + 1)) + minMoveNumber
+
+  let played = 0
+  while (played < pliesToPlay) {
+    const choices = board.moves()
+    if (choices.length === 0) return null
+    board.move(choices[Math.floor(Math.random() * choices.length)])
+    played++
+  }
+
+  if (board.isGameOver()) return null
+
+  const legalMoves = board.moves()
+  const tooFew = legalMoves.length < minLegalMoves
+  const tooMany = legalMoves.length > maxLegalMoves
+  if (tooFew || tooMany) return null
+
+  const sideToMove = board.turn() === 'w' ? 'white' : 'black'
+
+  return {
+    id: crypto.randomUUID(),
+    fen: board.fen(),
+    pgn: board.pgn(),
+    turn: sideToMove,
+    legalMoves: legalMoves.sort(),
+    legalMoveCount: legalMoves.length,
+    moveNumber: Math.floor(board.history().length / 2) + 1,
+  }
+}
+
+/**
+ * Collect up to `count` test positions with distinct FENs.
+ *
+ * Generation is random, so the search gives up after `count * 100` attempts
+ * and may return fewer positions than requested.
+ */
+export const generateTestPositions = (options: Partial<GeneratePositionsOptions> = {}): TestPosition[] => {
+  const settings = { ...DEFAULT_OPTIONS, ...options }
+  const uniqueByFen = new Map<string, TestPosition>()
+  const attemptBudget = settings.count * 100
+
+  for (let tries = 0; tries < attemptBudget && uniqueByFen.size < settings.count; tries++) {
+    const candidate = generateRandomPosition(settings)
+    if (!candidate || uniqueByFen.has(candidate.fen)) continue
+    uniqueByFen.set(candidate.fen, candidate)
+  }
+
+  return [...uniqueByFen.values()]
+}
+
+/**
+ * Score one model answer against a position's legal moves.
+ *
+ * Uses F1-style scoring, the harmonic mean of:
+ * - Recall: what % of the legal moves the model listed
+ * - Precision: what % of the model's distinct answers were legal
+ *
+ * Repeated moves in the model's answer are counted once. Otherwise repeating
+ * a single legal move would push recall above 100% and inflate the score.
+ *
+ * @param legalMoves All legal moves in the position.
+ * @param modelMoves Moves listed by the model (may contain duplicates).
+ * @returns Distinct correct/illegal/missed moves plus `accuracy` (recall, 0-100),
+ *   `penalty` (100 - precision) and `finalScore` (F1, 0-100).
+ */
+const calculateScore = (
+  legalMoves: string[],
+  modelMoves: string[],
+): {
+  correct: string[]
+  illegal: string[]
+  missed: string[]
+  accuracy: number
+  penalty: number
+  finalScore: number
+} => {
+  const legalSet = new Set(legalMoves)
+  const modelSet = new Set(modelMoves)
+  // Distinct answers, in first-seen order.
+  const answered = [...modelSet]
+
+  const correct = answered.filter((m) => legalSet.has(m))
+  const illegal = answered.filter((m) => !legalSet.has(m))
+  const missed = [...legalSet].filter((m) => !modelSet.has(m))
+
+  // Recall: how many legal moves did you find
+  const recall = legalSet.size > 0 ? (correct.length / legalSet.size) * 100 : 0
+
+  // Precision: how many of your answers were correct
+  const precision = answered.length > 0 ? (correct.length / answered.length) * 100 : 0
+
+  // F1 score: harmonic mean of precision and recall
+  const finalScore = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0
+
+  // Keep accuracy as recall for backwards compatibility, penalty as inverse of precision
+  const accuracy = recall
+  const penalty = 100 - precision
+
+  return { correct, illegal, missed, accuracy, penalty, finalScore }
+}
+
+/**
+ * Ask the model for the legal moves of one position and score its answer.
+ *
+ * Failures never throw: an error from the prompt call is logged and recorded
+ * on the result, and the position is scored against an empty answer.
+ */
+const benchmarkPosition = async (
+  position: TestPosition,
+  provider: AiModelProvider,
+  model: string,
+  logger: Logger,
+): Promise<ModelBenchmarkResult> => {
+  const view = {
+    pgn: position.pgn,
+    fen: position.fen,
+    turn: position.turn.toUpperCase(),
+  }
+  // Identity escape: chess notation must reach the model unmodified.
+  const prompt = mustache.render(promptTemplate, view, {}, { escape: (v: string) => v })
+
+  const startedAt = Date.now()
+  const outcome: { rawResponse: string; moves: string[]; error?: string } = { rawResponse: '', moves: [] }
+
+  try {
+    const response = await makeBenchmarkPrompt({ prompt, provider, model, logger })
+    outcome.rawResponse = response.rawResponse
+    outcome.moves = response.moves
+    if (response.error) outcome.error = response.error
+  } catch (e) {
+    outcome.error = e instanceof Error ? e.message : 'Unknown error'
+    logger.error('Benchmark position failed', { error: outcome.error, positionId: position.id })
+  }
+
+  const responseTime = Date.now() - startedAt
+  const score = calculateScore(position.legalMoves, outcome.moves)
+
+  return {
+    positionId: position.id,
+    modelMoves: outcome.moves,
+    correctMoves: score.correct,
+    illegalMoves: score.illegal,
+    missedMoves: score.missed,
+    accuracy: score.accuracy,
+    penalty: score.penalty,
+    finalScore: score.finalScore,
+    responseTime,
+    rawResponse: outcome.rawResponse,
+    error: outcome.error,
+  }
+}
+
+/**
+ * Benchmark one model over a fixed list of positions and aggregate the scores.
+ *
+ * Positions are evaluated with the concurrency from the benchmark config.
+ * Aggregates cover only results without an error; the run is marked `failed`
+ * when no position produced an error-free result.
+ *
+ * @param onProgress Optional callback invoked after each finished position.
+ */
+export const runLegalMoveBenchmark = async (
+  positions: TestPosition[],
+  provider: AiModelProvider,
+  model: string,
+  logger: Logger,
+  onProgress?: (completed: number, total: number) => void,
+): Promise<LegalMoveBenchmarkRun> => {
+  const runId = crypto.randomUUID()
+  const total = positions.length
+
+  logger.info('Starting legal move benchmark', {
+    runId,
+    provider,
+    model,
+    positionCount: total,
+  })
+
+  const run: LegalMoveBenchmarkRun = {
+    id: runId,
+    createdAt: Date.now(),
+    status: 'running',
+    provider,
+    model,
+    positionCount: total,
+    positions,
+    results: [],
+  }
+
+  const { itemConcurrency } = getBenchmarkConfig()
+  let completed = 0
+
+  const reportProgress = () => {
+    completed++
+    onProgress?.(completed, positions.length)
+    // Log every fifth completion and always the last one.
+    if (completed === positions.length || completed % 5 === 0) {
+      logger.info('Legal move benchmark progress', { runId, provider, model, completed, total: positions.length })
+    }
+  }
+
+  run.results = await mapWithConcurrency(
+    positions,
+    itemConcurrency,
+    async (position) => benchmarkPosition(position, provider, model, logger),
+    reportProgress,
+  )
+
+  // Aggregate over error-free results only, in a single pass.
+  const scored = run.results.filter((r) => !r.error)
+  if (scored.length > 0) {
+    const sums = { accuracy: 0, penalty: 0, finalScore: 0, correct: 0, illegal: 0, missed: 0 }
+    for (const r of scored) {
+      sums.accuracy += r.accuracy
+      sums.penalty += r.penalty
+      sums.finalScore += r.finalScore
+      sums.correct += r.correctMoves.length
+      sums.illegal += r.illegalMoves.length
+      sums.missed += r.missedMoves.length
+    }
+    run.averageAccuracy = sums.accuracy / scored.length
+    run.averagePenalty = sums.penalty / scored.length
+    run.averageFinalScore = sums.finalScore / scored.length
+    run.totalCorrectMoves = sums.correct
+    run.totalIllegalMoves = sums.illegal
+    run.totalMissedMoves = sums.missed
+  }
+
+  run.completedAt = Date.now()
+  run.status = scored.length > 0 ? 'completed' : 'failed'
+
+  logger.info('Legal move benchmark completed', {
+    runId,
+    averageScore: run.averageFinalScore,
+    totalCorrect: run.totalCorrectMoves,
+    totalIllegal: run.totalIllegalMoves,
+  })
+
+  return run
+}
diff --git a/api/services/benchmark/run-puzzle-benchmark.ts b/api/services/benchmark/run-puzzle-benchmark.ts
new file mode 100644
index 0000000..456e77f
--- /dev/null
+++ b/api/services/benchmark/run-puzzle-benchmark.ts
@@ -0,0 +1,293 @@
+import fs from 'fs'
+import path from 'path'
+import mustache from 'mustache'
+import { Chess } from 'chess.js'
+import { z } from 'zod'
+import { generateText } from 'ai'
+import { Logger } from 'motia'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { LichessPuzzle, PuzzleResult, PuzzleBenchmarkRun, PuzzleTheme } from '@chessarena/types/puzzle-benchmark'
+import { getBenchmarkProviderOptions } from '../ai/provider-options'
+import { getBenchmarkConfig } from './benchmark-config'
+import { withRetries, withRetriesNoTimeout } from './retry'
+import { mapWithConcurrency, parsePositiveInt } from './concurrency'
+import { createProviderModel, shouldDisableTimeout } from './shared-utils'
+
+// Mustache template for the puzzle prompt; read synchronously once when this module loads.
+const promptTemplate = fs.readFileSync(path.join(__dirname, '../../steps/chess/puzzle-benchmark.mustache'), 'utf8')
+
+// Shape expected from a model's JSON answer: an object with a string `move` field.
+const PuzzleMoveResponseSchema = z.object({
+  move: z.string().describe('The best move in Standard Algebraic Notation'),
+})
+
+// Loosely typed bag of provider options, keyed by provider namespace (e.g. `google`, `openai`).
+type ProviderOptions = Record<string, unknown>
+
+/**
+ * Recover the model's chosen move from free-form response text.
+ *
+ * Strategies are tried from strictest to loosest:
+ * 1. the whole text (or the contents of its first code fence) as JSON,
+ * 2. the outermost `{...}` substring as JSON,
+ * 3. a `"move": "..."` pair, then a looser `move: ...` / `move = ...` form,
+ * 4. the legal move that appears earliest in the text.
+ *
+ * @param text Raw model output.
+ * @param legalMoves Legal moves for the position; used only by the last strategy.
+ * @param logger Optional logger for debug traces of failed JSON parses.
+ * @returns The extracted move (trimmed), or `null` if nothing could be recovered.
+ */
+const extractMoveFromText = (text: string, legalMoves: string[], logger?: Logger): { move: string } | null => {
+  const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i)
+  const candidate = fenceMatch?.[1] ?? text
+
+  // Try JSON first
+  try {
+    const parsed = JSON.parse(candidate)
+    const validated = PuzzleMoveResponseSchema.safeParse(parsed)
+    if (validated.success) return { move: validated.data.move.trim() }
+  } catch (e) {
+    // JSON parsing failed, will try other extraction methods
+    logger?.debug('JSON parsing failed, trying alternative extraction', {
+      error: e instanceof Error ? e.message : 'unknown',
+    })
+  }
+
+  // Try extracting JSON object substring
+  const start = candidate.indexOf('{')
+  const end = candidate.lastIndexOf('}')
+  if (start !== -1 && end !== -1 && end > start) {
+    const slice = candidate.slice(start, end + 1)
+    try {
+      const parsed = JSON.parse(slice)
+      const validated = PuzzleMoveResponseSchema.safeParse(parsed)
+      if (validated.success) return { move: validated.data.move.trim() }
+    } catch (e) {
+      // Substring JSON parsing also failed
+      logger?.debug('Substring JSON parsing failed', { error: e instanceof Error ? e.message : 'unknown' })
+    }
+  }
+
+  // Regex fallbacks
+  const quoted = candidate.match(/"move"\s*:\s*"([^"]+)"/i)
+  if (quoted?.[1]) return { move: quoted[1].trim() }
+
+  const loose = candidate.match(/\bmove\b\s*[:=]\s*("?)([^"\n\r]+)\1/i)
+  if (loose?.[2]) return { move: loose[2].trim() }
+
+  // Last resort: the legal move mentioned earliest in the text. When two legal
+  // moves start at the same position one is a prefix of the other ("O-O" vs
+  // "O-O-O"), so prefer the longer match; the shorter one is only a fragment.
+  let best: { move: string; idx: number } | undefined
+  for (const m of legalMoves) {
+    const idx = candidate.indexOf(m)
+    if (idx === -1) continue
+    const appearsEarlier = !best || idx < best.idx
+    const longerAtSameSpot = best !== undefined && idx === best.idx && m.length > best.move.length
+    if (appearsEarlier || longerAtSameSpot) best = { move: m, idx }
+  }
+  if (best) return { move: best.move.trim() }
+
+  return null
+}
+
+/**
+ * Output-token budget for a puzzle request. OpenAI `gpt-5*` and Gemini
+ * `gemini-3*` models get a floor of 384 tokens; all others use `base` as is.
+ */
+const getPuzzleMaxOutputTokens = (provider: AiModelProvider, model: string, base: number): number => {
+  const needsFloor =
+    (provider === 'openai' && model.startsWith('gpt-5')) || (provider === 'gemini' && model.startsWith('gemini-3'))
+  return needsFloor ? Math.max(base, 384) : base
+}
+
+/** Human-readable label for a puzzle theme; themes without a label are returned unchanged. */
+const getThemeDescription = (theme: PuzzleTheme): string => {
+  if (theme === 'mateIn1') return 'Mate in 1'
+  if (theme === 'oneMove') return 'One Move'
+  return theme
+}
+
+/**
+ * Layer puzzle-specific settings on top of the base provider options.
+ *
+ * - Gemini: plain-text responses and a thinking budget (128 for models whose
+ *   name contains "pro", otherwise 0).
+ * - OpenAI `gpt-5`, `gpt-5.1`, `gpt-5.2`: reasoning effort `minimal` for
+ *   `gpt-5`, `none` for the other two.
+ * - Anything else: the base options are returned untouched.
+ */
+const buildProviderOptions = (
+  provider: AiModelProvider,
+  model: string,
+  providerOptionsBase: ProviderOptions,
+): ProviderOptions => {
+  if (provider === 'gemini') {
+    const inherited = (providerOptionsBase?.google ?? {}) as Record<string, unknown>
+    const thinkingBudget = model.includes('pro') ? 128 : 0
+    const google = { ...inherited, responseMimeType: 'text/plain', thinkingConfig: { thinkingBudget } }
+    return { ...providerOptionsBase, google }
+  }
+
+  const tunedOpenAiModels = ['gpt-5', 'gpt-5.1', 'gpt-5.2']
+  if (provider !== 'openai' || !tunedOpenAiModels.includes(model)) return providerOptionsBase
+
+  const inherited = (providerOptionsBase?.openai ?? {}) as Record<string, unknown>
+  const reasoningEffort = model === 'gpt-5' ? 'minimal' : 'none'
+  return { ...providerOptionsBase, openai: { ...inherited, reasoningEffort } }
+}
+
+/**
+ * Ask the model to solve one puzzle and compare its move with the solution.
+ *
+ * The request goes through the deadline-bound retry helper unless timeouts
+ * are disabled for this provider/model. Failures never throw: the error is
+ * logged and recorded on the result, which is then marked incorrect.
+ */
+const benchmarkPuzzle = async (
+  puzzle: LichessPuzzle,
+  theme: PuzzleTheme,
+  provider: AiModelProvider,
+  model: string,
+  logger: Logger,
+): Promise<PuzzleResult> => {
+  const sideToMove = new Chess(puzzle.fen).turn() === 'w' ? 'WHITE' : 'BLACK'
+
+  const view = {
+    pgn: puzzle.pgn,
+    fen: puzzle.fen,
+    turn: sideToMove,
+    legalMoves: puzzle.legalMoves,
+    theme: getThemeDescription(theme),
+  }
+  // Identity escape: chess notation must reach the model unmodified.
+  const prompt = mustache.render(promptTemplate, view, {}, { escape: (v: string) => v })
+
+  const startTime = Date.now()
+  let rawResponse = ''
+  let modelMove: string | undefined
+  let error: string | undefined
+
+  try {
+    const cfg = getBenchmarkConfig()
+    const providerModel = createProviderModel(provider, model)
+    const disableTimeout = shouldDisableTimeout(provider, model)
+    const providerOptionsBase = getBenchmarkProviderOptions(provider, model)
+    const providerOptions = buildProviderOptions(provider, model, providerOptionsBase)
+    const maxOutputTokens = getPuzzleMaxOutputTokens(provider, model, cfg.maxOutputTokens)
+    const label = `${provider}/${model}`
+
+    // One request builder for both paths; the signal is attached only when
+    // the deadline-bound path supplies one. SDK retries are off because the
+    // retry helpers own that policy.
+    const ask = async (abortSignal?: AbortSignal) =>
+      await generateText({
+        model: providerModel,
+        prompt,
+        maxRetries: 0,
+        ...(abortSignal ? { abortSignal } : {}),
+        maxOutputTokens,
+        providerOptions,
+      })
+
+    let result
+    if (disableTimeout) {
+      result = await withRetriesNoTimeout(label, cfg.transientRetries, cfg.retryBaseBackoffMs, async () => ask())
+    } else {
+      result = await withRetries(
+        label,
+        startTime + cfg.perItemTimeoutMs,
+        cfg.transientRetries,
+        cfg.retryBaseBackoffMs,
+        async (abortSignal) => ask(abortSignal),
+      )
+    }
+
+    // Prefer the text; if it is blank, fall back to a serialized `response` when present.
+    const text = result.text ?? ''
+    let candidate = text
+    if (!text.trim()) {
+      const holder = result as { response?: unknown }
+      candidate = holder?.response ? JSON.stringify(holder.response) : ''
+    }
+
+    rawResponse = candidate.slice(0, 20_000)
+
+    if (candidate.trim()) {
+      const extracted = extractMoveFromText(candidate, puzzle.legalMoves, logger)
+      if (extracted) modelMove = extracted.move
+      else error = 'Could not parse JSON response'
+    } else {
+      error = 'Empty response'
+    }
+  } catch (e) {
+    error = e instanceof Error ? e.message : 'Unknown error'
+    logger.error('Puzzle benchmark failed', { error, puzzleId: puzzle.id })
+  }
+
+  const responseTime = Date.now() - startTime
+
+  return {
+    puzzleId: puzzle.id,
+    modelMove,
+    correctMove: puzzle.solutionSan,
+    isCorrect: modelMove === puzzle.solutionSan,
+    responseTime,
+    rawResponse,
+    error,
+  }
+}
+
+/**
+ * Benchmark one model over a puzzle set and report its accuracy.
+ *
+ * Concurrency comes from `BENCHMARK_PUZZLE_CONCURRENCY` (default 1). Accuracy
+ * is the percentage of all puzzles answered correctly; puzzles that errored
+ * count as incorrect.
+ *
+ * @param onProgress Optional callback invoked after each finished puzzle.
+ */
+export const runPuzzleBenchmark = async (
+  puzzles: LichessPuzzle[],
+  puzzleSetId: string,
+  theme: PuzzleTheme,
+  provider: AiModelProvider,
+  model: string,
+  logger: Logger,
+  onProgress?: (completed: number, total: number) => void,
+): Promise<PuzzleBenchmarkRun> => {
+  const runId = crypto.randomUUID()
+  const total = puzzles.length
+
+  logger.info('Starting puzzle benchmark', {
+    runId,
+    provider,
+    model,
+    theme,
+    puzzleCount: total,
+  })
+
+  const run: PuzzleBenchmarkRun = {
+    id: runId,
+    createdAt: Date.now(),
+    status: 'running',
+    provider,
+    model,
+    puzzleSetId,
+    theme,
+    results: [],
+    totalPuzzles: total,
+  }
+
+  const puzzleConcurrency = parsePositiveInt(process.env.BENCHMARK_PUZZLE_CONCURRENCY, 1)
+  let completed = 0
+
+  const reportProgress = () => {
+    completed++
+    onProgress?.(completed, puzzles.length)
+    // Log every tenth completion and always the last one.
+    if (completed === puzzles.length || completed % 10 === 0) {
+      logger.info('Puzzle benchmark progress', { runId, provider, model, theme, completed, total: puzzles.length })
+    }
+  }
+
+  run.results = await mapWithConcurrency(
+    puzzles,
+    puzzleConcurrency,
+    async (puzzle) => benchmarkPuzzle(puzzle, theme, provider, model, logger),
+    reportProgress,
+  )
+
+  // Aggregate: share of all puzzles answered correctly.
+  const correctCount = run.results.reduce((tally, r) => (r.isCorrect ? tally + 1 : tally), 0)
+  run.correctCount = correctCount
+  run.accuracy = puzzles.length > 0 ? (correctCount / puzzles.length) * 100 : 0
+  run.completedAt = Date.now()
+  run.status = 'completed'
+
+  logger.info('Puzzle benchmark completed', {
+    runId,
+    correctCount,
+    accuracy: run.accuracy,
+  })
+
+  return run
+}
diff --git a/api/services/benchmark/shared-utils.ts b/api/services/benchmark/shared-utils.ts
new file mode 100644
index 0000000..a939e10
--- /dev/null
+++ b/api/services/benchmark/shared-utils.ts
@@ -0,0 +1,95 @@
+import { createAnthropic } from '@ai-sdk/anthropic'
+import { createOpenAI } from '@ai-sdk/openai'
+import { createGoogleGenerativeAI } from '@ai-sdk/google'
+import { createXai } from '@ai-sdk/xai'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+
+/**
+ * Report whether request timeouts should be skipped for a provider/model.
+ *
+ * Only Grok `grok-3*` and `grok-4*` models qualify, and only while
+ * `BENCHMARK_GROK_DISABLE_TIMEOUT` is unset or exactly `'true'`.
+ */
+export const shouldDisableTimeout = (provider: AiModelProvider, model: string): boolean => {
+  if (provider !== 'grok') return false
+  const flag = process.env.BENCHMARK_GROK_DISABLE_TIMEOUT ?? 'true'
+  if (flag !== 'true') return false
+  return ['grok-3', 'grok-4'].some((prefix) => model.startsWith(prefix))
+}
+
+/**
+ * Build the SDK model handle for a provider/model pair, reading that
+ * provider's API key from the environment.
+ *
+ * @throws Error if the provider is not one of the supported ones.
+ */
+export const createProviderModel = (provider: AiModelProvider, model: string) => {
+  if (provider === 'openai') {
+    return createOpenAI({ apiKey: process.env.OPENAI_API_KEY })(model)
+  }
+  if (provider === 'gemini') {
+    return createGoogleGenerativeAI({ apiKey: process.env.GEMINI_API_KEY })(model)
+  }
+  if (provider === 'claude') {
+    return createAnthropic({ apiKey: process.env.ANTHROPIC_API_KEY })(model)
+  }
+  if (provider === 'grok') {
+    return createXai({ apiKey: process.env.XAI_API_KEY })(model)
+  }
+  throw new Error(`Unsupported provider: ${provider}`)
+}
+
+/**
+ * Calculate the F1 score for a legal move benchmark answer.
+ * - Recall: what % of the legal moves the model listed
+ * - Precision: what % of the model's distinct answers were legal
+ * - F1: harmonic mean of precision and recall
+ *
+ * Repeated moves in the model's answer are counted once. Otherwise repeating
+ * a single legal move would push recall above 100% and inflate the score.
+ *
+ * @param legalMoves All legal moves in the position.
+ * @param modelMoves Moves listed by the model (may contain duplicates).
+ * @returns Distinct correct/illegal/missed moves plus `accuracy` (recall, 0-100),
+ *   `penalty` (100 - precision) and `finalScore` (F1, 0-100).
+ */
+export const calculateLegalMoveScore = (
+  legalMoves: string[],
+  modelMoves: string[],
+): {
+  correct: string[]
+  illegal: string[]
+  missed: string[]
+  accuracy: number
+  penalty: number
+  finalScore: number
+} => {
+  const legalSet = new Set(legalMoves)
+  const modelSet = new Set(modelMoves)
+  // Distinct answers, in first-seen order.
+  const answered = [...modelSet]
+
+  const correct = answered.filter((m) => legalSet.has(m))
+  const illegal = answered.filter((m) => !legalSet.has(m))
+  const missed = [...legalSet].filter((m) => !modelSet.has(m))
+
+  // Recall: how many legal moves did you find
+  const recall = legalSet.size > 0 ? (correct.length / legalSet.size) * 100 : 0
+
+  // Precision: how many of your answers were correct
+  const precision = answered.length > 0 ? (correct.length / answered.length) * 100 : 0
+
+  // F1 score: harmonic mean of precision and recall
+  const finalScore = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0
+
+  // Keep accuracy as recall for backwards compatibility, penalty as inverse of precision
+  const accuracy = recall
+  const penalty = 100 - precision
+
+  return { correct, illegal, missed, accuracy, penalty, finalScore }
+}
+
+/**
+ * Name of the environment variable that holds the given provider's API key.
+ */
+export const getApiKeyEnvVar = (provider: AiModelProvider): string => {
+  const keyNameByProvider: Record<AiModelProvider, string> = {
+    openai: 'OPENAI_API_KEY',
+    gemini: 'GEMINI_API_KEY',
+    claude: 'ANTHROPIC_API_KEY',
+    grok: 'XAI_API_KEY',
+  }
+  const { [provider]: envVarName } = keyNameByProvider
+  return envVarName
+}
diff --git a/api/services/benchmark/stockfish-game.ts b/api/services/benchmark/stockfish-game.ts
new file mode 100644
index 0000000..7ccf8de
--- /dev/null
+++ b/api/services/benchmark/stockfish-game.ts
@@ -0,0 +1,332 @@
+import { spawn, ChildProcess } from 'child_process'
+import { Chess } from 'chess.js'
+import fs from 'fs'
+import path from 'path'
+import mustache from 'mustache'
+import { Logger } from 'motia'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { StockfishGameResult, StockfishGameMove } from '@chessarena/types/stockfish-benchmark'
+import { makePrompt } from '../ai/make-prompt'
+
+// Mustache template for the AI player's move prompt; read synchronously once when this module loads.
+const promptTemplate = fs.readFileSync(path.join(__dirname, '../../steps/chess/05-ai-player.mustache'), 'utf8')
+
+const MAX_MOVES = 200 // Max half-moves before declaring draw
+// Consecutive illegal or missing AI moves tolerated before the game ends as 'ai_illegal_move'.
+const MAX_ILLEGAL_ATTEMPTS = 3
+
+/**
+ * Minimal driver for a Stockfish child process speaking the UCI protocol
+ * over stdin/stdout.
+ *
+ * Engine output accumulates in a string buffer; `waitFor` polls that buffer
+ * until an expected token shows up, then hands back and clears everything
+ * collected so far.
+ */
+class StockfishEngine {
+  private process: ChildProcess | null = null
+  private ready = false
+  private outputBuffer = ''
+
+  /**
+   * Spawn the engine and complete the `uci` / `isready` handshake.
+   *
+   * @param enginePath Path to the Stockfish executable.
+   * @throws If the process cannot be spawned or either handshake step does
+   *   not answer within 5 seconds.
+   */
+  async init(enginePath: string): Promise<void> {
+    const child = spawn(enginePath)
+    this.process = child
+
+    child.stdout?.on('data', (data) => {
+      this.outputBuffer += data.toString()
+    })
+
+    child.stderr?.on('data', (data) => {
+      console.error('Stockfish error:', data.toString())
+    })
+
+    // Rejects if the process fails to spawn or errors later. The listener
+    // stays attached so a late 'error' event never goes unhandled.
+    const spawnFailure = new Promise<never>((_, reject) => {
+      child.on('error', reject)
+    })
+
+    // Both handshake steps are awaited in sequence, so a timeout on either
+    // one rejects init() instead of leaving it pending.
+    const handshake = async (): Promise<void> => {
+      this.send('uci')
+      await this.waitFor('uciok', 5000)
+      this.send('isready')
+      await this.waitFor('readyok', 5000)
+      this.ready = true
+    }
+
+    await Promise.race([handshake(), spawnFailure])
+  }
+
+  /** Write one UCI command line to the engine; a no-op if no process is running. */
+  private send(command: string): void {
+    this.process?.stdin?.write(command + '\n')
+  }
+
+  /**
+   * Poll the output buffer every 50ms until it contains `text`.
+   *
+   * @returns Everything buffered up to that point (the buffer is then cleared).
+   * @throws If `text` does not appear within `timeout` milliseconds.
+   */
+  private async waitFor(text: string, timeout: number): Promise<string> {
+    const start = Date.now()
+    while (Date.now() - start < timeout) {
+      if (this.outputBuffer.includes(text)) {
+        const result = this.outputBuffer
+        this.outputBuffer = ''
+        return result
+      }
+      await new Promise((r) => setTimeout(r, 50))
+    }
+    throw new Error(`Timeout waiting for: ${text}`)
+  }
+
+  /** Set the engine's `Skill Level` option (clamped to 0-20) and wait until it is ready again. */
+  async setLevel(level: number): Promise<void> {
+    const skillLevel = Math.max(0, Math.min(20, level))
+    this.send(`setoption name Skill Level value ${skillLevel}`)
+    this.send('isready')
+    await this.waitFor('readyok', 2000)
+  }
+
+  /**
+   * Ask the engine for its move in the given position.
+   *
+   * @param fen Position to search.
+   * @param thinkTime Search time in milliseconds.
+   * @returns The engine's move as reported after `bestmove` (UCI notation).
+   */
+  async getBestMove(fen: string, thinkTime: number = 1000): Promise<string> {
+    this.outputBuffer = ''
+    this.send(`position fen ${fen}`)
+    this.send(`go movetime ${thinkTime}`)
+
+    const output = await this.waitFor('bestmove', 30000)
+    const match = output.match(/bestmove\s+(\S+)/)
+    if (!match) throw new Error('Could not parse best move')
+    return match[1]
+  }
+
+  /**
+   * Evaluate a position with a depth-15 search.
+   *
+   * @returns `score` in centipawns and the engine's `bestMove`. Per the UCI
+   *   protocol the score is from the point of view of the side to move. Mate
+   *   scores are mapped to roughly +/-10000, closer mates being more extreme.
+   *   `score` is 0 if the engine reported none; `bestMove` is '' if none was parsed.
+   */
+  async evaluate(fen: string): Promise<{ score: number; bestMove: string }> {
+    this.outputBuffer = ''
+    this.send(`position fen ${fen}`)
+    this.send('go depth 15')
+
+    const output = await this.waitFor('bestmove', 30000)
+
+    // The engine emits one score per info line; the last one is the deepest
+    // and therefore the authoritative verdict for this search.
+    const reports = [...output.matchAll(/score (cp|mate) (-?\d+)/g)]
+    const latest = reports[reports.length - 1]
+    const bestMoveMatch = output.match(/bestmove\s+(\S+)/)
+
+    let score = 0
+    if (latest) {
+      const value = parseInt(latest[2], 10)
+      if (latest[1] === 'mate') {
+        score = value > 0 ? 10000 - value * 100 : -10000 - value * 100
+      } else {
+        score = value
+      }
+    }
+
+    return {
+      score,
+      bestMove: bestMoveMatch ? bestMoveMatch[1] : '',
+    }
+  }
+
+  /** Ask the engine to quit, kill the process and drop the handle. */
+  async quit(): Promise<void> {
+    this.send('quit')
+    this.process?.kill()
+    this.process = null
+  }
+}
+
+/**
+ * Play a single game between an LLM and Stockfish and collect move statistics.
+ *
+ * The AI gets up to MAX_ILLEGAL_ATTEMPTS consecutive tries per turn; illegal
+ * or missing moves beyond that end the game as 'ai_illegal_move'. Every
+ * accepted AI move is evaluated by the engine before and after, giving a
+ * centipawn loss used to classify blunders, mistakes and inaccuracies.
+ *
+ * @param provider LLM provider to query for moves.
+ * @param model Model name for that provider.
+ * @param aiColor Side the LLM plays.
+ * @param stockfishLevel Engine skill level.
+ * @param logger Logger for progress and failures.
+ * @returns The game record. Never throws: a missing STOCKFISH_BIN_PATH or any
+ *   engine failure yields `status: 'failed'` with the reason in `resultReason`.
+ */
+export const playGameAgainstStockfish = async (
+  provider: AiModelProvider,
+  model: string,
+  aiColor: 'white' | 'black',
+  stockfishLevel: number,
+  logger: Logger,
+): Promise<StockfishGameResult> => {
+  const gameId = crypto.randomUUID()
+  const chess = new Chess()
+  const moves: StockfishGameMove[] = []
+  let totalCentipawnLoss = 0
+  let aiMoveCount = 0
+  let blunders = 0
+  let mistakes = 0
+  let inaccuracies = 0
+
+  const result: StockfishGameResult = {
+    id: gameId,
+    createdAt: Date.now(),
+    status: 'running',
+    provider,
+    model,
+    aiColor,
+    stockfishLevel,
+    moves: [],
+    totalMoves: 0,
+  }
+
+  const enginePath = process.env.STOCKFISH_BIN_PATH
+  if (!enginePath) {
+    result.status = 'failed'
+    result.resultReason = 'STOCKFISH_BIN_PATH not set'
+    return result
+  }
+
+  const engine = new StockfishEngine()
+
+  try {
+    logger.info(`Starting game: ${provider}/${model} as ${aiColor} vs Stockfish level ${stockfishLevel}`)
+    await engine.init(enginePath)
+    await engine.setLevel(stockfishLevel)
+
+    let moveNumber = 0
+    let illegalAttempts = 0
+
+    while (!chess.isGameOver() && moveNumber < MAX_MOVES) {
+      const currentPlayer = chess.turn() === 'w' ? 'white' : 'black'
+      const isAiTurn = currentPlayer === aiColor
+      const fenBefore = chess.fen()
+
+      moveNumber++
+
+      if (isAiTurn) {
+        // AI's turn - get move from LLM
+        const validMoves = chess.moves({ verbose: true })
+        const templateData = {
+          fenBefore,
+          fen: fenBefore,
+          inCheck: chess.isCheck(),
+          player: currentPlayer,
+          validMoves,
+          totalMoves: validMoves.length,
+        }
+
+        const prompt = mustache.render(promptTemplate, templateData, {}, { escape: (v: string) => v })
+
+        const startTime = Date.now()
+        let moveSan: string | null = null
+
+        try {
+          const response = await makePrompt({
+            prompt,
+            provider,
+            model,
+            logger,
+          })
+
+          moveSan = response?.moveSan
+        } catch (e) {
+          // A failed request is handled like a missing move below, but log it
+          // so the cause of the lost attempt is visible.
+          logger.warn('AI move request failed', {
+            error: e instanceof Error ? e.message : 'Unknown error',
+            moveNumber,
+          })
+        }
+
+        const responseTime = Date.now() - startTime
+
+        if (!moveSan) {
+          // No move returned
+          illegalAttempts++
+          if (illegalAttempts >= MAX_ILLEGAL_ATTEMPTS) {
+            result.result = 'ai_illegal_move'
+            result.resultReason = 'AI failed to return valid move'
+            break
+          }
+          moveNumber-- // Retry
+          continue
+        }
+
+        // Validate the move on its own, so only a genuinely illegal move counts
+        // against the AI. Engine failures further down propagate to the outer
+        // handler and fail the game.
+        let accepted = false
+        try {
+          accepted = Boolean(chess.move(moveSan))
+        } catch {
+          accepted = false
+        }
+
+        if (!accepted) {
+          illegalAttempts++
+          logger.warn(`  Illegal move attempt ${illegalAttempts}: ${moveSan}`)
+
+          if (illegalAttempts >= MAX_ILLEGAL_ATTEMPTS) {
+            result.result = 'ai_illegal_move'
+            result.resultReason = `Too many illegal moves (last: ${moveSan})`
+            break
+          }
+          moveNumber-- // Retry
+          continue
+        }
+
+        const fenAfter = chess.fen()
+        const evalBefore = await engine.evaluate(fenBefore)
+        const evalAfter = await engine.evaluate(fenAfter)
+
+        // UCI scores are relative to the side to move. Before the move that is
+        // the AI; afterwards it is the opponent, so the sign flips to keep both
+        // numbers from the AI's perspective.
+        const scoreBefore = evalBefore.score
+        const scoreAfter = -evalAfter.score
+        const centipawnLoss = Math.max(0, scoreBefore - scoreAfter)
+
+        totalCentipawnLoss += centipawnLoss
+        aiMoveCount++
+
+        if (centipawnLoss > 100) blunders++
+        else if (centipawnLoss > 50) mistakes++
+        else if (centipawnLoss > 25) inaccuracies++
+
+        moves.push({
+          moveNumber,
+          player: currentPlayer,
+          moveSan,
+          fen: fenAfter,
+          centipawnScore: scoreAfter,
+          bestMove: evalBefore.bestMove,
+          centipawnLoss,
+          isAiMove: true,
+          responseTime,
+        })
+
+        illegalAttempts = 0
+        logger.info(`  Move ${moveNumber}: ${moveSan} (CPL: ${centipawnLoss})`)
+      } else {
+        // Stockfish's turn
+        const uciMove = await engine.getBestMove(fenBefore, 500) // 500ms think time
+
+        // Convert UCI to SAN
+        const from = uciMove.slice(0, 2)
+        const to = uciMove.slice(2, 4)
+        const promotion = uciMove.length > 4 ? uciMove[4] : undefined
+
+        const move = chess.move({ from, to, promotion })
+        if (!move) {
+          // Without this the loop would spin on the same position until MAX_MOVES.
+          throw new Error(`Stockfish returned an unplayable move: ${uciMove}`)
+        }
+
+        moves.push({
+          moveNumber,
+          player: currentPlayer,
+          moveSan: move.san,
+          fen: chess.fen(),
+          isAiMove: false,
+        })
+        logger.info(`  Move ${moveNumber}: ${move.san} (Stockfish)`)
+      }
+    }
+
+    // Determine result
+    if (!result.result) {
+      if (chess.isCheckmate()) {
+        // The side to move is the one that has been mated.
+        const winner = chess.turn() === 'w' ? 'black' : 'white'
+        result.result = winner === aiColor ? 'ai_win' : 'stockfish_win'
+        result.resultReason = 'Checkmate'
+      } else if (chess.isDraw()) {
+        result.result = 'draw'
+        if (chess.isStalemate()) result.resultReason = 'Stalemate'
+        else if (chess.isThreefoldRepetition()) result.resultReason = 'Threefold repetition'
+        else if (chess.isInsufficientMaterial()) result.resultReason = 'Insufficient material'
+        else result.resultReason = '50-move rule'
+      } else if (moveNumber >= MAX_MOVES) {
+        result.result = 'draw'
+        result.resultReason = 'Max moves reached'
+      }
+    }
+
+    result.moves = moves
+    result.totalMoves = moves.length
+    result.finalFen = chess.fen()
+    result.pgn = chess.pgn()
+    result.aiMoveCount = aiMoveCount
+    result.totalCentipawnLoss = totalCentipawnLoss
+    result.averageCentipawnLoss = aiMoveCount > 0 ? totalCentipawnLoss / aiMoveCount : 0
+    result.blunders = blunders
+    result.mistakes = mistakes
+    result.inaccuracies = inaccuracies
+    result.status = 'completed'
+    result.completedAt = Date.now()
+
+    logger.info(`Game completed: ${result.result} - ACPL: ${result.averageCentipawnLoss?.toFixed(1)}`)
+  } catch (error) {
+    result.status = 'failed'
+    result.resultReason = error instanceof Error ? error.message : 'Unknown error'
+    logger.error('Game failed', { error })
+  } finally {
+    // Always shut the engine process down, whether the game finished or failed.
+    await engine.quit()
+  }
+
+  return result
+}
diff --git a/api/services/chess/create-game.ts b/api/services/chess/create-game.ts
index 0b0fcd1..a3fba82 100644
--- a/api/services/chess/create-game.ts
+++ b/api/services/chess/create-game.ts
@@ -1,6 +1,6 @@
 import { FlowContextStateStreams, Logger } from 'motia'
 import { createGameId } from './create-game-id'
-import { Game } from '@chessarena/types/game'
+import { BenchmarkVariant, Game } from '@chessarena/types/game'
 import { models } from '../ai/models'
 import { isAiGame } from './utils'
 import { User } from '@chessarena/types/user'
@@ -10,6 +10,7 @@ export const createGame = async (
   streams: FlowContextStateStreams,
   logger: Logger,
   user?: User,
+  variant: BenchmarkVariant = 'guided',
 ): Promise<Game> => {
   const gameId = await createGameId({ streams, logger })
 
@@ -18,11 +19,13 @@ export const createGame = async (
     fen: 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1',
     turn: 'white',
     status: 'pending',
+    variant,
     players: {
       white: { ...players.white, userId: players.white.ai ? undefined : user?.id },
       black: { ...players.black, userId: players.black.ai },
     },
     check: false,
+    createdAt: Date.now(),
   })
 
   if (isAiGame(game) && players.white.ai && players.black.ai) {
diff --git a/api/services/chess/generate-pgn.ts b/api/services/chess/generate-pgn.ts
new file mode 100644
index 0000000..d14188d
--- /dev/null
+++ b/api/services/chess/generate-pgn.ts
@@ -0,0 +1,77 @@
+import { Chess } from 'chess.js'
+import type { Game } from '@chessarena/types/game'
+import type { GameMove } from '@chessarena/types/game-move'
+
+type PgnOptions = {
+  game: Game // Source of the player, result, variant and termination header values
+  moves: GameMove[] // Replayed in array order to rebuild the movetext
+}
+
+/**
+ * Returns the promotion piece ('q' | 'r' | 'b' | 'n') when the move that landed on `to` was a
+ * pawn promotion, otherwise undefined. A promotion is recognised by the mover (the side to move
+ * in `fenBefore`) having fewer pawns in `fenAfter`, so a queen, rook, bishop or knight that merely
+ * moves to rank 1/8 is not mistaken for one. When `fenBefore` has no usable side-to-move field
+ * the pawn check is skipped and only the piece standing on `to` in `fenAfter` is inspected.
+ */
+const detectPromotion = (fenBefore: string, fenAfter: string, to: string): string | undefined => {
+  const toRank = to[1]
+  if (toRank !== '1' && toRank !== '8') return undefined // promotions only end on a back rank
+
+  const afterBoard = fenAfter.split(' ')[0]
+  const [beforeBoard, mover] = typeof fenBefore === 'string' ? fenBefore.split(' ') : []
+  if (beforeBoard && (mover === 'w' || mover === 'b')) {
+    const pawn = mover === 'w' ? 'P' : 'p'
+    const pawnCount = (board: string) => board.split(pawn).length - 1
+    if (pawnCount(afterBoard) >= pawnCount(beforeBoard)) return undefined // the mover kept all pawns
+  }
+
+  // FEN lists rank 8 first; walk the destination rank, expanding digit runs of empty squares.
+  const file = to.charCodeAt(0) - 'a'.charCodeAt(0)
+  let col = 0
+  for (const char of afterBoard.split('/')[toRank === '8' ? 0 : 7]) {
+    if (col === file) return ['q', 'r', 'b', 'n'].includes(char.toLowerCase()) ? char.toLowerCase() : undefined
+    col += /\d/.test(char) ? parseInt(char, 10) : 1
+  }
+  return undefined
+}
+
+/**
+ * Builds a PGN string by replaying `moves` on a fresh chess.js board and attaching ChessArena
+ * headers derived from `game`. Moves that fail to replay are skipped (best effort), so the
+ * movetext can hold fewer moves than `moves`. The Date tag is taken from `game.createdAt` when
+ * it is set, so exporting an older game does not stamp it with the export day.
+ */
+export const generatePgn = ({ game, moves }: PgnOptions): string => {
+  const chess = new Chess()
+  for (const move of moves) {
+    const from = move.lastMove[0]
+    const to = move.lastMove[1]
+    try {
+      // Promotion detection stays inside the try so bad FEN data only drops this one move.
+      chess.move({ from, to, promotion: detectPromotion(move.fenBefore, move.fenAfter, to) })
+    } catch {
+      // Skip invalid moves (shouldn't happen with valid history)
+    }
+  }
+
+  // Games stored without createdAt fall back to the current date.
+  const playedAt = game.createdAt ? new Date(game.createdAt) : new Date()
+  const headers: Record<string, string> = {
+    Event: 'ChessArena.ai Benchmark',
+    Site: 'https://chessarena.ai',
+    Date: playedAt.toISOString().split('T')[0].replace(/-/g, '.'),
+    Round: '1',
+    White: game.players.white.ai ? `${game.players.white.ai} (${game.players.white.model || 'unknown'})` : 'Human',
+    Black: game.players.black.ai ? `${game.players.black.ai} (${game.players.black.model || 'unknown'})` : 'Human',
+    Result: game.winner === 'white' ? '1-0' : game.winner === 'black' ? '0-1' : '1/2-1/2',
+    Variant: game.variant || 'guided',
+  }
+  if (game.endGameReason) headers.Termination = game.endGameReason
+
+  for (const [key, value] of Object.entries(headers)) {
+    chess.header(key, value)
+  }
+
+  return chess.pgn()
+}
diff --git a/api/steps/benchmark/00-generate-position-set.step.ts b/api/steps/benchmark/00-generate-position-set.step.ts
new file mode 100644
index 0000000..14abe22
--- /dev/null
+++ b/api/steps/benchmark/00-generate-position-set.step.ts
@@ -0,0 +1,65 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { PositionSetSchema } from '@chessarena/types/legal-move-benchmark'
+import { generateTestPositions } from '../../services/benchmark/run-legal-move-benchmark'
+
+const bodySchema = z.object({
+  count: z.number().min(1).max(50).default(20), // Number of positions requested: 1-50, defaults to 20
+  force: z.boolean().default(false), // Regenerate even when the stored set already has >= count positions
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GeneratePositionSet', // Also the key that types the handler: Handlers['GeneratePositionSet']
+  description: 'Generate and store a set of test positions for benchmarking',
+  path: '/benchmark/positions/generate',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: PositionSetSchema, // The reused or newly generated position set
+    400: z.object({ message: z.string() }), // Generation returned no positions or threw
+  },
+}
+
+/**
+ * Handles POST /benchmark/positions/generate.
+ *
+ * Returns the stored 'default' position set unchanged when it already holds at least `count`
+ * positions and `force` is false. Otherwise generates `count` positions, overwrites the stored
+ * set and returns the new one. Empty output or a thrown error is answered with a 400.
+ */
+export const handler: Handlers['GeneratePositionSet'] = async (req, { logger, streams }) => {
+  const { count, force } = req.body
+  logger.info('Generating position set', { count, force })
+
+  const stored = await streams.positionSet.get('sets', 'default')
+  if (!force && stored && stored.positions.length >= count) {
+    logger.info('Using existing position set', { existingCount: stored.positions.length })
+    return { status: 200, body: stored }
+  }
+
+  try {
+    const generated = generateTestPositions({ count })
+    if (generated.length === 0) {
+      return { status: 400, body: { message: 'Failed to generate any positions' } }
+    }
+
+    // Written under 'sets'/'default', so this replaces any previously stored set.
+    const positionSet = {
+      id: `positions-${Date.now()}`,
+      createdAt: Date.now(),
+      count: generated.length,
+      positions: generated,
+    }
+    await streams.positionSet.set('sets', 'default', positionSet)
+    logger.info('Position set created', { count: generated.length })
+
+    return { status: 200, body: positionSet }
+  } catch (error) {
+    logger.error('Failed to generate positions', { error })
+    const message = error instanceof Error ? error.message : 'Failed to generate positions'
+    return { status: 400, body: { message } }
+  }
+}
diff --git a/api/steps/benchmark/01-run-legal-move-benchmark.step.ts b/api/steps/benchmark/01-run-legal-move-benchmark.step.ts
new file mode 100644
index 0000000..12587d5
--- /dev/null
+++ b/api/steps/benchmark/01-run-legal-move-benchmark.step.ts
@@ -0,0 +1,94 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { AiModelProviderSchema } from '@chessarena/types/ai-models'
+import { LegalMoveBenchmarkRunSchema } from '@chessarena/types/legal-move-benchmark'
+import { runLegalMoveBenchmark, generateTestPositions } from '../../services/benchmark/run-legal-move-benchmark'
+import { getModelsForProvider } from '../../services/ai/models'
+
+const bodySchema = z.object({
+  provider: AiModelProviderSchema(),
+  model: z.string(), // Checked in the handler against getModelsForProvider(provider)
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'RunLegalMoveBenchmark', // Also the key that types the handler: Handlers['RunLegalMoveBenchmark']
+  description: 'Run legal move generation benchmark for a model',
+  path: '/benchmark/legal-moves/run',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: LegalMoveBenchmarkRunSchema, // The run that was just executed and stored
+    400: z.object({ message: z.string() }), // Unsupported model, or the benchmark threw
+  },
+}
+
+/**
+ * Handles POST /benchmark/legal-moves/run: benchmarks one provider/model on the stored 'default'
+ * position set (generating and storing 20 positions first when none exist), stores the run and
+ * folds its averageFinalScore into the model's summary. Replies 400 for an unsupported model,
+ * when position generation yields nothing, or when running/storing the benchmark throws.
+ */
+export const handler: Handlers['RunLegalMoveBenchmark'] = async (req, { logger, streams }) => {
+  const { provider, model } = req.body
+
+  if (!getModelsForProvider(provider).includes(model)) {
+    return { status: 400, body: { message: `Model ${model} is not supported for provider ${provider}` } }
+  }
+
+  let positionSet = await streams.positionSet.get('sets', 'default')
+  if (!positionSet || positionSet.positions.length === 0) {
+    logger.info('No position set found, generating new one')
+    const positions = generateTestPositions({ count: 20 })
+    // Same guard as the GeneratePositionSet route: never store or benchmark an empty set.
+    if (positions.length === 0) {
+      return { status: 400, body: { message: 'Failed to generate any positions' } }
+    }
+
+    positionSet = {
+      id: `positions-${Date.now()}`,
+      createdAt: Date.now(),
+      count: positions.length,
+      positions,
+    }
+    await streams.positionSet.set('sets', 'default', positionSet)
+  }
+
+  logger.info('Starting legal move benchmark', { provider, model, positionCount: positionSet.positions.length })
+
+  try {
+    const run = await runLegalMoveBenchmark(positionSet.positions, provider, model, logger)
+    await streams.legalMoveBenchmark.set('runs', run.id, run)
+
+    // Fold this run into the model's summary; a run without averageFinalScore counts as 0.
+    const score = run.averageFinalScore ?? 0
+    const summaryId = `${provider}:${model}`
+    const previous = await streams.legalMoveBenchmarkSummary.get('models', summaryId)
+    const runsCompleted = (previous?.runsCompleted ?? 0) + 1
+
+    const newSummary = {
+      id: summaryId,
+      provider,
+      model,
+      runsCompleted,
+      // Running mean: the previous mean weighted by its run count, plus the new score.
+      averageScore: previous ? (previous.averageScore * previous.runsCompleted + score) / runsCompleted : score,
+      bestScore: Math.max(previous?.bestScore ?? 0, score),
+      worstScore: previous ? Math.min(previous.worstScore, score) : score,
+      lastRunAt: Date.now(),
+    }
+    await streams.legalMoveBenchmarkSummary.set('models', summaryId, newSummary)
+
+    logger.info('Benchmark completed and stored', { runId: run.id })
+
+    return { status: 200, body: run }
+  } catch (error) {
+    logger.error('Benchmark failed', { error })
+    return {
+      status: 400,
+      body: { message: error instanceof Error ? error.message : 'Benchmark failed' },
+    }
+  }
+}
diff --git a/api/steps/benchmark/02-get-benchmark-runs.step.ts b/api/steps/benchmark/02-get-benchmark-runs.step.ts
new file mode 100644
index 0000000..d2a35ee
--- /dev/null
+++ b/api/steps/benchmark/02-get-benchmark-runs.step.ts
@@ -0,0 +1,64 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { LegalMoveBenchmarkRunSchema } from '@chessarena/types/legal-move-benchmark'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetBenchmarkRuns',
+  description: 'Get legal move benchmark runs with optional filters',
+  path: '/benchmark/legal-moves/runs',
+  method: 'GET',
+  emits: [],
+  flows: ['benchmark'],
+  queryParams: [
+    { name: 'provider', description: 'Filter by AI provider' }, // Exact match on run.provider
+    { name: 'model', description: 'Filter by model name' }, // Exact match on run.model
+    { name: 'limit', description: 'Pagination limit' }, // Handler uses 20 when omitted
+    { name: 'offset', description: 'Pagination offset' }, // Handler uses 0 when omitted
+  ],
+  responseSchema: {
+    200: z.object({
+      runs: z.array(
+        LegalMoveBenchmarkRunSchema.omit({ positions: true, results: true }).extend({
+          resultsCount: z.number(), // Length of the omitted results array (0 when absent)
+        }),
+      ),
+      total: z.number(), // Number of runs matching the filters, before pagination
+    }),
+  },
+}
+
+/**
+ * Handles GET /benchmark/legal-moves/runs: lists stored runs, optionally filtered by `provider`
+ * and `model`, newest first, paginated by `limit`/`offset`. Each run is returned without its
+ * `positions`/`results` arrays; `resultsCount` carries the number of results instead.
+ * Missing, non-numeric or negative `limit`/`offset` values fall back to 20 and 0.
+ */
+export const handler: Handlers['GetBenchmarkRuns'] = async (req, { logger, streams }) => {
+  const params = req.queryParams as Record<string, string | undefined>
+  const { provider, model } = params
+
+  // Unchecked parseInt output (NaN, negatives) makes slice() return an empty or shifted page.
+  const toCount = (raw: string | undefined, fallback: number): number => {
+    const parsed = raw ? parseInt(raw, 10) : NaN
+    return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback
+  }
+  const limit = toCount(params.limit, 20)
+  const offset = toCount(params.offset, 0)
+
+  logger.info('Fetching benchmark runs', { provider, model, limit, offset })
+
+  const allRuns = await streams.legalMoveBenchmark.getGroup('runs')
+  const filtered = allRuns
+    .filter((run) => (!provider || run.provider === provider) && (!model || run.model === model))
+    .sort((a, b) => (b.createdAt ?? 0) - (a.createdAt ?? 0)) // most recent first
+  const total = filtered.length
+
+  // Strip the heavy arrays from the runs on the requested page and report only the result count.
+  const runs = filtered.slice(offset, offset + limit).map(({ positions, results, ...rest }) => ({
+    ...rest,
+    resultsCount: results?.length ?? 0,
+  }))
+
+  return { status: 200, body: { runs, total } }
+}
diff --git a/api/steps/benchmark/03-get-benchmark-run-detail.step.ts b/api/steps/benchmark/03-get-benchmark-run-detail.step.ts
new file mode 100644
index 0000000..7bdb949
--- /dev/null
+++ b/api/steps/benchmark/03-get-benchmark-run-detail.step.ts
@@ -0,0 +1,31 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { LegalMoveBenchmarkRunSchema } from '@chessarena/types/legal-move-benchmark'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetBenchmarkRunDetail',
+  description: 'Get detailed benchmark run including all positions and results',
+  path: '/benchmark/legal-moves/runs/:runId', // :runId is read by the handler from req.pathParams
+  method: 'GET',
+  emits: [],
+  flows: ['benchmark'],
+  responseSchema: {
+    200: LegalMoveBenchmarkRunSchema, // The full stored run
+    404: z.object({ message: z.string() }), // No run stored under the given runId
+  },
+}
+
+/**
+ * Handles GET /benchmark/legal-moves/runs/:runId: returns the stored run with that id in full,
+ * or a 404 when nothing is stored under it.
+ */
+export const handler: Handlers['GetBenchmarkRunDetail'] = async (req, { logger, streams }) => {
+  const runId = req.pathParams.runId
+  logger.info('Fetching benchmark run detail', { runId })
+
+  const stored = await streams.legalMoveBenchmark.get('runs', runId)
+  return stored
+    ? { status: 200, body: stored }
+    : { status: 404, body: { message: 'Benchmark run not found' } }
+}
diff --git a/api/steps/benchmark/04-get-benchmark-leaderboard.step.ts b/api/steps/benchmark/04-get-benchmark-leaderboard.step.ts
new file mode 100644
index 0000000..3366553
--- /dev/null
+++ b/api/steps/benchmark/04-get-benchmark-leaderboard.step.ts
@@ -0,0 +1,29 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { LegalMoveBenchmarkSummarySchema } from '@chessarena/types/legal-move-benchmark'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetBenchmarkLeaderboard',
+  description: 'Get legal move benchmark leaderboard sorted by average score',
+  path: '/benchmark/legal-moves/leaderboard',
+  method: 'GET',
+  emits: [],
+  flows: ['benchmark'],
+  responseSchema: {
+    200: z.object({
+      leaderboard: z.array(LegalMoveBenchmarkSummarySchema), // Per-model summaries, highest averageScore first
+    }),
+  },
+}
+
+/**
+ * Handles GET /benchmark/legal-moves/leaderboard: all per-model summaries, highest averageScore first.
+ */
+export const handler: Handlers['GetBenchmarkLeaderboard'] = async (req, { logger, streams }) => {
+  logger.info('Fetching benchmark leaderboard')
+  const summaries = await streams.legalMoveBenchmarkSummary.getGroup('models')
+  // Sort a copy: sort() reorders in place and would otherwise mutate the array the stream returned.
+  const leaderboard = [...summaries].sort((a, b) => (b.averageScore ?? 0) - (a.averageScore ?? 0))
+  return { status: 200, body: { leaderboard } }
+}
diff --git a/api/steps/benchmark/05-fetch-puzzle-set.step.ts b/api/steps/benchmark/05-fetch-puzzle-set.step.ts
new file mode 100644
index 0000000..a1e4034
--- /dev/null
+++ b/api/steps/benchmark/05-fetch-puzzle-set.step.ts
@@ -0,0 +1,72 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { PuzzleSetSchema, PuzzleThemeSchema } from '@chessarena/types/puzzle-benchmark'
+import { fetchPuzzles } from '../../services/benchmark/fetch-lichess-puzzles'
+
+const bodySchema = z.object({
+  theme: PuzzleThemeSchema, // Also the key the puzzle set is stored under
+  count: z.number().min(1).max(100).default(10), // Puzzles wanted in the set: 1-100, defaults to 10
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'FetchPuzzleSet',
+  description: 'Fetch and store a set of puzzles from Lichess',
+  path: '/benchmark/puzzles/fetch',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: PuzzleSetSchema, // The reused or newly stored puzzle set
+    400: z.object({ message: z.string() }), // No puzzles available after fetching, or the fetch threw
+  },
+}
+
+/**
+ * Handles POST /benchmark/puzzles/fetch: returns the stored set for `theme` when it already has
+ * at least `count` puzzles; otherwise fetches the shortfall, appends the puzzles whose ids are new
+ * (to the stored set and within the fetched batch), stores the merged set and returns it.
+ */
+export const handler: Handlers['FetchPuzzleSet'] = async (req, { logger, streams }) => {
+  const { theme, count } = req.body
+  logger.info('Fetching puzzle set', { theme, count })
+
+  const existingSet = await streams.puzzleSet.get('sets', theme)
+  const existingPuzzles = existingSet?.puzzles ?? []
+  if (existingSet && existingPuzzles.length >= count) {
+    logger.info('Using existing puzzle set', { theme, existingCount: existingPuzzles.length })
+    return { status: 200, body: existingSet }
+  }
+
+  try {
+    const needed = Math.max(0, count - existingPuzzles.length)
+    const fetched = needed > 0 ? await fetchPuzzles(theme, needed, logger) : []
+
+    // Ids are recorded as puzzles are accepted, so repeats inside the fetched batch are dropped too.
+    const seenIds = new Set<string>(existingPuzzles.map((p) => p.id))
+    const puzzles = [...existingPuzzles]
+    for (const puzzle of fetched) {
+      if (!seenIds.has(puzzle.id)) puzzles.push(puzzle)
+      seenIds.add(puzzle.id)
+    }
+    if (puzzles.length === 0) {
+      return { status: 400, body: { message: 'Failed to fetch any puzzles' } }
+    }
+
+    const puzzleSet = {
+      id: `${theme}-${Date.now()}`,
+      theme,
+      createdAt: Date.now(),
+      puzzles,
+      count: puzzles.length,
+    }
+    await streams.puzzleSet.set('sets', theme, puzzleSet)
+    logger.info('Puzzle set created', { theme, count: puzzles.length })
+    return { status: 200, body: puzzleSet }
+  } catch (error) {
+    logger.error('Failed to fetch puzzles', { error })
+    const message = error instanceof Error ? error.message : 'Failed to fetch puzzles'
+    return { status: 400, body: { message } }
+  }
+}
diff --git a/api/steps/benchmark/06-run-puzzle-benchmark.step.ts b/api/steps/benchmark/06-run-puzzle-benchmark.step.ts
new file mode 100644
index 0000000..318d2c4
--- /dev/null
+++ b/api/steps/benchmark/06-run-puzzle-benchmark.step.ts
@@ -0,0 +1,92 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { AiModelProviderSchema } from '@chessarena/types/ai-models'
+import { PuzzleBenchmarkRunSchema, PuzzleThemeSchema } from '@chessarena/types/puzzle-benchmark'
+import { runPuzzleBenchmark } from '../../services/benchmark/run-puzzle-benchmark'
+import { getModelsForProvider } from '../../services/ai/models'
+
+const bodySchema = z.object({
+  provider: AiModelProviderSchema(),
+  model: z.string(), // Checked in the handler against getModelsForProvider(provider)
+  theme: PuzzleThemeSchema, // Selects which stored puzzle set is used
+  count: z.number().min(1).max(100).default(10), // Upper bound on puzzles taken from the set: 1-100, default 10
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'RunPuzzleBenchmark',
+  description: 'Run puzzle benchmark for a model',
+  path: '/benchmark/puzzles/run',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: PuzzleBenchmarkRunSchema, // The run that was just executed and stored
+    400: z.object({ message: z.string() }), // Unsupported model, no puzzle set for the theme, or the benchmark threw
+  },
+}
+
+/**
+ * Handles POST /benchmark/puzzles/run.
+ *
+ * Runs the puzzle benchmark for one provider/model on the first `count` puzzles of the stored
+ * set for `theme`, stores the run, then rewrites the model's summary: the accuracy of the theme
+ * just run is replaced, the other theme's accuracy is carried over, and overallAccuracy is the
+ * mean of the two when both exist, else whichever exists, else 0. Replies 400 for an unsupported
+ * model, a missing or empty puzzle set, or an error thrown while benchmarking or storing.
+ */
+export const handler: Handlers['RunPuzzleBenchmark'] = async (req, { logger, streams }) => {
+  const { provider, model, theme, count } = req.body
+
+  if (!getModelsForProvider(provider).includes(model)) {
+    return {
+      status: 400,
+      body: { message: `Model ${model} is not supported for provider ${provider}` },
+    }
+  }
+
+  const puzzleSet = await streams.puzzleSet.get('sets', theme)
+  if (!puzzleSet || puzzleSet.puzzles.length === 0) {
+    return { status: 400, body: { message: `No puzzle set found for theme ${theme}. Fetch puzzles first.` } }
+  }
+
+  const puzzles = puzzleSet.puzzles.slice(0, count)
+  logger.info('Starting puzzle benchmark', { provider, model, theme, puzzleCount: puzzles.length })
+
+  try {
+    const run = await runPuzzleBenchmark(puzzles, puzzleSet.id, theme, provider, model, logger)
+    await streams.puzzleBenchmark.set('runs', run.id, run)
+
+    const summaryId = `${provider}:${model}`
+    const previous = await streams.puzzleBenchmarkSummary.get('models', summaryId)
+    const mateIn1Accuracy = theme === 'mateIn1' ? run.accuracy : previous?.mateIn1Accuracy
+    const oneMoveAccuracy = theme === 'oneMove' ? run.accuracy : previous?.oneMoveAccuracy
+
+    // The explicit `!== undefined` tests decide whether both themes count towards the mean.
+    const overallAccuracy =
+      mateIn1Accuracy !== undefined && oneMoveAccuracy !== undefined
+        ? (mateIn1Accuracy + oneMoveAccuracy) / 2
+        : (mateIn1Accuracy ?? oneMoveAccuracy ?? 0)
+
+    const newSummary = {
+      id: summaryId,
+      provider,
+      model,
+      runsCompleted: (previous?.runsCompleted ?? 0) + 1,
+      lastRunAt: Date.now(),
+      mateIn1Accuracy,
+      oneMoveAccuracy,
+      overallAccuracy,
+    }
+    await streams.puzzleBenchmarkSummary.set('models', summaryId, newSummary)
+
+    return { status: 200, body: run }
+  } catch (error) {
+    logger.error('Puzzle benchmark failed', { error })
+    return {
+      status: 400,
+      body: { message: error instanceof Error ? error.message : 'Benchmark failed' },
+    }
+  }
+}
diff --git a/api/steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts b/api/steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts
new file mode 100644
index 0000000..0b0faaa
--- /dev/null
+++ b/api/steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts
@@ -0,0 +1,147 @@
+import { ApiRouteConfig, Handlers, Logger } from 'motia'
+import { z } from 'zod'
+import { PuzzleThemeSchema } from '@chessarena/types/puzzle-benchmark'
+import { getAllModels } from '../../services/ai/models'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { runPuzzleBenchmark } from '../../services/benchmark/run-puzzle-benchmark'
+import { fetchPuzzles } from '../../services/benchmark/fetch-lichess-puzzles'
+import { mapWithConcurrency, parsePositiveInt } from '../../services/benchmark/concurrency'
+
+const bodySchema = z.object({
+  theme: PuzzleThemeSchema, // Selects the stored puzzle set and which accuracy field is updated
+  count: z.number().min(1).max(100).default(10), // Puzzles wanted per model: 1-100, defaults to 10
+  rerunCompleted: z.boolean().default(false), // When false, models that already have an accuracy for the theme are skipped
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'RunAllPuzzleBenchmarks',
+  description: 'Fetch a puzzle set (if needed) and run puzzle benchmark for all models',
+  path: '/benchmark/puzzles/run-all',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: z.object({
+      message: z.string(),
+      theme: PuzzleThemeSchema,
+      puzzleCount: z.number(),
+      totalModels: z.number(), // Number of models selected for benchmarking after the skip filter
+    }),
+    400: z.object({ message: z.string() }), // No puzzles available even after fetching
+  },
+}
+
+// Reports whether `existing` already records a numeric accuracy for `theme` (that run can be skipped).
+type ThemeAccuracies = { mateIn1Accuracy?: unknown; oneMoveAccuracy?: unknown }
+const shouldSkipModel = (theme: z.infer<typeof PuzzleThemeSchema>, existing: ThemeAccuracies | undefined): boolean => {
+  if (theme === 'mateIn1') return typeof existing?.mateIn1Accuracy === 'number'
+  return theme === 'oneMove' && typeof existing?.oneMoveAccuracy === 'number'
+}
+
+/**
+ * Handles POST /benchmark/puzzles/run-all: tops the stored puzzle set for `theme` up to `count`
+ * puzzles when needed, then benchmarks every model (models that already have an accuracy for the
+ * theme are skipped unless `rerunCompleted`), storing each run and rewriting each model's summary.
+ * The response's `puzzleCount` is the number of puzzles actually run, which can be below `count`.
+ */
+export const handler: Handlers['RunAllPuzzleBenchmarks'] = async (req, { logger, streams }) => {
+  const theme = req.body.theme
+  const count = req.body.count ?? 10
+  const rerunCompleted = req.body.rerunCompleted
+
+  let puzzleSet = await streams.puzzleSet.get('sets', theme)
+  if (!puzzleSet || puzzleSet.puzzles.length < count) {
+    const needed = puzzleSet ? Math.max(0, count - puzzleSet.puzzles.length) : count
+    const fetched = needed > 0 ? await fetchPuzzles(theme, needed, logger as Logger) : []
+    const existingIds = new Set<string>(puzzleSet?.puzzles.map((p) => p.id) ?? [])
+    const newUnique = fetched.filter((p) => !existingIds.has(p.id))
+    const puzzles = puzzleSet ? [...puzzleSet.puzzles, ...newUnique] : newUnique
+    if (puzzles.length === 0) {
+      return { status: 400, body: { message: 'Failed to fetch any puzzles' } }
+    }
+    puzzleSet = {
+      id: `${theme}-${Date.now()}`,
+      theme,
+      createdAt: Date.now(),
+      puzzles,
+      count: puzzles.length,
+    }
+    await streams.puzzleSet.set('sets', theme, puzzleSet)
+  }
+  // Read the id while `puzzleSet` is narrowed; inside the callbacks below the reassigned `let` is not.
+  const puzzleSetId = puzzleSet.id
+
+  const uniqueById = new Map<string, (typeof puzzleSet.puzzles)[number]>()
+  for (const p of puzzleSet.puzzles) uniqueById.set(p.id, p)
+  const puzzlesToRun = Array.from(uniqueById.values()).slice(0, count)
+
+  logger.info('RunAllPuzzleBenchmarks starting', {
+    theme,
+    requestedCount: count,
+    availableInSet: puzzleSet.puzzles.length,
+    uniqueAvailable: uniqueById.size,
+    using: puzzlesToRun.length,
+    rerunCompleted,
+  })
+
+  const allModels = getAllModels()
+  const existingSummaries = await streams.puzzleBenchmarkSummary.getGroup('models')
+  const existingMap = new Map(existingSummaries.map((s) => [`${s.provider}:${s.model}`, s]))
+
+  const modelsToBenchmark = rerunCompleted
+    ? allModels
+    : allModels.filter(({ provider, model }) => !shouldSkipModel(theme, existingMap.get(`${provider}:${model}`)))
+
+  const providerConcurrency = parsePositiveInt(process.env.BENCHMARK_PROVIDER_CONCURRENCY, 4)
+  const modelConcurrencyPerProvider = parsePositiveInt(process.env.BENCHMARK_MODEL_CONCURRENCY_PER_PROVIDER, 1)
+
+  const modelsByProvider = modelsToBenchmark.reduce<
+    Record<AiModelProvider, { provider: AiModelProvider; model: string }[]>
+  >(
+    (acc, entry) => {
+      acc[entry.provider].push(entry)
+      return acc
+    },
+    { openai: [], gemini: [], claude: [], grok: [] },
+  )
+  const providers: AiModelProvider[] = ['openai', 'gemini', 'claude', 'grok']
+
+  await mapWithConcurrency(providers, providerConcurrency, async (provider) => {
+    const models = modelsByProvider[provider]
+    await mapWithConcurrency(models, modelConcurrencyPerProvider, async ({ model }) => {
+      logger.info(`\n=== PUZZLES MODEL: ${provider}/${model} (${theme}) ===`)
+      const run = await runPuzzleBenchmark(puzzlesToRun, puzzleSetId, theme, provider, model, logger)
+      await streams.puzzleBenchmark.set('runs', run.id, run)
+      const summaryId = `${provider}:${model}`
+      const existingSummary = await streams.puzzleBenchmarkSummary.get('models', summaryId)
+      const newSummary = {
+        id: summaryId,
+        provider,
+        model,
+        runsCompleted: (existingSummary?.runsCompleted ?? 0) + 1,
+        lastRunAt: Date.now(),
+        mateIn1Accuracy: theme === 'mateIn1' ? run.accuracy : existingSummary?.mateIn1Accuracy,
+        oneMoveAccuracy: theme === 'oneMove' ? run.accuracy : existingSummary?.oneMoveAccuracy,
+        overallAccuracy: 0,
+      }
+      if (newSummary.mateIn1Accuracy !== undefined && newSummary.oneMoveAccuracy !== undefined) {
+        newSummary.overallAccuracy = (newSummary.mateIn1Accuracy + newSummary.oneMoveAccuracy) / 2
+      } else {
+        newSummary.overallAccuracy = newSummary.mateIn1Accuracy ?? newSummary.oneMoveAccuracy ?? 0
+      }
+      await streams.puzzleBenchmarkSummary.set('models', summaryId, newSummary)
+    })
+  })
+
+  return {
+    status: 200,
+    body: {
+      message: `Puzzle benchmark completed for ${modelsToBenchmark.length} models`,
+      theme,
+      puzzleCount: puzzlesToRun.length,
+      totalModels: modelsToBenchmark.length,
+    },
+  }
+}
diff --git a/api/steps/benchmark/07-get-puzzle-leaderboard.step.ts b/api/steps/benchmark/07-get-puzzle-leaderboard.step.ts
new file mode 100644
index 0000000..070f8e7
--- /dev/null
+++ b/api/steps/benchmark/07-get-puzzle-leaderboard.step.ts
@@ -0,0 +1,29 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { PuzzleBenchmarkSummarySchema } from '@chessarena/types/puzzle-benchmark'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetPuzzleLeaderboard',
+  description: 'Get puzzle benchmark leaderboard',
+  path: '/benchmark/puzzles/leaderboard',
+  method: 'GET',
+  emits: [],
+  flows: ['benchmark'],
+  responseSchema: {
+    200: z.object({
+      leaderboard: z.array(PuzzleBenchmarkSummarySchema), // Per-model summaries, highest overallAccuracy first
+    }),
+  },
+}
+
+/**
+ * Handles GET /benchmark/puzzles/leaderboard: all per-model summaries, highest overallAccuracy first.
+ */
+export const handler: Handlers['GetPuzzleLeaderboard'] = async (req, { logger, streams }) => {
+  logger.info('Fetching puzzle leaderboard')
+  const summaries = await streams.puzzleBenchmarkSummary.getGroup('models')
+  // Sort a copy: sort() reorders in place and would otherwise mutate the array the stream returned.
+  const leaderboard = [...summaries].sort((a, b) => (b.overallAccuracy ?? 0) - (a.overallAccuracy ?? 0))
+  return { status: 200, body: { leaderboard } }
+}
diff --git a/api/steps/benchmark/08-get-puzzle-sets.step.ts b/api/steps/benchmark/08-get-puzzle-sets.step.ts
new file mode 100644
index 0000000..7c1f6c5
--- /dev/null
+++ b/api/steps/benchmark/08-get-puzzle-sets.step.ts
@@ -0,0 +1,35 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { PuzzleSetSchema } from '@chessarena/types/puzzle-benchmark'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetPuzzleSets',
+  description: 'Get all cached puzzle sets',
+  path: '/benchmark/puzzles/sets',
+  method: 'GET',
+  emits: [],
+  flows: ['benchmark'],
+  responseSchema: {
+    200: z.object({
+      sets: z.array(
+        PuzzleSetSchema.omit({ puzzles: true }).extend({
+          puzzleCount: z.number(), // Length of the omitted puzzles array (0 when absent)
+        }),
+      ),
+    }),
+  },
+}
+
+/**
+ * Handles GET /benchmark/puzzles/sets: every stored set, with `puzzles` swapped for a `puzzleCount`.
+ */
+export const handler: Handlers['GetPuzzleSets'] = async (req, { logger, streams }) => {
+  logger.info('Fetching puzzle sets')
+  const storedSets = await streams.puzzleSet.getGroup('sets')
+  const sets = storedSets.map((storedSet) => {
+    const { puzzles, ...summary } = storedSet
+    return { ...summary, puzzleCount: puzzles?.length ?? 0 }
+  })
+  return { status: 200, body: { sets } }
+}
diff --git a/api/steps/benchmark/09-run-all-benchmarks.step.ts b/api/steps/benchmark/09-run-all-benchmarks.step.ts
new file mode 100644
index 0000000..142c7d4
--- /dev/null
+++ b/api/steps/benchmark/09-run-all-benchmarks.step.ts
@@ -0,0 +1,196 @@
+import { ApiRouteConfig, Handlers, Logger } from 'motia'
+import { z } from 'zod'
+import { generateTestPositions, runLegalMoveBenchmark } from '../../services/benchmark/run-legal-move-benchmark'
+import { makeBenchmarkPrompt } from '../../services/benchmark/benchmark-prompt'
+import { getAllModels } from '../../services/ai/models'
+import { AiModelProvider } from '@chessarena/types/ai-models'
+import { TestPosition, ModelBenchmarkResult, LegalMoveBenchmarkRun } from '@chessarena/types/legal-move-benchmark'
+import fs from 'fs'
+import path from 'path'
+import mustache from 'mustache'
+import { mapWithConcurrency, parsePositiveInt } from '../../services/benchmark/concurrency'
+
+const promptTemplate = fs.readFileSync(path.join(__dirname, '../chess/legal-move-benchmark.mustache'), 'utf8') // read synchronously, once, at module load
+
+const bodySchema = z.object({
+  positionCount: z.number().min(1).max(50).default(20), // passed to generateTestPositions when a position set is (re)generated
+  force: z.boolean().default(false), // true: regenerate the stored 'default' position set even if one exists
+  rerunCompleted: z.boolean().default(false), // true: also benchmark models that already have a summary entry
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'RunAllBenchmarks',
+  description: 'Run legal move benchmark for ALL models - one per provider in parallel',
+  path: '/benchmark/legal-moves/run-all',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: z.object({
+      message: z.string(),
+      positionCount: z.number(), // size of the position set actually used; may differ from the requested count
+      totalModels: z.number(), // number of models selected for this run
+    }),
+    400: z.object({ message: z.string() }), // declared, but no branch of the handler in this file returns 400
+  },
+}
+
+// F1 (0-100) of the model's distinct moves vs. the legal moves; repeats are collapsed so they cannot inflate the score.
+const calculateScore = (legalMoves: string[], modelMoves: string[]) => {
+  const legalSet = new Set(legalMoves)
+  const answeredSet = new Set(modelMoves) // built once instead of once per legal move
+  const correct = Array.from(answeredSet).filter((m) => legalSet.has(m))
+  const illegal = Array.from(answeredSet).filter((m) => !legalSet.has(m))
+  const missed = Array.from(legalSet).filter((m) => !answeredSet.has(m))
+
+  const recall = legalSet.size > 0 ? (correct.length / legalSet.size) * 100 : 0
+  const precision = answeredSet.size > 0 ? (correct.length / answeredSet.size) * 100 : 0
+  const finalScore = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0
+  return { correct, illegal, missed, accuracy: recall, penalty: 100 - precision, finalScore } // accuracy = recall, penalty = 100 - precision
+}
+
+// Runs one position through one model and scores the answer. NOTE(review): nothing in this file calls it — the handler uses runLegalMoveBenchmark.
+const benchmarkSinglePosition = async (
+  position: TestPosition,
+  provider: AiModelProvider,
+  model: string,
+  logger: Logger,
+): Promise<ModelBenchmarkResult> => {
+  const prompt = mustache.render(
+    promptTemplate,
+    { pgn: position.pgn, fen: position.fen, turn: position.turn.toUpperCase() },
+    {},
+    { escape: (v: string) => v }, // identity escape: values are inserted into the prompt verbatim
+  )
+
+  const startTime = Date.now()
+  let rawResponse = ''
+  let modelMoves: string[] = []
+  let error: string | undefined
+
+  try {
+    const response = await makeBenchmarkPrompt({ prompt, provider, model, logger })
+    rawResponse = response.rawResponse
+    modelMoves = response.moves
+  } catch (e) {
+    error = e instanceof Error ? e.message : 'Unknown error' // the failure is recorded in the result, not rethrown
+  }
+
+  const responseTime = Date.now() - startTime // milliseconds; measured for failed calls too
+  const { correct, illegal, missed, accuracy, penalty, finalScore } = calculateScore(position.legalMoves, modelMoves) // after a failure modelMoves is still []
+
+  return {
+    positionId: position.id,
+    modelMoves,
+    correctMoves: correct,
+    illegalMoves: illegal,
+    missedMoves: missed,
+    accuracy,
+    penalty,
+    finalScore,
+    responseTime,
+    rawResponse,
+    error,
+  }
+}
+
+export const handler: Handlers['RunAllBenchmarks'] = async (req, { logger, streams }) => {
+  const { positionCount, force, rerunCompleted } = req.body
+  // Runs the legal-move benchmark for every selected model and responds only once all runs have finished.
+  logger.info('=== STARTING FULL BENCHMARK ===', { positionCount, force })
+
+  // Reuse the stored 'default' position set; generate and store a new one when it is missing, empty, or `force` is set
+  let positionSet = await streams.positionSet.get('sets', 'default')
+  if (!positionSet || positionSet.positions.length === 0 || force) {
+    logger.info('Generating positions...', { count: positionCount })
+    const positions = generateTestPositions({ count: positionCount })
+    positionSet = {
+      id: `positions-${Date.now()}`,
+      createdAt: Date.now(),
+      count: positions.length,
+      positions,
+    }
+    await streams.positionSet.set('sets', 'default', positionSet)
+    logger.info('Positions generated', { count: positions.length })
+  }
+
+  const positions = positionSet.positions // when an existing set is reused, `positionCount` has no effect
+
+  // Get all models to benchmark using the helper function
+  const allModels = getAllModels()
+  // Models that already have a summary entry are skipped unless `rerunCompleted` is set
+  const existingSummaries = await streams.legalMoveBenchmarkSummary.getGroup('models')
+  const completedSet = new Set(existingSummaries.map((s) => `${s.provider}:${s.model}`))
+
+  const modelsToBenchmark = rerunCompleted
+    ? allModels
+    : allModels.filter((m) => !completedSet.has(`${m.provider}:${m.model}`))
+
+  const totalModels = modelsToBenchmark.length
+  logger.info('Models to benchmark', { totalModels })
+
+  const runAllBenchmarks = async () => {
+    const providerConcurrency = parsePositiveInt(process.env.BENCHMARK_PROVIDER_CONCURRENCY, 4) // presumably 4 is the fallback — confirm in parsePositiveInt
+    const modelConcurrencyPerProvider = parsePositiveInt(process.env.BENCHMARK_MODEL_CONCURRENCY_PER_PROVIDER, 1)
+    // Group the selected models by provider; assumes every entry.provider is one of the four keys seeded below
+    const modelsByProvider = modelsToBenchmark.reduce<
+      Record<AiModelProvider, { provider: AiModelProvider; model: string }[]>
+    >(
+      (acc, entry) => {
+        acc[entry.provider].push(entry)
+        return acc
+      },
+      { openai: [], gemini: [], claude: [], grok: [] },
+    )
+
+    const providers: AiModelProvider[] = ['openai', 'gemini', 'claude', 'grok']
+
+    await mapWithConcurrency(providers, providerConcurrency, async (provider) => {
+      const models = modelsByProvider[provider]
+      await mapWithConcurrency(models, modelConcurrencyPerProvider, async ({ model }) => {
+        logger.info(`\n=== MODEL: ${provider}/${model} ===`)
+
+        const run = await runLegalMoveBenchmark(positions, provider, model, logger)
+        // Every run is stored, whatever its status
+        await streams.legalMoveBenchmark.set('runs', run.id, run)
+        // Only completed runs that carry a score are folded into the per-model summary (running mean, best, worst)
+        const key = `${provider}:${model}`
+        if (run.status === 'completed' && run.averageFinalScore !== undefined) {
+          const existing = await streams.legalMoveBenchmarkSummary.get('models', key)
+          await streams.legalMoveBenchmarkSummary.set('models', key, {
+            id: key,
+            provider,
+            model,
+            runsCompleted: (existing?.runsCompleted ?? 0) + 1,
+            averageScore: existing
+              ? (existing.averageScore * existing.runsCompleted + run.averageFinalScore) / (existing.runsCompleted + 1)
+              : run.averageFinalScore,
+            bestScore: Math.max(existing?.bestScore ?? 0, run.averageFinalScore),
+            worstScore: existing ? Math.min(existing.worstScore, run.averageFinalScore) : run.averageFinalScore,
+            lastRunAt: Date.now(),
+          })
+        } else {
+          logger.warn('Skipping summary update for failed run', { provider, model, runId: run.id })
+        }
+      })
+    })
+
+    logger.info('\n\n========================================')
+    logger.info('=== ALL BENCHMARKS COMPLETED ===')
+    logger.info('========================================\n')
+  }
+
+  // The benchmarks are awaited before responding (fire-and-forget was causing issues)
+  await runAllBenchmarks()
+
+  return {
+    status: 200,
+    body: {
+      message: `Benchmark completed for ${totalModels} models`,
+      positionCount: positions.length,
+      totalModels,
+    },
+  }
+}
diff --git a/api/steps/benchmark/10-run-stockfish-benchmark.step.ts b/api/steps/benchmark/10-run-stockfish-benchmark.step.ts
new file mode 100644
index 0000000..54ecd53
--- /dev/null
+++ b/api/steps/benchmark/10-run-stockfish-benchmark.step.ts
@@ -0,0 +1,133 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { AiModelProviderSchema } from '@chessarena/types/ai-models'
+import { StockfishBenchmarkRunSchema } from '@chessarena/types/stockfish-benchmark'
+import { playGameAgainstStockfish } from '../../services/benchmark/stockfish-game'
+import { getModelsForProvider } from '../../services/ai/models'
+
+const bodySchema = z.object({
+  provider: AiModelProviderSchema(),
+  model: z.string(), // checked against getModelsForProvider(provider) in the handler
+  stockfishLevel: z.number().min(1).max(20).default(10), // forwarded unchanged to playGameAgainstStockfish
+})
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'RunStockfishBenchmark',
+  description: 'Run Stockfish benchmark (2 games: one as white, one as black)',
+  path: '/benchmark/stockfish/run',
+  method: 'POST',
+  emits: [],
+  flows: ['benchmark'],
+  bodySchema,
+  responseSchema: {
+    200: StockfishBenchmarkRunSchema, // also the response when the run ends with status 'failed'
+    400: z.object({ message: z.string() }), // the model is not supported for the requested provider
+  },
+}
+
+// Plays the model against Stockfish twice (as White, then Black), stores the run and folds it into the model's summary.
+export const handler: Handlers['RunStockfishBenchmark'] = async (req, { logger, streams }) => {
+  const { provider, model, stockfishLevel } = req.body
+
+  // Validate model
+  const supportedModels = getModelsForProvider(provider)
+  if (!supportedModels.includes(model)) {
+    return { status: 400, body: { message: `Model ${model} is not supported for provider ${provider}` } }
+  }
+
+  const runId = crypto.randomUUID()
+  logger.info('=== STARTING STOCKFISH BENCHMARK ===', { provider, model, stockfishLevel })
+
+  // Typed from the run schema: an inferred literal pins `status` to 'running' and has no game/ACPL/completedAt members
+  const run: z.infer<typeof StockfishBenchmarkRunSchema> = {
+    id: runId,
+    createdAt: Date.now(),
+    status: 'running',
+    provider,
+    model,
+    stockfishLevel,
+    gamesPlayed: 0,
+    wins: 0,
+    losses: 0,
+    draws: 0,
+  }
+
+  // Both games run inside the request; the response is sent only after the whole benchmark has finished
+  const runBenchmark = async () => {
+    try {
+      // Game 1: AI plays as White
+      logger.info('\n=== GAME 1: AI as WHITE ===')
+      const gameAsWhite = await playGameAgainstStockfish(provider, model, 'white', stockfishLevel, logger)
+      run.gameAsWhite = gameAsWhite
+      run.gamesPlayed++
+      if (gameAsWhite.result === 'ai_win') run.wins++
+      else if (gameAsWhite.result === 'stockfish_win') run.losses++
+      else run.draws++
+
+      logger.info(`Game 1 result: ${gameAsWhite.result}, ACPL: ${gameAsWhite.averageCentipawnLoss?.toFixed(1)}`)
+
+      // Small delay between games
+      await new Promise((r) => setTimeout(r, 2000))
+
+      // Game 2: AI plays as Black
+      logger.info('\n=== GAME 2: AI as BLACK ===')
+      const gameAsBlack = await playGameAgainstStockfish(provider, model, 'black', stockfishLevel, logger)
+      run.gameAsBlack = gameAsBlack
+      run.gamesPlayed++
+      if (gameAsBlack.result === 'ai_win') run.wins++
+      else if (gameAsBlack.result === 'stockfish_win') run.losses++
+      else run.draws++
+
+      logger.info(`Game 2 result: ${gameAsBlack.result}, ACPL: ${gameAsBlack.averageCentipawnLoss?.toFixed(1)}`)
+
+      // Overall ACPL = total centipawn loss / total AI moves across both games (0 when the AI made no moves)
+      const totalMoves = (gameAsWhite.aiMoveCount ?? 0) + (gameAsBlack.aiMoveCount ?? 0)
+      const totalLoss = (gameAsWhite.totalCentipawnLoss ?? 0) + (gameAsBlack.totalCentipawnLoss ?? 0)
+      const overallAcpl = totalMoves > 0 ? totalLoss / totalMoves : 0
+      run.overallAcpl = overallAcpl
+      run.status = 'completed'
+      run.completedAt = Date.now()
+
+      // Store run
+      await streams.stockfishBenchmark.set('runs', runId, run)
+
+      // Update summary: running mean of ACPL, lowest ACPL seen so far, cumulative W/L/D
+      const summaryId = `${provider}:${model}`
+      const existing = await streams.stockfishBenchmarkSummary.get('models', summaryId)
+      await streams.stockfishBenchmarkSummary.set('models', summaryId, {
+        id: summaryId,
+        provider,
+        model,
+        runsCompleted: (existing?.runsCompleted ?? 0) + 1,
+        averageAcpl: existing
+          ? (existing.averageAcpl * existing.runsCompleted + overallAcpl) / (existing.runsCompleted + 1)
+          : overallAcpl,
+        bestAcpl: Math.min(existing?.bestAcpl ?? Infinity, overallAcpl),
+        wins: (existing?.wins ?? 0) + run.wins,
+        losses: (existing?.losses ?? 0) + run.losses,
+        draws: (existing?.draws ?? 0) + run.draws,
+        lastRunAt: Date.now(),
+      })
+
+      logger.info('\n========================================')
+      logger.info('=== STOCKFISH BENCHMARK COMPLETED ===')
+      logger.info(
+        `${provider}/${model}: Overall ACPL=${overallAcpl.toFixed(1)}, W/L/D: ${run.wins}/${run.losses}/${run.draws}`,
+      )
+      logger.info('========================================\n')
+    } catch (error) {
+      run.status = 'failed'
+      logger.error('Stockfish benchmark failed', { runId, error: error instanceof Error ? error.message : String(error) })
+      // Keep a record of the failed attempt (previously only successful runs were ever stored)
+      try {
+        await streams.stockfishBenchmark.set('runs', runId, run)
+      } catch (persistError) {
+        logger.error('Failed to store failed Stockfish benchmark run', { runId, persistError: String(persistError) })
+      }
+    }
+  }
+
+  await runBenchmark()
+  return { status: 200, body: run }
+}
diff --git a/api/steps/benchmark/11-stockfish-leaderboard.step.ts b/api/steps/benchmark/11-stockfish-leaderboard.step.ts
new file mode 100644
index 0000000..1acdea7
--- /dev/null
+++ b/api/steps/benchmark/11-stockfish-leaderboard.step.ts
@@ -0,0 +1,29 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { StockfishBenchmarkSummarySchema } from '@chessarena/types/stockfish-benchmark'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetStockfishLeaderboard',
+  description: 'Get Stockfish benchmark leaderboard sorted by ACPL',
+  path: '/benchmark/stockfish/leaderboard',
+  method: 'GET',
+  emits: [], // this route declares no emitted topics
+  flows: ['benchmark'],
+  responseSchema: {
+    200: z.object({
+      leaderboard: z.array(StockfishBenchmarkSummarySchema), // ascending averageAcpl: lower is better
+    }),
+  },
+}
+
+// Returns every summary in the 'models' group, lowest average centipawn loss first.
+export const handler: Handlers['GetStockfishLeaderboard'] = async (req, { logger, streams }) => {
+  logger.info('Fetching Stockfish leaderboard')
+
+  const summaries = await streams.stockfishBenchmarkSummary.getGroup('models')
+
+  // Sort a copy (Array.prototype.sort works in place); lower ACPL is better, a missing value ranks last.
+  const sorted = [...summaries].sort((a, b) => (a.averageAcpl ?? Infinity) - (b.averageAcpl ?? Infinity))
+  return { status: 200, body: { leaderboard: sorted } }
+}
diff --git a/api/steps/chess/01-create-game.step.ts b/api/steps/chess/01-create-game.step.ts
index 6532b42..db1c613 100644
--- a/api/steps/chess/01-create-game.step.ts
+++ b/api/steps/chess/01-create-game.step.ts
@@ -1,5 +1,5 @@
 import { AiModelProviderSchema } from '@chessarena/types/ai-models'
-import { GameSchema, Player } from '@chessarena/types/game'
+import { GameSchema, Player, BenchmarkVariantSchema } from '@chessarena/types/game'
 import { ApiRouteConfig, Handlers } from 'motia'
 import { RefinementCtx, z } from 'zod'
 import { supportedModelsByProvider } from '../../services/ai/models'
@@ -52,6 +52,7 @@ const bodySchema = z.object({
     white: playerSchema(),
     black: playerSchema(),
   }),
+  variant: BenchmarkVariantSchema.default('guided'),
 })
 
 export const config: ApiRouteConfig = {
@@ -92,9 +93,10 @@ export const handler: Handlers['CreateGame'] = async (req, { logger, emit, state
     return { status: 400, body: { message: 'Invalid request body', errors: validationResult.error.errors } }
   }
 
-  const game = await createGame(req.body.players, streams, logger, user)
+  const variant = req.body.variant ?? 'guided'
+  const game = await createGame(req.body.players, streams, logger, user, variant)
 
-  logger.info('[CreateGame] Game created', { gameId: game.id })
+  logger.info('[CreateGame] Game created', { gameId: game.id, variant })
 
   await emit({
     topic: 'chess-game-created',
diff --git a/api/steps/chess/05-ai-player-unguided.mustache b/api/steps/chess/05-ai-player-unguided.mustache
new file mode 100644
index 0000000..d9b3a27
--- /dev/null
+++ b/api/steps/chess/05-ai-player-unguided.mustache
@@ -0,0 +1,63 @@
+You are a chess grandmaster playing as {{player}}.
+
+## Current Position
+- FEN: `{{fen}}`.
+{{#inCheck}}- WARNING: You are in check! You MUST get out of check.{{/inCheck}}
+
+## Your Task
+Analyze the position and determine your best legal move. You must figure out which moves are legal based on standard chess rules.
+
+{{#lastInvalidMove}}
+ILLEGAL MOVE: {{lastInvalidMove}} was REJECTED because it is not a legal move in this position.
+INSTRUCTION: Carefully analyze the board position and choose a LEGAL move according to chess rules.
+{{/lastInvalidMove}}
+
+Three illegal moves end the match instantly.
+
+## Required Response Format
+
+Your response must be valid JSON with exactly two fields:
+{
+  "thought": "Your strategic reasoning (1-2 sentences)",
+  "moveSan": "YOUR_CHOSEN_MOVE_IN_SAN"
+}
+
+**Steps to follow:**
+1. Analyze the current board position from the FEN
+2. Determine all legal moves for your pieces
+3. Evaluate candidate moves strategically (see guidelines below)
+4. Select the strongest legal move
+5. Express your move in Standard Algebraic Notation (SAN)
+6. Respond ONLY with the JSON—no additional text
+
+**Move notation examples:**
+- Pawn moves: e4, d5, exd5 (capture)
+- Piece moves: Nf3, Bb5, Qd1
+- Castling: O-O (kingside), O-O-O (queenside)
+- Promotion: e8=Q
+- Check: Bb5+
+
+## Strategy Guidelines
+When evaluating moves, consider:
+- **Tactical motifs**: Recognize forks, pins, skewers, discovered attacks, and other tactical patterns
+- **Position assessment**:
+  - Material balance
+  - King safety
+  - Pawn structure
+  - Piece activity and coordination
+  - Weaknesses (backward pawns, weak squares, isolated pawns)
+- **Candidate move comparison**: Visualize resulting positions for multiple moves—don't settle for the first good-looking option
+- **Long-term planning**: Build plans to win the game, not just reactive moves
+  - Improve your worst-placed piece
+  - Control key files and diagonals
+  - Create favorable imbalances
+  - Transition to winning endgames
+- **Safety checks**: Ensure your move doesn't expose your pieces to capture unless you gain a tactical advantage
+
+## Critical Rules
+1. Your move MUST be legal according to standard chess rules
+2. Your response must be ONLY valid JSON with "thought" and "moveSan" fields
+3. Double-check that your piece can actually make the move you're choosing
+4. Any illegal move will be rejected and count toward your three-strike limit
+
+Choose your best move now.
diff --git a/api/steps/chess/05-ai-player.step.ts b/api/steps/chess/05-ai-player.step.ts
index d9df264..5d0e08d 100644
--- a/api/steps/chess/05-ai-player.step.ts
+++ b/api/steps/chess/05-ai-player.step.ts
@@ -25,10 +25,11 @@ export const config: EventConfig = {
     check: z.boolean({ description: 'Whether the move is a check' }),
     gameId: z.string({ description: 'The ID of the game' }),
   }),
-  includeFiles: ['05-ai-player.mustache'],
+  includeFiles: ['05-ai-player.mustache', '05-ai-player-unguided.mustache'],
 }
 
-const template = fs.readFileSync(path.join(__dirname, '05-ai-player.mustache'), 'utf8')
+const guidedTemplate = fs.readFileSync(path.join(__dirname, '05-ai-player.mustache'), 'utf8')
+const unguidedTemplate = fs.readFileSync(path.join(__dirname, '05-ai-player-unguided.mustache'), 'utf8')
 
 export const handler: Handlers['AI_Player'] = async (input, { logger, emit, streams }) => {
   logger.info('Received ai-move event', { gameId: input.gameId })
@@ -63,21 +64,20 @@ export const handler: Handlers['AI_Player'] = async (input, { logger, emit, stre
       timestamp: Date.now(),
     })
 
-    const prompt = mustache.render(
-      template,
-      {
-        fenBefore: input.fenBefore,
-        fen: input.fen,
-        inCheck: input.check,
-        player: input.player,
-        lastInvalidMove,
-        validMoves,
-        totalMoves: validMoves.length,
-      },
-      {},
-      { escape: (value: string) => value },
-    )
-    logger.info('Prompt', { prompt })
+    // Arena games always run guided. Rule understanding is benchmarked separately via legal-move bench.
+    const template = guidedTemplate
+    const templateData = {
+      fenBefore: input.fenBefore,
+      fen: input.fen,
+      inCheck: input.check,
+      player: input.player,
+      lastInvalidMove,
+      validMoves,
+      totalMoves: validMoves.length,
+    }
+
+    const prompt = mustache.render(template, templateData, {}, { escape: (value: string) => value })
+    logger.info('Prompt', { prompt, variant: game.variant })
 
     let action: AiPlayerPrompt | undefined
 
diff --git a/api/steps/chess/08-game-ended.step.ts b/api/steps/chess/08-game-ended.step.ts
index e68e051..8f9bde5 100644
--- a/api/steps/chess/08-game-ended.step.ts
+++ b/api/steps/chess/08-game-ended.step.ts
@@ -2,8 +2,10 @@ import { EventConfig, Handlers } from 'motia'
 import { z } from 'zod'
 import { models } from '../../services/ai/models'
 import { generateGameScore } from '../../services/chess/generate-game-score'
+import { generatePgn } from '../../services/chess/generate-pgn'
 import { Scoreboard } from '@chessarena/types/game'
 import { Leaderboard } from '@chessarena/types/leaderboard'
+import { GameHistory } from '@chessarena/types/game-history'
 import { isAiGame } from '../../services/chess/utils'
 
 /*
@@ -37,10 +39,48 @@ export const handler: Handlers['GameEnded'] = async (input, { logger, streams })
   }
 
   const moves = await streams.chessGameMove.getGroup(input.gameId)
+  const messages = await streams.chessGameMessage.getGroup(input.gameId)
   const scoreboard = generateGameScore(moves, game)
 
   await streams.chessGame.set('game', game.id, { ...game, scoreboard })
 
+  // Archive game to history
+  const endedAt = Date.now()
+  const startedAt = game.createdAt ?? endedAt
+  const pgn = generatePgn({ game, moves })
+
+  const gameHistory: GameHistory = {
+    id: game.id,
+    startedAt,
+    endedAt,
+    duration: endedAt - startedAt,
+    whitePlayer: {
+      provider: game.players.white.ai,
+      model: game.players.white.model,
+      isHuman: !game.players.white.ai,
+    },
+    blackPlayer: {
+      provider: game.players.black.ai,
+      model: game.players.black.model,
+      isHuman: !game.players.black.ai,
+    },
+    status: game.status === 'pending' ? 'completed' : game.status,
+    winner: game.winner,
+    endGameReason: game.endGameReason,
+    variant: game.variant ?? 'guided',
+    totalMoves: moves.length,
+    whiteIllegalMoves: game.players.white.illegalMoveAttempts ?? 0,
+    blackIllegalMoves: game.players.black.illegalMoveAttempts ?? 0,
+    finalFen: game.fen,
+    moves,
+    messages,
+    scoreboard,
+    pgn,
+  }
+
+  await streams.chessGameHistory.set('all', game.id, gameHistory)
+  logger.info('Game archived to history', { gameId: game.id })
+
   if (!isAiGame(game)) {
     return
   }
diff --git a/api/steps/chess/10-get-game-history.step.ts b/api/steps/chess/10-get-game-history.step.ts
new file mode 100644
index 0000000..07e3f88
--- /dev/null
+++ b/api/steps/chess/10-get-game-history.step.ts
@@ -0,0 +1,90 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { GameHistorySchema } from '@chessarena/types/game-history'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetGameHistory',
+  description: 'Get game history with optional filters',
+  path: '/chess/history',
+  method: 'GET',
+  emits: [],
+  flows: ['chess'],
+  queryParams: [
+    { name: 'provider', description: 'Filter by AI provider' }, // matches when either player has this provider
+    { name: 'model', description: 'Filter by model name' }, // matches when either player uses this model
+    { name: 'variant', description: 'Filter by game variant' },
+    { name: 'winner', description: 'Filter by winner' },
+    { name: 'status', description: 'Filter by game status' },
+    { name: 'startDate', description: 'Filter by start date (timestamp)' }, // lower bound on a game's startedAt
+    { name: 'endDate', description: 'Filter by end date (timestamp)' }, // upper bound on a game's endedAt
+    { name: 'limit', description: 'Pagination limit' }, // handler default: 50
+    { name: 'offset', description: 'Pagination offset' }, // handler default: 0
+  ],
+  responseSchema: {
+    200: z.object({
+      games: z.array(GameHistorySchema.omit({ moves: true, messages: true })), // list view: no moves/messages
+      total: z.number(), // number of matches before pagination
+      limit: z.number(),
+      offset: z.number(),
+    }),
+  },
+}
+
+// Parses a non-negative base-10 integer from a query value; missing, malformed or negative input yields `fallback`.
+const parseNonNegativeInt = (raw: string | undefined, fallback: number): number => {
+  const parsed = raw === undefined ? Number.NaN : Number.parseInt(raw, 10)
+  return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed
+}
+
+// Lists archived games matching the query filters, newest first, paginated and without moves/messages.
+export const handler: Handlers['GetGameHistory'] = async (req, { logger, streams }) => {
+  logger.info('[GetGameHistory] Fetching game history', { query: req.queryParams })
+
+  const params = req.queryParams as Record<string, string | undefined>
+  const { provider, model, status } = params
+  const variant = params.variant as 'guided' | 'unguided' | undefined
+  const winner = params.winner as 'white' | 'black' | undefined
+  // An unparsable date becomes NaN, which is falsy, so the corresponding filter is simply skipped
+  const startDate = params.startDate ? Number.parseInt(params.startDate, 10) : undefined
+  const endDate = params.endDate ? Number.parseInt(params.endDate, 10) : undefined
+  // Previously `limit=abc` or a negative offset reached slice() as NaN / a from-the-end index and leaked into the body
+  const limit = parseNonNegativeInt(params.limit, 50)
+  const offset = parseNonNegativeInt(params.offset, 0)
+
+  const allGames = await streams.chessGameHistory.getGroup('all')
+
+  // Apply filters; provider and model match when either side of the board matches
+  const filtered = allGames.filter((game) => {
+    if (provider) {
+      const matchesWhite = game.whitePlayer.provider === provider
+      const matchesBlack = game.blackPlayer.provider === provider
+      if (!matchesWhite && !matchesBlack) return false
+    }
+
+    if (model) {
+      const matchesWhite = game.whitePlayer.model === model
+      const matchesBlack = game.blackPlayer.model === model
+      if (!matchesWhite && !matchesBlack) return false
+    }
+
+    if (variant && game.variant !== variant) return false
+    if (winner && game.winner !== winner) return false
+    if (status && game.status !== status) return false
+    if (startDate && game.startedAt < startDate) return false
+    if (endDate && game.endedAt > endDate) return false
+    return true
+  })
+
+  // Sort by most recent first (`filtered` is a fresh array, so sorting it in place is safe)
+  filtered.sort((a, b) => b.endedAt - a.endedAt)
+
+  // `total` counts every match, not just the returned page
+  const total = filtered.length
+  const paginated = filtered.slice(offset, offset + limit)
+
+  // Remove heavy fields for list view
+  const games = paginated.map(({ moves, messages, ...rest }) => rest)
+  logger.info('[GetGameHistory] Returning games', { total, returned: games.length })
+  return { status: 200, body: { games, total, limit, offset } }
+}
diff --git a/api/steps/chess/10b-export-game-history.step.ts b/api/steps/chess/10b-export-game-history.step.ts
new file mode 100644
index 0000000..2c3ff7c
--- /dev/null
+++ b/api/steps/chess/10b-export-game-history.step.ts
@@ -0,0 +1,143 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'ExportGameHistory',
+  description: 'Export game history in JSON or CSV format',
+  path: '/chess/history/export',
+  method: 'GET',
+  emits: [],
+  flows: ['chess'],
+  queryParams: [
+    { name: 'provider', description: 'Filter by AI provider' }, // matches when either player has this provider
+    { name: 'model', description: 'Filter by model name' }, // matches when either player uses this model
+    { name: 'variant', description: 'Filter by game variant' },
+    { name: 'winner', description: 'Filter by winner' },
+    { name: 'status', description: 'Filter by game status' },
+    { name: 'startDate', description: 'Filter by start date (timestamp)' }, // lower bound on a game's startedAt
+    { name: 'endDate', description: 'Filter by end date (timestamp)' }, // upper bound on a game's endedAt
+    { name: 'format', description: 'Export format: json or csv' }, // the handler treats anything other than 'csv' as json
+  ],
+  responseSchema: {
+    200: z.any(), // CSV text or a JSON array, depending on `format`
+  },
+}
+
+// Renders one CSV cell: null/undefined become '', and text containing a comma, quote, CR or LF is quoted.
+const escapeCsvField = (value: string | number | undefined | null): string => {
+  if (value == null) return ''
+  const text = `${value}`
+  const needsQuoting = /[",\r\n]/.test(text)
+  // Inside the quoted form every embedded double quote is doubled.
+  return needsQuoting ? '"' + text.split('"').join('""') + '"' : text
+}
+
+// Exports the archived games that pass the query filters, newest first, as a file attachment.
+// format=csv yields one CSV row per game (including its PGN); anything else yields a JSON array in which the
+// move and message lists are replaced by their counts.
+export const handler: Handlers['ExportGameHistory'] = async (req, { logger, streams }) => {
+  logger.info('[ExportGameHistory] Exporting game history', { query: req.queryParams })
+
+  const query = req.queryParams as Record<string, string | undefined>
+  const provider = query.provider
+  const model = query.model
+  const status = query.status
+  const variant = query.variant as 'guided' | 'unguided' | undefined
+  const winner = query.winner as 'white' | 'black' | undefined
+  const startDate = query.startDate ? parseInt(query.startDate) : undefined
+  const endDate = query.endDate ? parseInt(query.endDate) : undefined
+  // Any value other than 'csv' (including none) takes the JSON path below
+  const format = (query.format as 'json' | 'csv') || 'json'
+
+  // All archived games are read from the 'all' group
+  const archive = await streams.chessGameHistory.getGroup('all')
+
+  // Keep a game only when it satisfies every filter that was supplied
+  const selected = archive.filter((entry) => {
+    const { whitePlayer, blackPlayer } = entry
+
+    // Provider and model filters accept a match on either side of the board
+    const providerMismatch = whitePlayer.provider !== provider && blackPlayer.provider !== provider
+    if (provider && providerMismatch) return false
+
+    const modelMismatch = whitePlayer.model !== model && blackPlayer.model !== model
+    if (model && modelMismatch) return false
+
+    if (variant && entry.variant !== variant) return false
+    if (winner && entry.winner !== winner) return false
+    if (status && entry.status !== status) return false
+
+    // Date bounds: the game must start on/after startDate and end on/before endDate
+    if (startDate && entry.startedAt < startDate) return false
+    if (endDate && entry.endedAt > endDate) return false
+
+    return true
+  })
+
+  // Newest games first
+  selected.sort((left, right) => right.endedAt - left.endedAt)
+
+  if (format !== 'csv') {
+    // JSON format - everything except the full moves/messages, which are reduced to counts
+    const exportData = selected.map((entry) => {
+      const { moves, messages, ...rest } = entry
+      return {
+        ...rest,
+        movesCount: moves.length,
+        messagesCount: messages.length,
+      }
+    })
+
+    return {
+      status: 200,
+      headers: {
+        'Content-Type': 'application/json',
+        'Content-Disposition': 'attachment; filename="chessarena-history.json"',
+      },
+      body: exportData,
+    }
+  }
+
+  // CSV format: each column pairs its header with the accessor that produces the cell value
+  type ArchivedGame = (typeof selected)[number]
+  type CsvValue = string | number | undefined | null
+  const columns: [string, (game: ArchivedGame) => CsvValue][] = [
+    ['id', (game) => game.id],
+    // Timestamps are written as ISO-8601 strings
+    ['startedAt', (game) => new Date(game.startedAt).toISOString()],
+    ['endedAt', (game) => new Date(game.endedAt).toISOString()],
+    ['duration', (game) => game.duration],
+    ['variant', (game) => game.variant],
+    ['status', (game) => game.status],
+    ['winner', (game) => game.winner],
+    ['endGameReason', (game) => game.endGameReason],
+    ['totalMoves', (game) => game.totalMoves],
+    // A side without a provider is exported as 'human'
+    ['whiteProvider', (game) => game.whitePlayer.provider || 'human'],
+    ['whiteModel', (game) => game.whitePlayer.model],
+    ['whiteIllegalMoves', (game) => game.whiteIllegalMoves],
+    ['blackProvider', (game) => game.blackPlayer.provider || 'human'],
+    ['blackModel', (game) => game.blackPlayer.model],
+    ['blackIllegalMoves', (game) => game.blackIllegalMoves],
+    ['pgn', (game) => game.pgn],
+  ]
+
+  // Header line first, then one escaped line per game
+  const headerLine = columns.map(([title]) => title).join(',')
+  const dataLines = selected.map((game) => {
+    const cells = columns.map(([, read]) => escapeCsvField(read(game)))
+    return cells.join(',')
+  })
+  // Lines are separated by a single '\n'
+  const csv = [headerLine, ...dataLines].join('\n')
+
+  return {
+    status: 200,
+    headers: {
+      'Content-Type': 'text/csv',
+      'Content-Disposition': 'attachment; filename="chessarena-history.csv"',
+    },
+    body: csv,
+  }
+}
diff --git a/api/steps/chess/11-get-game-history-detail.step.ts b/api/steps/chess/11-get-game-history-detail.step.ts
new file mode 100644
index 0000000..c699fdc
--- /dev/null
+++ b/api/steps/chess/11-get-game-history-detail.step.ts
@@ -0,0 +1,37 @@
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { GameHistorySchema } from '@chessarena/types/game-history'
+
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'GetGameHistoryDetail',
+  description: 'Get detailed game history including all moves and messages',
+  path: '/chess/history/:gameId', // the handler reads `gameId` from req.pathParams
+  method: 'GET',
+  emits: [],
+  flows: ['chess'],
+  responseSchema: {
+    200: GameHistorySchema, // the archived record as returned by the history stream
+    404: z.object({ message: z.string() }), // no archived game under that id
+  },
+}
+
+// Returns one archived game from the 'all' history group, or 404 when no entry exists for the id.
+export const handler: Handlers['GetGameHistoryDetail'] = async (req, { logger, streams }) => {
+  const gameId = req.pathParams.gameId
+  logger.info('[GetGameHistoryDetail] Fetching game', { gameId })
+
+  const archived = await streams.chessGameHistory.get('all', gameId)
+
+  // Happy path first: the stored record is returned as-is
+  if (archived) {
+    return {
+      status: 200,
+      body: archived,
+    }
+  }
+
+  logger.warn('[GetGameHistoryDetail] Game not found', { gameId })
+  const body = { message: 'Game not found in history' }
+  return { status: 404, body }
+}
diff --git a/api/steps/chess/12-play-vs-ai.step.ts b/api/steps/chess/12-play-vs-ai.step.ts
new file mode 100644
index 0000000..f736c01
--- /dev/null
+++ b/api/steps/chess/12-play-vs-ai.step.ts
@@ -0,0 +1,91 @@
+import { GameSchema } from '@chessarena/types/game'
+import { ApiRouteConfig, Handlers } from 'motia'
+import { z } from 'zod'
+import { createGame } from '../../services/chess/create-game'
+import { selectRandomAI } from '../../services/ai/random-ai-selection'
+import { auth } from '../middlewares/auth.middleware'
+import { UserState } from '../states/user-state'
+
+// Request body for PlayVsAI: which side the human wants ('white', 'black' or 'random').
+// The schema declares 'random' as the default for `playerColor`.
+const bodySchema = z.object({ playerColor: z.enum(['white', 'black', 'random']).default('random') })
+
+// Route definition for PlayVsAI: POST /chess/play-vs-ai in the 'chess' flow.
+// Declares the request/response schemas, the auth middleware and the emitted topic.
+export const config: ApiRouteConfig = {
+  type: 'api',
+  name: 'PlayVsAI',
+  flows: ['chess'],
+  method: 'POST',
+  path: '/chess/play-vs-ai',
+  description: 'Start a game against a randomly selected AI opponent',
+  middleware: [auth({ required: true })], // auth middleware, configured as required
+  bodySchema,
+  emits: ['chess-game-created'], // topic the handler emits once the game exists
+  responseSchema: {
+    // 200: the created game, the selected opponent, and the concrete colour the human plays
+    200: z.object({
+      game: GameSchema,
+      opponent: z.object({ provider: z.string(), model: z.string(), tier: z.string() }),
+      playerColor: z.enum(['white', 'black']),
+    }),
+    // 401: returned by the handler when no stored user matches the token subject
+    401: z.object({ message: z.string() }),
+  },
+}
+
+/**
+ * Starts a human-vs-AI game for the authenticated user.
+ *
+ * Looks up the caller via `req.tokenInfo.sub` (401 if no user is stored), picks an
+ * opponent with `selectRandomAI()`, resolves the human's colour, creates a 'guided'
+ * game and emits `chess-game-created`. Responds 200 with the game, the opponent and
+ * the colour the human actually plays.
+ */
+export const handler: Handlers['PlayVsAI'] = async (req, { logger, emit, state, streams }) => {
+  logger.info('[PlayVsAI] Starting human vs AI game')
+
+  const userState = new UserState(state)
+  const user = await userState.getUser(req.tokenInfo.sub)
+
+  if (!user) {
+    logger.error('[PlayVsAI] User not found', { userId: req.tokenInfo.sub })
+    return { status: 401, body: { message: 'User not found' } }
+  }
+
+  // Select random AI opponent
+  const opponent = selectRandomAI()
+  logger.info('[PlayVsAI] Selected AI opponent', opponent)
+
+  // Determine player color. The generated body type marks `playerColor` as optional, so a
+  // missing value is treated like 'random' rather than being cast and silently read as black.
+  const requestedColor = req.body.playerColor ?? 'random'
+  const coinFlip = Math.random() < 0.5 ? 'white' : 'black'
+  const playerColor: 'white' | 'black' = requestedColor === 'random' ? coinFlip : requestedColor
+
+  // Create game with human vs AI: the human side is an empty player object.
+  const aiSide = { ai: opponent.provider, model: opponent.model }
+  const players =
+    playerColor === 'white'
+      ? { white: {}, black: aiSide } // human plays white
+      : { white: aiSide, black: {} } // human plays black
+
+  const game = await createGame(players, streams, logger, user, 'guided')
+
+  logger.info('[PlayVsAI] Game created', {
+    gameId: game.id,
+    playerColor,
+    opponent: `${opponent.provider}/${opponent.model}`,
+  })
+
+  // Announce the new game on the 'chess-game-created' topic.
+  await emit({
+    topic: 'chess-game-created',
+    data: { gameId: game.id, fenBefore: game.fen },
+  })
+
+  return {
+    status: 200,
+    body: { game, opponent, playerColor },
+  }
+}
diff --git a/api/steps/chess/legal-move-benchmark.mustache b/api/steps/chess/legal-move-benchmark.mustache
new file mode 100644
index 0000000..d0ea3e3
--- /dev/null
+++ b/api/steps/chess/legal-move-benchmark.mustache
@@ -0,0 +1,29 @@
+You are a chess expert. Given the following game, list ALL legal moves available for the current player.
+
+## Game (PGN)
+{{pgn}}
+
+## Current Position
+It is {{turn}}'s turn to move.
+FEN: {{fen}}
+
+## Task
+List ALL legal moves for {{turn}} in Standard Algebraic Notation (SAN).
+
+## Response Format
+Return ONLY a JSON object with no additional text:
+{
+  "moves": ["move1", "move2", "move3", ...]
+}
+
+## CRITICAL - Use Standard Algebraic Notation (SAN):
+CORRECT format examples: Nf3, e4, O-O, Bxc6, Qd7, Rfe1, exd5, h4
+WRONG format (do NOT use): Ng1f3, e2e4, Bf1c4, Ke1g1
+
+Rules:
+- Pawns: just the destination square (e4) or capture (exd5)
+- Pieces: piece letter + destination (Nf3, Bc4, Qd1)
+- Captures: add 'x' (Bxc6, Nxe5, exd5)
+- Castling: O-O (kingside) or O-O-O (queenside)
+- Do NOT include the full starting square (Ng1f3 is WRONG, Nf3 is CORRECT); add only the file or rank when needed to disambiguate (Rfe1)
+- Include ALL legal moves, not just good moves
diff --git a/api/steps/chess/puzzle-benchmark.mustache b/api/steps/chess/puzzle-benchmark.mustache
new file mode 100644
index 0000000..cb75d34
--- /dev/null
+++ b/api/steps/chess/puzzle-benchmark.mustache
@@ -0,0 +1,22 @@
+You are a chess expert solving a puzzle. Find the BEST move in this position.
+
+## Game (PGN)
+{{pgn}}
+
+## Current Position
+FEN: {{fen}}
+It is {{turn}}'s turn to move.
+
+## Legal Moves
+{{#legalMoves}}
+- {{.}}
+{{/legalMoves}}
+
+## Task
+This is a {{theme}} puzzle. Find the single best move.
+
+## Response Format
+Return ONLY the move in SAN, with no additional text.
+
+Choose ONE move from the legal moves list above.
+Do NOT use markdown code fences.
diff --git a/api/steps/chess/streams/00-chess-game-history.stream.ts b/api/steps/chess/streams/00-chess-game-history.stream.ts
new file mode 100644
index 0000000..096bb86
--- /dev/null
+++ b/api/steps/chess/streams/00-chess-game-history.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { GameHistorySchema } from '@chessarena/types/game-history'
+
+export const config: StreamConfig = {
+  name: 'chessGameHistory', // matches the `chessGameHistory` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: GameHistorySchema, // item shape, from @chessarena/types/game-history
+}
diff --git a/api/steps/chess/streams/00-legal-move-benchmark-summary.stream.ts b/api/steps/chess/streams/00-legal-move-benchmark-summary.stream.ts
new file mode 100644
index 0000000..aa59e50
--- /dev/null
+++ b/api/steps/chess/streams/00-legal-move-benchmark-summary.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { LegalMoveBenchmarkSummarySchema } from '@chessarena/types/legal-move-benchmark'
+
+export const config: StreamConfig = {
+  name: 'legalMoveBenchmarkSummary', // matches the `legalMoveBenchmarkSummary` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: LegalMoveBenchmarkSummarySchema, // item shape, from @chessarena/types/legal-move-benchmark
+}
diff --git a/api/steps/chess/streams/00-legal-move-benchmark.stream.ts b/api/steps/chess/streams/00-legal-move-benchmark.stream.ts
new file mode 100644
index 0000000..4f88634
--- /dev/null
+++ b/api/steps/chess/streams/00-legal-move-benchmark.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { LegalMoveBenchmarkRunSchema } from '@chessarena/types/legal-move-benchmark'
+
+export const config: StreamConfig = {
+  name: 'legalMoveBenchmark', // matches the `legalMoveBenchmark` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: LegalMoveBenchmarkRunSchema, // item shape, from @chessarena/types/legal-move-benchmark
+}
diff --git a/api/steps/chess/streams/00-position-set.stream.ts b/api/steps/chess/streams/00-position-set.stream.ts
new file mode 100644
index 0000000..2ef12d9
--- /dev/null
+++ b/api/steps/chess/streams/00-position-set.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { PositionSetSchema } from '@chessarena/types/legal-move-benchmark'
+
+export const config: StreamConfig = {
+  name: 'positionSet', // matches the `positionSet` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: PositionSetSchema, // item shape, from @chessarena/types/legal-move-benchmark
+}
diff --git a/api/steps/chess/streams/00-puzzle-benchmark-summary.stream.ts b/api/steps/chess/streams/00-puzzle-benchmark-summary.stream.ts
new file mode 100644
index 0000000..851b4b5
--- /dev/null
+++ b/api/steps/chess/streams/00-puzzle-benchmark-summary.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { PuzzleBenchmarkSummarySchema } from '@chessarena/types/puzzle-benchmark'
+
+export const config: StreamConfig = {
+  name: 'puzzleBenchmarkSummary', // matches the `puzzleBenchmarkSummary` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: PuzzleBenchmarkSummarySchema, // item shape, from @chessarena/types/puzzle-benchmark
+}
diff --git a/api/steps/chess/streams/00-puzzle-benchmark.stream.ts b/api/steps/chess/streams/00-puzzle-benchmark.stream.ts
new file mode 100644
index 0000000..99c17ec
--- /dev/null
+++ b/api/steps/chess/streams/00-puzzle-benchmark.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { PuzzleBenchmarkRunSchema } from '@chessarena/types/puzzle-benchmark'
+
+export const config: StreamConfig = {
+  name: 'puzzleBenchmark', // matches the `puzzleBenchmark` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: PuzzleBenchmarkRunSchema, // item shape, from @chessarena/types/puzzle-benchmark
+}
diff --git a/api/steps/chess/streams/00-puzzle-set.stream.ts b/api/steps/chess/streams/00-puzzle-set.stream.ts
new file mode 100644
index 0000000..aa50438
--- /dev/null
+++ b/api/steps/chess/streams/00-puzzle-set.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { PuzzleSetSchema } from '@chessarena/types/puzzle-benchmark'
+
+export const config: StreamConfig = {
+  name: 'puzzleSet', // matches the `puzzleSet` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: PuzzleSetSchema, // item shape, from @chessarena/types/puzzle-benchmark
+}
diff --git a/api/steps/chess/streams/00-stockfish-benchmark-summary.stream.ts b/api/steps/chess/streams/00-stockfish-benchmark-summary.stream.ts
new file mode 100644
index 0000000..3a5f228
--- /dev/null
+++ b/api/steps/chess/streams/00-stockfish-benchmark-summary.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { StockfishBenchmarkSummarySchema } from '@chessarena/types/stockfish-benchmark'
+
+export const config: StreamConfig = {
+  name: 'stockfishBenchmarkSummary', // matches the `stockfishBenchmarkSummary` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: StockfishBenchmarkSummarySchema, // item shape, from @chessarena/types/stockfish-benchmark
+}
diff --git a/api/steps/chess/streams/00-stockfish-benchmark.stream.ts b/api/steps/chess/streams/00-stockfish-benchmark.stream.ts
new file mode 100644
index 0000000..09ff72e
--- /dev/null
+++ b/api/steps/chess/streams/00-stockfish-benchmark.stream.ts
@@ -0,0 +1,8 @@
+import { StreamConfig } from 'motia'
+import { StockfishBenchmarkRunSchema } from '@chessarena/types/stockfish-benchmark'
+
+export const config: StreamConfig = {
+  name: 'stockfishBenchmark', // matches the `stockfishBenchmark` entry in FlowContextStateStreams
+  baseConfig: { storageType: 'default' }, // uses the 'default' storage type
+  schema: StockfishBenchmarkRunSchema, // item shape, from @chessarena/types/stockfish-benchmark
+}
diff --git a/api/types.d.ts b/api/types.d.ts
index 86f9629..f2ba9c2 100644
--- a/api/types.d.ts
+++ b/api/types.d.ts
@@ -8,26 +8,52 @@ import { EventHandler, ApiRouteHandler, ApiResponse, MotiaStream, CronHandler }
 
 declare module 'motia' {
   interface FlowContextStateStreams {
+    'stockfishBenchmark': MotiaStream<{ id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; stockfishLevel: number; gameAsWhite?: { id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; aiColor: 'white' | 'black'; stockfishLevel: number; result?: 'ai_win' | 'stockfish_win' | 'draw' | 'ai_illegal_move' | 'timeout'; resultReason?: string; moves: Array<{ moveNumber: number; player: 'white' | 'black'; moveSan: string; fen: string; centipawnScore?: number; bestMove?: string; centipawnLoss?: number; isAiMove: boolean; responseTime?: number; error?: string }>; totalMoves: number; finalFen?: string; pgn?: string; aiMoveCount?: number; totalCentipawnLoss?: number; averageCentipawnLoss?: number; blunders?: number; mistakes?: number; inaccuracies?: number }; gameAsBlack?: unknown; gamesPlayed: number; wins: number; losses: number; draws: number; overallAcpl?: number }>
+    'stockfishBenchmarkSummary': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageAcpl: number; bestAcpl: number; wins: number; losses: number; draws: number; lastRunAt: number }>
+    'puzzleSet': MotiaStream<{ id: string; theme: 'mateIn1' | 'oneMove'; createdAt: number; puzzles: Array<{ id: string; rating: number; themes: Array<string>; solution: Array<string>; initialPly: number; pgn: string; fen: string; legalMoves: Array<string>; solutionSan: string }>; count: number }>
+    'puzzleBenchmark': MotiaStream<{ id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; puzzleSetId: string; theme: 'mateIn1' | 'oneMove'; results: Array<{ puzzleId: string; modelMove?: string; correctMove: string; isCorrect: boolean; responseTime: number; rawResponse: string; error?: string }>; totalPuzzles: number; correctCount?: number; accuracy?: number }>
+    'puzzleBenchmarkSummary': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; mateIn1Accuracy?: number; oneMoveAccuracy?: number; overallAccuracy?: number; runsCompleted: number; lastRunAt: number }>
+    'positionSet': MotiaStream<{ id: string; createdAt: number; count: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array<string>; legalMoveCount: number; moveNumber: number }> }>
+    'legalMoveBenchmark': MotiaStream<{ id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array<string>; legalMoveCount: number; moveNumber: number }>; results: Array<{ positionId: string; modelMoves: Array<string>; correctMoves: Array<string>; illegalMoves: Array<string>; missedMoves: Array<string>; accuracy: number; penalty: number; finalScore: number; responseTime: number; rawResponse: string; error?: string }>; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number }>
+    'legalMoveBenchmarkSummary': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageScore: number; bestScore: number; worstScore: number; lastRunAt: number }>
     'chessSidechatMessage': MotiaStream<{ message: string; sender: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number }>
     'chessLiveAiGames': MotiaStream<{ id: string; gameId: string; players: { white: { provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string }; black: { provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string } }; createdAt: string }>
     'chessLeaderboard': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; gamesPlayed: number; victories: number; checkmates: number; draws: number; endedEarly: number; illegalMoves: number; sumCentipawnScores: number; sumHighestSwing: number }>
-    'chessGame': MotiaStream<{ id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } } }>
+    'chessGame': MotiaStream<{ id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; createdAt?: number }>
     'chessGameMove': MotiaStream<{ color: 'white' | 'black'; fenBefore: string; fenAfter: string; lastMove: Array<string>; check: boolean; evaluation?: { centipawnScore: number; bestMove: string; evaluationSwing: number; blunder: boolean } }>
     'chessGameMessage': MotiaStream<{ id: string; message: string; sender: string; profilePic?: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number; moveSan?: string; isIllegalMove?: boolean }>
+    'chessGameHistory': MotiaStream<{ id: string; startedAt: number; endedAt: number; duration: number; whitePlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; blackPlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; status: 'completed' | 'draw' | 'endedEarly'; winner?: 'white' | 'black'; endGameReason?: string; variant?: 'guided' | 'unguided'; totalMoves: number; whiteIllegalMoves: number; blackIllegalMoves: number; finalFen: string; moves: Array<{ color: 'white' | 'black'; fenBefore: string; fenAfter: string; lastMove: Array<string>; check: boolean; evaluation?: { centipawnScore: number; bestMove: string; evaluationSwing: number; blunder: boolean } }>; messages: Array<{ id: string; message: string; sender: string; profilePic?: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number; moveSan?: string; isIllegalMove?: boolean }>; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; pgn?: string }>
   }
 
   interface Handlers {
+    'PlayVsAI': ApiRouteHandler<{ playerColor?: 'white' | 'black' | 'random' }, ApiResponse<200, { game: { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; createdAt?: number }; opponent: { provider: string; model: string; tier: string }; playerColor: 'white' | 'black' }> | ApiResponse<401, { message: string }>, { topic: 'chess-game-created'; data: { gameId: string; fenBefore: string } }>
+    'GetGameHistoryDetail': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { id: string; startedAt: number; endedAt: number; duration: number; whitePlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; blackPlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; status: 'completed' | 'draw' | 'endedEarly'; winner?: 'white' | 'black'; endGameReason?: string; variant?: 'guided' | 'unguided'; totalMoves: number; whiteIllegalMoves: number; blackIllegalMoves: number; finalFen: string; moves: Array<{ color: 'white' | 'black'; fenBefore: string; fenAfter: string; lastMove: Array<string>; check: boolean; evaluation?: { centipawnScore: number; bestMove: string; evaluationSwing: number; blunder: boolean } }>; messages: Array<{ id: string; message: string; sender: string; profilePic?: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number; moveSan?: string; isIllegalMove?: boolean }>; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; pgn?: string }> | ApiResponse<404, { message: string }>, never>
+    'ExportGameHistory': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, unknown>, never>
+    'GetGameHistory': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { games: Array<{ id: string; startedAt: number; endedAt: number; duration: number; whitePlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; blackPlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; status: 'completed' | 'draw' | 'endedEarly'; winner?: 'white' | 'black'; endGameReason?: string; variant?: 'guided' | 'unguided'; totalMoves: number; whiteIllegalMoves: number; blackIllegalMoves: number; finalFen: string; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; pgn?: string }>; total: number; limit: number; offset: number }>, never>
     'PurgeStuckGames': CronHandler<never>
     'GameEnded': EventHandler<{ gameId: string }, never>
     'SendMessage': ApiRouteHandler<{ message: string; name: string; role: 'white' | 'black' | 'spectator' | 'root' }, ApiResponse<200, { message: string; sender: string; timestamp: number }> | ApiResponse<404, { message: string }>, never>
     'AI_Player': EventHandler<{ player: 'white' | 'black'; fenBefore: string; fen: string; lastMove?: Array<string>; check: boolean; gameId: string }, { topic: 'chess-game-moved'; data: { gameId: string; fenBefore: string } } | { topic: 'chess-game-ended'; data: { gameId: string } } | { topic: 'evaluate-player-move'; data: { fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string } }>
     'ChessGameMoved': EventHandler<{ gameId: string; fenBefore: string }, { topic: 'ai-move'; data: { player: 'white' | 'black'; fenBefore: string; fen: string; lastMove?: Array<string>; check: boolean; gameId: string } }>
-    'MovePiece': ApiRouteHandler<{ moveSan: string }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } } }> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, { topic: 'chess-game-moved'; data: { gameId: string; fenBefore: string } } | { topic: 'chess-game-ended'; data: { gameId: string } } | { topic: 'evaluate-player-move'; data: { fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string } }>
-    'GetGame': ApiRouteHandler<{}, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; role: 'white' | 'black' | 'spectator' | 'root'; username: string; passwords?: { root: string; white: string; black: string } }> | ApiResponse<404, { message: string }>, never>
-    'CreateGame': ApiRouteHandler<{ players: { white: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string }; black: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string } } }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } } }> | ApiResponse<400, { message: string; errors: Array<{ message: string }> }> | ApiResponse<401, { message: string }>, { topic: 'chess-game-created'; data: { gameId: string; fenBefore: string } }>
+    'MovePiece': ApiRouteHandler<{ moveSan: string }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; createdAt?: number }> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, { topic: 'chess-game-moved'; data: { gameId: string; fenBefore: string } } | { topic: 'chess-game-ended'; data: { gameId: string } } | { topic: 'evaluate-player-move'; data: { fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string } }>
+    'GetGame': ApiRouteHandler<{}, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; createdAt?: number; role: 'white' | 'black' | 'spectator' | 'root'; username: string; passwords?: { root: string; white: string; black: string } }> | ApiResponse<404, { message: string }>, never>
+    'CreateGame': ApiRouteHandler<{ players: { white: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string }; black: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string } }; variant?: 'guided' | 'unguided' }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array<string>; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array<string>; fen: string } }; createdAt?: number }> | ApiResponse<400, { message: string; errors: Array<{ message: string }> }> | ApiResponse<401, { message: string }>, { topic: 'chess-game-created'; data: { gameId: string; fenBefore: string } }>
     'AvailableModels': ApiRouteHandler<{}, ApiResponse<200, { models: { openai: Array<string>; gemini: Array<string>; claude: Array<string>; grok: Array<string> } }> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, never>
     'RequestAccess': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, {}> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, never>
     'AcceptRequestAccess': ApiRouteHandler<{ userId: string }, ApiResponse<200, {}> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, never>
+    'GetStockfishLeaderboard': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { leaderboard: Array<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageAcpl: number; bestAcpl: number; wins: number; losses: number; draws: number; lastRunAt: number }> }>, never>
+    'RunStockfishBenchmark': ApiRouteHandler<{ provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; stockfishLevel?: number }, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; stockfishLevel: number; gameAsWhite?: { id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; aiColor: 'white' | 'black'; stockfishLevel: number; result?: 'ai_win' | 'stockfish_win' | 'draw' | 'ai_illegal_move' | 'timeout'; resultReason?: string; moves: Array<{ moveNumber: number; player: 'white' | 'black'; moveSan: string; fen: string; centipawnScore?: number; bestMove?: string; centipawnLoss?: number; isAiMove: boolean; responseTime?: number; error?: string }>; totalMoves: number; finalFen?: string; pgn?: string; aiMoveCount?: number; totalCentipawnLoss?: number; averageCentipawnLoss?: number; blunders?: number; mistakes?: number; inaccuracies?: number }; gameAsBlack?: unknown; gamesPlayed: number; wins: number; losses: number; draws: number; overallAcpl?: number }> | ApiResponse<400, { message: string }>, never>
+    'RunAllBenchmarks': ApiRouteHandler<{ positionCount?: number; force?: boolean; rerunCompleted?: boolean }, ApiResponse<200, { message: string; positionCount: number; totalModels: number }> | ApiResponse<400, { message: string }>, never>
+    'GetPuzzleSets': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { sets: Array<{ id: string; theme: 'mateIn1' | 'oneMove'; createdAt: number; count: number; puzzleCount: number }> }>, never>
+    'GetPuzzleLeaderboard': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { leaderboard: Array<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; mateIn1Accuracy?: number; oneMoveAccuracy?: number; overallAccuracy?: number; runsCompleted: number; lastRunAt: number }> }>, never>
+    'RunAllPuzzleBenchmarks': ApiRouteHandler<{ theme: 'mateIn1' | 'oneMove'; count?: number; rerunCompleted?: boolean }, ApiResponse<200, { message: string; theme: 'mateIn1' | 'oneMove'; puzzleCount: number; totalModels: number }> | ApiResponse<400, { message: string }>, never>
+    'RunPuzzleBenchmark': ApiRouteHandler<{ provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; theme: 'mateIn1' | 'oneMove'; count?: number }, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; puzzleSetId: string; theme: 'mateIn1' | 'oneMove'; results: Array<{ puzzleId: string; modelMove?: string; correctMove: string; isCorrect: boolean; responseTime: number; rawResponse: string; error?: string }>; totalPuzzles: number; correctCount?: number; accuracy?: number }> | ApiResponse<400, { message: string }>, never>
+    'FetchPuzzleSet': ApiRouteHandler<{ theme: 'mateIn1' | 'oneMove'; count?: number }, ApiResponse<200, { id: string; theme: 'mateIn1' | 'oneMove'; createdAt: number; puzzles: Array<{ id: string; rating: number; themes: Array<string>; solution: Array<string>; initialPly: number; pgn: string; fen: string; legalMoves: Array<string>; solutionSan: string }>; count: number }> | ApiResponse<400, { message: string }>, never>
+    'GetBenchmarkLeaderboard': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { leaderboard: Array<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageScore: number; bestScore: number; worstScore: number; lastRunAt: number }> }>, never>
+    'GetBenchmarkRunDetail': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array<string>; legalMoveCount: number; moveNumber: number }>; results: Array<{ positionId: string; modelMoves: Array<string>; correctMoves: Array<string>; illegalMoves: Array<string>; missedMoves: Array<string>; accuracy: number; penalty: number; finalScore: number; responseTime: number; rawResponse: string; error?: string }>; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number }> | ApiResponse<404, { message: string }>, never>
+    'GetBenchmarkRuns': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { runs: Array<{ id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number; resultsCount: number }>; total: number }>, never>
+    'RunLegalMoveBenchmark': ApiRouteHandler<{ provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string }, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array<string>; legalMoveCount: number; moveNumber: number }>; results: Array<{ positionId: string; modelMoves: Array<string>; correctMoves: Array<string>; illegalMoves: Array<string>; missedMoves: Array<string>; accuracy: number; penalty: number; finalScore: number; responseTime: number; rawResponse: string; error?: string }>; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number }> | ApiResponse<400, { message: string }>, never>
+    'GeneratePositionSet': ApiRouteHandler<{ count?: number; force?: boolean }, ApiResponse<200, { id: string; createdAt: number; count: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array<string>; legalMoveCount: number; moveNumber: number }> }> | ApiResponse<400, { message: string }>, never>
     'GetUser': ApiRouteHandler<Record<string, unknown>, ApiResponse<200, { id: string; name: string; profilePic: string }> | ApiResponse<404, { message: string }>, never>
     'Auth': ApiRouteHandler<{ authToken: string }, ApiResponse<200, { accessToken: string; user: { id: string; name: string; profilePic: string; email: string } }> | ApiResponse<401, { error: string }> | ApiResponse<500, { error: string }>, never>
     'EvaluatePlayerMove': EventHandler<{ fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string }, never>
diff --git a/app/package.json b/app/package.json
index ef20cb0..c279d60 100644
--- a/app/package.json
+++ b/app/package.json
@@ -35,6 +35,7 @@
     "react-markdown": "^10.1.0",
     "react-router": "^7.6.2",
     "react-syntax-highlighter": "^15.6.1",
+    "recharts": "^3.6.0",
     "sonner": "^2.0.5",
     "tailwind-merge": "^3.3.0",
     "tailwindcss": "^4.1.7",
diff --git a/app/src/App.tsx b/app/src/App.tsx
index feafd4b..9ecaec0 100644
--- a/app/src/App.tsx
+++ b/app/src/App.tsx
@@ -11,6 +11,12 @@ import { AboutPage } from './pages/about-page'
 import { LoginPage } from './pages/login-page'
 import { AuthProvider } from './components/auth/auth-provider'
 import { PrivacyPage } from './pages/privacy-page'
+import { MethodologyPage } from './pages/methodology-page'
+import { GameHistoryPage } from './pages/game-history-page'
+import { GameReplayPage } from './pages/game-replay-page'
+import { PlayAIPage } from './pages/play-ai-page'
+import { BenchPage } from './pages/bench-page'
+import { ArenaPage } from './pages/arena-page'
 
 function App() {
   return (
@@ -25,6 +31,12 @@ function App() {
             <Route path="/leaderboard" element={<LeaderboardPage />} />
             <Route path="/game/:gameId" element={<ChessGamePage />} />
             <Route path="/about" element={<AboutPage />} />
+            <Route path="/methodology" element={<MethodologyPage />} />
+            <Route path="/history" element={<GameHistoryPage />} />
+            <Route path="/history/:gameId" element={<GameReplayPage />} />
+            <Route path="/play-ai" element={<PlayAIPage />} />
+            <Route path="/bench" element={<BenchPage />} />
+            <Route path="/arena" element={<ArenaPage />} />
             <Route path="/privacy-policy" element={<PrivacyPage />} />
           </Routes>
         </AuthProvider>
diff --git a/app/src/components/bench/bench-bar-charts.tsx b/app/src/components/bench/bench-bar-charts.tsx
new file mode 100644
index 0000000..7ec4320
--- /dev/null
+++ b/app/src/components/bench/bench-bar-charts.tsx
@@ -0,0 +1,641 @@
+import React, { useMemo } from 'react'
+import {
+  ResponsiveContainer,
+  BarChart,
+  Bar,
+  CartesianGrid,
+  XAxis,
+  YAxis,
+  Tooltip,
+  Cell,
+  Legend,
+  ScatterChart,
+  Scatter,
+  ReferenceLine,
+} from 'recharts'
+import { Filter } from 'lucide-react'
+import { cn } from '@/lib/utils'
+import type { BenchModelRow } from './bench-mock'
+import { getPricingMap } from './model-pricing'
+
+type Metric = 'motiaChessIndex' | 'legalMoveScore' | 'puzzleScore' | 'acplScore' | 'legalVsIllegal' // metrics BenchBarChart can plot
+
+// Shape of the per-row datum read from `payload[0].payload` in the BenchBarChart tooltip
+interface ChartTooltipData {
+  provider: string
+  label: string // model name (copied from row.model)
+  value?: number // plotted metric value used for sorting and the bar length
+  legal?: number // legalMoveScore; only set for the 'legalVsIllegal' metric
+  illegal?: number // max(0, 100 - legalMoveScore); only set for the 'legalVsIllegal' metric
+  rawAcpl?: number // un-inverted ACPL; only set for the non-stacked metrics
+}
+
+export const providerColors: Record<string, string> = { // accent color per provider key; unknown keys read as undefined at runtime
+  openai: '#10b981',
+  claude: '#a78bfa',
+  gemini: '#3b82f6',
+  grok: '#f59e0b',
+}
+
+// Inverts an ACPL value into a score clamped to [0, 100] (100 - acpl), so that higher means better.
+const toAcplScore = (acpl: number) =>
+  Math.min(100, Math.max(0, 100 - acpl))
+
+/**
+ * Display name of a chart metric.
+ * Used for the value-axis label and the first tooltip row of BenchBarChart.
+ */
+const metricLabel = (metric: Metric) => {
+  const names: Record<Metric, string> = {
+    motiaChessIndex: 'Motia Index',
+    legalMoveScore: 'Legal move score',
+    puzzleScore: 'Puzzle accuracy',
+    acplScore: 'ACPL score (inverted)',
+    legalVsIllegal: 'Legal vs illegal',
+  }
+  return names[metric]
+}
+
+// Short "how to read this chart" hint rendered in the footer of the BenchBarChart tooltip.
+// The three plain score metrics share one hint; ACPL and legal-vs-illegal get their own.
+const metricHowToRead = (metric: Metric) => {
+  switch (metric) {
+    case 'acplScore':
+      return 'Bars are 100 - ACPL. Higher is better.'
+    case 'legalVsIllegal':
+      return 'Legal % vs the remainder.'
+    case 'motiaChessIndex':
+    case 'legalMoveScore':
+    case 'puzzleScore':
+      return 'Higher is better.'
+  }
+}
+
+/**
+ * Hidden zero-size SVG declaring one hatch pattern per provider, id `hatch-<provider>`:
+ * a faint tile in the provider color overlaid with a stroked line, rotated 45 degrees.
+ */
+const HatchPatterns = () => {
+  const tiles = Object.keys(providerColors).map((name) => {
+    const tint = providerColors[name]
+    return (
+      <pattern key={name} id={`hatch-${name}`} patternUnits="userSpaceOnUse" width="6" height="6" patternTransform="rotate(45)">
+        <rect width="6" height="6" fill={tint} fillOpacity={0.15} />
+        <line x1="0" y1="0" x2="0" y2="6" stroke={tint} strokeWidth="2" strokeOpacity={0.4} />
+      </pattern>
+    )
+  })
+  return (
+    <svg width="0" height="0" style={{ position: 'absolute' }}>
+      <defs>{tiles}</defs>
+    </svg>
+  )
+}
+
+type Props = {
+  title: string
+  description?: string // optional subtitle rendered under the title
+  rows: BenchModelRow[]
+  metric: Metric // which value is plotted; 'legalVsIllegal' switches to stacked bars
+  className?: string
+  unit?: string // suffix appended to tooltip values; also shown in the value-axis label for layout 'horizontal'
+  topN?: number // max rows shown while not expanded (falls back to 100)
+  hiddenModels?: Set<string> // row ids to exclude from the chart
+  showExpand?: boolean // render the expand/collapse button when there are more rows than topN (or 10)
+  expanded?: boolean // when true, up to 100 rows are shown instead of topN
+  onExpandToggle?: () => void // click handler of the expand/collapse button
+  layout?: 'horizontal' | 'vertical' // passed to recharts BarChart; 'vertical' (default) draws horizontal bars
+}
+
+/**
+ * Ranked bar chart of one benchmark metric per model (highest first), capped at
+ * `topN` rows unless `expanded`. `layout` is recharts' BarChart layout, so the
+ * default 'vertical' draws horizontal bars; 'legalVsIllegal' renders stacked bars.
+ */
+export const BenchBarChart: React.FC<Props> = ({
+  title,
+  description,
+  rows,
+  metric,
+  className,
+  unit,
+  topN,
+  hiddenModels,
+  showExpand,
+  expanded,
+  onExpandToggle,
+  layout = 'vertical', // recharts layout: 'vertical' draws horizontal bars
+}) => {
+  const data = useMemo(() => {
+    const filtered = hiddenModels ? rows.filter((r) => !hiddenModels.has(r.id)) : rows
+
+    if (metric === 'legalVsIllegal') {
+      return filtered
+        .map((r) => ({
+          id: r.id,
+          label: r.model,
+          provider: r.provider,
+          legal: r.legalMoveScore,
+          illegal: Math.max(0, 100 - r.legalMoveScore),
+          value: r.legalMoveScore,
+          lastUpdatedAt: r.lastUpdatedAt,
+        }))
+        .sort((a, b) => b.value - a.value)
+        .slice(0, expanded ? 100 : (topN || 100)) // hard cap of 100 rows
+    }
+
+    const mapped = filtered.map((r) => {
+      const value =
+        metric === 'acplScore'
+          ? toAcplScore(r.acpl)
+          : metric === 'motiaChessIndex'
+          ? r.motiaChessIndex
+          : metric === 'legalMoveScore'
+          ? r.legalMoveScore
+          : r.puzzleScore
+      return {
+        id: r.id,
+        label: r.model,
+        provider: r.provider,
+        value,
+        rawAcpl: r.acpl,
+        lastUpdatedAt: r.lastUpdatedAt,
+      }
+    })
+
+    return mapped.sort((a, b) => b.value - a.value).slice(0, expanded ? 100 : (topN || 100)) // hard cap of 100 rows
+  }, [rows, metric, topN, hiddenModels, expanded])
+
+  const totalCount = useMemo(() => {
+    const filtered = hiddenModels ? rows.filter((r) => !hiddenModels.has(r.id)) : rows
+    return filtered.length
+  }, [rows, hiddenModels])
+
+  const isStacked = metric === 'legalVsIllegal'
+  const isVertical = layout === 'horizontal' // recharts 'horizontal' layout = upright (vertical) bars
+  const chartHeight = isVertical ? 400 : Math.max(320, data.length * 32) // horizontal bars: 32px per row, at least 320px
+
+  const axisText = 'rgba(255,255,255,0.72)'
+  const axisMuted = 'rgba(255,255,255,0.55)'
+  const axisLine = 'rgba(255,255,255,0.10)'
+
+  return (
+    <div className={cn('relative rounded-2xl border border-white/10 bg-[#0a0a0a] overflow-hidden', className)}>
+      <HatchPatterns />
+
+      {/* Header */}
+      <div className="p-6 pb-4 border-b border-white/5">
+        <div className="flex items-start justify-between gap-4">
+          <div>
+            <h3 className="text-lg font-bold text-white tracking-tight font-sans">{title}</h3>
+            {description && (
+              <p className="text-sm text-white/40 mt-1 leading-relaxed font-normal">{description}</p>
+            )}
+          </div>
+          {showExpand && totalCount > (topN || 10) && (
+            <button
+              onClick={onExpandToggle}
+              className="px-3 py-1.5 text-xs font-semibold bg-white/5 hover:bg-white/10 border border-white/10 rounded-lg text-white/60 hover:text-white transition-all tracking-tight"
+            >
+              {expanded ? 'Show Less' : `View All ${totalCount}`}
+            </button>
+          )}
+        </div>
+      </div>
+
+      {/* Chart */}
+      <div className="p-6 pt-4" style={{ height: chartHeight }}>
+        <ResponsiveContainer width="100%" height="100%">
+          <BarChart
+            data={data}
+            layout={layout}
+            margin={{ top: 0, right: isVertical ? 0 : 24, left: 0, bottom: isVertical ? 84 : 0 }}
+            barCategoryGap={isVertical ? "15%" : "25%"}
+          >
+            <CartesianGrid stroke="rgba(255,255,255,0.04)" horizontal={isVertical} vertical={!isVertical} strokeDasharray="3 3" />
+
+            {isVertical ? (
+              <>
+                <XAxis
+                  type="category"
+                  dataKey="label"
+                  tick={{ fill: axisMuted, fontSize: 10 }}
+                  axisLine={{ stroke: axisLine }}
+                  tickLine={false}
+                  interval={0}
+                  angle={-45}
+                  textAnchor="end"
+                  label={{
+                    value: 'Model →',
+                    position: 'insideBottom',
+                    offset: -6,
+                    fill: axisText,
+                    fontSize: 11,
+                    fontWeight: 600,
+                  }}
+                />
+                <YAxis
+                  type="number"
+                  tick={{ fill: axisMuted, fontSize: 11 }}
+                  axisLine={false}
+                  tickLine={false}
+                  domain={[0, 100]}
+                  label={{
+                    value: `${metricLabel(metric)}${unit ? ` (${unit})` : ''}`,
+                    angle: -90,
+                    position: 'insideLeft',
+                    fill: axisText,
+                    fontSize: 11,
+                    fontWeight: 600,
+                    dx: -8,
+                  }}
+                />
+              </>
+            ) : (
+              <>
+                <XAxis
+                  type="number"
+                  tick={{ fill: axisMuted, fontSize: 11 }}
+                  axisLine={{ stroke: axisLine }}
+                  tickLine={false}
+                  domain={[0, 100]}
+                  label={{
+                    value: `${metricLabel(metric)} →`,
+                    position: 'insideBottom',
+                    offset: -2,
+                    fill: axisText,
+                    fontSize: 11,
+                    fontWeight: 600,
+                  }}
+                />
+                <YAxis
+                  type="category"
+                  dataKey="label"
+                  width={160}
+                  tick={{ fill: axisText, fontSize: 12 }}
+                  axisLine={false}
+                  tickLine={false}
+                />
+              </>
+            )}
+
+            <Tooltip
+              cursor={{ fill: 'rgba(255,255,255,0.02)' }}
+              content={({ active, payload }) => {
+                if (!active || !payload || payload.length === 0) return null
+                const p = payload[0].payload as ChartTooltipData
+                const provider = String(p.provider ?? '')
+                const model = String(p.label ?? '')
+
+                const value = typeof p.value === 'number' ? p.value : undefined
+                const legal = typeof p.legal === 'number' ? p.legal : undefined
+                const illegal = typeof p.illegal === 'number' ? p.illegal : undefined
+
+                return (
+                  <div className="bg-[#0a0a0a] border border-white/10 rounded-xl p-4 text-xs shadow-2xl min-w-[220px]">
+                    <div className="flex items-center justify-between gap-3 mb-2">
+                      <div className="font-semibold text-white text-sm">{model}</div>
+                      <div className="text-[10px] px-2 py-0.5 rounded-full border border-white/10 bg-white/5 text-white/60 capitalize">
+                        {provider}
+                      </div>
+                    </div>
+
+                    <div className="space-y-1.5">
+                      <div className="flex justify-between">
+                        <span className="text-white/50">{metricLabel(metric)}</span>
+                        {metric === 'legalVsIllegal' ? (
+                          <span className="text-white font-medium">{Number(legal ?? 0).toFixed(1)}%</span>
+                        ) : (
+                          <span className="text-emerald-400 font-medium">
+                            {Number(value ?? 0).toFixed(1)}
+                            {unit ?? ''}
+                          </span>
+                        )}
+                      </div>
+
+                      {metric === 'acplScore' && (
+                        <div className="flex justify-between">
+                          <span className="text-white/50">ACPL (raw)</span>
+                          <span className="text-white/70">{typeof p.rawAcpl === 'number' ? p.rawAcpl.toFixed(1) : '-'}</span>
+                        </div>
+                      )}
+
+                      {metric === 'legalVsIllegal' && (
+                        <>
+                          <div className="flex justify-between">
+                            <span className="text-white/50">Legal</span>
+                            <span className="text-white/70">{Number(legal ?? 0).toFixed(1)}%</span>
+                          </div>
+                          <div className="flex justify-between">
+                            <span className="text-white/50">Illegal / missed</span>
+                            <span className="text-white/70">{Number(illegal ?? 0).toFixed(1)}%</span>
+                          </div>
+                        </>
+                      )}
+
+                      <div className="pt-2 mt-2 border-t border-white/10 text-[10px] text-white/40">
+                        {metricHowToRead(metric)}
+                      </div>
+                    </div>
+                  </div>
+                )
+              }}
+            />
+            {isStacked ? (
+              <>
+                <Legend verticalAlign="top" height={36} iconType="circle" wrapperStyle={{ fontSize: '11px', opacity: 0.6, fontWeight: 600 }} />
+                <Bar dataKey="legal" name="Legal" stackId="a" fill="#10b981" radius={isVertical ? [4, 4, 0, 0] : [0, 4, 4, 0]} />
+                <Bar dataKey="illegal" name="Illegal" stackId="a" fill="#ef4444" radius={isVertical ? [0, 0, 4, 4] : [4, 0, 0, 4]} />
+              </>
+            ) : (
+              <Bar dataKey="value" name="Score" radius={isVertical ? [6, 6, 0, 0] : [0, 6, 6, 0]} barSize={isVertical ? 40 : 20}>
+                {data.map((d) => (
+                  <Cell
+                    key={d.id}
+                    fill={`url(#hatch-${d.provider})`}
+                    stroke={providerColors[d.provider]}
+                    strokeWidth={1.5}
+                  />
+                ))}
+              </Bar>
+            )}
+          </BarChart>
+        </ResponsiveContainer>
+      </div>
+
+      {metric === 'acplScore' && (
+        <div className="px-6 pb-4 text-[10px] text-white/30 font-mono border-t border-white/5 pt-3">
+          * Bars represent inverted score (100 - ACPL). Longer bar = Better play.
+        </div>
+      )}
+    </div>
+  )
+}
+
+// ----------------------------------------------------------------------
+// Cost vs Performance Scatter Plot
+// ----------------------------------------------------------------------
+
+type CostChartProps = {
+  title: string
+  description?: string // optional subtitle rendered under the title
+  rows: BenchModelRow[] // each row's id is looked up in the pricing map
+  className?: string
+  hiddenModels?: Set<string> // row ids to exclude from the plot
+}
+
+// Log-scale scatter of cost (pricing avgPrice, USD per 1M tokens) against Motia Index, one dot per visible
+// model, with a dashed reference line at the mean index. Rows with a non-positive cost are dropped (log axis).
+export const CostVsPerformanceChart: React.FC<CostChartProps> = ({ title, description, rows, className, hiddenModels }) => {
+  const pricingMap = useMemo(() => getPricingMap(), [])
+
+  const data = useMemo(() => {
+    const filtered = hiddenModels ? rows.filter((r) => !hiddenModels.has(r.id)) : rows
+    return filtered
+      .map((r) => {
+        const pricing = pricingMap.get(r.id)
+        return {
+          id: r.id,
+          x: pricing?.avgPrice ?? 10, // Cost (USD per 1M tokens); models without pricing fall back to 10
+          y: r.motiaChessIndex,
+          provider: r.provider,
+          label: r.model,
+          inputPrice: pricing?.inputPrice ?? 0,
+          outputPrice: pricing?.outputPrice ?? 0,
+        }
+      })
+      .filter((d) => d.x > 0)
+  }, [rows, hiddenModels, pricingMap])
+
+  // Calculate average performance for reference line
+  const avgPerformance = useMemo(() => {
+    if (data.length === 0) return 50
+    return data.reduce((sum, d) => sum + d.y, 0) / data.length
+  }, [data])
+
+  return (
+    <div className={cn('relative rounded-2xl border border-white/10 bg-[#0a0a0a] overflow-hidden', className)}>
+      {/* Header */}
+      <div className="p-6 pb-4 border-b border-white/5">
+        <div className="flex items-start justify-between gap-4">
+          <div>
+            <h3 className="text-lg font-semibold text-white tracking-tight">{title}</h3>
+            {description && (
+              <p className="text-sm text-white/50 mt-1 leading-relaxed">{description}</p>
+            )}
+          </div>
+          <div className="flex items-center gap-2 text-[10px] text-white/40 border border-white/10 px-2 py-1 rounded bg-white/5">
+            <span className="w-2 h-2 rounded-full bg-emerald-500" />
+            Top Left = Best Value
+          </div>
+        </div>
+      </div>
+
+      {/* Chart */}
+      <div className="p-6 pt-4 h-[420px]">
+        <ResponsiveContainer width="100%" height="100%">
+          <ScatterChart margin={{ top: 20, right: 30, bottom: 50, left: 20 }}>
+            <CartesianGrid stroke="rgba(255,255,255,0.04)" strokeDasharray="3 3" />
+            <XAxis
+              type="number"
+              dataKey="x"
+              name="Cost"
+              scale="log"
+              domain={['auto', 'auto']}
+              label={{
+                value: 'Cost (USD / 1M tokens) →',
+                position: 'insideBottom',
+                offset: -5,
+                fill: 'rgba(255,255,255,0.72)',
+                fontSize: 11,
+                fontWeight: 600,
+              }}
+              tick={{ fill: 'rgba(255,255,255,0.55)', fontSize: 10 }}
+              tickFormatter={(v) => `$${v}`}
+              axisLine={{ stroke: 'rgba(255,255,255,0.10)' }}
+              tickLine={false}
+            />
+            <YAxis
+              type="number"
+              dataKey="y"
+              name="Index"
+              label={{
+                value: '← Motia Index',
+                angle: -90,
+                position: 'insideLeft',
+                fill: 'rgba(255,255,255,0.72)',
+                fontSize: 11,
+                fontWeight: 600,
+                dx: -5,
+              }}
+              tick={{ fill: 'rgba(255,255,255,0.55)', fontSize: 10 }}
+              domain={[0, 100]}
+              axisLine={{ stroke: 'rgba(255,255,255,0.10)' }}
+              tickLine={false}
+            />
+            <ReferenceLine
+              y={avgPerformance}
+              stroke="rgba(255,255,255,0.1)"
+              strokeDasharray="5 5"
+              label={{
+                value: 'Avg',
+                position: 'right',
+                fill: 'rgba(255,255,255,0.3)',
+                fontSize: 10,
+              }}
+            />
+            <Tooltip
+              cursor={{ strokeDasharray: '3 3', stroke: 'rgba(255,255,255,0.2)' }}
+              content={({ active, payload }) => {
+                if (!active || !payload || payload.length === 0) return null
+                const d = payload[0].payload
+                return (
+                  <div className="bg-[#0a0a0a] border border-white/10 rounded-xl p-4 text-xs shadow-2xl min-w-[180px]">
+                    <div className="font-semibold text-white mb-2 text-sm">{d.label}</div>
+                    <div className="space-y-1.5">
+                      <div className="flex justify-between">
+                        <span className="text-white/50">Motia Index</span>
+                        <span className="text-emerald-400 font-medium">{d.y}</span>
+                      </div>
+                      <div className="flex justify-between">
+                        <span className="text-white/50">Input</span>
+                        <span className="text-white/70">${d.inputPrice}/1M</span>
+                      </div>
+                      <div className="flex justify-between">
+                        <span className="text-white/50">Output</span>
+                        <span className="text-white/70">${d.outputPrice}/1M</span>
+                      </div>
+                      <div className="pt-1.5 mt-1.5 border-t border-white/10 flex justify-between">
+                        <span className="text-white/50">Provider</span>
+                        <span className="text-white/70 capitalize">{d.provider}</span>
+                      </div>
+                    </div>
+                  </div>
+                )
+              }}
+            />
+            <Scatter name="Models" data={data}>
+              {data.map((entry) => (
+                <Cell
+                  key={entry.id}
+                  fill={providerColors[entry.provider] ?? '#fff'}
+                  stroke="rgba(0,0,0,0.3)"
+                  strokeWidth={2}
+                  r={8}
+                />
+              ))}
+            </Scatter>
+          </ScatterChart>
+        </ResponsiveContainer>
+      </div>
+
+      {/* Legend */}
+      <div className="px-6 pb-4 flex flex-wrap gap-4 border-t border-white/5 pt-3">
+        {Object.entries(providerColors).map(([provider, color]) => (
+          <div key={provider} className="flex items-center gap-2">
+            <div className="w-3 h-3 rounded-full" style={{ backgroundColor: color }} />
+            <span className="text-xs text-white/50 capitalize">{provider}</span>
+          </div>
+        ))}
+      </div>
+    </div>
+  )
+}
+
+// ----------------------------------------------------------------------
+// Model Filter Component
+// ----------------------------------------------------------------------
+
+type ModelFilterProps = {
+  rows: BenchModelRow[] // every model row offered in the filter; grouped by `provider` for display
+  hiddenModels: Set<string> // row ids currently switched off (their checkboxes render unchecked)
+  onToggle: (id: string) => void // called with the row id when its checkbox changes
+  onShowAll: () => void // called when the "Reset All" button is clicked
+}
+
+// Provider-grouped checkbox panel: one column per provider, one checkbox per model, plus a "Reset All" button.
+// Providers missing from providerColors fall back to '#ffffff' (6 digits, so the appended '40' alpha stays a valid hex colour).
+export const ModelFilter: React.FC<ModelFilterProps> = ({ rows, hiddenModels, onToggle, onShowAll }) => {
+  const providers = useMemo(() => {
+    const grouped: Record<string, BenchModelRow[]> = {}
+    rows.forEach((r) => {
+      if (!grouped[r.provider]) grouped[r.provider] = []
+      grouped[r.provider].push(r)
+    })
+    return grouped
+  }, [rows])
+  return (
+    <div className="rounded-2xl border border-white/10 bg-[#0a0a0a] overflow-hidden animate-in slide-in-from-top-4 duration-300">
+      <div className="p-6 border-b border-white/5 flex items-center justify-between bg-white/[0.02]">
+        <div className="flex items-center gap-3">
+          <div className="p-2 bg-emerald-500/10 rounded-lg">
+            <Filter size={18} className="text-emerald-400" />
+          </div>
+          <div>
+            <h3 className="text-lg font-bold text-white tracking-tight">Model Selection</h3>
+            <p className="text-xs text-white/40 font-medium uppercase tracking-wider mt-0.5">Toggle models to compare performance</p>
+          </div>
+        </div>
+        <button
+          onClick={onShowAll}
+          className="px-4 py-2 text-xs font-bold bg-emerald-500/10 hover:bg-emerald-500/20 border border-emerald-500/20 rounded-xl text-emerald-400 hover:text-emerald-300 transition-all shadow-lg shadow-emerald-500/5"
+        >
+          Reset All
+        </button>
+      </div>
+      <div className="p-6 grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-x-8 gap-y-10">
+        {Object.entries(providers).map(([provider, models]) => (
+          <div key={provider} className="flex flex-col">
+            <div
+              className="flex items-center gap-2 mb-4 pb-2 border-b-2"
+              style={{ borderColor: (providerColors[provider] ?? '#ffffff') + '40' }}
+            >
+              <div
+                className="w-2.5 h-2.5 rounded-full ring-4 ring-offset-2 ring-offset-[#0a0a0a]"
+                style={{ backgroundColor: providerColors[provider] ?? '#ffffff' }}
+              />
+              <span className="text-sm font-black text-white uppercase tracking-widest">{provider}</span>
+              <span className="text-[10px] font-bold text-white/30 ml-auto bg-white/5 px-2 py-0.5 rounded-full">
+                {models.filter((m) => !hiddenModels.has(m.id)).length}/{models.length}
+              </span>
+            </div>
+            <div className="space-y-3">
+              {models.map((m) => (
+                <label
+                  key={m.id}
+                  className="flex items-center gap-3 cursor-pointer group select-none"
+                >
+                  <div className="relative flex items-center justify-center">
+                    <input
+                      type="checkbox"
+                      checked={!hiddenModels.has(m.id)}
+                      onChange={() => onToggle(m.id)}
+                      className="peer h-5 w-5 cursor-pointer appearance-none rounded-md border-2 border-white/10 bg-white/5 transition-all checked:border-emerald-500/50 checked:bg-emerald-500/10 hover:border-white/20"
+                    />
+                    <div className="pointer-events-none absolute text-emerald-500 opacity-0 transition-opacity peer-checked:opacity-100">
+                      <svg className="h-3.5 w-3.5 stroke-[4]" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                        <path strokeLinecap="round" strokeLinejoin="round" d="M5 13l4 4L19 7" />
+                      </svg>
+                    </div>
+                  </div>
+                  <span
+                    className={cn(
+                      'text-sm font-medium transition-all duration-200',
+                      hiddenModels.has(m.id)
+                        ? 'text-white/20 line-through'
+                        : 'text-white/70 group-hover:text-white group-hover:translate-x-0.5'
+                    )}
+                  >
+                    {m.model}
+                  </span>
+                </label>
+              ))}
+            </div>
+          </div>
+        ))}
+      </div>
+    </div>
+  )
+}
+
+
diff --git a/app/src/components/bench/bench-charts.tsx b/app/src/components/bench/bench-charts.tsx
new file mode 100644
index 0000000..52dd710
--- /dev/null
+++ b/app/src/components/bench/bench-charts.tsx
@@ -0,0 +1,59 @@
+import React, { useMemo } from 'react'
+import { cn } from '@/lib/utils'
+import type { BenchTimeseriesPoint } from './bench-mock'
+import {
+  ResponsiveContainer,
+  AreaChart,
+  Area,
+  XAxis,
+  YAxis,
+  Tooltip,
+  CartesianGrid,
+} from 'recharts'
+
+type MiniAreaProps = {
+  points: BenchTimeseriesPoint[] // series to plot: `t` feeds the (hidden) x axis, `v` the y axis
+  className?: string // extra classes merged onto the wrapper div
+  stroke?: string // line colour, also used for the gradient fill (default '#34d399')
+  height?: number // wrapper height passed to the inline style (default 64)
+}
+
+/**
+ * Compact area sparkline for a benchmark time series (hidden axes, hover tooltip).
+ * The gradient id is derived from `stroke` and sanitised so `url(#…)` stays valid even for
+ * non-hex colours such as `rgb(…)`; hex strokes keep the same id as before (e.g. `benchGrad-34d399`).
+ */
+export const MiniArea: React.FC<MiniAreaProps> = ({ points, className, stroke = '#34d399', height = 64 }) => {
+  const data = useMemo(() => points.map((p) => ({ t: p.t, v: p.v })), [points])
+  const gradientId = `benchGrad-${stroke.replace('#', '').replace(/[^\w-]/g, '_')}`
+
+  return (
+    <div className={cn('w-full', className)} style={{ height }}>
+      <ResponsiveContainer width="100%" height="100%">
+        <AreaChart data={data} margin={{ top: 8, right: 0, left: 0, bottom: 0 }}>
+          <defs>
+            <linearGradient id={gradientId} x1="0" y1="0" x2="0" y2="1">
+              <stop offset="0%" stopColor={stroke} stopOpacity={0.35} />
+              <stop offset="100%" stopColor={stroke} stopOpacity={0} />
+            </linearGradient>
+          </defs>
+          <CartesianGrid vertical={false} stroke="rgba(255,255,255,0.06)" />
+          <XAxis dataKey="t" hide />
+          <YAxis hide domain={['dataMin', 'dataMax']} />
+          <Tooltip
+            cursor={{ stroke: 'rgba(255,255,255,0.12)' }}
+            contentStyle={{
+              background: 'rgba(9,9,11,0.92)',
+              border: '1px solid rgba(255,255,255,0.10)',
+              borderRadius: 12,
+              color: 'rgba(255,255,255,0.85)',
+            }}
+            labelFormatter={(t) => new Date(Number(t)).toLocaleDateString('en-US', { month: 'short', day: 'numeric' })}
+            formatter={(v) => [Number(v).toFixed(1), '']}
+          />
+          <Area type="monotone" dataKey="v" stroke={stroke} strokeWidth={2} fill={`url(#${gradientId})`} />
+        </AreaChart>
+      </ResponsiveContainer>
+    </div>
+  )
+}
diff --git a/app/src/components/bench/bench-mock.ts b/app/src/components/bench/bench-mock.ts
new file mode 100644
index 0000000..22e5d72
--- /dev/null
+++ b/app/src/components/bench/bench-mock.ts
@@ -0,0 +1,245 @@
+const NOW = Date.now() // captured once at module load; computeRow uses it as lastUpdatedAt when a model has no recorded run
+
+const clamp = (value: number, lo: number, hi: number) => Math.max(lo, Math.min(value, hi)) // cap at hi first, then raise to lo
+
+const hash01 = (s: string) => {
+  // FNV-1a-style 32-bit hash over the UTF-16 code units, bucketed into 10,000 steps within [0, 1).
+  let acc = 0x811c9dc5
+  for (let pos = 0; pos < s.length; pos += 1) {
+    acc = Math.imul(acc ^ s.charCodeAt(pos), 0x01000193)
+  }
+  return ((acc >>> 0) % 10_000) / 10_000
+}
+
+const seeded = (seedStr: string) => { // returns a deterministic generator of numbers in [0, 1) whose stream depends only on seedStr
+  let t = Math.floor(hash01(seedStr) * 0xffffffff) >>> 0 // 32-bit seed; hash01 has 10,000 buckets, so at most 10,000 distinct seeds
+  return () => {
+    t += 0x6d2b79f5 // advance the state by a fixed increment on every draw
+    let x = t
+    x = Math.imul(x ^ (x >>> 15), x | 1) // NOTE(review): these mixing steps look like the mulberry32 generator — confirm
+    x ^= x + Math.imul(x ^ (x >>> 7), x | 61)
+    return ((x ^ (x >>> 14)) >>> 0) / 4294967296 // unsigned 32-bit value divided by 2^32
+  }
+}
+
+const normalish = (rand: () => number) => {
+  // Box-Muller: two uniform draws -> one roughly standard-normal sample; a 0 draw is nudged to 1e-9 so log() stays finite.
+  const [u1, u2] = [rand() || 1e-9, rand() || 1e-9]
+  return Math.cos(2 * Math.PI * u2) * Math.sqrt(-2 * Math.log(u1))
+}
+
+const legalMoveSummary: Record<string, { averageScore: number; lastRunAt: number }> = { // keyed by `provider:model`; averageScore is computeRow's preferred base score, lastRunAt becomes the row's lastUpdatedAt
+  'grok:grok-4-fast-non-reasoning': { averageScore: 28.38477029006998, lastRunAt: 1766157466671 },
+  'grok:grok-4-fast-reasoning': { averageScore: 95.0778179689851, lastRunAt: 1766158456823 },
+  'gemini:gemini-3-pro-preview': { averageScore: 95.71476876793383, lastRunAt: 1766158866596 },
+  'openai:gpt-5.2': { averageScore: 94.47529713518263, lastRunAt: 1766159361494 },
+  'gemini:gemini-2.5-pro': { averageScore: 62.0246628219507, lastRunAt: 1766159574654 },
+  'claude:claude-opus-4-5': { averageScore: 87.51753246753248, lastRunAt: 1766160119644 },
+  'gemini:gemini-2.5-flash': { averageScore: 50.682398740759965, lastRunAt: 1766161062713 },
+  'gemini:gemini-2.5-flash-lite': { averageScore: 30.926490100647793, lastRunAt: 1766161726694 },
+  'gemini:gemini-2.0-flash': { averageScore: 57.6118913255696, lastRunAt: 1766161760105 },
+}
+
+const inferredLegalMoveScore: Record<string, number> = { // fallback base scores for ids absent from legalMoveSummary; computeRow uses 50 when neither table has the id
+  'openai:gpt-5.1': 92.8,
+  'openai:gpt-5': 90.1,
+  'openai:gpt-5-mini': 83.8,
+  'openai:gpt-4.1': 86.2,
+  'openai:gpt-4.1-mini': 78.4,
+  'openai:gpt-4o': 76.3,
+  'openai:gpt-4o-mini': 67.6,
+
+  'claude:claude-sonnet-4-5': 83.6,
+  'claude:claude-haiku-4-5': 74.9,
+  'claude:claude-opus-4-0': 85.2,
+  'claude:claude-sonnet-4-0': 80.3,
+  'claude:claude-3-7-sonnet-latest': 73.4,
+  'claude:claude-3-5-haiku-latest': 65.9,
+
+  'grok:grok-4': 92.0,
+  'grok:grok-3': 69.7,
+  'grok:grok-3-fast': 55.1,
+}
+
+const allModels: Array<{ provider: string; model: string }> = [ // every provider/model pair that gets a row in mockBenchLeaderboard, in display order
+  ...[
+    'gpt-5.2',
+    'gpt-5.1',
+    'gpt-5',
+    'gpt-5-mini',
+    'gpt-4.1',
+    'gpt-4.1-mini',
+    'gpt-4o',
+    'gpt-4o-mini',
+  ].map((model) => ({ provider: 'openai', model })),
+  ...['gemini-3-pro-preview', 'gemini-2.5-pro', 'gemini-2.5-flash', 'gemini-2.5-flash-lite', 'gemini-2.0-flash'].map(
+    (model) => ({ provider: 'gemini', model }),
+  ),
+  ...[
+    'claude-opus-4-5',
+    'claude-sonnet-4-5',
+    'claude-haiku-4-5',
+    'claude-opus-4-0',
+    'claude-sonnet-4-0',
+    'claude-3-7-sonnet-latest',
+    'claude-3-5-haiku-latest',
+  ].map((model) => ({ provider: 'claude', model })),
+  ...['grok-4-fast-non-reasoning', 'grok-4-fast-reasoning', 'grok-4', 'grok-3', 'grok-3-fast'].map((model) => ({
+    provider: 'grok',
+    model,
+  })),
+]
+
+/**
+ * Builds one mock leaderboard row for `provider`/`model`.
+ * All noise comes from a PRNG seeded with the row id, so the scores are stable across reloads;
+ * the order of the draws below (n1, n2, n3, then two bare rand() calls) must therefore stay fixed.
+ */
+const computeRow = (provider: string, model: string): BenchModelRow => {
+  const id = `${provider}:${model}`
+  const summary = legalMoveSummary[id]
+  const rand = seeded(id)
+  const n1 = normalish(rand)
+  const n2 = normalish(rand)
+  const n3 = normalish(rand)
+  // Every Gemini model name contains the substring 'mini' ("ge-mini"), so test for the '-mini' tier suffix instead.
+  const isMini = model.includes('-mini')
+  const baseLegal = summary?.averageScore ?? inferredLegalMoveScore[id] ?? 50
+  const legalSigma = provider === 'grok' ? 4.5 : provider === 'gemini' ? 4.0 : provider === 'claude' ? 4.2 : 3.6
+  const legalMoveScore = clamp(baseLegal + n1 * legalSigma, 2, 99.8)
+
+  const providerPuzzleBias = provider === 'openai' ? 3.0 : provider === 'claude' ? 1.5 : provider === 'gemini' ? 0.0 : -2.0
+  const modelPuzzleSkew =
+    model.includes('flash-lite') ? -8 :
+    model.includes('flash') ? -3 :
+    isMini ? -4 :
+    model.includes('non-reasoning') ? -10 :
+    model.includes('reasoning') ? 4 :
+    model.includes('opus') ? 4 :
+    model.includes('pro') ? 3 : 0
+  const puzzleScore = clamp(legalMoveScore * (0.78 + 0.08 * rand()) + providerPuzzleBias + modelPuzzleSkew + n2 * 7, 8, 98)
+  const providerQualityBias = provider === 'openai' ? -6 : provider === 'claude' ? -3 : provider === 'gemini' ? 1 : 4
+  const modelQualitySkew =
+    model.includes('opus') || model.includes('pro') ? -8 :
+    isMini || model.includes('flash') ? 6 :
+    model.includes('non-reasoning') ? 18 : 0
+  const composite = 0.58 * puzzleScore + 0.42 * legalMoveScore
+  const acpl = clamp(112 - composite + providerQualityBias + modelQualitySkew + n3 * 10 + (rand() - 0.5) * 6, 8, 120)
+  const acplScore = clamp(100 - acpl, 0, 100)
+  const motiaChessIndex = clamp(0.4 * legalMoveScore + 0.3 * puzzleScore + 0.3 * acplScore, 0, 100)
+
+  return {
+    id,
+    provider,
+    model,
+    motiaChessIndex: Number(motiaChessIndex.toFixed(1)),
+    legalMoveScore: Number(legalMoveScore.toFixed(1)),
+    puzzleScore: Number(puzzleScore.toFixed(1)),
+    acpl: Number(acpl.toFixed(1)),
+    lastUpdatedAt: summary?.lastRunAt ?? NOW,
+  }
+}
+
+export const mockBenchLeaderboard: BenchModelRow[] = allModels.map(({ provider, model }) => computeRow(provider, model)) // one mock row per allModels entry, computed once at module load
+
+export type BenchTimeseriesPoint = { t: number; v: number } // t: timestamp (the mock series use Date.now()-based epoch milliseconds), v: metric value
+
+export type BenchModelRow = {
+  id: string // `${provider}:${model}`
+  provider: string
+  model: string
+  motiaChessIndex: number // mock formula: 0.4 * legalMoveScore + 0.3 * puzzleScore + 0.3 * (100 - acpl), clamped to 0..100
+  legalMoveScore: number // the mock generator keeps this within 2..99.8
+  puzzleScore: number // the mock generator keeps this within 8..98
+  acpl: number // lower is better (it enters the index as 100 - acpl); presumably average centipawn loss — confirm
+  lastUpdatedAt: number // timestamp; presumably epoch milliseconds (the mock falls back to Date.now()) — confirm
+}
+
+export const mockBenchTimeseries = { // seven daily points per metric, oldest first, all anchored to the shared module-load timestamp NOW
+  legalMoveScore: [
+    { t: NOW - 6 * 86400000, v: 62 }, // 86400000 ms = one day
+    { t: NOW - 5 * 86400000, v: 64 },
+    { t: NOW - 4 * 86400000, v: 66 },
+    { t: NOW - 3 * 86400000, v: 68 },
+    { t: NOW - 2 * 86400000, v: 70 },
+    { t: NOW - 1 * 86400000, v: 72 },
+    { t: NOW, v: 73 },
+  ],
+  puzzleScore: [
+    { t: NOW - 6 * 86400000, v: 44 },
+    { t: NOW - 5 * 86400000, v: 47 },
+    { t: NOW - 4 * 86400000, v: 49 },
+    { t: NOW - 3 * 86400000, v: 51 },
+    { t: NOW - 2 * 86400000, v: 52 },
+    { t: NOW - 1 * 86400000, v: 54 },
+    { t: NOW, v: 55 },
+  ],
+  acpl: [
+    { t: NOW - 6 * 86400000, v: 78 },
+    { t: NOW - 5 * 86400000, v: 75 },
+    { t: NOW - 4 * 86400000, v: 73 },
+    { t: NOW - 3 * 86400000, v: 70 },
+    { t: NOW - 2 * 86400000, v: 68 },
+    { t: NOW - 1 * 86400000, v: 66 },
+    { t: NOW, v: 65 },
+  ],
+}
+
+export const mockPrompts = { // prompt templates with {{placeholder}} and {{#section}}…{{/section}} markers; nothing in this file fills them in — presumably rendered elsewhere (Mustache-style syntax), confirm
+  legalMoveBench: `You are a chess expert. Given the following game, list ALL legal moves available for the current player.
+
+## Game (PGN)
+{{pgn}}
+
+## Current Position
+It is {{turn}}'s turn to move.
+FEN: {{fen}}
+
+## Task
+List ALL legal moves for {{turn}} in Standard Algebraic Notation (SAN).
+
+## Response Format
+Return ONLY a JSON object with no additional text:
+{
+  "moves": ["move1", "move2", "move3", ...]
+}
+`,
+  puzzleBench: `You are a chess engine. Solve this puzzle in one move.
+
+## Position
+FEN: {{fen}}
+Turn: {{turn}}
+
+## Game context (PGN)
+{{pgn}}
+
+## Legal moves (SAN)
+{{#legalMoves}}
+- {{.}}
+{{/legalMoves}}
+
+## Response format (JSON only)
+{
+  "move": "SAN"
+}
+`,
+  aiPlayerGuided: `You are a chess grandmaster playing as {{player}}.
+
+## Current Position
+- FEN: \`{{fen}}\`
+{{#inCheck}}- WARNING: You are in check!{{/inCheck}}
+
+## Valid Moves (Guided)
+{{#validMoves}}- {{san}}
+{{/validMoves}}
+
+## Response Format (JSON only)
+{
+  "thought": "Strategic reasoning",
+  "moveSan": "Your move in SAN"
+}
+`,
+}
+
+
+
diff --git a/app/src/components/bench/model-pricing.ts b/app/src/components/bench/model-pricing.ts
new file mode 100644
index 0000000..13f1f8e
--- /dev/null
+++ b/app/src/components/bench/model-pricing.ts
@@ -0,0 +1,58 @@
+// Model pricing data (USD per 1M tokens)
+// These are mock prices - update with real pricing data
+
+export type ModelPricing = {
+  id: string // `provider:model`, the key getPricing() matches on
+  provider: string
+  model: string
+  inputPrice: number  // USD per 1M input tokens
+  outputPrice: number // USD per 1M output tokens
+  avgPrice: number    // Average (input + output) / 2 for simple comparisons
+}
+
+export const modelPricing: ModelPricing[] = [ // TODO(review): no row for 'claude:claude-3-5-haiku-latest', which bench-mock's allModels lists — getPricing() returns undefined for it
+  // OpenAI - https://openai.com/api/pricing/
+  { id: 'openai:gpt-5.2', provider: 'openai', model: 'gpt-5.2', inputPrice: 15.00, outputPrice: 60.00, avgPrice: 37.50 },
+  { id: 'openai:gpt-5.1', provider: 'openai', model: 'gpt-5.1', inputPrice: 12.00, outputPrice: 48.00, avgPrice: 30.00 },
+  { id: 'openai:gpt-5', provider: 'openai', model: 'gpt-5', inputPrice: 10.00, outputPrice: 40.00, avgPrice: 25.00 },
+  { id: 'openai:gpt-5-mini', provider: 'openai', model: 'gpt-5-mini', inputPrice: 1.50, outputPrice: 6.00, avgPrice: 3.75 },
+  { id: 'openai:gpt-4.1', provider: 'openai', model: 'gpt-4.1', inputPrice: 2.00, outputPrice: 8.00, avgPrice: 5.00 },
+  { id: 'openai:gpt-4.1-mini', provider: 'openai', model: 'gpt-4.1-mini', inputPrice: 0.40, outputPrice: 1.60, avgPrice: 1.00 },
+  { id: 'openai:gpt-4o', provider: 'openai', model: 'gpt-4o', inputPrice: 2.50, outputPrice: 10.00, avgPrice: 6.25 },
+  { id: 'openai:gpt-4o-mini', provider: 'openai', model: 'gpt-4o-mini', inputPrice: 0.15, outputPrice: 0.60, avgPrice: 0.375 },
+
+  // Google Gemini - https://ai.google.dev/pricing
+  { id: 'gemini:gemini-3-pro-preview', provider: 'gemini', model: 'gemini-3-pro-preview', inputPrice: 7.00, outputPrice: 21.00, avgPrice: 14.00 },
+  { id: 'gemini:gemini-2.5-pro', provider: 'gemini', model: 'gemini-2.5-pro', inputPrice: 1.25, outputPrice: 5.00, avgPrice: 3.125 },
+  { id: 'gemini:gemini-2.5-flash', provider: 'gemini', model: 'gemini-2.5-flash', inputPrice: 0.15, outputPrice: 0.60, avgPrice: 0.375 },
+  { id: 'gemini:gemini-2.5-flash-lite', provider: 'gemini', model: 'gemini-2.5-flash-lite', inputPrice: 0.075, outputPrice: 0.30, avgPrice: 0.1875 },
+  { id: 'gemini:gemini-2.0-flash', provider: 'gemini', model: 'gemini-2.0-flash', inputPrice: 0.10, outputPrice: 0.40, avgPrice: 0.25 },
+  { id: 'gemini:gemini-1.5-pro', provider: 'gemini', model: 'gemini-1.5-pro', inputPrice: 1.25, outputPrice: 5.00, avgPrice: 3.125 }, // not present in bench-mock's allModels
+
+  // Anthropic Claude - https://www.anthropic.com/pricing
+  { id: 'claude:claude-opus-4-5', provider: 'claude', model: 'claude-opus-4-5', inputPrice: 15.00, outputPrice: 75.00, avgPrice: 45.00 },
+  { id: 'claude:claude-sonnet-4-5', provider: 'claude', model: 'claude-sonnet-4-5', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 },
+  { id: 'claude:claude-haiku-4-5', provider: 'claude', model: 'claude-haiku-4-5', inputPrice: 0.80, outputPrice: 4.00, avgPrice: 2.40 },
+  { id: 'claude:claude-opus-4-0', provider: 'claude', model: 'claude-opus-4-0', inputPrice: 15.00, outputPrice: 75.00, avgPrice: 45.00 },
+  { id: 'claude:claude-sonnet-4-0', provider: 'claude', model: 'claude-sonnet-4-0', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 },
+  { id: 'claude:claude-3-7-sonnet-latest', provider: 'claude', model: 'claude-3-7-sonnet-latest', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 },
+
+  // xAI Grok - https://docs.x.ai/docs/models
+  { id: 'grok:grok-4-fast-non-reasoning', provider: 'grok', model: 'grok-4-fast-non-reasoning', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 },
+  { id: 'grok:grok-4-fast-reasoning', provider: 'grok', model: 'grok-4-fast-reasoning', inputPrice: 5.00, outputPrice: 25.00, avgPrice: 15.00 },
+  { id: 'grok:grok-4', provider: 'grok', model: 'grok-4', inputPrice: 10.00, outputPrice: 40.00, avgPrice: 25.00 },
+  { id: 'grok:grok-3', provider: 'grok', model: 'grok-3', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 },
+  { id: 'grok:grok-3-fast', provider: 'grok', model: 'grok-3-fast', inputPrice: 1.00, outputPrice: 5.00, avgPrice: 3.00 },
+]
+
+// Look up the pricing row for a `provider:model` id; yields undefined when the id has no row
+export function getPricing(id: string): ModelPricing | undefined {
+  return modelPricing.find(({ id: rowId }) => rowId === id)
+}
+
+// Snapshot of every pricing row as an id -> row Map (a new Map is built on each call)
+export function getPricingMap(): Map<string, ModelPricing> {
+  return modelPricing.reduce((byId, row) => byId.set(row.id, row), new Map<string, ModelPricing>())
+}
+
+
diff --git a/app/src/components/chess/ai-icon.tsx b/app/src/components/chess/ai-icon.tsx
index 75d3d4c..a761d78 100644
--- a/app/src/components/chess/ai-icon.tsx
+++ b/app/src/components/chess/ai-icon.tsx
@@ -7,25 +7,20 @@ const avatarImages: Record<NonNullable<AiModelProvider>, string> = {
   grok: '/avatars/grok-white.png',
 }
 
-const OpenAI = ({ color }: { color?: string }) => {
-  return (
-    <svg width="24" height="24" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
-      <path
-        d="M18.5684 8.18423C18.792 7.51079 18.8693 6.79739 18.7952 6.09168C18.7211 5.38596 18.4973 4.70419 18.1386 4.0919C17.6069 3.16642 16.7951 2.43369 15.8201 1.99936C14.8451 1.56503 13.7574 1.45153 12.7138 1.67523C12.1209 1.0157 11.3649 0.523789 10.5217 0.248906C9.67848 -0.0259764 8.7778 -0.0741542 7.9101 0.109212C7.0424 0.292578 6.23822 0.701031 5.57834 1.29355C4.91845 1.88607 4.42609 2.64179 4.1507 3.48481C3.45542 3.62739 2.79858 3.91672 2.22408 4.33347C1.64958 4.75023 1.17064 5.28481 0.819284 5.90148C0.281813 6.82542 0.0521017 7.89634 0.16336 8.95943C0.274618 10.0225 0.721081 11.0227 1.4382 11.8153C1.21375 12.4884 1.13564 13.2017 1.20908 13.9074C1.28251 14.6132 1.50581 15.2951 1.86403 15.9076C2.3964 16.8334 3.20897 17.5663 4.18463 18.0006C5.16028 18.4349 6.24865 18.5483 7.29287 18.3243C7.76392 18.8548 8.34272 19.2786 8.99062 19.5676C9.63853 19.8566 10.3406 20.004 11.05 20C12.1197 20.001 13.1621 19.662 14.0266 19.032C14.8911 18.4021 15.5331 17.5137 15.8599 16.4951C16.5551 16.3523 17.2118 16.0629 17.7863 15.6461C18.3607 15.2294 18.8397 14.6949 19.1913 14.0784C19.7224 13.1558 19.9482 12.0889 19.8363 11.0303C19.7244 9.97163 19.2806 8.9754 18.5684 8.18423ZM11.05 18.691C10.174 18.6924 9.32537 18.3853 8.65303 17.8236L8.77128 17.7566L12.7532 15.4581C12.8523 15.4 12.9346 15.3171 12.992 15.2176C13.0493 15.118 13.0798 15.0053 13.0805 14.8904V9.27631L14.7638 10.2501C14.7721 10.2544 14.7793 10.2605 14.7848 10.268C14.7903 10.2755 14.794 10.2843 14.7955 10.2935V14.9456C14.7933 15.9383 14.3981 16.8898 13.6961 17.5917C12.9942 18.2936 12.0427 18.6889 11.05 18.691ZM2.99945 15.2531C2.56009 14.4945 2.40234 13.6052 2.55395 12.7417L2.67228 12.8127L6.65812 15.1112C6.75674 15.1691 6.86902 15.1996 6.98337 15.1996C7.09772 15.1996 7.20999 15.1691 7.30862 15.1112L12.1776 12.3041V14.2478C12.1772 14.2579 12.1744 14.2677 12.1696 14.2766C12.1649 14.2855 12.1581 14.2932 12.15 14.2991L8.11678 16.6251C7.25606 17.121 6.23374 17.255 5.27429 16.9978C4.31484 16.7405 3.49669 16.1131 2.99945 
15.2531ZM1.95078 6.57965C2.39319 5.81612 3.09148 5.23375 3.92203 4.93565V9.66665C3.92054 9.78094 3.94974 9.89355 4.0066 9.99271C4.06346 10.0919 4.14589 10.174 4.24528 10.2304L9.09062 13.0256L7.4072 13.9994C7.39809 14.0042 7.38793 14.0068 7.37762 14.0068C7.3673 14.0068 7.35714 14.0042 7.34803 13.9994L3.32278 11.6773C2.46367 11.1793 1.83691 10.3612 1.57976 9.40204C1.3226 8.44291 1.45602 7.42095 1.95078 6.55998V6.57965ZM15.781 9.79281L10.92 6.96998L12.5995 5.99998C12.6086 5.99514 12.6187 5.99261 12.629 5.99261C12.6393 5.99261 12.6495 5.99514 12.6586 5.99998L16.6839 8.32606C17.2993 8.68119 17.8011 9.20407 18.1305 9.83365C18.4599 10.4632 18.6035 11.1735 18.5444 11.8816C18.4853 12.5898 18.2259 13.2664 17.7967 13.8327C17.3674 14.3989 16.7859 14.8314 16.1201 15.0796V10.3486C16.1166 10.2345 16.0835 10.1232 16.0241 10.0258C15.9646 9.92833 15.8809 9.8481 15.781 9.79281ZM17.4566 7.27356L17.3383 7.20256L13.3604 4.8844C13.2611 4.82617 13.1482 4.79547 13.0331 4.79547C12.9181 4.79547 12.8051 4.82617 12.7059 4.8844L7.84095 7.6914V5.74781C7.83992 5.73793 7.84157 5.72795 7.84573 5.71893C7.8499 5.70991 7.85642 5.70218 7.86462 5.69656L11.8899 3.3744C12.5068 3.01899 13.2122 2.84659 13.9235 2.87736C14.6348 2.90813 15.3226 3.14079 15.9066 3.54813C16.4905 3.95548 16.9464 4.52066 17.2209 5.17759C17.4954 5.83452 17.5772 6.55602 17.4567 7.25773L17.4566 7.27356ZM6.9222 10.7191L5.23887 9.74931C5.23045 9.74424 5.22324 9.73738 5.21777 9.72921C5.2123 9.72105 5.20869 9.71178 5.2072 9.70206V5.06181C5.20813 4.34996 5.41169 3.65307 5.79408 3.05264C6.17647 2.45222 6.72189 1.97305 7.36657 1.67118C8.01125 1.3693 8.72854 1.2572 9.43459 1.34796C10.1406 1.43873 10.8063 1.72861 11.3536 2.18373L11.2353 2.25081L7.25345 4.54915C7.15435 4.60727 7.07207 4.69017 7.01469 4.78971C6.95732 4.88925 6.92682 5.00201 6.9262 5.1169L6.9222 10.7191ZM7.83687 8.74798L10.0052 7.49815L12.1776 8.74798V11.2475L10.0131 12.4972L7.84087 11.2475L7.83687 8.74798Z"
-        fill={color}
-      />
-    </svg>
-  )
-}
-
-export const AiIcon = ({ ai, color }: { ai: NonNullable<AiModelProvider>; color?: string }) => {
+export const AiIcon = ({ ai, color, size }: { ai: NonNullable<AiModelProvider>; color?: string; size?: number }) => {
+  const px = size ?? 24
   if (ai === 'openai') {
     return (
-      <div className="w-[24px] h-[24px] flex items-center justify-center">
-        <OpenAI color={color} />
+      <div className="flex items-center justify-center" style={{ width: px, height: px }}>
+        <svg width={px} height={px} viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
+          <path
+            d="M18.5684 8.18423C18.792 7.51079 18.8693 6.79739 18.7952 6.09168C18.7211 5.38596 18.4973 4.70419 18.1386 4.0919C17.6069 3.16642 16.7951 2.43369 15.8201 1.99936C14.8451 1.56503 13.7574 1.45153 12.7138 1.67523C12.1209 1.0157 11.3649 0.523789 10.5217 0.248906C9.67848 -0.0259764 8.7778 -0.0741542 7.9101 0.109212C7.0424 0.292578 6.23822 0.701031 5.57834 1.29355C4.91845 1.88607 4.42609 2.64179 4.1507 3.48481C3.45542 3.62739 2.79858 3.91672 2.22408 4.33347C1.64958 4.75023 1.17064 5.28481 0.819284 5.90148C0.281813 6.82542 0.0521017 7.89634 0.16336 8.95943C0.274618 10.0225 0.721081 11.0227 1.4382 11.8153C1.21375 12.4884 1.13564 13.2017 1.20908 13.9074C1.28251 14.6132 1.50581 15.2951 1.86403 15.9076C2.3964 16.8334 3.20897 17.5663 4.18463 18.0006C5.16028 18.4349 6.24865 18.5483 7.29287 18.3243C7.76392 18.8548 8.34272 19.2786 8.99062 19.5676C9.63853 19.8566 10.3406 20.004 11.05 20C12.1197 20.001 13.1621 19.662 14.0266 19.032C14.8911 18.4021 15.5331 17.5137 15.8599 16.4951C16.5551 16.3523 17.2118 16.0629 17.7863 15.6461C18.3607 15.2294 18.8397 14.6949 19.1913 14.0784C19.7224 13.1558 19.9482 12.0889 19.8363 11.0303C19.7244 9.97163 19.2806 8.9754 18.5684 8.18423ZM11.05 18.691C10.174 18.6924 9.32537 18.3853 8.65303 17.8236L8.77128 17.7566L12.7532 15.4581C12.8523 15.4 12.9346 15.3171 12.992 15.2176C13.0493 15.118 13.0798 15.0053 13.0805 14.8904V9.27631L14.7638 10.2501C14.7721 10.2544 14.7793 10.2605 14.7848 10.268C14.7903 10.2755 14.794 10.2843 14.7955 10.2935V14.9456C14.7933 15.9383 14.3981 16.8898 13.6961 17.5917C12.9942 18.2936 12.0427 18.6889 11.05 18.691ZM2.99945 15.2531C2.56009 14.4945 2.40234 13.6052 2.55395 12.7417L2.67228 12.8127L6.65812 15.1112C6.75674 15.1691 6.86902 15.1996 6.98337 15.1996C7.09772 15.1996 7.20999 15.1691 7.30862 15.1112L12.1776 12.3041V14.2478C12.1772 14.2579 12.1744 14.2677 12.1696 14.2766C12.1649 14.2855 12.1581 14.2932 12.15 14.2991L8.11678 16.6251C7.25606 17.121 6.23374 17.255 5.27429 16.9978C4.31484 16.7405 3.49669 16.1131 2.99945 
15.2531ZM1.95078 6.57965C2.39319 5.81612 3.09148 5.23375 3.92203 4.93565V9.66665C3.92054 9.78094 3.94974 9.89355 4.0066 9.99271C4.06346 10.0919 4.14589 10.174 4.24528 10.2304L9.09062 13.0256L7.4072 13.9994C7.39809 14.0042 7.38793 14.0068 7.37762 14.0068C7.3673 14.0068 7.35714 14.0042 7.34803 13.9994L3.32278 11.6773C2.46367 11.1793 1.83691 10.3612 1.57976 9.40204C1.3226 8.44291 1.45602 7.42095 1.95078 6.55998V6.57965ZM15.781 9.79281L10.92 6.96998L12.5995 5.99998C12.6086 5.99514 12.6187 5.99261 12.629 5.99261C12.6393 5.99261 12.6495 5.99514 12.6586 5.99998L16.6839 8.32606C17.2993 8.68119 17.8011 9.20407 18.1305 9.83365C18.4599 10.4632 18.6035 11.1735 18.5444 11.8816C18.4853 12.5898 18.2259 13.2664 17.7967 13.8327C17.3674 14.3989 16.7859 14.8314 16.1201 15.0796V10.3486C16.1166 10.2345 16.0835 10.1232 16.0241 10.0258C15.9646 9.92833 15.8809 9.8481 15.781 9.79281ZM17.4566 7.27356L17.3383 7.20256L13.3604 4.8844C13.2611 4.82617 13.1482 4.79547 13.0331 4.79547C12.9181 4.79547 12.8051 4.82617 12.7059 4.8844L7.84095 7.6914V5.74781C7.83992 5.73793 7.84157 5.72795 7.84573 5.71893C7.8499 5.70991 7.85642 5.70218 7.86462 5.69656L11.8899 3.3744C12.5068 3.01899 13.2122 2.84659 13.9235 2.87736C14.6348 2.90813 15.3226 3.14079 15.9066 3.54813C16.4905 3.95548 16.9464 4.52066 17.2209 5.17759C17.4954 5.83452 17.5772 6.55602 17.4567 7.25773L17.4566 7.27356ZM6.9222 10.7191L5.23887 9.74931C5.23045 9.74424 5.22324 9.73738 5.21777 9.72921C5.2123 9.72105 5.20869 9.71178 5.2072 9.70206V5.06181C5.20813 4.34996 5.41169 3.65307 5.79408 3.05264C6.17647 2.45222 6.72189 1.97305 7.36657 1.67118C8.01125 1.3693 8.72854 1.2572 9.43459 1.34796C10.1406 1.43873 10.8063 1.72861 11.3536 2.18373L11.2353 2.25081L7.25345 4.54915C7.15435 4.60727 7.07207 4.69017 7.01469 4.78971C6.95732 4.88925 6.92682 5.00201 6.9262 5.1169L6.9222 10.7191ZM7.83687 8.74798L10.0052 7.49815L12.1776 8.74798V11.2475L10.0131 12.4972L7.84087 11.2475L7.83687 8.74798Z"
+            fill={color}
+          />
+        </svg>
       </div>
     )
   }
 
-  return <img src={avatarImages[ai]} alt={ai} className="w-[24px] h-[24px] rounded-full" />
+  return <img src={avatarImages[ai]} alt={ai} className="rounded-full" style={{ width: px, height: px }} />
 }
diff --git a/app/src/components/chess/create-game/create-game.tsx b/app/src/components/chess/create-game/create-game.tsx
index 17070a4..7e71b4e 100644
--- a/app/src/components/chess/create-game/create-game.tsx
+++ b/app/src/components/chess/create-game/create-game.tsx
@@ -23,8 +23,11 @@ export const CreateGame: React.FC<Props> = ({ onGameCreated, onCancel }) => {
 
     try {
       const game = await createGame({
-        white: { ai: whitePlayer.ai, model: whitePlayer.model },
-        black: { ai: blackPlayer.ai, model: blackPlayer.model },
+        players: {
+          white: { ai: whitePlayer.ai, model: whitePlayer.model },
+          black: { ai: blackPlayer.ai, model: blackPlayer.model },
+        },
+        variant: 'guided', // Always use guided mode (with legal moves)
       })
 
       onGameCreated(game.id)
@@ -66,13 +69,15 @@ export const CreateGame: React.FC<Props> = ({ onGameCreated, onCancel }) => {
   return (
     <div className="flex flex-col flex-1 gap-14 items-center justify-between w-full">
       <TopBar onBack={onBack} />
-      <CreateGamePlayerForm
-        player={selectedPlayer}
-        color={selectedPlayerColor}
-        onSubmit={handlePlayerSubmit}
-        isAiEnabled
-        isLoading={isLoading}
-      />
+      <div className="flex flex-col gap-6 w-full items-center">
+        <CreateGamePlayerForm
+          player={selectedPlayer}
+          color={selectedPlayerColor}
+          onSubmit={handlePlayerSubmit}
+          isAiEnabled
+          isLoading={isLoading}
+        />
+      </div>
     </div>
   )
 }
diff --git a/app/src/components/layout.tsx b/app/src/components/layout.tsx
new file mode 100644
index 0000000..8dad448
--- /dev/null
+++ b/app/src/components/layout.tsx
@@ -0,0 +1,105 @@
+import { cn } from '@/lib/utils'
+import { BarChart3, Trophy, BookOpen } from 'lucide-react'
+
+type LayoutProps = {
+  children: React.ReactNode
+  leftPanel?: React.ReactNode
+}
+
+// Two-column page shell: an optional `leftPanel` column beside the blurred main column that hosts `children`.
+// Below the md breakpoint the left column is hidden and the main column is the only one rendered.
+export const Layout = ({ children, leftPanel }: LayoutProps) => (
+  <div className="grid grid-cols-1 md:grid-cols-[minmax(50%,1fr)_minmax(auto,500px)] h-dvh bg-image-landing">
+    {/* Left column: not shown on mobile, sits directly on the page background */}
+    <div className="hidden md:flex md:flex-col p-6 overflow-y-auto">
+      {leftPanel}
+    </div>
+
+    {/* Right column: translucent, blurred surface for the page content */}
+    <div className="flex flex-col w-full h-full p-6 gap-6 md:col-start-2 md:border-l-2 md:border-white/5 bg-background/60 md:bg-background/35 backdrop-blur-lg overflow-y-auto">
+      {children}
+    </div>
+  </div>
+)
+
+// Sidebar with tabs for the left panel - designed to blend with bg-image-landing
+type SidebarPanelProps = {
+  activeTab: 'benchmarks' | 'leaderboard' | 'methodology'
+  onTabChange: (tab: 'benchmarks' | 'leaderboard' | 'methodology') => void
+  children: React.ReactNode
+}
+
+// Left-panel chrome: branding row, three tab buttons, a scrollable content card for `children`, and a footer link.
+// The selected tab is fully controlled by the parent through `activeTab` / `onTabChange`.
+export const SidebarPanel = ({ activeTab, onTabChange, children }: SidebarPanelProps) => (
+  <div className="flex flex-col h-full max-h-[min(calc(100dvh-48px),1280px)] my-auto mx-auto w-full max-w-[650px]">
+    {/* Branding row: logo, product name and version label */}
+    <div className="flex items-center justify-between mb-4 px-1">
+      <div className="flex items-center gap-2">
+        <img src="/motia.png" alt="" className="w-6 h-6 opacity-70" />
+        <span className="text-xs font-bold tracking-[0.15em] text-white/60 uppercase">Chess Bench</span>
+      </div>
+      <span className="text-[10px] text-white/30 font-mono">v1.0</span>
+    </div>
+
+    {/* Tab strip: one pill button per tab id */}
+    <div className="flex items-center gap-1.5 mb-4">
+      <TabButton
+        active={activeTab === 'benchmarks'}
+        onClick={() => onTabChange('benchmarks')}
+        icon={<BarChart3 size={14} />}
+        label="Benchmarks"
+      />
+      <TabButton
+        active={activeTab === 'leaderboard'}
+        onClick={() => onTabChange('leaderboard')}
+        icon={<Trophy size={14} />}
+        label="Arena"
+      />
+      <TabButton
+        active={activeTab === 'methodology'}
+        onClick={() => onTabChange('methodology')}
+        icon={<BookOpen size={14} />}
+        label="Methodology"
+      />
+    </div>
+
+    {/* Content card: the outer box clips, the inner box scrolls */}
+    <div className="flex-1 overflow-hidden rounded-2xl border border-white/[0.08] bg-gradient-to-b from-black/30 to-black/50 backdrop-blur-md shadow-2xl shadow-black/20">
+      <div className="h-full overflow-y-auto scrollbar-thin">
+        {children}
+      </div>
+    </div>
+
+    {/* Footer attribution link */}
+    <div className="flex items-center justify-center gap-2 mt-4 text-[10px] text-white/25">
+      <span>Powered by</span>
+      <a href="https://motia.dev" target="_blank" rel="noreferrer" className="text-white/40 hover:text-white/60 transition-colors font-medium">
+        Motia
+      </a>
+    </div>
+  </div>
+)
+
+// Tab button component
+type TabButtonProps = {
+  active: boolean
+  onClick: () => void
+  icon: React.ReactNode
+  label: string
+}
+
+// Pill-style tab button. type="button" keeps a click from submitting an enclosing form (the HTML default is "submit").
+const TabButton = ({ active, onClick, icon, label }: TabButtonProps) => (
+  <button type="button" onClick={onClick}
+    className={cn(
+      'flex-1 flex items-center justify-center gap-1.5 px-3 py-2 rounded-xl text-xs font-semibold transition-all duration-200',
+      active
+        ? 'bg-white/10 text-white border border-white/10 shadow-lg shadow-black/10'
+        : 'text-white/40 hover:text-white/70 hover:bg-white/5 border border-transparent'
+    )}
+  >
+    {icon}
+    <span className="hidden sm:inline">{label}</span>
+  </button>
+)
diff --git a/app/src/index.css b/app/src/index.css
index 8f65e24..920c29e 100644
--- a/app/src/index.css
+++ b/app/src/index.css
@@ -167,9 +167,10 @@
   body,
   #root {
     width: 100dvw;
-    height: 100dvh;
+    min-height: 100dvh;
     padding: 0;
-    overflow: hidden;
+    overflow-x: hidden;
+    overflow-y: auto;
   }
 
   strong {
diff --git a/app/src/lib/use-create-game.ts b/app/src/lib/use-create-game.ts
index a39d03c..64e0b81 100644
--- a/app/src/lib/use-create-game.ts
+++ b/app/src/lib/use-create-game.ts
@@ -1,9 +1,15 @@
-import type { Game } from '@chessarena/types/game'
+import type { BenchmarkVariant, Game } from '@chessarena/types/game'
 import { apiClient } from './auth/api-client'
 import type { Players } from './types'
 
+type CreateGameParams = {
+  players: Players
+  variant?: BenchmarkVariant
+}
+
 export const useCreateGame = () => {
-  const createGame = async (players: Players): Promise<Game> => apiClient.post<Game>('/chess/create-game', { players })
+  const createGame = async ({ players, variant = 'guided' }: CreateGameParams): Promise<Game> =>
+    apiClient.post<Game>('/chess/create-game', { players, variant })
 
   return createGame
 }
diff --git a/app/src/lib/use-game-history.ts b/app/src/lib/use-game-history.ts
new file mode 100644
index 0000000..b6397a6
--- /dev/null
+++ b/app/src/lib/use-game-history.ts
@@ -0,0 +1,82 @@
+import { useCallback, useEffect, useState } from 'react'
+import { apiClient } from './auth/api-client'
+import type { GameHistory, GameHistoryFilter } from '@chessarena/types/game-history'
+
+type GameHistoryListItem = Omit<GameHistory, 'moves' | 'messages'>
+
+type GameHistoryResponse = {
+  games: GameHistoryListItem[]
+  total: number
+  limit: number
+  offset: number
+}
+
+/** List hook: loads `/chess/history` for the stored filter and reloads whenever that filter changes. */
+export const useGameHistory = (initialFilter: Partial<GameHistoryFilter> = {}) => {
+  const [games, setGames] = useState<GameHistoryListItem[]>([])
+  const [total, setTotal] = useState(0)
+  const [loading, setLoading] = useState(true)
+  const [error, setError] = useState<string | null>(null)
+  const [filter, setFilter] = useState<Partial<GameHistoryFilter>>(initialFilter)
+
+  // `params` are per-call overrides layered on top of the stored filter; unset and '' values are not sent.
+  const fetchHistory = useCallback(async (params: Partial<GameHistoryFilter> = {}) => {
+    setLoading(true)
+    setError(null)
+    try {
+      const search = new URLSearchParams()
+      for (const [name, raw] of Object.entries({ ...filter, ...params })) {
+        if (raw === undefined || raw === null || raw === '') continue
+        search.append(name, String(raw))
+      }
+      const path = `/chess/history?${search.toString()}`
+      const { games: page, total: count } = await apiClient.get<GameHistoryResponse>(path)
+      setGames(page)
+      setTotal(count)
+    } catch (cause) {
+      setError('Failed to load game history')
+      console.error(cause)
+    } finally {
+      setLoading(false)
+    }
+  }, [filter])
+
+  const updateFilter = useCallback(
+    (newFilter: Partial<GameHistoryFilter>) => setFilter((current) => ({ ...current, ...newFilter })),
+    []
+  )
+
+  useEffect(() => {
+    fetchHistory()
+  }, [fetchHistory])
+
+  return { games, total, loading, error, filter, updateFilter, refetch: fetchHistory }
+}
+
+/** Detail hook: loads `/chess/history/:id` for `gameId`; no request is made while `gameId` is null or empty. */
+export const useGameHistoryDetail = (gameId: string | null) => {
+  const [game, setGame] = useState<GameHistory | null>(null)
+  const [loading, setLoading] = useState(false)
+  const [error, setError] = useState<string | null>(null)
+
+  const fetchGame = useCallback(async (id: string) => {
+    setLoading(true)
+    setError(null)
+    try {
+      setGame(await apiClient.get<GameHistory>(`/chess/history/${id}`))
+    } catch (cause) {
+      setError('Failed to load game details')
+      console.error(cause)
+    } finally {
+      setLoading(false)
+    }
+  }, [])
+
+  useEffect(() => {
+    // Without an id nothing is fetched; a previously loaded game stays in state.
+    if (!gameId) return
+    fetchGame(gameId)
+  }, [gameId, fetchGame])
+
+  return { game, loading, error, refetch: () => gameId && fetchGame(gameId) }
+}
diff --git a/app/src/pages/arena-page.tsx b/app/src/pages/arena-page.tsx
new file mode 100644
index 0000000..c6a7c32
--- /dev/null
+++ b/app/src/pages/arena-page.tsx
@@ -0,0 +1,150 @@
+import { useMemo } from 'react'
+import { useNavigate } from 'react-router'
+import { Swords, Bot, Trophy, ChevronRight } from 'lucide-react'
+import { usePageTitle } from '@/lib/use-page-title'
+import { cn } from '@/lib/utils'
+import { Leaderboard } from '@/components/leaderboard/leaderboard'
+import { LiveMatch } from '@/components/live-match'
+import type { LiveAiGames } from '@chessarena/types/live-ai-games'
+import { useStreamGroup } from '@motiadev/stream-client-react'
+import { Layout } from '@/components/layout'
+
+export const ArenaPage = () => {
+  const navigate = useNavigate()
+  usePageTitle('Arena')
+
+  const { data: liveAiGames } = useStreamGroup<LiveAiGames>({ streamName: 'chessLiveAiGames', groupId: 'game' })
+  const sortedLive = useMemo(() => liveAiGames.slice().reverse(), [liveAiGames]) // reversed copy (slice first, so the stream's array is not mutated); only the first 5 are rendered below
+
+  return (
+    <Layout>
+      <div className="flex flex-col gap-10 animate-in fade-in duration-700 slide-in-from-bottom-4">
+        {/* Hero Section */}
+        <section>
+          <h1 className="text-4xl md:text-5xl font-bold text-white tracking-tight mb-4">Arena</h1>
+          <p className="text-white/50 text-lg leading-relaxed max-w-3xl">
+            Watch models battle each other, challenge a random AI agent, or set up bot-vs-bot matches. Everything is logged and prompts are visible.
+          </p>
+
+          <div className="mt-8 flex flex-wrap gap-3">
+            <button
+              onClick={() => navigate('/play-ai')}
+              className="px-6 py-2.5 bg-gradient-to-r from-emerald-600 to-teal-600 text-white text-sm font-semibold rounded-lg hover:from-emerald-500 hover:to-teal-500 transition-all shadow-lg hover:shadow-emerald-500/20"
+            >
+              Challenge random AI
+            </button>
+            <button
+              onClick={() => navigate('/new')}
+              className="px-6 py-2.5 bg-white text-black text-sm font-semibold rounded-lg hover:bg-white/90 transition-colors shadow-lg shadow-white/5"
+            >
+              Setup AI vs AI
+            </button>
+            <button
+              onClick={() => navigate('/live-matches')}
+              className="px-6 py-2.5 bg-white/5 border border-white/10 text-white/80 text-sm font-medium rounded-lg hover:bg-white/10 transition-colors"
+            >
+              View live matches
+            </button>
+          </div>
+        </section>
+
+        {/* Content Grid */}
+        <section className="grid grid-cols-1 lg:grid-cols-12 gap-6">
+          {/* Main Leaderboard Column */}
+          <div className="lg:col-span-7 rounded-2xl border border-white/5 bg-white/[0.02] overflow-hidden">
+            <div className="px-6 py-4 border-b border-white/5 flex items-center justify-between bg-white/[0.02]">
+              <div className="flex items-center gap-2 text-white">
+                <Trophy size={18} className="text-emerald-400" />
+                <div className="font-semibold">Model vs Model Leaderboard</div>
+              </div>
+              <div className="flex items-center gap-2">
+                 <div className="h-2 w-2 rounded-full bg-emerald-500 animate-pulse" />
+                 <div className="text-xs text-white/40 font-medium tracking-wide uppercase">Live</div>
+              </div>
+            </div>
+            <Leaderboard className="md:rounded-none md:border-0" />
+            <div className="p-4 border-t border-white/5 text-center">
+              <button
+                onClick={() => navigate('/leaderboard')}
+                className="text-xs font-medium text-white/40 hover:text-white transition-colors flex items-center justify-center gap-1 mx-auto"
+              >
+                View Full Ranking <ChevronRight size={12} />
+              </button>
+            </div>
+          </div>
+
+          {/* Sidebar Column */}
+          <div className="lg:col-span-5 flex flex-col gap-6">
+            {/* Live Matches Card */}
+            <div className="rounded-2xl border border-white/5 bg-white/[0.02] p-6">
+              <div className="flex items-center gap-2 text-white mb-2">
+                <Swords size={18} className="text-sky-400" />
+                <div className="font-semibold">Live Matches</div>
+              </div>
+              <div className="text-sm text-white/50 mb-6">Jump into ongoing games between models.</div>
+              <div className={cn('flex flex-col gap-3', sortedLive.length === 0 && 'text-white/40 text-sm')}>
+                {sortedLive.length === 0 ? (
+                  <div className="py-8 text-center border border-dashed border-white/10 rounded-xl">
+                    No live matches right now.
+                  </div>
+                ) : (
+                  sortedLive.slice(0, 5).map((game) => (
+                    <LiveMatch
+                      key={game.id}
+                      white={game.players.white}
+                      black={game.players.black}
+                      onClick={() => navigate(`/game/${game.id}`)}
+                    />
+                  ))
+                )}
+              </div>
+              {sortedLive.length > 0 && (
+                <div className="mt-6">
+                  <button
+                    onClick={() => navigate('/live-matches')}
+                    className="w-full py-2 text-xs font-medium text-white/40 hover:text-white bg-white/5 hover:bg-white/10 rounded-lg transition-colors"
+                  >
+                    See all live matches
+                  </button>
+                </div>
+              )}
+            </div>
+
+            {/* Match Controls Card */}
+            <div className="rounded-2xl border border-white/5 bg-white/[0.02] p-6">
+              <div className="flex items-center gap-2 text-white mb-2">
+                <Bot size={18} className="text-amber-400" />
+                <div className="font-semibold">Match controls</div>
+              </div>
+              <div className="text-sm text-white/50 mb-6">
+                Create bot-vs-bot games or play against a randomly selected model.
+              </div>
+              <div className="grid grid-cols-1 sm:grid-cols-2 gap-3">
+                <button
+                  onClick={() => navigate('/new')}
+                  className="rounded-xl border border-white/10 bg-black/20 hover:bg-white/5 transition-colors px-4 py-3 text-left group"
+                >
+                  <div className="text-white font-medium text-sm group-hover:text-emerald-400 transition-colors">AI vs AI</div>
+                  <div className="text-white/40 text-xs mt-1">Pick two models and watch.</div>
+                </button>
+                <button
+                  onClick={() => navigate('/play-ai')}
+                  className="rounded-xl border border-white/10 bg-black/20 hover:bg-white/5 transition-colors px-4 py-3 text-left group"
+                >
+                  <div className="text-white font-medium text-sm group-hover:text-sky-400 transition-colors">Challenge AI</div>
+                  <div className="text-white/40 text-xs mt-1">Random opponent, pick color.</div>
+                </button>
+              </div>
+              <div className="mt-6 text-[10px] text-white/30 text-center leading-relaxed max-w-xs mx-auto">
+                Prompts + raw model responses are shown in-game. Bench prompt transparency lives on the Bench page.
+              </div>
+            </div>
+          </div>
+        </section>
+      </div>
+    </Layout>
+  )
+}
+
+
+
diff --git a/app/src/pages/bench-page.tsx b/app/src/pages/bench-page.tsx
new file mode 100644
index 0000000..2136f76
--- /dev/null
+++ b/app/src/pages/bench-page.tsx
@@ -0,0 +1,241 @@
+import { useState, useMemo } from 'react'
+import { Layout } from '@/components/layout'
+import { usePageTitle } from '@/lib/use-page-title'
+import { Dialog, DialogContent, DialogHeader, DialogTitle, DialogTrigger } from '@/components/ui/dialog'
+import { Tab } from '@/components/ui/tab'
+import { MiniArea } from '@/components/bench/bench-charts'
+import { BenchBarChart } from '@/components/bench/bench-bar-charts'
+import { mockBenchLeaderboard, mockBenchTimeseries, mockPrompts } from '@/components/bench/bench-mock'
+import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter'
+import { vscDarkPlus } from 'react-syntax-highlighter/dist/esm/styles/prism'
+import { BarChart3, ShieldCheck, Swords, Brain } from 'lucide-react'
+import { useStreamGroup } from '@motiadev/stream-client-react'
+import type { LegalMoveBenchmarkSummary } from '@chessarena/types/legal-move-benchmark'
+import type { PuzzleBenchmarkSummary } from '@chessarena/types/puzzle-benchmark'
+import type { StockfishBenchmarkSummary } from '@chessarena/types/stockfish-benchmark'
+
+export const BenchPage = () => {
+  usePageTitle('Motia Chess Bench')
+  const [promptTab, setPromptTab] = useState('legal') // which prompt is shown in the dialog: 'legal' | 'puzzle' | 'guided'
+
+  // Stream data for real-time benchmarks
+  const { data: legalSummaries } = useStreamGroup<LegalMoveBenchmarkSummary>({
+    streamName: 'legalMoveBenchmarkSummary',
+    groupId: 'models',
+  })
+  const { data: puzzleSummaries } = useStreamGroup<PuzzleBenchmarkSummary>({
+    streamName: 'puzzleBenchmarkSummary',
+    groupId: 'models',
+  })
+  const { data: stockfishSummaries } = useStreamGroup<StockfishBenchmarkSummary>({
+    streamName: 'stockfishBenchmarkSummary',
+    groupId: 'models',
+  })
+
+  // Merge real stream data with mock data as fallback
+  const benchRows = useMemo(() => {
+    const legalById = new Map(legalSummaries.map((s) => [`${s.provider}:${s.model}`, s])) // keyed "provider:model"; looked up with row.id below
+    const puzzleById = new Map(puzzleSummaries.map((s) => [`${s.provider}:${s.model}`, s]))
+    const stockfishById = new Map(stockfishSummaries.map((s) => [`${s.provider}:${s.model}`, s]))
+
+    return mockBenchLeaderboard.map((row) => { // only rows present in the mock leaderboard are produced; stream values override the mock ones per metric
+      const legal = legalById.get(row.id)
+      const puzzle = puzzleById.get(row.id)
+      const stockfish = stockfishById.get(row.id)
+
+      const legalMoveScore = legal?.averageScore ?? row.legalMoveScore
+      const puzzleScore = puzzle?.overallAccuracy ?? row.puzzleScore
+      const acpl = stockfish?.averageAcpl ?? row.acpl
+      const acplScore = Math.max(0, 100 - acpl) // inverts ACPL (lower is better) into a score, clamped at 0
+      const motiaChessIndex = Number((0.4 * legalMoveScore + 0.3 * puzzleScore + 0.3 * acplScore).toFixed(1)) // weights 0.4 / 0.3 / 0.3, one decimal
+      const lastUpdatedAt = Math.max(
+        legal?.lastRunAt ?? 0,
+        puzzle?.lastRunAt ?? 0,
+        stockfish?.lastRunAt ?? 0,
+        row.lastUpdatedAt
+      )
+
+      return { // NOTE(review): acplScore is not placed on the row, yet a chart below requests metric="acplScore" — confirm it is derived from acpl elsewhere
+        ...row,
+        legalMoveScore,
+        puzzleScore,
+        acpl,
+        motiaChessIndex,
+        lastUpdatedAt,
+      }
+    })
+  }, [legalSummaries, puzzleSummaries, stockfishSummaries])
+
+  // Calculate global averages from stream data
+  const globalAverages = useMemo(() => {
+    const hasRealData = legalSummaries.length > 0 || puzzleSummaries.length > 0 || stockfishSummaries.length > 0
+
+    if (!hasRealData) { // no stream data at all: use the latest mock datapoints as-is (not rounded)
+      return {
+        legalMoveScore: mockBenchTimeseries.legalMoveScore.at(-1)?.v ?? 0,
+        puzzleScore: mockBenchTimeseries.puzzleScore.at(-1)?.v ?? 0,
+        acpl: mockBenchTimeseries.acpl.at(-1)?.v ?? 0,
+      }
+    }
+
+    const avgLegal = legalSummaries.length > 0
+      ? legalSummaries.reduce((sum, s) => sum + s.averageScore, 0) / legalSummaries.length
+      : mockBenchTimeseries.legalMoveScore.at(-1)?.v ?? 0
+
+    const avgPuzzle = puzzleSummaries.length > 0
+      ? puzzleSummaries.reduce((sum, s) => sum + (s.overallAccuracy ?? 0), 0) / puzzleSummaries.length
+      : mockBenchTimeseries.puzzleScore.at(-1)?.v ?? 0
+
+    const avgAcpl = stockfishSummaries.length > 0
+      ? stockfishSummaries.reduce((sum, s) => sum + s.averageAcpl, 0) / stockfishSummaries.length
+      : mockBenchTimeseries.acpl.at(-1)?.v ?? 0
+
+    return {
+      legalMoveScore: Math.round(avgLegal),
+      puzzleScore: Math.round(avgPuzzle),
+      acpl: Math.round(avgAcpl),
+    }
+  }, [legalSummaries, puzzleSummaries, stockfishSummaries])
+
+  return (
+    <Layout>
+      <div className="flex flex-col gap-12 animate-in fade-in duration-700 slide-in-from-bottom-4">
+        
+        {/* Header Section */}
+        <div className="flex flex-col md:flex-row md:items-end justify-between gap-6 border-b border-white/5 pb-8">
+          <div className="max-w-2xl">
+            <h1 className="text-4xl font-bold text-white tracking-tight mb-4">Benchmark Suite</h1>
+            <p className="text-lg text-white/50 leading-relaxed">
+              Evaluating AI chess capabilities across <span className="text-emerald-400">legal move generation</span>,{' '}
+              <span className="text-sky-400">puzzle solving</span>, and <span className="text-amber-400">move quality</span>.
+              Transparent, reproducible, and open source.
+            </p>
+          </div>
+          <div className="flex items-center gap-3 shrink-0">
+             <Dialog>
+              <DialogTrigger asChild>
+                <button className="px-4 py-2 rounded-lg bg-white/5 border border-white/10 text-white/70 text-sm font-medium hover:bg-white/10 hover:text-white transition-all">
+                  View Prompts
+                </button>
+              </DialogTrigger>
+              <DialogContent className="max-w-4xl bg-[#09090b] border-white/10">
+                <DialogHeader>
+                  <DialogTitle className="text-white">Benchmark Prompts (Transparency)</DialogTitle>
+                </DialogHeader>
+                <div className="w-full mt-4">
+                  <div className="flex flex-row overflow-x-auto border-b border-white/10 mb-4">
+                    <Tab isSelected={promptTab === 'legal'} onClick={() => setPromptTab('legal')}>
+                      Legal moves
+                    </Tab>
+                    <Tab isSelected={promptTab === 'puzzle'} onClick={() => setPromptTab('puzzle')}>
+                      Puzzles
+                    </Tab>
+                    <Tab isSelected={promptTab === 'guided'} onClick={() => setPromptTab('guided')}>
+                      Arena (guided)
+                    </Tab>
+                  </div>
+                  <div className="rounded-lg overflow-hidden border border-white/10">
+                    {promptTab === 'legal' && (
+                    <SyntaxHighlighter language="text" style={vscDarkPlus} customStyle={{ margin: 0, padding: '1.5rem' }}>
+                      {mockPrompts.legalMoveBench}
+                    </SyntaxHighlighter>
+                    )}
+                    {promptTab === 'puzzle' && (
+                    <SyntaxHighlighter language="text" style={vscDarkPlus} customStyle={{ margin: 0, padding: '1.5rem' }}>
+                      {mockPrompts.puzzleBench}
+                    </SyntaxHighlighter>
+                    )}
+                    {promptTab === 'guided' && (
+                    <SyntaxHighlighter language="text" style={vscDarkPlus} customStyle={{ margin: 0, padding: '1.5rem' }}>
+                      {mockPrompts.aiPlayerGuided}
+                    </SyntaxHighlighter>
+                    )}
+                  </div>
+                </div>
+              </DialogContent>
+            </Dialog>
+          </div>
+        </div>
+
+        {/* Global Metrics Cards */}
+        <div className="grid grid-cols-1 md:grid-cols-3 gap-6">
+          <div className="rounded-2xl border border-white/5 bg-white/[0.02] p-6 group hover:bg-white/[0.04] transition-colors">
+            <div className="flex items-center gap-3 mb-2">
+              <div className="p-2 rounded-lg bg-emerald-500/10 text-emerald-400">
+                <ShieldCheck size={20} />
+              </div>
+              <div className="font-semibold text-white">Legal Move Gen</div>
+            </div>
+            <div className="text-3xl font-bold text-white mb-1">{globalAverages.legalMoveScore}%</div>
+            <div className="text-sm text-white/40 mb-6">Global Average (7d)</div>
+            <MiniArea points={mockBenchTimeseries.legalMoveScore} stroke="#34d399" height={48} className="opacity-50 group-hover:opacity-100 transition-opacity" />
+          </div>
+
+          <div className="rounded-2xl border border-white/5 bg-white/[0.02] p-6 group hover:bg-white/[0.04] transition-colors">
+            <div className="flex items-center gap-3 mb-2">
+               <div className="p-2 rounded-lg bg-sky-500/10 text-sky-400">
+                <Swords size={20} />
+              </div>
+              <div className="font-semibold text-white">Puzzle Solving</div>
+            </div>
+            <div className="text-3xl font-bold text-white mb-1">{globalAverages.puzzleScore}%</div>
+            <div className="text-sm text-white/40 mb-6">Global Average (7d)</div>
+            <MiniArea points={mockBenchTimeseries.puzzleScore} stroke="#38bdf8" height={48} className="opacity-50 group-hover:opacity-100 transition-opacity" />
+          </div>
+
+          <div className="rounded-2xl border border-white/5 bg-white/[0.02] p-6 group hover:bg-white/[0.04] transition-colors">
+            <div className="flex items-center gap-3 mb-2">
+               <div className="p-2 rounded-lg bg-amber-500/10 text-amber-400">
+                <Brain size={20} />
+              </div>
+              <div className="font-semibold text-white">Move Quality</div>
+            </div>
+            <div className="text-3xl font-bold text-white mb-1">{globalAverages.acpl}</div>
+            <div className="text-sm text-white/40 mb-6">Avg ACPL (7d)</div>
+            <MiniArea points={mockBenchTimeseries.acpl} stroke="#fbbf24" height={48} className="opacity-50 group-hover:opacity-100 transition-opacity" />
+          </div>
+        </div>
+
+        {/* Comparison Charts */}
+        <div className="space-y-8">
+           <div className="flex items-center gap-2 mb-4">
+              <BarChart3 className="text-white/60" size={20} />
+              <h2 className="text-xl font-semibold text-white">Model Comparison</h2>
+           </div>
+           
+           <div className="grid grid-cols-1 lg:grid-cols-2 gap-6">
+              <BenchBarChart
+                title="Motia Chess Index"
+                description="Aggregated score combining accuracy, puzzle solving, and legality."
+                rows={benchRows}
+                metric="motiaChessIndex"
+              />
+              <BenchBarChart
+                title="Legal vs Illegal Moves"
+                description="Percentage of strictly legal moves generated vs illegal/missed attempts."
+                rows={benchRows}
+                metric="legalVsIllegal"
+              />
+              <BenchBarChart
+                title="Puzzle Solving Accuracy"
+                description="Success rate on standard mate-in-1 and tactics puzzles."
+                rows={benchRows}
+                metric="puzzleScore"
+                unit="%"
+              />
+              <BenchBarChart
+                title="Move Quality (ACPL)"
+                description="Average Centipawn Loss against Stockfish 16 (lower is better)."
+                rows={benchRows}
+                metric="acplScore"
+              />
+           </div>
+        </div>
+
+      </div>
+    </Layout>
+  )
+}
+
+
+
diff --git a/app/src/pages/game-history-page.tsx b/app/src/pages/game-history-page.tsx
new file mode 100644
index 0000000..9c891ae
--- /dev/null
+++ b/app/src/pages/game-history-page.tsx
@@ -0,0 +1,259 @@
+import { useState } from 'react'
+import { useNavigate } from 'react-router'
+import { ArrowLeft, Download, Filter, Clock, Trophy, AlertTriangle, ChevronRight } from 'lucide-react'
+import { usePageTitle } from '@/lib/use-page-title'
+import { useGameHistory } from '@/lib/use-game-history'
+import { AiIcon } from '@/components/chess/ai-icon'
+import { cn } from '@/lib/utils'
+import type { BenchmarkVariant } from '@chessarena/types/game'
+import type { AiModelProvider } from '@chessarena/types/ai-models'
+
+/** Formats milliseconds as `Xh Ym`, `Xm Ys` or `Xs`; negative or non-finite input is clamped to `0s`. */
+const formatDuration = (ms: number) => {
+  const seconds = Number.isFinite(ms) ? Math.max(0, Math.floor(ms / 1000)) : 0
+  const minutes = Math.floor(seconds / 60)
+  const hours = Math.floor(minutes / 60)
+  if (hours > 0) return `${hours}h ${minutes % 60}m`
+  if (minutes > 0) return `${minutes}m ${seconds % 60}s`
+  return `${seconds}s`
+}
+
+/** Formats an epoch-millisecond timestamp in the en-US locale as short month, day, and 2-digit hour and minute. */
+const formatDate = (timestamp: number) => {
+  const options: Intl.DateTimeFormatOptions = {
+    month: 'short', day: 'numeric',
+    hour: '2-digit', minute: '2-digit',
+  }
+  return new Date(timestamp).toLocaleDateString('en-US', options)
+}
+
+/** Game history page: a filterable list of games with CSV/JSON export; clicking a row navigates to `/history/:id`. */
+export const GameHistoryPage = () => {
+  const navigate = useNavigate()
+  usePageTitle('Game History')
+  const { games, total, loading, filter, updateFilter } = useGameHistory({ limit: 20 })
+  const [showFilters, setShowFilters] = useState(false)
+  // Opens the export endpoint in a new tab, forwarding every defined filter value plus the chosen format.
+  const handleExport = (format: 'json' | 'csv') => {
+    const params = new URLSearchParams()
+    // NOTE(review): `filter` may also carry paging fields (e.g. limit) — confirm the export should honor them.
+    for (const [key, value] of Object.entries(filter)) {
+      if (value != null) params.append(key, String(value))
+    }
+    params.append('format', format)
+    // `noopener` keeps the new tab from getting a `window.opener` handle back to this page.
+    window.open(`${import.meta.env.VITE_API_URL}/chess/history/export?${params.toString()}`, '_blank', 'noopener')
+  }
+
+  return (
+    <div className="min-h-screen bg-[#09090b] relative">
+      <div
+        className="fixed inset-0 opacity-[0.03] pointer-events-none"
+        style={{
+          backgroundImage: `linear-gradient(rgba(255,255,255,0.5) 1px, transparent 1px),
+                           linear-gradient(90deg, rgba(255,255,255,0.5) 1px, transparent 1px)`,
+          backgroundSize: '50px 50px',
+        }}
+      />
+      {/* Page content, stacked above the fixed decorative grid */}
+      <div className="relative z-10">
+        <header className="border-b border-white/10">
+          <div className="max-w-6xl mx-auto px-6 h-14 flex items-center justify-between">
+            <button
+              onClick={() => navigate('/')}
+              className="flex items-center gap-2 text-white/60 hover:text-white transition-colors"
+            >
+              <ArrowLeft size={20} />
+              <span>Back</span>
+            </button>
+            <h1 className="text-lg font-semibold text-white">Game History</h1>
+            <div className="flex items-center gap-2">
+              <button
+                onClick={() => setShowFilters(!showFilters)}
+                className={cn(
+                  'flex items-center gap-1.5 px-3 py-1.5 rounded-lg text-sm transition-colors',
+                  showFilters ? 'bg-white/10 text-white' : 'text-white/60 hover:text-white'
+                )}
+              >
+                <Filter size={16} />
+                Filters
+              </button>
+              <button
+                onClick={() => handleExport('csv')}
+                className="flex items-center gap-1.5 px-3 py-1.5 text-white/60 hover:text-white text-sm transition-colors"
+              >
+                <Download size={16} />
+                CSV
+              </button>
+              <button
+                onClick={() => handleExport('json')}
+                className="flex items-center gap-1.5 px-3 py-1.5 text-white/60 hover:text-white text-sm transition-colors"
+              >
+                <Download size={16} />
+                JSON
+              </button>
+            </div>
+          </div>
+        </header>
+        {/* Filter bar, rendered only while the Filters toggle is on */}
+        {showFilters && (
+          <div className="border-b border-white/10 bg-white/[0.02]">
+            <div className="max-w-6xl mx-auto px-6 py-4 flex flex-wrap gap-4">
+              <select
+                value={filter.variant || ''}
+                onChange={(e) => updateFilter({ variant: e.target.value as BenchmarkVariant || undefined })}
+                className="bg-white/5 border border-white/10 rounded-lg px-3 py-2 text-white text-sm"
+              >
+                <option value="">All Variants</option>
+                <option value="guided">Guided</option>
+                <option value="unguided">Unguided</option>
+              </select>
+
+              <select
+                value={filter.status || ''}
+                onChange={(e) => updateFilter({ status: e.target.value as 'completed' | 'draw' | 'endedEarly' || undefined })}
+                className="bg-white/5 border border-white/10 rounded-lg px-3 py-2 text-white text-sm"
+              >
+                <option value="">All Outcomes</option>
+                <option value="completed">Completed</option>
+                <option value="draw">Draw</option>
+                <option value="endedEarly">Ended Early</option>
+              </select>
+
+              <select
+                value={filter.winner || ''}
+                onChange={(e) => updateFilter({ winner: e.target.value as 'white' | 'black' || undefined })}
+                className="bg-white/5 border border-white/10 rounded-lg px-3 py-2 text-white text-sm"
+              >
+                <option value="">Any Winner</option>
+                <option value="white">White Won</option>
+                <option value="black">Black Won</option>
+              </select>
+
+              <button
+                onClick={() => updateFilter({ variant: undefined, status: undefined, winner: undefined })}
+                className="text-white/40 hover:text-white text-sm transition-colors"
+              >
+                Clear filters
+              </button>
+            </div>
+          </div>
+        )}
+        {/* Result list: loading skeleton, empty state, or one clickable row per game */}
+        <main className="max-w-6xl mx-auto px-6 py-8">
+          <div className="mb-4 text-white/40 text-sm">{total} games found</div>
+
+          {loading ? (
+            <div className="space-y-4">
+              {[...Array(5)].map((_, i) => (
+                <div key={i} className="bg-white/5 rounded-lg h-24 animate-pulse" />
+              ))}
+            </div>
+          ) : games.length === 0 ? (
+            <div className="text-center py-16 text-white/40">
+              No games found. Play some games to see them here!
+            </div>
+          ) : (
+            <div className="space-y-3">
+              {games.map((game) => (
+                <button
+                  key={game.id}
+                  onClick={() => navigate(`/history/${game.id}`)}
+                  className="w-full bg-white/[0.02] hover:bg-white/[0.05] border border-white/10 rounded-lg p-4 transition-colors text-left"
+                >
+                  <div className="flex items-center justify-between">
+                    <div className="flex items-center gap-6">
+                      {/* White Player */}
+                      <div className="flex items-center gap-2">
+                        {game.whitePlayer.provider ? (
+                          <>
+                            <div className="bg-white rounded-full p-1">
+                              <AiIcon ai={game.whitePlayer.provider as AiModelProvider} color="black" size={20} />
+                            </div>
+                            <div>
+                              <div className="text-white text-sm font-medium">{game.whitePlayer.provider}</div>
+                              <div className="text-white/40 text-xs truncate max-w-[120px]">
+                                {game.whitePlayer.model}
+                              </div>
+                            </div>
+                          </>
+                        ) : (
+                          <div className="text-white/60 text-sm">Human</div>
+                        )}
+                        {game.winner === 'white' && (
+                          <Trophy size={14} className="text-yellow-500" />
+                        )}
+                      </div>
+
+                      <span className="text-white/30">vs</span>
+
+                      {/* Black Player */}
+                      <div className="flex items-center gap-2">
+                        {game.blackPlayer.provider ? (
+                          <>
+                            <div className="bg-white rounded-full p-1">
+                              <AiIcon ai={game.blackPlayer.provider as AiModelProvider} color="black" size={20} />
+                            </div>
+                            <div>
+                              <div className="text-white text-sm font-medium">{game.blackPlayer.provider}</div>
+                              <div className="text-white/40 text-xs truncate max-w-[120px]">
+                                {game.blackPlayer.model}
+                              </div>
+                            </div>
+                          </>
+                        ) : (
+                          <div className="text-white/60 text-sm">Human</div>
+                        )}
+                        {game.winner === 'black' && (
+                          <Trophy size={14} className="text-yellow-500" />
+                        )}
+                      </div>
+                    </div>
+
+                    <div className="flex items-center gap-6">
+                      {/* Stats */}
+                      <div className="flex items-center gap-4 text-white/40 text-sm">
+                        <div className="flex items-center gap-1">
+                          <Clock size={14} />
+                          {formatDuration(game.duration)}
+                        </div>
+                        <div>{game.totalMoves} moves</div>
+                        {(game.whiteIllegalMoves > 0 || game.blackIllegalMoves > 0) && (
+                          <div className="flex items-center gap-1 text-amber-500/70">
+                            <AlertTriangle size={14} />
+                            {game.whiteIllegalMoves + game.blackIllegalMoves}
+                          </div>
+                        )}
+                      </div>
+
+                      {/* Variant Badge */}
+                      <span
+                        className={cn(
+                          'px-2 py-0.5 rounded text-xs font-medium',
+                          game.variant === 'unguided'
+                            ? 'bg-amber-500/20 text-amber-400'
+                            : 'bg-emerald-500/20 text-emerald-400'
+                        )}
+                      >
+                        {game.variant}
+                      </span>
+
+                      {/* Date */}
+                      <div className="text-white/30 text-sm w-32 text-right">{formatDate(game.endedAt)}</div>
+
+                      <ChevronRight size={16} className="text-white/20" />
+                    </div>
+                  </div>
+
+                  {game.endGameReason && (
+                    <div className="mt-2 text-white/30 text-xs">{game.endGameReason}</div>
+                  )}
+                </button>
+              ))}
+            </div>
+          )}
+        </main>
+      </div>
+    </div>
+  )
+}
diff --git a/app/src/pages/game-replay-page.tsx b/app/src/pages/game-replay-page.tsx
new file mode 100644
index 0000000..5b76110
--- /dev/null
+++ b/app/src/pages/game-replay-page.tsx
@@ -0,0 +1,389 @@
+import { useState, useEffect, useMemo } from 'react'
+import { useParams, useNavigate } from 'react-router'
+import {
+  ArrowLeft,
+  ChevronLeft,
+  ChevronRight,
+  ChevronsLeft,
+  ChevronsRight,
+  Play,
+  Pause,
+  Download,
+  Trophy,
+  AlertTriangle,
+  MessageSquare,
+} from 'lucide-react'
+import { Chessground } from '@/components/chess/chessground'
+import { usePageTitle } from '@/lib/use-page-title'
+import { useGameHistoryDetail } from '@/lib/use-game-history'
+import { AiIcon } from '@/components/chess/ai-icon'
+import { cn } from '@/lib/utils'
+import type { AiModelProvider } from '@chessarena/types/ai-models'
+import type { Key } from '@lichess-org/chessground/types'
+
+const INITIAL_FEN = 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1' // Standard chess starting position (FEN).
+
+/** Replay page for a single stored game: board with step and auto-play controls, move list, AI reasoning, result and score. */
+export const GameReplayPage = () => {
+  const { gameId } = useParams<{ gameId: string }>()
+  const navigate = useNavigate()
+  usePageTitle('Game Replay')
+  const { game, loading, error } = useGameHistoryDetail(gameId || null)
+  const [currentMoveIndex, setCurrentMoveIndex] = useState(-1) // -1 = starting position, before any move
+  const [isPlaying, setIsPlaying] = useState(false)
+  const [showMessages, setShowMessages] = useState(true)
+  // FEN shown on the board: the starting position before the first move, otherwise the position after the selected move.
+  const currentFen = useMemo(() => {
+    if (!game || currentMoveIndex < 0) return INITIAL_FEN
+    return game.moves[currentMoveIndex]?.fenAfter || INITIAL_FEN
+  }, [game, currentMoveIndex])
+  // The selected move record, or null while at the starting position.
+  const currentMove = useMemo(() => {
+    if (!game || currentMoveIndex < 0) return null
+    return game.moves[currentMoveIndex]
+  }, [game, currentMoveIndex])
+  // Squares of the selected move, passed to the board as its last-move highlight.
+  const lastMoveHighlight = useMemo((): [Key, Key] | undefined => {
+    if (!currentMove) return undefined
+    return currentMove.lastMove as [Key, Key]
+  }, [currentMove])
+
+  // Reasoning message for the selected move: the n-th message whose role matches the side that moved,
+  // where n = floor(moveIndex / 2). NOTE(review): assumes one message per move and strict alternation — confirm.
+  const currentThought = useMemo(() => {
+    if (!game || currentMoveIndex < 0) return null
+    const moveColor = game.moves[currentMoveIndex]?.color
+    const relevantMessages = game.messages.filter((m) => m.role === moveColor)
+    const messageIndex = Math.floor(currentMoveIndex / 2)
+    return relevantMessages[messageIndex] || null
+  }, [game, currentMoveIndex])
+
+  // Auto-play: advance one move per second and stop once the last move has been shown.
+  useEffect(() => {
+    if (!isPlaying || !game) return
+
+    const lastIndex = game.moves.length - 1
+    // Decide in the timer callback instead of inside a state updater: React expects updaters to be pure.
+    const timer = setTimeout(() => {
+      if (currentMoveIndex >= lastIndex) {
+        setIsPlaying(false)
+      } else {
+        setCurrentMoveIndex(currentMoveIndex + 1)
+      }
+    }, 1000)
+
+    return () => clearTimeout(timer)
+  }, [isPlaying, game, currentMoveIndex])
+  // Stepping controls; jumping to either end also stops auto-play.
+  const goToStart = () => {
+    setCurrentMoveIndex(-1)
+    setIsPlaying(false)
+  }
+
+  const goToEnd = () => {
+    if (game) {
+      setCurrentMoveIndex(game.moves.length - 1)
+      setIsPlaying(false)
+    }
+  }
+
+  const goBack = () => {
+    setCurrentMoveIndex((prev) => Math.max(-1, prev - 1))
+  }
+
+  const goForward = () => {
+    if (game) {
+      setCurrentMoveIndex((prev) => Math.min(game.moves.length - 1, prev + 1))
+    }
+  }
+  // Downloads the game's PGN as a file via a temporary object URL; does nothing when no PGN is present.
+  const handleDownloadPgn = () => {
+    if (!game?.pgn) return
+    const blob = new Blob([game.pgn], { type: 'text/plain' })
+    const url = URL.createObjectURL(blob)
+    const a = document.createElement('a')
+    a.href = url
+    a.download = `chessarena-${game.id}.pgn`
+    a.click()
+    URL.revokeObjectURL(url)
+  }
+
+  if (loading) {
+    return (
+      <div className="min-h-screen bg-[#09090b] flex items-center justify-center">
+        <div className="text-white/60">Loading game...</div>
+      </div>
+    )
+  }
+
+  if (error || !game) {
+    return (
+      <div className="min-h-screen bg-[#09090b] flex flex-col items-center justify-center gap-4">
+        <div className="text-white/60">{error || 'Game not found'}</div>
+        <button
+          onClick={() => navigate('/history')}
+          className="text-white/40 hover:text-white transition-colors"
+        >
+          Back to History
+        </button>
+      </div>
+    )
+  }
+
+  return (
+    <div className="min-h-screen bg-[#09090b]">
+      <header className="border-b border-white/10">
+        <div className="max-w-7xl mx-auto px-6 h-14 flex items-center justify-between">
+          <button
+            onClick={() => navigate('/history')}
+            className="flex items-center gap-2 text-white/60 hover:text-white transition-colors"
+          >
+            <ArrowLeft size={20} />
+            <span>History</span>
+          </button>
+          <div className="flex items-center gap-4">
+            <span
+              className={cn(
+                'px-2 py-0.5 rounded text-xs font-medium',
+                game.variant === 'unguided'
+                  ? 'bg-amber-500/20 text-amber-400'
+                  : 'bg-emerald-500/20 text-emerald-400'
+              )}
+            >
+              {game.variant}
+            </span>
+            <button
+              onClick={handleDownloadPgn}
+              className="flex items-center gap-1.5 px-3 py-1.5 text-white/60 hover:text-white text-sm transition-colors"
+            >
+              <Download size={16} />
+              PGN
+            </button>
+          </div>
+        </div>
+      </header>
+
+      <main className="max-w-7xl mx-auto px-6 py-8">
+        <div className="grid grid-cols-1 lg:grid-cols-3 gap-8">
+          {/* Board Section */}
+          <div className="lg:col-span-2 space-y-4">
+            {/* Players */}
+            <div className="flex items-center justify-between">
+              <div className="flex items-center gap-3">
+                {game.blackPlayer.provider ? (
+                  <>
+                    <div className="bg-white rounded-full p-1.5">
+                      <AiIcon ai={game.blackPlayer.provider as AiModelProvider} color="black" size={24} />
+                    </div>
+                    <div>
+                      <div className="text-white font-medium flex items-center gap-2">
+                        {game.blackPlayer.provider}
+                        {game.winner === 'black' && <Trophy size={14} className="text-yellow-500" />}
+                      </div>
+                      <div className="text-white/40 text-sm">{game.blackPlayer.model}</div>
+                    </div>
+                  </>
+                ) : (
+                  <div className="text-white">Human (Black)</div>
+                )}
+                {game.blackIllegalMoves > 0 && (
+                  <span className="flex items-center gap-1 text-amber-500 text-sm">
+                    <AlertTriangle size={14} />
+                    {game.blackIllegalMoves}
+                  </span>
+                )}
+              </div>
+            </div>
+
+            {/* Chess Board */}
+            <div className="aspect-square max-w-[600px] mx-auto">
+              <Chessground
+                config={{
+                  fen: currentFen,
+                  lastMove: lastMoveHighlight,
+                  check: currentMove?.check ? (currentMove.color === 'white' ? 'black' : 'white') : undefined,
+                  viewOnly: true,
+                  coordinates: true,
+                }}
+              />
+            </div>
+
+            <div className="flex items-center justify-between">
+              <div className="flex items-center gap-3">
+                {game.whitePlayer.provider ? (
+                  <>
+                    <div className="bg-white rounded-full p-1.5">
+                      <AiIcon ai={game.whitePlayer.provider as AiModelProvider} color="black" size={24} />
+                    </div>
+                    <div>
+                      <div className="text-white font-medium flex items-center gap-2">
+                        {game.whitePlayer.provider}
+                        {game.winner === 'white' && <Trophy size={14} className="text-yellow-500" />}
+                      </div>
+                      <div className="text-white/40 text-sm">{game.whitePlayer.model}</div>
+                    </div>
+                  </>
+                ) : (
+                  <div className="text-white">Human (White)</div>
+                )}
+                {game.whiteIllegalMoves > 0 && (
+                  <span className="flex items-center gap-1 text-amber-500 text-sm">
+                    <AlertTriangle size={14} />
+                    {game.whiteIllegalMoves}
+                  </span>
+                )}
+              </div>
+            </div>
+
+            {/* Controls */}
+            <div className="flex items-center justify-center gap-2 pt-4">
+              <button aria-label="Go to start"
+                onClick={goToStart}
+                className="p-2 text-white/60 hover:text-white hover:bg-white/10 rounded-lg transition-colors"
+              >
+                <ChevronsLeft size={24} />
+              </button>
+              <button aria-label="Previous move"
+                onClick={goBack}
+                className="p-2 text-white/60 hover:text-white hover:bg-white/10 rounded-lg transition-colors"
+              >
+                <ChevronLeft size={24} />
+              </button>
+              <button aria-label={isPlaying ? 'Pause' : 'Play'}
+                onClick={() => setIsPlaying(!isPlaying)}
+                className="p-3 bg-white/10 hover:bg-white/20 text-white rounded-lg transition-colors"
+              >
+                {isPlaying ? <Pause size={24} /> : <Play size={24} />}
+              </button>
+              <button aria-label="Next move"
+                onClick={goForward}
+                className="p-2 text-white/60 hover:text-white hover:bg-white/10 rounded-lg transition-colors"
+              >
+                <ChevronRight size={24} />
+              </button>
+              <button aria-label="Go to end"
+                onClick={goToEnd}
+                className="p-2 text-white/60 hover:text-white hover:bg-white/10 rounded-lg transition-colors"
+              >
+                <ChevronsRight size={24} />
+              </button>
+            </div>
+
+            <div className="text-center text-white/40 text-sm">
+              Move {currentMoveIndex + 1} of {game.moves.length}
+            </div>
+          </div>
+
+          {/* Side Panel */}
+          <div className="space-y-4">
+            {/* Move List */}
+            <div className="bg-white/[0.02] border border-white/10 rounded-lg overflow-hidden">
+              <div className="px-4 py-3 border-b border-white/10">
+                <h3 className="text-white font-medium">Moves</h3>
+              </div>
+              <div className="max-h-[300px] overflow-y-auto p-2">
+                <div className="grid grid-cols-2 gap-1">
+                  {game.moves.map((move, idx) => (
+                    <button
+                      key={idx}
+                      onClick={() => setCurrentMoveIndex(idx)}
+                      className={cn(
+                        'px-2 py-1 text-sm rounded text-left transition-colors',
+                        currentMoveIndex === idx
+                          ? 'bg-white/20 text-white'
+                          : 'text-white/60 hover:bg-white/10 hover:text-white'
+                      )}
+                    >
+                      {move.color === 'white' && (
+                        <span className="text-white/40 mr-1">{Math.floor(idx / 2) + 1}.</span>
+                      )}
+                      {move.lastMove.join('-')}
+                      {move.check && '+'}
+                    </button>
+                  ))}
+                </div>
+              </div>
+            </div>
+
+            {/* AI Reasoning */}
+            <div className="bg-white/[0.02] border border-white/10 rounded-lg overflow-hidden">
+              <button
+                onClick={() => setShowMessages(!showMessages)}
+                className="w-full px-4 py-3 border-b border-white/10 flex items-center justify-between"
+              >
+                <h3 className="text-white font-medium flex items-center gap-2">
+                  <MessageSquare size={16} />
+                  AI Reasoning
+                </h3>
+                <ChevronRight
+                  size={16}
+                  className={cn('text-white/40 transition-transform', showMessages && 'rotate-90')}
+                />
+              </button>
+              {showMessages && (
+                <div className="p-4 max-h-[400px] overflow-y-auto">
+                  {currentThought ? (
+                    <div className="space-y-2">
+                      <div className="flex items-center gap-2 text-white/40 text-xs">
+                        <span className={currentThought.role === 'white' ? 'text-white' : 'text-white/60'}>
+                          {currentThought.sender}
+                        </span>
+                        {currentThought.moveSan && (
+                          <span className="bg-white/10 px-1.5 py-0.5 rounded">
+                            {currentThought.moveSan}
+                          </span>
+                        )}
+                        {currentThought.isIllegalMove && (
+                          <span className="text-red-500 text-xs">Illegal</span>
+                        )}
+                      </div>
+                      <p className="text-white/70 text-sm leading-relaxed">{currentThought.message}</p>
+                    </div>
+                  ) : (
+                    <p className="text-white/40 text-sm">Select a move to see AI reasoning</p>
+                  )}
+                </div>
+              )}
+            </div>
+
+            {/* Game Result */}
+            {game.endGameReason && (
+              <div className="bg-white/[0.02] border border-white/10 rounded-lg p-4">
+                <h3 className="text-white font-medium mb-2">Result</h3>
+                <p className="text-white/60 text-sm">{game.endGameReason}</p>
+                {game.winner && (
+                  <p className="text-white/40 text-sm mt-1">
+                    Winner: <span className="text-white">{game.winner}</span>
+                  </p>
+                )}
+              </div>
+            )}
+
+            {/* Scoreboard */}
+            {game.scoreboard && (
+              <div className="bg-white/[0.02] border border-white/10 rounded-lg p-4">
+                <h3 className="text-white font-medium mb-3">Score</h3>
+                <div className="grid grid-cols-2 gap-4 text-sm">
+                  <div>
+                    <div className="text-white/40 mb-1">White</div>
+                    <div className="text-white">{game.scoreboard.white.finalCentipawnScore} cp</div>
+                    <div className="text-white/40 text-xs mt-1">
+                      {game.scoreboard.white.blunders} blunders
+                    </div>
+                  </div>
+                  <div>
+                    <div className="text-white/40 mb-1">Black</div>
+                    <div className="text-white">{game.scoreboard.black.finalCentipawnScore} cp</div>
+                    <div className="text-white/40 text-xs mt-1">
+                      {game.scoreboard.black.blunders} blunders
+                    </div>
+                  </div>
+                </div>
+              </div>
+            )}
+          </div>
+        </div>
+      </main>
+    </div>
+  )
+}
diff --git a/app/src/pages/landing-page.tsx b/app/src/pages/landing-page.tsx
index 6544e39..cf7c0de 100644
--- a/app/src/pages/landing-page.tsx
+++ b/app/src/pages/landing-page.tsx
@@ -1,61 +1,212 @@
-import { Trophy } from 'lucide-react'
+import { useState, useMemo } from 'react'
 import { useNavigate } from 'react-router'
 import { usePageTitle } from '@/lib/use-page-title'
-import { AuthContainer } from '@/components/auth/auth-container'
-import { CreateGameButton } from '@/components/chess/create-game/create-game-button'
+import { Layout, SidebarPanel } from '@/components/layout'
+import { BenchBarChart, providerColors } from '@/components/bench/bench-bar-charts'
+import { mockBenchLeaderboard } from '@/components/bench/bench-mock'
+import { Trophy } from 'lucide-react'
 import { Leaderboard } from '@/components/leaderboard/leaderboard'
+import { useStreamGroup } from '@motiadev/stream-client-react'
+import type { LegalMoveBenchmarkSummary } from '@chessarena/types/legal-move-benchmark'
+import type { PuzzleBenchmarkSummary } from '@chessarena/types/puzzle-benchmark'
 import { TopBar } from '@/components/ui/top-bar'
-import { PageGrid, PageGridRightColumn } from '@/components/page-grid'
-import { BaseButton } from '@/components/ui/base-button'
 import { ChessArenaLogo } from '@/components/ui/chess-arena-logo'
+import { CreateGameButton } from '@/components/chess/create-game/create-game-button'
+import { BaseButton } from '@/components/ui/base-button'
+import { AuthContainer } from '@/components/auth/auth-container'
 
 export const LandingPage = () => {
   const navigate = useNavigate()
+  usePageTitle('Powered by Motia')
+
+  const [activeTab, setActiveTab] = useState<'benchmarks' | 'leaderboard' | 'methodology'>('benchmarks')
+
   const goToAbout = (e: React.MouseEvent<HTMLAnchorElement>) => {
     e.preventDefault()
     navigate('/about')
   }
 
-  usePageTitle('Powered by Motia')
+  // Stream data for benchmarks
+  const { data: legalSummaries } = useStreamGroup<LegalMoveBenchmarkSummary>({
+    streamName: 'legalMoveBenchmarkSummary',
+    groupId: 'models',
+  })
+  const { data: puzzleSummaries } = useStreamGroup<PuzzleBenchmarkSummary>({
+    streamName: 'puzzleBenchmarkSummary',
+    groupId: 'models',
+  })
 
-  return (
-    <PageGrid>
-      <div className="hidden md:flex md:flex-col p-4 overflow-y-auto">
-        <Leaderboard className="max-h-[min(calc(100dvh-32px),1280px)] my-auto mx-auto" />
-      </div>
-      <PageGridRightColumn className="backdrop-blur-none md:backdrop-blur-lg">
-        <TopBar />
-        <div className="flex flex-col justify-center grow gap-2 text-center">
-          <ChessArenaLogo />
-          <p className="font-medium text-center text-muted-foreground">Welcome to ChessArena.ai powered by Motia!</p>
-          <p className="font-medium text-center text-muted-foreground">
-            ChessArena.ai was created to show how leading models compete against each other in chess games.{' '}
-            <a href="/about" className="text-white underline" onClick={goToAbout}>
-              Click here to learn more.
-            </a>
-          </p>
+  const benchRows = useMemo(() => {
+    const legalById = new Map(legalSummaries.map((s) => [`${s.provider}:${s.model}`, s]))
+    const puzzleById = new Map(puzzleSummaries.map((s) => [`${s.provider}:${s.model}`, s]))
+
+    return mockBenchLeaderboard.map((row) => {
+      const legal = legalById.get(row.id)
+      const puzzle = puzzleById.get(row.id)
+
+      const legalMoveScore = legal?.averageScore ?? 0
+      const puzzleScore = puzzle?.overallAccuracy ?? 0
+      const motiaChessIndex = Number(((legalMoveScore + puzzleScore) / 2).toFixed(1))
+      const lastUpdatedAt = Math.max(legal?.lastRunAt ?? 0, puzzle?.lastRunAt ?? 0, row.lastUpdatedAt)
+
+      return {
+        ...row,
+        legalMoveScore,
+        puzzleScore,
+        motiaChessIndex,
+        lastUpdatedAt,
+      }
+    })
+  }, [legalSummaries, puzzleSummaries])
+
+  // Left panel content based on active tab
+  const leftPanelContent = (
+    <SidebarPanel activeTab={activeTab} onTabChange={setActiveTab}>
+      {activeTab === 'benchmarks' && (
+        <div className="p-5 space-y-6">
+          {/* Header */}
+          <div className="pb-4 border-b border-white/5">
+            <h2 className="text-lg font-bold text-white mb-1">AI Benchmark Results</h2>
+            <p className="text-xs text-white/40">Real-time performance metrics across all models</p>
+          </div>
+
+          {/* Provider Legend */}
+          <div className="flex flex-wrap items-center gap-3">
+            <span className="text-[10px] text-white/30 font-semibold uppercase tracking-widest">Providers</span>
+            <div className="flex flex-wrap gap-3">
+              {Object.entries(providerColors).map(([provider, color]) => (
+                <div key={provider} className="flex items-center gap-1.5">
+                  <div className="w-2 h-2 rounded-full" style={{ backgroundColor: color }} />
+                  <span className="text-[11px] text-white/50 capitalize font-medium">{provider}</span>
+                </div>
+              ))}
+            </div>
+          </div>
+
+          {/* Motia Chess Index */}
+          <BenchBarChart
+            title="Motia Chess Index"
+            description="Combined score of legal moves + puzzle solving"
+            rows={benchRows}
+            metric="motiaChessIndex"
+            topN={8}
+            hiddenModels={new Set()}
+          />
+
+          {/* Legal Move Score */}
+          <BenchBarChart
+            title="Legal Move Generation"
+            description="F1 score for correctly enumerating legal moves"
+            rows={benchRows}
+            metric="legalMoveScore"
+            unit="%"
+            topN={8}
+            hiddenModels={new Set()}
+          />
+
+          {/* Puzzle Score */}
+          <BenchBarChart
+            title="Puzzle Solving"
+            description="Accuracy on mate-in-1 and tactical puzzles"
+            rows={benchRows}
+            metric="puzzleScore"
+            unit="%"
+            topN={8}
+            hiddenModels={new Set()}
+          />
         </div>
-        <div className="flex flex-col gap-4 items-center justify-center w-full">
-          <AuthContainer />
-          <CreateGameButton onClick={() => navigate('/new')}>Create Game</CreateGameButton>
-          <div className="flex flex-row flex-wrap gap-2 items-center justify-center w-full">
-            <BaseButton className="flex-1" onClick={() => navigate('/live-matches')}>
-              View Live Matches
-            </BaseButton>
-            <BaseButton className="min-w-[64px] shrink-0 md:flex-1" onClick={() => navigate('/leaderboard')}>
-              <Trophy className="shrink-0" /> <span className="hidden sm:block">Leaderboard</span>
-            </BaseButton>
+      )}
+
+      {activeTab === 'leaderboard' && (
+        <Leaderboard />
+      )}
+
+      {activeTab === 'methodology' && (
+        <div className="p-6 space-y-6">
+          <div>
+            <h3 className="text-lg font-semibold text-white mb-3">Benchmark Methodology</h3>
+            <p className="text-sm text-white/60 leading-relaxed">
+              Our benchmarks evaluate LLM chess capabilities across multiple dimensions:
+            </p>
+          </div>
+
+          <div className="space-y-4">
+            <div className="p-4 rounded-xl bg-white/5 border border-white/10">
+              <h4 className="text-sm font-semibold text-emerald-400 mb-2">Legal Move Generation</h4>
+              <p className="text-xs text-white/50 leading-relaxed">
+                Models are given a FEN position and asked to list all legal moves.
+                We measure F1 score comparing predicted vs actual legal moves.
+              </p>
+            </div>
+
+            <div className="p-4 rounded-xl bg-white/5 border border-white/10">
+              <h4 className="text-sm font-semibold text-sky-400 mb-2">Puzzle Solving</h4>
+              <p className="text-xs text-white/50 leading-relaxed">
+                Models solve mate-in-1 and tactical puzzles from Lichess.
+                We track accuracy across 100+ puzzles per model.
+              </p>
+            </div>
+
+            <div className="p-4 rounded-xl bg-white/5 border border-white/10">
+              <h4 className="text-sm font-semibold text-amber-400 mb-2">Move Quality (ACPL)</h4>
+              <p className="text-xs text-white/50 leading-relaxed">
+                Average Centipawn Loss measures how optimal each move is
+                compared to Stockfish 16 analysis. Lower is better.
+              </p>
+            </div>
           </div>
 
-          <p className="font-medium text-sm text-center text-muted-foreground">
-            This project is open-source, click{' '}
-            <a href="/about" className="text-white underline" onClick={goToAbout}>
-              here
-            </a>{' '}
-            to read more about the project.
-          </p>
+          <div className="pt-4 border-t border-white/10">
+            <button
+              onClick={() => navigate('/methodology')}
+              className="w-full py-3 text-sm font-medium text-white/70 hover:text-white bg-white/5 hover:bg-white/10 rounded-xl transition-colors"
+            >
+              View Full Methodology →
+            </button>
+          </div>
+        </div>
+      )}
+    </SidebarPanel>
+  )
+
+  return (
+    <Layout leftPanel={leftPanelContent}>
+      {/* Top Bar */}
+      <TopBar />
+
+      {/* Main content: logo, welcome copy and link to the About page */}
+      <div className="flex flex-col justify-center grow gap-2 text-center">
+        <ChessArenaLogo />
+        <p className="font-medium text-center text-muted-foreground">Welcome to ChessArena.ai powered by Motia!</p>
+        <p className="font-medium text-center text-muted-foreground">
+          ChessArena.ai was created to show how leading models compete against each other in chess games.{' '}
+          <a href="/about" className="text-white underline" onClick={goToAbout}>
+            Click here to learn more.
+          </a>
+        </p>
+      </div>
+
+      {/* Actions */}
+      <div className="flex flex-col gap-4 items-center justify-center w-full">
+        <AuthContainer />
+        <CreateGameButton onClick={() => navigate('/new')}>Create Game</CreateGameButton>
+        <div className="flex flex-row flex-wrap gap-2 items-center justify-center w-full">
+          <BaseButton className="flex-1" onClick={() => navigate('/live-matches')}>
+            View Live Matches
+          </BaseButton>
+          <BaseButton className="min-w-[64px] shrink-0 md:flex-1" onClick={() => navigate('/leaderboard')}>
+            <Trophy className="shrink-0" /> <span className="hidden sm:block">Leaderboard</span>
+          </BaseButton>
         </div>
-      </PageGridRightColumn>
-    </PageGrid>
+
+        <p className="font-medium text-sm text-center text-muted-foreground">
+          This project is open-source, click{' '}
+          <a href="/about" className="text-white underline" onClick={goToAbout}>
+            here
+          </a>{' '}
+          to read more about the project.
+        </p>
+      </div>
+    </Layout>
   )
 }
diff --git a/app/src/pages/methodology-page.tsx b/app/src/pages/methodology-page.tsx
new file mode 100644
index 0000000..a58011f
--- /dev/null
+++ b/app/src/pages/methodology-page.tsx
@@ -0,0 +1,241 @@
+import React from 'react'
+import { usePageTitle } from '@/lib/use-page-title'
+import { Layout } from '@/components/layout'
+import { ShieldCheck, Brain, Activity, ArrowRight, FileText, Cpu, MessageSquare, CheckCircle, BarChart3 } from 'lucide-react'
+
+type DetailSectionProps = { title: string; icon: React.ReactNode; children: React.ReactNode }
+
+// Titled page section: an icon badge and heading rendered above a bordered
+// panel that wraps `children`.
+const DetailSection = ({ title, icon, children }: DetailSectionProps) => (
+  <div className="mb-12">
+    <div className="flex items-center gap-3 mb-4">
+      <div className="p-2 bg-white/5 rounded-lg text-white/80 border border-white/10">{icon}</div>
+      <h2 className="text-2xl font-bold text-white">{title}</h2>
+    </div>
+    <div className="bg-white/[0.02] border border-white/5 rounded-2xl p-6 md:p-8">{children}</div>
+  </div>
+)
+
+// One node of the benchmark flow diagram: an icon tile with a label and short
+// description beneath it, followed by a connector arrow unless `isLast` is set.
+type FlowStepProps = { icon: React.ReactNode; label: string; description: string; isLast?: boolean }
+
+const FlowStep = ({ icon, label, description, isLast = false }: FlowStepProps) => (
+  <div className="flex items-center gap-2">
+    <div className="flex flex-col items-center">
+      <div className="p-3 rounded-xl bg-white/5 border border-white/10 text-white/80">{icon}</div>
+      <div className="mt-2 text-center">
+        <div className="text-xs font-semibold text-white">{label}</div>
+        <div className="text-[10px] text-white/40 max-w-[80px]">{description}</div>
+      </div>
+    </div>
+    {/* The connector is omitted after the final step */}
+    {isLast ? null : <ArrowRight className="text-white/20 mx-1 shrink-0" size={16} />}
+  </div>
+)
+
+// Ordered stages of a benchmark run; BenchmarkFlowDiagram renders one FlowStep
+// per entry, left to right.
+type BenchmarkFlowStage = { icon: React.ReactNode; label: string; description: string }
+
+const benchmarkFlowStages: BenchmarkFlowStage[] = [
+  { icon: <FileText size={20} />, label: 'Prompt', description: 'FEN + context sent to model' },
+  { icon: <Cpu size={20} />, label: 'Model', description: 'LLM processes the request' },
+  { icon: <MessageSquare size={20} />, label: 'Response', description: 'JSON output parsed' },
+  { icon: <CheckCircle size={20} />, label: 'Validation', description: 'Checked against rules' },
+  { icon: <BarChart3 size={20} />, label: 'Score', description: 'Metrics calculated' },
+]
+
+// Card titled "How Benchmarks Work": shows the stages above as a wrapping row
+// of FlowStep nodes; only the final node is rendered without a trailing arrow.
+const BenchmarkFlowDiagram = () => {
+  const lastIndex = benchmarkFlowStages.length - 1
+  return (
+    <div className="bg-white/[0.02] border border-white/5 rounded-2xl p-6 mb-12">
+      <h3 className="text-lg font-semibold text-white mb-6 text-center">How Benchmarks Work</h3>
+      <div className="flex flex-wrap items-start justify-center gap-2 md:gap-4">
+        {benchmarkFlowStages.map((stage, index) => (
+          <FlowStep
+            key={stage.label}
+            icon={stage.icon}
+            label={stage.label}
+            description={stage.description}
+            isLast={index === lastIndex}
+          />
+        ))}
+      </div>
+    </div>
+  )
+}
+
+// Reference benchmarks for the comparison table, as [name, measures, limitation] tuples.
+const referenceBenchmarks: [string, string, string][] = [
+  ['MMLU', 'General knowledge (57 subjects)', 'Multiple choice format, memorization'],
+  ['HumanEval', 'Code generation from docstrings', 'Python-focused, short functions'],
+  ['ARC', 'Science reasoning (grade school)', 'Limited to science domain'],
+]
+
+// "Why Chess Benchmarks?" card: intro copy, a table contrasting the reference
+// benchmarks with the highlighted Motia Chess Index row, and a key-insight callout.
+const BenchmarkComparisonSection = () => (
+  <div className="bg-white/[0.02] border border-white/5 rounded-2xl p-6 md:p-8 mb-12">
+    <h3 className="text-lg font-semibold text-white mb-4">Why Chess Benchmarks?</h3>
+    <p className="text-sm text-white/60 leading-relaxed mb-6">
+      While benchmarks like MMLU measure general knowledge and HumanEval measures coding ability,
+      chess provides a unique lens into <strong className="text-white">spatial reasoning</strong>,
+      <strong className="text-white"> rule adherence</strong>, and <strong className="text-white">strategic planning</strong>.
+    </p>
+
+    <div className="overflow-x-auto">
+      <table className="w-full text-sm">
+        <thead>
+          <tr className="border-b border-white/10">
+            {['Benchmark', 'Measures', 'Limitation'].map((heading) => (
+              <th key={heading} className="text-left py-3 px-4 text-white/60 font-medium">{heading}</th>
+            ))}
+          </tr>
+        </thead>
+        <tbody className="text-white/50">
+          {referenceBenchmarks.map(([benchmark, measures, limitation]) => (
+            <tr key={benchmark} className="border-b border-white/5">
+              <td className="py-3 px-4 text-white font-medium">{benchmark}</td>
+              <td className="py-3 px-4">{measures}</td>
+              <td className="py-3 px-4">{limitation}</td>
+            </tr>
+          ))}
+          <tr className="border-b border-white/5 bg-emerald-500/5">
+            <td className="py-3 px-4 text-emerald-400 font-semibold">Motia Chess Index</td>
+            <td className="py-3 px-4 text-white/70">Rule adherence, tactics, strategy</td>
+            <td className="py-3 px-4 text-white/70">Chess-specific domain</td>
+          </tr>
+        </tbody>
+      </table>
+    </div>
+
+    <div className="mt-6 p-4 bg-emerald-500/5 border border-emerald-500/20 rounded-xl">
+      <p className="text-sm text-emerald-400/80">
+        <strong>Key insight:</strong> Chess requires strict rule following with zero tolerance for errors —
+        a single illegal move loses the game. This makes it an excellent test for LLM reliability and precision.
+      </p>
+    </div>
+  </div>
+)
+
+// Full methodology page: intro header, benchmark flow diagram, comparison table,
+// one DetailSection per metric (legal moves, puzzles, ACPL) and a reproducibility note.
+export const MethodologyPage = () => {
+  usePageTitle('Methodology')
+
+  return (
+    <Layout>
+      <div className="max-w-4xl mx-auto animate-in fade-in duration-700 slide-in-from-bottom-4">
+        <div className="mb-12 text-center md:text-left">
+          <h1 className="text-4xl font-bold text-white mb-4">Benchmark Methodology</h1>
+          <p className="text-lg text-white/50">
+            Technical details on how the Motia Chess Index and component scores are calculated.
+          </p>
+        </div>
+
+        {/* Visual flow diagram */}
+        <BenchmarkFlowDiagram />
+
+        {/* Comparison with other benchmarks */}
+        <BenchmarkComparisonSection />
+
+        <DetailSection
+          title="1. Legal Move Benchmark"
+          icon={<ShieldCheck />}
+        >
+          <div className="space-y-4 text-white/70 leading-relaxed">
+            <p>
+              This benchmark measures a model's ability to strictly adhere to the rules of chess.
+              We present the model with a series of game positions (FEN strings) and ask it to list <strong>every single legal move</strong> available.
+            </p>
+            <div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-6">
+              <div className="bg-black/20 p-4 rounded-xl border border-white/5">
+                <h3 className="text-white font-semibold mb-2 text-sm">Metrics</h3>
+                <ul className="list-disc list-inside space-y-1 text-sm text-white/50">
+                  <li><strong>Precision:</strong> (Correct Moves / Total Generated)</li>
+                  <li><strong>Recall:</strong> (Correct Moves / Actual Legal Moves)</li>
+                  <li><strong>Score:</strong> F1-Score (Harmonic mean of P & R)</li>
+                </ul>
+              </div>
+              <div className="bg-black/20 p-4 rounded-xl border border-white/5">
+                <h3 className="text-white font-semibold mb-2 text-sm">Prompt Strategy</h3>
+                <p className="text-sm text-white/50">
+                  Models are prompted with the PGN context and current FEN. They must output a JSON array of SAN strings.
+                  We handle various JSON formatting errors gracefully to focus on chess capability.
+                </p>
+              </div>
+            </div>
+          </div>
+        </DetailSection>
+
+        <DetailSection
+          title="2. Puzzle Benchmark"
+          icon={<Brain />}
+        >
+          <div className="space-y-4 text-white/70 leading-relaxed">
+            <p>
+              Measures tactical sharpness. We source puzzles from the Lichess database, specifically filtering for
+              <strong> Mate-in-1</strong> and <strong>short tactical sequences</strong>.
+            </p>
+            <p>
+              The model is given the position and asked for the "best move".
+            </p>
+            <div className="bg-black/20 p-4 rounded-xl border border-white/5 mt-4">
+              <h3 className="text-white font-semibold mb-2 text-sm">Scoring</h3>
+              <p className="text-sm text-white/50">
+                <strong>Accuracy %:</strong> The model gets 1 point if its generated move matches the solution move exactly.
+                0 points otherwise. No partial credit.
+              </p>
+            </div>
+          </div>
+        </DetailSection>
+
+        <DetailSection
+          title="3. Average Centipawn Loss (ACPL)"
+          icon={<Activity />}
+        >
+          <div className="space-y-4 text-white/70 leading-relaxed">
+            <p>
+              Evaluates the strategic quality of moves played in the Arena.
+              We use <strong>Stockfish 16</strong> (depth 18+) to evaluate every move played by the AI.
+            </p>
+            <p>
+              <em>Centipawn Loss (CPL)</em> is the difference in evaluation between the engine's best move and the move actually played.
+            </p>
+            <div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-6">
+              <div className="bg-black/20 p-4 rounded-xl border border-white/5">
+                <h3 className="text-white font-semibold mb-2 text-sm">Formula</h3>
+                <code className="block text-xs font-mono text-emerald-400 bg-black/40 p-2 rounded mb-2">
+                  ACPL = Σ (BestEval - PlayedEval) / NumMoves
+                </code>
+                <p className="text-xs text-white/50">
+                  Lower is better. A grandmaster might have 15-20 ACPL. A beginner &gt;100.
+                </p>
+              </div>
+              <div className="bg-black/20 p-4 rounded-xl border border-white/5">
+                <h3 className="text-white font-semibold mb-2 text-sm">Visualization</h3>
+                <p className="text-sm text-white/50">
+                  On our charts, we invert this score (100 - ACPL, clamped at 0) so that
+                  "higher bars" always mean "better performance", maintaining consistency with other metrics.
+                </p>
+              </div>
+            </div>
+          </div>
+        </DetailSection>
+
+        <div className="border-t border-white/10 pt-8 mt-12 text-center">
+          <h2 className="text-white font-semibold mb-2">Reproducibility</h2>
+          <p className="text-white/50 text-sm max-w-2xl mx-auto">
+            All benchmarks are open source. You can run them yourself using the{' '}
+            <code className="bg-white/10 px-1 py-0.5 rounded text-white/80 mx-1">motia</code> CLI and the provided
+            API scripts in the <a href="https://github.com/MotiaDev/chessarena-ai" className="text-emerald-400 hover:underline">GitHub repository</a>.
+          </p>
+        </div>
+      </div>
+    </Layout>
+  )
+}
diff --git a/app/src/pages/play-ai-page.tsx b/app/src/pages/play-ai-page.tsx
new file mode 100644
index 0000000..d942374
--- /dev/null
+++ b/app/src/pages/play-ai-page.tsx
@@ -0,0 +1,150 @@
+import { useState } from 'react'
+import { useNavigate } from 'react-router'
+import { Page } from '@/components/page'
+import { usePageTitle } from '@/lib/use-page-title'
+import { useAuth } from '@/lib/auth/use-auth'
+import { apiClient } from '@/lib/auth/api-client'
+import { cn } from '@/lib/utils'
+
+type ColorChoice = 'white' | 'black' | 'random'
+
+/**
+ * "Play vs AI" page: lets the visitor pick a side (or random) and asks the API
+ * to create a game against an AI opponent, then routes to that game.
+ * Unauthenticated visitors are sent to /login, with /play-ai stored under the
+ * `chessarena-redirect` localStorage key first.
+ */
+export const PlayAIPage = () => {
+  const navigate = useNavigate()
+  const { isAuthenticated } = useAuth()
+  const [isLoading, setIsLoading] = useState(false)
+  const [selectedColor, setSelectedColor] = useState<ColorChoice>('random')
+  const [error, setError] = useState<string | null>(null)
+
+  usePageTitle('Play vs AI')
+
+  // Requests a new game for the chosen color and navigates to it on success.
+  const handlePlay = async () => {
+    if (!isAuthenticated) {
+      localStorage.setItem('chessarena-redirect', '/play-ai')
+      navigate('/login')
+      return
+    }
+
+    setIsLoading(true)
+    setError(null)
+
+    type PlayVsAiResponse = { game: { id: string }; opponent: { provider: string; model: string } }
+
+    try {
+      const created = await apiClient.post<PlayVsAiResponse>('/chess/play-vs-ai', {
+        playerColor: selectedColor,
+      })
+      navigate(`/game/${created.game.id}`)
+    } catch (cause) {
+      // The loading flag is only cleared on failure; on success the handler navigates away.
+      setError(cause instanceof Error ? cause.message : 'Something went wrong')
+      setIsLoading(false)
+    }
+  }
+
+  const colorOptions: { value: ColorChoice; label: string; icon: string }[] = [
+    { value: 'white', label: 'White', icon: '♔' },
+    { value: 'random', label: 'Random', icon: '🎲' },
+    { value: 'black', label: 'Black', icon: '♚' },
+  ]
+
+  return (
+    <Page>
+      <div className="flex flex-col items-center justify-center min-h-[80vh] px-4">
+        <div className="max-w-md w-full space-y-8">
+          {/* Title and tagline */}
+          <div className="text-center">
+            <h1 className="text-4xl font-bold text-white mb-2">Play vs AI</h1>
+            <p className="text-white/60">
+              Challenge a randomly selected AI opponent. Cheaper models appear more often!
+            </p>
+          </div>
+
+          {/* Side picker: the highlighted tile is the current selection */}
+          <div className="space-y-4">
+            <label className="text-white/80 text-sm font-medium">Choose your color</label>
+            <div className="grid grid-cols-3 gap-3">
+              {colorOptions.map(({ value, label, icon }) => {
+                const tileState =
+                  selectedColor === value
+                    ? 'border-indigo-500 bg-indigo-500/20'
+                    : 'border-white/20 bg-white/5 hover:border-white/40'
+
+                return (
+                  <button
+                    key={value}
+                    onClick={() => setSelectedColor(value)}
+                    className={cn(
+                      'flex flex-col items-center justify-center p-4 rounded-lg border-2 transition-all',
+                      tileState,
+                    )}
+                  >
+                    <span className="text-3xl mb-1">{icon}</span>
+                    <span className="text-white/80 text-sm">{label}</span>
+                  </button>
+                )
+              })}
+            </div>
+          </div>
+
+          {/* Submit: disabled and showing a spinner while the request is in flight */}
+          <button
+            onClick={handlePlay}
+            disabled={isLoading}
+            className={cn(
+              'w-full py-4 px-6 rounded-lg font-semibold text-lg transition-all',
+              'bg-gradient-to-r from-indigo-600 to-purple-600',
+              'hover:from-indigo-500 hover:to-purple-500',
+              'disabled:opacity-50 disabled:cursor-not-allowed',
+              'text-white shadow-lg'
+            )}
+          >
+            {!isLoading ? (
+              'Find Opponent'
+            ) : (
+              <span className="flex items-center justify-center gap-2">
+                <svg className="animate-spin h-5 w-5" viewBox="0 0 24 24">
+                  <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" fill="none" />
+                  <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
+                </svg>
+                Finding opponent...
+              </span>
+            )}
+          </button>
+
+          {/* Failure message from the last attempt, if any */}
+          {error && (
+            <div className="p-4 rounded-lg bg-red-500/20 border border-red-500/50 text-red-200 text-sm">
+              {error}
+            </div>
+          )}
+
+          {/* Static explainer */}
+          <div className="p-4 rounded-lg bg-white/5 border border-white/10">
+            <h3 className="text-white font-medium mb-2">How it works</h3>
+            <ul className="text-white/60 text-sm space-y-1">
+              <li>• You'll be matched against a random AI model</li>
+              <li>• Cheaper/faster models have higher chance of selection</li>
+              <li>• Premium models (GPT-5, Claude Opus) appear less often</li>
+              <li>• AI will provide legal moves to help you play</li>
+            </ul>
+          </div>
+
+          {/* Return to the landing route */}
+          <button
+            onClick={() => navigate('/')}
+            className="w-full py-3 px-6 rounded-lg font-medium text-white/60 hover:text-white transition-colors"
+          >
+            ← Back to Home
+          </button>
+        </div>
+      </div>
+    </Page>
+  )
+}
diff --git a/app/vite.config.ts b/app/vite.config.ts
index 1c55eb9..9fd4d88 100644
--- a/app/vite.config.ts
+++ b/app/vite.config.ts
@@ -33,5 +33,8 @@ export default defineConfig({
       },
     },
     allowedHosts: true,
+    fs: {
+      allow: ['..'],
+    },
   },
 })
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index d824935..6cf5c6a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -6,6 +6,9 @@ settings:
 
 catalogs:
   default:
+    typescript:
+      specifier: ~5.8.3
+      version: 5.8.3
     zod:
       specifier: ^3.25.76
       version: 3.25.76
@@ -58,7 +61,7 @@ importers:
         version: 9.0.2
       motia:
         specifier: 0.8.2-beta.139
-        version: 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0))
+        version: 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0))
       mustache:
         specifier: ^4.2.0
         version: 4.2.0
@@ -162,6 +165,9 @@ importers:
       react-syntax-highlighter:
         specifier: ^15.6.1
         version: 15.6.1(react@19.1.0)
+      recharts:
+        specifier: ^3.6.0
+        version: 3.6.0(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react-is@19.2.3)(react@19.1.0)(redux@5.0.1)
       sonner:
         specifier: ^2.0.5
         version: 2.0.5(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
@@ -1238,6 +1244,17 @@ packages:
   '@radix-ui/rect@1.1.1':
     resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==}
 
+  '@reduxjs/toolkit@2.11.2':
+    resolution: {integrity: sha512-Kd6kAHTA6/nUpp8mySPqj3en3dm0tdMIgbttnQ1xFMVpufoj+ADi8pXLBsd4xzTRHQa7t/Jv8W5UnCuW4kuWMQ==}
+    peerDependencies:
+      react: ^16.9.0 || ^17.0.0 || ^18 || ^19
+      react-redux: ^7.2.1 || ^8.1.3 || ^9.0.0
+    peerDependenciesMeta:
+      react:
+        optional: true
+      react-redux:
+        optional: true
+
   '@rolldown/pluginutils@1.0.0-beta.9':
     resolution: {integrity: sha512-e9MeMtVWo186sgvFFJOPGy7/d2j2mZhLJIdVW0C/xDluuOvymEATqz6zKsP0ZmXGzQtqlyjz5sC1sYQUoJG98w==}
 
@@ -1344,6 +1361,9 @@ packages:
   '@standard-schema/spec@1.0.0':
     resolution: {integrity: sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==}
 
+  '@standard-schema/utils@0.3.0':
+    resolution: {integrity: sha512-e7Mew686owMaPJVNNLs55PUvgz371nKgwsc4vxE49zsODpJEnxgxRo2y/OKrqueavXgZNMDVj3DdHFlaSAeU8g==}
+
   '@supabase/auth-js@2.71.1':
     resolution: {integrity: sha512-mMIQHBRc+SKpZFRB2qtupuzulaUhFYupNyxqDj5Jp/LyPvcWvjaJzZzObv6URtL/O6lPxkanASnotGtNpS3H2Q==}
 
@@ -1568,18 +1588,39 @@ packages:
   '@types/babel__traverse@7.20.7':
     resolution: {integrity: sha512-dkO5fhS7+/oos4ciWxyEyjWe48zmG6wbCheo/G2ZnHx4fs3EU6YC6UM8rk56gAjNJ9P3MTH2jo5jb92/K6wbng==}
 
+  '@types/d3-array@3.2.2':
+    resolution: {integrity: sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==}
+
   '@types/d3-color@3.1.3':
     resolution: {integrity: sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==}
 
   '@types/d3-drag@3.0.7':
     resolution: {integrity: sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==}
 
+  '@types/d3-ease@3.0.2':
+    resolution: {integrity: sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==}
+
   '@types/d3-interpolate@3.0.4':
     resolution: {integrity: sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==}
 
+  '@types/d3-path@3.1.1':
+    resolution: {integrity: sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==}
+
+  '@types/d3-scale@4.0.9':
+    resolution: {integrity: sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==}
+
   '@types/d3-selection@3.0.11':
     resolution: {integrity: sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==}
 
+  '@types/d3-shape@3.1.7':
+    resolution: {integrity: sha512-VLvUQ33C+3J+8p+Daf+nYSOsjB4GXp19/S/aGo60m9h1v6XaxjiT82lKVWJCfzhtuZ3yD7i/TPeC/fuKLLOSmg==}
+
+  '@types/d3-time@3.0.4':
+    resolution: {integrity: sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==}
+
+  '@types/d3-timer@3.0.2':
+    resolution: {integrity: sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==}
+
   '@types/d3-transition@3.0.9':
     resolution: {integrity: sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==}
 
@@ -1645,6 +1686,9 @@ packages:
   '@types/unist@3.0.3':
     resolution: {integrity: sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==}
 
+  '@types/use-sync-external-store@0.0.6':
+    resolution: {integrity: sha512-zFDAD+tlpf2r4asuHEj0XH6pY6i0g5NeAHPn+15wk3BV6JA69eERFXC1gyGThDkVa1zCyKr5jox1+2LbV/AMLg==}
+
   '@types/ws@8.18.1':
     resolution: {integrity: sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==}
 
@@ -2115,6 +2159,10 @@ packages:
   csstype@3.1.3:
     resolution: {integrity: sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==}
 
+  d3-array@3.2.4:
+    resolution: {integrity: sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==}
+    engines: {node: '>=12'}
+
   d3-color@3.1.0:
     resolution: {integrity: sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==}
     engines: {node: '>=12'}
@@ -2131,14 +2179,38 @@ packages:
     resolution: {integrity: sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==}
     engines: {node: '>=12'}
 
+  d3-format@3.1.0:
+    resolution: {integrity: sha512-YyUI6AEuY/Wpt8KWLgZHsIU86atmikuoOmCfommt0LYHiQSPjvX2AcFc38PX0CBpr2RCyZhjex+NS/LPOv6YqA==}
+    engines: {node: '>=12'}
+
   d3-interpolate@3.0.1:
     resolution: {integrity: sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==}
     engines: {node: '>=12'}
 
+  d3-path@3.1.0:
+    resolution: {integrity: sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==}
+    engines: {node: '>=12'}
+
+  d3-scale@4.0.2:
+    resolution: {integrity: sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==}
+    engines: {node: '>=12'}
+
   d3-selection@3.0.0:
     resolution: {integrity: sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==}
     engines: {node: '>=12'}
 
+  d3-shape@3.2.0:
+    resolution: {integrity: sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==}
+    engines: {node: '>=12'}
+
+  d3-time-format@4.1.0:
+    resolution: {integrity: sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==}
+    engines: {node: '>=12'}
+
+  d3-time@3.1.0:
+    resolution: {integrity: sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==}
+    engines: {node: '>=12'}
+
   d3-timer@3.0.1:
     resolution: {integrity: sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==}
     engines: {node: '>=12'}
@@ -2176,6 +2248,9 @@ packages:
       supports-color:
         optional: true
 
+  decimal.js-light@2.5.1:
+    resolution: {integrity: sha512-qIMFpTMZmny+MMIitAB6D7iVPEorVw6YQRWkvarTkT4tBeSLLiHzcwj6q0MmYSFCiVpiqPJTJEYIrpcPzVEIvg==}
+
   decode-named-character-reference@1.1.0:
     resolution: {integrity: sha512-Wy+JTSbFThEOXQIR2L6mxJvEs+veIzpmqD7ynWxMXGpnk3smkHQOp6forLdHsKpAMW9iJpaBBIxz285t1n1C3w==}
 
@@ -2273,6 +2348,9 @@ packages:
     resolution: {integrity: sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==}
     engines: {node: '>= 0.4'}
 
+  es-toolkit@1.43.0:
+    resolution: {integrity: sha512-SKCT8AsWvYzBBuUqMk4NPwFlSdqLpJwmy6AP322ERn8W2YLIB6JBXnwMI2Qsh2gfphT3q7EKAxKb23cvFHFwKA==}
+
   esbuild@0.25.5:
     resolution: {integrity: sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==}
     engines: {node: '>=18'}
@@ -2361,6 +2439,9 @@ packages:
     resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==}
     engines: {node: '>=6'}
 
+  eventemitter3@5.0.1:
+    resolution: {integrity: sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==}
+
   events@3.3.0:
     resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==}
     engines: {node: '>=0.8.x'}
@@ -2613,6 +2694,12 @@ packages:
     resolution: {integrity: sha512-gJzzk+PQNznz8ysRrC0aOkBNVRBDtE1n53IqyqEf3PXrYwomFs5q4pGMizBMJF+ykh03insJ27hB8gSrD2Hn8A==}
     engines: {node: '>= 4'}
 
+  immer@10.2.0:
+    resolution: {integrity: sha512-d/+XTN3zfODyjr89gM3mPq1WNX2B8pYsu7eORitdwyA2sBubnTl3laYlBk4sXY5FUa5qTZGBDPJICVbvqzjlbw==}
+
+  immer@11.0.1:
+    resolution: {integrity: sha512-naDCyggtcBWANtIrjQEajhhBEuL9b0Zg4zmlWK2CzS6xCWSE39/vvf4LqnMjUAWHBhot4m9MHCM/Z+mfWhUkiA==}
+
   import-fresh@3.3.1:
     resolution: {integrity: sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==}
     engines: {node: '>=6'}
@@ -2631,6 +2718,10 @@ packages:
     resolution: {integrity: sha512-UjOaSel/iddGZJ5xP/Eixh6dY1XghiBw4XK13rCCIJcJfyhhoul/7KhLLUGtebEj6GDYM6Vnx/mVsjx2L/mFIA==}
     engines: {node: '>=12.0.0'}
 
+  internmap@2.0.3:
+    resolution: {integrity: sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==}
+    engines: {node: '>=12'}
+
   ipaddr.js@1.9.1:
     resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==}
     engines: {node: '>= 0.10'}
@@ -3315,12 +3406,27 @@ packages:
     peerDependencies:
       react: ^19.1.0
 
+  react-is@19.2.3:
+    resolution: {integrity: sha512-qJNJfu81ByyabuG7hPFEbXqNcWSU3+eVus+KJs+0ncpGfMyYdvSmxiJxbWR65lYi1I+/0HBcliO029gc4F+PnA==}
+
   react-markdown@10.1.0:
     resolution: {integrity: sha512-qKxVopLT/TyA6BX3Ue5NwabOsAzm0Q7kAPwq6L+wWDwisYs7R8vZ0nRXqq6rkueboxpkjvLGU9fWifiX/ZZFxQ==}
     peerDependencies:
       '@types/react': '>=18'
       react: '>=18'
 
+  react-redux@9.2.0:
+    resolution: {integrity: sha512-ROY9fvHhwOD9ySfrF0wmvu//bKCQ6AeZZq1nJNtbDC+kk5DuSuNX/n6YWYF/SYy7bSba4D4FSz8DJeKY/S/r+g==}
+    peerDependencies:
+      '@types/react': ^18.2.25 || ^19
+      react: ^18.0 || ^19
+      redux: ^5.0.0
+    peerDependenciesMeta:
+      '@types/react':
+        optional: true
+      redux:
+        optional: true
+
   react-refresh@0.17.0:
     resolution: {integrity: sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==}
     engines: {node: '>=0.10.0'}
@@ -3408,6 +3514,22 @@ packages:
     resolution: {integrity: sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==}
     engines: {node: '>= 14.18.0'}
 
+  recharts@3.6.0:
+    resolution: {integrity: sha512-L5bjxvQRAe26RlToBAziKUB7whaGKEwD3znoM6fz3DrTowCIC/FnJYnuq1GEzB8Zv2kdTfaxQfi5GoH0tBinyg==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
+      react-dom: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
+      react-is: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
+
+  redux-thunk@3.1.0:
+    resolution: {integrity: sha512-NW2r5T6ksUKXCabzhL9z+h206HQw/NJkcLm1GPImRQ8IzfXwRGqjVhKJGauHirT0DAuyy6hjdnMZaRoAcy0Klw==}
+    peerDependencies:
+      redux: ^5.0.0
+
+  redux@5.0.1:
+    resolution: {integrity: sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==}
+
   refractor@3.6.0:
     resolution: {integrity: sha512-MY9W41IOWxxk31o+YvFCNyNzdkc9M20NoZK5vq6jkv4I/uh2zkWcfudj0Q1fovjUQJrNewS9NMzeTtqPf+n5EA==}
 
@@ -3421,6 +3543,9 @@ packages:
     resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
     engines: {node: '>=0.10.0'}
 
+  reselect@5.1.1:
+    resolution: {integrity: sha512-K/BG6eIky/SBpzfHZv/dd+9JBFiS4SWV7FIujVyJRux6e45+73RaUHXLmIR1f7WOMaQ0U1km6qwklRQxpJJY0w==}
+
   resolve-from@4.0.0:
     resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==}
     engines: {node: '>=4'}
@@ -3632,6 +3757,9 @@ packages:
   through@2.3.8:
     resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
 
+  tiny-invariant@1.3.3:
+    resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==}
+
   tinyglobby@0.2.14:
     resolution: {integrity: sha512-tX5e7OM1HnYr2+a2C/4V0htOcSQcoSTH9KgJnVvNm5zm/cyEWKJ7j7YutsH9CxMdtOkkLFy2AHrMci9IM8IPZQ==}
     engines: {node: '>=12.0.0'}
@@ -3822,6 +3950,9 @@ packages:
   vfile@6.0.3:
     resolution: {integrity: sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==}
 
+  victory-vendor@37.3.6:
+    resolution: {integrity: sha512-SbPDPdDBYp+5MJHhBCAyI7wKM3d5ivekigc2Dk2s7pgbZ9wIgIBYGVw4zGHBml/qTFbexrofXW6Gu4noGxrOwQ==}
+
   vite-plugin-radar@0.10.0:
     resolution: {integrity: sha512-PRApileUv7I+bInGbrQM9LvxvrFHELRvyO5yAodGwIgQBRID/hOPqx0pz7VBWFAcPNjJJVAp/LuT1417BuE/9g==}
     peerDependencies:
@@ -4424,17 +4555,17 @@ snapshots:
       - typescript
       - utf-8-validate
 
-  '@motiadev/plugin-endpoint@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))':
+  '@motiadev/plugin-endpoint@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))':
     dependencies:
       '@monaco-editor/react': 4.7.0(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@motiadev/stream-client-react': 0.8.2-beta.139(react@19.1.0)
-      '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
+      '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
       clsx: 2.1.1
       json-schema: 0.4.0
       lucide-react: 0.544.0(react@19.1.0)
       react18-json-view: 0.2.9(react@19.1.0)
       tailwind-merge: 3.3.1
-      zustand: 5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
+      zustand: 5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
     transitivePeerDependencies:
       - '@types/react'
       - '@types/react-dom'
@@ -4481,7 +4612,7 @@ snapshots:
     dependencies:
       uuid: 11.1.0
 
-  '@motiadev/ui@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))':
+  '@motiadev/ui@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))':
     dependencies:
       '@radix-ui/react-checkbox': 1.3.3(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@radix-ui/react-dropdown-menu': 2.1.16(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
@@ -4499,19 +4630,19 @@ snapshots:
       react-resizable-panels: 3.0.6(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       react-use-resizable: 0.2.0(react@19.1.0)
       tailwind-merge: 3.3.1
-      zustand: 5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
+      zustand: 5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
     transitivePeerDependencies:
       - '@types/react'
       - '@types/react-dom'
       - immer
       - use-sync-external-store
 
-  '@motiadev/workbench@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0))':
+  '@motiadev/workbench@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0))':
     dependencies:
       '@monaco-editor/react': 4.7.0(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
-      '@motiadev/plugin-endpoint': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
+      '@motiadev/plugin-endpoint': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
       '@motiadev/stream-client-react': 0.8.2-beta.139(react@19.1.0)
-      '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
+      '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
       '@radix-ui/react-collapsible': 1.1.12(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@radix-ui/react-dialog': 1.1.14(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@radix-ui/react-dropdown-menu': 2.1.16(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
@@ -4526,7 +4657,7 @@ snapshots:
       '@radix-ui/react-tooltip': 1.2.8(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       '@tailwindcss/postcss': 4.1.13
       '@vitejs/plugin-react': 4.5.0(vite@6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6))
-      '@xyflow/react': 12.8.4(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+      '@xyflow/react': 12.8.4(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
       autoprefixer: 10.4.21(postcss@8.5.3)
       class-variance-authority: 0.7.1
       clsx: 2.1.1
@@ -4547,7 +4678,7 @@ snapshots:
       typescript: 5.8.3
       typescript-eslint: 8.32.1(eslint@9.27.0(jiti@2.5.1))(typescript@5.8.3)
       vite: 6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6)
-      zustand: 5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
+      zustand: 5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))
     transitivePeerDependencies:
       - '@types/node'
       - '@types/react'
@@ -5102,6 +5233,18 @@ snapshots:
 
   '@radix-ui/rect@1.1.1': {}
 
+  '@reduxjs/toolkit@2.11.2(react-redux@9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1))(react@19.1.0)':
+    dependencies:
+      '@standard-schema/spec': 1.0.0
+      '@standard-schema/utils': 0.3.0
+      immer: 11.0.1
+      redux: 5.0.1
+      redux-thunk: 3.1.0(redux@5.0.1)
+      reselect: 5.1.1
+    optionalDependencies:
+      react: 19.1.0
+      react-redux: 9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1)
+
   '@rolldown/pluginutils@1.0.0-beta.9': {}
 
   '@rollup/rollup-android-arm-eabi@4.41.1':
@@ -5166,6 +5309,8 @@ snapshots:
 
   '@standard-schema/spec@1.0.0': {}
 
+  '@standard-schema/utils@0.3.0': {}
+
   '@supabase/auth-js@2.71.1':
     dependencies:
       '@supabase/node-fetch': 2.6.15
@@ -5380,18 +5525,36 @@ snapshots:
     dependencies:
       '@babel/types': 7.27.3
 
+  '@types/d3-array@3.2.2': {}
+
   '@types/d3-color@3.1.3': {}
 
   '@types/d3-drag@3.0.7':
     dependencies:
       '@types/d3-selection': 3.0.11
 
+  '@types/d3-ease@3.0.2': {}
+
   '@types/d3-interpolate@3.0.4':
     dependencies:
       '@types/d3-color': 3.1.3
 
+  '@types/d3-path@3.1.1': {}
+
+  '@types/d3-scale@4.0.9':
+    dependencies:
+      '@types/d3-time': 3.0.4
+
   '@types/d3-selection@3.0.11': {}
 
+  '@types/d3-shape@3.1.7':
+    dependencies:
+      '@types/d3-path': 3.1.1
+
+  '@types/d3-time@3.0.4': {}
+
+  '@types/d3-timer@3.0.2': {}
+
   '@types/d3-transition@3.0.9':
     dependencies:
       '@types/d3-selection': 3.0.11
@@ -5460,6 +5623,8 @@ snapshots:
 
   '@types/unist@3.0.3': {}
 
+  '@types/use-sync-external-store@0.0.6': {}
+
   '@types/ws@8.18.1':
     dependencies:
       '@types/node': 22.15.21
@@ -5650,13 +5815,13 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
-  '@xyflow/react@12.8.4(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
+  '@xyflow/react@12.8.4(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)':
     dependencies:
       '@xyflow/system': 0.0.68
       classcat: 5.0.5
       react: 19.1.0
       react-dom: 19.1.0(react@19.1.0)
-      zustand: 4.5.7(@types/react@19.1.6)(react@19.1.0)
+      zustand: 4.5.7(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)
     transitivePeerDependencies:
       - '@types/react'
       - immer
@@ -6014,6 +6179,10 @@ snapshots:
 
   csstype@3.1.3: {}
 
+  d3-array@3.2.4:
+    dependencies:
+      internmap: 2.0.3
+
   d3-color@3.1.0: {}
 
   d3-dispatch@3.0.1: {}
@@ -6025,12 +6194,36 @@ snapshots:
 
   d3-ease@3.0.1: {}
 
+  d3-format@3.1.0: {}
+
   d3-interpolate@3.0.1:
     dependencies:
       d3-color: 3.1.0
 
+  d3-path@3.1.0: {}
+
+  d3-scale@4.0.2:
+    dependencies:
+      d3-array: 3.2.4
+      d3-format: 3.1.0
+      d3-interpolate: 3.0.1
+      d3-time: 3.1.0
+      d3-time-format: 4.1.0
+
   d3-selection@3.0.0: {}
 
+  d3-shape@3.2.0:
+    dependencies:
+      d3-path: 3.1.0
+
+  d3-time-format@4.1.0:
+    dependencies:
+      d3-time: 3.1.0
+
+  d3-time@3.1.0:
+    dependencies:
+      d3-array: 3.2.4
+
   d3-timer@3.0.1: {}
 
   d3-transition@3.0.1(d3-selection@3.0.0):
@@ -6065,6 +6258,8 @@ snapshots:
     dependencies:
       ms: 2.1.3
 
+  decimal.js-light@2.5.1: {}
+
   decode-named-character-reference@1.1.0:
     dependencies:
       character-entities: 2.0.2
@@ -6144,6 +6339,8 @@ snapshots:
       has-tostringtag: 1.0.2
       hasown: 2.0.2
 
+  es-toolkit@1.43.0: {}
+
   esbuild@0.25.5:
     optionalDependencies:
       '@esbuild/aix-ppc64': 0.25.5
@@ -6265,6 +6462,8 @@ snapshots:
 
   event-target-shim@5.0.1: {}
 
+  eventemitter3@5.0.1: {}
+
   events@3.3.0: {}
 
   eventsource-parser@3.0.6: {}
@@ -6593,6 +6792,10 @@ snapshots:
 
   ignore@7.0.4: {}
 
+  immer@10.2.0: {}
+
+  immer@11.0.1: {}
+
   import-fresh@3.3.1:
     dependencies:
       parent-module: 1.0.1
@@ -6624,6 +6827,8 @@ snapshots:
     transitivePeerDependencies:
       - '@types/node'
 
+  internmap@2.0.3: {}
+
   ipaddr.js@1.9.1: {}
 
   is-alphabetical@1.0.4: {}
@@ -7147,12 +7352,12 @@ snapshots:
     dependencies:
       '@types/trusted-types': 1.0.6
 
-  motia@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0)):
+  motia@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0)):
     dependencies:
       '@amplitude/analytics-node': 1.5.9
       '@motiadev/core': 0.8.2-beta.139(@types/node@22.15.21)(typescript@5.8.3)
       '@motiadev/stream-client-node': 0.8.2-beta.139
-      '@motiadev/workbench': 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0))
+      '@motiadev/workbench': 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0))
       antlr4ts: 0.5.0-alpha.4
       archiver: 7.0.1
       axios: 1.11.0
@@ -7393,6 +7598,8 @@ snapshots:
       react: 19.1.0
       scheduler: 0.26.0
 
+  react-is@19.2.3: {}
+
   react-markdown@10.1.0(@types/react@19.1.6)(react@19.1.0):
     dependencies:
       '@types/hast': 3.0.4
@@ -7411,6 +7618,15 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  react-redux@9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1):
+    dependencies:
+      '@types/use-sync-external-store': 0.0.6
+      react: 19.1.0
+      use-sync-external-store: 1.5.0(react@19.1.0)
+    optionalDependencies:
+      '@types/react': 19.1.6
+      redux: 5.0.1
+
   react-refresh@0.17.0: {}
 
   react-remove-scroll-bar@2.3.8(@types/react@19.1.6)(react@19.1.0):
@@ -7504,6 +7720,32 @@ snapshots:
 
   readdirp@4.1.2: {}
 
+  recharts@3.6.0(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react-is@19.2.3)(react@19.1.0)(redux@5.0.1):
+    dependencies:
+      '@reduxjs/toolkit': 2.11.2(react-redux@9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1))(react@19.1.0)
+      clsx: 2.1.1
+      decimal.js-light: 2.5.1
+      es-toolkit: 1.43.0
+      eventemitter3: 5.0.1
+      immer: 10.2.0
+      react: 19.1.0
+      react-dom: 19.1.0(react@19.1.0)
+      react-is: 19.2.3
+      react-redux: 9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1)
+      reselect: 5.1.1
+      tiny-invariant: 1.3.3
+      use-sync-external-store: 1.5.0(react@19.1.0)
+      victory-vendor: 37.3.6
+    transitivePeerDependencies:
+      - '@types/react'
+      - redux
+
+  redux-thunk@3.1.0(redux@5.0.1):
+    dependencies:
+      redux: 5.0.1
+
+  redux@5.0.1: {}
+
   refractor@3.6.0:
     dependencies:
       hastscript: 6.0.0
@@ -7529,6 +7771,8 @@ snapshots:
 
   require-from-string@2.0.2: {}
 
+  reselect@5.1.1: {}
+
   resolve-from@4.0.0: {}
 
   resolve-pkg-maps@1.0.0:
@@ -7813,6 +8057,8 @@ snapshots:
 
   through@2.3.8: {}
 
+  tiny-invariant@1.3.3: {}
+
   tinyglobby@0.2.14:
     dependencies:
       fdir: 6.4.4(picomatch@4.0.2)
@@ -8008,6 +8254,23 @@ snapshots:
       '@types/unist': 3.0.3
       vfile-message: 4.0.2
 
+  victory-vendor@37.3.6:
+    dependencies:
+      '@types/d3-array': 3.2.2
+      '@types/d3-ease': 3.0.2
+      '@types/d3-interpolate': 3.0.4
+      '@types/d3-scale': 4.0.9
+      '@types/d3-shape': 3.1.7
+      '@types/d3-time': 3.0.4
+      '@types/d3-timer': 3.0.2
+      d3-array: 3.2.4
+      d3-ease: 3.0.1
+      d3-interpolate: 3.0.1
+      d3-scale: 4.0.2
+      d3-shape: 3.2.0
+      d3-time: 3.1.0
+      d3-timer: 3.0.1
+
   vite-plugin-radar@0.10.0(vite@6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6)):
     dependencies:
       vite: 6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6)
@@ -8090,16 +8353,18 @@ snapshots:
 
   zod@3.25.76: {}
 
-  zustand@4.5.7(@types/react@19.1.6)(react@19.1.0):
+  zustand@4.5.7(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0):
     dependencies:
       use-sync-external-store: 1.5.0(react@19.1.0)
     optionalDependencies:
       '@types/react': 19.1.6
+      immer: 11.0.1
       react: 19.1.0
 
-  zustand@5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)):
+  zustand@5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)):
     optionalDependencies:
       '@types/react': 19.1.6
+      immer: 11.0.1
       react: 19.1.0
       use-sync-external-store: 1.5.0(react@19.1.0)
 
diff --git a/types/game-history.ts b/types/game-history.ts
new file mode 100644
index 0000000..ade0c1d
--- /dev/null
+++ b/types/game-history.ts
@@ -0,0 +1,61 @@
+import { z } from 'zod'
+import { ScoreboardSchema, BenchmarkVariantSchema } from './game'
+import { GameMoveSchema } from './game-move'
+import { GameMessageSchema } from './game-message'
+import { AiModelProviderSchema } from './ai-models'
+
+export const GameHistorySchema = z.object({ // Schema for one game-history record: metadata, players, result, stats and full move/message data
+  id: z.string({ description: 'The ID of the game' }),
+  
+  // Game metadata — NOTE(review): 'Unix timestamp' does not say seconds vs. milliseconds, while duration is in ms; confirm the units agree
+  startedAt: z.number({ description: 'Unix timestamp when game started' }),
+  endedAt: z.number({ description: 'Unix timestamp when game ended' }),
+  duration: z.number({ description: 'Game duration in milliseconds' }),
+  
+  // Players info — both sides share the same shape: provider and model are optional, isHuman is required
+  whitePlayer: z.object({
+    provider: AiModelProviderSchema().optional(),
+    model: z.string().optional(),
+    isHuman: z.boolean(),
+  }),
+  blackPlayer: z.object({
+    provider: AiModelProviderSchema().optional(),
+    model: z.string().optional(),
+    isHuman: z.boolean(),
+  }),
+  
+  // Game result — winner and endGameReason may be absent; variant falls back to 'guided' when omitted from the input
+  status: z.enum(['completed', 'draw', 'endedEarly']),
+  winner: z.enum(['white', 'black']).optional(),
+  endGameReason: z.string().optional(),
+  variant: BenchmarkVariantSchema.default('guided'),
+  
+  // Stats — total move count plus illegal move attempts per colour
+  totalMoves: z.number({ description: 'Total number of moves in the game' }),
+  whiteIllegalMoves: z.number({ description: 'Illegal move attempts by white' }),
+  blackIllegalMoves: z.number({ description: 'Illegal move attempts by black' }),
+  
+  // Full game data — final position, every move and every message; scoreboard is optional
+  finalFen: z.string({ description: 'Final board position FEN' }),
+  moves: z.array(GameMoveSchema, { description: 'All moves in the game' }),
+  messages: z.array(GameMessageSchema, { description: 'All AI messages/reasoning' }),
+  scoreboard: ScoreboardSchema.optional(),
+  
+  // PGN for export (optional)
+  pgn: z.string({ description: 'Game in PGN format' }).optional(),
+})
+
+export const GameHistoryFilterSchema = z.object({ // Filter criteria (all optional) plus limit/offset pagination with defaults
+  provider: AiModelProviderSchema().optional(),
+  model: z.string().optional(),
+  variant: BenchmarkVariantSchema.optional(),
+  winner: z.enum(['white', 'black']).optional(),
+  status: z.enum(['completed', 'draw', 'endedEarly']).optional(),
+  startDate: z.number().optional(),
+  endDate: z.number().optional(),
+  limit: z.number().int().positive().default(50), // whole number >= 1 (negative/fractional values are rejected); defaults to 50
+  offset: z.number().int().nonnegative().default(0), // whole number >= 0 (negative/fractional values are rejected); defaults to 0
+})
+
+export type GameHistory = z.infer<typeof GameHistorySchema> // parsed (output) shape of GameHistorySchema
+export type GameHistoryFilter = z.infer<typeof GameHistoryFilterSchema> // parsed (output) shape; limit and offset are always numbers here because the schema supplies defaults
diff --git a/types/game.ts b/types/game.ts
index 483c280..01adc49 100644
--- a/types/game.ts
+++ b/types/game.ts
@@ -50,11 +50,16 @@ export const PlayerSchema = () =>
     promotions: z.number({ description: 'The number of pawn promotions' }).optional(),
   })
 
+export const BenchmarkVariantSchema = z.enum(['guided', 'unguided'], { // the two accepted variant values; no default is applied at this level
+  description: 'Benchmark variant: guided (with legal moves) or unguided (FEN only)',
+})
+
 export const GameSchema = z.object({
   id: z.string({ description: 'The ID of the game' }),
   fen: z.string({ description: 'The FEN of the game' }),
   turn: z.enum(['white', 'black'], { description: 'The color of the current turn' }),
   status: z.enum(['pending', 'completed', 'draw', 'endedEarly'], { description: 'The status of the game' }),
+  variant: BenchmarkVariantSchema.default('guided').optional(),
   lastMove: z.array(z.string({ description: 'The last move made' })).optional(),
   lastMoveSan: z.string({ description: 'The last move made in Standard Algebraic Notation (SAN)' }).optional(),
   winner: z.enum(['white', 'black']).optional(),
@@ -66,6 +71,7 @@ export const GameSchema = z.object({
   }),
   check: z.boolean({ description: 'Whether the game is in check' }),
   scoreboard: ScoreboardSchema.optional(),
+  createdAt: z.number({ description: 'Unix timestamp when game was created' }).optional(),
 })
 
 export const roleSchema = z.enum(['white', 'black', 'spectator', 'root'])
@@ -76,3 +82,4 @@ export type Player = z.infer<ReturnType<typeof PlayerSchema>>
 export type PlayerScore = z.infer<ReturnType<typeof PlayerScoreSchema>>
 export type Scoreboard = z.infer<typeof ScoreboardSchema>
 export type GameRole = z.infer<typeof roleSchema>
+export type BenchmarkVariant = z.infer<typeof BenchmarkVariantSchema>
diff --git a/types/legal-move-benchmark.ts b/types/legal-move-benchmark.ts
new file mode 100644
index 0000000..3385d03
--- /dev/null
+++ b/types/legal-move-benchmark.ts
@@ -0,0 +1,76 @@
+import { z } from 'zod'
+import { AiModelProviderSchema } from './ai-models'
+
+export const TestPositionSchema = z.object({ // One test position; all fields are required
+  id: z.string(),
+  fen: z.string(),
+  pgn: z.string(),
+  turn: z.enum(['white', 'black']),
+  legalMoves: z.array(z.string()), // move notation (SAN vs UCI) is not fixed by this schema — TODO confirm
+  legalMoveCount: z.number(), // presumably legalMoves.length; the schema does not enforce it
+  moveNumber: z.number(),
+})
+
+export const ModelBenchmarkResultSchema = z.object({ // Scored result for one position, referenced by positionId
+  positionId: z.string(), // presumably a TestPositionSchema id — verify against the producer
+  modelMoves: z.array(z.string()),
+  correctMoves: z.array(z.string()),
+  illegalMoves: z.array(z.string()),
+  missedMoves: z.array(z.string()),
+  accuracy: z.number(), // percentage of legal moves found
+  penalty: z.number(), // penalty for illegal moves
+  finalScore: z.number(), // accuracy - penalty
+  responseTime: z.number(), // ms
+  rawResponse: z.string(), // raw model response for debugging
+  error: z.string().optional(), // if model failed to respond
+})
+
+export const LegalMoveBenchmarkRunSchema = z.object({ // One legal-move benchmark run: metadata, positions, per-position results, aggregates
+  id: z.string(),
+  createdAt: z.number(), // numeric timestamp; unit is not stated here — TODO confirm
+  completedAt: z.number().optional(),
+  status: z.enum(['pending', 'running', 'completed', 'failed']),
+  
+  // Model info
+  provider: AiModelProviderSchema(), // AiModelProviderSchema is a factory, so it is called to get a schema instance
+  model: z.string(),
+  
+  // Test configuration
+  positionCount: z.number(),
+  positions: z.array(TestPositionSchema),
+  
+  // Results
+  results: z.array(ModelBenchmarkResultSchema),
+  
+  // Aggregate scores (every aggregate field is optional in this schema)
+  averageAccuracy: z.number().optional(),
+  averagePenalty: z.number().optional(),
+  averageFinalScore: z.number().optional(),
+  totalCorrectMoves: z.number().optional(),
+  totalIllegalMoves: z.number().optional(),
+  totalMissedMoves: z.number().optional(),
+})
+
+export const LegalMoveBenchmarkSummarySchema = z.object({ // Per provider/model summary of legal-move runs; all fields required
+  id: z.string(),
+  provider: AiModelProviderSchema(),
+  model: z.string(),
+  runsCompleted: z.number(),
+  averageScore: z.number(),
+  bestScore: z.number(),
+  worstScore: z.number(),
+  lastRunAt: z.number(), // numeric timestamp; unit is not stated here — TODO confirm
+})
+
+export const PositionSetSchema = z.object({ // A stored collection of TestPositionSchema entries
+  id: z.string(),
+  createdAt: z.number(),
+  count: z.number(), // presumably positions.length; the schema does not enforce it
+  positions: z.array(TestPositionSchema),
+})
+
+export type TestPosition = z.infer<typeof TestPositionSchema> // Static types below are inferred from the zod schemas in this file
+export type ModelBenchmarkResult = z.infer<typeof ModelBenchmarkResultSchema>
+export type LegalMoveBenchmarkRun = z.infer<typeof LegalMoveBenchmarkRunSchema>
+export type LegalMoveBenchmarkSummary = z.infer<typeof LegalMoveBenchmarkSummarySchema>
+export type PositionSet = z.infer<typeof PositionSetSchema>
diff --git a/types/puzzle-benchmark.ts b/types/puzzle-benchmark.ts
new file mode 100644
index 0000000..134fd82
--- /dev/null
+++ b/types/puzzle-benchmark.ts
@@ -0,0 +1,75 @@
+import { z } from 'zod'
+import { AiModelProviderSchema } from './ai-models'
+
+export const PuzzleThemeSchema = z.enum(['mateIn1', 'oneMove']) // Zod enum accepting exactly 'mateIn1' or 'oneMove'
+
+export const LichessPuzzleSchema = z.object({ // One puzzle record; all fields are required
+  id: z.string(),
+  rating: z.number(),
+  themes: z.array(z.string()), // free-form strings here, not restricted to PuzzleThemeSchema values
+  solution: z.array(z.string()), // UCI format moves
+  initialPly: z.number(),
+  pgn: z.string(),
+  fen: z.string(), // Position where puzzle starts
+  legalMoves: z.array(z.string()), // Legal moves in SAN format
+  solutionSan: z.string(), // First solution move in SAN
+})
+
+export const PuzzleSetSchema = z.object({ // A collection of puzzles tagged with a single theme
+  id: z.string(),
+  theme: PuzzleThemeSchema,
+  createdAt: z.number(),
+  puzzles: z.array(LichessPuzzleSchema),
+  count: z.number(), // presumably puzzles.length; the schema does not enforce it
+})
+
+export const PuzzleResultSchema = z.object({ // Outcome of one puzzle attempt, referenced by puzzleId
+  puzzleId: z.string(),
+  modelMove: z.string().optional(), // What the model played
+  correctMove: z.string(), // The correct solution
+  isCorrect: z.boolean(),
+  responseTime: z.number(), // unit not stated in this file — presumably ms; TODO confirm
+  rawResponse: z.string(),
+  error: z.string().optional(),
+})
+
+export const PuzzleBenchmarkRunSchema = z.object({ // One puzzle benchmark run: metadata, puzzle set reference, results, aggregates
+  id: z.string(),
+  createdAt: z.number(),
+  completedAt: z.number().optional(),
+  status: z.enum(['pending', 'running', 'completed', 'failed']),
+
+  // Model info
+  provider: AiModelProviderSchema(), // AiModelProviderSchema is a factory, so it is called to get a schema instance
+  model: z.string(),
+
+  // Which puzzle set was used
+  puzzleSetId: z.string(),
+  theme: PuzzleThemeSchema,
+
+  // Results
+  results: z.array(PuzzleResultSchema),
+
+  // Aggregate scores (totalPuzzles is required; the other two are optional)
+  totalPuzzles: z.number(),
+  correctCount: z.number().optional(),
+  accuracy: z.number().optional(), // percentage
+})
+
+export const PuzzleBenchmarkSummarySchema = z.object({ // Per provider/model summary of puzzle runs
+  id: z.string(),
+  provider: AiModelProviderSchema(),
+  model: z.string(),
+  mateIn1Accuracy: z.number().optional(), // field names mirror the two PuzzleThemeSchema values
+  oneMoveAccuracy: z.number().optional(),
+  overallAccuracy: z.number().optional(),
+  runsCompleted: z.number(),
+  lastRunAt: z.number(),
+})
+
+export type PuzzleTheme = z.infer<typeof PuzzleThemeSchema> // Static types below are inferred from the zod schemas in this file
+export type LichessPuzzle = z.infer<typeof LichessPuzzleSchema>
+export type PuzzleSet = z.infer<typeof PuzzleSetSchema>
+export type PuzzleResult = z.infer<typeof PuzzleResultSchema>
+export type PuzzleBenchmarkRun = z.infer<typeof PuzzleBenchmarkRunSchema>
+export type PuzzleBenchmarkSummary = z.infer<typeof PuzzleBenchmarkSummarySchema>
diff --git a/types/stockfish-benchmark.ts b/types/stockfish-benchmark.ts
new file mode 100644
index 0000000..e308cbb
--- /dev/null
+++ b/types/stockfish-benchmark.ts
@@ -0,0 +1,86 @@
+import { z } from 'zod'
+import { AiModelProviderSchema } from './ai-models'
+
+export const StockfishGameMoveSchema = z.object({ // One move of a game, with optional engine evaluation data
+  moveNumber: z.number(),
+  player: z.enum(['white', 'black']),
+  moveSan: z.string(),
+  fen: z.string(), // whether this is the position before or after the move is not stated — TODO confirm
+  centipawnScore: z.number().optional(), // Evaluation after move
+  bestMove: z.string().optional(), // What Stockfish thinks was best
+  centipawnLoss: z.number().optional(), // Difference from best move
+  isAiMove: z.boolean(), // true if AI made this move, false if Stockfish
+  responseTime: z.number().optional(), // ms for AI moves
+  error: z.string().optional(),
+})
+
+export const StockfishGameResultSchema = z.object({ // One game record: metadata, move list, and optional centipawn-loss statistics
+  id: z.string(),
+  createdAt: z.number(),
+  completedAt: z.number().optional(),
+  status: z.enum(['running', 'completed', 'failed']),
+
+  // Model info
+  provider: AiModelProviderSchema(), // AiModelProviderSchema is a factory, so it is called to get a schema instance
+  model: z.string(),
+
+  // Game info
+  aiColor: z.enum(['white', 'black']),
+  stockfishLevel: z.number(), // 1-20 (documented range only; the schema does not enforce it)
+  result: z.enum(['ai_win', 'stockfish_win', 'draw', 'ai_illegal_move', 'timeout']).optional(),
+  resultReason: z.string().optional(),
+
+  // Moves
+  moves: z.array(StockfishGameMoveSchema),
+  totalMoves: z.number(),
+  finalFen: z.string().optional(),
+  pgn: z.string().optional(),
+
+  // ACPL calculation (only for AI moves); thresholds below are documented only, not enforced by the schema
+  aiMoveCount: z.number().optional(),
+  totalCentipawnLoss: z.number().optional(),
+  averageCentipawnLoss: z.number().optional(), // ACPL
+  blunders: z.number().optional(), // moves with >100 centipawn loss
+  mistakes: z.number().optional(), // moves with 50-100 centipawn loss
+  inaccuracies: z.number().optional(), // moves with 25-50 centipawn loss
+})
+
+export const StockfishBenchmarkRunSchema = z.object({ // One benchmark run pairing up to two games with combined statistics
+  id: z.string(),
+  createdAt: z.number(),
+  completedAt: z.number().optional(),
+  status: z.enum(['running', 'completed', 'failed']),
+
+  provider: AiModelProviderSchema(),
+  model: z.string(),
+  stockfishLevel: z.number(),
+
+  // Two games: one as white, one as black (both optional in this schema)
+  gameAsWhite: StockfishGameResultSchema.optional(),
+  gameAsBlack: StockfishGameResultSchema.optional(),
+
+  // Combined stats
+  gamesPlayed: z.number(),
+  wins: z.number(),
+  losses: z.number(),
+  draws: z.number(),
+  overallAcpl: z.number().optional(), // Average ACPL across both games
+})
+
+export const StockfishBenchmarkSummarySchema = z.object({ // Per provider/model summary of Stockfish runs; all fields required
+  id: z.string(),
+  provider: AiModelProviderSchema(),
+  model: z.string(),
+  runsCompleted: z.number(),
+  averageAcpl: z.number(),
+  bestAcpl: z.number(), // presumably the lowest ACPL seen — TODO confirm against the aggregator
+  wins: z.number(),
+  losses: z.number(),
+  draws: z.number(),
+  lastRunAt: z.number(),
+})
+
+export type StockfishGameMove = z.infer<typeof StockfishGameMoveSchema> // Static types below are inferred from the zod schemas in this file
+export type StockfishGameResult = z.infer<typeof StockfishGameResultSchema>
+export type StockfishBenchmarkRun = z.infer<typeof StockfishBenchmarkRunSchema>
+export type StockfishBenchmarkSummary = z.infer<typeof StockfishBenchmarkSummarySchema>