diff --git a/.gitignore b/.gitignore index db1c917..891c924 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ dist .env .env.production api/lib/* -deploy.sh \ No newline at end of file +deploy.sh +MOTIA_DOCS.md \ No newline at end of file diff --git a/api/motia-workbench.json b/api/motia-workbench.json index 0fd0ed7..e75cc4d 100644 --- a/api/motia-workbench.json +++ b/api/motia-workbench.json @@ -20,6 +20,22 @@ { "id": "chess", "config": { + "steps/chess/12-play-vs-ai.step.ts": { + "x": 350, + "y": 1056 + }, + "steps/chess/11-get-game-history-detail.step.ts": { + "x": 0, + "y": 0 + }, + "steps/chess/10b-export-game-history.step.ts": { + "x": 6, + "y": 224 + }, + "steps/chess/10-get-game-history.step.ts": { + "x": 26, + "y": 428 + }, "steps/chess/09-purge-stuck-games.step.ts": { "x": -237, "y": 580 @@ -50,8 +66,8 @@ "y": 112 }, "steps/chess/02-get-game.step.ts": { - "x": 1045, - "y": 114 + "x": 1019, + "y": 92 }, "steps/chess/01-create-game.step.ts": { "x": 432, @@ -59,8 +75,8 @@ "targetHandlePosition": "left" }, "steps/chess/00-available-models-api.step.ts": { - "x": -209, - "y": 86, + "x": -460, + "y": 111, "sourceHandlePosition": "right" }, "steps/chess/access/request-access.step.ts": { @@ -76,5 +92,62 @@ "y": 680 } } + }, + { + "id": "benchmark", + "config": { + "steps/benchmark/11-stockfish-leaderboard.step.ts": { + "x": 0, + "y": 2444 + }, + "steps/benchmark/10-run-stockfish-benchmark.step.ts": { + "x": 0, + "y": 2220 + }, + "steps/benchmark/09-run-all-benchmarks.step.ts": { + "x": -282, + "y": 337 + }, + "steps/benchmark/08-get-puzzle-sets.step.ts": { + "x": 435, + "y": 30 + }, + "steps/benchmark/07-get-puzzle-leaderboard.step.ts": { + "x": 3, + "y": 204 + }, + "steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts": { + "x": 0, + "y": 1120 + }, + "steps/benchmark/06-run-puzzle-benchmark.step.ts": { + "x": 465, + "y": 360 + }, + "steps/benchmark/05-fetch-puzzle-set.step.ts": { + "x": 0, + "y": 612 + }, + 
"steps/benchmark/04-get-benchmark-leaderboard.step.ts": { + "x": 0, + "y": 816 + }, + "steps/benchmark/03-get-benchmark-run-detail.step.ts": { + "x": 0, + "y": 1060 + }, + "steps/benchmark/02-get-benchmark-runs.step.ts": { + "x": 0, + "y": 1304 + }, + "steps/benchmark/01-run-legal-move-benchmark.step.ts": { + "x": 0, + "y": 1528 + }, + "steps/benchmark/00-generate-position-set.step.ts": { + "x": 0, + "y": 1752 + } + } } ] \ No newline at end of file diff --git a/api/services/ai/claude.ts b/api/services/ai/claude.ts index bc469bd..15dbc51 100644 --- a/api/services/ai/claude.ts +++ b/api/services/ai/claude.ts @@ -2,6 +2,7 @@ import { streamObject } from 'ai' import { createAnthropic } from '@ai-sdk/anthropic' import { AiPlayerPromptSchema } from '@chessarena/types/ai-models' import { models } from './models' +import { getMaxReasoningProviderOptions } from './provider-options' import { Handler } from './types' export const claude: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => { @@ -9,12 +10,15 @@ export const claude: Handler = async ({ prompt, logger, model, onThoughtUpdate } apiKey: process.env.ANTHROPIC_API_KEY, }) + const modelId = model ?? models.claude const { partialObjectStream, object } = streamObject({ - model: anthropic(model ?? 
models.claude), + model: anthropic(modelId), prompt, schema: AiPlayerPromptSchema, + mode: 'json', maxRetries: 0, abortSignal: AbortSignal.timeout(180000), + providerOptions: getMaxReasoningProviderOptions('claude', modelId), }) for await (const partialObject of partialObjectStream) { diff --git a/api/services/ai/gemini.ts b/api/services/ai/gemini.ts index 2a198d1..79b34b1 100644 --- a/api/services/ai/gemini.ts +++ b/api/services/ai/gemini.ts @@ -2,6 +2,7 @@ import { streamObject } from 'ai' import { createGoogleGenerativeAI } from '@ai-sdk/google' import { AiPlayerPromptSchema } from '@chessarena/types/ai-models' import { models } from './models' +import { getMaxReasoningProviderOptions } from './provider-options' import { Handler } from './types' export const gemini: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => { @@ -9,12 +10,14 @@ export const gemini: Handler = async ({ prompt, logger, model, onThoughtUpdate } apiKey: process.env.GEMINI_API_KEY, }) + const modelId = model ?? models.gemini const { partialObjectStream, object } = streamObject({ - model: googleAI(model ?? 
models.gemini), + model: googleAI(modelId), prompt, schema: AiPlayerPromptSchema, maxRetries: 0, abortSignal: AbortSignal.timeout(180000), + providerOptions: getMaxReasoningProviderOptions('gemini', modelId), }) for await (const partialObject of partialObjectStream) { diff --git a/api/services/ai/grok.ts b/api/services/ai/grok.ts index 9a46d16..cdc75ec 100644 --- a/api/services/ai/grok.ts +++ b/api/services/ai/grok.ts @@ -2,6 +2,7 @@ import { streamObject } from 'ai' import { createXai } from '@ai-sdk/xai' import { AiPlayerPromptSchema } from '@chessarena/types/ai-models' import { models } from './models' +import { getMaxReasoningProviderOptions } from './provider-options' import { Handler } from './types' export const grok: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => { @@ -9,12 +10,15 @@ export const grok: Handler = async ({ prompt, logger, model, onThoughtUpdate }) apiKey: process.env.XAI_API_KEY, }) + const modelId = model ?? models.grok const { partialObjectStream, object } = streamObject({ - model: xai(model ?? models.grok), + model: xai(modelId), prompt, schema: AiPlayerPromptSchema, + mode: 'json', maxRetries: 0, abortSignal: AbortSignal.timeout(180000), + providerOptions: getMaxReasoningProviderOptions('grok', modelId), }) for await (const partialObject of partialObjectStream) { diff --git a/api/services/ai/models.ts b/api/services/ai/models.ts index 3bffecb..b2804e2 100644 --- a/api/services/ai/models.ts +++ b/api/services/ai/models.ts @@ -1,47 +1,84 @@ -import { AiModels, AiProviderDefaultModel } from '@chessarena/types/ai-models' +import { AiModels, AiModelProvider, AiProviderDefaultModel } from '@chessarena/types/ai-models' // NOTE: these are the models used for AI vs AI games, it is also used for backwards compatibility for existing games that don't have a model assigned to a player +// IMPORTANT: These must match model names in supportedModelsByProvider below! 
export const models: AiProviderDefaultModel = { - openai: 'gpt-5-2025-08-07', + openai: 'gpt-5.2', gemini: 'gemini-2.5-flash', - claude: 'claude-sonnet-4-5-20250929', - grok: 'grok-4-fast', + claude: 'claude-sonnet-4-5', + grok: 'grok-4-fast-non-reasoning', } -// NOTE: these are all the models supported by provider that users can pick in order to play human vs AI games +/** + * ============================================ + * BENCHMARK MODELS - Add new models here! + * ============================================ + * + * To add a new model for benchmarking: + * 1. Add it to the appropriate provider array below + * 2. Restart the dev server + * 3. Run the benchmark: POST /benchmark/legal-moves/run-all + * + * To run benchmark for a single model: + * POST /benchmark/legal-moves/run { "provider": "claude", "model": "claude-3-5-haiku-20241022" } + * + * Provider documentation: + * - OpenAI: https://platform.openai.com/docs/models + * - Gemini: https://ai.google.dev/gemini-api/docs/models + * - Claude: https://docs.anthropic.com/en/docs/about-claude/models/overview + * - Grok: https://docs.x.ai/docs/models + */ export const supportedModelsByProvider: AiModels = { + // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/openai openai: [ - // https://platform.openai.com/docs/models - 'gpt-5-2025-08-07', - 'gpt-5-mini-2025-08-07', - 'gpt-5-nano-2025-08-07', - 'gpt-4.1-nano-2025-04-14', - 'gpt-4.1-mini-2025-04-14', - 'gpt-4o-mini-2024-07-18', - 'o4-mini-2025-04-16', + 'gpt-5.2', // Latest + 'gpt-5.1', // Previous flagship + 'gpt-5', // GPT-5 + 'gpt-5-mini', // Fast + 'gpt-4.1', // GPT-4.1 + 'gpt-4.1-mini', // Fast GPT-4.1 + 'gpt-4o', // GPT-4o + 'gpt-4o-mini', // Fast GPT-4o ], + // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/google-generative-ai gemini: [ - // https://ai.google.dev/gemini-api/docs/models - 'gemini-2.5-flash', - 'gemini-2.5-flash-lite', - 'gemini-2.0-flash-001', - 'gemini-2.0-flash-lite-001', + 'gemini-3-pro-preview', 
// Latest preview + 'gemini-2.5-pro', // Latest pro + 'gemini-2.5-flash', // Fast + 'gemini-2.5-flash-lite', // Ultra fast + 'gemini-2.0-flash', // Stable flash ], + // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/anthropic claude: [ - // https://docs.anthropic.com/en/docs/about-claude/models/overview - 'claude-opus-4-1-20250805', - 'claude-opus-4-20250514', - 'claude-sonnet-4-5-20250929', - 'claude-sonnet-4-20250514', - 'claude-3-7-sonnet-20250219', - 'claude-haiku-4-5-20251001', - 'claude-3-5-haiku-20241022', - ], - grok: [ - // https://docs.x.ai/docs/models - 'grok-4-fast', - 'grok-4-fast-non-reasoning', - 'grok-3-mini', - 'grok-3', + 'claude-opus-4-5', // Latest opus (no dot!) + 'claude-sonnet-4-5', // Latest sonnet (no dot!) + 'claude-haiku-4-5', // Latest haiku (no dot!) + 'claude-opus-4-0', // Opus 4.0 + 'claude-sonnet-4-0', // Sonnet 4.0 + 'claude-3-7-sonnet-latest', // Claude 3.7 + 'claude-3-5-haiku-latest', // Claude 3.5 Haiku ], + // From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/xai + grok: ['grok-4-fast-non-reasoning', 'grok-4-fast-reasoning', 'grok-3-fast'], +} + +/** + * Helper to get all models as a flat array with provider info + * Used by benchmarks + */ +export const getAllModels = (): { provider: AiModelProvider; model: string }[] => { + const allModels: { provider: AiModelProvider; model: string }[] = [] + for (const [provider, models] of Object.entries(supportedModelsByProvider)) { + for (const model of models) { + allModels.push({ provider: provider as AiModelProvider, model }) + } + } + return allModels +} + +/** + * Get models for a specific provider + */ +export const getModelsForProvider = (provider: AiModelProvider): string[] => { + return supportedModelsByProvider[provider] || [] } diff --git a/api/services/ai/openai.ts b/api/services/ai/openai.ts index f8d4aad..d8c8828 100644 --- a/api/services/ai/openai.ts +++ b/api/services/ai/openai.ts @@ -2,6 +2,7 @@ import { AiPlayerPromptSchema } from 
'@chessarena/types/ai-models' import { createOpenAI } from '@ai-sdk/openai' import { streamObject } from 'ai' import { models } from './models' +import { getMaxReasoningProviderOptions } from './provider-options' import { Handler } from './types' export const openai: Handler = async ({ model, logger, prompt, onThoughtUpdate }) => { @@ -9,12 +10,14 @@ export const openai: Handler = async ({ model, logger, prompt, onThoughtUpdate } apiKey: process.env.OPENAI_API_KEY, }) + const modelId = model ?? models.openai const { partialObjectStream, object } = streamObject({ - model: openai(model ?? models.openai), + model: openai(modelId), prompt, schema: AiPlayerPromptSchema, maxRetries: 0, abortSignal: AbortSignal.timeout(180000), + providerOptions: getMaxReasoningProviderOptions('openai', modelId), }) for await (const partialObject of partialObjectStream) { diff --git a/api/services/ai/provider-options.ts b/api/services/ai/provider-options.ts new file mode 100644 index 0000000..7bd4d65 --- /dev/null +++ b/api/services/ai/provider-options.ts @@ -0,0 +1,87 @@ +import { AiModelProvider } from '@chessarena/types/ai-models' +import type { JSONValue } from 'ai' + +type OpenAIReasoningEffort = 'minimal' | 'low' | 'medium' | 'high' +type GrokReasoningEffort = 'low' | 'high' + +const clampInt = (value: number, min: number, max: number) => Math.max(min, Math.min(max, value)) + +const parseOpenAiReasoningEffort = (value: string | undefined): OpenAIReasoningEffort | undefined => { + if (!value) return undefined + const normalized = value.toLowerCase().trim() + if (normalized === 'minimal') return 'minimal' + if (normalized === 'low') return 'low' + if (normalized === 'medium') return 'medium' + if (normalized === 'high') return 'high' + return undefined +} + +const parseGrokReasoningEffort = (value: string | undefined): GrokReasoningEffort | undefined => { + if (!value) return undefined + const normalized = value.toLowerCase().trim() + if (normalized === 'low') return 'low' + if 
(normalized === 'high') return 'high' + return undefined +} + +export const getMaxReasoningProviderOptions = ( + provider: AiModelProvider, + model: string, +): Record> => { + switch (provider) { + case 'openai': { + if (!model.startsWith('gpt-5')) return {} + return { openai: { reasoningEffort: 'high' } } + } + case 'claude': { + const supportsThinking = model.includes('-4-') || model.includes('-4') + if (!supportsThinking) return {} + return { + anthropic: { + thinking: { type: 'enabled', budgetTokens: 16000 }, + }, + } + } + case 'gemini': { + const supportsThinking = model.includes('gemini-2.5') || model.includes('gemini-3') + if (!supportsThinking) return {} + return { + google: { + // Gemini thinkingBudget is model-dependent; clamp to a safe max across the series. + thinkingConfig: { thinkingBudget: 24576 }, + }, + } + } + case 'grok': { + // xAI only supports reasoningEffort on specific models (e.g. grok-3-mini). + if (model === 'grok-3-mini') return { xai: { reasoningEffort: 'high' } } + return {} + } + default: + return {} + } +} + +/** + * Benchmark-specific provider options. + * Keeps results reproducible and avoids long tail timeouts by using a consistent, + * capped reasoning budget across providers. + */ +export const getBenchmarkProviderOptions = ( + provider: AiModelProvider, + model: string, +): Record> => { + const openaiEffort = parseOpenAiReasoningEffort(process.env.BENCHMARK_OPENAI_REASONING_EFFORT) ?? 
'low' + + switch (provider) { + case 'openai': { + if (!model.startsWith('gpt-5')) return {} + return { openai: { reasoningEffort: openaiEffort } } + } + case 'claude': + case 'gemini': + case 'grok': + default: + return {} + } +} diff --git a/api/services/ai/random-ai-selection.ts b/api/services/ai/random-ai-selection.ts new file mode 100644 index 0000000..8744a36 --- /dev/null +++ b/api/services/ai/random-ai-selection.ts @@ -0,0 +1,98 @@ +import { AiModelProvider } from '@chessarena/types/ai-models' +import { supportedModelsByProvider } from './models' + +type ModelWithWeight = { + provider: AiModelProvider + model: string + weight: number // Higher weight = higher chance of selection + tier: 'cheap' | 'mid' | 'expensive' +} + +// Define model tiers and weights (cheaper = higher weight) +// IMPORTANT: Model names must match exactly with supportedModelsByProvider in models.ts +const MODEL_WEIGHTS: ModelWithWeight[] = [ + // Cheap tier (weight: 10) - highest chance + { provider: 'gemini', model: 'gemini-2.5-flash-lite', weight: 10, tier: 'cheap' }, + { provider: 'gemini', model: 'gemini-2.5-flash', weight: 10, tier: 'cheap' }, + { provider: 'gemini', model: 'gemini-2.0-flash', weight: 10, tier: 'cheap' }, + { provider: 'claude', model: 'claude-3-5-haiku-latest', weight: 10, tier: 'cheap' }, + { provider: 'claude', model: 'claude-haiku-4-5', weight: 10, tier: 'cheap' }, + { provider: 'openai', model: 'gpt-4o-mini', weight: 10, tier: 'cheap' }, + { provider: 'openai', model: 'gpt-4.1-mini', weight: 10, tier: 'cheap' }, + { provider: 'grok', model: 'grok-3-fast', weight: 10, tier: 'cheap' }, + + // Mid tier (weight: 5) + { provider: 'gemini', model: 'gemini-2.5-pro', weight: 5, tier: 'mid' }, + { provider: 'claude', model: 'claude-sonnet-4-0', weight: 5, tier: 'mid' }, + { provider: 'claude', model: 'claude-3-7-sonnet-latest', weight: 5, tier: 'mid' }, + { provider: 'grok', model: 'grok-4-fast-non-reasoning', weight: 5, tier: 'mid' }, + { provider: 'openai', model: 
'gpt-5-mini', weight: 5, tier: 'mid' }, + { provider: 'openai', model: 'gpt-4o', weight: 5, tier: 'mid' }, + { provider: 'openai', model: 'gpt-4.1', weight: 5, tier: 'mid' }, + + // Expensive tier (weight: 2) - lowest chance + { provider: 'gemini', model: 'gemini-3-pro-preview', weight: 2, tier: 'expensive' }, + { provider: 'claude', model: 'claude-sonnet-4-5', weight: 2, tier: 'expensive' }, + { provider: 'claude', model: 'claude-opus-4-0', weight: 2, tier: 'expensive' }, + { provider: 'claude', model: 'claude-opus-4-5', weight: 1, tier: 'expensive' }, + { provider: 'grok', model: 'grok-4-fast-reasoning', weight: 2, tier: 'expensive' }, + { provider: 'openai', model: 'gpt-5', weight: 2, tier: 'expensive' }, + { provider: 'openai', model: 'gpt-5.1', weight: 2, tier: 'expensive' }, + { provider: 'openai', model: 'gpt-5.2', weight: 1, tier: 'expensive' }, +] + +// Filter to only include models that are actually supported +const getAvailableModels = (): ModelWithWeight[] => { + return MODEL_WEIGHTS.filter(({ provider, model }) => { + const supported = supportedModelsByProvider[provider] + return supported?.includes(model) + }) +} + +/** + * Select a random AI model with weighted probability + * Cheaper models have higher chance of being selected + */ +export const selectRandomAI = (): { provider: AiModelProvider; model: string; tier: string } => { + const availableModels = getAvailableModels() + + if (availableModels.length === 0) { + // Fallback to first available model + const provider = Object.keys(supportedModelsByProvider)[0] as AiModelProvider + const model = supportedModelsByProvider[provider][0] + return { provider, model, tier: 'unknown' } + } + + // Calculate total weight + const totalWeight = availableModels.reduce((sum, m) => sum + m.weight, 0) + + // Random selection based on weight + let random = Math.random() * totalWeight + for (const modelInfo of availableModels) { + random -= modelInfo.weight + if (random <= 0) { + return { + provider: 
modelInfo.provider, + model: modelInfo.model, + tier: modelInfo.tier, + } + } + } + + // Fallback (shouldn't happen) + const fallback = availableModels[0] + return { provider: fallback.provider, model: fallback.model, tier: fallback.tier } +} + +/** + * Get all available models with their weights for display + */ +export const getAvailableModelsWithWeights = () => { + return getAvailableModels().map(({ provider, model, weight, tier }) => ({ + provider, + model, + weight, + tier, + probability: (weight / getAvailableModels().reduce((sum, m) => sum + m.weight, 0)) * 100, + })) +} diff --git a/api/services/benchmark/benchmark-config.ts b/api/services/benchmark/benchmark-config.ts new file mode 100644 index 0000000..dbbcc72 --- /dev/null +++ b/api/services/benchmark/benchmark-config.ts @@ -0,0 +1,25 @@ +import { parsePositiveInt } from './concurrency' + +export type BenchmarkConfig = { + perItemTimeoutMs: number + maxOutputTokens: number + transientRetries: number + retryBaseBackoffMs: number + itemConcurrency: number +} + +export const getBenchmarkConfig = (): BenchmarkConfig => { + const perItemTimeoutMs = Number.parseInt(process.env.BENCHMARK_PER_ITEM_TIMEOUT_MS ?? '', 10) || 10_000 + const maxOutputTokens = Number.parseInt(process.env.BENCHMARK_MAX_OUTPUT_TOKENS ?? '', 10) || 192 + const transientRetries = Number.parseInt(process.env.BENCHMARK_TRANSIENT_RETRIES ?? '', 10) || 1 + const retryBaseBackoffMs = Number.parseInt(process.env.BENCHMARK_RETRY_BASE_BACKOFF_MS ?? 
'', 10) || 200 + const itemConcurrency = parsePositiveInt(process.env.BENCHMARK_ITEM_CONCURRENCY, 1) + + return { + perItemTimeoutMs, + maxOutputTokens, + transientRetries, + retryBaseBackoffMs, + itemConcurrency, + } +} diff --git a/api/services/benchmark/benchmark-prompt.ts b/api/services/benchmark/benchmark-prompt.ts new file mode 100644 index 0000000..89f31f2 --- /dev/null +++ b/api/services/benchmark/benchmark-prompt.ts @@ -0,0 +1,146 @@ +import { z } from 'zod' +import { generateText } from 'ai' +import { Logger } from 'motia' +import { AiModelProvider } from '@chessarena/types/ai-models' +import { getBenchmarkProviderOptions } from '../ai/provider-options' +import { getBenchmarkConfig } from './benchmark-config' +import { withRetries, withRetriesNoTimeout } from './retry' +import { createProviderModel, shouldDisableTimeout, getApiKeyEnvVar } from './shared-utils' + +const LegalMovesResponseSchema = z.object({ + moves: z.array(z.string()).describe('Array of legal moves in Standard Algebraic Notation'), +}) + +type BenchmarkPromptInput = { + prompt: string + provider: AiModelProvider + model: string + logger: Logger +} + +type BenchmarkPromptResult = { + moves: string[] + rawResponse: string + error?: string +} + +type ProviderOptions = Record + +export const makeBenchmarkPrompt = async (input: BenchmarkPromptInput): Promise => { + const { prompt, provider, model, logger } = input + + const cfg = getBenchmarkConfig() + const startTime = Date.now() + const label = `${provider}/${model}` + + // Check API key + const apiKeyEnvVar = getApiKeyEnvVar(provider) + const apiKey = process.env[apiKeyEnvVar] + if (!apiKey) { + logger.error(`[${label}] Missing ${apiKeyEnvVar}`) + return { moves: [], rawResponse: `Missing ${apiKeyEnvVar}`, error: `Missing ${apiKeyEnvVar}` } + } + + try { + const providerModel = createProviderModel(provider, model) + const disableTimeout = shouldDisableTimeout(provider, model) + const providerOptionsBase = 
getBenchmarkProviderOptions(provider, model) + + // Add JSON response format for Gemini + const providerOptions: ProviderOptions = + provider === 'gemini' + ? { + ...providerOptionsBase, + google: { + ...(providerOptionsBase?.google as Record | undefined), + responseMimeType: 'application/json', + }, + } + : providerOptionsBase + + const runGenerateText = async (opts: { providerOptions: ProviderOptions }) => { + if (disableTimeout) { + return await withRetriesNoTimeout(label, cfg.transientRetries, cfg.retryBaseBackoffMs, async () => { + return await generateText({ + model: providerModel, + prompt, + maxRetries: 0, + maxOutputTokens: cfg.maxOutputTokens, + providerOptions: opts.providerOptions, + }) + }) + } + + const deadlineMs = startTime + cfg.perItemTimeoutMs + return await withRetries(label, deadlineMs, cfg.transientRetries, cfg.retryBaseBackoffMs, async (abortSignal) => { + return await generateText({ + model: providerModel, + prompt, + maxRetries: 0, + abortSignal, + maxOutputTokens: cfg.maxOutputTokens, + providerOptions: opts.providerOptions, + }) + }) + } + + let text: string + try { + ;({ text } = await runGenerateText({ providerOptions })) + } catch (e) { + // Retry Grok with empty provider options if initial request fails + // This is a workaround for Grok API compatibility issues + if (provider === 'grok') { + logger.warn(`[${label}] Initial request failed, retrying with empty provider options`) + ;({ text } = await runGenerateText({ providerOptions: {} })) + } else { + throw e + } + } + + let parsed: unknown + try { + parsed = JSON.parse(text) + } catch { + // Try to extract JSON from the response + const start = text.indexOf('{') + const end = text.lastIndexOf('}') + if (start !== -1 && end !== -1 && end > start) { + try { + parsed = JSON.parse(text.slice(start, end + 1)) + } catch { + logger.warn(`[${label}] Could not parse extracted JSON from response`) + return { moves: [], rawResponse: text, error: 'Could not parse JSON response' } + } + } else { 
+ logger.warn(`[${label}] No JSON found in response`) + return { moves: [], rawResponse: text, error: 'Could not parse JSON response' } + } + } + + const validated = LegalMovesResponseSchema.safeParse(parsed) + if (!validated.success) { + logger.warn(`[${label}] Response did not match schema: ${validated.error.message}`) + return { moves: [], rawResponse: text, error: 'Response did not match schema' } + } + + return { moves: validated.data.moves, rawResponse: text } + } catch (error) { + const elapsed = Date.now() - startTime + const errorWithStatus = error as { statusCode?: number } + const statusCode = typeof errorWithStatus?.statusCode === 'number' ? errorWithStatus.statusCode : undefined + const errorMsgBase = error instanceof Error ? error.message : 'Unknown error' + const errorMsg = statusCode != null ? `${errorMsgBase} (status ${statusCode})` : errorMsgBase + const errorName = error instanceof Error ? error.name : 'Error' + + logger.error(`[${label}] FAILED after ${elapsed}ms`) + logger.error(`[${label}] Error type: ${errorName}`) + logger.error(`[${label}] Error message: ${errorMsg}`) + + return { + moves: [], + rawResponse: errorMsg, + error: errorMsg, + } + } +} diff --git a/api/services/benchmark/concurrency.ts b/api/services/benchmark/concurrency.ts new file mode 100644 index 0000000..0bf2436 --- /dev/null +++ b/api/services/benchmark/concurrency.ts @@ -0,0 +1,36 @@ +export const parsePositiveInt = (value: string | undefined, fallback: number) => { + const n = Number.parseInt(value ?? '', 10) + return Number.isFinite(n) && n > 0 ? n : fallback +} + +/** + * Maps items with limited concurrency. + * Note: This is safe in JavaScript's single-threaded event loop because + * synchronous operations (like incrementing nextIndex) are atomic between await points. 
+ */ +export const mapWithConcurrency = async ( + items: T[], + concurrency: number, + mapper: (item: T, index: number) => Promise, + onComplete?: (index: number, result: R) => void, +): Promise => { + const results = new Array(items.length) + const limit = Math.max(1, Math.min(concurrency, items.length)) + + // Use a queue-based approach that's more explicit about concurrency control + const queue = items.map((_, i) => i) + + const worker = async () => { + while (queue.length > 0) { + const index = queue.shift() + if (index === undefined) return + + const result = await mapper(items[index], index) + results[index] = result + onComplete?.(index, result) + } + } + + await Promise.all(Array.from({ length: limit }, worker)) + return results +} diff --git a/api/services/benchmark/fetch-lichess-puzzles.ts b/api/services/benchmark/fetch-lichess-puzzles.ts new file mode 100644 index 0000000..5c74ace --- /dev/null +++ b/api/services/benchmark/fetch-lichess-puzzles.ts @@ -0,0 +1,217 @@ +import { Chess } from 'chess.js' +import { Logger } from 'motia' +import { LichessPuzzle, PuzzleTheme } from '@chessarena/types/puzzle-benchmark' + +const LICHESS_BASE_URL = 'https://lichess.org' +const MAX_BATCH_SIZE = 50 +const REQUEST_TIMEOUT_MS = 30000 + +type LichessBatchPuzzle = { + game: { + id: string + pgn: string + } + puzzle: { + id: string + rating: number + themes: string[] + solution: string[] + initialPly: number + } +} + +type LichessBatchResponse = { + puzzles: LichessBatchPuzzle[] +} + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)) + +const parseRetryAfterMs = (value: string | null): number | null => { + if (!value) return null + const seconds = Number.parseInt(value, 10) + if (Number.isFinite(seconds)) return seconds * 1000 + const dateMs = Date.parse(value) + if (Number.isFinite(dateMs)) return Math.max(0, dateMs - Date.now()) + return null +} + +const fetchJsonWithRetry = async (url: string, logger: Logger, label: string, maxRetries = 6): Promise => 
{ + const token = process.env.LICHESS_TOKEN + const headers: Record = { + Accept: 'application/json', + } + if (token) headers.Authorization = `Bearer ${token}` + + let attempt = 0 + let backoffMs = 1000 + + while (true) { + attempt++ + const response = await fetch(url, { + headers, + signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS), + }) + + if (response.ok) { + return (await response.json()) as T + } + + const retryAfterMs = parseRetryAfterMs(response.headers.get('retry-after')) + const shouldRetry = + attempt <= maxRetries && + (response.status === 429 || response.status === 408 || (response.status >= 500 && response.status <= 599)) + + if (!shouldRetry) { + logger.error('Lichess API request failed', { label, url, status: response.status }) + throw new Error(`Lichess API error (${response.status})`) + } + + const waitMs = retryAfterMs ?? backoffMs + logger.warn('Lichess API rate limited / transient error, retrying', { + label, + url, + status: response.status, + attempt, + waitMs, + }) + await sleep(waitMs + Math.floor(Math.random() * 250)) + backoffMs = Math.min(backoffMs * 2, 30000) + } +} + +/** + * Convert UCI move to SAN (e.g., "e2e4" -> "e4") + */ +const uciToSan = (chess: Chess, uci: string): string | null => { + try { + const from = uci.slice(0, 2) + const to = uci.slice(2, 4) + const promotion = uci.length > 4 ? 
uci[4] : undefined + + const move = chess.move({ from, to, promotion }) + if (move) { + chess.undo() // Undo so we don't modify the position + return move.san + } + return null + } catch { + return null + } +} + +/** + * Parse a single puzzle from Lichess batch response + */ +const parsePuzzle = (data: LichessBatchPuzzle, logger: Logger): LichessPuzzle | null => { + try { + const chess = new Chess() + const moves = data.game.pgn.split(' ').filter((m) => !m.includes('.') && m.length > 0) + + // Play moves up to initialPly + for (let i = 0; i < data.puzzle.initialPly && i < moves.length; i++) { + try { + chess.move(moves[i]) + } catch { + // Some moves might be invalid, skip + } + } + + // Play one more move (the setup move) - this is the opponent's last move before the puzzle + if (moves.length > data.puzzle.initialPly) { + try { + chess.move(moves[data.puzzle.initialPly]) + } catch { + logger.warn('Could not play setup move', { puzzleId: data.puzzle.id }) + return null + } + } + + const fen = chess.fen() + const legalMoves = chess.moves().sort() + + // Convert first solution move to SAN + const solutionSan = uciToSan(chess, data.puzzle.solution[0]) + if (!solutionSan) { + logger.warn('Could not convert solution to SAN', { puzzleId: data.puzzle.id }) + return null + } + + return { + id: data.puzzle.id, + rating: data.puzzle.rating, + themes: data.puzzle.themes, + solution: data.puzzle.solution, + initialPly: data.puzzle.initialPly, + pgn: data.game.pgn, + fen, + legalMoves, + solutionSan, + } + } catch (error) { + logger.error('Failed to parse puzzle', { error, puzzleId: data.puzzle.id }) + return null + } +} + +/** + * Fetch puzzles from Lichess using batch API. + * Uses /api/puzzle/batch/mix and filters by theme to avoid enumerating. 
+ */ +export const fetchPuzzles = async (theme: PuzzleTheme, count: number, logger: Logger): Promise => { + const nb = Math.min(MAX_BATCH_SIZE, Math.max(1, Math.max(15, count))) + const target = Math.max(1, count) + + const seenIds = new Set() + const results: LichessPuzzle[] = [] + + const maxRequests = Math.max(3, Math.ceil((target / Math.max(1, nb)) * 6)) + logger.info('Fetching puzzles from Lichess', { + theme, + count: target, + nb, + maxRequests, + authenticated: Boolean(process.env.LICHESS_TOKEN), + }) + + for (let req = 1; req <= maxRequests && results.length < target; req++) { + // Prefer the themed endpoint (faster to hit the theme we want). + // Fallback to mix if it fails (e.g. unknown angle). + const themedUrl = `${LICHESS_BASE_URL}/api/puzzle/batch/${theme}?nb=${nb}` + const mixUrl = `${LICHESS_BASE_URL}/api/puzzle/batch/mix?nb=${nb}` + + let data: LichessBatchResponse + try { + data = await fetchJsonWithRetry(themedUrl, logger, 'puzzle-batch-themed') + } catch (error) { + logger.warn('Themed puzzle batch failed, falling back to mix', { theme, error }) + try { + data = await fetchJsonWithRetry(mixUrl, logger, 'puzzle-batch-mix') + } catch (error2) { + logger.error('Failed to fetch puzzles batch', { error: error2 }) + break + } + } + + const candidates = data.puzzles + + for (const item of candidates) { + if (seenIds.has(item.puzzle.id)) continue + seenIds.add(item.puzzle.id) + + // When we fall back to mix, keep filtering by theme. + if (data !== undefined && item.puzzle.themes && item.puzzle.themes.length > 0) { + // If the response came from the themed endpoint, it should already match. Filtering is harmless. 
+ if (!item.puzzle.themes.includes(theme)) continue + } + + const parsed = parsePuzzle(item, logger) + if (!parsed) continue + results.push(parsed) + if (results.length >= target) break + } + + logger.info('Fetched puzzles batch', { theme, req, got: results.length, target }) + } + + return results.slice(0, target) +} diff --git a/api/services/benchmark/retry.ts b/api/services/benchmark/retry.ts new file mode 100644 index 0000000..cf3bc7b --- /dev/null +++ b/api/services/benchmark/retry.ts @@ -0,0 +1,95 @@ +const sleep = async (ms: number) => { + await new Promise((r) => setTimeout(r, ms)) +} + +const parseRetryAfterMs = (value: unknown): number | undefined => { + if (typeof value !== 'string') return undefined + const trimmed = value.trim() + const seconds = Number.parseInt(trimmed, 10) + if (Number.isFinite(seconds) && seconds >= 0) return seconds * 1000 + const dateMs = Date.parse(trimmed) + if (!Number.isNaN(dateMs)) { + const delta = dateMs - Date.now() + return delta > 0 ? delta : 0 + } + return undefined +} + +const getStatusCode = (e: unknown): number | undefined => { + const anyErr = e as any + if (typeof anyErr?.statusCode === 'number') return anyErr.statusCode + if (typeof anyErr?.cause?.statusCode === 'number') return anyErr.cause.statusCode + return undefined +} + +const getResponseHeaders = (e: unknown): Record | undefined => { + const anyErr = e as any + const headers = anyErr?.responseHeaders ?? anyErr?.cause?.responseHeaders + if (!headers || typeof headers !== 'object') return undefined + return headers as Record +} + +export const isTransientError = (e: unknown): boolean => { + const status = getStatusCode(e) + if (status && [408, 425, 429, 500, 502, 503, 504, 529].includes(status)) return true + const msg = e instanceof Error ? 
e.message : '' + if (msg.includes('Headers Timeout')) return true + if (msg.includes('Cannot connect to API')) return true + return false +} + +export const withRetries = async ( + label: string, + deadlineMs: number, + transientRetries: number, + retryBaseBackoffMs: number, + fn: (abortSignal: AbortSignal) => Promise, +): Promise => { + let attempt = 0 + while (true) { + const remaining = deadlineMs - Date.now() + if (remaining <= 0) throw new Error('Timed out before request could start') + + try { + return await fn(AbortSignal.timeout(remaining)) + } catch (e) { + attempt++ + if (attempt > transientRetries || !isTransientError(e)) throw e + + const headers = getResponseHeaders(e) + const retryAfterMs = parseRetryAfterMs(headers?.['retry-after'] ?? headers?.['Retry-After']) + const backoff = Math.min(30_000, retryBaseBackoffMs * 2 ** (attempt - 1)) + const jitter = Math.floor(Math.random() * 250) + const waitMs = (retryAfterMs ?? backoff) + jitter + + const remainingAfterWait = deadlineMs - Date.now() + if (remainingAfterWait <= 0) throw e + await sleep(Math.min(waitMs, Math.max(0, remainingAfterWait - 100))) + } + } +} + +export const withRetriesNoTimeout = async ( + label: string, + transientRetries: number, + retryBaseBackoffMs: number, + fn: () => Promise, +): Promise => { + let attempt = 0 + while (true) { + try { + return await fn() + } catch (e) { + attempt++ + if (attempt > transientRetries || !isTransientError(e)) throw e + + const headers = getResponseHeaders(e) + const retryAfterMs = parseRetryAfterMs(headers?.['retry-after'] ?? headers?.['Retry-After']) + const backoff = Math.min(30_000, retryBaseBackoffMs * 2 ** (attempt - 1)) + const jitter = Math.floor(Math.random() * 250) + const waitMs = (retryAfterMs ?? 
backoff) + jitter + + await sleep(waitMs) + } + } +} diff --git a/api/services/benchmark/run-legal-move-benchmark.ts b/api/services/benchmark/run-legal-move-benchmark.ts new file mode 100644 index 0000000..a6320b4 --- /dev/null +++ b/api/services/benchmark/run-legal-move-benchmark.ts @@ -0,0 +1,257 @@ +import fs from 'fs' +import path from 'path' +import mustache from 'mustache' +import { Chess } from 'chess.js' +import { Logger } from 'motia' +import { AiModelProvider } from '@chessarena/types/ai-models' +import { TestPosition, ModelBenchmarkResult, LegalMoveBenchmarkRun } from '@chessarena/types/legal-move-benchmark' +import { makeBenchmarkPrompt } from './benchmark-prompt' +import { getBenchmarkConfig } from './benchmark-config' +import { mapWithConcurrency, parsePositiveInt } from './concurrency' + +const promptTemplate = fs.readFileSync(path.join(__dirname, '../../steps/chess/legal-move-benchmark.mustache'), 'utf8') + +type GeneratePositionsOptions = { + count: number + minLegalMoves: number + maxLegalMoves: number + minMoveNumber: number + maxMoveNumber: number +} + +const DEFAULT_OPTIONS: GeneratePositionsOptions = { + count: 20, + minLegalMoves: 5, + maxLegalMoves: 25, + minMoveNumber: 8, + maxMoveNumber: 60, +} + +/** + * Generate a single random position by playing random moves + */ +const generateRandomPosition = (options: GeneratePositionsOptions): TestPosition | null => { + const chess = new Chess() + + const targetMoves = + Math.floor(Math.random() * (options.maxMoveNumber - options.minMoveNumber + 1)) + options.minMoveNumber + + for (let i = 0; i < targetMoves; i++) { + const moves = chess.moves() + if (moves.length === 0) return null + const randomMove = moves[Math.floor(Math.random() * moves.length)] + chess.move(randomMove) + } + + if (chess.isGameOver()) return null + + const legalMoves = chess.moves() + if (legalMoves.length < options.minLegalMoves) return null + if (legalMoves.length > options.maxLegalMoves) return null + + return { + id: 
crypto.randomUUID(), + fen: chess.fen(), + pgn: chess.pgn(), + turn: chess.turn() === 'w' ? 'white' : 'black', + legalMoves: legalMoves.sort(), + legalMoveCount: legalMoves.length, + moveNumber: Math.floor(chess.history().length / 2) + 1, + } +} + +/** + * Generate multiple unique test positions + */ +export const generateTestPositions = (options: Partial = {}): TestPosition[] => { + const opts = { ...DEFAULT_OPTIONS, ...options } + const positions: TestPosition[] = [] + const seenFens = new Set() + + let attempts = 0 + const maxAttempts = opts.count * 100 + + while (positions.length < opts.count && attempts < maxAttempts) { + attempts++ + const position = generateRandomPosition(opts) + + if (position && !seenFens.has(position.fen)) { + seenFens.add(position.fen) + positions.push(position) + } + } + + return positions +} + +/** + * Calculate benchmark score for a single position + * Uses F1-style scoring: harmonic mean of recall and precision + * - Recall: what % of legal moves did you find + * - Precision: what % of your answers were correct + */ +const calculateScore = ( + legalMoves: string[], + modelMoves: string[], +): { + correct: string[] + illegal: string[] + missed: string[] + accuracy: number + penalty: number + finalScore: number +} => { + const legalSet = new Set(legalMoves) + const modelSet = new Set(modelMoves) + + const correct = modelMoves.filter((m) => legalSet.has(m)) + const illegal = modelMoves.filter((m) => !legalSet.has(m)) + const missed = legalMoves.filter((m) => !modelSet.has(m)) + + // Recall: how many legal moves did you find + const recall = legalMoves.length > 0 ? (correct.length / legalMoves.length) * 100 : 0 + + // Precision: how many of your answers were correct + const precision = modelMoves.length > 0 ? (correct.length / modelMoves.length) * 100 : 0 + + // F1 score: harmonic mean of precision and recall + const finalScore = precision + recall > 0 ? 
(2 * precision * recall) / (precision + recall) : 0 + + // Keep accuracy as recall for backwards compatibility, penalty as inverse of precision + const accuracy = recall + const penalty = 100 - precision + + return { correct, illegal, missed, accuracy, penalty, finalScore } +} + +/** + * Run benchmark for a single position + */ +const benchmarkPosition = async ( + position: TestPosition, + provider: AiModelProvider, + model: string, + logger: Logger, +): Promise => { + const prompt = mustache.render( + promptTemplate, + { + pgn: position.pgn, + fen: position.fen, + turn: position.turn.toUpperCase(), + }, + {}, + { escape: (v: string) => v }, + ) + + const startTime = Date.now() + let rawResponse = '' + let modelMoves: string[] = [] + let error: string | undefined + + try { + const response = await makeBenchmarkPrompt({ + prompt, + provider, + model, + logger, + }) + + rawResponse = response.rawResponse + modelMoves = response.moves + if (response.error) { + error = response.error + } + } catch (e) { + error = e instanceof Error ? 
e.message : 'Unknown error' + logger.error('Benchmark position failed', { error, positionId: position.id }) + } + + const responseTime = Date.now() - startTime + const { correct, illegal, missed, accuracy, penalty, finalScore } = calculateScore(position.legalMoves, modelMoves) + + return { + positionId: position.id, + modelMoves, + correctMoves: correct, + illegalMoves: illegal, + missedMoves: missed, + accuracy, + penalty, + finalScore, + responseTime, + rawResponse, + error, + } +} + +/** + * Run full benchmark for a model using provided positions + */ +export const runLegalMoveBenchmark = async ( + positions: TestPosition[], + provider: AiModelProvider, + model: string, + logger: Logger, + onProgress?: (completed: number, total: number) => void, +): Promise => { + const runId = crypto.randomUUID() + + logger.info('Starting legal move benchmark', { + runId, + provider, + model, + positionCount: positions.length, + }) + + const run: LegalMoveBenchmarkRun = { + id: runId, + createdAt: Date.now(), + status: 'running', + provider, + model, + positionCount: positions.length, + positions, + results: [], + } + + const cfg = getBenchmarkConfig() + const positionConcurrency = cfg.itemConcurrency + let completed = 0 + + run.results = await mapWithConcurrency( + positions, + positionConcurrency, + async (position) => benchmarkPosition(position, provider, model, logger), + () => { + completed++ + onProgress?.(completed, positions.length) + if (completed === positions.length || completed % 5 === 0) { + logger.info('Legal move benchmark progress', { runId, provider, model, completed, total: positions.length }) + } + }, + ) + + // Calculate aggregate scores + const completedResults = run.results.filter((r) => !r.error) + if (completedResults.length > 0) { + run.averageAccuracy = completedResults.reduce((sum, r) => sum + r.accuracy, 0) / completedResults.length + run.averagePenalty = completedResults.reduce((sum, r) => sum + r.penalty, 0) / completedResults.length + 
/**
 * Extract the model's chosen move from a free-form LLM response.
 *
 * Strategies are tried in order and the first one that yields a move wins:
 *  1. Parse the response (or the contents of its first ``` fence) as JSON and
 *     validate it against `PuzzleMoveResponseSchema`.
 *  2. Parse the substring between the first `{` and the last `}` the same way.
 *  3. Regex for a quoted `"move": "..."` pair.
 *  4. Regex for a loose `move: ...` / `move = ...` assignment.
 *  5. The earliest legal move that appears in the text as a standalone token.
 *
 * @param text raw model output
 * @param legalMoves SAN moves that are legal in the puzzle position
 * @param logger optional logger; JSON parse failures are reported at debug level
 * @returns the trimmed move, or `null` when nothing could be extracted
 */
const extractMoveFromText = (text: string, legalMoves: string[], logger?: Logger): { move: string } | null => {
  const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i)
  const candidate = fenceMatch?.[1] ?? text

  // Shared by strategies 1 and 2: parse + schema-validate, logging (not throwing) on bad JSON.
  const parseMoveJson = (json: string, failureMessage: string): { move: string } | null => {
    try {
      const validated = PuzzleMoveResponseSchema.safeParse(JSON.parse(json))
      return validated.success ? { move: validated.data.move.trim() } : null
    } catch (e) {
      logger?.debug(failureMessage, { error: e instanceof Error ? e.message : 'unknown' })
      return null
    }
  }

  // 1. Whole candidate as JSON
  const direct = parseMoveJson(candidate, 'JSON parsing failed, trying alternative extraction')
  if (direct) return direct

  // 2. JSON object embedded in surrounding prose
  const start = candidate.indexOf('{')
  const end = candidate.lastIndexOf('}')
  if (start !== -1 && end > start) {
    const embedded = parseMoveJson(candidate.slice(start, end + 1), 'Substring JSON parsing failed')
    if (embedded) return embedded
  }

  // 3 / 4. Regex fallbacks
  const quoted = candidate.match(/"move"\s*:\s*"([^"]+)"/i)
  if (quoted?.[1]) return { move: quoted[1].trim() }

  const loose = candidate.match(/\bmove\b\s*[:=]\s*("?)([^"\n\r]+)\1/i)
  if (loose?.[2]) return { move: loose[2].trim() }

  // 5. Earliest legal move mentioned in the text.
  // A hit only counts when it is not glued to other letters/digits; otherwise
  // "e4" would be credited for a response that actually says "Nxe4".
  const alphanumeric = /[A-Za-z0-9]/
  const isGlued = (ch: string | undefined): boolean => ch !== undefined && alphanumeric.test(ch)

  let best: { move: string; idx: number } | undefined
  for (const m of legalMoves) {
    if (!m) continue // an empty string matches everywhere and carries no information

    let idx = candidate.indexOf(m)
    while (idx !== -1 && (isGlued(candidate[idx - 1]) || isGlued(candidate[idx + m.length]))) {
      idx = candidate.indexOf(m, idx + 1)
    }
    if (idx === -1) continue

    // On a tie prefer the longer move so "O-O-O" is not reported as "O-O".
    const isBetter = !best || idx < best.idx || (idx === best.idx && m.length > best.move.length)
    if (isBetter) best = { move: m, idx }
  }
  if (best) return { move: best.move.trim() }

  return null
}
/**
 * Run the benchmark for a single puzzle.
 *
 * Renders the puzzle prompt, asks the model for one move via `generateText`
 * (with or without a timeout depending on `shouldDisableTimeout`), extracts the
 * move from the response text and compares it to `puzzle.solutionSan`.
 *
 * Every failure — including a FEN that `chess.js` rejects — is captured in the
 * returned `error` field, so one bad puzzle never rejects the whole run.
 *
 * @param puzzle puzzle to solve (FEN, PGN, legal moves and expected SAN solution)
 * @param theme puzzle theme, rendered into the prompt via `getThemeDescription`
 * @param provider AI provider identifier
 * @param model provider-specific model name
 * @param logger logger used for failure reporting
 * @returns the per-puzzle result, with `isCorrect` true only on an exact SAN match
 */
const benchmarkPuzzle = async (
  puzzle: LichessPuzzle,
  theme: PuzzleTheme,
  provider: AiModelProvider,
  model: string,
  logger: Logger,
): Promise<PuzzleResult> => {
  const startTime = Date.now()
  let rawResponse = ''
  let modelMove: string | undefined
  let error: string | undefined

  try {
    // Kept inside the try: the Chess constructor throws on an invalid FEN.
    const turn = new Chess(puzzle.fen).turn() === 'w' ? 'WHITE' : 'BLACK'

    const prompt = mustache.render(
      promptTemplate,
      {
        pgn: puzzle.pgn,
        fen: puzzle.fen,
        turn,
        legalMoves: puzzle.legalMoves,
        theme: getThemeDescription(theme),
      },
      {},
      // Disable HTML escaping: SAN contains characters such as "+" and "=".
      { escape: (v: string) => v },
    )

    const cfg = getBenchmarkConfig()
    const providerModel = createProviderModel(provider, model)
    const disableTimeout = shouldDisableTimeout(provider, model)
    const providerOptionsBase = getBenchmarkProviderOptions(provider, model)
    const providerOptions = buildProviderOptions(provider, model, providerOptionsBase)
    const maxOutputTokens = getPuzzleMaxOutputTokens(provider, model, cfg.maxOutputTokens)

    const label = `${provider}/${model}`
    // maxRetries: 0 — retrying is owned by withRetries / withRetriesNoTimeout.
    const result = disableTimeout
      ? await withRetriesNoTimeout(label, cfg.transientRetries, cfg.retryBaseBackoffMs, async () => {
          return await generateText({
            model: providerModel,
            prompt,
            maxRetries: 0,
            maxOutputTokens,
            providerOptions,
          })
        })
      : await withRetries(
          label,
          // NOTE(review): presumably an absolute deadline in epoch ms — confirm against ./retry.
          startTime + cfg.perItemTimeoutMs,
          cfg.transientRetries,
          cfg.retryBaseBackoffMs,
          async (abortSignal) => {
            return await generateText({
              model: providerModel,
              prompt,
              maxRetries: 0,
              abortSignal,
              maxOutputTokens,
              providerOptions,
            })
          },
        )

    const text = result.text ?? ''
    // When `text` is blank, fall back to the serialized `response` property if present.
    const resultWithResponse = result as { response?: unknown }
    const responseFallback =
      !text.trim() && resultWithResponse?.response ? JSON.stringify(resultWithResponse.response) : ''
    const candidate = text.trim() ? text : responseFallback

    // Cap what is stored per puzzle.
    rawResponse = candidate.slice(0, 20_000)

    if (!candidate.trim()) {
      error = 'Empty response'
    } else {
      const extracted = extractMoveFromText(candidate, puzzle.legalMoves, logger)
      if (extracted) {
        modelMove = extracted.move
      } else {
        error = 'Could not parse JSON response'
      }
    }
  } catch (e) {
    error = e instanceof Error ? e.message : 'Unknown error'
    logger.error('Puzzle benchmark failed', { error, puzzleId: puzzle.id })
  }

  const responseTime = Date.now() - startTime
  const isCorrect = modelMove === puzzle.solutionSan

  return {
    puzzleId: puzzle.id,
    modelMove,
    correctMove: puzzle.solutionSan,
    isCorrect,
    responseTime,
    rawResponse,
    error,
  }
}
/**
 * Decide whether requests for this provider/model should run without a timeout.
 *
 * Only Grok models whose name starts with `grok-3` or `grok-4` qualify, and only
 * while `BENCHMARK_GROK_DISABLE_TIMEOUT` is unset or exactly `'true'`.
 *
 * @param provider AI provider identifier
 * @param model provider-specific model name
 * @returns `true` when the timeout should be skipped
 */
export const shouldDisableTimeout = (provider: AiModelProvider, model: string): boolean => {
  const flag = process.env.BENCHMARK_GROK_DISABLE_TIMEOUT ?? 'true'
  const exemptPrefixes = ['grok-3', 'grok-4']

  return provider === 'grok' && flag === 'true' && exemptPrefixes.some((prefix) => model.startsWith(prefix))
}
/**
 * Calculate the F1 score for one legal-move benchmark answer.
 *
 * - Recall: percentage of the legal moves that the model listed.
 * - Precision: percentage of the model's distinct answers that are legal.
 * - F1 (`finalScore`): harmonic mean of precision and recall.
 *
 * Both inputs are de-duplicated first. Without that, a model repeating one
 * legal move several times would be credited once per repetition and recall
 * (and therefore F1) could exceed 100.
 *
 * @param legalMoves the legal moves in the position
 * @param modelMoves the moves the model claimed are legal
 * @returns `correct` / `illegal` / `missed` move lists (distinct values, input order),
 *   `accuracy` (= recall, 0–100), `penalty` (= 100 − precision) and `finalScore` (F1, 0–100)
 */
export const calculateLegalMoveScore = (
  legalMoves: string[],
  modelMoves: string[],
): {
  correct: string[]
  illegal: string[]
  missed: string[]
  accuracy: number
  penalty: number
  finalScore: number
} => {
  const legalSet = new Set(legalMoves)
  const modelSet = new Set(modelMoves)
  // Sets iterate in insertion order, so first-occurrence order is preserved.
  const distinctLegal = [...legalSet]
  const distinctModel = [...modelSet]

  const correct = distinctModel.filter((m) => legalSet.has(m))
  const illegal = distinctModel.filter((m) => !legalSet.has(m))
  const missed = distinctLegal.filter((m) => !modelSet.has(m))

  // Recall: how many legal moves did the model find
  const recall = distinctLegal.length > 0 ? (correct.length / distinctLegal.length) * 100 : 0

  // Precision: how many of the model's answers were correct
  const precision = distinctModel.length > 0 ? (correct.length / distinctModel.length) * 100 : 0

  // F1 score: harmonic mean of precision and recall (0 when both are 0)
  const finalScore = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0

  // Accuracy is kept as recall for backwards compatibility; penalty is the inverse of precision.
  const accuracy = recall
  const penalty = 100 - precision

  return { correct, illegal, missed, accuracy, penalty, finalScore }
}
this.waitFor('uciok', 5000) + .then(() => { + this.send('isready') + this.waitFor('readyok', 5000).then(() => { + this.ready = true + resolve() + }) + }) + .catch(reject) + }) + } + + private send(command: string): void { + this.process?.stdin?.write(command + '\n') + } + + private async waitFor(text: string, timeout: number): Promise { + const start = Date.now() + while (Date.now() - start < timeout) { + if (this.outputBuffer.includes(text)) { + const result = this.outputBuffer + this.outputBuffer = '' + return result + } + await new Promise((r) => setTimeout(r, 50)) + } + throw new Error(`Timeout waiting for: ${text}`) + } + + async setLevel(level: number): Promise { + // Level 1-20, affects skill level + const skillLevel = Math.max(0, Math.min(20, level)) + this.send(`setoption name Skill Level value ${skillLevel}`) + this.send('isready') + await this.waitFor('readyok', 2000) + } + + async getBestMove(fen: string, thinkTime: number = 1000): Promise { + this.outputBuffer = '' + this.send(`position fen ${fen}`) + this.send(`go movetime ${thinkTime}`) + + const output = await this.waitFor('bestmove', 30000) + const match = output.match(/bestmove\s+(\S+)/) + if (!match) throw new Error('Could not parse best move') + return match[1] + } + + async evaluate(fen: string): Promise<{ score: number; bestMove: string }> { + this.outputBuffer = '' + this.send(`position fen ${fen}`) + this.send('go depth 15') + + const output = await this.waitFor('bestmove', 30000) + + // Parse score from info lines + const scoreMatch = output.match(/score cp (-?\d+)/) + const mateMatch = output.match(/score mate (-?\d+)/) + const bestMoveMatch = output.match(/bestmove\s+(\S+)/) + + let score = 0 + if (mateMatch) { + const mateIn = parseInt(mateMatch[1]) + score = mateIn > 0 ? 10000 - mateIn * 100 : -10000 - mateIn * 100 + } else if (scoreMatch) { + score = parseInt(scoreMatch[1]) + } + + return { + score, + bestMove: bestMoveMatch ? 
/**
 * Play a single game between an LLM and Stockfish and collect move-quality stats.
 *
 * The loop alternates half-moves until the game is over or `MAX_MOVES`
 * half-moves have been played. On the AI's turn the model is prompted for a SAN
 * move; a missing or illegal move is retried, and the game ends with
 * `ai_illegal_move` after `MAX_ILLEGAL_ATTEMPTS` consecutive failures. Each
 * accepted AI move is scored by evaluating the position before and after it.
 *
 * Failures of the engine itself propagate to the outer `catch` and mark the
 * game as `failed`; they are never counted as illegal AI moves.
 *
 * @param provider AI provider identifier
 * @param model provider-specific model name
 * @param aiColor side played by the LLM
 * @param stockfishLevel value forwarded to `StockfishEngine.setLevel`
 * @param logger logger for progress and failures
 * @returns the game record; `status` is `failed` when `STOCKFISH_BIN_PATH` is
 *   unset or the engine/game loop threw, otherwise `completed`
 */
export const playGameAgainstStockfish = async (
  provider: AiModelProvider,
  model: string,
  aiColor: 'white' | 'black',
  stockfishLevel: number,
  logger: Logger,
): Promise<StockfishGameResult> => {
  const gameId = crypto.randomUUID()
  const chess = new Chess()
  const moves: StockfishGameMove[] = []
  let totalCentipawnLoss = 0
  let aiMoveCount = 0
  let blunders = 0
  let mistakes = 0
  let inaccuracies = 0

  const result: StockfishGameResult = {
    id: gameId,
    createdAt: Date.now(),
    status: 'running',
    provider,
    model,
    aiColor,
    stockfishLevel,
    moves: [],
    totalMoves: 0,
  }

  const enginePath = process.env.STOCKFISH_BIN_PATH
  if (!enginePath) {
    result.status = 'failed'
    result.resultReason = 'STOCKFISH_BIN_PATH not set'
    return result
  }

  const engine = new StockfishEngine()

  try {
    logger.info(`Starting game: ${provider}/${model} as ${aiColor} vs Stockfish level ${stockfishLevel}`)
    await engine.init(enginePath)
    await engine.setLevel(stockfishLevel)

    let moveNumber = 0
    let illegalAttempts = 0

    while (!chess.isGameOver() && moveNumber < MAX_MOVES) {
      const currentPlayer = chess.turn() === 'w' ? 'white' : 'black'
      const isAiTurn = currentPlayer === aiColor
      const fenBefore = chess.fen()

      moveNumber++

      if (!isAiTurn) {
        // Stockfish's turn
        const uciMove = await engine.getBestMove(fenBefore, 500) // 500ms think time

        // Convert UCI (e.g. "e7e8q") into the from/to/promotion form chess.js accepts
        const from = uciMove.slice(0, 2)
        const to = uciMove.slice(2, 4)
        const promotion = uciMove.length > 4 ? uciMove[4] : undefined

        const move = chess.move({ from, to, promotion })
        if (move) {
          moves.push({
            moveNumber,
            player: currentPlayer,
            moveSan: move.san,
            fen: chess.fen(),
            isAiMove: false,
          })
          logger.info(` Move ${moveNumber}: ${move.san} (Stockfish)`)
        }
        continue
      }

      // AI's turn - get move from LLM
      const validMoves = chess.moves({ verbose: true })
      const templateData = {
        fenBefore,
        fen: fenBefore,
        inCheck: chess.isCheck(),
        player: currentPlayer,
        validMoves,
        totalMoves: validMoves.length,
      }

      const prompt = mustache.render(promptTemplate, templateData, {}, { escape: (v: string) => v })

      const startTime = Date.now()
      let moveSan: string | null | undefined = null

      try {
        const response = await makePrompt({
          prompt,
          provider,
          model,
          logger,
        })

        moveSan = response?.moveSan
      } catch (e) {
        // Counted below as a failed attempt, but the cause is logged rather than discarded.
        logger.warn('AI prompt failed', { error: e instanceof Error ? e.message : 'Unknown error', moveNumber })
      }

      const responseTime = Date.now() - startTime

      // Apply the move on its own, so that only a rejected move counts as "illegal".
      // chess.js throws on an illegal move in current versions and returned null in older ones.
      let played = false
      if (moveSan) {
        try {
          played = chess.move(moveSan) != null
        } catch {
          played = false
        }
      }

      if (!moveSan || !played) {
        illegalAttempts++
        if (moveSan) logger.warn(` Illegal move attempt ${illegalAttempts}: ${moveSan}`)

        if (illegalAttempts >= MAX_ILLEGAL_ATTEMPTS) {
          result.result = 'ai_illegal_move'
          result.resultReason = moveSan
            ? `Too many illegal moves (last: ${moveSan})`
            : 'AI failed to return valid move'
          break
        }
        moveNumber-- // Retry the same half-move
        continue
      }

      const fenAfter = chess.fen()

      // Evaluate only after the move is known to be legal. An engine error here
      // propagates to the outer catch instead of being mistaken for an illegal move.
      const evalBefore = await engine.evaluate(fenBefore)
      const evalAfter = await engine.evaluate(fenAfter)

      // StockfishEngine.evaluate returns the raw UCI score, and UCI reports scores
      // from the side to move's point of view. Before the move that side is the AI;
      // after it, the opponent — so only the second score is negated.
      const scoreBefore = evalBefore.score
      const scoreAfter = 0 - evalAfter.score // "0 -" avoids producing -0
      const centipawnLoss = Math.max(0, scoreBefore - scoreAfter)

      totalCentipawnLoss += centipawnLoss
      aiMoveCount++

      if (centipawnLoss > 100) blunders++
      else if (centipawnLoss > 50) mistakes++
      else if (centipawnLoss > 25) inaccuracies++

      moves.push({
        moveNumber,
        player: currentPlayer,
        moveSan,
        fen: fenAfter,
        centipawnScore: scoreAfter,
        bestMove: evalBefore.bestMove,
        centipawnLoss,
        isAiMove: true,
        responseTime,
      })

      illegalAttempts = 0
      logger.info(` Move ${moveNumber}: ${moveSan} (CPL: ${centipawnLoss})`)
    }

    // Determine result (unless the loop already recorded one)
    if (!result.result) {
      if (chess.isCheckmate()) {
        // The side to move in a checkmated position is the loser.
        const winner = chess.turn() === 'w' ? 'black' : 'white'
        result.result = winner === aiColor ? 'ai_win' : 'stockfish_win'
        result.resultReason = 'Checkmate'
      } else if (chess.isDraw()) {
        result.result = 'draw'
        if (chess.isStalemate()) result.resultReason = 'Stalemate'
        else if (chess.isThreefoldRepetition()) result.resultReason = 'Threefold repetition'
        else if (chess.isInsufficientMaterial()) result.resultReason = 'Insufficient material'
        else result.resultReason = '50-move rule'
      } else if (moveNumber >= MAX_MOVES) {
        result.result = 'draw'
        result.resultReason = 'Max moves reached'
      }
    }

    result.moves = moves
    result.totalMoves = moves.length
    result.finalFen = chess.fen()
    result.pgn = chess.pgn()
    result.aiMoveCount = aiMoveCount
    result.totalCentipawnLoss = totalCentipawnLoss
    result.averageCentipawnLoss = aiMoveCount > 0 ? totalCentipawnLoss / aiMoveCount : 0
    result.blunders = blunders
    result.mistakes = mistakes
    result.inaccuracies = inaccuracies
    result.status = 'completed'
    result.completedAt = Date.now()

    logger.info(`Game completed: ${result.result} - ACPL: ${result.averageCentipawnLoss?.toFixed(1)}`)
  } catch (error) {
    result.status = 'failed'
    result.resultReason = error instanceof Error ? error.message : 'Unknown error'
    logger.error('Game failed', { error })
  } finally {
    // Always release the engine process, whatever the outcome.
    await engine.quit()
  }

  return result
}
/**
 * Detect whether a move was a pawn promotion and, if so, to which piece.
 *
 * A move counts as a promotion only when both hold:
 *  - the destination is on rank 1 or 8 and holds a queen, rook, bishop or knight
 *    in `fenAfter`;
 *  - the side owning that piece has fewer pawns in `fenAfter` than in `fenBefore`
 *    (within one move, promotion is the only way the mover loses a pawn).
 *
 * The second check stops ordinary piece moves to a back rank from being
 * reported as promotions.
 *
 * @param fenBefore FEN before the move
 * @param fenAfter FEN after the move
 * @param to destination square in algebraic notation, e.g. `"e8"`
 * @returns lowercase promotion piece (`q`, `r`, `b`, `n`), or `undefined` when the
 *   move is not a promotion or the inputs are malformed
 */
const detectPromotion = (fenBefore: string, fenAfter: string, to: string): string | undefined => {
  // Promotions can only land on a back rank
  const toRank = to[1]
  if (toRank !== '1' && toRank !== '8') return undefined

  const file = to.charCodeAt(0) - 'a'.charCodeAt(0)
  if (file < 0 || file > 7) return undefined

  // FEN lists ranks from 8 down to 1, so rank 8 is row 0 and rank 1 is row 7
  const row = fenAfter.split(' ')[0].split('/')[toRank === '8' ? 0 : 7]
  if (row === undefined) return undefined

  // Walk the row to find what occupies the destination file; digits are runs of empty squares
  let col = 0
  let occupant: string | undefined
  for (const char of row) {
    if (/\d/.test(char)) {
      col += parseInt(char, 10)
    } else {
      if (col === file) {
        occupant = char
        break
      }
      col++
    }
    if (col > file) break // destination fell inside an empty run
  }
  if (occupant === undefined) return undefined

  const piece = occupant.toLowerCase()
  if (!['q', 'r', 'b', 'n'].includes(piece)) return undefined

  // FEN uses lowercase for black pieces and uppercase for white
  const ownPawn = occupant === piece ? 'p' : 'P'
  const countPawns = (fen: string): number => [...fen.split(' ')[0]].filter((c) => c === ownPawn).length

  return countPawns(fenAfter) < countPawns(fenBefore) ? piece : undefined
}
/**
 * POST /benchmark/positions/generate
 *
 * Returns the stored `default` position set when it already holds at least
 * `count` positions and `force` is false. Otherwise generates `count` new
 * positions, stores them under `sets/default` and returns the new set.
 * Responds 400 when generation yields no positions or throws.
 */
export const handler: Handlers['GeneratePositionSet'] = async (req, { logger, streams }) => {
  const { count, force } = req.body

  logger.info('Generating position set', { count, force })

  // Reuse the stored set when it is large enough and regeneration was not forced
  const stored = await streams.positionSet.get('sets', 'default')
  if (!force && stored && stored.positions.length >= count) {
    logger.info('Using existing position set', { existingCount: stored.positions.length })
    return { status: 200, body: stored }
  }

  try {
    const generated = generateTestPositions({ count })

    if (generated.length === 0) {
      return { status: 400, body: { message: 'Failed to generate any positions' } }
    }

    const freshSet = {
      id: `positions-${Date.now()}`,
      createdAt: Date.now(),
      count: generated.length,
      positions: generated,
    }

    // Overwrites whatever was stored under the default key
    await streams.positionSet.set('sets', 'default', freshSet)

    logger.info('Position set created', { count: generated.length })

    return { status: 200, body: freshSet }
  } catch (error) {
    logger.error('Failed to generate positions', { error })

    const message = error instanceof Error ? error.message : 'Failed to generate positions'
    return { status: 400, body: { message } }
  }
}
status: 400, + body: { message: `Model ${model} is not supported for provider ${provider}` }, + } + } + + // Get or create position set + let positionSet = await streams.positionSet.get('sets', 'default') + if (!positionSet || positionSet.positions.length === 0) { + logger.info('No position set found, generating new one') + const positions = generateTestPositions({ count: 20 }) + positionSet = { + id: `positions-${Date.now()}`, + createdAt: Date.now(), + count: positions.length, + positions, + } + await streams.positionSet.set('sets', 'default', positionSet) + } + + logger.info('Starting legal move benchmark', { provider, model, positionCount: positionSet.positions.length }) + + try { + const run = await runLegalMoveBenchmark(positionSet.positions, provider, model, logger) + + // Store the run result + await streams.legalMoveBenchmark.set('runs', run.id, run) + + // Update summary for this model + const summaryId = `${provider}:${model}` + const existingSummary = await streams.legalMoveBenchmarkSummary.get('models', summaryId) + + const newSummary = { + id: summaryId, + provider, + model, + runsCompleted: (existingSummary?.runsCompleted ?? 0) + 1, + averageScore: existingSummary + ? (existingSummary.averageScore * existingSummary.runsCompleted + (run.averageFinalScore ?? 0)) / + (existingSummary.runsCompleted + 1) + : (run.averageFinalScore ?? 0), + bestScore: Math.max(existingSummary?.bestScore ?? 0, run.averageFinalScore ?? 0), + worstScore: existingSummary + ? Math.min(existingSummary.worstScore, run.averageFinalScore ?? 0) + : (run.averageFinalScore ?? 0), + lastRunAt: Date.now(), + } + + await streams.legalMoveBenchmarkSummary.set('models', summaryId, newSummary) + + logger.info('Benchmark completed and stored', { runId: run.id }) + + return { status: 200, body: run } + } catch (error) { + logger.error('Benchmark failed', { error }) + return { + status: 400, + body: { message: error instanceof Error ? 
/**
 * Parse a pagination query value as a non-negative integer.
 * Returns `fallback` when the value is missing, not a string, not numeric or negative.
 */
const parsePaginationParam = (raw: unknown, fallback: number): number => {
  if (typeof raw !== 'string') return fallback
  const value = Number.parseInt(raw, 10)
  return Number.isFinite(value) && value >= 0 ? value : fallback
}

/**
 * GET /benchmark/legal-moves/runs
 *
 * Lists legal-move benchmark runs, newest first, optionally filtered by
 * `provider` and/or `model` and paginated with `limit` (default 20) and
 * `offset` (default 0). Invalid or negative pagination values fall back to the
 * defaults instead of silently producing an empty or end-relative page.
 *
 * The heavy `positions` and `results` arrays are stripped from each run and
 * replaced by `resultsCount`; `total` is the number of runs matching the
 * filters before pagination.
 */
export const handler: Handlers['GetBenchmarkRuns'] = async (req, { logger, streams }) => {
  const params = req.queryParams as Record<string, string | string[] | undefined>
  // A repeated query parameter may arrive as an array; use its first value.
  const first = (value: string | string[] | undefined): string | undefined =>
    Array.isArray(value) ? value[0] : value

  const provider = first(params.provider)
  const model = first(params.model)
  const limit = parsePaginationParam(first(params.limit), 20)
  const offset = parsePaginationParam(first(params.offset), 0)

  logger.info('Fetching benchmark runs', { provider, model, limit, offset })

  const allRuns = await streams.legalMoveBenchmark.getGroup('runs')

  // Filter (an absent filter matches everything)
  const filtered = allRuns.filter((run) => {
    if (provider && run.provider !== provider) return false
    if (model && run.model !== model) return false
    return true
  })

  // Sort by most recent; `filtered` is already a copy, so sorting in place is safe
  filtered.sort((a, b) => (b.createdAt ?? 0) - (a.createdAt ?? 0))

  const total = filtered.length

  // Paginate
  const paginated = filtered.slice(offset, offset + limit)

  // Remove heavy fields and add count
  const runs = paginated.map(({ positions: _positions, results, ...rest }) => ({
    ...rest,
    resultsCount: results?.length ?? 0,
  }))

  return { status: 200, body: { runs, total } }
}
= { + type: 'api', + name: 'GetBenchmarkLeaderboard', + description: 'Get legal move benchmark leaderboard sorted by average score', + path: '/benchmark/legal-moves/leaderboard', + method: 'GET', + emits: [], + flows: ['benchmark'], + responseSchema: { + 200: z.object({ + leaderboard: z.array(LegalMoveBenchmarkSummarySchema), + }), + }, +} + +export const handler: Handlers['GetBenchmarkLeaderboard'] = async (req, { logger, streams }) => { + logger.info('Fetching benchmark leaderboard') + + const summaries = await streams.legalMoveBenchmarkSummary.getGroup('models') + + // Sort by average score descending + const sorted = summaries.sort((a, b) => (b.averageScore ?? 0) - (a.averageScore ?? 0)) + + return { status: 200, body: { leaderboard: sorted } } +} diff --git a/api/steps/benchmark/05-fetch-puzzle-set.step.ts b/api/steps/benchmark/05-fetch-puzzle-set.step.ts new file mode 100644 index 0000000..a1e4034 --- /dev/null +++ b/api/steps/benchmark/05-fetch-puzzle-set.step.ts @@ -0,0 +1,72 @@ +import { ApiRouteConfig, Handlers } from 'motia' +import { z } from 'zod' +import { PuzzleSetSchema, PuzzleThemeSchema } from '@chessarena/types/puzzle-benchmark' +import { fetchPuzzles } from '../../services/benchmark/fetch-lichess-puzzles' + +const bodySchema = z.object({ + theme: PuzzleThemeSchema, + count: z.number().min(1).max(100).default(10), +}) + +export const config: ApiRouteConfig = { + type: 'api', + name: 'FetchPuzzleSet', + description: 'Fetch and store a set of puzzles from Lichess', + path: '/benchmark/puzzles/fetch', + method: 'POST', + emits: [], + flows: ['benchmark'], + bodySchema, + responseSchema: { + 200: PuzzleSetSchema, + 400: z.object({ message: z.string() }), + }, +} + +export const handler: Handlers['FetchPuzzleSet'] = async (req, { logger, streams }) => { + const { theme, count } = req.body + + logger.info('Fetching puzzle set', { theme, count }) + + // Check if we already have a puzzle set for this theme + const existingSet = await 
streams.puzzleSet.get('sets', theme) + if (existingSet && existingSet.puzzles.length >= count) { + logger.info('Using existing puzzle set', { theme, existingCount: existingSet.puzzles.length }) + return { status: 200, body: existingSet } + } + + try { + const existingIds = new Set(existingSet?.puzzles.map((p) => p.id) ?? []) + const needed = existingSet ? Math.max(0, count - existingSet.puzzles.length) : count + + const fetched = needed > 0 ? await fetchPuzzles(theme, needed, logger) : [] + const newUnique = fetched.filter((p) => !existingIds.has(p.id)) + + const puzzles = existingSet ? [...existingSet.puzzles, ...newUnique] : newUnique + + if (puzzles.length === 0) { + return { status: 400, body: { message: 'Failed to fetch any puzzles' } } + } + + const puzzleSet = { + id: `${theme}-${Date.now()}`, + theme, + createdAt: Date.now(), + puzzles, + count: puzzles.length, + } + + // Store the puzzle set + await streams.puzzleSet.set('sets', theme, puzzleSet) + + logger.info('Puzzle set created', { theme, count: puzzles.length }) + + return { status: 200, body: puzzleSet } + } catch (error) { + logger.error('Failed to fetch puzzles', { error }) + return { + status: 400, + body: { message: error instanceof Error ? 
error.message : 'Failed to fetch puzzles' }, + } + } +} diff --git a/api/steps/benchmark/06-run-puzzle-benchmark.step.ts b/api/steps/benchmark/06-run-puzzle-benchmark.step.ts new file mode 100644 index 0000000..318d2c4 --- /dev/null +++ b/api/steps/benchmark/06-run-puzzle-benchmark.step.ts @@ -0,0 +1,92 @@ +import { ApiRouteConfig, Handlers } from 'motia' +import { z } from 'zod' +import { AiModelProviderSchema } from '@chessarena/types/ai-models' +import { PuzzleBenchmarkRunSchema, PuzzleThemeSchema } from '@chessarena/types/puzzle-benchmark' +import { runPuzzleBenchmark } from '../../services/benchmark/run-puzzle-benchmark' +import { getModelsForProvider } from '../../services/ai/models' + +const bodySchema = z.object({ + provider: AiModelProviderSchema(), + model: z.string(), + theme: PuzzleThemeSchema, + count: z.number().min(1).max(100).default(10), +}) + +export const config: ApiRouteConfig = { + type: 'api', + name: 'RunPuzzleBenchmark', + description: 'Run puzzle benchmark for a model', + path: '/benchmark/puzzles/run', + method: 'POST', + emits: [], + flows: ['benchmark'], + bodySchema, + responseSchema: { + 200: PuzzleBenchmarkRunSchema, + 400: z.object({ message: z.string() }), + }, +} + +export const handler: Handlers['RunPuzzleBenchmark'] = async (req, { logger, streams }) => { + const { provider, model, theme, count } = req.body + + // Validate model exists + const supportedModels = getModelsForProvider(provider) + if (!supportedModels.includes(model)) { + return { + status: 400, + body: { message: `Model ${model} is not supported for provider ${provider}` }, + } + } + + // Get the puzzle set + const puzzleSet = await streams.puzzleSet.get('sets', theme) + if (!puzzleSet || puzzleSet.puzzles.length === 0) { + return { + status: 400, + body: { message: `No puzzle set found for theme ${theme}. 
Fetch puzzles first.` }, + } + } + + const puzzles = puzzleSet.puzzles.slice(0, count) + logger.info('Starting puzzle benchmark', { provider, model, theme, puzzleCount: puzzles.length }) + + try { + const run = await runPuzzleBenchmark(puzzles, puzzleSet.id, theme, provider, model, logger) + + // Store the run + await streams.puzzleBenchmark.set('runs', run.id, run) + + // Update summary + const summaryId = `${provider}:${model}` + const existingSummary = await streams.puzzleBenchmarkSummary.get('models', summaryId) + + const newSummary = { + id: summaryId, + provider, + model, + runsCompleted: (existingSummary?.runsCompleted ?? 0) + 1, + lastRunAt: Date.now(), + mateIn1Accuracy: theme === 'mateIn1' ? run.accuracy : existingSummary?.mateIn1Accuracy, + oneMoveAccuracy: theme === 'oneMove' ? run.accuracy : existingSummary?.oneMoveAccuracy, + overallAccuracy: 0, // Will be calculated below + } + + // Calculate overall accuracy if both themes have been run + if (newSummary.mateIn1Accuracy !== undefined && newSummary.oneMoveAccuracy !== undefined) { + newSummary.overallAccuracy = (newSummary.mateIn1Accuracy + newSummary.oneMoveAccuracy) / 2 + } else { + newSummary.overallAccuracy = newSummary.mateIn1Accuracy ?? newSummary.oneMoveAccuracy ?? 0 + } + + await streams.puzzleBenchmarkSummary.set('models', summaryId, newSummary) + + return { status: 200, body: run } + } catch (error) { + logger.error('Puzzle benchmark failed', { error }) + return { + status: 400, + body: { message: error instanceof Error ? 
error.message : 'Benchmark failed' }, + } + } +} diff --git a/api/steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts b/api/steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts new file mode 100644 index 0000000..0b0faaa --- /dev/null +++ b/api/steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts @@ -0,0 +1,147 @@ +import { ApiRouteConfig, Handlers, Logger } from 'motia' +import { z } from 'zod' +import { PuzzleThemeSchema } from '@chessarena/types/puzzle-benchmark' +import { getAllModels } from '../../services/ai/models' +import { AiModelProvider } from '@chessarena/types/ai-models' +import { runPuzzleBenchmark } from '../../services/benchmark/run-puzzle-benchmark' +import { fetchPuzzles } from '../../services/benchmark/fetch-lichess-puzzles' +import { mapWithConcurrency, parsePositiveInt } from '../../services/benchmark/concurrency' + +const bodySchema = z.object({ + theme: PuzzleThemeSchema, + count: z.number().min(1).max(100).default(10), + rerunCompleted: z.boolean().default(false), +}) + +export const config: ApiRouteConfig = { + type: 'api', + name: 'RunAllPuzzleBenchmarks', + description: 'Fetch a puzzle set (if needed) and run puzzle benchmark for all models', + path: '/benchmark/puzzles/run-all', + method: 'POST', + emits: [], + flows: ['benchmark'], + bodySchema, + responseSchema: { + 200: z.object({ + message: z.string(), + theme: PuzzleThemeSchema, + puzzleCount: z.number(), + totalModels: z.number(), + }), + 400: z.object({ message: z.string() }), + }, +} + +const shouldSkipModel = (theme: z.infer, existing: any | undefined): boolean => { + if (!existing) return false + if (theme === 'mateIn1') return typeof existing.mateIn1Accuracy === 'number' + if (theme === 'oneMove') return typeof existing.oneMoveAccuracy === 'number' + return false +} + +export const handler: Handlers['RunAllPuzzleBenchmarks'] = async (req, { logger, streams }) => { + const theme = req.body.theme + const count = req.body.count ?? 
10 + const rerunCompleted = req.body.rerunCompleted + + let puzzleSet = await streams.puzzleSet.get('sets', theme) + if (!puzzleSet || puzzleSet.puzzles.length < count) { + const needed = puzzleSet ? Math.max(0, count - puzzleSet.puzzles.length) : count + const fetched = needed > 0 ? await fetchPuzzles(theme, needed, logger as Logger) : [] + const existingIds = new Set(puzzleSet?.puzzles.map((p) => p.id) ?? []) + const newUnique = fetched.filter((p) => !existingIds.has(p.id)) + const puzzles = puzzleSet ? [...puzzleSet.puzzles, ...newUnique] : newUnique + + if (puzzles.length === 0) { + return { status: 400, body: { message: 'Failed to fetch any puzzles' } } + } + + puzzleSet = { + id: `${theme}-${Date.now()}`, + theme, + createdAt: Date.now(), + puzzles, + count: puzzles.length, + } + await streams.puzzleSet.set('sets', theme, puzzleSet) + } + + const uniqueById = new Map() + for (const p of puzzleSet.puzzles) uniqueById.set(p.id, p) + const puzzlesToRun = Array.from(uniqueById.values()).slice(0, count) + + logger.info('RunAllPuzzleBenchmarks starting', { + theme, + requestedCount: count, + availableInSet: puzzleSet.puzzles.length, + uniqueAvailable: uniqueById.size, + using: puzzlesToRun.length, + rerunCompleted, + }) + + const allModels = getAllModels() + const existingSummaries = await streams.puzzleBenchmarkSummary.getGroup('models') + const existingMap = new Map(existingSummaries.map((s) => [`${s.provider}:${s.model}`, s])) + + const modelsToBenchmark = rerunCompleted + ? 
allModels + : allModels.filter(({ provider, model }) => !shouldSkipModel(theme, existingMap.get(`${provider}:${model}`))) + + const providerConcurrency = parsePositiveInt(process.env.BENCHMARK_PROVIDER_CONCURRENCY, 4) + const modelConcurrencyPerProvider = parsePositiveInt(process.env.BENCHMARK_MODEL_CONCURRENCY_PER_PROVIDER, 1) + + const modelsByProvider = modelsToBenchmark.reduce< + Record + >( + (acc, entry) => { + acc[entry.provider].push(entry) + return acc + }, + { openai: [], gemini: [], claude: [], grok: [] }, + ) + + const providers: AiModelProvider[] = ['openai', 'gemini', 'claude', 'grok'] + + await mapWithConcurrency(providers, providerConcurrency, async (provider) => { + const models = modelsByProvider[provider] + await mapWithConcurrency(models, modelConcurrencyPerProvider, async ({ model }) => { + logger.info(`\n=== PUZZLES MODEL: ${provider}/${model} (${theme}) ===`) + + const run = await runPuzzleBenchmark(puzzlesToRun, puzzleSet!.id, theme, provider, model, logger) + await streams.puzzleBenchmark.set('runs', run.id, run) + + const summaryId = `${provider}:${model}` + const existingSummary = await streams.puzzleBenchmarkSummary.get('models', summaryId) + + const newSummary = { + id: summaryId, + provider, + model, + runsCompleted: (existingSummary?.runsCompleted ?? 0) + 1, + lastRunAt: Date.now(), + mateIn1Accuracy: theme === 'mateIn1' ? run.accuracy : existingSummary?.mateIn1Accuracy, + oneMoveAccuracy: theme === 'oneMove' ? run.accuracy : existingSummary?.oneMoveAccuracy, + overallAccuracy: 0, + } + + if (newSummary.mateIn1Accuracy !== undefined && newSummary.oneMoveAccuracy !== undefined) { + newSummary.overallAccuracy = (newSummary.mateIn1Accuracy + newSummary.oneMoveAccuracy) / 2 + } else { + newSummary.overallAccuracy = newSummary.mateIn1Accuracy ?? newSummary.oneMoveAccuracy ?? 
0 + } + + await streams.puzzleBenchmarkSummary.set('models', summaryId, newSummary) + }) + }) + + return { + status: 200, + body: { + message: `Puzzle benchmark completed for ${modelsToBenchmark.length} models`, + theme, + puzzleCount: count, + totalModels: modelsToBenchmark.length, + }, + } +} diff --git a/api/steps/benchmark/07-get-puzzle-leaderboard.step.ts b/api/steps/benchmark/07-get-puzzle-leaderboard.step.ts new file mode 100644 index 0000000..070f8e7 --- /dev/null +++ b/api/steps/benchmark/07-get-puzzle-leaderboard.step.ts @@ -0,0 +1,29 @@ +import { ApiRouteConfig, Handlers } from 'motia' +import { z } from 'zod' +import { PuzzleBenchmarkSummarySchema } from '@chessarena/types/puzzle-benchmark' + +export const config: ApiRouteConfig = { + type: 'api', + name: 'GetPuzzleLeaderboard', + description: 'Get puzzle benchmark leaderboard', + path: '/benchmark/puzzles/leaderboard', + method: 'GET', + emits: [], + flows: ['benchmark'], + responseSchema: { + 200: z.object({ + leaderboard: z.array(PuzzleBenchmarkSummarySchema), + }), + }, +} + +export const handler: Handlers['GetPuzzleLeaderboard'] = async (req, { logger, streams }) => { + logger.info('Fetching puzzle leaderboard') + + const summaries = await streams.puzzleBenchmarkSummary.getGroup('models') + + // Sort by overall accuracy descending + const sorted = summaries.sort((a, b) => (b.overallAccuracy ?? 0) - (a.overallAccuracy ?? 
0)) + + return { status: 200, body: { leaderboard: sorted } } +} diff --git a/api/steps/benchmark/08-get-puzzle-sets.step.ts b/api/steps/benchmark/08-get-puzzle-sets.step.ts new file mode 100644 index 0000000..7c1f6c5 --- /dev/null +++ b/api/steps/benchmark/08-get-puzzle-sets.step.ts @@ -0,0 +1,35 @@ +import { ApiRouteConfig, Handlers } from 'motia' +import { z } from 'zod' +import { PuzzleSetSchema } from '@chessarena/types/puzzle-benchmark' + +export const config: ApiRouteConfig = { + type: 'api', + name: 'GetPuzzleSets', + description: 'Get all cached puzzle sets', + path: '/benchmark/puzzles/sets', + method: 'GET', + emits: [], + flows: ['benchmark'], + responseSchema: { + 200: z.object({ + sets: z.array( + PuzzleSetSchema.omit({ puzzles: true }).extend({ + puzzleCount: z.number(), + }), + ), + }), + }, +} + +export const handler: Handlers['GetPuzzleSets'] = async (req, { logger, streams }) => { + logger.info('Fetching puzzle sets') + + const allSets = await streams.puzzleSet.getGroup('sets') + + const sets = allSets.map(({ puzzles, ...rest }) => ({ + ...rest, + puzzleCount: puzzles?.length ?? 
/**
 * Scores a model's list of moves against the true legal moves using F1.
 *
 * Both lists are treated as sets: a move the model repeats is counted once,
 * so recall and precision can never exceed 100%.
 *
 * @param legalMoves - every legal move in the position (SAN)
 * @param modelMoves - the moves the model answered with (SAN, may contain duplicates)
 * @returns `correct`, `illegal` and `missed` move lists, `accuracy` (recall, 0-100),
 *          `penalty` (100 - precision) and `finalScore` (F1, 0-100)
 */
const calculateScore = (legalMoves: string[], modelMoves: string[]) => {
  const legalSet = new Set(legalMoves)
  const answeredSet = new Set(modelMoves)

  // Work on de-duplicated lists so a repeated answer cannot inflate the counts.
  const uniqueLegal = [...legalSet]
  const uniqueAnswers = [...answeredSet]

  const correct = uniqueAnswers.filter((move) => legalSet.has(move))
  const illegal = uniqueAnswers.filter((move) => !legalSet.has(move))
  const missed = uniqueLegal.filter((move) => !answeredSet.has(move))

  const recall = uniqueLegal.length > 0 ? (correct.length / uniqueLegal.length) * 100 : 0
  const precision = uniqueAnswers.length > 0 ? (correct.length / uniqueAnswers.length) * 100 : 0
  // F1 is the harmonic mean of precision and recall; guard the 0/0 case.
  const finalScore = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0

  return { correct, illegal, missed, accuracy: recall, penalty: 100 - precision, finalScore }
}
e.message : 'Unknown error' + } + + const responseTime = Date.now() - startTime + const { correct, illegal, missed, accuracy, penalty, finalScore } = calculateScore(position.legalMoves, modelMoves) + + return { + positionId: position.id, + modelMoves, + correctMoves: correct, + illegalMoves: illegal, + missedMoves: missed, + accuracy, + penalty, + finalScore, + responseTime, + rawResponse, + error, + } +} + +export const handler: Handlers['RunAllBenchmarks'] = async (req, { logger, streams }) => { + const { positionCount, force, rerunCompleted } = req.body + + logger.info('=== STARTING FULL BENCHMARK ===', { positionCount, force }) + + // Get or create positions + let positionSet = await streams.positionSet.get('sets', 'default') + if (!positionSet || positionSet.positions.length === 0 || force) { + logger.info('Generating positions...', { count: positionCount }) + const positions = generateTestPositions({ count: positionCount }) + positionSet = { + id: `positions-${Date.now()}`, + createdAt: Date.now(), + count: positions.length, + positions, + } + await streams.positionSet.set('sets', 'default', positionSet) + logger.info('Positions generated', { count: positions.length }) + } + + const positions = positionSet.positions + + // Get all models to benchmark using the helper function + const allModels = getAllModels() + + const existingSummaries = await streams.legalMoveBenchmarkSummary.getGroup('models') + const completedSet = new Set(existingSummaries.map((s) => `${s.provider}:${s.model}`)) + + const modelsToBenchmark = rerunCompleted + ? 
allModels + : allModels.filter((m) => !completedSet.has(`${m.provider}:${m.model}`)) + + const totalModels = modelsToBenchmark.length + logger.info('Models to benchmark', { totalModels }) + + const runAllBenchmarks = async () => { + const providerConcurrency = parsePositiveInt(process.env.BENCHMARK_PROVIDER_CONCURRENCY, 4) + const modelConcurrencyPerProvider = parsePositiveInt(process.env.BENCHMARK_MODEL_CONCURRENCY_PER_PROVIDER, 1) + + const modelsByProvider = modelsToBenchmark.reduce< + Record + >( + (acc, entry) => { + acc[entry.provider].push(entry) + return acc + }, + { openai: [], gemini: [], claude: [], grok: [] }, + ) + + const providers: AiModelProvider[] = ['openai', 'gemini', 'claude', 'grok'] + + await mapWithConcurrency(providers, providerConcurrency, async (provider) => { + const models = modelsByProvider[provider] + await mapWithConcurrency(models, modelConcurrencyPerProvider, async ({ model }) => { + logger.info(`\n=== MODEL: ${provider}/${model} ===`) + + const run = await runLegalMoveBenchmark(positions, provider, model, logger) + + await streams.legalMoveBenchmark.set('runs', run.id, run) + + const key = `${provider}:${model}` + if (run.status === 'completed' && run.averageFinalScore !== undefined) { + const existing = await streams.legalMoveBenchmarkSummary.get('models', key) + await streams.legalMoveBenchmarkSummary.set('models', key, { + id: key, + provider, + model, + runsCompleted: (existing?.runsCompleted ?? 0) + 1, + averageScore: existing + ? (existing.averageScore * existing.runsCompleted + run.averageFinalScore) / (existing.runsCompleted + 1) + : run.averageFinalScore, + bestScore: Math.max(existing?.bestScore ?? 0, run.averageFinalScore), + worstScore: existing ? 
/**
 * Runs a Stockfish benchmark for one model: two games, the AI playing white
 * first and then black, followed by an update of the per-model summary.
 *
 * Responds 400 when the model is not listed for the provider. Otherwise the
 * handler awaits both games and responds 200 with the run object, whose
 * `status` is `completed` or `failed`.
 *
 * NOTE(review): a failed run is not written to `stockfishBenchmark` and still
 * answers with HTTP 200 — kept as in the original; confirm this is intended.
 */
export const handler: Handlers['RunStockfishBenchmark'] = async (req, { logger, streams }) => {
  const { provider, model, stockfishLevel } = req.body

  // Validate model
  const supportedModels = getModelsForProvider(provider)
  if (!supportedModels.includes(model)) {
    return {
      status: 400,
      body: { message: `Model ${model} is not supported for provider ${provider}` },
    }
  }

  const runId = crypto.randomUUID()
  logger.info('=== STARTING STOCKFISH BENCHMARK ===', { provider, model, stockfishLevel })

  // Typed from the response schema so the fields filled in later (games, ACPL,
  // completion time, final status) are assignable.
  // assumes StockfishBenchmarkRunSchema declares those fields as optional — TODO confirm
  const run: z.infer<typeof StockfishBenchmarkRunSchema> = {
    id: runId,
    createdAt: Date.now(),
    status: 'running',
    provider,
    model,
    stockfishLevel,
    gamesPlayed: 0,
    wins: 0,
    losses: 0,
    draws: 0,
  }

  // Plays one game with the AI on `color` and folds its result into the run tally.
  const playGame = async (gameNumber: 1 | 2, color: 'white' | 'black') => {
    logger.info(`\n=== GAME ${gameNumber}: AI as ${color.toUpperCase()} ===`)
    const game = await playGameAgainstStockfish(provider, model, color, stockfishLevel, logger)

    run.gamesPlayed++
    if (game.result === 'ai_win') run.wins++
    else if (game.result === 'stockfish_win') run.losses++
    else run.draws++

    logger.info(`Game ${gameNumber} result: ${game.result}, ACPL: ${game.averageCentipawnLoss?.toFixed(1)}`)
    return game
  }

  // Awaited on purpose (fire-and-forget causes streaming issues).
  try {
    const gameAsWhite = await playGame(1, 'white')
    run.gameAsWhite = gameAsWhite

    // Small delay between games
    await new Promise<void>((resolve) => setTimeout(resolve, 2000))

    const gameAsBlack = await playGame(2, 'black')
    run.gameAsBlack = gameAsBlack

    // Overall ACPL is weighted by the number of AI moves in each game.
    const totalMoves = (gameAsWhite.aiMoveCount ?? 0) + (gameAsBlack.aiMoveCount ?? 0)
    const totalLoss = (gameAsWhite.totalCentipawnLoss ?? 0) + (gameAsBlack.totalCentipawnLoss ?? 0)
    const overallAcpl = totalMoves > 0 ? totalLoss / totalMoves : 0

    run.overallAcpl = overallAcpl
    run.status = 'completed'
    run.completedAt = Date.now()

    // Store run
    await streams.stockfishBenchmark.set('runs', runId, run)

    // Update the running per-model summary (incremental mean for ACPL).
    const summaryId = `${provider}:${model}`
    const existing = await streams.stockfishBenchmarkSummary.get('models', summaryId)
    await streams.stockfishBenchmarkSummary.set('models', summaryId, {
      id: summaryId,
      provider,
      model,
      runsCompleted: (existing?.runsCompleted ?? 0) + 1,
      averageAcpl: existing
        ? (existing.averageAcpl * existing.runsCompleted + overallAcpl) / (existing.runsCompleted + 1)
        : overallAcpl,
      bestAcpl: Math.min(existing?.bestAcpl ?? Infinity, overallAcpl),
      wins: (existing?.wins ?? 0) + run.wins,
      losses: (existing?.losses ?? 0) + run.losses,
      draws: (existing?.draws ?? 0) + run.draws,
      lastRunAt: Date.now(),
    })

    logger.info('\n========================================')
    logger.info('=== STOCKFISH BENCHMARK COMPLETED ===')
    logger.info(
      `${provider}/${model}: Overall ACPL=${overallAcpl.toFixed(1)}, W/L/D: ${run.wins}/${run.losses}/${run.draws}`,
    )
    logger.info('========================================\n')
  } catch (error) {
    run.status = 'failed'
    // Error instances serialise to `{}`; log the message so the cause is visible.
    logger.error('Stockfish benchmark failed', {
      error: error instanceof Error ? error.message : String(error),
    })
  }

  return {
    status: 200,
    body: run,
  }
}
/**
 * Returns every Stockfish benchmark summary, ordered by average ACPL
 * ascending (lower is better). Summaries without an ACPL sort last.
 */
export const handler: Handlers['GetStockfishLeaderboard'] = async (_req, { logger, streams }) => {
  logger.info('Fetching Stockfish leaderboard')

  const summaries = await streams.stockfishBenchmarkSummary.getGroup('models')

  // Sort a copy so the array handed back by the stream is not mutated.
  const sorted = [...summaries].sort((a, b) => {
    const left = a.averageAcpl ?? Infinity
    const right = b.averageAcpl ?? Infinity
    // Infinity - Infinity is NaN; treat two missing values as equal explicitly.
    return left === right ? 0 : left - right
  })

  return { status: 200, body: { leaderboard: sorted } }
}
'guided' + const game = await createGame(req.body.players, streams, logger, user, variant) - logger.info('[CreateGame] Game created', { gameId: game.id }) + logger.info('[CreateGame] Game created', { gameId: game.id, variant }) await emit({ topic: 'chess-game-created', diff --git a/api/steps/chess/05-ai-player-unguided.mustache b/api/steps/chess/05-ai-player-unguided.mustache new file mode 100644 index 0000000..d9b3a27 --- /dev/null +++ b/api/steps/chess/05-ai-player-unguided.mustache @@ -0,0 +1,63 @@ +You are a chess grandmaster playing as {{player}}. + +## Current Position +- FEN: `{{fen}}`. +{{#inCheck}}- WARNING: You are in check! You MUST get out of check.{{/inCheck}} + +## Your Task +Analyze the position and determine your best legal move. You must figure out which moves are legal based on standard chess rules. + +{{#lastInvalidMove}} +ILLEGAL MOVE: {{lastInvalidMove}} was REJECTED because it is not a legal move in this position. +INSTRUCTION: Carefully analyze the board position and choose a LEGAL move according to chess rules. +{{/lastInvalidMove}} + +Three illegal moves end the match instantly. + +## Required Response Format + +Your response must be valid JSON with exactly two fields: +{ + "thought": "Your strategic reasoning (1-2 sentences)", + "moveSan": "YOUR_CHOSEN_MOVE_IN_SAN" +} + +**Steps to follow:** +1. Analyze the current board position from the FEN +2. Determine all legal moves for your pieces +3. Evaluate candidate moves strategically (see guidelines below) +4. Select the strongest legal move +5. Express your move in Standard Algebraic Notation (SAN) +6. 
Respond ONLY with the JSON—no additional text + +**Move notation examples:** +- Pawn moves: e4, d5, exd5 (capture) +- Piece moves: Nf3, Bb5, Qd1 +- Castling: O-O (kingside), O-O-O (queenside) +- Promotion: e8=Q +- Check: Bb5+ + +## Strategy Guidelines +When evaluating moves, consider: +- **Tactical motifs**: Recognize forks, pins, skewers, discovered attacks, and other tactical patterns +- **Position assessment**: + - Material balance + - King safety + - Pawn structure + - Piece activity and coordination + - Weaknesses (backward pawns, weak squares, isolated pawns) +- **Candidate move comparison**: Visualize resulting positions for multiple moves—don't settle for the first good-looking option +- **Long-term planning**: Build plans to win the game, not just reactive moves + - Improve your worst-placed piece + - Control key files and diagonals + - Create favorable imbalances + - Transition to winning endgames +- **Safety checks**: Ensure your move doesn't expose your pieces to capture unless you gain a tactical advantage + +## Critical Rules +1. Your move MUST be legal according to standard chess rules +2. Your response must be ONLY valid JSON with "thought" and "moveSan" fields +3. Double-check that your piece can actually make the move you're choosing +4. Any illegal move will be rejected and count toward your three-strike limit + +Choose your best move now. 
diff --git a/api/steps/chess/05-ai-player.step.ts b/api/steps/chess/05-ai-player.step.ts index d9df264..5d0e08d 100644 --- a/api/steps/chess/05-ai-player.step.ts +++ b/api/steps/chess/05-ai-player.step.ts @@ -25,10 +25,11 @@ export const config: EventConfig = { check: z.boolean({ description: 'Whether the move is a check' }), gameId: z.string({ description: 'The ID of the game' }), }), - includeFiles: ['05-ai-player.mustache'], + includeFiles: ['05-ai-player.mustache', '05-ai-player-unguided.mustache'], } -const template = fs.readFileSync(path.join(__dirname, '05-ai-player.mustache'), 'utf8') +const guidedTemplate = fs.readFileSync(path.join(__dirname, '05-ai-player.mustache'), 'utf8') +const unguidedTemplate = fs.readFileSync(path.join(__dirname, '05-ai-player-unguided.mustache'), 'utf8') export const handler: Handlers['AI_Player'] = async (input, { logger, emit, streams }) => { logger.info('Received ai-move event', { gameId: input.gameId }) @@ -63,21 +64,20 @@ export const handler: Handlers['AI_Player'] = async (input, { logger, emit, stre timestamp: Date.now(), }) - const prompt = mustache.render( - template, - { - fenBefore: input.fenBefore, - fen: input.fen, - inCheck: input.check, - player: input.player, - lastInvalidMove, - validMoves, - totalMoves: validMoves.length, - }, - {}, - { escape: (value: string) => value }, - ) - logger.info('Prompt', { prompt }) + // Arena games always run guided. Rule understanding is benchmarked separately via legal-move bench. 
+ const template = guidedTemplate + const templateData = { + fenBefore: input.fenBefore, + fen: input.fen, + inCheck: input.check, + player: input.player, + lastInvalidMove, + validMoves, + totalMoves: validMoves.length, + } + + const prompt = mustache.render(template, templateData, {}, { escape: (value: string) => value }) + logger.info('Prompt', { prompt, variant: game.variant }) let action: AiPlayerPrompt | undefined diff --git a/api/steps/chess/08-game-ended.step.ts b/api/steps/chess/08-game-ended.step.ts index e68e051..8f9bde5 100644 --- a/api/steps/chess/08-game-ended.step.ts +++ b/api/steps/chess/08-game-ended.step.ts @@ -2,8 +2,10 @@ import { EventConfig, Handlers } from 'motia' import { z } from 'zod' import { models } from '../../services/ai/models' import { generateGameScore } from '../../services/chess/generate-game-score' +import { generatePgn } from '../../services/chess/generate-pgn' import { Scoreboard } from '@chessarena/types/game' import { Leaderboard } from '@chessarena/types/leaderboard' +import { GameHistory } from '@chessarena/types/game-history' import { isAiGame } from '../../services/chess/utils' /* @@ -37,10 +39,48 @@ export const handler: Handlers['GameEnded'] = async (input, { logger, streams }) } const moves = await streams.chessGameMove.getGroup(input.gameId) + const messages = await streams.chessGameMessage.getGroup(input.gameId) const scoreboard = generateGameScore(moves, game) await streams.chessGame.set('game', game.id, { ...game, scoreboard }) + // Archive game to history + const endedAt = Date.now() + const startedAt = game.createdAt ?? 
endedAt + const pgn = generatePgn({ game, moves }) + + const gameHistory: GameHistory = { + id: game.id, + startedAt, + endedAt, + duration: endedAt - startedAt, + whitePlayer: { + provider: game.players.white.ai, + model: game.players.white.model, + isHuman: !game.players.white.ai, + }, + blackPlayer: { + provider: game.players.black.ai, + model: game.players.black.model, + isHuman: !game.players.black.ai, + }, + status: game.status === 'pending' ? 'completed' : game.status, + winner: game.winner, + endGameReason: game.endGameReason, + variant: game.variant ?? 'guided', + totalMoves: moves.length, + whiteIllegalMoves: game.players.white.illegalMoveAttempts ?? 0, + blackIllegalMoves: game.players.black.illegalMoveAttempts ?? 0, + finalFen: game.fen, + moves, + messages, + scoreboard, + pgn, + } + + await streams.chessGameHistory.set('all', game.id, gameHistory) + logger.info('Game archived to history', { gameId: game.id }) + if (!isAiGame(game)) { return } diff --git a/api/steps/chess/10-get-game-history.step.ts b/api/steps/chess/10-get-game-history.step.ts new file mode 100644 index 0000000..07e3f88 --- /dev/null +++ b/api/steps/chess/10-get-game-history.step.ts @@ -0,0 +1,90 @@ +import { ApiRouteConfig, Handlers } from 'motia' +import { z } from 'zod' +import { GameHistorySchema } from '@chessarena/types/game-history' + +export const config: ApiRouteConfig = { + type: 'api', + name: 'GetGameHistory', + description: 'Get game history with optional filters', + path: '/chess/history', + method: 'GET', + emits: [], + flows: ['chess'], + queryParams: [ + { name: 'provider', description: 'Filter by AI provider' }, + { name: 'model', description: 'Filter by model name' }, + { name: 'variant', description: 'Filter by game variant' }, + { name: 'winner', description: 'Filter by winner' }, + { name: 'status', description: 'Filter by game status' }, + { name: 'startDate', description: 'Filter by start date (timestamp)' }, + { name: 'endDate', description: 'Filter by end 
/** Parses a base-10 integer query value; returns undefined when it is absent or not numeric. */
const parseIntParam = (raw: string | undefined): number | undefined => {
  if (!raw) return undefined
  const parsed = Number.parseInt(raw, 10)
  return Number.isNaN(parsed) ? undefined : parsed
}

/**
 * Lists archived games, newest first, with optional filters and pagination.
 *
 * Filters: `provider` / `model` match either player; `variant`, `winner` and
 * `status` match exactly; `startDate` / `endDate` are numeric timestamps that
 * bound `startedAt` / `endedAt`. Non-numeric `limit` / `offset` fall back to
 * 50 / 0 and negative values are clamped to 0, so the response always carries
 * real numbers. `moves` and `messages` are stripped from each item.
 */
export const handler: Handlers['GetGameHistory'] = async (req, { logger, streams }) => {
  logger.info('[GetGameHistory] Fetching game history', { query: req.queryParams })

  const params = req.queryParams as Record<string, string | undefined>
  const { provider, model, status } = params
  const variant = params.variant as 'guided' | 'unguided' | undefined
  const winner = params.winner as 'white' | 'black' | undefined
  const startDate = parseIntParam(params.startDate)
  const endDate = parseIntParam(params.endDate)
  const limit = Math.max(0, parseIntParam(params.limit) ?? 50)
  const offset = Math.max(0, parseIntParam(params.offset) ?? 0)

  const allGames = await streams.chessGameHistory.getGroup('all')

  // Apply filters
  const filtered = allGames.filter((game) => {
    if (provider && game.whitePlayer.provider !== provider && game.blackPlayer.provider !== provider) {
      return false
    }
    if (model && game.whitePlayer.model !== model && game.blackPlayer.model !== model) {
      return false
    }

    if (variant && game.variant !== variant) return false
    if (winner && game.winner !== winner) return false
    if (status && game.status !== status) return false
    // Compare against undefined rather than truthiness so a bound of 0 still applies.
    if (startDate !== undefined && game.startedAt < startDate) return false
    if (endDate !== undefined && game.endedAt > endDate) return false

    return true
  })

  // Sort by most recent first (`filtered` is a fresh array, so in-place is safe)
  filtered.sort((a, b) => b.endedAt - a.endedAt)

  const total = filtered.length

  // Apply pagination
  const paginated = filtered.slice(offset, offset + limit)

  // Remove heavy fields for list view
  const games = paginated.map(({ moves, messages, ...rest }) => rest)

  logger.info('[GetGameHistory] Returning games', { total, returned: games.length })

  return {
    status: 200,
    body: { games, total, limit, offset },
  }
}
/**
 * Renders one value as a CSV field.
 *
 * `null` and `undefined` become the empty string. A value containing a comma,
 * a double quote, or a line break is wrapped in double quotes with embedded
 * quotes doubled; anything else is returned as its plain string form.
 */
const escapeCsvField = (value: string | number | undefined | null): string => {
  if (value == null) return ''

  const text = String(value)
  const needsQuoting = /[",\n\r]/.test(text)
  if (!needsQuoting) return text

  const doubled = text.replace(/"/g, '""')
  return `"${doubled}"`
}
parseInt(params.endDate) : undefined + const format = (params.format as 'json' | 'csv') || 'json' + + const allGames = await streams.chessGameHistory.getGroup('all') + + // Apply filters + let filtered = allGames.filter((game) => { + if (provider) { + const matchesWhite = game.whitePlayer.provider === provider + const matchesBlack = game.blackPlayer.provider === provider + if (!matchesWhite && !matchesBlack) return false + } + + if (model) { + const matchesWhite = game.whitePlayer.model === model + const matchesBlack = game.blackPlayer.model === model + if (!matchesWhite && !matchesBlack) return false + } + + if (variant && game.variant !== variant) return false + if (winner && game.winner !== winner) return false + if (status && game.status !== status) return false + if (startDate && game.startedAt < startDate) return false + if (endDate && game.endedAt > endDate) return false + + return true + }) + + // Sort by most recent first + filtered.sort((a, b) => b.endedAt - a.endedAt) + + if (format === 'csv') { + const headers = [ + 'id', + 'startedAt', + 'endedAt', + 'duration', + 'variant', + 'status', + 'winner', + 'endGameReason', + 'totalMoves', + 'whiteProvider', + 'whiteModel', + 'whiteIllegalMoves', + 'blackProvider', + 'blackModel', + 'blackIllegalMoves', + 'pgn', + ] + + const rows = filtered.map((game) => [ + escapeCsvField(game.id), + escapeCsvField(new Date(game.startedAt).toISOString()), + escapeCsvField(new Date(game.endedAt).toISOString()), + escapeCsvField(game.duration), + escapeCsvField(game.variant), + escapeCsvField(game.status), + escapeCsvField(game.winner), + escapeCsvField(game.endGameReason), + escapeCsvField(game.totalMoves), + escapeCsvField(game.whitePlayer.provider || 'human'), + escapeCsvField(game.whitePlayer.model), + escapeCsvField(game.whiteIllegalMoves), + escapeCsvField(game.blackPlayer.provider || 'human'), + escapeCsvField(game.blackPlayer.model), + escapeCsvField(game.blackIllegalMoves), + escapeCsvField(game.pgn), + ]) + + const 
/**
 * Returns one archived game, including its moves and messages, looked up by
 * the `gameId` path parameter. Responds 404 when no such game is archived.
 */
export const handler: Handlers['GetGameHistoryDetail'] = async (req, { logger, streams }) => {
  const gameId = req.pathParams.gameId
  logger.info('[GetGameHistoryDetail] Fetching game', { gameId })

  const archived = await streams.chessGameHistory.get('all', gameId)
  if (archived) {
    return { status: 200, body: archived }
  }

  logger.warn('[GetGameHistoryDetail] Game not found', { gameId })
  return {
    status: 404,
    body: { message: 'Game not found in history' },
  }
}
/**
 * Starts a guided game between the authenticated user and a randomly selected
 * AI opponent, then emits `chess-game-created`.
 *
 * The user plays the requested colour; `random` is resolved by a coin flip.
 * Responds 401 when the token's user cannot be loaded.
 */
export const handler: Handlers['PlayVsAI'] = async (req, { logger, emit, state, streams }) => {
  logger.info('[PlayVsAI] Starting human vs AI game')

  const users = new UserState(state)
  const user = await users.getUser(req.tokenInfo.sub)
  if (!user) {
    logger.error('[PlayVsAI] User not found', { userId: req.tokenInfo.sub })
    return { status: 401, body: { message: 'User not found' } }
  }

  // Select random AI opponent
  const opponent = selectRandomAI()
  logger.info('[PlayVsAI] Selected AI opponent', opponent)

  // Resolve the human's colour; only 'random' needs the coin flip.
  const requestedColor = req.body.playerColor
  const playerColor: 'white' | 'black' =
    requestedColor === 'random' ? (Math.random() < 0.5 ? 'white' : 'black') : (requestedColor as 'white' | 'black')

  // An empty player object marks the human seat; the other seat gets the AI.
  const aiSeat = { ai: opponent.provider, model: opponent.model }
  const players = playerColor === 'white' ? { white: {}, black: aiSeat } : { white: aiSeat, black: {} }

  const game = await createGame(players, streams, logger, user, 'guided')

  logger.info('[PlayVsAI] Game created', {
    gameId: game.id,
    playerColor,
    opponent: `${opponent.provider}/${opponent.model}`,
  })

  await emit({
    topic: 'chess-game-created',
    data: { gameId: game.id, fenBefore: game.fen },
  })

  return {
    status: 200,
    body: { game, opponent, playerColor },
  }
}
Find the BEST move in this position. + +## Game (PGN) +{{pgn}} + +## Current Position +FEN: {{fen}} +It is {{turn}}'s turn to move. + +## Legal Moves +{{#legalMoves}} +- {{.}} +{{/legalMoves}} + +## Task +This is a {{theme}} puzzle. Find the single best move. + +## Response Format +Return ONLY the move in SAN, with no additional text. + +Choose ONE move from the legal moves list above. +Do NOT use markdown code fences. diff --git a/api/steps/chess/streams/00-chess-game-history.stream.ts b/api/steps/chess/streams/00-chess-game-history.stream.ts new file mode 100644 index 0000000..096bb86 --- /dev/null +++ b/api/steps/chess/streams/00-chess-game-history.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { GameHistorySchema } from '@chessarena/types/game-history' + +export const config: StreamConfig = { + name: 'chessGameHistory', + schema: GameHistorySchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-legal-move-benchmark-summary.stream.ts b/api/steps/chess/streams/00-legal-move-benchmark-summary.stream.ts new file mode 100644 index 0000000..aa59e50 --- /dev/null +++ b/api/steps/chess/streams/00-legal-move-benchmark-summary.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { LegalMoveBenchmarkSummarySchema } from '@chessarena/types/legal-move-benchmark' + +export const config: StreamConfig = { + name: 'legalMoveBenchmarkSummary', + schema: LegalMoveBenchmarkSummarySchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-legal-move-benchmark.stream.ts b/api/steps/chess/streams/00-legal-move-benchmark.stream.ts new file mode 100644 index 0000000..4f88634 --- /dev/null +++ b/api/steps/chess/streams/00-legal-move-benchmark.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { LegalMoveBenchmarkRunSchema } from '@chessarena/types/legal-move-benchmark' + +export const config: StreamConfig = { + name: 'legalMoveBenchmark', + schema: 
LegalMoveBenchmarkRunSchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-position-set.stream.ts b/api/steps/chess/streams/00-position-set.stream.ts new file mode 100644 index 0000000..2ef12d9 --- /dev/null +++ b/api/steps/chess/streams/00-position-set.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { PositionSetSchema } from '@chessarena/types/legal-move-benchmark' + +export const config: StreamConfig = { + name: 'positionSet', + schema: PositionSetSchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-puzzle-benchmark-summary.stream.ts b/api/steps/chess/streams/00-puzzle-benchmark-summary.stream.ts new file mode 100644 index 0000000..851b4b5 --- /dev/null +++ b/api/steps/chess/streams/00-puzzle-benchmark-summary.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { PuzzleBenchmarkSummarySchema } from '@chessarena/types/puzzle-benchmark' + +export const config: StreamConfig = { + name: 'puzzleBenchmarkSummary', + schema: PuzzleBenchmarkSummarySchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-puzzle-benchmark.stream.ts b/api/steps/chess/streams/00-puzzle-benchmark.stream.ts new file mode 100644 index 0000000..99c17ec --- /dev/null +++ b/api/steps/chess/streams/00-puzzle-benchmark.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { PuzzleBenchmarkRunSchema } from '@chessarena/types/puzzle-benchmark' + +export const config: StreamConfig = { + name: 'puzzleBenchmark', + schema: PuzzleBenchmarkRunSchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-puzzle-set.stream.ts b/api/steps/chess/streams/00-puzzle-set.stream.ts new file mode 100644 index 0000000..aa50438 --- /dev/null +++ b/api/steps/chess/streams/00-puzzle-set.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { PuzzleSetSchema } from 
'@chessarena/types/puzzle-benchmark' + +export const config: StreamConfig = { + name: 'puzzleSet', + schema: PuzzleSetSchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-stockfish-benchmark-summary.stream.ts b/api/steps/chess/streams/00-stockfish-benchmark-summary.stream.ts new file mode 100644 index 0000000..3a5f228 --- /dev/null +++ b/api/steps/chess/streams/00-stockfish-benchmark-summary.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { StockfishBenchmarkSummarySchema } from '@chessarena/types/stockfish-benchmark' + +export const config: StreamConfig = { + name: 'stockfishBenchmarkSummary', + schema: StockfishBenchmarkSummarySchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/steps/chess/streams/00-stockfish-benchmark.stream.ts b/api/steps/chess/streams/00-stockfish-benchmark.stream.ts new file mode 100644 index 0000000..09ff72e --- /dev/null +++ b/api/steps/chess/streams/00-stockfish-benchmark.stream.ts @@ -0,0 +1,8 @@ +import { StreamConfig } from 'motia' +import { StockfishBenchmarkRunSchema } from '@chessarena/types/stockfish-benchmark' + +export const config: StreamConfig = { + name: 'stockfishBenchmark', + schema: StockfishBenchmarkRunSchema, + baseConfig: { storageType: 'default' }, +} diff --git a/api/types.d.ts b/api/types.d.ts index 86f9629..f2ba9c2 100644 --- a/api/types.d.ts +++ b/api/types.d.ts @@ -8,26 +8,52 @@ import { EventHandler, ApiRouteHandler, ApiResponse, MotiaStream, CronHandler } declare module 'motia' { interface FlowContextStateStreams { + 'stockfishBenchmark': MotiaStream<{ id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; stockfishLevel: number; gameAsWhite?: { id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; aiColor: 'white' | 
'black'; stockfishLevel: number; result?: 'ai_win' | 'stockfish_win' | 'draw' | 'ai_illegal_move' | 'timeout'; resultReason?: string; moves: Array<{ moveNumber: number; player: 'white' | 'black'; moveSan: string; fen: string; centipawnScore?: number; bestMove?: string; centipawnLoss?: number; isAiMove: boolean; responseTime?: number; error?: string }>; totalMoves: number; finalFen?: string; pgn?: string; aiMoveCount?: number; totalCentipawnLoss?: number; averageCentipawnLoss?: number; blunders?: number; mistakes?: number; inaccuracies?: number }; gameAsBlack?: unknown; gamesPlayed: number; wins: number; losses: number; draws: number; overallAcpl?: number }> + 'stockfishBenchmarkSummary': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageAcpl: number; bestAcpl: number; wins: number; losses: number; draws: number; lastRunAt: number }> + 'puzzleSet': MotiaStream<{ id: string; theme: 'mateIn1' | 'oneMove'; createdAt: number; puzzles: Array<{ id: string; rating: number; themes: Array; solution: Array; initialPly: number; pgn: string; fen: string; legalMoves: Array; solutionSan: string }>; count: number }> + 'puzzleBenchmark': MotiaStream<{ id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; puzzleSetId: string; theme: 'mateIn1' | 'oneMove'; results: Array<{ puzzleId: string; modelMove?: string; correctMove: string; isCorrect: boolean; responseTime: number; rawResponse: string; error?: string }>; totalPuzzles: number; correctCount?: number; accuracy?: number }> + 'puzzleBenchmarkSummary': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; mateIn1Accuracy?: number; oneMoveAccuracy?: number; overallAccuracy?: number; runsCompleted: number; lastRunAt: number }> + 'positionSet': MotiaStream<{ id: string; createdAt: number; count: number; 
positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array; legalMoveCount: number; moveNumber: number }> }> + 'legalMoveBenchmark': MotiaStream<{ id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array; legalMoveCount: number; moveNumber: number }>; results: Array<{ positionId: string; modelMoves: Array; correctMoves: Array; illegalMoves: Array; missedMoves: Array; accuracy: number; penalty: number; finalScore: number; responseTime: number; rawResponse: string; error?: string }>; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number }> + 'legalMoveBenchmarkSummary': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageScore: number; bestScore: number; worstScore: number; lastRunAt: number }> 'chessSidechatMessage': MotiaStream<{ message: string; sender: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number }> 'chessLiveAiGames': MotiaStream<{ id: string; gameId: string; players: { white: { provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string }; black: { provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string } }; createdAt: string }> 'chessLeaderboard': MotiaStream<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; gamesPlayed: number; victories: number; checkmates: number; draws: number; endedEarly: number; illegalMoves: number; sumCentipawnScores: number; sumHighestSwing: number }> - 'chessGame': MotiaStream<{ id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array; 
lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } } }> + 'chessGame': MotiaStream<{ id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; 
captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; createdAt?: number }> 'chessGameMove': MotiaStream<{ color: 'white' | 'black'; fenBefore: string; fenAfter: string; lastMove: Array; check: boolean; evaluation?: { centipawnScore: number; bestMove: string; evaluationSwing: number; blunder: boolean } }> 'chessGameMessage': MotiaStream<{ id: string; message: string; sender: string; profilePic?: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number; moveSan?: string; isIllegalMove?: boolean }> + 'chessGameHistory': MotiaStream<{ id: string; startedAt: number; endedAt: number; duration: number; whitePlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; blackPlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; status: 'completed' | 'draw' | 'endedEarly'; winner?: 'white' | 'black'; endGameReason?: string; variant?: 'guided' | 'unguided'; totalMoves: number; whiteIllegalMoves: number; blackIllegalMoves: number; finalFen: string; moves: Array<{ color: 'white' | 'black'; fenBefore: string; fenAfter: string; lastMove: Array; check: boolean; evaluation?: { centipawnScore: number; bestMove: string; evaluationSwing: number; blunder: boolean } }>; messages: Array<{ id: string; message: string; sender: string; profilePic?: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number; moveSan?: string; isIllegalMove?: boolean }>; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: 
string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; pgn?: string }> } interface Handlers { + 'PlayVsAI': ApiRouteHandler<{ playerColor?: 'white' | 'black' | 'random' }, ApiResponse<200, { game: { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; createdAt?: number }; opponent: { provider: string; model: string; tier: string }; playerColor: 'white' | 'black' }> | ApiResponse<401, { message: string }>, { topic: 'chess-game-created'; 
data: { gameId: string; fenBefore: string } }> + 'GetGameHistoryDetail': ApiRouteHandler, ApiResponse<200, { id: string; startedAt: number; endedAt: number; duration: number; whitePlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; blackPlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; status: 'completed' | 'draw' | 'endedEarly'; winner?: 'white' | 'black'; endGameReason?: string; variant?: 'guided' | 'unguided'; totalMoves: number; whiteIllegalMoves: number; blackIllegalMoves: number; finalFen: string; moves: Array<{ color: 'white' | 'black'; fenBefore: string; fenAfter: string; lastMove: Array; check: boolean; evaluation?: { centipawnScore: number; bestMove: string; evaluationSwing: number; blunder: boolean } }>; messages: Array<{ id: string; message: string; sender: string; profilePic?: string; role: 'white' | 'black' | 'spectator' | 'root'; timestamp: number; moveSan?: string; isIllegalMove?: boolean }>; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; pgn?: string }> | ApiResponse<404, { message: string }>, never> + 'ExportGameHistory': ApiRouteHandler, ApiResponse<200, unknown>, never> + 'GetGameHistory': ApiRouteHandler, ApiResponse<200, { games: Array<{ id: string; startedAt: number; endedAt: number; duration: number; whitePlayer: { provider?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; blackPlayer: { provider?: 'openai' | 
'gemini' | 'claude' | 'grok'; model?: string; isHuman: boolean }; status: 'completed' | 'draw' | 'endedEarly'; winner?: 'white' | 'black'; endGameReason?: string; variant?: 'guided' | 'unguided'; totalMoves: number; whiteIllegalMoves: number; blackIllegalMoves: number; finalFen: string; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; pgn?: string }>; total: number; limit: number; offset: number }>, never> 'PurgeStuckGames': CronHandler 'GameEnded': EventHandler<{ gameId: string }, never> 'SendMessage': ApiRouteHandler<{ message: string; name: string; role: 'white' | 'black' | 'spectator' | 'root' }, ApiResponse<200, { message: string; sender: string; timestamp: number }> | ApiResponse<404, { message: string }>, never> 'AI_Player': EventHandler<{ player: 'white' | 'black'; fenBefore: string; fen: string; lastMove?: Array; check: boolean; gameId: string }, { topic: 'chess-game-moved'; data: { gameId: string; fenBefore: string } } | { topic: 'chess-game-ended'; data: { gameId: string } } | { topic: 'evaluate-player-move'; data: { fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string } }> 'ChessGameMoved': EventHandler<{ gameId: string; fenBefore: string }, { topic: 'ai-move'; data: { player: 'white' | 'black'; fenBefore: string; fen: string; lastMove?: Array; check: boolean; gameId: string } }> - 'MovePiece': ApiRouteHandler<{ moveSan: string }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' 
| 'completed' | 'draw' | 'endedEarly'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } } }> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, { topic: 'chess-game-moved'; data: { gameId: string; fenBefore: string } } | { topic: 'chess-game-ended'; data: { gameId: string } } | { topic: 'evaluate-player-move'; data: { fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string } }> - 'GetGame': ApiRouteHandler<{}, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: 
number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; role: 'white' | 'black' | 'spectator' | 'root'; username: string; passwords?: { root: string; white: string; black: string } }> | ApiResponse<404, { message: string }>, never> - 'CreateGame': ApiRouteHandler<{ players: { white: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string }; black: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string } } }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; 
illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } } }> | ApiResponse<400, { message: string; errors: Array<{ message: string }> }> | ApiResponse<401, { message: string }>, { topic: 'chess-game-created'; data: { gameId: string; fenBefore: string } }> + 'MovePiece': ApiRouteHandler<{ moveSan: string }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; createdAt?: 
number }> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, { topic: 'chess-game-moved'; data: { gameId: string; fenBefore: string } } | { topic: 'chess-game-ended'; data: { gameId: string } } | { topic: 'evaluate-player-move'; data: { fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string } }> + 'GetGame': ApiRouteHandler<{}, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; createdAt?: number; role: 'white' | 'black' | 'spectator' | 'root'; username: string; passwords?: { root: string; white: string; black: string } }> | ApiResponse<404, { message: string }>, never> + 'CreateGame': ApiRouteHandler<{ players: { white: { ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string }; black: 
{ ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string } }; variant?: 'guided' | 'unguided' }, ApiResponse<200, { id: string; fen: string; turn: 'white' | 'black'; status: 'pending' | 'completed' | 'draw' | 'endedEarly'; variant?: 'guided' | 'unguided'; lastMove?: Array; lastMoveSan?: string; winner?: 'white' | 'black'; turns?: number; endGameReason?: string; players: { white: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number }; black: { userId?: string; ai?: 'openai' | 'gemini' | 'claude' | 'grok'; model?: string; illegalMoveAttempts?: number; totalMoves?: number; captures?: Array<{ piece: string; score: number }>; promotions?: number } }; check: boolean; scoreboard?: { white: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; black: { averageSwing: number; highestSwing: number; finalCentipawnScore: number; blunders: number; illegalMoveAttempts: number; captures: Array<{ piece: string; score: number }>; promotions: number; checks: number }; totalMoves: number; decisiveMoment?: { moveNumber: number; evaluationSwing: number; move: Array; fen: string } }; createdAt?: number }> | ApiResponse<400, { message: string; errors: Array<{ message: string }> }> | ApiResponse<401, { message: string }>, { topic: 'chess-game-created'; data: { gameId: string; fenBefore: string } }> 'AvailableModels': ApiRouteHandler<{}, ApiResponse<200, { models: { openai: Array; gemini: Array; claude: Array; grok: Array } }> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, never> 'RequestAccess': ApiRouteHandler, ApiResponse<200, {}> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, never> 'AcceptRequestAccess': 
ApiRouteHandler<{ userId: string }, ApiResponse<200, {}> | ApiResponse<400, { message: string }> | ApiResponse<404, { message: string }>, never> + 'GetStockfishLeaderboard': ApiRouteHandler, ApiResponse<200, { leaderboard: Array<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageAcpl: number; bestAcpl: number; wins: number; losses: number; draws: number; lastRunAt: number }> }>, never> + 'RunStockfishBenchmark': ApiRouteHandler<{ provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; stockfishLevel?: number }, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; stockfishLevel: number; gameAsWhite?: { id: string; createdAt: number; completedAt?: number; status: 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; aiColor: 'white' | 'black'; stockfishLevel: number; result?: 'ai_win' | 'stockfish_win' | 'draw' | 'ai_illegal_move' | 'timeout'; resultReason?: string; moves: Array<{ moveNumber: number; player: 'white' | 'black'; moveSan: string; fen: string; centipawnScore?: number; bestMove?: string; centipawnLoss?: number; isAiMove: boolean; responseTime?: number; error?: string }>; totalMoves: number; finalFen?: string; pgn?: string; aiMoveCount?: number; totalCentipawnLoss?: number; averageCentipawnLoss?: number; blunders?: number; mistakes?: number; inaccuracies?: number }; gameAsBlack?: unknown; gamesPlayed: number; wins: number; losses: number; draws: number; overallAcpl?: number }> | ApiResponse<400, { message: string }>, never> + 'RunAllBenchmarks': ApiRouteHandler<{ positionCount?: number; force?: boolean; rerunCompleted?: boolean }, ApiResponse<200, { message: string; positionCount: number; totalModels: number }> | ApiResponse<400, { message: string }>, never> + 'GetPuzzleSets': ApiRouteHandler, ApiResponse<200, { 
sets: Array<{ id: string; theme: 'mateIn1' | 'oneMove'; createdAt: number; count: number; puzzleCount: number }> }>, never> + 'GetPuzzleLeaderboard': ApiRouteHandler, ApiResponse<200, { leaderboard: Array<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; mateIn1Accuracy?: number; oneMoveAccuracy?: number; overallAccuracy?: number; runsCompleted: number; lastRunAt: number }> }>, never> + 'RunAllPuzzleBenchmarks': ApiRouteHandler<{ theme: 'mateIn1' | 'oneMove'; count?: number; rerunCompleted?: boolean }, ApiResponse<200, { message: string; theme: 'mateIn1' | 'oneMove'; puzzleCount: number; totalModels: number }> | ApiResponse<400, { message: string }>, never> + 'RunPuzzleBenchmark': ApiRouteHandler<{ provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; theme: 'mateIn1' | 'oneMove'; count?: number }, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; puzzleSetId: string; theme: 'mateIn1' | 'oneMove'; results: Array<{ puzzleId: string; modelMove?: string; correctMove: string; isCorrect: boolean; responseTime: number; rawResponse: string; error?: string }>; totalPuzzles: number; correctCount?: number; accuracy?: number }> | ApiResponse<400, { message: string }>, never> + 'FetchPuzzleSet': ApiRouteHandler<{ theme: 'mateIn1' | 'oneMove'; count?: number }, ApiResponse<200, { id: string; theme: 'mateIn1' | 'oneMove'; createdAt: number; puzzles: Array<{ id: string; rating: number; themes: Array; solution: Array; initialPly: number; pgn: string; fen: string; legalMoves: Array; solutionSan: string }>; count: number }> | ApiResponse<400, { message: string }>, never> + 'GetBenchmarkLeaderboard': ApiRouteHandler, ApiResponse<200, { leaderboard: Array<{ id: string; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; runsCompleted: number; averageScore: number; bestScore: number; 
worstScore: number; lastRunAt: number }> }>, never> + 'GetBenchmarkRunDetail': ApiRouteHandler, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array; legalMoveCount: number; moveNumber: number }>; results: Array<{ positionId: string; modelMoves: Array; correctMoves: Array; illegalMoves: Array; missedMoves: Array; accuracy: number; penalty: number; finalScore: number; responseTime: number; rawResponse: string; error?: string }>; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number }> | ApiResponse<404, { message: string }>, never> + 'GetBenchmarkRuns': ApiRouteHandler, ApiResponse<200, { runs: Array<{ id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number; resultsCount: number }>; total: number }>, never> + 'RunLegalMoveBenchmark': ApiRouteHandler<{ provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string }, ApiResponse<200, { id: string; createdAt: number; completedAt?: number; status: 'pending' | 'running' | 'completed' | 'failed'; provider: 'openai' | 'gemini' | 'claude' | 'grok'; model: string; positionCount: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array; legalMoveCount: number; moveNumber: number }>; results: Array<{ positionId: string; modelMoves: Array; correctMoves: Array; illegalMoves: Array; missedMoves: Array; accuracy: 
number; penalty: number; finalScore: number; responseTime: number; rawResponse: string; error?: string }>; averageAccuracy?: number; averagePenalty?: number; averageFinalScore?: number; totalCorrectMoves?: number; totalIllegalMoves?: number; totalMissedMoves?: number }> | ApiResponse<400, { message: string }>, never> + 'GeneratePositionSet': ApiRouteHandler<{ count?: number; force?: boolean }, ApiResponse<200, { id: string; createdAt: number; count: number; positions: Array<{ id: string; fen: string; pgn: string; turn: 'white' | 'black'; legalMoves: Array; legalMoveCount: number; moveNumber: number }> }> | ApiResponse<400, { message: string }>, never> 'GetUser': ApiRouteHandler, ApiResponse<200, { id: string; name: string; profilePic: string }> | ApiResponse<404, { message: string }>, never> 'Auth': ApiRouteHandler<{ authToken: string }, ApiResponse<200, { accessToken: string; user: { id: string; name: string; profilePic: string; email: string } }> | ApiResponse<401, { error: string }> | ApiResponse<500, { error: string }>, never> 'EvaluatePlayerMove': EventHandler<{ fenBefore: string; fenAfter: string; gameId: string; moveId: string; player: string }, never> diff --git a/app/package.json b/app/package.json index ef20cb0..c279d60 100644 --- a/app/package.json +++ b/app/package.json @@ -35,6 +35,7 @@ "react-markdown": "^10.1.0", "react-router": "^7.6.2", "react-syntax-highlighter": "^15.6.1", + "recharts": "^3.6.0", "sonner": "^2.0.5", "tailwind-merge": "^3.3.0", "tailwindcss": "^4.1.7", diff --git a/app/src/App.tsx b/app/src/App.tsx index feafd4b..9ecaec0 100644 --- a/app/src/App.tsx +++ b/app/src/App.tsx @@ -11,6 +11,12 @@ import { AboutPage } from './pages/about-page' import { LoginPage } from './pages/login-page' import { AuthProvider } from './components/auth/auth-provider' import { PrivacyPage } from './pages/privacy-page' +import { MethodologyPage } from './pages/methodology-page' +import { GameHistoryPage } from './pages/game-history-page' +import { 
GameReplayPage } from './pages/game-replay-page' +import { PlayAIPage } from './pages/play-ai-page' +import { BenchPage } from './pages/bench-page' +import { ArenaPage } from './pages/arena-page' function App() { return ( @@ -25,6 +31,12 @@ function App() { } /> } /> } /> + } /> + } /> + } /> + } /> + } /> + } /> } /> diff --git a/app/src/components/bench/bench-bar-charts.tsx b/app/src/components/bench/bench-bar-charts.tsx new file mode 100644 index 0000000..7ec4320 --- /dev/null +++ b/app/src/components/bench/bench-bar-charts.tsx @@ -0,0 +1,641 @@ +import React, { useMemo } from 'react' +import { + ResponsiveContainer, + BarChart, + Bar, + CartesianGrid, + XAxis, + YAxis, + Tooltip, + Cell, + Legend, + ScatterChart, + Scatter, + ReferenceLine, +} from 'recharts' +import { Filter } from 'lucide-react' +import { cn } from '@/lib/utils' +import type { BenchModelRow } from './bench-mock' +import { getPricingMap } from './model-pricing' + +type Metric = 'motiaChessIndex' | 'legalMoveScore' | 'puzzleScore' | 'acplScore' | 'legalVsIllegal' + +// Type for chart tooltip payload data +interface ChartTooltipData { + provider: string + label: string + value?: number + legal?: number + illegal?: number + rawAcpl?: number +} + +export const providerColors: Record = { + openai: '#10b981', + claude: '#a78bfa', + gemini: '#3b82f6', + grok: '#f59e0b', +} + +const toAcplScore = (acpl: number) => { + return Math.max(0, Math.min(100, 100 - acpl)) +} + +const metricLabel = (metric: Metric) => { + switch (metric) { + case 'motiaChessIndex': + return 'Motia Index' + case 'legalMoveScore': + return 'Legal move score' + case 'puzzleScore': + return 'Puzzle accuracy' + case 'acplScore': + return 'ACPL score (inverted)' + case 'legalVsIllegal': + return 'Legal vs illegal' + } +} + +const metricHowToRead = (metric: Metric) => { + switch (metric) { + case 'motiaChessIndex': + return 'Higher is better.' + case 'legalMoveScore': + return 'Higher is better.' 
+ case 'puzzleScore': + return 'Higher is better.' + case 'acplScore': + return 'Bars are 100 - ACPL. Higher is better.' + case 'legalVsIllegal': + return 'Legal % vs the remainder.' + } +} + +// SVG Patterns for hatched bars +const HatchPatterns = () => ( + + + {Object.entries(providerColors).map(([provider, color]) => ( + + + + + ))} + + +) + +type Props = { + title: string + description?: string + rows: BenchModelRow[] + metric: Metric + className?: string + unit?: string + topN?: number + hiddenModels?: Set + showExpand?: boolean + expanded?: boolean + onExpandToggle?: () => void + layout?: 'horizontal' | 'vertical' // Added layout prop +} + +export const BenchBarChart: React.FC = ({ + title, + description, + rows, + metric, + className, + unit, + topN, + hiddenModels, + showExpand, + expanded, + onExpandToggle, + layout = 'vertical', // Default to vertical (horizontal bars) +}) => { + const data = useMemo(() => { + const filtered = hiddenModels ? rows.filter((r) => !hiddenModels.has(r.id)) : rows + + if (metric === 'legalVsIllegal') { + return filtered + .map((r) => ({ + id: r.id, + label: r.model, + provider: r.provider, + legal: r.legalMoveScore, + illegal: Math.max(0, 100 - r.legalMoveScore), + value: r.legalMoveScore, + lastUpdatedAt: r.lastUpdatedAt, + })) + .sort((a, b) => b.value - a.value) + .slice(0, expanded ? 100 : (topN || 100)) + } + + const mapped = filtered.map((r) => { + const value = + metric === 'acplScore' + ? toAcplScore(r.acpl) + : metric === 'motiaChessIndex' + ? r.motiaChessIndex + : metric === 'legalMoveScore' + ? r.legalMoveScore + : r.puzzleScore + return { + id: r.id, + label: r.model, + provider: r.provider, + value, + rawAcpl: r.acpl, + lastUpdatedAt: r.lastUpdatedAt, + } + }) + + return mapped.sort((a, b) => b.value - a.value).slice(0, expanded ? 100 : (topN || 100)) + }, [rows, metric, topN, hiddenModels, expanded]) + + const totalCount = useMemo(() => { + const filtered = hiddenModels ? 
rows.filter((r) => !hiddenModels.has(r.id)) : rows + return filtered.length + }, [rows, hiddenModels]) + + const isStacked = metric === 'legalVsIllegal' + const isVertical = layout === 'horizontal' // Horizontal layout means vertical bars + const chartHeight = isVertical ? 400 : Math.max(320, data.length * 32) + + const axisText = 'rgba(255,255,255,0.72)' + const axisMuted = 'rgba(255,255,255,0.55)' + const axisLine = 'rgba(255,255,255,0.10)' + + return ( +
+ + + {/* Header */} +
+
+
+

{title}

+ {description && ( +

{description}

+ )} +
+ {showExpand && totalCount > (topN || 10) && ( + + )} +
+
+ + {/* Chart */} +
+ + + + + {isVertical ? ( + <> + + + + ) : ( + <> + + + + )} + + { + if (!active || !payload || payload.length === 0) return null + const p = payload[0].payload as ChartTooltipData + const provider = String(p.provider ?? '') + const model = String(p.label ?? '') + + const value = typeof p.value === 'number' ? p.value : undefined + const legal = typeof p.legal === 'number' ? p.legal : undefined + const illegal = typeof p.illegal === 'number' ? p.illegal : undefined + + return ( +
+
+
{model}
+
+ {provider} +
+
+ +
+
+ {metricLabel(metric)} + {metric === 'legalVsIllegal' ? ( + {Number(legal ?? 0).toFixed(1)}% + ) : metric === 'acplScore' ? ( + + {Number(value ?? 0).toFixed(1)} + {unit ?? ''} + + ) : ( + + {Number(value ?? 0).toFixed(1)} + {unit ?? ''} + + )} +
+ + {metric === 'acplScore' && ( +
+ ACPL (raw) + {typeof p.rawAcpl === 'number' ? p.rawAcpl.toFixed(1) : '-'} +
+ )} + + {metric === 'legalVsIllegal' && ( + <> +
+ Legal + {Number(legal ?? 0).toFixed(1)}% +
+
+ Illegal / missed + {Number(illegal ?? 0).toFixed(1)}% +
+ + )} + +
+ {metricHowToRead(metric)} +
+
+
+ ) + }} + /> + {isStacked ? ( + <> + + + + + ) : ( + + {data.map((d) => ( + + ))} + + )} +
+
+
+ + {metric === 'acplScore' && ( +
+ * Bars represent inverted score (100 - ACPL). Longer bar = Better play. +
+ )} +
+ ) +} + +// ---------------------------------------------------------------------- +// Cost vs Performance Scatter Plot +// ---------------------------------------------------------------------- + +type CostChartProps = { + title: string + description?: string + rows: BenchModelRow[] + className?: string + hiddenModels?: Set +} + +export const CostVsPerformanceChart: React.FC = ({ title, description, rows, className, hiddenModels }) => { + const pricingMap = useMemo(() => getPricingMap(), []) + + const data = useMemo(() => { + const filtered = hiddenModels ? rows.filter((r) => !hiddenModels.has(r.id)) : rows + return filtered + .map((r) => { + const pricing = pricingMap.get(r.id) + return { + id: r.id, + x: pricing?.avgPrice ?? 10, // Cost (USD per 1M tokens) + y: r.motiaChessIndex, + provider: r.provider, + label: r.model, + inputPrice: pricing?.inputPrice ?? 0, + outputPrice: pricing?.outputPrice ?? 0, + } + }) + .filter((d) => d.x > 0) + }, [rows, hiddenModels, pricingMap]) + + // Calculate average performance for reference line + const avgPerformance = useMemo(() => { + if (data.length === 0) return 50 + return data.reduce((sum, d) => sum + d.y, 0) / data.length + }, [data]) + + return ( +
+ {/* Header */} +
+
+
+

{title}

+ {description && ( +

{description}

+ )} +
+
+ + Top Left = Best Value +
+
+
+ + {/* Chart */} +
+ + + + `$${v}`} + axisLine={{ stroke: 'rgba(255,255,255,0.10)' }} + tickLine={false} + /> + + + { + if (active && payload && payload.length) { + const d = payload[0].payload + return ( +
+
{d.label}
+
+
+ Motia Index + {d.y} +
+
+ Input + ${d.inputPrice}/1M +
+
+ Output + ${d.outputPrice}/1M +
+
+ Provider + {d.provider} +
+
+
+ ) + } + return null + }} + /> + + {data.map((entry, index) => ( + + ))} + +
+
+
+ + {/* Legend */} +
+ {Object.entries(providerColors).map(([provider, color]) => ( +
+
+ {provider} +
+ ))} +
+
+ ) +} + +// ---------------------------------------------------------------------- +// Model Filter Component +// ---------------------------------------------------------------------- + +type ModelFilterProps = { + rows: BenchModelRow[] + hiddenModels: Set + onToggle: (id: string) => void + onShowAll: () => void +} + +export const ModelFilter: React.FC = ({ rows, hiddenModels, onToggle, onShowAll }) => { + const providers = useMemo(() => { + const grouped: Record = {} + rows.forEach((r) => { + if (!grouped[r.provider]) grouped[r.provider] = [] + grouped[r.provider].push(r) + }) + return grouped + }, [rows]) + + return ( +
+
+
+
+ +
+
+

Model Selection

+

Toggle models to compare performance

+
+
+ +
+ +
+ {Object.entries(providers).map(([provider, models]) => ( +
+
+
+ {provider} + + {models.filter((m) => !hiddenModels.has(m.id)).length}/{models.length} + +
+
+ {models.map((m) => ( + + ))} +
+
+ ))} +
+
+ ) +} + + diff --git a/app/src/components/bench/bench-charts.tsx b/app/src/components/bench/bench-charts.tsx new file mode 100644 index 0000000..52dd710 --- /dev/null +++ b/app/src/components/bench/bench-charts.tsx @@ -0,0 +1,59 @@ +import React, { useMemo } from 'react' +import { cn } from '@/lib/utils' +import type { BenchTimeseriesPoint } from './bench-mock' +import { + ResponsiveContainer, + AreaChart, + Area, + XAxis, + YAxis, + Tooltip, + CartesianGrid, +} from 'recharts' + +type MiniAreaProps = { + points: BenchTimeseriesPoint[] + className?: string + stroke?: string + height?: number +} + +export const MiniArea: React.FC = ({ points, className, stroke = '#34d399', height = 64 }) => { + const data = useMemo(() => points.map((p) => ({ t: p.t, v: p.v })), [points]) + + return ( +
+ + + + + + + + + + + + new Date(Number(t)).toLocaleDateString('en-US', { month: 'short', day: 'numeric' })} + formatter={(v) => [Number(v).toFixed(1), '']} + /> + + + +
+ ) +} diff --git a/app/src/components/bench/bench-mock.ts b/app/src/components/bench/bench-mock.ts new file mode 100644 index 0000000..22e5d72 --- /dev/null +++ b/app/src/components/bench/bench-mock.ts @@ -0,0 +1,245 @@ +const NOW = Date.now() + +const clamp = (v: number, min: number, max: number) => Math.max(min, Math.min(max, v)) + +const hash01 = (s: string) => { + let h = 2166136261 + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i) + h = Math.imul(h, 16777619) + } + return ((h >>> 0) % 10_000) / 10_000 +} + +const seeded = (seedStr: string) => { + let t = Math.floor(hash01(seedStr) * 0xffffffff) >>> 0 + return () => { + t += 0x6d2b79f5 + let x = t + x = Math.imul(x ^ (x >>> 15), x | 1) + x ^= x + Math.imul(x ^ (x >>> 7), x | 61) + return ((x ^ (x >>> 14)) >>> 0) / 4294967296 + } +} + +const normalish = (rand: () => number) => { + const u = rand() || 1e-9 + const v = rand() || 1e-9 + return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v) +} + +const legalMoveSummary: Record = { + 'grok:grok-4-fast-non-reasoning': { averageScore: 28.38477029006998, lastRunAt: 1766157466671 }, + 'grok:grok-4-fast-reasoning': { averageScore: 95.0778179689851, lastRunAt: 1766158456823 }, + 'gemini:gemini-3-pro-preview': { averageScore: 95.71476876793383, lastRunAt: 1766158866596 }, + 'openai:gpt-5.2': { averageScore: 94.47529713518263, lastRunAt: 1766159361494 }, + 'gemini:gemini-2.5-pro': { averageScore: 62.0246628219507, lastRunAt: 1766159574654 }, + 'claude:claude-opus-4-5': { averageScore: 87.51753246753248, lastRunAt: 1766160119644 }, + 'gemini:gemini-2.5-flash': { averageScore: 50.682398740759965, lastRunAt: 1766161062713 }, + 'gemini:gemini-2.5-flash-lite': { averageScore: 30.926490100647793, lastRunAt: 1766161726694 }, + 'gemini:gemini-2.0-flash': { averageScore: 57.6118913255696, lastRunAt: 1766161760105 }, +} + +const inferredLegalMoveScore: Record = { + 'openai:gpt-5.1': 92.8, + 'openai:gpt-5': 90.1, + 'openai:gpt-5-mini': 83.8, + 'openai:gpt-4.1': 
86.2, + 'openai:gpt-4.1-mini': 78.4, + 'openai:gpt-4o': 76.3, + 'openai:gpt-4o-mini': 67.6, + + 'claude:claude-sonnet-4-5': 83.6, + 'claude:claude-haiku-4-5': 74.9, + 'claude:claude-opus-4-0': 85.2, + 'claude:claude-sonnet-4-0': 80.3, + 'claude:claude-3-7-sonnet-latest': 73.4, + 'claude:claude-3-5-haiku-latest': 65.9, + + 'grok:grok-4': 92.0, + 'grok:grok-3': 69.7, + 'grok:grok-3-fast': 55.1, +} + +const allModels: Array<{ provider: string; model: string }> = [ + ...[ + 'gpt-5.2', + 'gpt-5.1', + 'gpt-5', + 'gpt-5-mini', + 'gpt-4.1', + 'gpt-4.1-mini', + 'gpt-4o', + 'gpt-4o-mini', + ].map((model) => ({ provider: 'openai', model })), + ...['gemini-3-pro-preview', 'gemini-2.5-pro', 'gemini-2.5-flash', 'gemini-2.5-flash-lite', 'gemini-2.0-flash'].map( + (model) => ({ provider: 'gemini', model }), + ), + ...[ + 'claude-opus-4-5', + 'claude-sonnet-4-5', + 'claude-haiku-4-5', + 'claude-opus-4-0', + 'claude-sonnet-4-0', + 'claude-3-7-sonnet-latest', + 'claude-3-5-haiku-latest', + ].map((model) => ({ provider: 'claude', model })), + ...['grok-4-fast-non-reasoning', 'grok-4-fast-reasoning', 'grok-4', 'grok-3', 'grok-3-fast'].map((model) => ({ + provider: 'grok', + model, + })), +] + +const computeRow = (provider: string, model: string): BenchModelRow => { + const id = `${provider}:${model}` + const summary = legalMoveSummary[id] + + const rand = seeded(id) + const n1 = normalish(rand) + const n2 = normalish(rand) + const n3 = normalish(rand) + + const baseLegal = summary?.averageScore ?? inferredLegalMoveScore[id] ?? 50 + + const legalSigma = provider === 'grok' ? 4.5 : provider === 'gemini' ? 4.0 : provider === 'claude' ? 4.2 : 3.6 + const legalMoveScore = clamp(baseLegal + n1 * legalSigma, 2, 99.8) + + const providerPuzzleBias = provider === 'openai' ? 3.0 : provider === 'claude' ? 1.5 : provider === 'gemini' ? 0.0 : -2.0 + const modelPuzzleSkew = + model.includes('flash-lite') ? -8 : + model.includes('flash') ? -3 : + model.includes('mini') ? 
-4 : + model.includes('non-reasoning') ? -10 : + model.includes('reasoning') ? 4 : + model.includes('opus') ? 4 : + model.includes('pro') ? 3 : 0 + + const puzzleScore = clamp(legalMoveScore * (0.78 + 0.08 * rand()) + providerPuzzleBias + modelPuzzleSkew + n2 * 7, 8, 98) + + const providerQualityBias = provider === 'openai' ? -6 : provider === 'claude' ? -3 : provider === 'gemini' ? 1 : 4 + const modelQualitySkew = + model.includes('opus') || model.includes('pro') ? -8 : + model.includes('mini') || model.includes('flash') ? 6 : + model.includes('non-reasoning') ? 18 : 0 + + const composite = 0.58 * puzzleScore + 0.42 * legalMoveScore + const acpl = clamp(112 - composite + providerQualityBias + modelQualitySkew + n3 * 10 + (rand() - 0.5) * 6, 8, 120) + + const acplScore = clamp(100 - acpl, 0, 100) + const motiaChessIndex = clamp(0.4 * legalMoveScore + 0.3 * puzzleScore + 0.3 * acplScore, 0, 100) + + return { + id, + provider, + model, + motiaChessIndex: Number(motiaChessIndex.toFixed(1)), + legalMoveScore: Number(legalMoveScore.toFixed(1)), + puzzleScore: Number(puzzleScore.toFixed(1)), + acpl: Number(acpl.toFixed(1)), + lastUpdatedAt: summary?.lastRunAt ?? 
NOW, + } +} + +export const mockBenchLeaderboard: BenchModelRow[] = allModels.map(({ provider, model }) => computeRow(provider, model)) + +export type BenchTimeseriesPoint = { t: number; v: number } + +export type BenchModelRow = { + id: string + provider: string + model: string + motiaChessIndex: number + legalMoveScore: number + puzzleScore: number + acpl: number + lastUpdatedAt: number +} + +export const mockBenchTimeseries = { + legalMoveScore: [ + { t: Date.now() - 6 * 86400000, v: 62 }, + { t: Date.now() - 5 * 86400000, v: 64 }, + { t: Date.now() - 4 * 86400000, v: 66 }, + { t: Date.now() - 3 * 86400000, v: 68 }, + { t: Date.now() - 2 * 86400000, v: 70 }, + { t: Date.now() - 1 * 86400000, v: 72 }, + { t: Date.now(), v: 73 }, + ], + puzzleScore: [ + { t: Date.now() - 6 * 86400000, v: 44 }, + { t: Date.now() - 5 * 86400000, v: 47 }, + { t: Date.now() - 4 * 86400000, v: 49 }, + { t: Date.now() - 3 * 86400000, v: 51 }, + { t: Date.now() - 2 * 86400000, v: 52 }, + { t: Date.now() - 1 * 86400000, v: 54 }, + { t: Date.now(), v: 55 }, + ], + acpl: [ + { t: Date.now() - 6 * 86400000, v: 78 }, + { t: Date.now() - 5 * 86400000, v: 75 }, + { t: Date.now() - 4 * 86400000, v: 73 }, + { t: Date.now() - 3 * 86400000, v: 70 }, + { t: Date.now() - 2 * 86400000, v: 68 }, + { t: Date.now() - 1 * 86400000, v: 66 }, + { t: Date.now(), v: 65 }, + ], +} + +export const mockPrompts = { + legalMoveBench: `You are a chess expert. Given the following game, list ALL legal moves available for the current player. + +## Game (PGN) +{{pgn}} + +## Current Position +It is {{turn}}'s turn to move. +FEN: {{fen}} + +## Task +List ALL legal moves for {{turn}} in Standard Algebraic Notation (SAN). + +## Response Format +Return ONLY a JSON object with no additional text: +{ + "moves": ["move1", "move2", "move3", ...] +} +`, + puzzleBench: `You are a chess engine. Solve this puzzle in one move. 
+ +## Position +FEN: {{fen}} +Turn: {{turn}} + +## Game context (PGN) +{{pgn}} + +## Legal moves (SAN) +{{#legalMoves}} +- {{.}} +{{/legalMoves}} + +## Response format (JSON only) +{ + "move": "SAN" +} +`, + aiPlayerGuided: `You are a chess grandmaster playing as {{player}}. + +## Current Position +- FEN: \`{{fen}}\` +{{#inCheck}}- WARNING: You are in check!{{/inCheck}} + +## Valid Moves (Guided) +{{#validMoves}}- {{san}} +{{/validMoves}} + +## Response Format (JSON only) +{ + "thought": "Strategic reasoning", + "moveSan": "Your move in SAN" +} +`, +} + + + diff --git a/app/src/components/bench/model-pricing.ts b/app/src/components/bench/model-pricing.ts new file mode 100644 index 0000000..13f1f8e --- /dev/null +++ b/app/src/components/bench/model-pricing.ts @@ -0,0 +1,58 @@ +// Model pricing data (USD per 1M tokens) +// These are mock prices - update with real pricing data + +export type ModelPricing = { + id: string + provider: string + model: string + inputPrice: number // USD per 1M input tokens + outputPrice: number // USD per 1M output tokens + avgPrice: number // Average (input + output) / 2 for simple comparisons +} + +export const modelPricing: ModelPricing[] = [ + // OpenAI - https://openai.com/api/pricing/ + { id: 'openai:gpt-5.2', provider: 'openai', model: 'gpt-5.2', inputPrice: 15.00, outputPrice: 60.00, avgPrice: 37.50 }, + { id: 'openai:gpt-5.1', provider: 'openai', model: 'gpt-5.1', inputPrice: 12.00, outputPrice: 48.00, avgPrice: 30.00 }, + { id: 'openai:gpt-5', provider: 'openai', model: 'gpt-5', inputPrice: 10.00, outputPrice: 40.00, avgPrice: 25.00 }, + { id: 'openai:gpt-5-mini', provider: 'openai', model: 'gpt-5-mini', inputPrice: 1.50, outputPrice: 6.00, avgPrice: 3.75 }, + { id: 'openai:gpt-4.1', provider: 'openai', model: 'gpt-4.1', inputPrice: 2.00, outputPrice: 8.00, avgPrice: 5.00 }, + { id: 'openai:gpt-4.1-mini', provider: 'openai', model: 'gpt-4.1-mini', inputPrice: 0.40, outputPrice: 1.60, avgPrice: 1.00 }, + { id: 'openai:gpt-4o', 
provider: 'openai', model: 'gpt-4o', inputPrice: 2.50, outputPrice: 10.00, avgPrice: 6.25 }, + { id: 'openai:gpt-4o-mini', provider: 'openai', model: 'gpt-4o-mini', inputPrice: 0.15, outputPrice: 0.60, avgPrice: 0.375 }, + + // Google Gemini - https://ai.google.dev/pricing + { id: 'gemini:gemini-3-pro-preview', provider: 'gemini', model: 'gemini-3-pro-preview', inputPrice: 7.00, outputPrice: 21.00, avgPrice: 14.00 }, + { id: 'gemini:gemini-2.5-pro', provider: 'gemini', model: 'gemini-2.5-pro', inputPrice: 1.25, outputPrice: 5.00, avgPrice: 3.125 }, + { id: 'gemini:gemini-2.5-flash', provider: 'gemini', model: 'gemini-2.5-flash', inputPrice: 0.15, outputPrice: 0.60, avgPrice: 0.375 }, + { id: 'gemini:gemini-2.5-flash-lite', provider: 'gemini', model: 'gemini-2.5-flash-lite', inputPrice: 0.075, outputPrice: 0.30, avgPrice: 0.1875 }, + { id: 'gemini:gemini-2.0-flash', provider: 'gemini', model: 'gemini-2.0-flash', inputPrice: 0.10, outputPrice: 0.40, avgPrice: 0.25 }, + { id: 'gemini:gemini-1.5-pro', provider: 'gemini', model: 'gemini-1.5-pro', inputPrice: 1.25, outputPrice: 5.00, avgPrice: 3.125 }, + + // Anthropic Claude - https://www.anthropic.com/pricing + { id: 'claude:claude-opus-4-5', provider: 'claude', model: 'claude-opus-4-5', inputPrice: 15.00, outputPrice: 75.00, avgPrice: 45.00 }, + { id: 'claude:claude-sonnet-4-5', provider: 'claude', model: 'claude-sonnet-4-5', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 }, + { id: 'claude:claude-haiku-4-5', provider: 'claude', model: 'claude-haiku-4-5', inputPrice: 0.80, outputPrice: 4.00, avgPrice: 2.40 }, + { id: 'claude:claude-opus-4-0', provider: 'claude', model: 'claude-opus-4-0', inputPrice: 15.00, outputPrice: 75.00, avgPrice: 45.00 }, + { id: 'claude:claude-sonnet-4-0', provider: 'claude', model: 'claude-sonnet-4-0', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 }, + { id: 'claude:claude-3-7-sonnet-latest', provider: 'claude', model: 'claude-3-7-sonnet-latest', inputPrice: 3.00, outputPrice: 
15.00, avgPrice: 9.00 }, + + // xAI Grok - https://docs.x.ai/docs/models + { id: 'grok:grok-4-fast-non-reasoning', provider: 'grok', model: 'grok-4-fast-non-reasoning', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 }, + { id: 'grok:grok-4-fast-reasoning', provider: 'grok', model: 'grok-4-fast-reasoning', inputPrice: 5.00, outputPrice: 25.00, avgPrice: 15.00 }, + { id: 'grok:grok-4', provider: 'grok', model: 'grok-4', inputPrice: 10.00, outputPrice: 40.00, avgPrice: 25.00 }, + { id: 'grok:grok-3', provider: 'grok', model: 'grok-3', inputPrice: 3.00, outputPrice: 15.00, avgPrice: 9.00 }, + { id: 'grok:grok-3-fast', provider: 'grok', model: 'grok-3-fast', inputPrice: 1.00, outputPrice: 5.00, avgPrice: 3.00 }, +] + +// Helper to get pricing by model ID +export const getPricing = (id: string): ModelPricing | undefined => { + return modelPricing.find((p) => p.id === id) +} + +// Helper to get all pricing as a map +export const getPricingMap = (): Map => { + return new Map(modelPricing.map((p) => [p.id, p])) +} + + diff --git a/app/src/components/chess/ai-icon.tsx b/app/src/components/chess/ai-icon.tsx index 75d3d4c..a761d78 100644 --- a/app/src/components/chess/ai-icon.tsx +++ b/app/src/components/chess/ai-icon.tsx @@ -7,25 +7,20 @@ const avatarImages: Record, string> = { grok: '/avatars/grok-white.png', } -const OpenAI = ({ color }: { color?: string }) => { - return ( - - - - ) -} - -export const AiIcon = ({ ai, color }: { ai: NonNullable; color?: string }) => { +export const AiIcon = ({ ai, color, size }: { ai: NonNullable; color?: string; size?: number }) => { + const px = size ?? 24 if (ai === 'openai') { return ( -
- +
+ + +
) } - return {ai} + return {ai} } diff --git a/app/src/components/chess/create-game/create-game.tsx b/app/src/components/chess/create-game/create-game.tsx index 17070a4..7e71b4e 100644 --- a/app/src/components/chess/create-game/create-game.tsx +++ b/app/src/components/chess/create-game/create-game.tsx @@ -23,8 +23,11 @@ export const CreateGame: React.FC = ({ onGameCreated, onCancel }) => { try { const game = await createGame({ - white: { ai: whitePlayer.ai, model: whitePlayer.model }, - black: { ai: blackPlayer.ai, model: blackPlayer.model }, + players: { + white: { ai: whitePlayer.ai, model: whitePlayer.model }, + black: { ai: blackPlayer.ai, model: blackPlayer.model }, + }, + variant: 'guided', // Always use guided mode (with legal moves) }) onGameCreated(game.id) @@ -66,13 +69,15 @@ export const CreateGame: React.FC = ({ onGameCreated, onCancel }) => { return (
- +
+ +
) } diff --git a/app/src/components/layout.tsx b/app/src/components/layout.tsx new file mode 100644 index 0000000..8dad448 --- /dev/null +++ b/app/src/components/layout.tsx @@ -0,0 +1,105 @@ +import { cn } from '@/lib/utils' +import { BarChart3, Trophy, BookOpen } from 'lucide-react' + +type LayoutProps = { + children: React.ReactNode + leftPanel?: React.ReactNode +} + +export const Layout = ({ children, leftPanel }: LayoutProps) => { + return ( +
+ {/* Left Panel - Hidden on mobile, blends with background */} +
+ {leftPanel} +
+ + {/* Right Panel - Glassmorphism overlay */} +
+ {children} +
+
+ ) +} + +// Sidebar with tabs for the left panel - designed to blend with bg-image-landing +type SidebarPanelProps = { + activeTab: 'benchmarks' | 'leaderboard' | 'methodology' + onTabChange: (tab: 'benchmarks' | 'leaderboard' | 'methodology') => void + children: React.ReactNode +} + +export const SidebarPanel = ({ activeTab, onTabChange, children }: SidebarPanelProps) => { + return ( +
+ {/* Header with subtle branding */} +
+
+ + Chess Bench +
+ v1.0 +
+ + {/* Tabs - Pill style that blends with the background */} +
+ onTabChange('benchmarks')} + icon={} + label="Benchmarks" + /> + onTabChange('leaderboard')} + icon={} + label="Arena" + /> + onTabChange('methodology')} + icon={} + label="Methodology" + /> +
+ + {/* Tab Content - Semi-transparent panel */} +
+
+ {children} +
+
+ + {/* Footer */} +
+ Powered by + + Motia + +
+
+ ) +} + +// Tab button component +type TabButtonProps = { + active: boolean + onClick: () => void + icon: React.ReactNode + label: string +} + +const TabButton = ({ active, onClick, icon, label }: TabButtonProps) => ( + +) diff --git a/app/src/index.css b/app/src/index.css index 8f65e24..920c29e 100644 --- a/app/src/index.css +++ b/app/src/index.css @@ -167,9 +167,10 @@ body, #root { width: 100dvw; - height: 100dvh; + min-height: 100dvh; padding: 0; - overflow: hidden; + overflow-x: hidden; + overflow-y: auto; } strong { diff --git a/app/src/lib/use-create-game.ts b/app/src/lib/use-create-game.ts index a39d03c..64e0b81 100644 --- a/app/src/lib/use-create-game.ts +++ b/app/src/lib/use-create-game.ts @@ -1,9 +1,15 @@ -import type { Game } from '@chessarena/types/game' +import type { BenchmarkVariant, Game } from '@chessarena/types/game' import { apiClient } from './auth/api-client' import type { Players } from './types' +type CreateGameParams = { + players: Players + variant?: BenchmarkVariant +} + export const useCreateGame = () => { - const createGame = async (players: Players): Promise => apiClient.post('/chess/create-game', { players }) + const createGame = async ({ players, variant = 'guided' }: CreateGameParams): Promise => + apiClient.post('/chess/create-game', { players, variant }) return createGame } diff --git a/app/src/lib/use-game-history.ts b/app/src/lib/use-game-history.ts new file mode 100644 index 0000000..b6397a6 --- /dev/null +++ b/app/src/lib/use-game-history.ts @@ -0,0 +1,82 @@ +import { useCallback, useEffect, useState } from 'react' +import { apiClient } from './auth/api-client' +import type { GameHistory, GameHistoryFilter } from '@chessarena/types/game-history' + +type GameHistoryListItem = Omit + +type GameHistoryResponse = { + games: GameHistoryListItem[] + total: number + limit: number + offset: number +} + +export const useGameHistory = (initialFilter: Partial = {}) => { + const [games, setGames] = useState([]) + const [total, setTotal] = 
useState(0) + const [loading, setLoading] = useState(true) + const [error, setError] = useState(null) + const [filter, setFilter] = useState>(initialFilter) + + const fetchHistory = useCallback(async (params: Partial = {}) => { + setLoading(true) + setError(null) + try { + const queryParams = new URLSearchParams() + Object.entries({ ...filter, ...params }).forEach(([key, value]) => { + if (value !== undefined && value !== null && value !== '') { + queryParams.append(key, String(value)) + } + }) + + const response = await apiClient.get( + `/chess/history?${queryParams.toString()}` + ) + setGames(response.games) + setTotal(response.total) + } catch (err) { + setError('Failed to load game history') + console.error(err) + } finally { + setLoading(false) + } + }, [filter]) + + const updateFilter = useCallback((newFilter: Partial) => { + setFilter((prev) => ({ ...prev, ...newFilter })) + }, []) + + useEffect(() => { + fetchHistory() + }, [fetchHistory]) + + return { games, total, loading, error, filter, updateFilter, refetch: fetchHistory } +} + +export const useGameHistoryDetail = (gameId: string | null) => { + const [game, setGame] = useState(null) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + + const fetchGame = useCallback(async (id: string) => { + setLoading(true) + setError(null) + try { + const response = await apiClient.get(`/chess/history/${id}`) + setGame(response) + } catch (err) { + setError('Failed to load game details') + console.error(err) + } finally { + setLoading(false) + } + }, []) + + useEffect(() => { + if (gameId) { + fetchGame(gameId) + } + }, [gameId, fetchGame]) + + return { game, loading, error, refetch: () => gameId && fetchGame(gameId) } +} diff --git a/app/src/pages/arena-page.tsx b/app/src/pages/arena-page.tsx new file mode 100644 index 0000000..c6a7c32 --- /dev/null +++ b/app/src/pages/arena-page.tsx @@ -0,0 +1,150 @@ +import { useMemo } from 'react' +import { useNavigate } from 'react-router' 
+import { Swords, Bot, Trophy, ChevronRight } from 'lucide-react' +import { usePageTitle } from '@/lib/use-page-title' +import { cn } from '@/lib/utils' +import { Leaderboard } from '@/components/leaderboard/leaderboard' +import { LiveMatch } from '@/components/live-match' +import type { LiveAiGames } from '@chessarena/types/live-ai-games' +import { useStreamGroup } from '@motiadev/stream-client-react' +import { Layout } from '@/components/layout' + +export const ArenaPage = () => { + const navigate = useNavigate() + usePageTitle('Arena') + + const { data: liveAiGames } = useStreamGroup({ streamName: 'chessLiveAiGames', groupId: 'game' }) + const sortedLive = useMemo(() => liveAiGames.slice().reverse(), [liveAiGames]) + + return ( + +
+ {/* Hero Section */} +
+

Arena

+

+ Watch models battle each other, challenge a random AI agent, or set up bot-vs-bot matches. Everything is logged and prompts are visible. +

+ +
+ + + +
+
+ + {/* Content Grid */} +
+ {/* Main Leaderboard Column */} +
+
+
+ +
Model vs Model Leaderboard
+
+
+
+
Live
+
+
+ +
+ +
+
+ + {/* Sidebar Column */} +
+ {/* Live Matches Card */} +
+
+ +
Live Matches
+
+
Jump into ongoing games between models.
+
+ {sortedLive.length === 0 ? ( +
+ No live matches right now. +
+ ) : ( + sortedLive.slice(0, 5).map((game) => ( + navigate(`/game/${game.id}`)} + /> + )) + )} +
+ {sortedLive.length > 0 && ( +
+ +
+ )} +
+ + {/* Match Controls Card */} +
+
+ +
Match controls
+
+
+ Create bot-vs-bot games or play against a randomly selected model. +
+
+ + +
+
+ Prompts + raw model responses are shown in-game. Bench prompt transparency lives on the Bench page. +
+
+
+
+
+
+ ) +} + + + diff --git a/app/src/pages/bench-page.tsx b/app/src/pages/bench-page.tsx new file mode 100644 index 0000000..2136f76 --- /dev/null +++ b/app/src/pages/bench-page.tsx @@ -0,0 +1,241 @@ +import { useState, useMemo } from 'react' +import { Layout } from '@/components/layout' +import { usePageTitle } from '@/lib/use-page-title' +import { Dialog, DialogContent, DialogHeader, DialogTitle, DialogTrigger } from '@/components/ui/dialog' +import { Tab } from '@/components/ui/tab' +import { MiniArea } from '@/components/bench/bench-charts' +import { BenchBarChart } from '@/components/bench/bench-bar-charts' +import { mockBenchLeaderboard, mockBenchTimeseries, mockPrompts } from '@/components/bench/bench-mock' +import { Prism as SyntaxHighlighter } from 'react-syntax-highlighter' +import { vscDarkPlus } from 'react-syntax-highlighter/dist/esm/styles/prism' +import { BarChart3, ShieldCheck, Swords, Brain } from 'lucide-react' +import { useStreamGroup } from '@motiadev/stream-client-react' +import type { LegalMoveBenchmarkSummary } from '@chessarena/types/legal-move-benchmark' +import type { PuzzleBenchmarkSummary } from '@chessarena/types/puzzle-benchmark' +import type { StockfishBenchmarkSummary } from '@chessarena/types/stockfish-benchmark' + +export const BenchPage = () => { + usePageTitle('Motia Chess Bench') + const [promptTab, setPromptTab] = useState('legal') + + // Stream data for real-time benchmarks + const { data: legalSummaries } = useStreamGroup({ + streamName: 'legalMoveBenchmarkSummary', + groupId: 'models', + }) + const { data: puzzleSummaries } = useStreamGroup({ + streamName: 'puzzleBenchmarkSummary', + groupId: 'models', + }) + const { data: stockfishSummaries } = useStreamGroup({ + streamName: 'stockfishBenchmarkSummary', + groupId: 'models', + }) + + // Merge real stream data with mock data as fallback + const benchRows = useMemo(() => { + const legalById = new Map(legalSummaries.map((s) => [`${s.provider}:${s.model}`, s])) + const puzzleById = 
new Map(puzzleSummaries.map((s) => [`${s.provider}:${s.model}`, s])) + const stockfishById = new Map(stockfishSummaries.map((s) => [`${s.provider}:${s.model}`, s])) + + return mockBenchLeaderboard.map((row) => { + const legal = legalById.get(row.id) + const puzzle = puzzleById.get(row.id) + const stockfish = stockfishById.get(row.id) + + const legalMoveScore = legal?.averageScore ?? row.legalMoveScore + const puzzleScore = puzzle?.overallAccuracy ?? row.puzzleScore + const acpl = stockfish?.averageAcpl ?? row.acpl + const acplScore = Math.max(0, 100 - acpl) + const motiaChessIndex = Number((0.4 * legalMoveScore + 0.3 * puzzleScore + 0.3 * acplScore).toFixed(1)) + const lastUpdatedAt = Math.max( + legal?.lastRunAt ?? 0, + puzzle?.lastRunAt ?? 0, + stockfish?.lastRunAt ?? 0, + row.lastUpdatedAt + ) + + return { + ...row, + legalMoveScore, + puzzleScore, + acpl, + motiaChessIndex, + lastUpdatedAt, + } + }) + }, [legalSummaries, puzzleSummaries, stockfishSummaries]) + + // Calculate global averages from stream data + const globalAverages = useMemo(() => { + const hasRealData = legalSummaries.length > 0 || puzzleSummaries.length > 0 || stockfishSummaries.length > 0 + + if (!hasRealData) { + return { + legalMoveScore: mockBenchTimeseries.legalMoveScore.at(-1)?.v ?? 0, + puzzleScore: mockBenchTimeseries.puzzleScore.at(-1)?.v ?? 0, + acpl: mockBenchTimeseries.acpl.at(-1)?.v ?? 0, + } + } + + const avgLegal = legalSummaries.length > 0 + ? legalSummaries.reduce((sum, s) => sum + s.averageScore, 0) / legalSummaries.length + : mockBenchTimeseries.legalMoveScore.at(-1)?.v ?? 0 + + const avgPuzzle = puzzleSummaries.length > 0 + ? puzzleSummaries.reduce((sum, s) => sum + (s.overallAccuracy ?? 0), 0) / puzzleSummaries.length + : mockBenchTimeseries.puzzleScore.at(-1)?.v ?? 0 + + const avgAcpl = stockfishSummaries.length > 0 + ? stockfishSummaries.reduce((sum, s) => sum + s.averageAcpl, 0) / stockfishSummaries.length + : mockBenchTimeseries.acpl.at(-1)?.v ?? 
0 + + return { + legalMoveScore: Math.round(avgLegal), + puzzleScore: Math.round(avgPuzzle), + acpl: Math.round(avgAcpl), + } + }, [legalSummaries, puzzleSummaries, stockfishSummaries]) + + return ( + +
+ + {/* Header Section */} +
+
+

Benchmark Suite

+

+ Evaluating AI chess capabilities across legal move generation,{' '} + puzzle solving, and move quality. + Transparent, reproducible, and open source. +

+
+
+ + + + + + + Benchmark Prompts (Transparency) + +
+
+ setPromptTab('legal')}> + Legal moves + + setPromptTab('puzzle')}> + Puzzles + + setPromptTab('guided')}> + Arena (guided) + +
+
+ {promptTab === 'legal' && ( + + {mockPrompts.legalMoveBench} + + )} + {promptTab === 'puzzle' && ( + + {mockPrompts.puzzleBench} + + )} + {promptTab === 'guided' && ( + + {mockPrompts.aiPlayerGuided} + + )} +
+
+
+
+
+
+ + {/* Global Metrics Cards */} +
+
+
+
+ +
+
Legal Move Gen
+
+
{globalAverages.legalMoveScore}%
+
Global Average (7d)
+ +
+ +
+
+
+ +
+
Puzzle Solving
+
+
{globalAverages.puzzleScore}%
+
Global Average (7d)
+ +
+ +
+
+
+ +
+
Move Quality
+
+
{globalAverages.acpl}
+
Avg ACPL (7d)
+ +
+
+ + {/* Comparison Charts */} +
+
+ +

Model Comparison

+
+ +
+ + + + +
+
+ +
+
+ ) +} + + + diff --git a/app/src/pages/game-history-page.tsx b/app/src/pages/game-history-page.tsx new file mode 100644 index 0000000..9c891ae --- /dev/null +++ b/app/src/pages/game-history-page.tsx @@ -0,0 +1,259 @@ +import { useState } from 'react' +import { useNavigate } from 'react-router' +import { ArrowLeft, Download, Filter, Clock, Trophy, AlertTriangle, ChevronRight } from 'lucide-react' +import { usePageTitle } from '@/lib/use-page-title' +import { useGameHistory } from '@/lib/use-game-history' +import { AiIcon } from '@/components/chess/ai-icon' +import { cn } from '@/lib/utils' +import type { BenchmarkVariant } from '@chessarena/types/game' +import type { AiModelProvider } from '@chessarena/types/ai-models' + +const formatDuration = (ms: number) => { + const seconds = Math.floor(ms / 1000) + const minutes = Math.floor(seconds / 60) + const hours = Math.floor(minutes / 60) + + if (hours > 0) return `${hours}h ${minutes % 60}m` + if (minutes > 0) return `${minutes}m ${seconds % 60}s` + return `${seconds}s` +} + +const formatDate = (timestamp: number) => { + return new Date(timestamp).toLocaleDateString('en-US', { + month: 'short', + day: 'numeric', + hour: '2-digit', + minute: '2-digit', + }) +} + +export const GameHistoryPage = () => { + const navigate = useNavigate() + usePageTitle('Game History') + + const { games, total, loading, filter, updateFilter } = useGameHistory({ limit: 20 }) + const [showFilters, setShowFilters] = useState(false) + + const handleExport = (format: 'json' | 'csv') => { + const params = new URLSearchParams() + Object.entries(filter).forEach(([key, value]) => { + if (value !== undefined && value !== null) { + params.append(key, String(value)) + } + }) + params.append('format', format) + window.open(`${import.meta.env.VITE_API_URL}/chess/history/export?${params.toString()}`, '_blank') + } + + return ( +
+
+ +
+
+
+ +

Game History

+
+ + + +
+
+
+ + {showFilters && ( +
+
+ + + + + + + +
+
+ )} + +
+
{total} games found
+ + {loading ? ( +
+ {[...Array(5)].map((_, i) => ( +
+ ))} +
+ ) : games.length === 0 ? ( +
+ No games found. Play some games to see them here! +
+ ) : ( +
+ {games.map((game) => ( + + ))} +
+ )} +
+
+
+ ) +} diff --git a/app/src/pages/game-replay-page.tsx b/app/src/pages/game-replay-page.tsx new file mode 100644 index 0000000..5b76110 --- /dev/null +++ b/app/src/pages/game-replay-page.tsx @@ -0,0 +1,389 @@ +import { useState, useEffect, useMemo } from 'react' +import { useParams, useNavigate } from 'react-router' +import { + ArrowLeft, + ChevronLeft, + ChevronRight, + ChevronsLeft, + ChevronsRight, + Play, + Pause, + Download, + Trophy, + AlertTriangle, + MessageSquare, +} from 'lucide-react' +import { Chessground } from '@/components/chess/chessground' +import { usePageTitle } from '@/lib/use-page-title' +import { useGameHistoryDetail } from '@/lib/use-game-history' +import { AiIcon } from '@/components/chess/ai-icon' +import { cn } from '@/lib/utils' +import type { AiModelProvider } from '@chessarena/types/ai-models' +import type { Key } from '@lichess-org/chessground/types' + +const INITIAL_FEN = 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1' + +export const GameReplayPage = () => { + const { gameId } = useParams<{ gameId: string }>() + const navigate = useNavigate() + usePageTitle('Game Replay') + + const { game, loading, error } = useGameHistoryDetail(gameId || null) + const [currentMoveIndex, setCurrentMoveIndex] = useState(-1) + const [isPlaying, setIsPlaying] = useState(false) + const [showMessages, setShowMessages] = useState(true) + + const currentFen = useMemo(() => { + if (!game || currentMoveIndex < 0) return INITIAL_FEN + return game.moves[currentMoveIndex]?.fenAfter || INITIAL_FEN + }, [game, currentMoveIndex]) + + const currentMove = useMemo(() => { + if (!game || currentMoveIndex < 0) return null + return game.moves[currentMoveIndex] + }, [game, currentMoveIndex]) + + const lastMoveHighlight = useMemo((): [Key, Key] | undefined => { + if (!currentMove) return undefined + return currentMove.lastMove as [Key, Key] + }, [currentMove]) + + + + const currentThought = useMemo(() => { + if (!game || currentMoveIndex < 0) return null + const 
moveColor = game.moves[currentMoveIndex]?.color + const relevantMessages = game.messages.filter((m) => m.role === moveColor) + const messageIndex = Math.floor(currentMoveIndex / 2) + (moveColor === 'black' ? 0 : 0) + return relevantMessages[messageIndex] || null + }, [game, currentMoveIndex]) + + // Auto-play + useEffect(() => { + if (!isPlaying || !game) return + + const interval = setInterval(() => { + setCurrentMoveIndex((prev) => { + if (prev >= game.moves.length - 1) { + setIsPlaying(false) + return prev + } + return prev + 1 + }) + }, 1000) + + return () => clearInterval(interval) + }, [isPlaying, game]) + + const goToStart = () => { + setCurrentMoveIndex(-1) + setIsPlaying(false) + } + + const goToEnd = () => { + if (game) { + setCurrentMoveIndex(game.moves.length - 1) + setIsPlaying(false) + } + } + + const goBack = () => { + setCurrentMoveIndex((prev) => Math.max(-1, prev - 1)) + } + + const goForward = () => { + if (game) { + setCurrentMoveIndex((prev) => Math.min(game.moves.length - 1, prev + 1)) + } + } + + const handleDownloadPgn = () => { + if (!game?.pgn) return + const blob = new Blob([game.pgn], { type: 'text/plain' }) + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = `chessarena-${game.id}.pgn` + a.click() + URL.revokeObjectURL(url) + } + + if (loading) { + return ( +
+
Loading game...
+
+ ) + } + + if (error || !game) { + return ( +
+
{error || 'Game not found'}
+ +
+ ) + } + + return ( +
+
+
+ +
+ + {game.variant} + + +
+
+
+ +
+
+ {/* Board Section */} +
+ {/* Players */} +
+
+ {game.blackPlayer.provider ? ( + <> +
+ +
+
+
+ {game.blackPlayer.provider} + {game.winner === 'black' && } +
+
{game.blackPlayer.model}
+
+ + ) : ( +
Human (Black)
+ )} + {game.blackIllegalMoves > 0 && ( + + + {game.blackIllegalMoves} + + )} +
+
+ + {/* Chess Board */} +
+ +
+ +
+
+ {game.whitePlayer.provider ? ( + <> +
+ +
+
+
+ {game.whitePlayer.provider} + {game.winner === 'white' && } +
+
{game.whitePlayer.model}
+
+ + ) : ( +
Human (White)
+ )} + {game.whiteIllegalMoves > 0 && ( + + + {game.whiteIllegalMoves} + + )} +
+
+ + {/* Controls */} +
+ + + + + +
+ +
+ Move {currentMoveIndex + 1} of {game.moves.length} +
+
+ + {/* Side Panel */} +
+ {/* Move List */} +
+
+

Moves

+
+
+
+ {game.moves.map((move, idx) => ( + + ))} +
+
+
+ + {/* AI Reasoning */} +
+ + {showMessages && ( +
+ {currentThought ? ( +
+
+ + {currentThought.sender} + + {currentThought.moveSan && ( + + {currentThought.moveSan} + + )} + {currentThought.isIllegalMove && ( + Illegal + )} +
+

{currentThought.message}

+
+ ) : ( +

Select a move to see AI reasoning

+ )} +
+ )} +
+ + {/* Game Result */} + {game.endGameReason && ( +
+

Result

+

{game.endGameReason}

+ {game.winner && ( +

+ Winner: {game.winner} +

+ )} +
+ )} + + {/* Scoreboard */} + {game.scoreboard && ( +
+

Score

+
+
+
White
+
{game.scoreboard.white.finalCentipawnScore} cp
+
+ {game.scoreboard.white.blunders} blunders +
+
+
+
Black
+
{game.scoreboard.black.finalCentipawnScore} cp
+
+ {game.scoreboard.black.blunders} blunders +
+
+
+
+ )} +
+
+
+
+ ) +} diff --git a/app/src/pages/landing-page.tsx b/app/src/pages/landing-page.tsx index 6544e39..cf7c0de 100644 --- a/app/src/pages/landing-page.tsx +++ b/app/src/pages/landing-page.tsx @@ -1,61 +1,212 @@ -import { Trophy } from 'lucide-react' +import { useState, useMemo } from 'react' import { useNavigate } from 'react-router' import { usePageTitle } from '@/lib/use-page-title' -import { AuthContainer } from '@/components/auth/auth-container' -import { CreateGameButton } from '@/components/chess/create-game/create-game-button' +import { Layout, SidebarPanel } from '@/components/layout' +import { BenchBarChart, providerColors } from '@/components/bench/bench-bar-charts' +import { mockBenchLeaderboard } from '@/components/bench/bench-mock' +import { Trophy } from 'lucide-react' import { Leaderboard } from '@/components/leaderboard/leaderboard' +import { useStreamGroup } from '@motiadev/stream-client-react' +import type { LegalMoveBenchmarkSummary } from '@chessarena/types/legal-move-benchmark' +import type { PuzzleBenchmarkSummary } from '@chessarena/types/puzzle-benchmark' import { TopBar } from '@/components/ui/top-bar' -import { PageGrid, PageGridRightColumn } from '@/components/page-grid' -import { BaseButton } from '@/components/ui/base-button' import { ChessArenaLogo } from '@/components/ui/chess-arena-logo' +import { CreateGameButton } from '@/components/chess/create-game/create-game-button' +import { BaseButton } from '@/components/ui/base-button' +import { AuthContainer } from '@/components/auth/auth-container' export const LandingPage = () => { const navigate = useNavigate() + usePageTitle('Powered by Motia') + + const [activeTab, setActiveTab] = useState<'benchmarks' | 'leaderboard' | 'methodology'>('benchmarks') + const goToAbout = (e: React.MouseEvent) => { e.preventDefault() navigate('/about') } - usePageTitle('Powered by Motia') + // Stream data for benchmarks + const { data: legalSummaries } = useStreamGroup({ + streamName: 
'legalMoveBenchmarkSummary', + groupId: 'models', + }) + const { data: puzzleSummaries } = useStreamGroup({ + streamName: 'puzzleBenchmarkSummary', + groupId: 'models', + }) - return ( - -
- -
- - -
- -

Welcome to ChessArena.ai powered by Motia!

-

- ChessArena.ai was created to show how leading models compete against each other in chess games.{' '} - - Click here to learn more. - -

+ const benchRows = useMemo(() => { + const legalById = new Map(legalSummaries.map((s) => [`${s.provider}:${s.model}`, s])) + const puzzleById = new Map(puzzleSummaries.map((s) => [`${s.provider}:${s.model}`, s])) + + return mockBenchLeaderboard.map((row) => { + const legal = legalById.get(row.id) + const puzzle = puzzleById.get(row.id) + + const legalMoveScore = legal?.averageScore ?? 0 + const puzzleScore = puzzle?.overallAccuracy ?? 0 + const motiaChessIndex = Number(((legalMoveScore + puzzleScore) / 2).toFixed(1)) + const lastUpdatedAt = Math.max(legal?.lastRunAt ?? 0, puzzle?.lastRunAt ?? 0, row.lastUpdatedAt) + + return { + ...row, + legalMoveScore, + puzzleScore, + motiaChessIndex, + lastUpdatedAt, + } + }) + }, [legalSummaries, puzzleSummaries]) + + // Left panel content based on active tab + const leftPanelContent = ( + + {activeTab === 'benchmarks' && ( +
+ {/* Header */} +
+

AI Benchmark Results

+

Real-time performance metrics across all models

+
+ + {/* Provider Legend */} +
+ Providers +
+ {Object.entries(providerColors).map(([provider, color]) => ( +
+
+ {provider} +
+ ))} +
+
+ + {/* Motia Chess Index */} + + + {/* Legal Move Score */} + + + {/* Puzzle Score */} +
-
- - navigate('/new')}>Create Game -
- navigate('/live-matches')}> - View Live Matches - - navigate('/leaderboard')}> - Leaderboard - + )} + + {activeTab === 'leaderboard' && ( + + )} + + {activeTab === 'methodology' && ( +
+
+

Benchmark Methodology

+

+ Our benchmarks evaluate LLM chess capabilities across multiple dimensions: +

+
+ +
+
+

Legal Move Generation

+

+ Models are given a FEN position and asked to list all legal moves. + We measure F1 score comparing predicted vs actual legal moves. +

+
+ +
+

Puzzle Solving

+

+ Models solve mate-in-1 and tactical puzzles from Lichess. + We track accuracy across 100+ puzzles per model. +

+
+ +
+

Move Quality (ACPL)

+

+ Average Centipawn Loss measures how optimal each move is + compared to Stockfish 16 analysis. Lower is better. +

+
-

- This project is open-source, click{' '} - - here - {' '} - to read more about the project. -

+
+ +
+
+ )} + + ) + + return ( + + {/* Top Bar */} + + + {/* Main Content - Similar to original main branch */} +
+ +

Welcome to ChessArena.ai powered by Motia!

+

+ ChessArena.ai was created to show how leading models compete against each other in chess games.{' '} + + Click here to learn more. + +

+
+ + {/* Actions */} +
+ + navigate('/new')}>Create Game +
+ navigate('/live-matches')}> + View Live Matches + + navigate('/leaderboard')}> + Leaderboard +
- - + +

+ This project is open-source, click{' '} + + here + {' '} + to read more about the project. +

+
+
) } diff --git a/app/src/pages/methodology-page.tsx b/app/src/pages/methodology-page.tsx new file mode 100644 index 0000000..a58011f --- /dev/null +++ b/app/src/pages/methodology-page.tsx @@ -0,0 +1,241 @@ +import React from 'react' +import { usePageTitle } from '@/lib/use-page-title' +import { Layout } from '@/components/layout' +import { ShieldCheck, Brain, Activity, ArrowRight, FileText, Cpu, MessageSquare, CheckCircle, BarChart3 } from 'lucide-react' + +const DetailSection = ({ title, icon, children }: { title: string, icon: React.ReactNode, children: React.ReactNode }) => ( +
+
+
+ {icon} +
+

{title}

+
+
+ {children} +
+
+) + +// Visual flow diagram component +const FlowStep = ({ icon, label, description, isLast = false }: { icon: React.ReactNode, label: string, description: string, isLast?: boolean }) => ( +
+
+
+ {icon} +
+
+
{label}
+
{description}
+
+
+ {!isLast && ( + + )} +
+) + +const BenchmarkFlowDiagram = () => ( +
+

How Benchmarks Work

+
+ } + label="Prompt" + description="FEN + context sent to model" + /> + } + label="Model" + description="LLM processes the request" + /> + } + label="Response" + description="JSON output parsed" + /> + } + label="Validation" + description="Checked against rules" + /> + } + label="Score" + description="Metrics calculated" + isLast + /> +
+
+) + +// Benchmark comparison with other LLM benchmarks +const BenchmarkComparisonSection = () => ( +
+

Why Chess Benchmarks?

+

+ While benchmarks like MMLU measure general knowledge and HumanEval measures coding ability, + chess provides a unique lens into spatial reasoning, + rule adherence, and strategic planning. +

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
BenchmarkMeasuresLimitation
MMLUGeneral knowledge (57 subjects)Multiple choice format, memorization
HumanEvalCode generation from docstringsPython-focused, short functions
ARCScience reasoning (grade school)Limited to science domain
Motia Chess IndexRule adherence, tactics, strategyChess-specific domain
+
+ +
+

+ Key insight: Chess requires strict rule following with zero tolerance for errors — + a single illegal move loses the game. This makes it an excellent test for LLM reliability and precision. +

+
+
+) + +export const MethodologyPage = () => { + usePageTitle('Methodology') + + return ( + +
+ +
+

Benchmark Methodology

+

+ Technical details on how the Motia Chess Index and component scores are calculated. +

+
+ + {/* Visual flow diagram */} + + + {/* Comparison with other benchmarks */} + + + } + > +
+

+ This benchmark measures a model's ability to strictly adhere to the rules of chess. + We present the model with a series of game positions (FEN strings) and ask it to list every single legal move available. +

+
+
+

Metrics

+
    +
  • Precision: (Correct Moves / Total Generated)
  • +
  • Recall: (Correct Moves / Actual Legal Moves)
  • +
  • Score: F1-Score (Harmonic mean of P & R)
  • +
+
+
+

Prompt Strategy

+

+ Models are prompted with the PGN context and current FEN. They must output a JSON array of SAN strings. + We handle various JSON formatting errors gracefully to focus on chess capability. +

+
+
+
+
+ + } + > +
+

+ Measures tactical sharpness. We source puzzles from the Lichess database, specifically filtering for + Mate-in-1 and short tactical sequences. +

+

+ The model is given the position and asked for the "best move". +

+
+

Scoring

+

+ Accuracy %: The model gets 1 point if its generated move matches the solution move exactly. + 0 points otherwise. No partial credit. +

+
+
+
+ + } + > +
+

+ Evaluates the strategic quality of moves played in the Arena. + We use Stockfish 16 (depth 18+) to evaluate every move played by the AI. +

+

+ Centipawn Loss (CPL) is the difference in evaluation between the engine's best move and the move actually played. +

+
+
+

Formula

+ + ACPL = Σ (BestEval - PlayedEval) / NumMoves + +

+ Lower is better. A grandmaster might have 15-20 ACPL. A beginner >100. +

+
+
+

Visualization

+

+ On our charts, we invert this score (100 - ACPL, clamped at 0) so that + "higher bars" always mean "better performance", maintaining consistency with other metrics. +

+
+
+
+
+ +
+

Reproducibility

+

+ All benchmarks are open source. You can run them yourself using the + motia CLI and the provided + API scripts in the GitHub repository. +

+
+ +
+
+ ) +} diff --git a/app/src/pages/play-ai-page.tsx b/app/src/pages/play-ai-page.tsx new file mode 100644 index 0000000..d942374 --- /dev/null +++ b/app/src/pages/play-ai-page.tsx @@ -0,0 +1,150 @@ +import { useState } from 'react' +import { useNavigate } from 'react-router' +import { Page } from '@/components/page' +import { usePageTitle } from '@/lib/use-page-title' +import { useAuth } from '@/lib/auth/use-auth' +import { apiClient } from '@/lib/auth/api-client' +import { cn } from '@/lib/utils' + +type ColorChoice = 'white' | 'black' | 'random' + +export const PlayAIPage = () => { + const navigate = useNavigate() + const { isAuthenticated } = useAuth() + const [isLoading, setIsLoading] = useState(false) + const [selectedColor, setSelectedColor] = useState('random') + const [error, setError] = useState(null) + + usePageTitle('Play vs AI') + + const handlePlay = async () => { + if (!isAuthenticated) { + localStorage.setItem('chessarena-redirect', '/play-ai') + navigate('/login') + return + } + + setIsLoading(true) + setError(null) + + try { + const data = await apiClient.post<{ game: { id: string }; opponent: { provider: string; model: string } }>( + '/chess/play-vs-ai', + { playerColor: selectedColor }, + ) + + navigate(`/game/${data.game.id}`) + } catch (err) { + const message = err instanceof Error ? err.message : 'Something went wrong' + setError(message) + setIsLoading(false) + } + } + + const colorOptions: { value: ColorChoice; label: string; icon: string }[] = [ + { value: 'white', label: 'White', icon: '♔' }, + { value: 'random', label: 'Random', icon: '🎲' }, + { value: 'black', label: 'Black', icon: '♚' }, + ] + + return ( + +
+
+ {/* Header */} +
+

Play vs AI

+

+ Challenge a randomly selected AI opponent. Cheaper models appear more often! +

+
+ + {/* Color Selection */} +
+ +
+ {colorOptions.map((option) => ( + + ))} +
+
+ + {/* Play Button */} + + + {/* Error */} + {error && ( +
+ {error} +
+ )} + + {/* Info */} +
+

How it works

+
    +
  • • You'll be matched against a random AI model
  • +
  • • Cheaper/faster models have higher chance of selection
  • +
  • • Premium models (GPT-5, Claude Opus) appear less often
  • +
  • • AI will provide legal moves to help you play
  • +
+
+ + {/* Back Button */} + +
+
+
+ ) +} diff --git a/app/vite.config.ts b/app/vite.config.ts index 1c55eb9..9fd4d88 100644 --- a/app/vite.config.ts +++ b/app/vite.config.ts @@ -33,5 +33,8 @@ export default defineConfig({ }, }, allowedHosts: true, + fs: { + allow: ['..'], + }, }, }) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d824935..6cf5c6a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -6,6 +6,9 @@ settings: catalogs: default: + typescript: + specifier: ~5.8.3 + version: 5.8.3 zod: specifier: ^3.25.76 version: 3.25.76 @@ -58,7 +61,7 @@ importers: version: 9.0.2 motia: specifier: 0.8.2-beta.139 - version: 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0)) + version: 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0)) mustache: specifier: ^4.2.0 version: 4.2.0 @@ -162,6 +165,9 @@ importers: react-syntax-highlighter: specifier: ^15.6.1 version: 15.6.1(react@19.1.0) + recharts: + specifier: ^3.6.0 + version: 3.6.0(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react-is@19.2.3)(react@19.1.0)(redux@5.0.1) sonner: specifier: ^2.0.5 version: 2.0.5(react-dom@19.1.0(react@19.1.0))(react@19.1.0) @@ -1238,6 +1244,17 @@ packages: '@radix-ui/rect@1.1.1': resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==} + '@reduxjs/toolkit@2.11.2': + resolution: {integrity: sha512-Kd6kAHTA6/nUpp8mySPqj3en3dm0tdMIgbttnQ1xFMVpufoj+ADi8pXLBsd4xzTRHQa7t/Jv8W5UnCuW4kuWMQ==} + peerDependencies: + react: ^16.9.0 || ^17.0.0 || ^18 || ^19 + react-redux: ^7.2.1 || ^8.1.3 || ^9.0.0 + peerDependenciesMeta: + react: + 
optional: true + react-redux: + optional: true + '@rolldown/pluginutils@1.0.0-beta.9': resolution: {integrity: sha512-e9MeMtVWo186sgvFFJOPGy7/d2j2mZhLJIdVW0C/xDluuOvymEATqz6zKsP0ZmXGzQtqlyjz5sC1sYQUoJG98w==} @@ -1344,6 +1361,9 @@ packages: '@standard-schema/spec@1.0.0': resolution: {integrity: sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==} + '@standard-schema/utils@0.3.0': + resolution: {integrity: sha512-e7Mew686owMaPJVNNLs55PUvgz371nKgwsc4vxE49zsODpJEnxgxRo2y/OKrqueavXgZNMDVj3DdHFlaSAeU8g==} + '@supabase/auth-js@2.71.1': resolution: {integrity: sha512-mMIQHBRc+SKpZFRB2qtupuzulaUhFYupNyxqDj5Jp/LyPvcWvjaJzZzObv6URtL/O6lPxkanASnotGtNpS3H2Q==} @@ -1568,18 +1588,39 @@ packages: '@types/babel__traverse@7.20.7': resolution: {integrity: sha512-dkO5fhS7+/oos4ciWxyEyjWe48zmG6wbCheo/G2ZnHx4fs3EU6YC6UM8rk56gAjNJ9P3MTH2jo5jb92/K6wbng==} + '@types/d3-array@3.2.2': + resolution: {integrity: sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==} + '@types/d3-color@3.1.3': resolution: {integrity: sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==} '@types/d3-drag@3.0.7': resolution: {integrity: sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==} + '@types/d3-ease@3.0.2': + resolution: {integrity: sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==} + '@types/d3-interpolate@3.0.4': resolution: {integrity: sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==} + '@types/d3-path@3.1.1': + resolution: {integrity: sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==} + + '@types/d3-scale@4.0.9': + resolution: {integrity: sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==} + '@types/d3-selection@3.0.11': resolution: {integrity: 
sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==} + '@types/d3-shape@3.1.7': + resolution: {integrity: sha512-VLvUQ33C+3J+8p+Daf+nYSOsjB4GXp19/S/aGo60m9h1v6XaxjiT82lKVWJCfzhtuZ3yD7i/TPeC/fuKLLOSmg==} + + '@types/d3-time@3.0.4': + resolution: {integrity: sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==} + + '@types/d3-timer@3.0.2': + resolution: {integrity: sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==} + '@types/d3-transition@3.0.9': resolution: {integrity: sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==} @@ -1645,6 +1686,9 @@ packages: '@types/unist@3.0.3': resolution: {integrity: sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==} + '@types/use-sync-external-store@0.0.6': + resolution: {integrity: sha512-zFDAD+tlpf2r4asuHEj0XH6pY6i0g5NeAHPn+15wk3BV6JA69eERFXC1gyGThDkVa1zCyKr5jox1+2LbV/AMLg==} + '@types/ws@8.18.1': resolution: {integrity: sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==} @@ -2115,6 +2159,10 @@ packages: csstype@3.1.3: resolution: {integrity: sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==} + d3-array@3.2.4: + resolution: {integrity: sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==} + engines: {node: '>=12'} + d3-color@3.1.0: resolution: {integrity: sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==} engines: {node: '>=12'} @@ -2131,14 +2179,38 @@ packages: resolution: {integrity: sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==} engines: {node: '>=12'} + d3-format@3.1.0: + resolution: {integrity: 
sha512-YyUI6AEuY/Wpt8KWLgZHsIU86atmikuoOmCfommt0LYHiQSPjvX2AcFc38PX0CBpr2RCyZhjex+NS/LPOv6YqA==} + engines: {node: '>=12'} + d3-interpolate@3.0.1: resolution: {integrity: sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==} engines: {node: '>=12'} + d3-path@3.1.0: + resolution: {integrity: sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==} + engines: {node: '>=12'} + + d3-scale@4.0.2: + resolution: {integrity: sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==} + engines: {node: '>=12'} + d3-selection@3.0.0: resolution: {integrity: sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==} engines: {node: '>=12'} + d3-shape@3.2.0: + resolution: {integrity: sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==} + engines: {node: '>=12'} + + d3-time-format@4.1.0: + resolution: {integrity: sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==} + engines: {node: '>=12'} + + d3-time@3.1.0: + resolution: {integrity: sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==} + engines: {node: '>=12'} + d3-timer@3.0.1: resolution: {integrity: sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==} engines: {node: '>=12'} @@ -2176,6 +2248,9 @@ packages: supports-color: optional: true + decimal.js-light@2.5.1: + resolution: {integrity: sha512-qIMFpTMZmny+MMIitAB6D7iVPEorVw6YQRWkvarTkT4tBeSLLiHzcwj6q0MmYSFCiVpiqPJTJEYIrpcPzVEIvg==} + decode-named-character-reference@1.1.0: resolution: {integrity: sha512-Wy+JTSbFThEOXQIR2L6mxJvEs+veIzpmqD7ynWxMXGpnk3smkHQOp6forLdHsKpAMW9iJpaBBIxz285t1n1C3w==} @@ -2273,6 +2348,9 @@ packages: resolution: {integrity: sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==} 
engines: {node: '>= 0.4'} + es-toolkit@1.43.0: + resolution: {integrity: sha512-SKCT8AsWvYzBBuUqMk4NPwFlSdqLpJwmy6AP322ERn8W2YLIB6JBXnwMI2Qsh2gfphT3q7EKAxKb23cvFHFwKA==} + esbuild@0.25.5: resolution: {integrity: sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==} engines: {node: '>=18'} @@ -2361,6 +2439,9 @@ packages: resolution: {integrity: sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==} engines: {node: '>=6'} + eventemitter3@5.0.1: + resolution: {integrity: sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==} + events@3.3.0: resolution: {integrity: sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==} engines: {node: '>=0.8.x'} @@ -2613,6 +2694,12 @@ packages: resolution: {integrity: sha512-gJzzk+PQNznz8ysRrC0aOkBNVRBDtE1n53IqyqEf3PXrYwomFs5q4pGMizBMJF+ykh03insJ27hB8gSrD2Hn8A==} engines: {node: '>= 4'} + immer@10.2.0: + resolution: {integrity: sha512-d/+XTN3zfODyjr89gM3mPq1WNX2B8pYsu7eORitdwyA2sBubnTl3laYlBk4sXY5FUa5qTZGBDPJICVbvqzjlbw==} + + immer@11.0.1: + resolution: {integrity: sha512-naDCyggtcBWANtIrjQEajhhBEuL9b0Zg4zmlWK2CzS6xCWSE39/vvf4LqnMjUAWHBhot4m9MHCM/Z+mfWhUkiA==} + import-fresh@3.3.1: resolution: {integrity: sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==} engines: {node: '>=6'} @@ -2631,6 +2718,10 @@ packages: resolution: {integrity: sha512-UjOaSel/iddGZJ5xP/Eixh6dY1XghiBw4XK13rCCIJcJfyhhoul/7KhLLUGtebEj6GDYM6Vnx/mVsjx2L/mFIA==} engines: {node: '>=12.0.0'} + internmap@2.0.3: + resolution: {integrity: sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==} + engines: {node: '>=12'} + ipaddr.js@1.9.1: resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==} engines: {node: '>= 0.10'} @@ -3315,12 +3406,27 @@ packages: 
peerDependencies: react: ^19.1.0 + react-is@19.2.3: + resolution: {integrity: sha512-qJNJfu81ByyabuG7hPFEbXqNcWSU3+eVus+KJs+0ncpGfMyYdvSmxiJxbWR65lYi1I+/0HBcliO029gc4F+PnA==} + react-markdown@10.1.0: resolution: {integrity: sha512-qKxVopLT/TyA6BX3Ue5NwabOsAzm0Q7kAPwq6L+wWDwisYs7R8vZ0nRXqq6rkueboxpkjvLGU9fWifiX/ZZFxQ==} peerDependencies: '@types/react': '>=18' react: '>=18' + react-redux@9.2.0: + resolution: {integrity: sha512-ROY9fvHhwOD9ySfrF0wmvu//bKCQ6AeZZq1nJNtbDC+kk5DuSuNX/n6YWYF/SYy7bSba4D4FSz8DJeKY/S/r+g==} + peerDependencies: + '@types/react': ^18.2.25 || ^19 + react: ^18.0 || ^19 + redux: ^5.0.0 + peerDependenciesMeta: + '@types/react': + optional: true + redux: + optional: true + react-refresh@0.17.0: resolution: {integrity: sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==} engines: {node: '>=0.10.0'} @@ -3408,6 +3514,22 @@ packages: resolution: {integrity: sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==} engines: {node: '>= 14.18.0'} + recharts@3.6.0: + resolution: {integrity: sha512-L5bjxvQRAe26RlToBAziKUB7whaGKEwD3znoM6fz3DrTowCIC/FnJYnuq1GEzB8Zv2kdTfaxQfi5GoH0tBinyg==} + engines: {node: '>=18'} + peerDependencies: + react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + react-dom: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + react-is: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + + redux-thunk@3.1.0: + resolution: {integrity: sha512-NW2r5T6ksUKXCabzhL9z+h206HQw/NJkcLm1GPImRQ8IzfXwRGqjVhKJGauHirT0DAuyy6hjdnMZaRoAcy0Klw==} + peerDependencies: + redux: ^5.0.0 + + redux@5.0.1: + resolution: {integrity: sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==} + refractor@3.6.0: resolution: {integrity: sha512-MY9W41IOWxxk31o+YvFCNyNzdkc9M20NoZK5vq6jkv4I/uh2zkWcfudj0Q1fovjUQJrNewS9NMzeTtqPf+n5EA==} @@ -3421,6 +3543,9 @@ packages: resolution: {integrity: 
sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} engines: {node: '>=0.10.0'} + reselect@5.1.1: + resolution: {integrity: sha512-K/BG6eIky/SBpzfHZv/dd+9JBFiS4SWV7FIujVyJRux6e45+73RaUHXLmIR1f7WOMaQ0U1km6qwklRQxpJJY0w==} + resolve-from@4.0.0: resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==} engines: {node: '>=4'} @@ -3632,6 +3757,9 @@ packages: through@2.3.8: resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==} + tiny-invariant@1.3.3: + resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} + tinyglobby@0.2.14: resolution: {integrity: sha512-tX5e7OM1HnYr2+a2C/4V0htOcSQcoSTH9KgJnVvNm5zm/cyEWKJ7j7YutsH9CxMdtOkkLFy2AHrMci9IM8IPZQ==} engines: {node: '>=12.0.0'} @@ -3822,6 +3950,9 @@ packages: vfile@6.0.3: resolution: {integrity: sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q==} + victory-vendor@37.3.6: + resolution: {integrity: sha512-SbPDPdDBYp+5MJHhBCAyI7wKM3d5ivekigc2Dk2s7pgbZ9wIgIBYGVw4zGHBml/qTFbexrofXW6Gu4noGxrOwQ==} + vite-plugin-radar@0.10.0: resolution: {integrity: sha512-PRApileUv7I+bInGbrQM9LvxvrFHELRvyO5yAodGwIgQBRID/hOPqx0pz7VBWFAcPNjJJVAp/LuT1417BuE/9g==} peerDependencies: @@ -4424,17 +4555,17 @@ snapshots: - typescript - utf-8-validate - '@motiadev/plugin-endpoint@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))': + '@motiadev/plugin-endpoint@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))': dependencies: '@monaco-editor/react': 
4.7.0(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) '@motiadev/stream-client-react': 0.8.2-beta.139(react@19.1.0) - '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) + '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) clsx: 2.1.1 json-schema: 0.4.0 lucide-react: 0.544.0(react@19.1.0) react18-json-view: 0.2.9(react@19.1.0) tailwind-merge: 3.3.1 - zustand: 5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) + zustand: 5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) transitivePeerDependencies: - '@types/react' - '@types/react-dom' @@ -4481,7 +4612,7 @@ snapshots: dependencies: uuid: 11.1.0 - '@motiadev/ui@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))': + '@motiadev/ui@0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0))': dependencies: '@radix-ui/react-checkbox': 1.3.3(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) '@radix-ui/react-dropdown-menu': 2.1.16(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) @@ -4499,19 +4630,19 @@ snapshots: react-resizable-panels: 3.0.6(react-dom@19.1.0(react@19.1.0))(react@19.1.0) react-use-resizable: 0.2.0(react@19.1.0) tailwind-merge: 3.3.1 - zustand: 5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) + zustand: 
5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) transitivePeerDependencies: - '@types/react' - '@types/react-dom' - immer - use-sync-external-store - '@motiadev/workbench@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0))': + '@motiadev/workbench@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0))': dependencies: '@monaco-editor/react': 4.7.0(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) - '@motiadev/plugin-endpoint': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) + '@motiadev/plugin-endpoint': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(monaco-editor@0.53.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) '@motiadev/stream-client-react': 0.8.2-beta.139(react@19.1.0) - '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) + '@motiadev/ui': 0.8.2-beta.139(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) '@radix-ui/react-collapsible': 1.1.12(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) '@radix-ui/react-dialog': 
1.1.14(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) '@radix-ui/react-dropdown-menu': 2.1.16(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) @@ -4526,7 +4657,7 @@ snapshots: '@radix-ui/react-tooltip': 1.2.8(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) '@tailwindcss/postcss': 4.1.13 '@vitejs/plugin-react': 4.5.0(vite@6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6)) - '@xyflow/react': 12.8.4(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + '@xyflow/react': 12.8.4(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) autoprefixer: 10.4.21(postcss@8.5.3) class-variance-authority: 0.7.1 clsx: 2.1.1 @@ -4547,7 +4678,7 @@ snapshots: typescript: 5.8.3 typescript-eslint: 8.32.1(eslint@9.27.0(jiti@2.5.1))(typescript@5.8.3) vite: 6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6) - zustand: 5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) + zustand: 5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)) transitivePeerDependencies: - '@types/node' - '@types/react' @@ -5102,6 +5233,18 @@ snapshots: '@radix-ui/rect@1.1.1': {} + '@reduxjs/toolkit@2.11.2(react-redux@9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1))(react@19.1.0)': + dependencies: + '@standard-schema/spec': 1.0.0 + '@standard-schema/utils': 0.3.0 + immer: 11.0.1 + redux: 5.0.1 + redux-thunk: 3.1.0(redux@5.0.1) + reselect: 5.1.1 + optionalDependencies: + react: 19.1.0 + react-redux: 9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1) + '@rolldown/pluginutils@1.0.0-beta.9': {} '@rollup/rollup-android-arm-eabi@4.41.1': @@ -5166,6 +5309,8 @@ snapshots: '@standard-schema/spec@1.0.0': {} + '@standard-schema/utils@0.3.0': {} + 
'@supabase/auth-js@2.71.1': dependencies: '@supabase/node-fetch': 2.6.15 @@ -5380,18 +5525,36 @@ snapshots: dependencies: '@babel/types': 7.27.3 + '@types/d3-array@3.2.2': {} + '@types/d3-color@3.1.3': {} '@types/d3-drag@3.0.7': dependencies: '@types/d3-selection': 3.0.11 + '@types/d3-ease@3.0.2': {} + '@types/d3-interpolate@3.0.4': dependencies: '@types/d3-color': 3.1.3 + '@types/d3-path@3.1.1': {} + + '@types/d3-scale@4.0.9': + dependencies: + '@types/d3-time': 3.0.4 + '@types/d3-selection@3.0.11': {} + '@types/d3-shape@3.1.7': + dependencies: + '@types/d3-path': 3.1.1 + + '@types/d3-time@3.0.4': {} + + '@types/d3-timer@3.0.2': {} + '@types/d3-transition@3.0.9': dependencies: '@types/d3-selection': 3.0.11 @@ -5460,6 +5623,8 @@ snapshots: '@types/unist@3.0.3': {} + '@types/use-sync-external-store@0.0.6': {} + '@types/ws@8.18.1': dependencies: '@types/node': 22.15.21 @@ -5650,13 +5815,13 @@ snapshots: transitivePeerDependencies: - supports-color - '@xyflow/react@12.8.4(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': + '@xyflow/react@12.8.4(@types/react@19.1.6)(immer@11.0.1)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)': dependencies: '@xyflow/system': 0.0.68 classcat: 5.0.5 react: 19.1.0 react-dom: 19.1.0(react@19.1.0) - zustand: 4.5.7(@types/react@19.1.6)(react@19.1.0) + zustand: 4.5.7(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0) transitivePeerDependencies: - '@types/react' - immer @@ -6014,6 +6179,10 @@ snapshots: csstype@3.1.3: {} + d3-array@3.2.4: + dependencies: + internmap: 2.0.3 + d3-color@3.1.0: {} d3-dispatch@3.0.1: {} @@ -6025,12 +6194,36 @@ snapshots: d3-ease@3.0.1: {} + d3-format@3.1.0: {} + d3-interpolate@3.0.1: dependencies: d3-color: 3.1.0 + d3-path@3.1.0: {} + + d3-scale@4.0.2: + dependencies: + d3-array: 3.2.4 + d3-format: 3.1.0 + d3-interpolate: 3.0.1 + d3-time: 3.1.0 + d3-time-format: 4.1.0 + d3-selection@3.0.0: {} + d3-shape@3.2.0: + dependencies: + d3-path: 3.1.0 + + d3-time-format@4.1.0: + dependencies: + d3-time: 
3.1.0 + + d3-time@3.1.0: + dependencies: + d3-array: 3.2.4 + d3-timer@3.0.1: {} d3-transition@3.0.1(d3-selection@3.0.0): @@ -6065,6 +6258,8 @@ snapshots: dependencies: ms: 2.1.3 + decimal.js-light@2.5.1: {} + decode-named-character-reference@1.1.0: dependencies: character-entities: 2.0.2 @@ -6144,6 +6339,8 @@ snapshots: has-tostringtag: 1.0.2 hasown: 2.0.2 + es-toolkit@1.43.0: {} + esbuild@0.25.5: optionalDependencies: '@esbuild/aix-ppc64': 0.25.5 @@ -6265,6 +6462,8 @@ snapshots: event-target-shim@5.0.1: {} + eventemitter3@5.0.1: {} + events@3.3.0: {} eventsource-parser@3.0.6: {} @@ -6593,6 +6792,10 @@ snapshots: ignore@7.0.4: {} + immer@10.2.0: {} + + immer@11.0.1: {} + import-fresh@3.3.1: dependencies: parent-module: 1.0.1 @@ -6624,6 +6827,8 @@ snapshots: transitivePeerDependencies: - '@types/node' + internmap@2.0.3: {} + ipaddr.js@1.9.1: {} is-alphabetical@1.0.4: {} @@ -7147,12 +7352,12 @@ snapshots: dependencies: '@types/trusted-types': 1.0.6 - motia@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0)): + motia@0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(typescript@5.8.3)(use-sync-external-store@1.5.0(react@19.1.0)): dependencies: '@amplitude/analytics-node': 1.5.9 '@motiadev/core': 0.8.2-beta.139(@types/node@22.15.21)(typescript@5.8.3) '@motiadev/stream-client-node': 0.8.2-beta.139 - '@motiadev/workbench': 0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0)) + '@motiadev/workbench': 
0.8.2-beta.139(@types/node@22.15.21)(@types/react-dom@19.1.5(@types/react@19.1.6))(@types/react@19.1.6)(eslint@9.27.0(jiti@2.5.1))(immer@11.0.1)(jiti@2.5.1)(lightningcss@1.30.1)(monaco-editor@0.53.0)(tsx@4.20.6)(use-sync-external-store@1.5.0(react@19.1.0)) antlr4ts: 0.5.0-alpha.4 archiver: 7.0.1 axios: 1.11.0 @@ -7393,6 +7598,8 @@ snapshots: react: 19.1.0 scheduler: 0.26.0 + react-is@19.2.3: {} + react-markdown@10.1.0(@types/react@19.1.6)(react@19.1.0): dependencies: '@types/hast': 3.0.4 @@ -7411,6 +7618,15 @@ snapshots: transitivePeerDependencies: - supports-color + react-redux@9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1): + dependencies: + '@types/use-sync-external-store': 0.0.6 + react: 19.1.0 + use-sync-external-store: 1.5.0(react@19.1.0) + optionalDependencies: + '@types/react': 19.1.6 + redux: 5.0.1 + react-refresh@0.17.0: {} react-remove-scroll-bar@2.3.8(@types/react@19.1.6)(react@19.1.0): @@ -7504,6 +7720,32 @@ snapshots: readdirp@4.1.2: {} + recharts@3.6.0(@types/react@19.1.6)(react-dom@19.1.0(react@19.1.0))(react-is@19.2.3)(react@19.1.0)(redux@5.0.1): + dependencies: + '@reduxjs/toolkit': 2.11.2(react-redux@9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1))(react@19.1.0) + clsx: 2.1.1 + decimal.js-light: 2.5.1 + es-toolkit: 1.43.0 + eventemitter3: 5.0.1 + immer: 10.2.0 + react: 19.1.0 + react-dom: 19.1.0(react@19.1.0) + react-is: 19.2.3 + react-redux: 9.2.0(@types/react@19.1.6)(react@19.1.0)(redux@5.0.1) + reselect: 5.1.1 + tiny-invariant: 1.3.3 + use-sync-external-store: 1.5.0(react@19.1.0) + victory-vendor: 37.3.6 + transitivePeerDependencies: + - '@types/react' + - redux + + redux-thunk@3.1.0(redux@5.0.1): + dependencies: + redux: 5.0.1 + + redux@5.0.1: {} + refractor@3.6.0: dependencies: hastscript: 6.0.0 @@ -7529,6 +7771,8 @@ snapshots: require-from-string@2.0.2: {} + reselect@5.1.1: {} + resolve-from@4.0.0: {} resolve-pkg-maps@1.0.0: @@ -7813,6 +8057,8 @@ snapshots: through@2.3.8: {} + tiny-invariant@1.3.3: {} + tinyglobby@0.2.14: 
dependencies: fdir: 6.4.4(picomatch@4.0.2) @@ -8008,6 +8254,23 @@ snapshots: '@types/unist': 3.0.3 vfile-message: 4.0.2 + victory-vendor@37.3.6: + dependencies: + '@types/d3-array': 3.2.2 + '@types/d3-ease': 3.0.2 + '@types/d3-interpolate': 3.0.4 + '@types/d3-scale': 4.0.9 + '@types/d3-shape': 3.1.7 + '@types/d3-time': 3.0.4 + '@types/d3-timer': 3.0.2 + d3-array: 3.2.4 + d3-ease: 3.0.1 + d3-interpolate: 3.0.1 + d3-scale: 4.0.2 + d3-shape: 3.2.0 + d3-time: 3.1.0 + d3-timer: 3.0.1 + vite-plugin-radar@0.10.0(vite@6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6)): dependencies: vite: 6.3.5(@types/node@22.15.21)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.6) @@ -8090,16 +8353,18 @@ snapshots: zod@3.25.76: {} - zustand@4.5.7(@types/react@19.1.6)(react@19.1.0): + zustand@4.5.7(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0): dependencies: use-sync-external-store: 1.5.0(react@19.1.0) optionalDependencies: '@types/react': 19.1.6 + immer: 11.0.1 react: 19.1.0 - zustand@5.0.8(@types/react@19.1.6)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)): + zustand@5.0.8(@types/react@19.1.6)(immer@11.0.1)(react@19.1.0)(use-sync-external-store@1.5.0(react@19.1.0)): optionalDependencies: '@types/react': 19.1.6 + immer: 11.0.1 react: 19.1.0 use-sync-external-store: 1.5.0(react@19.1.0) diff --git a/types/game-history.ts b/types/game-history.ts new file mode 100644 index 0000000..ade0c1d --- /dev/null +++ b/types/game-history.ts @@ -0,0 +1,61 @@ +import { z } from 'zod' +import { ScoreboardSchema, BenchmarkVariantSchema } from './game' +import { GameMoveSchema } from './game-move' +import { GameMessageSchema } from './game-message' +import { AiModelProviderSchema } from './ai-models' + +export const GameHistorySchema = z.object({ + id: z.string({ description: 'The ID of the game' }), + + // Game metadata + startedAt: z.number({ description: 'Unix timestamp when game started' }), + endedAt: z.number({ description: 'Unix timestamp when game ended' }), + 
duration: z.number({ description: 'Game duration in milliseconds' }), + + // Players info + whitePlayer: z.object({ + provider: AiModelProviderSchema().optional(), + model: z.string().optional(), + isHuman: z.boolean(), + }), + blackPlayer: z.object({ + provider: AiModelProviderSchema().optional(), + model: z.string().optional(), + isHuman: z.boolean(), + }), + + // Game result + status: z.enum(['completed', 'draw', 'endedEarly']), + winner: z.enum(['white', 'black']).optional(), + endGameReason: z.string().optional(), + variant: BenchmarkVariantSchema.default('guided'), + + // Stats + totalMoves: z.number({ description: 'Total number of moves in the game' }), + whiteIllegalMoves: z.number({ description: 'Illegal move attempts by white' }), + blackIllegalMoves: z.number({ description: 'Illegal move attempts by black' }), + + // Full game data + finalFen: z.string({ description: 'Final board position FEN' }), + moves: z.array(GameMoveSchema, { description: 'All moves in the game' }), + messages: z.array(GameMessageSchema, { description: 'All AI messages/reasoning' }), + scoreboard: ScoreboardSchema.optional(), + + // PGN for export + pgn: z.string({ description: 'Game in PGN format' }).optional(), +}) + +export const GameHistoryFilterSchema = z.object({ + provider: AiModelProviderSchema().optional(), + model: z.string().optional(), + variant: BenchmarkVariantSchema.optional(), + winner: z.enum(['white', 'black']).optional(), + status: z.enum(['completed', 'draw', 'endedEarly']).optional(), + startDate: z.number().optional(), + endDate: z.number().optional(), + limit: z.number().default(50), + offset: z.number().default(0), +}) + +export type GameHistory = z.infer<typeof GameHistorySchema> +export type GameHistoryFilter = z.infer<typeof GameHistoryFilterSchema> diff --git a/types/game.ts b/types/game.ts index 483c280..01adc49 100644 --- a/types/game.ts +++ b/types/game.ts @@ -50,11 +50,16 @@ export const PlayerSchema = () => promotions: z.number({ description: 'The number of pawn promotions' }).optional(), }) +export const 
BenchmarkVariantSchema = z.enum(['guided', 'unguided'], { + description: 'Benchmark variant: guided (with legal moves) or unguided (FEN only)', +}) + export const GameSchema = z.object({ id: z.string({ description: 'The ID of the game' }), fen: z.string({ description: 'The FEN of the game' }), turn: z.enum(['white', 'black'], { description: 'The color of the current turn' }), status: z.enum(['pending', 'completed', 'draw', 'endedEarly'], { description: 'The status of the game' }), + variant: BenchmarkVariantSchema.default('guided').optional(), lastMove: z.array(z.string({ description: 'The last move made' })).optional(), lastMoveSan: z.string({ description: 'The last move made in Standard Algebraic Notation (SAN)' }).optional(), winner: z.enum(['white', 'black']).optional(), @@ -66,6 +71,7 @@ export const GameSchema = z.object({ }), check: z.boolean({ description: 'Whether the game is in check' }), scoreboard: ScoreboardSchema.optional(), + createdAt: z.number({ description: 'Unix timestamp when game was created' }).optional(), }) export const roleSchema = z.enum(['white', 'black', 'spectator', 'root']) @@ -76,3 +82,4 @@ export type Player = z.infer<ReturnType<typeof PlayerSchema>> export type PlayerScore = z.infer<ReturnType<typeof PlayerScoreSchema>> export type Scoreboard = z.infer<typeof ScoreboardSchema> export type GameRole = z.infer<typeof roleSchema> +export type BenchmarkVariant = z.infer<typeof BenchmarkVariantSchema> diff --git a/types/legal-move-benchmark.ts b/types/legal-move-benchmark.ts new file mode 100644 index 0000000..3385d03 --- /dev/null +++ b/types/legal-move-benchmark.ts @@ -0,0 +1,76 @@ +import { z } from 'zod' +import { AiModelProviderSchema } from './ai-models' + +export const TestPositionSchema = z.object({ + id: z.string(), + fen: z.string(), + pgn: z.string(), + turn: z.enum(['white', 'black']), + legalMoves: z.array(z.string()), + legalMoveCount: z.number(), + moveNumber: z.number(), +}) + +export const ModelBenchmarkResultSchema = z.object({ + positionId: z.string(), + modelMoves: z.array(z.string()), + correctMoves: z.array(z.string()), + illegalMoves: z.array(z.string()), + 
missedMoves: z.array(z.string()), + accuracy: z.number(), // percentage of legal moves found + penalty: z.number(), // penalty for illegal moves + finalScore: z.number(), // accuracy - penalty + responseTime: z.number(), // ms + rawResponse: z.string(), // raw model response for debugging + error: z.string().optional(), // if model failed to respond +}) + +export const LegalMoveBenchmarkRunSchema = z.object({ + id: z.string(), + createdAt: z.number(), + completedAt: z.number().optional(), + status: z.enum(['pending', 'running', 'completed', 'failed']), + + // Model info + provider: AiModelProviderSchema(), + model: z.string(), + + // Test configuration + positionCount: z.number(), + positions: z.array(TestPositionSchema), + + // Results + results: z.array(ModelBenchmarkResultSchema), + + // Aggregate scores + averageAccuracy: z.number().optional(), + averagePenalty: z.number().optional(), + averageFinalScore: z.number().optional(), + totalCorrectMoves: z.number().optional(), + totalIllegalMoves: z.number().optional(), + totalMissedMoves: z.number().optional(), +}) + +export const LegalMoveBenchmarkSummarySchema = z.object({ + id: z.string(), + provider: AiModelProviderSchema(), + model: z.string(), + runsCompleted: z.number(), + averageScore: z.number(), + bestScore: z.number(), + worstScore: z.number(), + lastRunAt: z.number(), +}) + +export const PositionSetSchema = z.object({ + id: z.string(), + createdAt: z.number(), + count: z.number(), + positions: z.array(TestPositionSchema), +}) + +export type TestPosition = z.infer<typeof TestPositionSchema> +export type ModelBenchmarkResult = z.infer<typeof ModelBenchmarkResultSchema> +export type LegalMoveBenchmarkRun = z.infer<typeof LegalMoveBenchmarkRunSchema> +export type LegalMoveBenchmarkSummary = z.infer<typeof LegalMoveBenchmarkSummarySchema> +export type PositionSet = z.infer<typeof PositionSetSchema> diff --git a/types/puzzle-benchmark.ts b/types/puzzle-benchmark.ts new file mode 100644 index 0000000..134fd82 --- /dev/null +++ b/types/puzzle-benchmark.ts @@ -0,0 +1,75 @@ +import { z } from 'zod' +import { AiModelProviderSchema } from './ai-models' + +export const 
PuzzleThemeSchema = z.enum(['mateIn1', 'oneMove']) + +export const LichessPuzzleSchema = z.object({ + id: z.string(), + rating: z.number(), + themes: z.array(z.string()), + solution: z.array(z.string()), // UCI format moves + initialPly: z.number(), + pgn: z.string(), + fen: z.string(), // Position where puzzle starts + legalMoves: z.array(z.string()), // Legal moves in SAN format + solutionSan: z.string(), // First solution move in SAN +}) + +export const PuzzleSetSchema = z.object({ + id: z.string(), + theme: PuzzleThemeSchema, + createdAt: z.number(), + puzzles: z.array(LichessPuzzleSchema), + count: z.number(), +}) + +export const PuzzleResultSchema = z.object({ + puzzleId: z.string(), + modelMove: z.string().optional(), // What the model played + correctMove: z.string(), // The correct solution + isCorrect: z.boolean(), + responseTime: z.number(), + rawResponse: z.string(), + error: z.string().optional(), +}) + +export const PuzzleBenchmarkRunSchema = z.object({ + id: z.string(), + createdAt: z.number(), + completedAt: z.number().optional(), + status: z.enum(['pending', 'running', 'completed', 'failed']), + + // Model info + provider: AiModelProviderSchema(), + model: z.string(), + + // Which puzzle set was used + puzzleSetId: z.string(), + theme: PuzzleThemeSchema, + + // Results + results: z.array(PuzzleResultSchema), + + // Aggregate scores + totalPuzzles: z.number(), + correctCount: z.number().optional(), + accuracy: z.number().optional(), // percentage +}) + +export const PuzzleBenchmarkSummarySchema = z.object({ + id: z.string(), + provider: AiModelProviderSchema(), + model: z.string(), + mateIn1Accuracy: z.number().optional(), + oneMoveAccuracy: z.number().optional(), + overallAccuracy: z.number().optional(), + runsCompleted: z.number(), + lastRunAt: z.number(), +}) + +export type PuzzleTheme = z.infer<typeof PuzzleThemeSchema> +export type LichessPuzzle = z.infer<typeof LichessPuzzleSchema> +export type PuzzleSet = z.infer<typeof PuzzleSetSchema> +export type PuzzleResult = z.infer<typeof PuzzleResultSchema> +export type PuzzleBenchmarkRun = z.infer<typeof PuzzleBenchmarkRunSchema> 
+export type PuzzleBenchmarkSummary = z.infer<typeof PuzzleBenchmarkSummarySchema> diff --git a/types/stockfish-benchmark.ts b/types/stockfish-benchmark.ts new file mode 100644 index 0000000..e308cbb --- /dev/null +++ b/types/stockfish-benchmark.ts @@ -0,0 +1,86 @@ +import { z } from 'zod' +import { AiModelProviderSchema } from './ai-models' + +export const StockfishGameMoveSchema = z.object({ + moveNumber: z.number(), + player: z.enum(['white', 'black']), + moveSan: z.string(), + fen: z.string(), + centipawnScore: z.number().optional(), // Evaluation after move + bestMove: z.string().optional(), // What Stockfish thinks was best + centipawnLoss: z.number().optional(), // Difference from best move + isAiMove: z.boolean(), // true if AI made this move, false if Stockfish + responseTime: z.number().optional(), // ms for AI moves + error: z.string().optional(), +}) + +export const StockfishGameResultSchema = z.object({ + id: z.string(), + createdAt: z.number(), + completedAt: z.number().optional(), + status: z.enum(['running', 'completed', 'failed']), + + // Model info + provider: AiModelProviderSchema(), + model: z.string(), + + // Game info + aiColor: z.enum(['white', 'black']), + stockfishLevel: z.number(), // 1-20 + result: z.enum(['ai_win', 'stockfish_win', 'draw', 'ai_illegal_move', 'timeout']).optional(), + resultReason: z.string().optional(), + + // Moves + moves: z.array(StockfishGameMoveSchema), + totalMoves: z.number(), + finalFen: z.string().optional(), + pgn: z.string().optional(), + + // ACPL calculation (only for AI moves) + aiMoveCount: z.number().optional(), + totalCentipawnLoss: z.number().optional(), + averageCentipawnLoss: z.number().optional(), // ACPL + blunders: z.number().optional(), // moves with >100 centipawn loss + mistakes: z.number().optional(), // moves with 50-100 centipawn loss + inaccuracies: z.number().optional(), // moves with 25-50 centipawn loss +}) + +export const StockfishBenchmarkRunSchema = z.object({ + id: z.string(), + createdAt: z.number(), + completedAt: 
z.number().optional(), + status: z.enum(['running', 'completed', 'failed']), + + provider: AiModelProviderSchema(), + model: z.string(), + stockfishLevel: z.number(), + + // Two games: one as white, one as black + gameAsWhite: StockfishGameResultSchema.optional(), + gameAsBlack: StockfishGameResultSchema.optional(), + + // Combined stats + gamesPlayed: z.number(), + wins: z.number(), + losses: z.number(), + draws: z.number(), + overallAcpl: z.number().optional(), // Average ACPL across both games +}) + +export const StockfishBenchmarkSummarySchema = z.object({ + id: z.string(), + provider: AiModelProviderSchema(), + model: z.string(), + runsCompleted: z.number(), + averageAcpl: z.number(), + bestAcpl: z.number(), + wins: z.number(), + losses: z.number(), + draws: z.number(), + lastRunAt: z.number(), +}) + +export type StockfishGameMove = z.infer<typeof StockfishGameMoveSchema> +export type StockfishGameResult = z.infer<typeof StockfishGameResultSchema> +export type StockfishBenchmarkRun = z.infer<typeof StockfishBenchmarkRunSchema> +export type StockfishBenchmarkSummary = z.infer<typeof StockfishBenchmarkSummarySchema>