Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
26b98b2
feat: Checkpoint 1 - Benchmark methodology & transparency
Bot-Rakshit Dec 5, 2025
31e3757
fix: improve methodology page UI and scrolling
Bot-Rakshit Dec 5, 2025
272ca65
fix: improve landing page button layout with grid
Bot-Rakshit Dec 5, 2025
dbf2763
feat: redesign landing page as benchmark-focused layout
Bot-Rakshit Dec 5, 2025
0fa502d
refactor: preserve original landing page layout with benchmark updates
Bot-Rakshit Dec 5, 2025
8b7e93d
feat: add game history types and stream
Bot-Rakshit Dec 10, 2025
421ff0d
feat: implement game history backend
Bot-Rakshit Dec 10, 2025
29a2a14
feat: add game history UI with replay viewer
Bot-Rakshit Dec 10, 2025
dca34e8
chore: add MOTIA_DOCS.md to gitignore
Bot-Rakshit Dec 10, 2025
e4eb92c
fix: address code review issues
Bot-Rakshit Dec 10, 2025
0d19ded
feat: update AI models
Bot-Rakshit Dec 14, 2025
52a6eeb
feat: add legal move benchmark types and streams
Bot-Rakshit Dec 14, 2025
9b18ab0
feat: implement legal move generation benchmark
Bot-Rakshit Dec 14, 2025
8e134c1
feat: add puzzle benchmark types and streams
Bot-Rakshit Dec 14, 2025
b111818
feat: implement puzzle benchmark system
Bot-Rakshit Dec 14, 2025
be43e6d
refactor: remove variant toggle, use guided mode only
Bot-Rakshit Dec 14, 2025
7b0acc1
fix: use GEMINI_API_KEY instead of GOOGLE_API_KEY in benchmarks
Bot-Rakshit Dec 14, 2025
b733338
fix: correct puzzle parsing by playing setup move after initialPly
Bot-Rakshit Dec 14, 2025
9f7a461
fix: use queryParams array instead of querySchema for Motia API routes
Bot-Rakshit Dec 14, 2025
5b3efb9
fix: add rate limit retry logic and increase delay for Lichess API
Bot-Rakshit Dec 14, 2025
0a39348
refactor: use Lichess batch API for faster puzzle fetching
Bot-Rakshit Dec 14, 2025
55a7198
feat: use shared position set for fair benchmark comparison
Bot-Rakshit Dec 14, 2025
b0f1abf
fix: correct position set stream config format
Bot-Rakshit Dec 14, 2025
135c466
fix: use F1 scoring instead of harsh penalty for legal move benchmark
Bot-Rakshit Dec 14, 2025
0d45840
feat: add Human vs AI game mode with weighted random matching
Bot-Rakshit Dec 14, 2025
ba3b22c
fix: improve legal move benchmark prompt with clearer SAN examples
Bot-Rakshit Dec 14, 2025
f50fdf3
feat: add run-all-benchmarks endpoint for parallel execution
Bot-Rakshit Dec 14, 2025
3b66e71
fix: correct useAuth import and use apiClient in play-ai-page
Bot-Rakshit Dec 14, 2025
fd3728e
fix: run benchmarks in batches of 3 to avoid rate limits
Bot-Rakshit Dec 14, 2025
99d7411
refactor: run benchmarks position-by-position with one model per prov…
Bot-Rakshit Dec 14, 2025
b8967a2
feat: add Stockfish benchmark to measure ACPL
Bot-Rakshit Dec 14, 2025
f065539
fix: reduce benchmark prompt timeout from 2min to 1min
Bot-Rakshit Dec 14, 2025
ad4b551
feat: add detailed logging to benchmark prompts
Bot-Rakshit Dec 14, 2025
4f82b5c
fix: use Promise.race timeout instead of AbortSignal
Bot-Rakshit Dec 14, 2025
4c3e642
fix: use streamObject instead of generateObject for benchmark
Bot-Rakshit Dec 14, 2025
7b9bd92
fix: remove fire and forget from the api
Bot-Rakshit Dec 19, 2025
5e49c8d
add thinking efforts in models
Bot-Rakshit Dec 19, 2025
108b86f
add batch api
Bot-Rakshit Dec 19, 2025
35bf6ba
rechart graphs
Bot-Rakshit Dec 19, 2025
37abcb7
revert landing assets
Bot-Rakshit Dec 27, 2025
6a868ff
bench page and methodology improvements
Bot-Rakshit Jan 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ dist
.env
.env.production
api/lib/*
deploy.sh
deploy.sh
MOTIA_DOCS.md
81 changes: 77 additions & 4 deletions api/motia-workbench.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@
{
"id": "chess",
"config": {
"steps/chess/12-play-vs-ai.step.ts": {
"x": 350,
"y": 1056
},
"steps/chess/11-get-game-history-detail.step.ts": {
"x": 0,
"y": 0
},
"steps/chess/10b-export-game-history.step.ts": {
"x": 6,
"y": 224
},
"steps/chess/10-get-game-history.step.ts": {
"x": 26,
"y": 428
},
"steps/chess/09-purge-stuck-games.step.ts": {
"x": -237,
"y": 580
Expand Down Expand Up @@ -50,17 +66,17 @@
"y": 112
},
"steps/chess/02-get-game.step.ts": {
"x": 1045,
"y": 114
"x": 1019,
"y": 92
},
"steps/chess/01-create-game.step.ts": {
"x": 432,
"y": 111,
"targetHandlePosition": "left"
},
"steps/chess/00-available-models-api.step.ts": {
"x": -209,
"y": 86,
"x": -460,
"y": 111,
"sourceHandlePosition": "right"
},
"steps/chess/access/request-access.step.ts": {
Expand All @@ -76,5 +92,62 @@
"y": 680
}
}
},
{
"id": "benchmark",
"config": {
"steps/benchmark/11-stockfish-leaderboard.step.ts": {
"x": 0,
"y": 2444
},
"steps/benchmark/10-run-stockfish-benchmark.step.ts": {
"x": 0,
"y": 2220
},
"steps/benchmark/09-run-all-benchmarks.step.ts": {
"x": -282,
"y": 337
},
"steps/benchmark/08-get-puzzle-sets.step.ts": {
"x": 435,
"y": 30
},
"steps/benchmark/07-get-puzzle-leaderboard.step.ts": {
"x": 3,
"y": 204
},
"steps/benchmark/06b-run-all-puzzle-benchmarks.step.ts": {
"x": 0,
"y": 1120
},
"steps/benchmark/06-run-puzzle-benchmark.step.ts": {
"x": 465,
"y": 360
},
"steps/benchmark/05-fetch-puzzle-set.step.ts": {
"x": 0,
"y": 612
},
"steps/benchmark/04-get-benchmark-leaderboard.step.ts": {
"x": 0,
"y": 816
},
"steps/benchmark/03-get-benchmark-run-detail.step.ts": {
"x": 0,
"y": 1060
},
"steps/benchmark/02-get-benchmark-runs.step.ts": {
"x": 0,
"y": 1304
},
"steps/benchmark/01-run-legal-move-benchmark.step.ts": {
"x": 0,
"y": 1528
},
"steps/benchmark/00-generate-position-set.step.ts": {
"x": 0,
"y": 1752
}
}
}
]
6 changes: 5 additions & 1 deletion api/services/ai/claude.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@ import { streamObject } from 'ai'
import { createAnthropic } from '@ai-sdk/anthropic'
import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
import { models } from './models'
import { getMaxReasoningProviderOptions } from './provider-options'
import { Handler } from './types'

export const claude: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => {
const anthropic = createAnthropic({
apiKey: process.env.ANTHROPIC_API_KEY,
})

const modelId = model ?? models.claude
const { partialObjectStream, object } = streamObject({
model: anthropic(model ?? models.claude),
model: anthropic(modelId),
prompt,
schema: AiPlayerPromptSchema,
mode: 'json',
maxRetries: 0,
abortSignal: AbortSignal.timeout(180000),
providerOptions: getMaxReasoningProviderOptions('claude', modelId),
})

for await (const partialObject of partialObjectStream) {
Expand Down
5 changes: 4 additions & 1 deletion api/services/ai/gemini.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,22 @@ import { streamObject } from 'ai'
import { createGoogleGenerativeAI } from '@ai-sdk/google'
import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
import { models } from './models'
import { getMaxReasoningProviderOptions } from './provider-options'
import { Handler } from './types'

export const gemini: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => {
const googleAI = createGoogleGenerativeAI({
apiKey: process.env.GEMINI_API_KEY,
})

const modelId = model ?? models.gemini
const { partialObjectStream, object } = streamObject({
model: googleAI(model ?? models.gemini),
model: googleAI(modelId),
prompt,
schema: AiPlayerPromptSchema,
maxRetries: 0,
abortSignal: AbortSignal.timeout(180000),
providerOptions: getMaxReasoningProviderOptions('gemini', modelId),
})

for await (const partialObject of partialObjectStream) {
Expand Down
6 changes: 5 additions & 1 deletion api/services/ai/grok.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@ import { streamObject } from 'ai'
import { createXai } from '@ai-sdk/xai'
import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
import { models } from './models'
import { getMaxReasoningProviderOptions } from './provider-options'
import { Handler } from './types'

export const grok: Handler = async ({ prompt, logger, model, onThoughtUpdate }) => {
const xai = createXai({
apiKey: process.env.XAI_API_KEY,
})

const modelId = model ?? models.grok
const { partialObjectStream, object } = streamObject({
model: xai(model ?? models.grok),
model: xai(modelId),
prompt,
schema: AiPlayerPromptSchema,
mode: 'json',
maxRetries: 0,
abortSignal: AbortSignal.timeout(180000),
providerOptions: getMaxReasoningProviderOptions('grok', modelId),
})

for await (const partialObject of partialObjectStream) {
Expand Down
103 changes: 70 additions & 33 deletions api/services/ai/models.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,84 @@
import { AiModels, AiProviderDefaultModel } from '@chessarena/types/ai-models'
import { AiModels, AiModelProvider, AiProviderDefaultModel } from '@chessarena/types/ai-models'

// NOTE: these are the models used for AI vs AI games; they are also used for backwards compatibility with existing games that don't have a model assigned to a player
// IMPORTANT: These must match model names in supportedModelsByProvider below!
export const models: AiProviderDefaultModel = {
openai: 'gpt-5-2025-08-07',
openai: 'gpt-5.2',
gemini: 'gemini-2.5-flash',
claude: 'claude-sonnet-4-5-20250929',
grok: 'grok-4-fast',
claude: 'claude-sonnet-4-5',
grok: 'grok-4-fast-non-reasoning',
}

// NOTE: these are all the models supported by provider that users can pick in order to play human vs AI games
/**
* ============================================
* BENCHMARK MODELS - Add new models here!
* ============================================
*
* To add a new model for benchmarking:
* 1. Add it to the appropriate provider array below
* 2. Restart the dev server
* 3. Run the benchmark: POST /benchmark/legal-moves/run-all
*
* To run benchmark for a single model:
* POST /benchmark/legal-moves/run { "provider": "claude", "model": "claude-3-5-haiku-20241022" }
*
* Provider documentation:
* - OpenAI: https://platform.openai.com/docs/models
* - Gemini: https://ai.google.dev/gemini-api/docs/models
* - Claude: https://docs.anthropic.com/en/docs/about-claude/models/overview
* - Grok: https://docs.x.ai/docs/models
*/
export const supportedModelsByProvider: AiModels = {
// From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/openai
openai: [
// https://platform.openai.com/docs/models
'gpt-5-2025-08-07',
'gpt-5-mini-2025-08-07',
'gpt-5-nano-2025-08-07',
'gpt-4.1-nano-2025-04-14',
'gpt-4.1-mini-2025-04-14',
'gpt-4o-mini-2024-07-18',
'o4-mini-2025-04-16',
'gpt-5.2', // Latest
'gpt-5.1', // Previous flagship
'gpt-5', // GPT-5
'gpt-5-mini', // Fast
'gpt-4.1', // GPT-4.1
'gpt-4.1-mini', // Fast GPT-4.1
'gpt-4o', // GPT-4o
'gpt-4o-mini', // Fast GPT-4o
],
// From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/google-generative-ai
gemini: [
// https://ai.google.dev/gemini-api/docs/models
'gemini-2.5-flash',
'gemini-2.5-flash-lite',
'gemini-2.0-flash-001',
'gemini-2.0-flash-lite-001',
'gemini-3-pro-preview', // Latest preview
'gemini-2.5-pro', // Latest pro
'gemini-2.5-flash', // Fast
'gemini-2.5-flash-lite', // Ultra fast
'gemini-2.0-flash', // Stable flash
],
// From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/anthropic
claude: [
// https://docs.anthropic.com/en/docs/about-claude/models/overview
'claude-opus-4-1-20250805',
'claude-opus-4-20250514',
'claude-sonnet-4-5-20250929',
'claude-sonnet-4-20250514',
'claude-3-7-sonnet-20250219',
'claude-haiku-4-5-20251001',
'claude-3-5-haiku-20241022',
],
grok: [
// https://docs.x.ai/docs/models
'grok-4-fast',
'grok-4-fast-non-reasoning',
'grok-3-mini',
'grok-3',
'claude-opus-4-5', // Latest opus (no dot!)
'claude-sonnet-4-5', // Latest sonnet (no dot!)
'claude-haiku-4-5', // Latest haiku (no dot!)
'claude-opus-4-0', // Opus 4.0
'claude-sonnet-4-0', // Sonnet 4.0
'claude-3-7-sonnet-latest', // Claude 3.7
'claude-3-5-haiku-latest', // Claude 3.5 Haiku
],
// From AI SDK docs: https://sdk.vercel.ai/providers/ai-sdk-providers/xai
grok: ['grok-4-fast-non-reasoning', 'grok-4-fast-reasoning', 'grok-3-fast'],
}

/**
 * Flattens `supportedModelsByProvider` into a single list of
 * `{ provider, model }` pairs, one entry per listed model.
 * Providers come out in the map's key order and each provider's models keep
 * the order in which they are listed. Used by benchmarks.
 */
export const getAllModels = (): { provider: AiModelProvider; model: string }[] =>
  Object.entries(supportedModelsByProvider).flatMap(([providerKey, providerModels]) =>
    // Object.entries yields plain string keys, so narrow back to the provider union.
    providerModels.map((model) => ({ provider: providerKey as AiModelProvider, model })),
  )

/**
 * Looks up the models listed for `provider` in `supportedModelsByProvider`.
 * Returns the stored array itself (not a copy), or an empty array when the
 * lookup yields nothing.
 */
export const getModelsForProvider = (provider: AiModelProvider): string[] => {
  const providerModels = supportedModelsByProvider[provider]
  return providerModels ?? []
}
5 changes: 4 additions & 1 deletion api/services/ai/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,22 @@ import { AiPlayerPromptSchema } from '@chessarena/types/ai-models'
import { createOpenAI } from '@ai-sdk/openai'
import { streamObject } from 'ai'
import { models } from './models'
import { getMaxReasoningProviderOptions } from './provider-options'
import { Handler } from './types'

export const openai: Handler = async ({ model, logger, prompt, onThoughtUpdate }) => {
const openai = createOpenAI({
apiKey: process.env.OPENAI_API_KEY,
})

const modelId = model ?? models.openai
const { partialObjectStream, object } = streamObject({
model: openai(model ?? models.openai),
model: openai(modelId),
prompt,
schema: AiPlayerPromptSchema,
maxRetries: 0,
abortSignal: AbortSignal.timeout(180000),
providerOptions: getMaxReasoningProviderOptions('openai', modelId),
})

for await (const partialObject of partialObjectStream) {
Expand Down
Loading