From 840f4db1b17098431469969eaecfb4af4afaf538 Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Wed, 7 Jan 2026 13:25:54 -0800 Subject: [PATCH 1/5] update wandb with cloud agent integration --- AGENTS.md | 17 +++- README.md | 41 ++++++---- .../src/lib/tax-processing-service.ts | 8 +- packages/scripts/src/harness-lib-async.ts | 1 + packages/scripts/src/harness-lib.ts | 1 + packages/scripts/src/step1-runloop-setup.ts | 77 +++++++++++++++++++ packages/scripts/src/step1-start-devbox.ts | 6 +- .../src/step1-start-prebuild-devbox.ts | 6 +- .../src/step3-generate-expected-outputs.ts | 6 +- 9 files changed, 138 insertions(+), 25 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 8184c9a..fca6a54 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -219,16 +219,18 @@ This ensures the agent can write to `/tmp` for tax document processing without r ### Environment Variables -**Agent Server (`packages/tax-processing/.env`):** +**Agent Server (`packages/tax-processing/.env`) - Local Development Only:** ```env PORT=3001 FRONTEND_URL=http://localhost:3000 OPENAI_API_KEY=your-api-key-here -# Optional: Weights & Biases Weave for LLM tracing +# Optional: Weights & Biases Weave for LLM tracing (local dev only) WANDB_API_KEY=your-wandb-api-key-here ``` +**Note**: When running on Runloop devboxes, API keys must be configured as Runloop secrets, not local environment variables. The `OPENAI_API_KEY` and `WANDB_API_KEY` secrets are automatically injected into devboxes from the Runloop secret store. + **Frontend (`packages/frontend/.env.local`):** ```env NEXT_PUBLIC_AGENT_URL=http://localhost:3001 @@ -244,7 +246,14 @@ GITHUB_TOKEN=your-token-here # Required for private repos The tax agent integrates with [Weights & Biases Weave](https://docs.wandb.ai/weave/) for comprehensive LLM call tracing and monitoring. -**Setup:** +**Setup for Runloop Devboxes:** +1. Get your W&B API key from https://wandb.ai/authorize +2. 
Add `WANDB_API_KEY` to Runloop secrets: + - Option A: Use the Runloop Settings page at https://platform.runloop.ai/settings + - Option B: Run `pnpm step1_runloop_setup` - it will prompt you to add the secret +3. When the agent runs on a devbox, Weave initializes automatically if the secret is present + +**Setup for Local Development:** 1. Get your W&B API key from https://wandb.ai/authorize 2. Add `WANDB_API_KEY=your-key` to `packages/tax-processing/.env` 3. Start the agent server - Weave initializes automatically @@ -262,7 +271,7 @@ The tax agent integrates with [Weights & Biases Weave](https://docs.wandb.ai/wea - Explore traces, latency distributions, and token usage **Configuration:** -The CodexService automatically initializes Weave if `WANDB_API_KEY` is present. Look for: +The CodexService automatically initializes Weave if `WANDB_API_KEY` is present in the environment (from Runloop secrets on devboxes, or from local `.env` file). Look for: - `[CodexService] Weave tracing initialized successfully` - Weave is active - `[CodexService] Weave tracing disabled: WANDB_API_KEY not set` - Running without Weave diff --git a/README.md b/README.md index 804eb45..16c9967 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,9 @@ best. We can perform quick experiments to measure performance after making chang c. Create a Secret for the OpenAI key you just generated in the [https://platform.runloop.ai/settings](settings) page. Name the secret name `OPENAI_API_KEY` and paste the key value from the OpenAI site. - d. Now configure your environment: + d. (Optional) Create a Secret for your W&B API key for Weave LLM tracing. Get your W&B API key from [https://wandb.ai/authorize](https://wandb.ai/authorize), then create a secret named `WANDB_API_KEY` in the [settings](https://platform.runloop.ai/settings) page. This enables Weave tracing on Runloop devboxes. Note: The `pnpm step1_runloop_setup` script will also prompt you for this. + + e. 
Now configure your environment: ```bash export RUNLOOP_API_KEY= @@ -73,7 +75,7 @@ best. We can perform quick experiments to measure performance after making chang Open the `.env` file and update values where prompted. - e. Launch the environment to test the setup: + f. Launch the environment to test the setup: ```bash pnpm dev @@ -85,27 +87,34 @@ best. We can perform quick experiments to measure performance after making chang 3. **(Optional) Enable Weave Tracing** - To track and monitor LLM calls with Weights & Biases Weave: - - a. Get your W&B API key from [https://wandb.ai/authorize](https://wandb.ai/authorize) - - b. Add the API key to `packages/tax-processing/.env`: + The tax agent integrates with [Weights & Biases Weave](https://docs.wandb.ai/weave/) for comprehensive LLM call tracing and monitoring. - ```bash - WANDB_API_KEY=your-wandb-api-key-here - ``` + **For Local Development:** + - Get your W&B API key from [https://wandb.ai/authorize](https://wandb.ai/authorize) + - Add the API key to `packages/tax-processing/.env`: + ```bash + WANDB_API_KEY=your-wandb-api-key-here + ``` + - When the agent server starts locally, you'll see: + - Success: `[CodexService] Weave tracing initialized successfully` + - Disabled: `[CodexService] Weave tracing disabled: WANDB_API_KEY not set` - c. When the agent server starts, you'll see: - - Success: `[CodexService] Weave tracing initialized successfully` - - Disabled: `[CodexService] Weave tracing disabled: WANDB_API_KEY not set` + **For Runloop Devboxes:** + - The `WANDB_API_KEY` must be configured as a Runloop secret (see step 2d above) + - The `pnpm step1_runloop_setup` script will prompt you to add this secret + - Weave automatically initializes when the agent runs on a devbox if the secret is present - d. 
View traces in your Weave dashboard at [https://wandb.ai/](https://wandb.ai/) + **Viewing Traces:** + - Navigate to [https://wandb.ai/](https://wandb.ai/) + - Select project: `tax-preparation-agent` + - Explore traces, latency distributions, and token usage Weave automatically captures: - All OpenAI API calls made through the Codex SDK - - Input prompts and output responses - - Token usage and latency metrics + - Complete input prompts and output responses + - Token usage, latency, and cost metrics - Error traces and debugging information + - Agent tool usage (taxctl commands) **Note**: Weave is completely optional. The system works without it. diff --git a/packages/frontend/src/lib/tax-processing-service.ts b/packages/frontend/src/lib/tax-processing-service.ts index d98cc64..82de52d 100644 --- a/packages/frontend/src/lib/tax-processing-service.ts +++ b/packages/frontend/src/lib/tax-processing-service.ts @@ -84,8 +84,12 @@ export class TaxProcessingService { CODEX_SKIP_GIT_REPO_CHECK: 'true', RUNLOOP_DEVBOX: '1', }, - // wire in the OpenAI key from the Runloop secret store - secrets: { OPENAI_API_KEY: 'OPENAI_API_KEY' }, + secrets: { + // wire in the OpenAI key from the Runloop secret store + OPENAI_API_KEY: 'OPENAI_API_KEY', + // optionally wire in the WANDB key from the Runloop secret store + // WANDB_API_KEY: 'WANDB_API_KEY', + }, }); logger.log(`Devbox created: ${this.devbox.id}`); diff --git a/packages/scripts/src/harness-lib-async.ts b/packages/scripts/src/harness-lib-async.ts index d7969e9..e61c110 100644 --- a/packages/scripts/src/harness-lib-async.ts +++ b/packages/scripts/src/harness-lib-async.ts @@ -297,6 +297,7 @@ export async function startBenchmarkRun( ], secrets: { OPENAI_API_KEY: 'OPENAI_API_KEY', + WANDB_API_KEY: 'WANDB_API_KEY', }, } as any, }); diff --git a/packages/scripts/src/harness-lib.ts b/packages/scripts/src/harness-lib.ts index 3ab80fb..f6dbcae 100644 --- a/packages/scripts/src/harness-lib.ts +++ b/packages/scripts/src/harness-lib.ts @@ 
-119,6 +119,7 @@ async function startScenarioRun( ], secrets: { OPENAI_API_KEY: 'OPENAI_API_KEY', + WANDB_API_KEY: 'WANDB_API_KEY', }, } as any, }); diff --git a/packages/scripts/src/step1-runloop-setup.ts b/packages/scripts/src/step1-runloop-setup.ts index 93ccdaa..cbb4720 100644 --- a/packages/scripts/src/step1-runloop-setup.ts +++ b/packages/scripts/src/step1-runloop-setup.ts @@ -99,6 +99,78 @@ async function installOpenaiKey(runloop: RunloopSDK): Promise { } } +/** + * Install the W&B API key in the Runloop secret store (optional). + * This enables Weave LLM tracing on Runloop devboxes. + */ +async function installWandbKey(runloop: RunloopSDK): Promise { + try { + // Check if secret already exists + const secretsList = await runloop.api.secrets.list(); + const existingSecret = secretsList.secrets.find( + (s) => s.name === 'WANDB_API_KEY' + ); + + if (existingSecret) { + console.log( + ' ✓ WANDB_API_KEY already exists in Runloop secret store (skipping)' + ); + return; + } + + // Get default value from environment + const defaultValue = process.env.WANDB_API_KEY; + + // Construct prompt message + let promptMessage = + '🔑 WANDB_API_KEY not found in Runloop secret store.\n Optional: Provide your W&B API key for Weave LLM tracing'; + promptMessage += '\n (Get your key from https://wandb.ai/authorize)'; + + if (defaultValue) { + const last4 = defaultValue.slice(-4); + promptMessage += `\n (press Enter to use env var: ...${last4})`; + } + promptMessage += '\n (press Enter to skip): '; + + // Prompt user interactively + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }); + + const userInput = await new Promise((resolve) => { + rl.question(promptMessage, (answer) => { + rl.close(); + resolve(answer); + }); + }); + + // Determine final value + const finalValue = userInput.trim() || defaultValue; + + if (!finalValue) { + console.log( + ' ⏭️ Skipping WANDB_API_KEY setup (Weave tracing will be disabled)' + ); + return; + } + + // 
Create secret in Runloop + await runloop.api.secrets.create({ + name: 'WANDB_API_KEY', + value: finalValue, + }); + + console.log(' ✓ WANDB_API_KEY installed in Runloop secret store'); + } catch (error) { + console.error( + `⚠️ Warning: Could not manage WANDB_API_KEY secret: ${error instanceof Error ? error.message : error}` + ); + console.log(' Continuing without Weave tracing...'); + // Don't exit - this is optional + } +} + async function main() { console.log('🚀 Starting Step 1 Runloop Setup...\n'); @@ -145,6 +217,11 @@ async function main() { await installOpenaiKey(runloop); console.log(''); + // Install W&B API key in secret store (optional) + console.log('🔑 Checking for WANDB_API_KEY in Runloop secret store...\n'); + await installWandbKey(runloop); + console.log(''); + // Check if resources already exist on Runloop console.log('🔍 Checking for existing resources on Runloop...\n'); diff --git a/packages/scripts/src/step1-start-devbox.ts b/packages/scripts/src/step1-start-devbox.ts index 68705d9..c36ba7b 100644 --- a/packages/scripts/src/step1-start-devbox.ts +++ b/packages/scripts/src/step1-start-devbox.ts @@ -71,7 +71,11 @@ async function main() { CODEX_SKIP_GIT_REPO_CHECK: 'true', RUNLOOP_DEVBOX: '1', // Tell the agent it's running on a devbox }, - secrets: { OPENAI_API_KEY: 'OPENAI_API_KEY' }, + secrets: { + OPENAI_API_KEY: 'OPENAI_API_KEY', + // optionally wire in the WANDB key from the Runloop secret store + // WANDB_API_KEY: 'WANDB_API_KEY', + }, launch_parameters: { keep_alive_time_seconds: 86400, // after_idle: { idle_time_seconds: 3600, on_idle: "suspend" } // exclusive with keep_alive_time_seconds diff --git a/packages/scripts/src/step1-start-prebuild-devbox.ts b/packages/scripts/src/step1-start-prebuild-devbox.ts index 476403b..fae7789 100644 --- a/packages/scripts/src/step1-start-prebuild-devbox.ts +++ b/packages/scripts/src/step1-start-prebuild-devbox.ts @@ -70,7 +70,11 @@ async function main() { CODEX_SKIP_GIT_REPO_CHECK: 'true', RUNLOOP_DEVBOX: 
'1', // Tell the agent it's running on a devbox }, - secrets: { OPENAI_API_KEY: 'OPENAI_API_KEY' }, + secrets: { + OPENAI_API_KEY: 'OPENAI_API_KEY', + // optionally wire in the WANDB key from the Runloop secret store + // WANDB_API_KEY: 'WANDB_API_KEY', + }, launch_parameters: { keep_alive_time_seconds: 86400, }, diff --git a/packages/scripts/src/step3-generate-expected-outputs.ts b/packages/scripts/src/step3-generate-expected-outputs.ts index a695435..bd39bc4 100644 --- a/packages/scripts/src/step3-generate-expected-outputs.ts +++ b/packages/scripts/src/step3-generate-expected-outputs.ts @@ -49,7 +49,11 @@ async function generateExpectedOutput( CODEX_SKIP_GIT_REPO_CHECK: 'true', RUNLOOP_DEVBOX: '1', }, - secrets: { OPENAI_API_KEY: 'OPENAI_API_KEY' }, + secrets: { + OPENAI_API_KEY: 'OPENAI_API_KEY', + // By step 3, tracing is beneficial so we'll require the WANDB key + WANDB_API_KEY: 'WANDB_API_KEY', + }, }); try { From a4a41ca10c23149653571a633dd1c704836b68e8 Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Wed, 7 Jan 2026 16:31:06 -0800 Subject: [PATCH 2/5] update with wandb --- .../src/lib/tax-processing-service.ts | 2 +- .../src/services/codex.service.ts | 44 +++++++++++++++++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/packages/frontend/src/lib/tax-processing-service.ts b/packages/frontend/src/lib/tax-processing-service.ts index 82de52d..7724def 100644 --- a/packages/frontend/src/lib/tax-processing-service.ts +++ b/packages/frontend/src/lib/tax-processing-service.ts @@ -88,7 +88,7 @@ export class TaxProcessingService { // wire in the OpenAI key from the Runloop secret store OPENAI_API_KEY: 'OPENAI_API_KEY', // optionally wire in the WANDB key from the Runloop secret store - // WANDB_API_KEY: 'WANDB_API_KEY', + WANDB_API_KEY: 'WANDB_API_KEY', }, }); diff --git a/packages/tax-processing/src/services/codex.service.ts b/packages/tax-processing/src/services/codex.service.ts index 32eaa35..f0474bc 100644 --- 
a/packages/tax-processing/src/services/codex.service.ts +++ b/packages/tax-processing/src/services/codex.service.ts @@ -1,5 +1,5 @@ import { Codex } from '@openai/codex-sdk'; -import { init as weaveInit } from 'weave'; +import * as weave from 'weave'; export interface AgentTurnResult { success: boolean; @@ -10,6 +10,8 @@ export interface AgentTurnResult { export class CodexService { private codex: Codex; private sandboxMode: 'danger-full-access' | 'workspace-write'; + private weaveInitialized: boolean = false; + private weaveInitPromise: Promise; constructor(_systemPrompt?: string) { if (!process.env.OPENAI_API_KEY) { @@ -19,7 +21,8 @@ export class CodexService { } // Initialize Weave for LLM tracing if API key is available - this.initializeWeave(); + // Store the promise so we can await it before first use + this.weaveInitPromise = this.initializeWeave(); // Set sandbox mode based on environment // Use 'danger-full-access' on Runloop Devbox (already sandboxed), 'workspace-write' locally @@ -54,25 +57,58 @@ export class CodexService { console.log( '[CodexService] Weave tracing disabled: WANDB_API_KEY not set' ); - return; + this.weaveInitialized = false; + return; } try { - await weaveInit('tax-preparation-agent'); + await weave.init('tax-preparation-agent'); + this.weaveInitialized = true; console.log('[CodexService] Weave tracing initialized successfully'); } catch (error) { console.error('[CodexService] Failed to initialize Weave:', error); // Don't throw - allow the service to continue without Weave + this.weaveInitialized = false; } } /** * Run a single agent turn with real-time event streaming * Calls the event handler for each event as it arrives, allowing real-time monitoring + * Wrapped with weave.op() to create traces that are automatically sent to Wandb */ async runAgentTurnStreaming( prompt: string, onEvent: (event: unknown) => void + ): Promise { + // Ensure Weave is initialized before creating traces + await this.weaveInitPromise; + + // Only wrap 
with weave.op() if Weave is initialized + if (this.weaveInitialized) { + // Wrap the actual implementation with weave.op() to create traces + // Weave automatically captures function name, inputs, outputs, and execution time + const tracedRunAgentTurn = weave.op( + this.runAgentTurnStreamingInternal.bind(this), + { + name: 'runAgentTurnStreaming', + } + ); + + return tracedRunAgentTurn(prompt, onEvent); + } else { + // If Weave is not initialized, run without tracing + return this.runAgentTurnStreamingInternal(prompt, onEvent); + } + } + + /** + * Internal implementation of agent turn execution + * Separated so we can wrap it with weave.op() for tracing + */ + private async runAgentTurnStreamingInternal( + prompt: string, + onEvent: (event: unknown) => void ): Promise { try { console.log('[runAgentTurnStreaming] Starting agent turn...'); From 08f15659065b8d95ce7120c9d0cdb7220cc58f2a Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Wed, 7 Jan 2026 17:00:49 -0800 Subject: [PATCH 3/5] update weave agent init --- .../src/services/codex.service.ts | 66 +++++++++++++++++-- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/packages/tax-processing/src/services/codex.service.ts b/packages/tax-processing/src/services/codex.service.ts index f0474bc..5d995da 100644 --- a/packages/tax-processing/src/services/codex.service.ts +++ b/packages/tax-processing/src/services/codex.service.ts @@ -64,7 +64,9 @@ export class CodexService { try { await weave.init('tax-preparation-agent'); this.weaveInitialized = true; - console.log('[CodexService] Weave tracing initialized successfully'); + console.log( + '[CodexService] Weave tracing initialized - traces will be sent to Wandb project "tax-preparation-agent"' + ); } catch (error) { console.error('[CodexService] Failed to initialize Weave:', error); // Don't throw - allow the service to continue without Weave @@ -111,8 +113,6 @@ export class CodexService { onEvent: (event: unknown) => void ): Promise { try { - 
console.log('[runAgentTurnStreaming] Starting agent turn...'); - console.log('[runAgentTurnStreaming] Working directory:', process.cwd()); // Create a fresh thread for this turn const thread = this.codex.startThread({ @@ -158,12 +158,68 @@ export class CodexService { type?: string; content?: string; text?: string; + command?: string; + exit_code?: number; + aggregated_output?: string; + summary?: string; }; error?: string; }; - // Collect agent messages for final result - if (e.type === 'item.completed' && e.item) { + // Create child traces for significant events if Weave is initialized + if (this.weaveInitialized && e.type === 'item.completed' && e.item) { + if (e.item.type === 'command_execution') { + // Wrap command execution as a child operation + const traceCommandExecution = weave.op( + async function traceCommandExecution( + command: string, + output: string, + exitCode?: number + ): Promise<{ command: string; output: string; exitCode?: number }> { + return { command, output, exitCode }; + } + ); + + await traceCommandExecution( + e.item.command || '', + e.item.aggregated_output || '', + e.item.exit_code + ); + } else if ( + e.item.type === 'agent_message' || + e.item.type === 'agentMessage' + ) { + // Wrap agent message as a child operation + const traceAgentMessage = weave.op( + async function traceAgentMessage( + message: string + ): Promise<{ message: string }> { + return { message }; + } + ); + + const content = e.item.content || e.item.text || ''; + if (content) { + await traceAgentMessage(content); + agentMessages.push(content); + } + } else if (e.item.type === 'reasoning') { + // Wrap reasoning as a child operation + const traceReasoning = weave.op( + async function traceReasoning( + reasoning: string + ): Promise<{ reasoning: string }> { + return { reasoning }; + } + ); + + const reasoningText = e.item.summary || e.item.text || ''; + if (reasoningText) { + await traceReasoning(reasoningText); + } + } + } else if (e.type === 'item.completed' && 
e.item) { + // Collect agent messages even if Weave is not initialized if ( e.item.type === 'agent_message' || e.item.type === 'agentMessage' From 5875bc69530643342e352611604cbb5b820c707e Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Wed, 7 Jan 2026 17:06:35 -0800 Subject: [PATCH 4/5] format --- .../src/services/codex.service.ts | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/packages/tax-processing/src/services/codex.service.ts b/packages/tax-processing/src/services/codex.service.ts index 5d995da..a8fcf2e 100644 --- a/packages/tax-processing/src/services/codex.service.ts +++ b/packages/tax-processing/src/services/codex.service.ts @@ -58,7 +58,7 @@ export class CodexService { '[CodexService] Weave tracing disabled: WANDB_API_KEY not set' ); this.weaveInitialized = false; - return; + return; } try { @@ -113,7 +113,6 @@ export class CodexService { onEvent: (event: unknown) => void ): Promise { try { - // Create a fresh thread for this turn const thread = this.codex.startThread({ workingDirectory: process.cwd(), @@ -175,7 +174,11 @@ export class CodexService { command: string, output: string, exitCode?: number - ): Promise<{ command: string; output: string; exitCode?: number }> { + ): Promise<{ + command: string; + output: string; + exitCode?: number; + }> { return { command, output, exitCode }; } ); @@ -190,13 +193,11 @@ export class CodexService { e.item.type === 'agentMessage' ) { // Wrap agent message as a child operation - const traceAgentMessage = weave.op( - async function traceAgentMessage( - message: string - ): Promise<{ message: string }> { - return { message }; - } - ); + const traceAgentMessage = weave.op(async function traceAgentMessage( + message: string + ): Promise<{ message: string }> { + return { message }; + }); const content = e.item.content || e.item.text || ''; if (content) { @@ -205,13 +206,11 @@ export class CodexService { } } else if (e.item.type === 'reasoning') { // Wrap reasoning as a child 
operation - const traceReasoning = weave.op( - async function traceReasoning( - reasoning: string - ): Promise<{ reasoning: string }> { - return { reasoning }; - } - ); + const traceReasoning = weave.op(async function traceReasoning( + reasoning: string + ): Promise<{ reasoning: string }> { + return { reasoning }; + }); const reasoningText = e.item.summary || e.item.text || ''; if (reasoningText) { From a446531c0cc07e29ebfc2bff70b9bc82f7737297 Mon Sep 17 00:00:00 2001 From: Tony Deng Date: Wed, 7 Jan 2026 17:58:00 -0800 Subject: [PATCH 5/5] comment out WANDB_API_KEY for initial push --- packages/frontend/src/lib/tax-processing-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/frontend/src/lib/tax-processing-service.ts b/packages/frontend/src/lib/tax-processing-service.ts index 7724def..82de52d 100644 --- a/packages/frontend/src/lib/tax-processing-service.ts +++ b/packages/frontend/src/lib/tax-processing-service.ts @@ -88,7 +88,7 @@ export class TaxProcessingService { // wire in the OpenAI key from the Runloop secret store OPENAI_API_KEY: 'OPENAI_API_KEY', // optionally wire in the WANDB key from the Runloop secret store - WANDB_API_KEY: 'WANDB_API_KEY', + // WANDB_API_KEY: 'WANDB_API_KEY', }, });