diff --git a/bun.lock b/bun.lock index a768e33..1f47721 100644 --- a/bun.lock +++ b/bun.lock @@ -11,7 +11,7 @@ }, "packages/code-chunk": { "name": "code-chunk", - "version": "0.1.0", + "version": "0.1.12", "dependencies": { "effect": "^3.19.12", "tree-sitter-go": "^0.25.0", @@ -38,21 +38,31 @@ "name": "@supermemory/eval", "version": "0.1.0", "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.1.75", + "@anthropic-ai/sdk": "^0.71.2", "code-chunk": "workspace:*", - "openai": "^4.0.0", + "dotenv": "^16.4.0", + "zod": "^3.24.0", }, "devDependencies": { "@types/bun": "^1.3.4", + "typescript": "^5.0.0", }, }, }, "packages": { + "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.1.76", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.33.5", "@img/sharp-darwin-x64": "^0.33.5", "@img/sharp-linux-arm": "^0.33.5", "@img/sharp-linux-arm64": "^0.33.5", "@img/sharp-linux-x64": "^0.33.5", "@img/sharp-linuxmusl-arm64": "^0.33.5", "@img/sharp-linuxmusl-x64": "^0.33.5", "@img/sharp-win32-x64": "^0.33.5" }, "peerDependencies": { "zod": "^3.24.1 || ^4.0.0" } }, "sha512-s7RvpXoFaLXLG7A1cJBAPD8ilwOhhc/12fb5mJXRuD561o4FmPtQ+WRfuy9akMmrFRfLsKv8Ornw3ClGAPL2fw=="], + + "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.71.2", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-TGNDEUuEstk/DKu0/TflXAEt+p+p/WhTlFzEnoosvbaDU2LTjm42igSdlL0VijrKpWejtOKxX0b8A7uc+XiSAQ=="], + "@babel/helper-string-parser": ["@babel/helper-string-parser@7.27.1", "", {}, "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA=="], "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.28.5", "", {}, "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q=="], "@babel/parser": ["@babel/parser@7.28.5", "", { "dependencies": { "@babel/types": "^7.28.5" }, "bin": "./bin/babel-parser.js" }, "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ=="], + "@babel/runtime": ["@babel/runtime@7.28.4", "", {}, "sha512-Q/N6JNWvIvPnLDvjlE1OUBLPQHH6l3CltCEsHIujp45zQUSSh8K+gHnaEX45yAT1nyngnINhvWtzN+Nb9D8RAQ=="], + "@babel/types": ["@babel/types@7.28.5", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.28.5" } }, "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA=="], "@biomejs/biome": ["@biomejs/biome@2.3.10", "", { "optionalDependencies": { "@biomejs/cli-darwin-arm64": "2.3.10", "@biomejs/cli-darwin-x64": "2.3.10", "@biomejs/cli-linux-arm64": "2.3.10", "@biomejs/cli-linux-arm64-musl": "2.3.10", "@biomejs/cli-linux-x64": "2.3.10", "@biomejs/cli-linux-x64-musl": "2.3.10", "@biomejs/cli-win32-arm64": "2.3.10", "@biomejs/cli-win32-x64": "2.3.10" }, "bin": { "biome": "bin/biome" } }, "sha512-/uWSUd1MHX2fjqNLHNL6zLYWBbrJeG412/8H7ESuK8ewoRoMPUgHDebqKrPTx/5n6f17Xzqc9hdg3MEqA5hXnQ=="], @@ -83,6 +93,36 @@ "@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.1.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-WI0DdZ8xFSbgMjR1sFsKABJ/C5OnRrjT06JXbZKexJGrDuPTzZdDYfFlsgcCXCyf+suG5QU2e/y1Wo2V/OapLQ=="], + "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.0.4" }, "os": "darwin", "cpu": "arm64" }, 
"sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ=="], + + "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.0.4" }, "os": "darwin", "cpu": "x64" }, "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q=="], + + "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.0.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg=="], + + "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.0.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ=="], + + "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.0.5", "", { "os": "linux", "cpu": "arm" }, "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g=="], + + "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA=="], + + "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw=="], + + "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA=="], + + "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw=="], + + "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.0.5" }, "os": "linux", "cpu": "arm" }, "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ=="], + + "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA=="], + + "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA=="], + + "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g=="], + + "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw=="], + + "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.33.5", "", { "os": "win32", "cpu": "x64" }, "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg=="], + "@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.0", "", { "dependencies": { 
"@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1", "@tybys/wasm-util": "^0.10.1" } }, "sha512-Fq6DJW+Bb5jaWE69/qOE0D1TUN9+6uWhCeZpdnSBk14pjLcCWR7Q8n49PTSPHazM37JqrsdpEthXy2xn6jWWiA=="], "@oxc-minify/binding-android-arm64": ["@oxc-minify/binding-android-arm64@0.93.0", "", { "os": "android", "cpu": "arm64" }, "sha512-N3j/JoK4hXwQbnyOJoEltM8MEkddWV3XtfYimO6jsMjr5R6QdauKaSVeQHO/lSezB7SFkrMPqr6X7tBfghHiXA=="], @@ -195,18 +235,10 @@ "@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="], - "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], - - "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], - - "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="], - "ansis": ["ansis@4.2.0", "", {}, "sha512-HqZ5rWlFjGiV0tDm3UxxgNRqsOTniqoKZu0pIAfh7TZQMGuZK+hH0drySty0si0QXj1ieop4+SkSfPZBPPkHig=="], "args-tokenizer": ["args-tokenizer@0.3.0", "", {}, "sha512-xXAd7G2Mll5W8uo37GETpQ2VrE84M181Z7ugHFGQnJZ50M2mbOv0osSZ9VsSgPfJQ+LVG0prSi0th+ELMsno7Q=="], - "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], - "bumpp": ["bumpp@10.3.2", "", { "dependencies": { "ansis": "^4.2.0", "args-tokenizer": "^0.3.0", "c12": "^3.3.2", "cac": "^6.7.14", "escalade": "^3.2.0", "jsonc-parser": "^3.3.1", "package-manager-detector": "^1.5.0", "semver": "^7.7.3", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "yaml": "^2.8.1" }, "bin": { "bumpp": "bin/bumpp.mjs" } }, "sha512-yUUkVx5zpTywLNX97MlrqtpanI7eMMwFwLntWR2EBVDw3/Pm3aRIzCoDEGHATLIiHK9PuJC7xWI4XNWqXItSPg=="], "bun-types": ["bun-types@1.3.5", "", { "dependencies": { "@types/node": "*" } }, "sha512-inmAYe2PFLs0SUbFOWSVD24sg1jFlMPxOjOSSCYqUgn4Hsc3rDc7dFvfVYjFPNHtov6kgUeulV4SxbuIV/stPw=="], @@ -217,8 +249,6 @@ "cac": ["cac@6.7.14", "", {}, "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ=="], - "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], - "chokidar": ["chokidar@5.0.0", "", { "dependencies": { "readdirp": "^5.0.0" } }, "sha512-TQMmc3w+5AxjpL8iIiwebF73dRDF4fBIieAqGn9RGCWaEVwQ6Fb2cGe31Yns0RRIzii5goJ1Y7xbMwo1TxMplw=="], "citty": ["citty@0.1.6", "", { "dependencies": { "consola": "^3.2.3" } }, "sha512-tskPPKEs8D2KPafUypv2gxwJP8h/OaJmC82QQGGDQcHvXX43xF2VDACcJVmZ0EuSxkpO9Kc4MlrA3q0+FG58AQ=="], @@ -227,70 +257,34 @@ "coffi": ["coffi@0.1.37", "", { "dependencies": { "strip-json-comments": "^5.0.3" } }, "sha512-ewO5Xis7sw7g54yI/3lJ/nNV90Er4ZnENeDORZjrs58T70MmwKFLZgevraNCz+RmB4KDKsYT1ui1wDB36iPWqQ=="], - "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], - "confbox": ["confbox@0.2.2", "", {}, 
"sha512-1NB+BKqhtNipMsov4xI/NnhCKp9XG9NamYp5PVm9klAT0fsrNPjaFICsCFhNhwZJKNh7zB/3q8qXz0E9oaMNtQ=="], "consola": ["consola@3.4.2", "", {}, "sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA=="], "defu": ["defu@6.1.4", "", {}, "sha512-mEQCMmwJu317oSz8CwdIOdwf3xMif1ttiM8LTufzc3g6kR+9Pe236twL8j3IYT1F7GfRgGcW6MWxzZjLIkuHIg=="], - "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], - "destr": ["destr@2.0.5", "", {}, "sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA=="], "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], - "dotenv": ["dotenv@17.2.3", "", {}, "sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w=="], - - "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="], "effect": ["effect@3.19.13", "", { "dependencies": { "@standard-schema/spec": "^1.0.0", "fast-check": "^3.23.1" } }, "sha512-8MZ783YuHRwHZX2Mmm+bpGxq+7XPd88sWwYAz2Ysry80sEKpftDZXs2Hg9ZyjESi1IBTNHF0oDKe0zJRkUlyew=="], - "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], - - "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], - - "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], - - "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], - "escalade": ["escalade@3.2.0", "", {}, "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA=="], - "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], - "exsolve": ["exsolve@1.0.8", "", {}, "sha512-LmDxfWXwcTArk8fUEnOfSZpHOJ6zOMUJKOtFLFqJLoKJetuQG874Uc7/Kki7zFLzYybmZhp1M7+98pfMqeX8yA=="], "fast-check": ["fast-check@3.23.2", "", { "dependencies": { "pure-rand": "^6.1.0" } }, "sha512-h5+1OzzfCC3Ef7VbtKdcv7zsstUQwUDlYpUTvjeUsJAssPgLn7QzbboPtL5ro04Mq0rPOsMzl7q5hIbRs2wD1A=="], "fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], - "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], - - "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, 
"sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="], - - "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="], - - "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], - - "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], - - "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], - "giget": ["giget@2.0.0", "", { "dependencies": { "citty": "^0.1.6", "consola": "^3.4.0", "defu": "^6.1.4", "node-fetch-native": "^1.6.6", "nypm": "^0.6.0", "pathe": "^2.0.3" }, "bin": { "giget": "dist/cli.mjs" } }, "sha512-L5bGsVkxJbJgdnwyuheIunkGatUF/zssUoxxjACCseZYAVbaqdh9Tsmmlkl8vYan09H7sbvKt4pS8GqKLBrEzA=="], - "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], - - "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], - - "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], - - "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], - - "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], - "jiti": ["jiti@2.6.1", "", { "bin": { "jiti": "lib/jiti-cli.mjs" } }, "sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ=="], + "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="], + "jsonc-parser": ["jsonc-parser@3.3.1", "", {}, "sha512-HUgH65KyejrUFPvHFPbqOY0rsFip3Bo5wb4ngvdi1EpCYWUQDC5V+Y7mZws+DLkr4M//zQJoanu1SP+87Dv1oQ=="], "lightningcss": ["lightningcss@1.30.2", "", { "dependencies": { "detect-libc": "^2.0.3" }, "optionalDependencies": { "lightningcss-android-arm64": "1.30.2", "lightningcss-darwin-arm64": "1.30.2", "lightningcss-darwin-x64": "1.30.2", "lightningcss-freebsd-x64": "1.30.2", "lightningcss-linux-arm-gnueabihf": "1.30.2", "lightningcss-linux-arm64-gnu": "1.30.2", "lightningcss-linux-arm64-musl": "1.30.2", "lightningcss-linux-x64-gnu": "1.30.2", "lightningcss-linux-x64-musl": "1.30.2", "lightningcss-win32-arm64-msvc": "1.30.2", "lightningcss-win32-x64-msvc": "1.30.2" } }, "sha512-utfs7Pr5uJyyvDETitgsaqSyjCb2qNRAtuqUeWIAKztsOYdcACf2KtARYXg2pSvhkt+9NfoaNY7fxjl6nuMjIQ=="], @@ 
-317,20 +311,8 @@ "lightningcss-win32-x64-msvc": ["lightningcss-win32-x64-msvc@1.30.2", "", { "os": "win32", "cpu": "x64" }, "sha512-5g1yc73p+iAkid5phb4oVFMB45417DkRevRbt/El/gKXJk4jid+vPFF/AXbxn05Aky8PapwzZrdJShv5C0avjw=="], - "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], - - "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], - - "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], - - "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], - "node-addon-api": ["node-addon-api@8.5.0", "", {}, "sha512-/bRZty2mXUIFY/xU5HLvveNHlswNJej+RnxBjOMkidWfwZzgTbPG1E3K5TOxRLOR+5hX7bSofy8yf1hZevMS8A=="], - "node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="], - - "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], - "node-fetch-native": ["node-fetch-native@1.6.7", "", {}, "sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q=="], "node-gyp-build": ["node-gyp-build@4.8.4", "", { "bin": { "node-gyp-build": "bin.js", "node-gyp-build-optional": "optional.js", "node-gyp-build-test": "build-test.js" } }, "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ=="], @@ -339,8 +321,6 @@ "ohash": ["ohash@2.0.11", "", {}, "sha512-RdR9FQrFwNBNXAr4GixM8YaRZRJ5PUWbKYbE5eOsrwAjJW0q2REGcf79oYPsLyskQCZG1PLN+S/K1V00joZAoQ=="], - "openai": ["openai@4.104.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA=="], - "oxc-minify": ["oxc-minify@0.93.0", "", { "optionalDependencies": { "@oxc-minify/binding-android-arm64": "0.93.0", "@oxc-minify/binding-darwin-arm64": "0.93.0", "@oxc-minify/binding-darwin-x64": "0.93.0", "@oxc-minify/binding-freebsd-x64": "0.93.0", "@oxc-minify/binding-linux-arm-gnueabihf": "0.93.0", "@oxc-minify/binding-linux-arm-musleabihf": "0.93.0", "@oxc-minify/binding-linux-arm64-gnu": "0.93.0", "@oxc-minify/binding-linux-arm64-musl": "0.93.0", "@oxc-minify/binding-linux-riscv64-gnu": "0.93.0", "@oxc-minify/binding-linux-s390x-gnu": "0.93.0", "@oxc-minify/binding-linux-x64-gnu": "0.93.0", "@oxc-minify/binding-linux-x64-musl": "0.93.0", "@oxc-minify/binding-wasm32-wasi": "0.93.0", "@oxc-minify/binding-win32-arm64-msvc": "0.93.0", "@oxc-minify/binding-win32-x64-msvc": "0.93.0" } }, "sha512-pwMjOGN/I+cfLVkSmECcVHROKwECNVAXCT5h/29S4f0aArIUh3CQnix1yYy7MTQ3yThNuGANjjE9jWJyT43Vbw=="], "oxc-resolver": ["oxc-resolver@11.16.0", "", { "optionalDependencies": { "@oxc-resolver/binding-android-arm-eabi": "11.16.0", "@oxc-resolver/binding-android-arm64": 
"11.16.0", "@oxc-resolver/binding-darwin-arm64": "11.16.0", "@oxc-resolver/binding-darwin-x64": "11.16.0", "@oxc-resolver/binding-freebsd-x64": "11.16.0", "@oxc-resolver/binding-linux-arm-gnueabihf": "11.16.0", "@oxc-resolver/binding-linux-arm-musleabihf": "11.16.0", "@oxc-resolver/binding-linux-arm64-gnu": "11.16.0", "@oxc-resolver/binding-linux-arm64-musl": "11.16.0", "@oxc-resolver/binding-linux-ppc64-gnu": "11.16.0", "@oxc-resolver/binding-linux-riscv64-gnu": "11.16.0", "@oxc-resolver/binding-linux-riscv64-musl": "11.16.0", "@oxc-resolver/binding-linux-s390x-gnu": "11.16.0", "@oxc-resolver/binding-linux-x64-gnu": "11.16.0", "@oxc-resolver/binding-linux-x64-musl": "11.16.0", "@oxc-resolver/binding-openharmony-arm64": "11.16.0", "@oxc-resolver/binding-wasm32-wasi": "11.16.0", "@oxc-resolver/binding-win32-arm64-msvc": "11.16.0", "@oxc-resolver/binding-win32-ia32-msvc": "11.16.0", "@oxc-resolver/binding-win32-x64-msvc": "11.16.0" } }, "sha512-I4sHGa1fZUpTQ9ftS0E0cBYbBjNnIKXRSX/trFMIJDIJ4n21dCrLAZhnJS0TSfRIRqZNFyceNZr2kablfgNyTA=="], @@ -375,8 +355,6 @@ "tinyglobby": ["tinyglobby@0.2.15", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.3" } }, "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ=="], - "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="], - "tree-kill": ["tree-kill@1.2.2", "", { "bin": { "tree-kill": "cli.js" } }, "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A=="], "tree-sitter-go": ["tree-sitter-go@0.25.0", "", { "dependencies": { "node-addon-api": "^8.3.1", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.25.0" }, "optionalPeers": ["tree-sitter"] }, "sha512-APBc/Dq3xz/e35Xpkhb1blu5UgW+2E3RyGWawZSCNcbGwa7jhSQPS8KsUupuzBla8PCo8+lz9W/JDJjmfRa2tw=="], @@ -391,6 +369,8 @@ "tree-sitter-typescript": ["tree-sitter-typescript@0.23.2", "", { "dependencies": { "node-addon-api": "^8.2.2", "node-gyp-build": "^4.8.2", "tree-sitter-javascript": "^0.23.1" }, "peerDependencies": { "tree-sitter": "^0.21.0" }, "optionalPeers": ["tree-sitter"] }, "sha512-e04JUUKxTT53/x3Uq1zIL45DoYKVfHH4CZqwgZhPg5qYROl5nQjV+85ruFzFGZxu+QeFVbRTPDRnqL9UbU4VeA=="], + "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="], + "ts-import-resolver": ["ts-import-resolver@0.1.23", "", { "peerDependencies": { "typescript": ">=4.5.0" }, "optionalPeers": ["typescript"] }, "sha512-282pgr6j6aOvP3P2I6XugDxdBobkpdMmdbWjRjGl5gjPI1p0+oTNGDh1t924t75kRlyIkF65DiwhSIUysmyHQA=="], "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], @@ -399,18 +379,16 @@ "undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="], - "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="], - "web-tree-sitter": ["web-tree-sitter@0.26.3", "", {}, "sha512-JIVgIKFS1w6lejxSntCtsS/QsE/ecTS00en809cMxMPxaor6MvUnQ+ovG8uTTTvQCFosSh4MeDdI5bSGw5SoBw=="], - "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], - - "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", 
"webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], - "yaml": ["yaml@2.8.2", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-mplynKqc1C2hTVYxd0PU2xQAc22TI1vShAYGksCCfxbn/dFwnHTNi1bvYsBTkhdUNtGIf5xNOg938rrSSYvS9A=="], "zlye": ["zlye@0.4.4", "", { "dependencies": { "picocolors": "^1.1.1" }, "peerDependencies": { "typescript": ">=4.5.0" }, "optionalPeers": ["typescript"] }, "sha512-fwpeC841X3ElOLYRMKXbwX29pitNrsm6nRNvEhDMrRXDl3BhR2i03Bkr0GNrpyYgZJuEzUsBylXAYzgGPXXOCQ=="], + "zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], + + "c12/dotenv": ["dotenv@17.2.3", "", {}, "sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w=="], + "tree-sitter-typescript/tree-sitter-javascript": ["tree-sitter-javascript@0.23.1", "", { "dependencies": { "node-addon-api": "^8.2.2", "node-gyp-build": "^4.8.2" }, "peerDependencies": { "tree-sitter": "^0.21.1" }, "optionalPeers": ["tree-sitter"] }, "sha512-/bnhbrTD9frUYHQTiYnPcxyHORIw157ERBa6dqzaKxvR/x3PC4Yzd+D1pZIMS6zNg2v3a8BZ0oK7jHqsQo9fWA=="], } } diff --git a/packages/eval/.gitignore b/packages/eval/.gitignore index 0f8888b..fab698e 100644 --- a/packages/eval/.gitignore +++ b/packages/eval/.gitignore @@ -1,3 +1,2 @@ -cache -results +runs data diff --git a/packages/eval/.python-version b/packages/eval/.python-version deleted file mode 100644 index 6324d40..0000000 --- a/packages/eval/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.14 diff --git a/packages/eval/README.md b/packages/eval/README.md index e69de29..3c28216 100644 --- a/packages/eval/README.md +++ b/packages/eval/README.md @@ -0,0 +1,116 @@ +# @supermemory/eval + +SWE-bench Lite retrieval-only evaluation harness comparing two Claude Agent SDK variants: + +- **Agent1 (ops-only)**: Read/Grep/Glob tools only +- **Agent2 (ops+search)**: Read/Grep/Glob + semantic search via `code-chunk` embeddings + +## Setup + +```bash +# From monorepo root +bun install +``` + +Required environment variables: + +```bash +ANTHROPIC_API_KEY=... # Claude API access +GOOGLE_API_KEY=... # Gemini embeddings (default) +# or +OPENAI_API_KEY=... 
# If using `--embedding-provider openai`
```

## Usage

```bash
cd packages/eval

# Full evaluation on test split
bun run src/run.ts

# Dev split, limited instances
bun run src/run.ts --split dev --max-instances 10

# Only Agent1 (ops-only)
bun run src/run.ts --skip-agent2

# Specific instance
bun run src/run.ts --instance django__django-12345

# Custom embedding dimensions (768/1536/3072)
bun run src/run.ts --embedding-dimensions 768
```

## Options

| Flag | Description | Default |
|------|-------------|---------|
| `--split <split>` | Dataset split | `test` |
| `--max-instances <n>` | Limit instances | all |
| `--max-turns <n>` | Max agent turns | 20 |
| `--max-tool-calls <n>` | Max tool calls | 50 |
| `--model <model>` | Claude model | `claude-sonnet-4-5` |
| `--skip-agent1` | Skip ops-only agent | false |
| `--skip-agent2` | Skip ops+search agent | false |
| `--instance <id>` | Run specific instance(s) | - |
| `--run-dir <dir>` | Output directory | `./runs` |
| `--embedding-provider` | `gemini` or `openai` | `gemini` |
| `--embedding-dimensions` | Gemini output dims | 1536 |

## Output

Runs output to `runs/<timestamp>/`:

```
runs/
└── 2025-01-01T12-00-00-000Z/
    ├── events/
    │   ├── django__django-12345_ops-only.jsonl
    │   └── django__django-12345_ops+search.jsonl
    ├── metrics.jsonl
    └── summary.json
```

## Metrics

- **Hit@k**: Whether oracle file appears in top-k predictions
- **MRR**: Mean Reciprocal Rank of first oracle file
- **Coverage@k**: Fraction of oracle files in top-k
- **Time-to-first-hit**: Turns/tool calls until first oracle file accessed
- **Embedding latency**: Index build + query times (Agent2 only)

## Architecture

```
src/
├── run.ts                # CLI entrypoint
└── swebench/
    ├── types.ts          # SWEbenchInstance, metrics types
    ├── dataset.ts        # HuggingFace dataset loader with caching
    ├── git.ts            # Bare clone + worktree management
    ├── score.ts          # Per-instance metric computation
    ├── aggregate.ts      # Cross-instance aggregation
    ├── run.ts            # Main evaluation loop
    ├── agent/
    │   ├── prompts.ts    # Retrieval-only system/user prompts
    │   ├── variants.ts   # Agent1/Agent2 tool configurations
    │   └── semantic_search_adapter.ts  # Gemini embeddings + MCP server
    └── observe/
        └── instrumentation.ts          # SDK hooks, event writer
```

## How it works

1. Loads SWE-bench Lite dataset (300 instances)
2. For each instance:
   - Creates git worktree at target commit
   - Runs Agent1 (ops-only) with Read/Grep/Glob
   - Builds semantic index using `code-chunk`
   - Runs Agent2 (ops+search) with additional semantic_search tool
   - Computes retrieval metrics against oracle files from patch
3. Aggregates metrics, prints summary, writes results

## Embedding cache

Semantic search indexes are cached at `~/.cache/swebench-eval/embeddings/` to avoid re-embedding repos. Cache key includes instance ID + embedding provider + dimensions.
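To make the scoring concrete, here is a minimal sketch of how the file-level retrieval metrics above can be computed. It is illustrative only and is not the harness's `score.ts`; the function names (`hitAtK`, `reciprocalRank`, `coverageAtK`, `mean`) are hypothetical, and it assumes `ranked` is the ordered list of files an agent surfaced and `oracle` is the set of files touched by the gold patch.

```typescript
// Illustrative sketch of the file-level retrieval metrics (not the harness's score.ts).
// Assumes `ranked` is an ordered list of repo-relative file paths and `oracle`
// is the set of files modified by the gold patch.

export function hitAtK(ranked: string[], oracle: Set<string>, k: number): number {
  // 1 if any oracle file appears in the top-k predictions, else 0
  return ranked.slice(0, k).some((file) => oracle.has(file)) ? 1 : 0
}

export function reciprocalRank(ranked: string[], oracle: Set<string>): number {
  // 1 / rank of the first oracle file, or 0 if none was retrieved
  const idx = ranked.findIndex((file) => oracle.has(file))
  return idx === -1 ? 0 : 1 / (idx + 1)
}

export function coverageAtK(ranked: string[], oracle: Set<string>, k: number): number {
  // Fraction of oracle files that appear in the top-k predictions
  if (oracle.size === 0) return 0
  const top = new Set(ranked.slice(0, k))
  let hits = 0
  for (const file of oracle) {
    if (top.has(file)) hits++
  }
  return hits / oracle.size
}

export function mean(values: number[]): number {
  // MRR and the @k metrics are averaged across instances with a plain mean
  return values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length
}
```

Time-to-first-hit and embedding latency, by contrast, are not derivable from a ranked list alone; they come from the per-tool-call event stream written under `events/*.jsonl`.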
diff --git a/packages/eval/package.json b/packages/eval/package.json index c54865c..f1c8394 100644 --- a/packages/eval/package.json +++ b/packages/eval/package.json @@ -2,17 +2,24 @@ "name": "@supermemory/eval", "version": "0.1.0", "private": true, - "description": "Evaluation harness for code-chunk", + "description": "SWE-bench Lite retrieval-only evaluation harness for code-chunk", "type": "module", "scripts": { "start": "bun run src/run.ts", + "eval": "bun run src/run.ts", + "eval:dev": "bun run src/run.ts --split dev", + "eval:quick": "bun run src/run.ts --max-instances 5", "type-check": "tsc --noEmit" }, "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.1.75", + "@anthropic-ai/sdk": "^0.71.2", "code-chunk": "workspace:*", - "openai": "^4.0.0" + "dotenv": "^16.4.0", + "zod": "^3.24.0" }, "devDependencies": { - "@types/bun": "^1.3.4" + "@types/bun": "^1.3.4", + "typescript": "^5.0.0" } } diff --git a/packages/eval/pyproject.toml b/packages/eval/pyproject.toml deleted file mode 100644 index e3232ad..0000000 --- a/packages/eval/pyproject.toml +++ /dev/null @@ -1,9 +0,0 @@ -[project] -name = "eval" -version = "0.1.0" -description = "Add your description here" -readme = "README.md" -requires-python = ">=3.14" -dependencies = [ - "chonkie[code]>=1.0.5", -] diff --git a/packages/eval/src/chunkers/ast.ts b/packages/eval/src/chunkers/ast.ts deleted file mode 100644 index 67727ce..0000000 --- a/packages/eval/src/chunkers/ast.ts +++ /dev/null @@ -1,40 +0,0 @@ -/** - * AST-aware chunker wrapper for evaluation - * - * Wraps the code-chunk library for use in the evaluation harness. - * Uses the built-in contextualizedText for better embedding quality. - */ - -import { chunk } from 'code-chunk' - -/** - * Chunk a file using AST-aware chunking and return results - * in a format compatible with the evaluation - * - * @param filepath - Path to the file - * @param code - Source code content - * @param maxNws - Maximum NWS characters per chunk (default: 1500) - */ -export async function chunkFile( - filepath: string, - code: string, - maxNws: number = 1500, -): Promise< - Array<{ - id: string - text: string - startLine: number - endLine: number - }> -> { - const chunks = await chunk(filepath, code, { - maxChunkSize: maxNws, - }) - - return chunks.map((c) => ({ - id: `${filepath}:${c.lineRange.start}-${c.lineRange.end}`, - text: c.contextualizedText, - startLine: c.lineRange.start, - endLine: c.lineRange.end, - })) -} diff --git a/packages/eval/src/chunkers/chonkie.ts b/packages/eval/src/chunkers/chonkie.ts deleted file mode 100644 index 3c0327f..0000000 --- a/packages/eval/src/chunkers/chonkie.ts +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Chonkie CodeChunker wrapper for evaluation - * - * Wraps the Chonkie Python library's CodeChunker for use in the evaluation harness. - * Calls the Python script via subprocess. 
- */ - -import { spawn } from 'node:child_process' -import { dirname, join } from 'node:path' - -// Go up from src/chunkers to package root -const PACKAGE_ROOT = join(dirname(import.meta.dir), '..') -const PYTHON_PATH = join(PACKAGE_ROOT, '.venv', 'bin', 'python') -const SCRIPT_PATH = join(import.meta.dir, 'chonkie_chunk.py') - -interface ChunkResult { - id: string - text: string - startLine: number - endLine: number -} - -/** - * Chunk a file using Chonkie's CodeChunker and return results - * in a format compatible with the evaluation - * - * @param filepath - Path to the file - * @param code - Source code content - * @param maxChunkSize - Maximum characters per chunk (default: 1500) - */ -export async function chunkFile( - filepath: string, - code: string, - maxChunkSize: number = 1500, -): Promise { - return new Promise((resolve, reject) => { - const proc = spawn( - PYTHON_PATH, - [SCRIPT_PATH, filepath, String(maxChunkSize)], - { - stdio: ['pipe', 'pipe', 'pipe'], - }, - ) - - let stdout = '' - let stderr = '' - - proc.stdout.on('data', (data) => { - stdout += data.toString() - }) - - proc.stderr.on('data', (data) => { - stderr += data.toString() - }) - - proc.on('close', (code) => { - if (code !== 0) { - reject(new Error(`Chonkie chunker failed: ${stderr}`)) - return - } - - try { - const result = JSON.parse(stdout) - if (result.error) { - reject(new Error(`Chonkie error: ${result.error}`)) - return - } - resolve(result) - } catch { - reject(new Error(`Failed to parse Chonkie output: ${stdout}`)) - } - }) - - proc.on('error', (err) => { - reject(err) - }) - - // Write code to stdin - proc.stdin.write(code) - proc.stdin.end() - }) -} diff --git a/packages/eval/src/chunkers/chonkie_chunk.py b/packages/eval/src/chunkers/chonkie_chunk.py deleted file mode 100644 index 24da1e3..0000000 --- a/packages/eval/src/chunkers/chonkie_chunk.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -""" -Chonkie CodeChunker wrapper for evaluation. - -Takes filepath, code, and max_chunk_size as arguments. -Outputs JSON array of chunks with id, text, startLine, endLine. -""" - -import json -import sys -from chonkie import CodeChunker - - -def count_nws(text: str) -> int: - """Count non-whitespace characters to match the evaluation's sizing.""" - return sum(1 for c in text if not c.isspace()) - - -def main(): - if len(sys.argv) < 3: - print("Usage: chonkie_chunk.py ", file=sys.stderr) - print("Code is read from stdin", file=sys.stderr) - sys.exit(1) - - filepath = sys.argv[1] - max_chunk_size = int(sys.argv[2]) - - # Read code from stdin to handle large files and special characters - code = sys.stdin.read() - - # Determine language from file extension - ext = filepath.rsplit(".", 1)[-1].lower() if "." 
in filepath else "" - lang_map = { - "py": "python", - "js": "javascript", - "ts": "typescript", - "tsx": "tsx", - "jsx": "javascript", - "rs": "rust", - "go": "go", - "java": "java", - "c": "c", - "cpp": "cpp", - "h": "c", - "hpp": "cpp", - "rb": "ruby", - "php": "php", - "cs": "c_sharp", - "swift": "swift", - "kt": "kotlin", - "scala": "scala", - } - - language = lang_map.get(ext, "python") # Default to python for .py files - - try: - # Initialize CodeChunker with character tokenizer to match NWS-based sizing - # Use a simple character-based token counter - chunker = CodeChunker( - tokenizer_or_token_counter=lambda x: len(x), # Character count - chunk_size=max_chunk_size, - language=language, - include_nodes=False, - ) - - chunks = chunker.chunk(code) - - # Convert to evaluation format - results = [] - lines = code.split("\n") - - for chunk in chunks: - # Find line numbers from start/end indices - start_line = code[:chunk.start_index].count("\n") - end_line = code[:chunk.end_index].count("\n") - - results.append({ - "id": f"{filepath}:{start_line}-{end_line}", - "text": chunk.text, - "startLine": start_line, - "endLine": end_line, - }) - - print(json.dumps(results)) - - except Exception as e: - print(json.dumps({"error": str(e)}), file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/packages/eval/src/chunkers/fixed.ts b/packages/eval/src/chunkers/fixed.ts deleted file mode 100644 index c685b7b..0000000 --- a/packages/eval/src/chunkers/fixed.ts +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Fixed-size chunker for evaluation baseline - * - * Simple line-based chunker that splits code into fixed-size chunks - * based on non-whitespace character count. Used as a baseline comparison - * for the AST-aware chunker. - */ - -/** - * Count non-whitespace characters in a string - */ -function countNws(text: string): number { - let count = 0 - for (let i = 0; i < text.length; i++) { - if (text.charCodeAt(i) > 32) count++ - } - return count -} - -/** - * Chunk a file using fixed-size chunking based on NWS character count - * - * @param filepath - Path to the file (used for chunk IDs) - * @param code - Source code content - * @param maxNws - Maximum NWS characters per chunk (default: 1500) - */ -export async function chunkFile( - filepath: string, - code: string, - maxNws: number = 1500, -): Promise< - Array<{ - id: string - text: string - startLine: number - endLine: number - }> -> { - const lines = code.split('\n') - const chunks: Array<{ - id: string - text: string - startLine: number - endLine: number - }> = [] - - let currentLines: string[] = [] - let currentNws = 0 - let startLine = 0 - - for (let i = 0; i < lines.length; i++) { - const line = lines[i] ?? 
'' - const lineNws = countNws(line) - - if (currentNws + lineNws > maxNws && currentLines.length > 0) { - // Flush current chunk - const text = currentLines.join('\n') - const endLine = startLine + currentLines.length - 1 - chunks.push({ - id: `${filepath}:${startLine}-${endLine}`, - text, - startLine, - endLine, - }) - - // Start new chunk - currentLines = [line] - currentNws = lineNws - startLine = i - } else { - currentLines.push(line) - currentNws += lineNws - } - } - - // Flush remaining lines - if (currentLines.length > 0) { - const text = currentLines.join('\n') - const endLine = startLine + currentLines.length - 1 - chunks.push({ - id: `${filepath}:${startLine}-${endLine}`, - text, - startLine, - endLine, - }) - } - - return chunks -} diff --git a/packages/eval/src/debug_chunks.ts b/packages/eval/src/debug_chunks.ts deleted file mode 100644 index 4afb888..0000000 --- a/packages/eval/src/debug_chunks.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { readFileSync } from 'node:fs' -import { join } from 'node:path' -import { chunk } from 'code-chunk' -import { chunkFile as chunkFixed } from './chunkers/fixed' - -// Check deepmind_tracr/tracr/craft/transformers.py -// Assume we're looking for lines 100-150 -const testFile = join( - import.meta.dir, - '../data/repoeval/repositories/function_level/deepmind_tracr/tracr/craft/transformers.py', -) -const code = readFileSync(testFile, 'utf-8') -const targetStart = 100 -const targetEnd = 150 - -console.log('File:', testFile) -console.log('Target lines:', targetStart, '-', targetEnd) -console.log('') - -function countNws(text: string): number { - let count = 0 - for (let i = 0; i < text.length; i++) { - if (text.charCodeAt(i) > 32) count++ - } - return count -} - -function overlaps( - chunkStart: number, - chunkEnd: number, - tStart: number, - tEnd: number, -): boolean { - return !(chunkEnd < tStart || chunkStart > tEnd) -} - -for (const maxSize of [1500, 1800]) { - console.log(`\n=== Max chunk size: ${maxSize} ===`) - - const astChunks = await chunk(testFile, code, { maxChunkSize: maxSize }) - const fixedChunks = await chunkFixed(testFile, code, maxSize) - - console.log('\nAST chunks:') - for (const c of astChunks) { - const overlap = overlaps( - c.lineRange.start, - c.lineRange.end, - targetStart, - targetEnd, - ) - console.log( - ` Lines ${c.lineRange.start}-${c.lineRange.end} (${countNws(c.text)} NWS) ${overlap ? '*** RELEVANT ***' : ''}`, - ) - } - - console.log('\nFixed chunks:') - for (const c of fixedChunks) { - const overlap = overlaps(c.startLine, c.endLine, targetStart, targetEnd) - console.log( - ` Lines ${c.startLine}-${c.endLine} (${countNws(c.text)} NWS) ${overlap ? '*** RELEVANT ***' : ''}`, - ) - } - - const astRelevant = astChunks.filter((c) => - overlaps(c.lineRange.start, c.lineRange.end, targetStart, targetEnd), - ) - const fixedRelevant = fixedChunks.filter((c) => - overlaps(c.startLine, c.endLine, targetStart, targetEnd), - ) - - console.log( - `\nRelevant chunks: AST=${astRelevant.length}, Fixed=${fixedRelevant.length}`, - ) - console.log( - `Total chunks: AST=${astChunks.length}, Fixed=${fixedChunks.length}`, - ) -} diff --git a/packages/eval/src/download.ts b/packages/eval/src/download.ts deleted file mode 100644 index 4e1bd61..0000000 --- a/packages/eval/src/download.ts +++ /dev/null @@ -1,149 +0,0 @@ -/** - * Download RepoEval benchmark data - * - * Downloads: - * 1. Task datasets (queries, ground truth) from Microsoft CodeT repo - * 2. 
Function-level Python repositories for chunking - */ - -import { existsSync } from 'node:fs' -import { mkdir, writeFile } from 'node:fs/promises' -import { join } from 'node:path' - -const DATA_DIR = join(import.meta.dir, '..', 'data', 'repoeval') -const DATASETS_DIR = join(DATA_DIR, 'datasets') -const REPOS_DIR = join(DATA_DIR, 'repositories', 'function_level') - -// Function-level repositories from RepoEval -const REPOS_FUNCTION = [ - 'amazon-science_patchcore-inspection', - 'deepmind_tracr', - 'facebookresearch_omnivore', - 'google_lightweight_mmm', - 'lucidrains_imagen-pytorch', - 'maxhumber_redframes', -] - -async function downloadAndExtractZip( - url: string, - destDir: string, -): Promise { - console.log(`Downloading from ${url}...`) - - const response = await fetch(url) - if (!response.ok) { - throw new Error(`Failed to download: ${response.statusText}`) - } - - const arrayBuffer = await response.arrayBuffer() - const tempZipPath = join(destDir, '_temp.zip') - - await mkdir(destDir, { recursive: true }) - await writeFile(tempZipPath, new Uint8Array(arrayBuffer)) - - // Use unzip command - const proc = Bun.spawn(['unzip', '-o', '-q', tempZipPath, '-d', destDir], { - cwd: destDir, - }) - await proc.exited - - // Clean up temp file - await Bun.spawn(['rm', tempZipPath]).exited - - console.log(`Extracted to ${destDir}`) -} - -async function downloadDatasets(): Promise { - if (existsSync(DATASETS_DIR)) { - console.log('Datasets already downloaded, skipping...') - return - } - - const datasetsUrl = - 'https://github.com/microsoft/CodeT/raw/main/RepoCoder/datasets/datasets.zip' - await downloadAndExtractZip(datasetsUrl, DATASETS_DIR) -} - -async function downloadRepositories(): Promise { - if (existsSync(REPOS_DIR)) { - console.log('Repositories already downloaded, skipping...') - return - } - - // Using the cleaned version from Veronicium's fork - const reposUrl = - 'https://github.com/Veronicium/repoeval_debug/raw/main/function_level.zip' - await downloadAndExtractZip(reposUrl, REPOS_DIR) -} - -export interface RepoEvalTask { - prompt: string - metadata: { - task_id: string - ground_truth: string - fpath_tuple: string[] - line_no: number - lineno: number - context_start_lineno: number - } -} - -export async function loadTasks( - contextLength: '1k' | '2k' | '4k' = '2k', -): Promise { - const fileName = `function_level_completion_${contextLength}_context_codex.test.jsonl` - const filePath = join(DATASETS_DIR, fileName) - - const content = await Bun.file(filePath).text() - const lines = content.trim().split('\n') - - const tasks: RepoEvalTask[] = [] - const repo2idx: Record = {} - - for (const line of lines) { - const task = JSON.parse(line) as RepoEvalTask - - // Clean up task_id format - const repo = task.metadata.task_id.replace('--', '_').split('/')[0] - if (!repo || !REPOS_FUNCTION.includes(repo)) continue - - if (!(repo in repo2idx)) { - repo2idx[repo] = 0 - } - - task.metadata.task_id = task.metadata.task_id - .replace('--', '_') - .replace('idx', String(repo2idx[repo] ?? 0)) - task.metadata.line_no = task.metadata.lineno - repo2idx[repo] = (repo2idx[repo] ?? 
0) + 1 - - tasks.push(task) - } - - return tasks -} - -export function getReposDir(): string { - return REPOS_DIR -} - -export function getRepos(): string[] { - return REPOS_FUNCTION -} - -export async function download(): Promise { - console.log('Downloading RepoEval benchmark data...\n') - - await mkdir(DATA_DIR, { recursive: true }) - - await downloadDatasets() - await downloadRepositories() - - console.log('\nDownload complete!') - console.log(`Data stored in: ${DATA_DIR}`) -} - -// Run if executed directly -if (import.meta.main) { - await download() -} diff --git a/packages/eval/src/embeddings.ts b/packages/eval/src/embeddings.ts deleted file mode 100644 index 0836f2a..0000000 --- a/packages/eval/src/embeddings.ts +++ /dev/null @@ -1,219 +0,0 @@ -/** - * OpenAI embeddings wrapper with disk caching - */ - -import { createHash } from 'node:crypto' -import { existsSync } from 'node:fs' -import { mkdir, readFile, writeFile } from 'node:fs/promises' -import { join } from 'node:path' -import OpenAI from 'openai' - -const CACHE_DIR = join(import.meta.dir, '..', 'cache', 'embeddings') -const MODEL = 'text-embedding-3-small' -const BATCH_SIZE = 100 - -let client: OpenAI | null = null - -function getClient(): OpenAI { - if (!client) { - client = new OpenAI() - } - return client -} - -/** - * Create a cache key from text content - */ -function cacheKey(text: string): string { - return createHash('sha256').update(text).digest('hex').slice(0, 16) -} - -/** - * Get cache file path for a text - */ -function cachePath(text: string): string { - const key = cacheKey(text) - // Use first 2 chars as subdirectory to avoid too many files in one dir - return join(CACHE_DIR, key.slice(0, 2), `${key}.json`) -} - -/** - * Try to load embedding from cache - */ -async function loadFromCache(text: string): Promise { - const path = cachePath(text) - if (!existsSync(path)) { - return null - } - try { - const data = await readFile(path, 'utf-8') - return JSON.parse(data) as number[] - } catch { - return null - } -} - -/** - * Save embedding to cache - */ -async function saveToCache(text: string, embedding: number[]): Promise { - const path = cachePath(text) - const dir = join(path, '..') - await mkdir(dir, { recursive: true }) - await writeFile(path, JSON.stringify(embedding)) -} - -/** - * Embed a batch of texts using OpenAI API - */ -async function embedBatch(texts: string[]): Promise { - const openai = getClient() - - // Filter out empty texts and track their indices - const nonEmptyTexts: string[] = [] - const indexMap: number[] = [] - - for (let i = 0; i < texts.length; i++) { - const text = texts[i]?.trim() ?? 
'' - if (text.length > 0) { - nonEmptyTexts.push(text) - indexMap.push(i) - } - } - - if (nonEmptyTexts.length === 0) { - // Return zero vectors for all empty inputs - return texts.map(() => new Array(1536).fill(0)) - } - - const response = await openai.embeddings.create({ - model: MODEL, - input: nonEmptyTexts, - }) - - // Sort by index to maintain order - const sorted = response.data.sort( - (a: { index: number }, b: { index: number }) => a.index - b.index, - ) - const embeddings = sorted.map((d: { embedding: number[] }) => d.embedding) - - // Map back to original indices, filling zeros for empty texts - const result: number[][] = texts.map(() => new Array(1536).fill(0)) - for (let i = 0; i < indexMap.length; i++) { - const idx = indexMap[i] - const emb = embeddings[i] - if (idx !== undefined && emb !== undefined) { - result[idx] = emb - } - } - - return result -} - -/** - * Embed texts with caching - * - * @param texts - Array of texts to embed - * @param onProgress - Optional callback for progress updates (done, total, cachedCount) - * @returns Array of embeddings (same order as input texts) - */ -export async function embedTexts( - texts: string[], - onProgress?: (done: number, total: number, cached: number) => void, -): Promise { - await mkdir(CACHE_DIR, { recursive: true }) - - const results: (number[] | null)[] = new Array(texts.length).fill(null) - const uncachedIndices: number[] = [] - const uncachedTexts: string[] = [] - - // Check cache for each text - for (let i = 0; i < texts.length; i++) { - const text = texts[i] - if (!text) continue - const cached = await loadFromCache(text) - if (cached) { - results[i] = cached - } else { - uncachedIndices.push(i) - uncachedTexts.push(text) - } - } - - const cachedCount = texts.length - uncachedTexts.length - - // Report initial state if all cached - if (onProgress && uncachedTexts.length === 0) { - onProgress(texts.length, texts.length, cachedCount) - } - - // Embed uncached texts in batches - for (let i = 0; i < uncachedTexts.length; i += BATCH_SIZE) { - const batch = uncachedTexts.slice(i, i + BATCH_SIZE) - const batchIndices = uncachedIndices.slice(i, i + BATCH_SIZE) - - const embeddings = await embedBatch(batch) - - // Save to cache and store results - for (let j = 0; j < embeddings.length; j++) { - const originalIdx = batchIndices[j] - const embedding = embeddings[j] - const text = batch[j] - if (originalIdx === undefined || !embedding || !text) continue - results[originalIdx] = embedding - await saveToCache(text, embedding) - } - - if (onProgress) { - onProgress( - cachedCount + Math.min(i + BATCH_SIZE, uncachedTexts.length), - texts.length, - cachedCount, - ) - } - } - - return results as number[][] -} - -/** - * Compute cosine similarity between two vectors - */ -export function cosineSimilarity(a: number[], b: number[]): number { - let dotProduct = 0 - let normA = 0 - let normB = 0 - - for (let i = 0; i < a.length; i++) { - const ai = a[i] ?? 0 - const bi = b[i] ?? 
0 - dotProduct += ai * bi - normA += ai * ai - normB += bi * bi - } - - return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) -} - -/** - * Find top-k most similar items - * - * @param queryEmbedding - The query embedding - * @param corpusEmbeddings - Array of corpus embeddings - * @param k - Number of top results to return - * @returns Array of { index, score } sorted by score descending - */ -export function topK( - queryEmbedding: number[], - corpusEmbeddings: number[][], - k: number, -): Array<{ index: number; score: number }> { - const scores = corpusEmbeddings.map((emb, idx) => ({ - index: idx, - score: cosineSimilarity(queryEmbedding, emb), - })) - - scores.sort((a, b) => b.score - a.score) - - return scores.slice(0, k) -} diff --git a/packages/eval/src/metrics.ts b/packages/eval/src/metrics.ts deleted file mode 100644 index a0aaac5..0000000 --- a/packages/eval/src/metrics.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Retrieval metrics for evaluation - * - * Computes precision, recall, and nDCG for retrieval evaluation. - */ - -/** - * Compute precision, recall, and nDCG for a single query - * - * @param retrievedIds - Ordered list of retrieved chunk IDs - * @param relevantSet - Set of relevant (ground truth) chunk IDs - * @param k - Number of results to consider - */ -export function computeMetrics( - retrievedIds: string[], - relevantSet: Set, - k: number, -): { precision: number; recall: number; ndcg: number } { - const topK = retrievedIds.slice(0, k) - - // Precision@k: fraction of retrieved that are relevant - const relevantInTopK = topK.filter((id) => relevantSet.has(id)).length - const precision = relevantInTopK / k - - // Recall@k: fraction of relevant that are retrieved - const recall = relevantSet.size > 0 ? relevantInTopK / relevantSet.size : 0 - - // nDCG@k: normalized discounted cumulative gain - const dcg = topK.reduce((sum, id, i) => { - const rel = relevantSet.has(id) ? 1 : 0 - return sum + rel / Math.log2(i + 2) // i+2 because log2(1) = 0 - }, 0) - - // Ideal DCG: all relevant docs at top - const idealK = Math.min(k, relevantSet.size) - const idcg = Array.from({ length: idealK }).reduce( - (sum, _, i) => sum + 1 / Math.log2(i + 2), - 0, - ) - - const ndcg = idcg > 0 ? dcg / idcg : 0 - - return { precision, recall, ndcg } -} - -/** - * Aggregate metrics across multiple queries - * - * @param metrics - Array of metric objects - */ -export function aggregateMetrics( - metrics: Array<{ precision: number; recall: number; ndcg: number }>, -): { precision: number; recall: number; ndcg: number } { - if (metrics.length === 0) { - return { precision: 0, recall: 0, ndcg: 0 } - } - - const sum = metrics.reduce( - (acc, m) => ({ - precision: acc.precision + m.precision, - recall: acc.recall + m.recall, - ndcg: acc.ndcg + m.ndcg, - }), - { precision: 0, recall: 0, ndcg: 0 }, - ) - - return { - precision: sum.precision / metrics.length, - recall: sum.recall / metrics.length, - ndcg: sum.ndcg / metrics.length, - } -} diff --git a/packages/eval/src/run.ts b/packages/eval/src/run.ts index db892d1..d9b6083 100644 --- a/packages/eval/src/run.ts +++ b/packages/eval/src/run.ts @@ -1,452 +1,177 @@ +#!/usr/bin/env bun /** - * RepoEval Retrieval Evaluation Runner + * SWE-bench Lite Retrieval-Only Evaluation Harness * - * Compares AST-aware chunking vs fixed-size chunking on code retrieval. 
+ * CLI entrypoint for running retrieval-only evaluation comparing: + * - Agent1 (ops-only): Read/Grep/Glob + * - Agent2 (ops+search): Read/Grep/Glob + semantic search * * Usage: - * bun eval/run.ts - */ - -import { readdirSync, statSync } from 'node:fs' -import { mkdir, readFile, writeFile } from 'node:fs/promises' -import { join } from 'node:path' -import { chunkFile as chunkWithAST } from './chunkers/ast' -import { chunkFile as chunkWithChonkie } from './chunkers/chonkie' -import { chunkFile as chunkWithFixed } from './chunkers/fixed' -import { - download, - getRepos, - getReposDir, - loadTasks, - type RepoEvalTask, -} from './download' -import { embedTexts, topK } from './embeddings' -import { aggregateMetrics, computeMetrics } from './metrics' - -const RESULTS_DIR = join(import.meta.dir, '..', 'results') -const K_VALUES = [5, 10] // Top-k values for retrieval -const MAX_CHUNK_SIZE = 1500 // NWS characters per chunk - -// Colors for terminal output -const dim = (s: string) => `\x1b[2m${s}\x1b[0m` -const bold = (s: string) => `\x1b[1m${s}\x1b[0m` -const green = (s: string) => `\x1b[32m${s}\x1b[0m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[0m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[0m` - -// Status line helper - overwrites current line -function status(msg: string) { - process.stdout.write(`\r\x1b[K${dim(msg)}`) -} - -function clearStatus() { - process.stdout.write('\r\x1b[K') -} - -interface ChunkInfo { - id: string - text: string - startLine: number - endLine: number - filepath: string -} - -interface MetricsAtK { - precision: number - recall: number - ndcg: number -} - -interface QueryResult { - taskId: string - prompt: string - groundTruthLines: { start: number; end: number } - groundTruthFile: string - retrievedChunks: Array<{ id: string; score: number; rank: number }> - relevantChunkIds: string[] - metrics: Record // metrics per k value -} - -type ChunkerType = 'ast' | 'chonkie' | 'fixed' - -interface EvalResult { - chunker: ChunkerType - repo: string - summary: Record // summary per k value - queryResults: QueryResult[] - config: { kValues: number[]; maxChunkSize: number } - timestamp: string -} - -/** - * Recursively find all Python files in a directory + * bun run src/run.ts [options] + * + * Options: + * --split Dataset split (default: test) + * --max-instances Limit number of instances to process + * --max-turns Max agent turns per instance (default: 20) + * --max-tool-calls Max tool calls per agent (default: 50) + * --model Claude model to use (default: claude-sonnet-4-5) + * --skip-agent1 Skip Agent1 (ops-only) + * --skip-agent2 Skip Agent2 (ops+search) + * --instance Run specific instance(s), can be repeated + * --run-dir Output directory for runs (default: ./runs) + * --embedding-provider Embedding provider (default: gemini) + * --embedding-dimensions Output dimensions for Gemini (768/1536/3072) + * + * Environment Variables: + * ANTHROPIC_API_KEY Required for Claude API access + * GOOGLE_API_KEY Required for Gemini embeddings (default provider) + * OPENAI_API_KEY Required for OpenAI embeddings (if --embedding-provider openai) + * + * Examples: + * # Run full evaluation on test split + * bun run src/run.ts + * + * # Run on dev split with max 10 instances + * bun run src/run.ts --split dev --max-instances 10 + * + * # Run only Agent1 for debugging + * bun run src/run.ts --skip-agent2 --max-instances 5 + * + * # Run specific instance + * bun run src/run.ts --instance django__django-12345 */ -function findPythonFiles(dir: string): string[] { - const files: 
string[] = [] - function walk(currentDir: string) { - const entries = readdirSync(currentDir) - for (const entry of entries) { - const fullPath = join(currentDir, entry) - const stat = statSync(fullPath) - if (stat.isDirectory()) { - walk(fullPath) - } else if (entry.endsWith('.py')) { - files.push(fullPath) - } +import dotenv from 'dotenv' +import { type RunConfig, runEvaluation } from './swebench/run' + +// Load environment variables +dotenv.config() + +// Parse command line arguments +function parseArgs(): RunConfig { + const args = process.argv.slice(2) + const config: RunConfig = {} + const instanceIds: string[] = [] + + for (let i = 0; i < args.length; i++) { + const arg = args[i] + const next = args[i + 1] + + switch (arg) { + case '--split': + if (next === 'dev' || next === 'test') { + config.split = next + i++ + } + break + case '--max-instances': + if (next) config.maxInstances = parseInt(next, 10) + i++ + break + case '--max-turns': + if (next) config.maxTurns = parseInt(next, 10) + i++ + break + case '--max-tool-calls': + if (next) config.maxToolCalls = parseInt(next, 10) + i++ + break + case '--model': + if (next) config.model = next + i++ + break + case '--skip-agent1': + config.skipAgent1 = true + break + case '--skip-agent2': + config.skipAgent2 = true + break + case '--instance': + if (next) instanceIds.push(next) + i++ + break + case '--run-dir': + if (next) config.runDir = next + i++ + break + case '--embedding-provider': + if (next === 'gemini' || next === 'openai') { + config.embeddingProvider = next + i++ + } + break + case '--embedding-dimensions': + config.embeddingDimensions = parseInt(next!, 10) + i++ + break + case '--help': + case '-h': + console.log(` +SWE-bench Lite Retrieval-Only Evaluation Harness + +Usage: + bun run src/run.ts [options] + +Options: + --split Dataset split (default: test) + --max-instances Limit number of instances to process + --max-turns Max agent turns per instance (default: 20) + --max-tool-calls Max tool calls per agent (default: 50) + --model Claude model to use (default: claude-sonnet-4-5) + --skip-agent1 Skip Agent1 (ops-only) + --skip-agent2 Skip Agent2 (ops+search) + --instance Run specific instance(s), can be repeated + --run-dir Output directory for runs (default: ./runs) + --embedding-provider Embedding provider (default: gemini) + --embedding-dimensions Output dimensions for Gemini (768/1536/3072) + --help, -h Show this help message + +Environment Variables: + ANTHROPIC_API_KEY Required for Claude API access + GOOGLE_API_KEY Required for Gemini embeddings (default provider) + OPENAI_API_KEY Required for OpenAI embeddings (if --embedding-provider openai) +`) + process.exit(0) } } - walk(dir) - return files -} - -/** - * Check if a chunk overlaps with a line range - */ -function chunksOverlap( - chunk: { startLine: number; endLine: number }, - target: { start: number; end: number }, -): boolean { - return !(chunk.endLine < target.start || chunk.startLine > target.end) -} - -interface EmbedStats { - cached: number - total: number -} - -/** - * Run evaluation for a single repository and chunker - */ -async function evaluateRepo( - repo: string, - tasks: RepoEvalTask[], - chunkerType: ChunkerType, -): Promise<{ result: EvalResult; chunkCount: number; embedStats: EmbedStats }> { - const repoDir = join(getReposDir(), repo) - const pyFiles = findPythonFiles(repoDir) - - // Step 1: Chunk all files - status(`[${chunkerType}] chunking ${pyFiles.length} files...`) - const allChunks: ChunkInfo[] = [] - - for (const filepath of pyFiles) { - 
const code = await readFile(filepath, 'utf-8') - const relPath = filepath.replace(`${repoDir}/`, '') - - try { - let chunks: Awaited> - switch (chunkerType) { - case 'ast': - chunks = await chunkWithAST(filepath, code, MAX_CHUNK_SIZE) - break - case 'chonkie': - chunks = await chunkWithChonkie(filepath, code, MAX_CHUNK_SIZE) - break - case 'fixed': - chunks = await chunkWithFixed(filepath, code, MAX_CHUNK_SIZE) - break - } - - for (const chunk of chunks) { - allChunks.push({ - ...chunk, - filepath: relPath, - }) - } - } catch { - // Skip files that fail to parse - } + if (instanceIds.length > 0) { + config.instanceIds = instanceIds } - // Step 2: Embed all chunks - status(`[${chunkerType}] embedding ${allChunks.length} chunks...`) - const chunkTexts = allChunks.map((c) => c.text) - let embedStats: EmbedStats = { cached: 0, total: chunkTexts.length } - const chunkEmbeddings = await embedTexts( - chunkTexts, - (done, total, cached) => { - embedStats = { cached, total } - status( - `[${chunkerType}] embedding chunks ${done}/${total} (${cached} cached)`, - ) - }, - ) - - // Step 3: Embed queries - status(`[${chunkerType}] embedding ${tasks.length} queries...`) - const queryTexts = tasks.map((t) => t.prompt) - const queryEmbeddings = await embedTexts(queryTexts) - - // Step 4: For each query, retrieve top-k and compute metrics - status(`[${chunkerType}] computing metrics...`) - const queryResults: QueryResult[] = [] - const maxK = Math.max(...K_VALUES) - - for (let i = 0; i < tasks.length; i++) { - const task = tasks[i] - const queryEmb = queryEmbeddings[i] - if (!task || !queryEmb) continue - - const topKResults = topK(queryEmb, chunkEmbeddings, maxK) - - const targetFile = task.metadata.fpath_tuple.slice(1).join('/') - const targetLines = { - start: task.metadata.context_start_lineno, - end: task.metadata.line_no, - } - - const relevantChunkIds = allChunks - .filter((c) => c.filepath === targetFile && chunksOverlap(c, targetLines)) - .map((c) => c.id) - - const relevantSet = new Set(relevantChunkIds) - const retrievedIds = topKResults.map((r) => allChunks[r.index]?.id ?? '') - - const metrics: Record = {} - for (const k of K_VALUES) { - metrics[k] = computeMetrics(retrievedIds, relevantSet, k) - } - - queryResults.push({ - taskId: task.metadata.task_id, - prompt: `${task.prompt.slice(0, 200)}...`, - groundTruthLines: targetLines, - groundTruthFile: targetFile, - retrievedChunks: topKResults.map((r, rank) => ({ - id: allChunks[r.index]?.id ?? 
'', - score: r.score, - rank: rank + 1, - })), - relevantChunkIds, - metrics, - }) - } - - const summary: Record = {} - for (const k of K_VALUES) { - const metricsAtK = queryResults - .map((q) => q.metrics[k]) - .filter((m): m is MetricsAtK => m !== undefined) - summary[k] = aggregateMetrics(metricsAtK) - } - - clearStatus() - - return { - result: { - chunker: chunkerType, - repo, - summary, - queryResults, - config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE }, - timestamp: new Date().toISOString(), - }, - chunkCount: allChunks.length, - embedStats, - } -} - -/** - * Print metrics table for all k values - */ -function printMetricsTable( - summaries: Record>, -): void { - const chunkerNames = Object.keys(summaries) - - for (const k of K_VALUES) { - console.log(dim(`@${k}`)) - console.log( - `${dim('Chunker'.padEnd(12))} ${dim('nDCG'.padStart(8))} ${dim('Prec'.padStart(8))} ${dim('Recall'.padStart(8))}`, - ) - - for (const name of chunkerNames) { - const m = summaries[name]?.[k] - if (!m) continue - const ndcg = yellow(`${(m.ndcg * 100).toFixed(1)}%`.padStart(8)) - const prec = `${(m.precision * 100).toFixed(1)}%`.padStart(8) - const recall = `${(m.recall * 100).toFixed(1)}%`.padStart(8) - console.log(`${cyan(name.padEnd(12))} ${ndcg} ${prec} ${recall}`) - } - console.log('') - } + return config } +// Main async function main() { - console.log(bold('\nRepoEval Retrieval Evaluation\n')) + // Parse args first (handles --help early exit) + const config = parseArgs() - // Step 1: Download data if needed - await download() - - // Step 2: Load tasks - status('loading tasks...') - const allTasks = await loadTasks('2k') - clearStatus() - - // Group tasks by repo - const tasksByRepo = new Map() - for (const task of allTasks) { - const repo = task.metadata.task_id.split('/')[0] - if (!repo) continue - if (!tasksByRepo.has(repo)) { - tasksByRepo.set(repo, []) - } - const repoTasks = tasksByRepo.get(repo) - if (repoTasks) { - repoTasks.push(task) - } + // Check required env vars + if (!process.env.ANTHROPIC_API_KEY) { + console.error('Error: ANTHROPIC_API_KEY environment variable is required') + process.exit(1) } - // Step 3: Run evaluation for each repo and chunker - await mkdir(RESULTS_DIR, { recursive: true }) - - const allResults: EvalResult[] = [] - const repos = getRepos() - const chunkerTypes: ChunkerType[] = ['ast', 'chonkie', 'fixed'] - - // Display names for chunkers - const chunkerNames: Record = { - ast: 'AST', - chonkie: 'Chonkie', - fixed: 'Fixed', - } - - for (let repoIdx = 0; repoIdx < repos.length; repoIdx++) { - const repo = repos[repoIdx] - if (!repo) continue - const tasks = tasksByRepo.get(repo) - if (!tasks || tasks.length === 0) { - continue - } - - console.log( - `${dim(`[${repoIdx + 1}/${repos.length}]`)} ${bold(repo)} ${dim(`(${tasks.length} tasks)`)}`, + // Check embedding provider env var + const provider = config.embeddingProvider || 'gemini' + if (provider === 'gemini' && !process.env.GOOGLE_API_KEY) { + console.warn( + 'Warning: GOOGLE_API_KEY not set. Agent2 (semantic search) will be skipped.', + ) + } else if (provider === 'openai' && !process.env.OPENAI_API_KEY) { + console.warn( + 'Warning: OPENAI_API_KEY not set. 
Agent2 (semantic search) will be skipped.', ) - - const repoResults: Record< - string, - { result: EvalResult; chunkCount: number; embedStats: EmbedStats } - > = {} - - for (const chunkerType of chunkerTypes) { - const evalResult = await evaluateRepo(repo, tasks, chunkerType) - repoResults[chunkerType] = evalResult - allResults.push(evalResult.result) - } - - // Print summary line for this repo - const summaryParts = chunkerTypes.map((ct) => { - const r = repoResults[ct] - if (!r) return '' - const { chunkCount, embedStats } = r - const cachedPct = - embedStats.total > 0 - ? Math.round((embedStats.cached / embedStats.total) * 100) - : 0 - return `${cyan(chunkerNames[ct])}: ${chunkCount} ${dim(`(${cachedPct}%)`)}` - }) - console.log(` ${summaryParts.join(' ')}`) - - // Print quick metrics comparison - const k = K_VALUES[0] - if (k !== undefined) { - const metricsLine = chunkerTypes.map((ct) => { - const r = repoResults[ct] - if (!r) return '' - const ndcg = (r.result.summary[k]?.ndcg ?? 0) * 100 - return `${chunkerNames[ct]}: ${yellow(ndcg.toFixed(1))}%` - }) - console.log(` ${dim(`nDCG@${k}:`)} ${metricsLine.join(' ')}\n`) - } - } - - // Step 4: Compute overall summary - console.log(bold('Results')) - console.log(dim('─'.repeat(60))) - - // Aggregate results by chunker type - const overallByChunker: Record> = {} - for (const ct of chunkerTypes) { - const results = allResults.filter((r) => r.chunker === ct) - const name = chunkerNames[ct] - overallByChunker[name] = {} - for (const k of K_VALUES) { - const metricsAtK = results - .map((r) => r.summary[k]) - .filter((m): m is MetricsAtK => m !== undefined) - const chunkerMetrics = overallByChunker[name] - if (chunkerMetrics) { - chunkerMetrics[k] = aggregateMetrics(metricsAtK) - } - } - } - - printMetricsTable(overallByChunker) - - // Compute improvements vs Fixed baseline - const fixedOverall = overallByChunker[chunkerNames.fixed] - const computeImprovement = (a: number, b: number): string => { - if (b === 0) return 'N/A' - const improvement = ((a - b) / b) * 100 - const sign = improvement >= 0 ? '+' : '' - return improvement >= 0 - ? green(`${sign}${improvement.toFixed(1)}%`) - : `${sign}${improvement.toFixed(1)}%` } + console.log('Starting SWE-bench Lite retrieval-only evaluation...\n') - console.log(dim('vs Fixed baseline:')) - for (const k of K_VALUES) { - const parts = chunkerTypes - .filter((ct) => ct !== 'fixed') - .map((ct) => { - const overall = overallByChunker[chunkerNames[ct]] - const fixedNdcg = fixedOverall?.[k]?.ndcg ?? 0 - const overallNdcg = overall?.[k]?.ndcg ?? 
0 - return `${cyan(chunkerNames[ct])} ${computeImprovement(overallNdcg, fixedNdcg)}` - }) - console.log(` k=${k}: ${parts.join(' ')}`) + try { + await runEvaluation(config) + } catch (err) { + console.error('Evaluation failed:', err) + process.exit(1) } - - // Step 5: Save results - const timestamp = new Date().toISOString().replace(/[:.]/g, '-') - - // Save summary - const summaryPath = join(RESULTS_DIR, `summary_${timestamp}.json`) - await writeFile( - summaryPath, - JSON.stringify( - { - overall: overallByChunker, - perRepo: Object.fromEntries( - repos.map((repo) => [ - repo, - Object.fromEntries( - chunkerTypes.map((ct) => [ - ct, - allResults.find((r) => r.repo === repo && r.chunker === ct) - ?.summary, - ]), - ), - ]), - ), - config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE }, - timestamp: new Date().toISOString(), - }, - null, - 2, - ), - ) - - // Save detailed results - const detailedPath = join(RESULTS_DIR, `detailed_${timestamp}.json`) - await writeFile(detailedPath, JSON.stringify(allResults, null, 2)) - - console.log(`\n${dim(`Saved to ${summaryPath}`)}`) } -// Run if executed directly -if (import.meta.main) { - main().catch(console.error) -} +main() diff --git a/packages/eval/src/swebench/agent/prompts.ts b/packages/eval/src/swebench/agent/prompts.ts new file mode 100644 index 0000000..92a2eb1 --- /dev/null +++ b/packages/eval/src/swebench/agent/prompts.ts @@ -0,0 +1,163 @@ +/** + * Shared prompt templates for retrieval-only agents + */ + +/** + * Base system prompt for retrieval-only evaluation + * Instructs the agent to locate files without making changes + */ +const BASE_SYSTEM_PROMPT = `You are a skilled software engineer helping to locate the source files that need to be modified to address a bug or feature request. + +## Your Task + +Given a problem statement describing a bug or feature request, your goal is to identify the **most relevant source files** in the repository that would need to be modified to address the issue. + +## Important Constraints + +1. **DO NOT** propose any code changes, patches, or fixes +2. **DO NOT** write or edit any files +3. **DO NOT** run any commands that modify the repository +4. **ONLY** read files and search the codebase to locate relevant files + +## Working Directory + +You are working in a repository checkout. All file paths should be relative to the repository root. +- Use patterns like \`src/**/*.py\` or \`**/rules/*.py\` for Glob +- Use paths like \`src/\` or \`.\` for Grep path parameter +- All file paths in your output should be relative (e.g., \`src/module/file.py\`) + +## Available Tools + +You have access to these read-only tools: +- **Read**: Read file contents +- **Grep**: Search for patterns in files +- **Glob**: Find files matching a pattern +- **LS**: List directory contents + +**NOTE**: Bash, shell commands, and terminal access are NOT available. + +## Approach + +Understand the problem statement first - look for key terms, error messages, function names, or class names mentioned. + +Use the tools to navigate and search the codebase: +- **Glob** is useful for finding files by name patterns +- **Grep** is useful for searching file contents by keyword +- **Read** lets you examine file contents in detail +- **LS** shows directory structure + +Aim to identify 3-10 files that would need modification. 
+ +## Output Format + +When you have identified the relevant files, output your final answer as a JSON object: + +\`\`\`json +{ + "top_files": [ + "path/to/most/relevant/file.py", + "path/to/second/relevant/file.py" + ], + "reason": "Brief explanation of why these files are relevant" +} +\`\`\` + +List files in order of relevance (most relevant first). Aim for 3-10 files.` + +/** + * System prompt for Agent1 (ops-only) + */ +export const RETRIEVAL_ONLY_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + +/** + * System prompt for Agent2 (ops + semantic search) + * Includes information about the semantic search tool + */ +export const RETRIEVAL_WITH_SEARCH_SYSTEM_PROMPT = `${BASE_SYSTEM_PROMPT} + +## Semantic Code Search (Your Primary Tool) + +You have **mcp__semantic_search__search** - use this to quickly find relevant code: + +\`\`\` +mcp__semantic_search__search({"query": "description of what you're looking for", "top_k": 10}) +\`\`\` + +The codebase is pre-indexed. One semantic search call typically finds relevant files faster than multiple Glob/Grep calls. + +**Important**: Semantic search returns absolute file paths that you can use directly with Read. After identifying candidate files, use Read with the exact paths from the results.` + +/** + * Create the user prompt with just the problem statement + * @param problemStatement - The SWE-bench problem statement + * @param repo - Optional repo name for context + * @param hasSemanticSearch - Whether Agent2's semantic search is available + */ +export function createUserPrompt( + problemStatement: string, + repo?: string, + hasSemanticSearch = false, +): string { + const repoInfo = repo + ? `\n\nYou are working in the **${repo}** repository.\n` + : '' + + const searchGuidance = hasSemanticSearch + ? `**Recommended approach**: Start with semantic search to quickly find relevant code areas. Then use Read to examine specific files.` + : `Start by exploring the repository structure (use LS or Glob) to understand the codebase layout. Then search for relevant code using the available tools.` + + return `## Problem Statement + +${problemStatement} +${repoInfo} +--- + +Please analyze the problem and identify the source files that would need to be modified to address this issue. + +${searchGuidance} + +Remember to output your final answer as JSON with "top_files" and "reason" fields. 
Use relative paths (e.g., \`src/module/file.py\`).` +} + +/** + * Parse the agent's final output to extract top_files + * Handles various output formats the agent might use + */ +export function parseTopFiles(output: string): string[] { + // Try to find JSON in the output + const jsonMatch = output.match(/```json\s*([\s\S]*?)\s*```/) + if (jsonMatch?.[1]) { + try { + const parsed = JSON.parse(jsonMatch[1]) + if (Array.isArray(parsed.top_files)) { + return parsed.top_files + } + } catch { + // Continue to fallback + } + } + + // Try to parse raw JSON + const rawJsonMatch = output.match(/\{[\s\S]*"top_files"[\s\S]*\}/) + if (rawJsonMatch) { + try { + const parsed = JSON.parse(rawJsonMatch[0]) + if (Array.isArray(parsed.top_files)) { + return parsed.top_files + } + } catch { + // Continue to fallback + } + } + + // Fallback: extract file paths from the text + // Match common file path patterns (e.g., path/to/file.py) + const pathMatches = output.match(/[\w\-./]+\.[a-z]+/gi) || [] + const uniquePaths = [...new Set(pathMatches)].filter( + (p) => + // Filter out common non-file patterns + !p.startsWith('http') && !p.includes('...') && p.includes('/'), + ) + + return uniquePaths.slice(0, 10) +} diff --git a/packages/eval/src/swebench/agent/semantic_search_adapter.ts b/packages/eval/src/swebench/agent/semantic_search_adapter.ts new file mode 100644 index 0000000..1e2163e --- /dev/null +++ b/packages/eval/src/swebench/agent/semantic_search_adapter.ts @@ -0,0 +1,600 @@ +/** + * Semantic Search Adapter: Interface for the underlying index implementation + * This provides a pluggable boundary to integrate with code-chunk or other indexers + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { join } from 'node:path' +import { createSdkMcpServer, tool } from '@anthropic-ai/claude-agent-sdk' +import { chunk as codeChunk } from 'code-chunk' +import { z } from 'zod' +import type { SemanticSearchResult } from '../types' + +/** + * Task type for embedding generation (provider-specific optimization) + * - RETRIEVAL_DOCUMENT: For indexing documents/code chunks + * - CODE_RETRIEVAL_QUERY: For search queries optimized for code retrieval + * - RETRIEVAL_QUERY: For general search queries + */ +export type EmbeddingTaskType = + | 'RETRIEVAL_DOCUMENT' + | 'CODE_RETRIEVAL_QUERY' + | 'RETRIEVAL_QUERY' + +/** + * Embedding service interface + */ +export interface EmbeddingService { + embed( + texts: string[], + taskType?: EmbeddingTaskType, + ): Promise<{ embeddings: number[][]; tokens: number }> +} + +/** Sleep helper for retry backoff */ +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) + +/** Check if error is retryable (network issues) */ +function isRetryableError(err: unknown): boolean { + const code = (err as { code?: string })?.code + const msg = err instanceof Error ? err.message : '' + return ( + [ + 'ECONNRESET', + 'ETIMEDOUT', + 'ECONNREFUSED', + 'ENOTFOUND', + 'EAI_AGAIN', + ].includes(code ?? '') || + /socket|network|connection/i.test(msg) || + err instanceof TypeError + ) +} + +/** Normalize vector to unit length */ +function normalizeVector(vector: number[]): number[] { + const norm = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0)) + return norm === 0 ? 
vector : vector.map((v) => v / norm) +} + +/** + * Gemini embedding service with retry logic + */ +export class GeminiEmbeddingService implements EmbeddingService { + private apiKey: string + private model: string + private maxRetries: number + private dims: number + private baseUrl = 'https://generativelanguage.googleapis.com/v1beta/models' + + constructor( + apiKey: string, + model = 'gemini-embedding-001', + maxRetries = 5, + dims = 768, + ) { + this.apiKey = apiKey + this.model = model + this.maxRetries = maxRetries + this.dims = dims + } + + async embed( + texts: string[], + taskType: EmbeddingTaskType = 'RETRIEVAL_DOCUMENT', + ): Promise<{ embeddings: number[][]; tokens: number }> { + const isSingle = texts.length === 1 + const endpoint = isSingle ? 'embedContent' : 'batchEmbedContents' + const url = `${this.baseUrl}/${this.model}:${endpoint}?key=${this.apiKey}` + + const body = isSingle + ? { + model: `models/${this.model}`, + content: { parts: [{ text: texts[0] }] }, + taskType, + outputDimensionality: this.dims, + } + : { + requests: texts.map((text) => ({ + model: `models/${this.model}`, + content: { parts: [{ text }] }, + taskType, + outputDimensionality: this.dims, + })), + } + + let lastError: Error | null = null + + for (let attempt = 0; attempt < this.maxRetries; attempt++) { + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }) + + if (!response.ok) { + const { status, statusText } = response + const respBody = await response.text() + + if (status === 429 || status >= 500) { + const waitMs = 2 ** attempt * 1000 + Math.random() * 1000 + console.warn( + `[embedding] Gemini ${status} ${statusText}, retry in ${Math.round(waitMs)}ms (${attempt + 1}/${this.maxRetries})...`, + ) + await sleep(waitMs) + lastError = new Error(`Gemini ${status} ${statusText}: ${respBody}`) + continue + } + throw new Error(`Gemini ${status} ${statusText}: ${respBody}`) + } + + const data = await response.json() + let embeddings: number[][] = isSingle + ? [data.embedding?.values || []] + : data.embeddings?.map((e: { values: number[] }) => e.values) || [] + + // Normalize if not using native 3072 dimensions + if (this.dims !== 3072) { + embeddings = embeddings.map(normalizeVector) + } + + const tokens = texts.reduce( + (sum, t) => sum + Math.ceil(t.length / 4), + 0, + ) + return { embeddings, tokens } + } catch (err) { + if (isRetryableError(err) && attempt < this.maxRetries - 1) { + const waitMs = 2 ** attempt * 1000 + Math.random() * 1000 + console.warn( + `[embedding] Network error, retry in ${Math.round(waitMs)}ms (${attempt + 1}/${this.maxRetries})...`, + ) + await sleep(waitMs) + lastError = err instanceof Error ? err : new Error(String(err)) + continue + } + throw err + } + } + + throw lastError || new Error('Gemini embedding failed after retries') + } +} + +/** + * Indexed chunk with embedding + */ +interface IndexedChunk { + filepath: string + startLine: number + endLine: number + text: string + contextualizedText: string + embedding: number[] +} + +/** + * Cosine similarity between two vectors + */ +function cosineSimilarity(a: number[], b: number[]): number { + let dotProduct = 0 + let normA = 0 + let normB = 0 + + for (let i = 0; i < a.length; i++) { + dotProduct += a[i]! * b[i]! + normA += a[i]! * a[i]! + normB += b[i]! * b[i]! 
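A minimal usage sketch of the `GeminiEmbeddingService` defined here; the API key comes from the `GOOGLE_API_KEY` env var the runner checks, and the query text and logged values are placeholders.

```ts
// Hedged sketch: construct the Gemini embedding service and embed one query.
// Model name, retry count, and dimensions mirror the constructor defaults above.
const embedService = new GeminiEmbeddingService(
  process.env.GOOGLE_API_KEY ?? '', // runner warns and skips Agent2 if unset
  'gemini-embedding-001',
  5,   // maxRetries with exponential backoff + jitter
  768, // output dimensions; vectors are normalized unless 3072 is requested
)

const { embeddings, tokens } = await embedService.embed(
  ['where is the HTTP retry logic implemented?'],
  'CODE_RETRIEVAL_QUERY',
)
console.log(embeddings[0]?.length, tokens) // 768 dims; tokens are a chars/4 estimate
```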
+ } + + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) +} + +/** + * Cache metadata for serialized index + */ +interface IndexCacheMetadata { + version: number + embeddingProvider: string + embeddingDimensions: number + chunkSettings: { + maxChunkSize: number + overlapLines: number + } + createdAt: string + chunkCount: number + totalEmbedTokens: number + totalEmbedLatencyMs: number +} + +/** + * Semantic Search Index: indexes a repository and supports similarity search + * Supports caching to avoid re-embedding the same repo + */ +export class SemanticSearchIndex { + private chunks: IndexedChunk[] = [] + private embedService: EmbeddingService + private worktreePath: string + + // Metrics for fairness accounting + public indexLoadMs = 0 + public totalEmbedTokens = 0 + public totalEmbedLatencyMs = 0 + public lastQueryEmbedTokens = 0 + public lastQueryEmbedLatencyMs = 0 + + // Cache versioning (increment when chunk/embed format changes) + private static CACHE_VERSION = 1 + + constructor(worktreePath: string, embedService: EmbeddingService) { + this.worktreePath = worktreePath + this.embedService = embedService + } + + /** + * Generate cache key for an index + */ + static getCacheKey( + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + ): string { + return `${instanceId}_${embeddingProvider}_${embeddingDimensions}` + } + + /** + * Get cache file path + */ + static getCachePath(cacheDir: string, cacheKey: string): string { + return join(cacheDir, `${cacheKey}.json`) + } + + /** + * Check if a valid cache exists + */ + static cacheExists( + cacheDir: string, + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + ): boolean { + const cacheKey = SemanticSearchIndex.getCacheKey( + instanceId, + embeddingProvider, + embeddingDimensions, + ) + const cachePath = SemanticSearchIndex.getCachePath(cacheDir, cacheKey) + return existsSync(cachePath) + } + + /** + * Load index from cache + */ + static async loadFromCache( + cacheDir: string, + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + worktreePath: string, + embedService: EmbeddingService, + ): Promise { + const cacheKey = SemanticSearchIndex.getCacheKey( + instanceId, + embeddingProvider, + embeddingDimensions, + ) + const cachePath = SemanticSearchIndex.getCachePath(cacheDir, cacheKey) + + if (!existsSync(cachePath)) { + return null + } + + try { + const startTime = Date.now() + const data = JSON.parse(readFileSync(cachePath, 'utf-8')) + + // Validate cache version + if (data.metadata?.version !== SemanticSearchIndex.CACHE_VERSION) { + console.log(`[semantic-search] Cache version mismatch, will re-index`) + return null + } + + const index = new SemanticSearchIndex(worktreePath, embedService) + index.chunks = data.chunks + index.totalEmbedTokens = data.metadata.totalEmbedTokens || 0 + index.totalEmbedLatencyMs = data.metadata.totalEmbedLatencyMs || 0 + index.indexLoadMs = Date.now() - startTime + + console.log( + `[semantic-search] Loaded ${index.chunks.length} chunks from cache in ${index.indexLoadMs}ms`, + ) + + return index + } catch (err) { + console.warn(`[semantic-search] Failed to load cache: ${err}`) + return null + } + } + + /** + * Save index to cache + */ + async saveToCache( + cacheDir: string, + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + ): Promise { + try { + // Ensure cache directory exists + if (!existsSync(cacheDir)) { + mkdirSync(cacheDir, { recursive: true }) + } + + const cacheKey = 
SemanticSearchIndex.getCacheKey( + instanceId, + embeddingProvider, + embeddingDimensions, + ) + const cachePath = SemanticSearchIndex.getCachePath(cacheDir, cacheKey) + + const metadata: IndexCacheMetadata = { + version: SemanticSearchIndex.CACHE_VERSION, + embeddingProvider, + embeddingDimensions, + chunkSettings: { + maxChunkSize: 1500, + overlapLines: 5, + }, + createdAt: new Date().toISOString(), + chunkCount: this.chunks.length, + totalEmbedTokens: this.totalEmbedTokens, + totalEmbedLatencyMs: this.totalEmbedLatencyMs, + } + + const data = { + metadata, + chunks: this.chunks, + } + + writeFileSync(cachePath, JSON.stringify(data)) + console.log(`[semantic-search] Saved index to cache: ${cachePath}`) + } catch (err) { + console.warn(`[semantic-search] Failed to save cache: ${err}`) + } + } + + /** + * Index files in the repository + */ + async index(filePaths: string[]): Promise { + const startTime = Date.now() + console.log(`[semantic-search] Indexing ${filePaths.length} files...`) + + // Chunk all files + const allChunks: { + filepath: string + text: string + contextualizedText: string + startLine: number + endLine: number + }[] = [] + + for (const filepath of filePaths) { + try { + const fullPath = join(this.worktreePath, filepath) + const content = readFileSync(fullPath, 'utf-8') + + const chunks = await codeChunk(filepath, content, { + maxChunkSize: 1500, + overlapLines: 5, + }) + + for (const c of chunks) { + allChunks.push({ + filepath, + text: c.text, + contextualizedText: c.contextualizedText, + startLine: c.lineRange.start, + endLine: c.lineRange.end, + }) + } + } catch (err) { + // Skip files that can't be chunked (binary, too large, etc.) + console.warn(`[semantic-search] Failed to chunk ${filepath}: ${err}`) + } + } + + console.log(`[semantic-search] Created ${allChunks.length} chunks`) + + // Batch embed chunks (using contextualizedText for better semantic matching) + const batchSize = 100 + for (let i = 0; i < allChunks.length; i += batchSize) { + const batch = allChunks.slice(i, i + batchSize) + const texts = batch.map((c) => c.contextualizedText) + + const embedStart = Date.now() + const { embeddings, tokens } = await this.embedService.embed( + texts, + 'RETRIEVAL_DOCUMENT', // Task type for indexing documents/code chunks + ) + const embedLatency = Date.now() - embedStart + + this.totalEmbedTokens += tokens + this.totalEmbedLatencyMs += embedLatency + + for (let j = 0; j < batch.length; j++) { + this.chunks.push({ + ...batch[j]!, + embedding: embeddings[j]!, + }) + } + + console.log( + `[semantic-search] Embedded batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(allChunks.length / batchSize)} (${tokens} tokens, ${embedLatency}ms)`, + ) + } + + this.indexLoadMs = Date.now() - startTime + console.log( + `[semantic-search] Indexing complete: ${this.chunks.length} chunks, ${this.totalEmbedTokens} tokens, ${this.indexLoadMs}ms`, + ) + } + + /** + * Search for relevant chunks given a query + */ + async search( + query: string, + topK = 10, + filters?: { filepathPattern?: string }, + ): Promise { + if (this.chunks.length === 0) { + return [] + } + + // Embed the query with CODE_RETRIEVAL_QUERY for optimal code search + const embedStart = Date.now() + const { embeddings, tokens } = await this.embedService.embed( + [query], + 'CODE_RETRIEVAL_QUERY', // Task type optimized for code retrieval queries + ) + const queryEmbedding = embeddings[0]! 
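Putting the cache and index pieces together, a hedged end-to-end sketch: load a cached index if one exists, otherwise build and cache it, then run a query. The cache directory, instance ID, worktree path, file list, and query string are all hypothetical.

```ts
// Sketch of the SemanticSearchIndex lifecycle (values are placeholders).
const embedService = new GeminiEmbeddingService(process.env.GOOGLE_API_KEY ?? '')
const cacheDir = './data/index_cache'
const instanceId = 'django__django-12345'
const worktree = '/tmp/worktrees/django__django-12345'

let index = await SemanticSearchIndex.loadFromCache(
  cacheDir, instanceId, 'gemini', 768, worktree, embedService,
)
if (!index) {
  index = new SemanticSearchIndex(worktree, embedService)
  await index.index(['django/db/models/query.py']) // normally every tracked source file
  await index.saveToCache(cacheDir, instanceId, 'gemini', 768)
}

const hits = await index.search('queryset union drops ordering', 10)
console.log(
  hits.map((h) => `${h.filepath}:${h.start_line}-${h.end_line} (${h.score.toFixed(3)})`),
)
```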
+ this.lastQueryEmbedLatencyMs = Date.now() - embedStart + this.lastQueryEmbedTokens = tokens + this.totalEmbedTokens += tokens + this.totalEmbedLatencyMs += this.lastQueryEmbedLatencyMs + + // Filter chunks if needed + let candidates = this.chunks + if (filters?.filepathPattern) { + const pattern = new RegExp(filters.filepathPattern) + candidates = candidates.filter((c) => pattern.test(c.filepath)) + } + + // Compute similarities + const scored = candidates.map((c) => ({ + chunk: c, + score: cosineSimilarity(queryEmbedding, c.embedding!), + })) + + // Sort by score descending + scored.sort((a, b) => b.score - a.score) + + // Return top-k results with absolute paths (SDK Read tool requires absolute paths) + const results = scored.slice(0, topK).map((s) => ({ + filepath: join(this.worktreePath, s.chunk.filepath), + start_line: s.chunk.startLine, + end_line: s.chunk.endLine, + score: s.score, + snippet: s.chunk.text.slice(0, 200), // Truncate for compactness + })) + + return results + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// MCP Server for semantic search tool +// ───────────────────────────────────────────────────────────────────────────── + +export interface SemanticSearchMetrics { + callCount: number + totalLatencyMs: number + totalQueryEmbedTokens: number + totalQueryEmbedLatencyMs: number +} + +export function createSemanticSearchMetrics(): SemanticSearchMetrics { + return { + callCount: 0, + totalLatencyMs: 0, + totalQueryEmbedTokens: 0, + totalQueryEmbedLatencyMs: 0, + } +} + +export function createSemanticSearchServer( + index: SemanticSearchIndex, + metrics: SemanticSearchMetrics, +) { + return createSdkMcpServer({ + name: 'semantic_search', + version: '1.0.0', + tools: [ + tool( + 'search', + 'Search the codebase for code semantically similar to the query. Returns ranked file locations with snippets.', + { + query: z + .string() + .describe( + "Natural language description of what code you're looking for", + ), + top_k: z + .number() + .min(1) + .max(50) + .default(10) + .describe('Number of results to return'), + filepath_pattern: z + .string() + .optional() + .describe('Optional regex pattern to filter file paths'), + }, + async (args) => { + const startTime = Date.now() + try { + const results = await index.search(args.query, args.top_k, { + filepathPattern: args.filepath_pattern, + }) + metrics.callCount++ + metrics.totalLatencyMs += Date.now() - startTime + metrics.totalQueryEmbedTokens += index.lastQueryEmbedTokens + metrics.totalQueryEmbedLatencyMs += index.lastQueryEmbedLatencyMs + + if (results.length === 0) { + return { + content: [ + { + type: 'text' as const, + text: 'No matching code found for the query.', + }, + ], + } + } + + const formatted = results.map((r, i) => { + const lines = + r.start_line && r.end_line + ? `L${r.start_line}-${r.end_line}` + : '' + return `${i + 1}. ${r.filepath}${lines ? `:${lines}` : ''} (score: ${r.score.toFixed(3)})\n ${r.snippet?.replace(/\n/g, '\n ')}` + }) + const fileList = results.map((r) => r.filepath).join('\n') + + return { + content: [ + { + type: 'text' as const, + text: `Found ${results.length} relevant code locations:\n\n${formatted.join('\n\n')}\n\n__FILES__\n${fileList}\n__END_FILES__`, + }, + ], + } + } catch (error) { + return { + content: [ + { + type: 'text' as const, + text: `Error during semantic search: ${error instanceof Error ? 
error.message : String(error)}`, + }, + ], + isError: true, + } + } + }, + ), + ], + }) +} diff --git a/packages/eval/src/swebench/agent/variants.ts b/packages/eval/src/swebench/agent/variants.ts new file mode 100644 index 0000000..29426fb --- /dev/null +++ b/packages/eval/src/swebench/agent/variants.ts @@ -0,0 +1,92 @@ +/** + * Agent variant definitions for retrieval-only evaluation + */ + +import type { AgentVariant } from '../types' + +/** + * Configuration for an agent variant + */ +export interface AgentConfig { + variant: AgentVariant + tools: string[] + allowedTools: string[] + maxTurns: number + maxToolCalls: number +} + +/** + * Agent1: Ops-only (Read/Grep/Glob) + */ +export const AGENT1_CONFIG: AgentConfig = { + variant: 'ops-only', + tools: ['Read', 'Grep', 'Glob'], + allowedTools: ['Read', 'Grep', 'Glob'], + maxTurns: 20, + maxToolCalls: 50, +} + +/** + * Agent2: Ops + Semantic Search + * Includes the custom semantic search tool exposed as MCP tool + */ +export const AGENT2_CONFIG: AgentConfig = { + variant: 'ops-plus-search', + tools: ['Read', 'Grep', 'Glob'], + allowedTools: [ + 'Read', + 'Grep', + 'Glob', + 'mcp__semantic_search__search', // Custom MCP tool + ], + maxTurns: 20, + maxToolCalls: 50, +} + +/** + * Tools that are explicitly denied (for logging/enforcement) + */ +export const DENIED_TOOLS = [ + 'Write', + 'Edit', + 'Bash', + 'WebFetch', + 'TodoRead', + 'TodoWrite', + 'NotebookRead', + 'NotebookEdit', +] + +/** + * Get agent config by variant + */ +export function getAgentConfig(variant: AgentVariant): AgentConfig { + return variant === 'ops-only' ? AGENT1_CONFIG : AGENT2_CONFIG +} + +/** + * Check if a tool is allowed for a variant + */ +export function isToolAllowed( + toolName: string, + config: AgentConfig, + toolCallCount: number, +): { allowed: boolean; reason?: string } { + // Check tool budget + if (toolCallCount >= config.maxToolCalls) { + return { allowed: false, reason: 'Tool budget exceeded' } + } + + // Check if tool is explicitly allowed + if (config.allowedTools.includes(toolName)) { + return { allowed: true } + } + + // Check if tool is explicitly denied + if (DENIED_TOOLS.includes(toolName)) { + return { allowed: false, reason: 'Tool is denied for retrieval-only mode' } + } + + // Default: deny unknown tools + return { allowed: false, reason: 'Tool not in allowlist' } +} diff --git a/packages/eval/src/swebench/aggregate.ts b/packages/eval/src/swebench/aggregate.ts new file mode 100644 index 0000000..b6abe86 --- /dev/null +++ b/packages/eval/src/swebench/aggregate.ts @@ -0,0 +1,312 @@ +/** + * Aggregate metrics across all instances for summary reporting + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname } from 'node:path' +import type { AgentVariant, AggregateSummary, InstanceMetrics } from './types' + +/** + * Load metrics from JSONL file + */ +export function loadMetrics(filePath: string): InstanceMetrics[] { + const content = readFileSync(filePath, 'utf-8') + return content + .trim() + .split('\n') + .filter(Boolean) + .map((line) => JSON.parse(line) as InstanceMetrics) +} + +/** + * Compute percentile + */ +function percentile(values: number[], p: number): number { + if (values.length === 0) return 0 + const sorted = [...values].sort((a, b) => a - b) + const idx = Math.ceil((p / 100) * sorted.length) - 1 + return sorted[Math.max(0, idx)]! 
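The agent configs and the `isToolAllowed` gate in `variants.ts` can be exercised as below; the import path assumes a caller inside `src/swebench/`, and the call counts are illustrative.

```ts
// Hedged sketch of the tool-gating helpers from variants.ts.
import { getAgentConfig, isToolAllowed } from './agent/variants'

const config = getAgentConfig('ops-plus-search')

// Within budget and on the allowlist -> permitted.
console.log(isToolAllowed('mcp__semantic_search__search', config, 3))
// { allowed: true }

// Explicitly denied for retrieval-only mode.
console.log(isToolAllowed('Bash', config, 3))
// { allowed: false, reason: 'Tool is denied for retrieval-only mode' }

// Budget exhausted (maxToolCalls defaults to 50 above).
console.log(isToolAllowed('Read', config, 50))
// { allowed: false, reason: 'Tool budget exceeded' }
```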
+} + +/** + * Compute mean + */ +function mean(values: number[]): number { + if (values.length === 0) return 0 + return values.reduce((a, b) => a + b, 0) / values.length +} + +/** + * Compute median + */ +function median(values: number[]): number { + return percentile(values, 50) +} + +/** + * Filter values, removing nulls + */ +function filterNulls(values: (number | null)[]): number[] { + return values.filter((v): v is number => v !== null) +} + +/** + * Sum helper + */ +function sum(values: number[]): number { + return values.reduce((a, b) => a + b, 0) +} + +/** + * Aggregate metrics for a single agent variant + */ +function aggregateForVariant( + metrics: InstanceMetrics[], + variant: AgentVariant, +): AggregateSummary['agent_summaries'][0] { + const variantMetrics = metrics.filter((m) => m.agent_variant === variant) + const n = variantMetrics.length + + if (n === 0) { + return { + agent_variant: variant, + hit_at_1_rate: 0, + hit_at_3_rate: 0, + hit_at_5_rate: 0, + hit_at_10_rate: 0, + mean_mrr: 0, + mean_coverage_at_5: 0, + mean_coverage_at_10: 0, + total_duration_ms: 0, + total_tokens: 0, + total_input_tokens: 0, + total_output_tokens: 0, + total_cost_usd: 0, + total_tool_calls: 0, + median_duration_ms: 0, + p90_duration_ms: 0, + median_tokens: 0, + p90_tokens: 0, + median_time_to_first_hit_ms: null, + median_tokens_to_first_hit: null, + } + } + + // Hit rates (using behavioral ranking) + const hit1 = variantMetrics.filter((m) => m.hit_at_1_behavioral).length / n + const hit3 = variantMetrics.filter((m) => m.hit_at_3_behavioral).length / n + const hit5 = variantMetrics.filter((m) => m.hit_at_5_behavioral).length / n + const hit10 = variantMetrics.filter((m) => m.hit_at_10_behavioral).length / n + + // MRR and coverage + const mrrValues = variantMetrics.map((m) => m.mrr_behavioral) + const cov5Values = variantMetrics.map((m) => m.coverage_at_5_behavioral) + const cov10Values = variantMetrics.map((m) => m.coverage_at_10_behavioral) + + // Duration and tokens + const durations = variantMetrics.map((m) => m.total_duration_ms) + const inputTokens = variantMetrics.map((m) => m.total_input_tokens) + const outputTokens = variantMetrics.map((m) => m.total_output_tokens) + const tokens = variantMetrics.map( + (m) => m.total_input_tokens + m.total_output_tokens, + ) + const costs = variantMetrics.map((m) => m.total_cost_usd) + const toolCalls = variantMetrics.map((m) => m.tool_calls_count) + + // Time/tokens to first hit + const timesToHit = filterNulls( + variantMetrics.map((m) => m.time_to_first_hit_ms), + ) + const tokensToHit = filterNulls( + variantMetrics.map((m) => m.tokens_to_first_hit), + ) + + return { + agent_variant: variant, + hit_at_1_rate: hit1, + hit_at_3_rate: hit3, + hit_at_5_rate: hit5, + hit_at_10_rate: hit10, + mean_mrr: mean(mrrValues), + mean_coverage_at_5: mean(cov5Values), + mean_coverage_at_10: mean(cov10Values), + // Totals + total_duration_ms: sum(durations), + total_tokens: sum(tokens), + total_input_tokens: sum(inputTokens), + total_output_tokens: sum(outputTokens), + total_cost_usd: sum(costs), + total_tool_calls: sum(toolCalls), + // Medians + median_duration_ms: median(durations), + p90_duration_ms: percentile(durations, 90), + median_tokens: median(tokens), + p90_tokens: percentile(tokens, 90), + median_time_to_first_hit_ms: + timesToHit.length > 0 ? median(timesToHit) : null, + median_tokens_to_first_hit: + tokensToHit.length > 0 ? 
median(tokensToHit) : null, + } +} + +/** + * Aggregate all metrics into a summary + */ +export function aggregateMetrics( + metrics: InstanceMetrics[], + split: string, +): AggregateSummary { + const opsOnly = aggregateForVariant(metrics, 'ops-only') + const opsPlusSearch = aggregateForVariant(metrics, 'ops-plus-search') + + // Compute deltas (Agent2 - Agent1) - use totals for cost comparison + const delta = { + hit_at_1_delta: opsPlusSearch.hit_at_1_rate - opsOnly.hit_at_1_rate, + hit_at_3_delta: opsPlusSearch.hit_at_3_rate - opsOnly.hit_at_3_rate, + hit_at_5_delta: opsPlusSearch.hit_at_5_rate - opsOnly.hit_at_5_rate, + mrr_delta: opsPlusSearch.mean_mrr - opsOnly.mean_mrr, + duration_ms_delta: + opsPlusSearch.total_duration_ms - opsOnly.total_duration_ms, + tokens_delta: opsPlusSearch.total_tokens - opsOnly.total_tokens, + cost_usd_delta: opsPlusSearch.total_cost_usd - opsOnly.total_cost_usd, + } + + return { + split, + total_instances: new Set(metrics.map((m) => m.instance_id)).size, + agent_summaries: [opsOnly, opsPlusSearch], + delta, + } +} + +/** + * Write summary to JSON file + */ +export function writeSummary( + filePath: string, + summary: AggregateSummary, +): void { + const dir = dirname(filePath) + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }) + } + + writeFileSync(filePath, JSON.stringify(summary, null, 2)) +} + +/** + * Format duration nicely + */ +function formatDuration(ms: number): string { + if (ms < 1000) return `${ms.toFixed(0)}ms` + if (ms < 60000) return `${(ms / 1000).toFixed(1)}s` + return `${(ms / 60000).toFixed(1)}m` +} + +/** + * Format token count nicely + */ +function formatTokens(tokens: number): string { + if (tokens < 1000) return `${tokens}` + if (tokens < 1000000) return `${(tokens / 1000).toFixed(1)}k` + return `${(tokens / 1000000).toFixed(2)}M` +} + +/** + * Print summary to console + */ +export function printSummary(summary: AggregateSummary): void { + const W = 70 // Total box width + const line = '='.repeat(W - 2) + + console.log(`\n+${line}+`) + console.log( + '|' + + ` SWE-bench Lite Retrieval Evaluation Summary (${summary.split})`.padEnd( + W - 2, + ) + + '|', + ) + console.log(`+${line}+`) + console.log( + `|${` Total instances: ${summary.total_instances}`.padEnd(W - 2)}|`, + ) + console.log(`+${line}+\n`) + + for (const agent of summary.agent_summaries) { + const title = ` ${agent.agent_variant.toUpperCase()} ` + const titlePad = Math.floor((W - 2 - title.length) / 2) + const header = + '='.repeat(titlePad) + title + '='.repeat(W - 2 - titlePad - title.length) + + console.log(`+${header}+`) + + // Quality metrics + console.log(`${'| QUALITY'.padEnd(W - 1)}|`) + const h1 = `${(agent.hit_at_1_rate * 100).toFixed(1)}%`.padStart(6) + const h3 = `${(agent.hit_at_3_rate * 100).toFixed(1)}%`.padStart(6) + const h5 = `${(agent.hit_at_5_rate * 100).toFixed(1)}%`.padStart(6) + const h10 = `${(agent.hit_at_10_rate * 100).toFixed(1)}%`.padStart(6) + console.log( + `| Hit@1:${h1} Hit@3:${h3} Hit@5:${h5} Hit@10:${h10}`.padEnd(W - 1) + + '|', + ) + console.log( + `${`| MRR: ${agent.mean_mrr.toFixed(3)} Coverage@5: ${(agent.mean_coverage_at_5 * 100).toFixed(1)}% Coverage@10: ${(agent.mean_coverage_at_10 * 100).toFixed(1)}%`.padEnd( + W - 1, + )}|`, + ) + + // Totals + console.log(`${'| TOTALS'.padEnd(W - 1)}|`) + const dur = formatDuration(agent.total_duration_ms).padStart(8) + const tok = formatTokens(agent.total_tokens).padStart(7) + const tokIn = formatTokens(agent.total_input_tokens) + const tokOut = formatTokens(agent.total_output_tokens) 
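End to end, the aggregation module turns a per-instance `metrics.jsonl` artifact into a summary report. A hedged sketch follows; the run directory path is hypothetical and the import path assumes a caller in `src/`.

```ts
// Hedged sketch: aggregate a metrics.jsonl run artifact and report it.
import {
  aggregateMetrics,
  loadMetrics,
  printSummary,
  writeSummary,
} from './swebench/aggregate'

const metrics = loadMetrics('./runs/2025-01-01/metrics.jsonl') // one JSON object per line
const summary = aggregateMetrics(metrics, 'test')              // split label for the report

writeSummary('./runs/2025-01-01/summary.json', summary)
printSummary(summary) // boxed console report, incl. the ops-plus-search minus ops-only delta
```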
+ console.log( + `${`| Duration:${dur} Tokens:${tok} (in: ${tokIn}, out: ${tokOut})`.padEnd( + W - 1, + )}|`, + ) + const cost = agent.total_cost_usd.toFixed(4).padStart(8) + console.log( + `${`| Cost: $${cost} Tool calls: ${agent.total_tool_calls}`.padEnd( + W - 1, + )}|`, + ) + + // First hit metrics + if (agent.median_time_to_first_hit_ms !== null) { + console.log(`${'| FIRST HIT'.padEnd(W - 1)}|`) + console.log( + `${`| Time: ${formatDuration(agent.median_time_to_first_hit_ms)} Tokens: ${agent.median_tokens_to_first_hit ?? 'N/A'}`.padEnd( + W - 1, + )}|`, + ) + } + + console.log(`+${line}+\n`) + } + + // Delta comparison + console.log(`+${'-'.repeat(W - 2)}+`) + console.log(`|${' DELTA (ops-plus-search minus ops-only)'.padEnd(W - 2)}|`) + console.log(`+${'-'.repeat(W - 2)}+`) + const d1 = `${(summary.delta.hit_at_1_delta * 100) >= 0 ? '+' : ''}${(summary.delta.hit_at_1_delta * 100).toFixed(1)}%` + const dMrr = `${summary.delta.mrr_delta >= 0 ? '+' : ''}${summary.delta.mrr_delta.toFixed(3)}` + console.log( + `${`| Quality: Hit@1 ${d1.padStart(7)} MRR ${dMrr.padStart(7)}`.padEnd( + W - 1, + )}|`, + ) + const dTok = `${summary.delta.tokens_delta >= 0 ? '+' : ''}${formatTokens(summary.delta.tokens_delta)}` + const dCost = `${summary.delta.cost_usd_delta >= 0 ? '+' : ''}$${summary.delta.cost_usd_delta.toFixed(4)}` + console.log( + `${`| Cost: ${dTok.padStart(8)} tokens ${dCost.padStart(10)}`.padEnd( + W - 1, + )}|`, + ) + const dDur = `${summary.delta.duration_ms_delta >= 0 ? '+' : ''}${formatDuration(summary.delta.duration_ms_delta)}` + console.log(`${`| Duration: ${dDur.padStart(8)}`.padEnd(W - 1)}|`) + console.log(`+${'-'.repeat(W - 2)}+\n`) +} diff --git a/packages/eval/src/swebench/dataset.ts b/packages/eval/src/swebench/dataset.ts new file mode 100644 index 0000000..85e6673 --- /dev/null +++ b/packages/eval/src/swebench/dataset.ts @@ -0,0 +1,154 @@ +/** + * SWE-bench Lite dataset loader with caching and retry logic + * Fetches from Hugging Face Dataset Viewer /rows endpoint + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' +import type { SWEbenchInstance } from './types' + +const HF_DATASET = 'princeton-nlp/SWE-bench_Lite' +const HF_API_BASE = 'https://datasets-server.huggingface.co' +const PAGE_SIZE = 100 + +// Cache directory (relative to package root) +const CACHE_DIR = join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'data', + 'swebench_lite', +) + +/** + * Ensure cache directory exists + */ +function ensureCacheDir(): void { + if (!existsSync(CACHE_DIR)) { + mkdirSync(CACHE_DIR, { recursive: true }) + } +} + +/** + * Get cache file path for a page + */ +function getCachePath(split: string, offset: number): string { + return join(CACHE_DIR, `${split}_offset${offset}_limit${PAGE_SIZE}.json`) +} + +/** + * Sleep helper for retry backoff + */ +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +/** + * Fetch a page from HF Dataset Viewer with retry + exponential backoff + */ +async function fetchPage( + split: string, + offset: number, + maxRetries = 3, +): Promise<{ rows: SWEbenchInstance[]; total: number }> { + const url = new URL(`${HF_API_BASE}/rows`) + url.searchParams.set('dataset', HF_DATASET) + url.searchParams.set('config', 'default') + url.searchParams.set('split', split) + url.searchParams.set('offset', String(offset)) + url.searchParams.set('length', String(PAGE_SIZE)) + + for (let attempt = 0; attempt < 
maxRetries; attempt++) { + try { + const response = await fetch(url.toString()) + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`) + } + const data = await response.json() + + // HF returns { features, rows: [{row_idx, row: {...}}], num_rows_total } + const rows: SWEbenchInstance[] = data.rows.map((r: any) => ({ + instance_id: r.row.instance_id, + repo: r.row.repo, + base_commit: r.row.base_commit, + problem_statement: r.row.problem_statement, + patch: r.row.patch, + test_patch: r.row.test_patch, + })) + + return { rows, total: data.num_rows_total } + } catch (err) { + const waitMs = 2 ** attempt * 1000 + console.warn( + `[dataset] Fetch failed (attempt ${attempt + 1}/${maxRetries}): ${err}. Retrying in ${waitMs}ms...`, + ) + await sleep(waitMs) + } + } + + throw new Error( + `[dataset] Failed to fetch page after ${maxRetries} attempts: split=${split}, offset=${offset}`, + ) +} + +/** + * Load a page from cache or fetch from HF + */ +async function loadPage( + split: string, + offset: number, +): Promise<{ rows: SWEbenchInstance[]; total: number }> { + ensureCacheDir() + const cachePath = getCachePath(split, offset) + + // Check cache first + if (existsSync(cachePath)) { + try { + const cached = JSON.parse(readFileSync(cachePath, 'utf-8')) + return cached + } catch { + // Cache corrupted, refetch + } + } + + // Fetch from HF + const result = await fetchPage(split, offset) + + // Write to cache + writeFileSync(cachePath, JSON.stringify(result, null, 2)) + + return result +} + +/** + * Load all instances from a split + */ +export async function loadSWEbenchLite( + split: 'dev' | 'test' = 'test', + maxInstances?: number, +): Promise { + const instances: SWEbenchInstance[] = [] + let offset = 0 + let total = Infinity + + console.log(`[dataset] Loading SWE-bench Lite split="${split}"...`) + + while (offset < total) { + const page = await loadPage(split, offset) + total = page.total + instances.push(...page.rows) + console.log( + `[dataset] Loaded ${instances.length}/${total} instances (offset=${offset})`, + ) + + if (maxInstances && instances.length >= maxInstances) { + break + } + + offset += PAGE_SIZE + } + + const result = maxInstances ? 
instances.slice(0, maxInstances) : instances + console.log( + `[dataset] Loaded ${result.length} instances from split="${split}"`, + ) + return result +} diff --git a/packages/eval/src/swebench/git.ts b/packages/eval/src/swebench/git.ts new file mode 100644 index 0000000..f20e723 --- /dev/null +++ b/packages/eval/src/swebench/git.ts @@ -0,0 +1,204 @@ +/** + * Git repository manager: bare clones + worktrees for reproducible checkout + */ + +import { spawnSync } from 'node:child_process' +import { existsSync, mkdirSync, rmSync } from 'node:fs' +import { dirname, join } from 'node:path' +import type { WorktreeInfo } from './types' + +// Cache directory for bare clones (relative to package root) +const REPOS_CACHE_DIR = join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'data', + 'repos', +) + +// Worktrees directory +const WORKTREES_DIR = join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'data', + 'worktrees', +) + +/** + * Ensure directory exists + */ +function ensureDir(dir: string): void { + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }) + } +} + +/** + * Run a git command and return stdout + */ +function git(args: string[], cwd?: string): string { + const result = spawnSync('git', args, { + cwd, + encoding: 'utf-8', + maxBuffer: 50 * 1024 * 1024, // 50MB + }) + + if (result.status !== 0) { + throw new Error( + `git ${args.join(' ')} failed: ${result.stderr || result.stdout}`, + ) + } + + return result.stdout.trim() +} + +/** + * Get the bare clone path for a repo + * e.g. "django/django" -> "/path/to/repos/django__django.git" + */ +function getBareClonePath(repo: string): string { + const safeName = `${repo.replace(/\//g, '__')}.git` + return join(REPOS_CACHE_DIR, safeName) +} + +/** + * Get the worktree path for an instance + */ +function getWorktreePath(instanceId: string): string { + const safeName = instanceId.replace(/[^a-zA-Z0-9_-]/g, '_') + return join(WORKTREES_DIR, safeName) +} + +/** + * Ensure a bare clone exists for a repo, or create/update it + */ +async function ensureBareClone(repo: string): Promise { + ensureDir(REPOS_CACHE_DIR) + const barePath = getBareClonePath(repo) + + if (existsSync(barePath)) { + // Fetch latest + console.log(`[git] Fetching updates for ${repo}...`) + try { + git(['fetch', '--all', '--prune'], barePath) + } catch (err) { + console.warn(`[git] Fetch failed, will continue with existing: ${err}`) + } + } else { + // Clone bare + const url = `https://github.com/${repo}.git` + console.log(`[git] Cloning bare ${repo} from ${url}...`) + git(['clone', '--bare', url, barePath]) + } + + return barePath +} + +/** + * Create a worktree at a specific commit + */ +export async function createWorktree( + repo: string, + commit: string, + instanceId: string, +): Promise { + const startTime = Date.now() + + // Ensure bare clone exists + const barePath = await ensureBareClone(repo) + + // Get worktree path + ensureDir(WORKTREES_DIR) + const worktreePath = getWorktreePath(instanceId) + + // Remove existing worktree if it exists + if (existsSync(worktreePath)) { + console.log(`[git] Removing existing worktree at ${worktreePath}...`) + try { + git(['worktree', 'remove', '--force', worktreePath], barePath) + } catch { + // Force remove directory if git worktree remove fails + rmSync(worktreePath, { recursive: true, force: true }) + } + } + + // Create worktree + console.log( + `[git] Creating worktree for ${instanceId} at commit ${commit}...`, + ) + git(['worktree', 'add', '--detach', 
worktreePath, commit], barePath) + + // Verify the commit + const resolvedCommit = git(['rev-parse', 'HEAD'], worktreePath) + + const checkoutMs = Date.now() - startTime + console.log( + `[git] Worktree created at ${worktreePath} (commit: ${resolvedCommit}, took ${checkoutMs}ms)`, + ) + + return { + path: worktreePath, + commit: resolvedCommit, + checkout_ms: checkoutMs, + } +} + +/** + * Remove a worktree + */ +export async function removeWorktree( + repo: string, + instanceId: string, +): Promise { + const barePath = getBareClonePath(repo) + const worktreePath = getWorktreePath(instanceId) + + if (!existsSync(worktreePath)) { + return + } + + console.log(`[git] Removing worktree at ${worktreePath}...`) + try { + git(['worktree', 'remove', '--force', worktreePath], barePath) + } catch { + // Force remove directory if git worktree remove fails + rmSync(worktreePath, { recursive: true, force: true }) + } +} + +/** + * List all files in a worktree (for indexing) + */ +export function listFiles( + worktreePath: string, + extensions?: string[], +): string[] { + let files: string[] + + try { + // Use git ls-files for tracked files + const output = git(['ls-files'], worktreePath) + files = output.split('\n').filter(Boolean) + } catch { + // Fallback: use find + const result = spawnSync('find', ['.', '-type', 'f', '-name', '*.*'], { + cwd: worktreePath, + encoding: 'utf-8', + maxBuffer: 50 * 1024 * 1024, + }) + files = result.stdout + .split('\n') + .filter(Boolean) + .map((f) => f.replace(/^\.\//, '')) + } + + // Filter by extensions if provided + if (extensions && extensions.length > 0) { + const extSet = new Set( + extensions.map((e) => (e.startsWith('.') ? e : `.${e}`)), + ) + files = files.filter((f) => { + const ext = f.slice(f.lastIndexOf('.')) + return extSet.has(ext) + }) + } + + return files +} diff --git a/packages/eval/src/swebench/observe/instrumentation.ts b/packages/eval/src/swebench/observe/instrumentation.ts new file mode 100644 index 0000000..0437874 --- /dev/null +++ b/packages/eval/src/swebench/observe/instrumentation.ts @@ -0,0 +1,494 @@ +/** + * Instrumentation layer: hooks + message parsing + ranking + usage dedupe + */ + +import { appendFileSync, existsSync, mkdirSync } from 'node:fs' +import { dirname } from 'node:path' +import { matchesOracle, normalizePath } from '../score' +import type { AgentVariant, Event, OracleFiles } from '../types' + +// ───────────────────────────────────────────────────────────────────────────── +// Event Writer (merged from events.ts) +// ───────────────────────────────────────────────────────────────────────────── + +export class EventWriter { + private filePath: string + + constructor(filePath: string) { + this.filePath = filePath + const dir = dirname(filePath) + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) + } + + private write(event: Event): void { + appendFileSync(this.filePath, `${JSON.stringify(event)}\n`) + } + + writeSessionStart( + instanceId: string, + agentVariant: AgentVariant, + model: string, + cwd: string, + allowedTools: string[], + ): void { + this.write({ + type: 'session_start', + timestamp: Date.now(), + instance_id: instanceId, + agent_variant: agentVariant, + model, + cwd, + allowed_tools: allowedTools, + }) + } + + writeSessionEnd( + instanceId: string, + agentVariant: AgentVariant, + durationMs: number, + totalCostUsd: number, + usage: { + input_tokens: number + output_tokens: number + cache_read_input_tokens?: number + }, + topFilesFinal: string[], + rankedFilesFromTools: string[], + ): void { + 
this.write({ + type: 'session_end', + timestamp: Date.now(), + instance_id: instanceId, + agent_variant: agentVariant, + duration_ms: durationMs, + total_cost_usd: totalCostUsd, + usage, + top_files_final: topFilesFinal, + ranked_files_from_tools: rankedFilesFromTools, + }) + } + + writeToolCallStart( + toolName: string, + toolUseId: string, + input: unknown, + ): void { + this.write({ + type: 'tool_call', + timestamp: Date.now(), + tool_name: toolName, + tool_use_id: toolUseId, + input, + }) + } + + writeToolCallEnd( + toolName: string, + toolUseId: string, + input: unknown, + output: unknown, + latencyMs: number, + outputChars: number, + ): void { + this.write({ + type: 'tool_call', + timestamp: Date.now(), + tool_name: toolName, + tool_use_id: toolUseId, + input, + output, + latency_ms: latencyMs, + output_chars: outputChars, + }) + } + + writeToolCallError( + toolName: string, + toolUseId: string, + input: unknown, + error: string, + latencyMs: number, + ): void { + this.write({ + type: 'tool_call', + timestamp: Date.now(), + tool_name: toolName, + tool_use_id: toolUseId, + input, + error, + latency_ms: latencyMs, + }) + } + + writeUsage( + messageId: string, + inputTokens: number, + outputTokens: number, + cacheReadInputTokens?: number, + cacheCreationInputTokens?: number, + ): void { + this.write({ + type: 'usage', + timestamp: Date.now(), + message_id: messageId, + input_tokens: inputTokens, + output_tokens: outputTokens, + cache_read_input_tokens: cacheReadInputTokens, + cache_creation_input_tokens: cacheCreationInputTokens, + }) + } +} + +/** + * Accumulated usage (deduplicated by message ID) + */ +interface AccumulatedUsage { + input_tokens: number + output_tokens: number + cache_read_input_tokens: number + cache_creation_input_tokens: number +} + +/** + * Tool call timing for latency tracking + */ +interface ToolCallTiming { + startTime: number + toolName: string + input: unknown +} + +/** + * Run context for a single agent run + */ +export interface RunContext { + instanceId: string + agentVariant: AgentVariant + worktreePath: string + oracle: OracleFiles + eventWriter: EventWriter + + // State + startTime: number + toolCallCount: number + toolCallTimings: Map<string, ToolCallTiming> // toolUseId -> timing + seenMessageIds: Set<string> + accumulatedUsage: AccumulatedUsage + + // Ranked file extraction + rankedFilesFromTools: string[] // Behavioral: first-seen order from tools + seenFilePaths: Set<string> + + // Tool output size accounting + toolOutputCharsByType: Record<string, number> + + // First hit tracking + firstHitTime: number | null + firstHitTokens: number | null + toolUseIdToMessageId: Map<string, string> // For attribution +} + +/** + * Create a new run context + */ +export function createRunContext( + instanceId: string, + agentVariant: AgentVariant, + worktreePath: string, + oracle: OracleFiles, + eventWriter: EventWriter, +): RunContext { + return { + instanceId, + agentVariant, + worktreePath, + oracle, + eventWriter, + startTime: Date.now(), + toolCallCount: 0, + toolCallTimings: new Map(), + seenMessageIds: new Set(), + accumulatedUsage: { + input_tokens: 0, + output_tokens: 0, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + }, + rankedFilesFromTools: [], + seenFilePaths: new Set(), + toolOutputCharsByType: {}, + firstHitTime: null, + firstHitTokens: null, + toolUseIdToMessageId: new Map(), + } +} + +/** + * Extract file paths from tool output + */ +function extractFilePathsFromToolOutput( + toolName: string, + input: unknown, + output: unknown, +): string[] { + const paths: string[] = [] + + // Read tool:
file_path in input + if (toolName === 'Read' && typeof input === 'object' && input !== null) { + const readInput = input as { file_path?: string } + if (readInput.file_path) { + paths.push(readInput.file_path) + } + } + + // Grep tool: parse output for file paths + if (toolName === 'Grep' && typeof output === 'string') { + // Grep output format: "filepath:line:content" or just filepaths + const lines = output.split('\n') + for (const line of lines) { + const match = line.match(/^([^:]+):/) + if (match?.[1]) { + const candidate = match[1].trim() + // Skip if it's just a number (line number) or doesn't look like a path + if (/^\d+$/.test(candidate)) continue + // Must contain a path separator or file extension + if (candidate.includes('/') || candidate.includes('.')) { + paths.push(candidate) + } + } + } + } + + // Glob tool: output is typically a list of paths + if (toolName === 'Glob') { + if (typeof output === 'string') { + const lines = output.split('\n').filter(Boolean) + paths.push(...lines) + } else if (Array.isArray(output)) { + paths.push(...output.filter((p) => typeof p === 'string')) + } + } + + // Semantic search: results contain filepath + // Output can be a string, or MCP format: [{type:"text", text:"..."}] + if (toolName === 'mcp__semantic_search__search') { + let textContent = '' + + if (typeof output === 'string') { + textContent = output + } else if (Array.isArray(output)) { + // MCP response format: [{type: "text", text: "..."}] + for (const item of output) { + if (item && typeof item === 'object' && 'text' in item) { + textContent += `${(item as { text: string }).text}\n` + } + } + } else if (output && typeof output === 'object' && 'text' in output) { + textContent = (output as { text: string }).text + } + + if (textContent) { + // Extract file paths from structured __FILES__ block (added by semantic_search_tool.ts) + const filesMatch = textContent.match( + /__FILES__\n([\s\S]*?)\n__END_FILES__/, + ) + if (filesMatch?.[1]) { + const files = filesMatch[1].split('\n').filter(Boolean) + paths.push(...files) + } + } + } + + return paths +} + +/** + * Strip the worktree prefix from an absolute path to get repo-relative path + */ +function stripWorktreePrefix(path: string, worktreePath: string): string { + // Normalize both paths for comparison + const normalizedWorktree = `${worktreePath.replace(/\/+$/, '')}/` + + if (path.startsWith(normalizedWorktree)) { + return path.slice(normalizedWorktree.length) + } + + // Also handle URL-decoded paths (spaces vs %20) + const decodedPath = decodeURIComponent(path) + if (decodedPath.startsWith(normalizedWorktree)) { + return decodedPath.slice(normalizedWorktree.length) + } + + return path +} + +/** + * Record file paths from tool output and check for oracle hits + */ +export function recordFilePathsFromTool( + ctx: RunContext, + toolName: string, + _toolUseId: string, + input: unknown, + output: unknown, +): void { + const paths = extractFilePathsFromToolOutput(toolName, input, output) + + for (const rawPath of paths) { + // First strip the worktree prefix to get repo-relative path + const relativePath = stripWorktreePrefix(rawPath, ctx.worktreePath) + const normalized = normalizePath(relativePath) + if (!ctx.seenFilePaths.has(normalized)) { + ctx.seenFilePaths.add(normalized) + ctx.rankedFilesFromTools.push(normalized) + + // Check for first hit + if (ctx.firstHitTime === null && matchesOracle(normalized, ctx.oracle)) { + ctx.firstHitTime = Date.now() - ctx.startTime + // Tokens to first hit: sum usage up to the message that spawned 
this tool + // We can't perfectly compute this without tracking per-message, so we use accumulated + ctx.firstHitTokens = + ctx.accumulatedUsage.input_tokens + ctx.accumulatedUsage.output_tokens + } + } + } +} + +/** + * Pre-tool-use hook handler + */ +export function onPreToolUse( + ctx: RunContext, + toolName: string, + toolUseId: string, + input: unknown, +): void { + ctx.toolCallCount++ + ctx.toolCallTimings.set(toolUseId, { + startTime: Date.now(), + toolName, + input, + }) + + ctx.eventWriter.writeToolCallStart(toolName, toolUseId, input) +} + +/** + * Post-tool-use hook handler + */ +export function onPostToolUse( + ctx: RunContext, + toolName: string, + toolUseId: string, + input: unknown, + output: unknown, +): void { + const timing = ctx.toolCallTimings.get(toolUseId) + const latencyMs = timing ? Date.now() - timing.startTime : 0 + + // Serialize output for size accounting + const outputStr = typeof output === 'string' ? output : JSON.stringify(output) + const outputChars = outputStr.length + + // Accumulate tool output chars by type + ctx.toolOutputCharsByType[toolName] = + (ctx.toolOutputCharsByType[toolName] || 0) + outputChars + + ctx.eventWriter.writeToolCallEnd( + toolName, + toolUseId, + input, + output, + latencyMs, + outputChars, + ) + + // Extract and record file paths + recordFilePathsFromTool(ctx, toolName, toolUseId, input, output) + + ctx.toolCallTimings.delete(toolUseId) +} + +/** + * Post-tool-use-failure hook handler + */ +export function onPostToolUseFailure( + ctx: RunContext, + toolName: string, + toolUseId: string, + input: unknown, + error: string, +): void { + const timing = ctx.toolCallTimings.get(toolUseId) + const latencyMs = timing ? Date.now() - timing.startTime : 0 + + ctx.eventWriter.writeToolCallError( + toolName, + toolUseId, + input, + error, + latencyMs, + ) + ctx.toolCallTimings.delete(toolUseId) +} + +/** + * Process an assistant message to extract tool_use blocks and usage + */ +export function processAssistantMessage( + ctx: RunContext, + message: { + id?: string + content?: unknown + usage?: { + input_tokens?: number + output_tokens?: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number + } + }, +): void { + const messageId = message.id + if (!messageId) return + + // Dedupe usage by message ID + if (!ctx.seenMessageIds.has(messageId)) { + ctx.seenMessageIds.add(messageId) + + if (message.usage) { + const usage = message.usage + ctx.accumulatedUsage.input_tokens += usage.input_tokens || 0 + ctx.accumulatedUsage.output_tokens += usage.output_tokens || 0 + ctx.accumulatedUsage.cache_read_input_tokens += + usage.cache_read_input_tokens || 0 + ctx.accumulatedUsage.cache_creation_input_tokens += + usage.cache_creation_input_tokens || 0 + + ctx.eventWriter.writeUsage( + messageId, + usage.input_tokens || 0, + usage.output_tokens || 0, + usage.cache_read_input_tokens, + usage.cache_creation_input_tokens, + ) + } + } + + // Build toolUseId -> messageId mapping from content blocks + if (Array.isArray(message.content)) { + for (const block of message.content) { + if (block && typeof block === 'object' && 'type' in block) { + if (block.type === 'tool_use' && 'id' in block) { + ctx.toolUseIdToMessageId.set(block.id as string, messageId) + } + } + } + } +} + +/** + * Get total tool output chars + */ +export function getTotalToolOutputChars(ctx: RunContext): number { + return Object.values(ctx.toolOutputCharsByType).reduce((a, b) => a + b, 0) +} diff --git a/packages/eval/src/swebench/run.ts b/packages/eval/src/swebench/run.ts 
new file mode 100644 index 0000000..b5db6c2 --- /dev/null +++ b/packages/eval/src/swebench/run.ts @@ -0,0 +1,669 @@ +/** + * SWE-bench Lite retrieval-only evaluation runner + * Main runner loop that orchestrates the evaluation + */ + +import { mkdirSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { query } from '@anthropic-ai/claude-agent-sdk' +import { + createUserPrompt, + parseTopFiles, + RETRIEVAL_ONLY_SYSTEM_PROMPT, + RETRIEVAL_WITH_SEARCH_SYSTEM_PROMPT, +} from './agent/prompts' +import { + createSemanticSearchMetrics, + createSemanticSearchServer, + GeminiEmbeddingService, + SemanticSearchIndex, +} from './agent/semantic_search_adapter' +import { getAgentConfig } from './agent/variants' +import { aggregateMetrics, printSummary, writeSummary } from './aggregate' +import { loadSWEbenchLite } from './dataset' +import { createWorktree, listFiles, removeWorktree } from './git' +import { + createRunContext, + EventWriter, + onPostToolUse, + onPreToolUse, + processAssistantMessage, +} from './observe/instrumentation' +import { computeInstanceMetrics, extractOracle, writeMetrics } from './score' +import type { AgentVariant, InstanceMetrics, SWEbenchInstance } from './types' + +/** + * Configuration for the evaluation run + */ +export interface RunConfig { + split?: 'dev' | 'test' + maxInstances?: number + maxTurns?: number + maxToolCalls?: number + model?: string + runDir?: string + skipAgent1?: boolean + skipAgent2?: boolean + instanceIds?: string[] // Run specific instances only + indexExtensions?: string[] // File extensions to index for semantic search + embeddingDimensions?: number // Output dimensions for Gemini (768/1536/3072) + embeddingProvider?: 'gemini' | 'openai' +} + +const DEFAULT_CONFIG: Required< + Omit +> = { + split: 'test', + maxInstances: undefined as unknown as number, + maxTurns: 20, + maxToolCalls: 50, + model: 'claude-sonnet-4-5', + // Put runs in project root (not src/) to avoid polluting agent's Grep searches + runDir: join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'runs', + ), + skipAgent1: false, + skipAgent2: false, + indexExtensions: ['.py', '.js', '.ts', '.java', '.go', '.rs', '.rb', '.php'], + embeddingDimensions: 1536, // Gemini embedding dimensions (768/1536/3072) +} + +/** + * Run evaluation for a single agent on a single instance + */ +async function runAgent( + instance: SWEbenchInstance, + variant: AgentVariant, + worktreePath: string, + runTimestamp: string, + config: Required> & { + instanceIds?: string[] + embeddingProvider?: 'gemini' | 'openai' + }, + semanticIndex?: SemanticSearchIndex, +): Promise { + const agentConfig = getAgentConfig(variant) + const oracle = extractOracle(instance) + + // Setup output paths + const eventsPath = join( + config.runDir, + runTimestamp, + 'events', + `${instance.instance_id}_${variant}.jsonl`, + ) + const eventWriter = new EventWriter(eventsPath) + + // Create run context + const ctx = createRunContext( + instance.instance_id, + variant, + worktreePath, + oracle, + eventWriter, + ) + + // Log session start + eventWriter.writeSessionStart( + instance.instance_id, + variant, + config.model, + worktreePath, + agentConfig.allowedTools, + ) + + // Setup semantic search for Agent2 + const semanticSearchMetrics = createSemanticSearchMetrics() + const mcpServers: Record = {} + + if (variant === 'ops-plus-search' && semanticIndex) { + const semanticServer = createSemanticSearchServer( + semanticIndex, + semanticSearchMetrics, + ) + mcpServers.semantic_search = 
semanticServer + } + + // Create prompt with repo context + // Pass hasSemanticSearch=true for Agent2 to encourage semantic search usage + const hasSemanticSearch = variant === 'ops-plus-search' && !!semanticIndex + const userPrompt = createUserPrompt( + instance.problem_statement, + instance.repo, + hasSemanticSearch, + ) + + // Track tool call count for budget enforcement + let toolCallCount = 0 + + // Run the agent + let finalOutput = '' + let totalCostUsd = 0 + let totalDurationMs = 0 + + try { + // Build query options with explicit tool restriction + // SDK docs: 'tools' array = ONLY these tools available (excludes MCP!) + // SDK docs: 'allowedTools' = whitelist that includes MCP tools + const allowedBuiltinTools = ['Read', 'Grep', 'Glob', 'LS'] + + // Deny all tools that could modify or are not needed + const denyTools = [ + 'Write', + 'Edit', + 'Bash', + 'Task', + 'WebSearch', + 'WebFetch', + 'TodoRead', + 'TodoWrite', + 'NotebookRead', + 'NotebookEdit', + 'Agent', + 'MultiEdit', + ] + + // Select appropriate system prompt based on variant + const systemPrompt = + variant === 'ops-plus-search' + ? RETRIEVAL_WITH_SEARCH_SYSTEM_PROMPT + : RETRIEVAL_ONLY_SYSTEM_PROMPT + + const queryOptions: Record = { + cwd: worktreePath, // SDK uses 'cwd' not 'workingDirectory' for tool path resolution + model: config.model, + systemPrompt, + maxTurns: config.maxTurns, + disallowedTools: denyTools, + permissionMode: 'bypassPermissions', // Auto-allow for retrieval-only (no writes) + } + + // Configure tools based on variant + if (variant === 'ops-plus-search' && Object.keys(mcpServers).length > 0) { + // For Agent2 with MCP: use allowedTools (whitelist) instead of tools (restriction) + // This allows both built-in AND MCP tools + queryOptions.mcpServers = mcpServers + queryOptions.allowedTools = [ + ...allowedBuiltinTools, + 'mcp__semantic_search__search', + ] + } else { + // For Agent1: use tools array to strictly limit to built-in tools only + queryOptions.tools = allowedBuiltinTools + } + + console.log(`[runner] CWD (worktree): ${worktreePath}`) + if (queryOptions.tools) { + console.log( + `[runner] Tools (strict): [${(queryOptions.tools as string[]).join(', ')}]`, + ) + } + if (queryOptions.allowedTools) { + console.log( + `[runner] AllowedTools: [${(queryOptions.allowedTools as string[]).join(', ')}]`, + ) + } + console.log(`[runner] Denied: [${denyTools.slice(0, 5).join(', ')}...]`) + if (queryOptions.mcpServers) { + console.log(`[runner] MCP: semantic_search enabled`) + } + + const response = query({ + prompt: userPrompt, // Use simple string prompt instead of generator + options: queryOptions, + }) + + // Process streaming messages + for await (const message of response) { + // Cast message to any to handle SDK type inconsistencies with docs + const msg = message as Record + + switch (msg.type) { + case 'assistant': { + const msgContent = msg.message as { content?: unknown } | undefined + // Extract text content for final output parsing + if (typeof msgContent?.content === 'string') { + finalOutput = msgContent.content + // Log assistant thinking (truncate if long) + const preview = msgContent.content.slice(0, 150) + console.log( + ` [${variant}] thinking: ${preview}${msgContent.content.length > 150 ? '...' 
: ''}`, + ) + } else if (Array.isArray(msgContent?.content)) { + for (const block of msgContent.content) { + const b = block as { + type?: string + text?: string + name?: string + id?: string + input?: unknown + } + if (b?.type === 'text' && b.text) { + finalOutput = b.text + const preview = b.text.slice(0, 150) + console.log( + ` [${variant}] thinking: ${preview}${b.text.length > 150 ? '...' : ''}`, + ) + } else if (b?.type === 'tool_use') { + toolCallCount++ + const inputStr = JSON.stringify(b.input || {}).slice(0, 100) + console.log( + ` [${variant}] Tool[${toolCallCount}]: ${b.name}(${inputStr}${inputStr.length >= 100 ? '...' : ''})`, + ) + if (b.name && b.id) { + onPreToolUse(ctx, b.name, b.id, b.input) + } + } + } + } + // Process for usage and tool_use tracking + if (msgContent) { + processAssistantMessage( + ctx, + msgContent as { + id?: string + content?: unknown + usage?: { + input_tokens?: number + output_tokens?: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number + } + }, + ) + } + break + } + + case 'tool_result': { + // Track tool results - show brief result preview + const resultVal = msg.result + const resultStr = + typeof resultVal === 'string' + ? resultVal + : JSON.stringify(resultVal || '') + const resultPreview = resultStr.slice(0, 80) + console.log( + ` [${variant}] result: ${msg.tool_name}: ${resultPreview}${resultStr.length > 80 ? '...' : ''}`, + ) + if (msg.tool_name && msg.tool_use_id) { + onPostToolUse( + ctx, + msg.tool_name as string, + msg.tool_use_id as string, + msg.input, + msg.result, + ) + } + break + } + + case 'user': { + // Tool results come as "user" messages with tool_result content + const userMsg = msg.message as { content?: unknown[] } | undefined + if (Array.isArray(userMsg?.content)) { + for (const block of userMsg.content) { + const b = block as { + type?: string + tool_use_id?: string + content?: unknown + } + if (b?.type === 'tool_result' && b.tool_use_id) { + const resultContent = + typeof b.content === 'string' + ? b.content + : JSON.stringify(b.content || '') + const preview = resultContent.slice(0, 80) + + // Look up the tool info from when the call was made + const toolInfo = ctx.toolCallTimings.get(b.tool_use_id) + const toolName = toolInfo?.toolName || 'unknown' + const toolInput = toolInfo?.input || {} + + console.log( + ` [${variant}] result: ${toolName}: ${preview}${resultContent.length > 80 ? '...' 
: ''}`, + ) + onPostToolUse( + ctx, + toolName, + b.tool_use_id, + toolInput, + b.content, + ) + } + } + } + break + } + + case 'error': + console.error(` [${variant}] Agent error:`, msg.error) + break + + case 'result': { + totalCostUsd = (msg.total_cost_usd as number) || 0 + totalDurationMs = (msg.duration_ms as number) || 0 + // Extract token usage from result message if available + if (msg.total_input_tokens || msg.total_output_tokens) { + ctx.accumulatedUsage.input_tokens = + (msg.total_input_tokens as number) || 0 + ctx.accumulatedUsage.output_tokens = + (msg.total_output_tokens as number) || 0 + } + // Also check for usage object + const usage = msg.usage as + | { input_tokens?: number; output_tokens?: number } + | undefined + if (usage) { + ctx.accumulatedUsage.input_tokens = + usage.input_tokens || ctx.accumulatedUsage.input_tokens + ctx.accumulatedUsage.output_tokens = + usage.output_tokens || ctx.accumulatedUsage.output_tokens + } + { + const totalTokens = + ctx.accumulatedUsage.input_tokens + + ctx.accumulatedUsage.output_tokens + console.log( + ` [${variant}] Done in ${(totalDurationMs / 1000).toFixed(1)}s, cost: $${totalCostUsd.toFixed(4)}, tokens: ${totalTokens}`, + ) + } + break + } + + case 'system': { + if (msg.subtype === 'init') { + console.log(` [${variant}] Session: ${msg.session_id}`) + } + break + } + + default: + // Debug: log unknown message types + console.log( + ` [${variant}] [${msg.type}${msg.subtype ? `:${msg.subtype}` : ''}]`, + ) + } + } + } catch (err) { + console.error( + `[runner] Error running ${variant} on ${instance.instance_id}:`, + err, + ) + } + + // Parse final output for declared top_files + const topFilesFinal = parseTopFiles(finalOutput) + + // Log comparison: found vs expected + const oracleFilesArr = Array.from(ctx.oracle.files) + const foundFiles = ctx.rankedFilesFromTools.slice(0, 10) + const intersection = foundFiles.filter((f) => ctx.oracle.files.has(f)) + + console.log( + `\n +===================================================================+`, + ) + console.log( + ` | [${variant}] RESULTS |`, + ) + console.log( + ` +===================================================================+`, + ) + console.log( + ` | GOLDEN PATCH files: |`, + ) + for (const f of oracleFilesArr) { + console.log(` | - ${f.slice(0, 55).padEnd(55)} |`) + } + console.log( + ` +===================================================================+`, + ) + console.log( + ` | AGENT found files (top ${foundFiles.length}): |`, + ) + if (foundFiles.length === 0) { + console.log( + ` | (no files found) |`, + ) + } + for (const f of foundFiles) { + const match = ctx.oracle.files.has(f) ? 
'[x]' : '[ ]' + console.log(` | ${match} ${f.slice(0, 55).padEnd(55)} |`) + } + console.log( + ` +===================================================================+`, + ) + console.log( + ` | Hit: ${intersection.length}/${oracleFilesArr.length} | Tool calls: ${toolCallCount.toString().padEnd(3)} | Tokens: ${(ctx.accumulatedUsage.input_tokens + ctx.accumulatedUsage.output_tokens).toString().padEnd(8)} |`, + ) + console.log( + ` +===================================================================+\n`, + ) + + // Log session end + eventWriter.writeSessionEnd( + instance.instance_id, + variant, + totalDurationMs, + totalCostUsd, + { + input_tokens: ctx.accumulatedUsage.input_tokens, + output_tokens: ctx.accumulatedUsage.output_tokens, + cache_read_input_tokens: ctx.accumulatedUsage.cache_read_input_tokens, + }, + topFilesFinal, + ctx.rankedFilesFromTools, + ) + + // Compute metrics + const metrics = computeInstanceMetrics( + ctx, + topFilesFinal, + totalDurationMs, + totalCostUsd, + variant === 'ops-plus-search' && semanticIndex + ? { + callCount: semanticSearchMetrics.callCount, + totalQueryEmbedTokens: semanticSearchMetrics.totalQueryEmbedTokens, + totalQueryEmbedLatencyMs: + semanticSearchMetrics.totalQueryEmbedLatencyMs, + indexEmbedTokens: semanticIndex.totalEmbedTokens, + indexLoadMs: semanticIndex.indexLoadMs, + } + : undefined, + ) + + return metrics +} + +/** + * Main evaluation runner + */ +export async function runEvaluation( + config: Partial = {}, +): Promise { + const cfg: Required> & { + instanceIds?: string[] + embeddingProvider?: 'gemini' | 'openai' + } = { + ...DEFAULT_CONFIG, + ...config, + } + + // Create run directory + const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-') + const runDir = join(cfg.runDir, runTimestamp) + mkdirSync(runDir, { recursive: true }) + mkdirSync(join(runDir, 'events'), { recursive: true }) + + console.log(`[runner] Starting evaluation run at ${runDir}`) + console.log(`[runner] Config:`, cfg) + + // Load dataset + let instances = await loadSWEbenchLite(cfg.split, cfg.maxInstances) + + // Filter to specific instances if provided + if (cfg.instanceIds && cfg.instanceIds.length > 0) { + const ids = new Set(cfg.instanceIds) + instances = instances.filter((i) => ids.has(i.instance_id)) + console.log(`[runner] Filtered to ${instances.length} specific instances`) + } + + console.log(`[runner] Loaded ${instances.length} instances`) + + // Metrics output path + const metricsPath = join(runDir, 'metrics.jsonl') + const allMetrics: InstanceMetrics[] = [] + + // Process each instance + for (let i = 0; i < instances.length; i++) { + const instance = instances[i]! 
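+ // Per-instance flow: create a worktree at base_commit -> build or load the semantic index (Agent2 only) -> run each agent variant -> remove the worktree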
+ console.log( + `\n[runner] Processing instance ${i + 1}/${instances.length}: ${instance.instance_id}`, + ) + + // Checkout repo at base_commit + let worktree: + | { path: string; commit: string; checkout_ms: number } + | undefined + try { + worktree = await createWorktree( + instance.repo, + instance.base_commit, + instance.instance_id, + ) + } catch (err) { + console.error(`[runner] Failed to checkout ${instance.instance_id}:`, err) + continue + } + + try { + // Build semantic index for Agent2 (reused across both agents for fairness) + let semanticIndex: SemanticSearchIndex | undefined + if (!cfg.skipAgent2) { + const geminiKey = process.env.GOOGLE_API_KEY + if (!geminiKey) { + console.warn( + '[runner] GOOGLE_API_KEY not set, skipping semantic search indexing', + ) + } else { + const embedService = new GeminiEmbeddingService( + geminiKey, + 'gemini-embedding-001', + 5, + cfg.embeddingDimensions, + ) + console.log( + `[runner] Using Gemini embeddings (${cfg.embeddingDimensions} dimensions)`, + ) + // Check for cached index + const indexCacheDir = join(cfg.runDir, '.index_cache') + const cacheExists = SemanticSearchIndex.cacheExists( + indexCacheDir, + instance.instance_id, + 'gemini', + cfg.embeddingDimensions, + ) + + if (cacheExists) { + console.log(`[runner] Loading semantic index from cache...`) + const cached = await SemanticSearchIndex.loadFromCache( + indexCacheDir, + instance.instance_id, + 'gemini', + cfg.embeddingDimensions, + worktree.path, + embedService, + ) + if (cached) { + semanticIndex = cached + } + } + + // Index if not loaded from cache + if (!semanticIndex) { + semanticIndex = new SemanticSearchIndex(worktree.path, embedService) + + // List files to index + const files = listFiles(worktree.path, cfg.indexExtensions) + console.log( + `[runner] Indexing ${files.length} files for semantic search...`, + ) + + await semanticIndex.index(files) + + // Save to cache for future runs + await semanticIndex.saveToCache( + indexCacheDir, + instance.instance_id, + 'gemini', + cfg.embeddingDimensions, + ) + } + } + } + + // Run Agent1 (ops-only) + if (!cfg.skipAgent1) { + console.log( + `[runner] Running Agent1 (ops-only) on ${instance.instance_id}...`, + ) + const metrics1 = await runAgent( + instance, + 'ops-only', + worktree.path, + runTimestamp, + cfg, + ) + writeMetrics(metricsPath, metrics1) + allMetrics.push(metrics1) + console.log( + `[runner] Agent1 done: Hit@5=${metrics1.hit_at_5_behavioral}, MRR=${metrics1.mrr_behavioral.toFixed(3)}`, + ) + } + + // Run Agent2 (ops + semantic search) + if (!cfg.skipAgent2) { + if (!semanticIndex) { + console.warn( + `[runner] Skipping Agent2: semantic index not available (check API keys or indexing errors)`, + ) + } else { + console.log( + `[runner] Running Agent2 (ops+search) on ${instance.instance_id}...`, + ) + const metrics2 = await runAgent( + instance, + 'ops-plus-search', + worktree.path, + runTimestamp, + cfg, + semanticIndex, + ) + writeMetrics(metricsPath, metrics2) + allMetrics.push(metrics2) + console.log( + `[runner] Agent2 done: Hit@5=${metrics2.hit_at_5_behavioral}, MRR=${metrics2.mrr_behavioral.toFixed(3)}`, + ) + } + } + } finally { + // Cleanup worktree + try { + await removeWorktree(instance.repo, instance.instance_id) + } catch (err) { + console.warn(`[runner] Failed to cleanup worktree:`, err) + } + } + } + + // Aggregate and write summary + console.log('\n[runner] Computing aggregate summary...') + const summary = aggregateMetrics(allMetrics, cfg.split) + const summaryPath = join(runDir, 'summary.json') + 
writeSummary(summaryPath, summary) + printSummary(summary) + + console.log(`[runner] Evaluation complete. Results at ${runDir}`) +} diff --git a/packages/eval/src/swebench/score.ts b/packages/eval/src/swebench/score.ts new file mode 100644 index 0000000..b96c3d6 --- /dev/null +++ b/packages/eval/src/swebench/score.ts @@ -0,0 +1,183 @@ +/** + * Scoring functions for retrieval quality metrics + * Includes oracle extraction and path normalization (merged from oracle.ts) + */ + +import { appendFileSync, existsSync, mkdirSync } from 'node:fs' +import { dirname } from 'node:path' +import type { RunContext } from './observe/instrumentation' +import { getTotalToolOutputChars } from './observe/instrumentation' +import type { InstanceMetrics, OracleFiles, SWEbenchInstance } from './types' + +// ───────────────────────────────────────────────────────────────────────────── +// Oracle extraction (merged from oracle.ts) +// ───────────────────────────────────────────────────────────────────────────── + +const DIFF_HEADER_RE = /^diff --git a\/(.*?) b\/(.*)$/gm + +/** Normalize a file path: strip leading ./ or /, collapse slashes */ +export function normalizePath(path: string): string { + let n = path.trim() + while (n.startsWith('./')) n = n.slice(2) + while (n.startsWith('/')) n = n.slice(1) + n = n.replace(/\/+/g, '/') + while (n.endsWith('/')) n = n.slice(0, -1) + return n +} + +/** Extract file paths from unified diff patch */ +function extractFilesFromPatch(patch: string): Set<string> { + const files = new Set<string>() + let match = DIFF_HEADER_RE.exec(patch) + while (match !== null) { + const bPath = match[2]! + if (bPath !== '/dev/null' && bPath !== 'dev/null') { + const normalized = normalizePath(bPath) + if (normalized) files.add(normalized) + } + match = DIFF_HEADER_RE.exec(patch) + } + DIFF_HEADER_RE.lastIndex = 0 + return files +} + +/** Extract oracle files from a SWE-bench instance */ +export function extractOracle( + instance: SWEbenchInstance, + includeTestPatch = false, +): OracleFiles { + const files = extractFilesFromPatch(instance.patch) + if (includeTestPatch && instance.test_patch) { + for (const f of extractFilesFromPatch(instance.test_patch)) files.add(f) + } + return { instance_id: instance.instance_id, files } +} + +/** Check if a candidate path matches any oracle file */ +export function matchesOracle( + candidatePath: string, + oracle: OracleFiles, +): boolean { + return oracle.files.has(normalizePath(candidatePath)) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Ranking metrics +// ───────────────────────────────────────────────────────────────────────────── + +function hitAtK( + rankedFiles: string[], + oracle: OracleFiles, + k: number, +): boolean { + return rankedFiles.slice(0, k).some((f) => matchesOracle(f, oracle)) +} + +function reciprocalRank(rankedFiles: string[], oracle: OracleFiles): number { + for (let i = 0; i < rankedFiles.length; i++) { + if (matchesOracle(rankedFiles[i]!, oracle)) return 1 / (i + 1) + } + return 0 +} + +function coverageAtK( + rankedFiles: string[], + oracle: OracleFiles, + k: number, +): number { + if (oracle.files.size === 0) return 1 + const topK = new Set(rankedFiles.slice(0, k).map(normalizePath)) + let hits = 0 + for (const f of oracle.files) if (topK.has(f)) hits++ + return hits / oracle.files.size +} + +// ───────────────────────────────────────────────────────────────────────────── +// Instance metrics computation +// ───────────────────────────────────────────────────────────────────────────── + +/**
Compute all metrics for a single run */ +export function computeInstanceMetrics( + ctx: RunContext, + topFilesFinal: string[], + totalDurationMs: number, + totalCostUsd: number, + semanticSearchMetrics?: { + callCount: number + totalQueryEmbedTokens: number + totalQueryEmbedLatencyMs: number + indexEmbedTokens: number + indexLoadMs: number + }, +): InstanceMetrics { + const oracle = ctx.oracle + const rankedBehavioral = ctx.rankedFilesFromTools + const rankedDeclared = topFilesFinal + + return { + instance_id: ctx.instanceId, + agent_variant: ctx.agentVariant, + oracle_files: Array.from(oracle.files), + + // Quality metrics (behavioral ranking) + hit_at_1_behavioral: hitAtK(rankedBehavioral, oracle, 1), + hit_at_3_behavioral: hitAtK(rankedBehavioral, oracle, 3), + hit_at_5_behavioral: hitAtK(rankedBehavioral, oracle, 5), + hit_at_10_behavioral: hitAtK(rankedBehavioral, oracle, 10), + mrr_behavioral: reciprocalRank(rankedBehavioral, oracle), + coverage_at_5_behavioral: coverageAtK(rankedBehavioral, oracle, 5), + coverage_at_10_behavioral: coverageAtK(rankedBehavioral, oracle, 10), + + // Quality metrics (declared ranking) + hit_at_1_declared: hitAtK(rankedDeclared, oracle, 1), + hit_at_3_declared: hitAtK(rankedDeclared, oracle, 3), + hit_at_5_declared: hitAtK(rankedDeclared, oracle, 5), + hit_at_10_declared: hitAtK(rankedDeclared, oracle, 10), + mrr_declared: reciprocalRank(rankedDeclared, oracle), + coverage_at_5_declared: coverageAtK(rankedDeclared, oracle, 5), + coverage_at_10_declared: coverageAtK(rankedDeclared, oracle, 10), + + // Time/tokens to first hit + time_to_first_hit_ms: ctx.firstHitTime, + tokens_to_first_hit: ctx.firstHitTokens, + + // Efficiency metrics + total_duration_ms: totalDurationMs, + total_cost_usd: totalCostUsd, + total_input_tokens: ctx.accumulatedUsage.input_tokens, + total_output_tokens: ctx.accumulatedUsage.output_tokens, + tool_calls_count: ctx.toolCallCount, + + // Semantic search specific (Agent2 only) + semantic_search_calls: semanticSearchMetrics?.callCount, + embed_latency_ms: semanticSearchMetrics + ? semanticSearchMetrics.totalQueryEmbedLatencyMs + + semanticSearchMetrics.indexLoadMs + : undefined, + embed_tokens: semanticSearchMetrics + ? semanticSearchMetrics.totalQueryEmbedTokens + + semanticSearchMetrics.indexEmbedTokens + : undefined, + embed_cost_usd: semanticSearchMetrics + ? ((semanticSearchMetrics.totalQueryEmbedTokens + + semanticSearchMetrics.indexEmbedTokens) / + 1000) * + 0.00002 + : undefined, + + // Tool output size accounting + tool_output_chars_total: getTotalToolOutputChars(ctx), + tool_output_chars_by_type: { ...ctx.toolOutputCharsByType }, + + // Rankings + top_files_final: topFilesFinal, + ranked_files_from_tools: rankedBehavioral, + } +} + +/** Write metrics to JSONL file */ +export function writeMetrics(filePath: string, metrics: InstanceMetrics): void { + const dir = dirname(filePath) + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) + appendFileSync(filePath, `${JSON.stringify(metrics)}\n`) +} diff --git a/packages/eval/src/swebench/types.ts b/packages/eval/src/swebench/types.ts new file mode 100644 index 0000000..f183d13 --- /dev/null +++ b/packages/eval/src/swebench/types.ts @@ -0,0 +1,211 @@ +/** + * Core types for SWE-bench Lite retrieval-only evaluation + */ + +/** + * SWE-bench Lite instance (only fields needed for retrieval-only) + */ +export interface SWEbenchInstance { + instance_id: string + repo: string // e.g. 
"django/django" + base_commit: string + problem_statement: string + patch: string // unified diff + test_patch?: string // optional, for secondary reporting +} + +/** + * Parsed oracle: ground-truth file paths from the gold patch + */ +export interface OracleFiles { + instance_id: string + files: Set // normalized repo-relative paths +} + +/** + * Worktree checkout result + */ +export interface WorktreeInfo { + path: string + commit: string + checkout_ms: number +} + +/** + * Agent variant identifier + */ +export type AgentVariant = 'ops-only' | 'ops-plus-search' + +/** + * Semantic search result from the custom tool + */ +export interface SemanticSearchResult { + filepath: string + start_line?: number + end_line?: number + score: number + snippet?: string +} + +/** + * Per-tool-call event (logged in events.jsonl) + */ +export interface ToolCallEvent { + type: 'tool_call' + timestamp: number + tool_name: string + tool_use_id: string + input: unknown + output?: unknown + error?: string + latency_ms?: number + output_chars?: number +} + +/** + * LLM usage event (logged in events.jsonl) + */ +export interface UsageEvent { + type: 'usage' + timestamp: number + message_id: string + input_tokens: number + output_tokens: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number +} + +/** + * Session start event + */ +export interface SessionStartEvent { + type: 'session_start' + timestamp: number + instance_id: string + agent_variant: AgentVariant + model: string + cwd: string + allowed_tools: string[] +} + +/** + * Session end event + */ +export interface SessionEndEvent { + type: 'session_end' + timestamp: number + instance_id: string + agent_variant: AgentVariant + duration_ms: number + total_cost_usd: number + usage: { + input_tokens: number + output_tokens: number + cache_read_input_tokens?: number + } + top_files_final: string[] // declared from agent JSON output + ranked_files_from_tools: string[] // behavioral from tool traces +} + +/** + * Union of all event types + */ +export type Event = + | ToolCallEvent + | UsageEvent + | SessionStartEvent + | SessionEndEvent + +/** + * Per-instance per-agent metrics (one row in metrics.jsonl) + */ +export interface InstanceMetrics { + instance_id: string + agent_variant: AgentVariant + oracle_files: string[] + + // Quality metrics (behavioral ranking) + hit_at_1_behavioral: boolean + hit_at_3_behavioral: boolean + hit_at_5_behavioral: boolean + hit_at_10_behavioral: boolean + mrr_behavioral: number + coverage_at_5_behavioral: number + coverage_at_10_behavioral: number + + // Quality metrics (declared ranking) + hit_at_1_declared: boolean + hit_at_3_declared: boolean + hit_at_5_declared: boolean + hit_at_10_declared: boolean + mrr_declared: number + coverage_at_5_declared: number + coverage_at_10_declared: number + + // Time/tokens to first hit + time_to_first_hit_ms: number | null + tokens_to_first_hit: number | null + + // Efficiency metrics + total_duration_ms: number + total_cost_usd: number + total_input_tokens: number + total_output_tokens: number + tool_calls_count: number + semantic_search_calls?: number + + // Embedding metrics (Agent2 only) + embed_latency_ms?: number + embed_tokens?: number + embed_cost_usd?: number + + // Tool output size accounting + tool_output_chars_total: number + tool_output_chars_by_type: Record + + // Rankings + top_files_final: string[] + ranked_files_from_tools: string[] +} + +/** + * Aggregate summary across all instances + */ +export interface AggregateSummary { + split: string + 
total_instances: number + agent_summaries: { + agent_variant: AgentVariant + // Quality rates + hit_at_1_rate: number + hit_at_3_rate: number + hit_at_5_rate: number + hit_at_10_rate: number + mean_mrr: number + mean_coverage_at_5: number + mean_coverage_at_10: number + // Totals (sum across all instances) - for cost tracking + total_duration_ms: number + total_tokens: number + total_input_tokens: number + total_output_tokens: number + total_cost_usd: number + total_tool_calls: number + // Medians (for multi-instance benchmarks) + median_duration_ms: number + p90_duration_ms: number + median_tokens: number + p90_tokens: number + median_time_to_first_hit_ms: number | null + median_tokens_to_first_hit: number | null + }[] + delta: { + hit_at_1_delta: number + hit_at_3_delta: number + hit_at_5_delta: number + mrr_delta: number + duration_ms_delta: number + tokens_delta: number + cost_usd_delta: number + } +} diff --git a/packages/eval/uv.lock b/packages/eval/uv.lock deleted file mode 100644 index ad6e80f..0000000 --- a/packages/eval/uv.lock +++ /dev/null @@ -1,370 +0,0 @@ -version = 1 -revision = 2 -requires-python = ">=3.14" -resolution-markers = [ - "sys_platform == 'win32'", - "sys_platform != 'win32'", -] - -[[package]] -name = "anyio" -version = "4.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/16/ce/8a777047513153587e5434fd752e89334ac33e379aa3497db860eeb60377/anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0", size = 228266, upload-time = "2025-11-28T23:37:38.911Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" }, -] - -[[package]] -name = "certifi" -version = "2025.11.12" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, -] - -[[package]] -name = "chonkie" -version = "1.0.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "tokenizers" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0d/a8/c881853cf1759c27fb6e06ded72aa52eaa3e99407185cd2d32d70b55f3a4/chonkie-1.0.5.tar.gz", hash = "sha256:0396efcc8e79d25a2dd4fe6d01ac3b9c077e00472af1f04dd9b5183f89b92cfc", size = 63596, upload-time = "2025-04-22T01:18:39.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/ce/d8d8359b17761259f86df1e46a5dbebc637906f3b8cee6c02877134075d3/chonkie-1.0.5-py3-none-any.whl", hash = "sha256:81f66fc5897bf14d5c1b55d4f0735f85fec269f237929b5c6d8a6b548cfd1e10", size = 80815, upload-time = "2025-04-22T01:18:38.095Z" }, -] - -[package.optional-dependencies] -code = [ - { name = "tree-sitter" }, - { name = "tree-sitter-language-pack" 
}, -] - -[[package]] -name = "click" -version = "8.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "eval" -version = "0.1.0" -source = { virtual = "." } -dependencies = [ - { name = "chonkie", extra = ["code"] }, -] - -[package.metadata] -requires-dist = [{ name = "chonkie", extras = ["code"], specifier = ">=1.0.5" }] - -[[package]] -name = "filelock" -version = "3.20.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a7/23/ce7a1126827cedeb958fc043d61745754464eb56c5937c35bbf2b8e26f34/filelock-3.20.1.tar.gz", hash = "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c", size = 19476, upload-time = "2025-12-15T23:54:28.027Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" }, -] - -[[package]] -name = "fsspec" -version = "2025.12.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b6/27/954057b0d1f53f086f681755207dda6de6c660ce133c829158e8e8fe7895/fsspec-2025.12.0.tar.gz", hash = "sha256:c505de011584597b1060ff778bb664c1bc022e87921b0e4f10cc9c44f9635973", size = 309748, upload-time = "2025-12-03T15:23:42.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/c7/b64cae5dba3a1b138d7123ec36bb5ccd39d39939f18454407e5468f4763f/fsspec-2025.12.0-py3-none-any.whl", hash = "sha256:8bf1fe301b7d8acfa6e8571e3b1c3d158f909666642431cc78a1b7b4dbc5ec5b", size = 201422, upload-time = "2025-12-03T15:23:41.434Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "hf-xet" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" }, - { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" }, - { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" }, - { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" }, - { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" }, - { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" }, - { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" }, - { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" }, - { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" }, - { url = 
"https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" }, - { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" }, - { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" }, - { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - -[[package]] -name = "huggingface-hub" -version = "1.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, - { name = "httpx" }, - { name = "packaging" }, - { name = 
"pyyaml" }, - { name = "shellingham" }, - { name = "tqdm" }, - { name = "typer-slim" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a7/c8/9cd2fcb670ba0e708bfdf95a1177b34ca62de2d3821df0773bc30559af80/huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7", size = 614605, upload-time = "2025-12-12T15:31:42.161Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/df/8d/7ca723a884d55751b70479b8710f06a317296b1fa1c1dec01d0420d13e43/huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642", size = 520953, upload-time = "2025-12-12T15:31:40.339Z" }, -] - -[[package]] -name = "idna" -version = "3.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, -] - -[[package]] -name = "packaging" -version = "25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, -] - -[[package]] -name = "pyyaml" -version = "6.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, - { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, - { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, - { url = 
"https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, - { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, - { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, - { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, - { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, - { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, - { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, - { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 
839261, upload-time = "2025-09-25T21:32:51.808Z" }, - { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, - { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, - { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, -] - -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - -[[package]] -name = "tokenizers" -version = "0.22.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" }, - { url = "https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" }, - { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" }, - { url = 
"https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" }, - { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" }, - { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" }, - { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" }, - { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" }, - { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" }, - { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" }, - { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = "2025-09-19T09:49:19.146Z" }, - { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = "2025-09-19T09:49:21.501Z" }, - { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" }, - { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" }, -] - -[[package]] 
-name = "tqdm" -version = "4.67.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, -] - -[[package]] -name = "tree-sitter" -version = "0.25.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/e3/d9526ba71dfbbe4eba5e51d89432b4b333a49a1e70712aa5590cd22fc74f/tree_sitter-0.25.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:65d3c931013ea798b502782acab986bbf47ba2c452610ab0776cf4a8ef150fc0", size = 146776, upload-time = "2025-09-25T17:37:50.898Z" }, - { url = "https://files.pythonhosted.org/packages/42/97/4bd4ad97f85a23011dd8a535534bb1035c4e0bac1234d58f438e15cff51f/tree_sitter-0.25.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bda059af9d621918efb813b22fb06b3fe00c3e94079c6143fcb2c565eb44cb87", size = 137732, upload-time = "2025-09-25T17:37:51.877Z" }, - { url = "https://files.pythonhosted.org/packages/b6/19/1e968aa0b1b567988ed522f836498a6a9529a74aab15f09dd9ac1e41f505/tree_sitter-0.25.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eac4e8e4c7060c75f395feec46421eb61212cb73998dbe004b7384724f3682ab", size = 609456, upload-time = "2025-09-25T17:37:52.925Z" }, - { url = "https://files.pythonhosted.org/packages/48/b6/cf08f4f20f4c9094006ef8828555484e842fc468827ad6e56011ab668dbd/tree_sitter-0.25.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:260586381b23be33b6191a07cea3d44ecbd6c01aa4c6b027a0439145fcbc3358", size = 636772, upload-time = "2025-09-25T17:37:54.647Z" }, - { url = "https://files.pythonhosted.org/packages/57/e2/d42d55bf56360987c32bc7b16adb06744e425670b823fb8a5786a1cea991/tree_sitter-0.25.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7d2ee1acbacebe50ba0f85fff1bc05e65d877958f00880f49f9b2af38dce1af0", size = 631522, upload-time = "2025-09-25T17:37:55.833Z" }, - { url = "https://files.pythonhosted.org/packages/03/87/af9604ebe275a9345d88c3ace0cf2a1341aa3f8ef49dd9fc11662132df8a/tree_sitter-0.25.2-cp314-cp314-win_amd64.whl", hash = "sha256:4973b718fcadfb04e59e746abfbb0288694159c6aeecd2add59320c03368c721", size = 130864, upload-time = "2025-09-25T17:37:57.453Z" }, - { url = "https://files.pythonhosted.org/packages/a6/6e/e64621037357acb83d912276ffd30a859ef117f9c680f2e3cb955f47c680/tree_sitter-0.25.2-cp314-cp314-win_arm64.whl", hash = "sha256:b8d4429954a3beb3e844e2872610d2a4800ba4eb42bb1990c6a4b1949b18459f", size = 117470, upload-time = "2025-09-25T17:37:58.431Z" }, -] - -[[package]] -name = "tree-sitter-c-sharp" -version = "0.23.1" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/22/85/a61c782afbb706a47d990eaee6977e7c2bd013771c5bf5c81c617684f286/tree_sitter_c_sharp-0.23.1.tar.gz", hash = "sha256:322e2cfd3a547a840375276b2aea3335fa6458aeac082f6c60fec3f745c967eb", size = 1317728, upload-time = "2024-11-11T05:25:32.535Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/58/04/f6c2df4c53a588ccd88d50851155945cff8cd887bd70c175e00aaade7edf/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2b612a6e5bd17bb7fa2aab4bb6fc1fba45c94f09cb034ab332e45603b86e32fd", size = 372235, upload-time = "2024-11-11T05:25:19.424Z" }, - { url = "https://files.pythonhosted.org/packages/99/10/1aa9486f1e28fc22810fa92cbdc54e1051e7f5536a5e5b5e9695f609b31e/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a8b98f62bc53efcd4d971151950c9b9cd5cbe3bacdb0cd69fdccac63350d83e", size = 419046, upload-time = "2024-11-11T05:25:20.679Z" }, - { url = "https://files.pythonhosted.org/packages/0f/21/13df29f8fcb9ba9f209b7b413a4764b673dfd58989a0dd67e9c7e19e9c2e/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:986e93d845a438ec3c4416401aa98e6a6f6631d644bbbc2e43fcb915c51d255d", size = 415999, upload-time = "2024-11-11T05:25:22.359Z" }, - { url = "https://files.pythonhosted.org/packages/ca/72/fc6846795bcdae2f8aa94cc8b1d1af33d634e08be63e294ff0d6794b1efc/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8024e466b2f5611c6dc90321f232d8584893c7fb88b75e4a831992f877616d2", size = 402830, upload-time = "2024-11-11T05:25:24.198Z" }, - { url = "https://files.pythonhosted.org/packages/fe/3a/b6028c5890ce6653807d5fa88c72232c027c6ceb480dbeb3b186d60e5971/tree_sitter_c_sharp-0.23.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7f9bf876866835492281d336b9e1f9626ab668737f74e914c31d285261507da7", size = 397880, upload-time = "2024-11-11T05:25:25.937Z" }, - { url = "https://files.pythonhosted.org/packages/47/d2/4facaa34b40f8104d8751746d0e1cd2ddf0beb9f1404b736b97f372bd1f3/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_amd64.whl", hash = "sha256:ae9a9e859e8f44e2b07578d44f9a220d3fa25b688966708af6aa55d42abeebb3", size = 377562, upload-time = "2024-11-11T05:25:27.539Z" }, - { url = "https://files.pythonhosted.org/packages/d8/88/3cf6bd9959d94d1fec1e6a9c530c5f08ff4115a474f62aedb5fedb0f7241/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:c81548347a93347be4f48cb63ec7d60ef4b0efa91313330e69641e49aa5a08c5", size = 375157, upload-time = "2024-11-11T05:25:30.839Z" }, -] - -[[package]] -name = "tree-sitter-embedded-template" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fd/a7/77729fefab8b1b5690cfc54328f2f629d1c076d16daf32c96ba39d3a3a3a/tree_sitter_embedded_template-0.25.0.tar.gz", hash = "sha256:7d72d5e8a1d1d501a7c90e841b51f1449a90cc240be050e4fb85c22dab991d50", size = 14114, upload-time = "2025-08-29T00:42:51.078Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/9d/3e3c8ee0c019d3bace728300a1ca807c03df39e66cc51e9a5e7c9d1e1909/tree_sitter_embedded_template-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fa0d06467199aeb33fb3d6fa0665bf9b7d5a32621ffdaf37fd8249f8a8050649", size = 10266, upload-time = "2025-08-29T00:42:44.148Z" }, - { url = 
"https://files.pythonhosted.org/packages/e8/ab/6d4e43b736b2a895d13baea3791dc8ce7245bedf4677df9e7deb22e23a2a/tree_sitter_embedded_template-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:fc7aacbc2985a5d7e7fe7334f44dffe24c38fb0a8295c4188a04cf21a3d64a73", size = 10650, upload-time = "2025-08-29T00:42:45.147Z" }, - { url = "https://files.pythonhosted.org/packages/9f/97/ea3d1ea4b320fe66e0468b9f6602966e544c9fe641882484f9105e50ee0c/tree_sitter_embedded_template-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a7c88c3dd8b94b3c9efe8ae071ff6b1b936a27ac5f6e651845c3b9631fa4c1c2", size = 18268, upload-time = "2025-08-29T00:42:46.03Z" }, - { url = "https://files.pythonhosted.org/packages/64/40/0f42ca894a8f7c298cf336080046ccc14c10e8f4ea46d455f640193181b2/tree_sitter_embedded_template-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:025f7ca84218dcd8455efc901bdbcc2689fb694f3a636c0448e322a23d4bc96b", size = 19068, upload-time = "2025-08-29T00:42:46.699Z" }, - { url = "https://files.pythonhosted.org/packages/d0/2a/0b720bcae7c2dd0a44889c09e800a2f8eb08c496dede9f2b97683506c4c3/tree_sitter_embedded_template-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b5dc1aef6ffa3fae621fe037d85dd98948b597afba20df29d779c426be813ee5", size = 18518, upload-time = "2025-08-29T00:42:47.694Z" }, - { url = "https://files.pythonhosted.org/packages/14/8a/d745071afa5e8bdf5b381cf84c4dc6be6c79dee6af8e0ff07476c3d8e4aa/tree_sitter_embedded_template-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d0a35cfe634c44981a516243bc039874580e02a2990669313730187ce83a5bc6", size = 18267, upload-time = "2025-08-29T00:42:48.635Z" }, - { url = "https://files.pythonhosted.org/packages/5d/74/728355e594fca140f793f234fdfec195366b6956b35754d00ea97ca18b21/tree_sitter_embedded_template-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:3e05a4ac013d54505e75ae48e1a0e9db9aab19949fe15d9f4c7345b11a84a069", size = 13049, upload-time = "2025-08-29T00:42:49.589Z" }, - { url = "https://files.pythonhosted.org/packages/d8/de/afac475e694d0e626b0808f3c86339c349cd15c5163a6a16a53cc11cf892/tree_sitter_embedded_template-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:2751d402179ac0e83f2065b249d8fe6df0718153f1636bcb6a02bde3e5730db9", size = 11978, upload-time = "2025-08-29T00:42:50.226Z" }, -] - -[[package]] -name = "tree-sitter-language-pack" -version = "0.13.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "tree-sitter" }, - { name = "tree-sitter-c-sharp" }, - { name = "tree-sitter-embedded-template" }, - { name = "tree-sitter-yaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c1/83/d1bc738d6f253f415ee54a8afb99640f47028871436f53f2af637c392c4f/tree_sitter_language_pack-0.13.0.tar.gz", hash = "sha256:032034c5e27b1f6e00730b9e7c2dbc8203b4700d0c681fd019d6defcf61183ec", size = 51353370, upload-time = "2025-11-26T14:01:04.586Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e9/38/aec1f450ae5c4796de8345442f297fcf8912c7d2e00a66d3236ff0f825ed/tree_sitter_language_pack-0.13.0-cp310-abi3-macosx_10_15_universal2.whl", hash = "sha256:0e7eae812b40a2dc8a12eb2f5c55e130eb892706a0bee06215dd76affeb00d07", size = 32991857, upload-time = "2025-11-26T14:00:51.459Z" }, - { url = "https://files.pythonhosted.org/packages/90/09/11f51c59ede786dccddd2d348d5d24a1d99c54117d00f88b477f5fae4bd5/tree_sitter_language_pack-0.13.0-cp310-abi3-manylinux2014_aarch64.whl", hash = 
"sha256:7fdacf383418a845b20772118fcb53ad245f9c5d409bd07dae16acec65151756", size = 20092989, upload-time = "2025-11-26T14:00:54.202Z" }, - { url = "https://files.pythonhosted.org/packages/72/9d/644db031047ab1a70fc5cb6a79a4d4067080fac628375b2320752d2d7b58/tree_sitter_language_pack-0.13.0-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:0d4f261fce387ae040dae7e4d1c1aca63d84c88320afcc0961c123bec0be8377", size = 19952029, upload-time = "2025-11-26T14:00:56.699Z" }, - { url = "https://files.pythonhosted.org/packages/48/92/5fd749bbb3f5e4538492c77de7bc51a5e479fec6209464ddc25be9153b13/tree_sitter_language_pack-0.13.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:78f369dc4d456c5b08d659939e662c2f9b9fba8c0ec5538a1f973e01edfcf04d", size = 19944614, upload-time = "2025-11-26T14:00:59.381Z" }, - { url = "https://files.pythonhosted.org/packages/97/59/2287f07723c063475d6657babed0d5569f4b499e393ab51354d529c3e7b5/tree_sitter_language_pack-0.13.0-cp310-abi3-win_amd64.whl", hash = "sha256:1cdbc88a03dacd47bec69e56cc20c48eace1fbb6f01371e89c3ee6a2e8f34db1", size = 16896852, upload-time = "2025-11-26T14:01:01.788Z" }, -] - -[[package]] -name = "tree-sitter-yaml" -version = "0.7.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/b6/941d356ac70c90b9d2927375259e3a4204f38f7499ec6e7e8a95b9664689/tree_sitter_yaml-0.7.2.tar.gz", hash = "sha256:756db4c09c9d9e97c81699e8f941cb8ce4e51104927f6090eefe638ee567d32c", size = 84882, upload-time = "2025-10-07T14:40:36.071Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/29/c0b8dbff302c49ff4284666ffb6f2f21145006843bb4c3a9a85d0ec0b7ae/tree_sitter_yaml-0.7.2-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:7e269ddcfcab8edb14fbb1f1d34eed1e1e26888f78f94eedfe7cc98c60f8bc9f", size = 43898, upload-time = "2025-10-07T14:40:29.486Z" }, - { url = "https://files.pythonhosted.org/packages/18/0d/15a5add06b3932b5e4ce5f5e8e179197097decfe82a0ef000952c8b98216/tree_sitter_yaml-0.7.2-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:0807b7966e23ddf7dddc4545216e28b5a58cdadedcecca86b8d8c74271a07870", size = 44691, upload-time = "2025-10-07T14:40:30.369Z" }, - { url = "https://files.pythonhosted.org/packages/72/92/c4b896c90d08deb8308fadbad2210fdcc4c66c44ab4292eac4e80acb4b61/tree_sitter_yaml-0.7.2-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f1a5c60c98b6c4c037aae023569f020d0c489fad8dc26fdfd5510363c9c29a41", size = 91430, upload-time = "2025-10-07T14:40:31.16Z" }, - { url = "https://files.pythonhosted.org/packages/89/59/61f1fed31eb6d46ff080b8c0d53658cf29e10263f41ef5fe34768908037a/tree_sitter_yaml-0.7.2-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:88636d19d0654fd24f4f242eaaafa90f6f5ebdba8a62e4b32d251ed156c51a2a", size = 92428, upload-time = "2025-10-07T14:40:31.954Z" }, - { url = "https://files.pythonhosted.org/packages/e3/62/a33a04d19b7f9a0ded780b9c9fcc6279e37c5d00b89b00425bb807a22cc2/tree_sitter_yaml-0.7.2-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1d2e8f0bb14aa4537320952d0f9607eef3021d5aada8383c34ebeece17db1e06", size = 90580, upload-time = "2025-10-07T14:40:33.037Z" }, - { url = "https://files.pythonhosted.org/packages/6c/e7/9525defa7b30792623f56b1fba9bbba361752348875b165b8975b87398fd/tree_sitter_yaml-0.7.2-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:74ca712c50fc9d7dbc68cb36b4a7811d6e67a5466b5a789f19bf8dd6084ef752", size = 90455, upload-time = "2025-10-07T14:40:33.778Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/d6/8d1e1ace03db3b02e64e91daf21d1347941d1bbecc606a5473a1a605250d/tree_sitter_yaml-0.7.2-cp310-abi3-win_amd64.whl", hash = "sha256:7587b5ca00fc4f9a548eff649697a3b395370b2304b399ceefa2087d8a6c9186", size = 45514, upload-time = "2025-10-07T14:40:34.562Z" }, - { url = "https://files.pythonhosted.org/packages/d8/c7/dcf3ea1c4f5da9b10353b9af4455d756c92d728a8f58f03c480d3ef0ead5/tree_sitter_yaml-0.7.2-cp310-abi3-win_arm64.whl", hash = "sha256:f63c227b18e7ce7587bce124578f0bbf1f890ac63d3e3cd027417574273642c4", size = 44065, upload-time = "2025-10-07T14:40:35.337Z" }, -] - -[[package]] -name = "typer-slim" -version = "0.20.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8e/45/81b94a52caed434b94da65729c03ad0fb7665fab0f7db9ee54c94e541403/typer_slim-0.20.0.tar.gz", hash = "sha256:9fc6607b3c6c20f5c33ea9590cbeb17848667c51feee27d9e314a579ab07d1a3", size = 106561, upload-time = "2025-10-20T17:03:46.642Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/dd/5cbf31f402f1cc0ab087c94d4669cfa55bd1e818688b910631e131d74e75/typer_slim-0.20.0-py3-none-any.whl", hash = "sha256:f42a9b7571a12b97dddf364745d29f12221865acef7a2680065f9bb29c7dc89d", size = 47087, upload-time = "2025-10-20T17:03:44.546Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.15.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, -]