diff --git a/bun.lock b/bun.lock index a768e33..1f47721 100644 --- a/bun.lock +++ b/bun.lock @@ -11,7 +11,7 @@ }, "packages/code-chunk": { "name": "code-chunk", - "version": "0.1.0", + "version": "0.1.12", "dependencies": { "effect": "^3.19.12", "tree-sitter-go": "^0.25.0", @@ -38,21 +38,31 @@ "name": "@supermemory/eval", "version": "0.1.0", "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.1.75", + "@anthropic-ai/sdk": "^0.71.2", "code-chunk": "workspace:*", - "openai": "^4.0.0", + "dotenv": "^16.4.0", + "zod": "^3.24.0", }, "devDependencies": { "@types/bun": "^1.3.4", + "typescript": "^5.0.0", }, }, }, "packages": { + "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.1.76", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.33.5", "@img/sharp-darwin-x64": "^0.33.5", "@img/sharp-linux-arm": "^0.33.5", "@img/sharp-linux-arm64": "^0.33.5", "@img/sharp-linux-x64": "^0.33.5", "@img/sharp-linuxmusl-arm64": "^0.33.5", "@img/sharp-linuxmusl-x64": "^0.33.5", "@img/sharp-win32-x64": "^0.33.5" }, "peerDependencies": { "zod": "^3.24.1 || ^4.0.0" } }, "sha512-s7RvpXoFaLXLG7A1cJBAPD8ilwOhhc/12fb5mJXRuD561o4FmPtQ+WRfuy9akMmrFRfLsKv8Ornw3ClGAPL2fw=="], + + "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.71.2", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-TGNDEUuEstk/DKu0/TflXAEt+p+p/WhTlFzEnoosvbaDU2LTjm42igSdlL0VijrKpWejtOKxX0b8A7uc+XiSAQ=="], + "@babel/helper-string-parser": ["@babel/helper-string-parser@7.27.1", "", {}, "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA=="], "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.28.5", "", {}, "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q=="], "@babel/parser": ["@babel/parser@7.28.5", "", { "dependencies": { "@babel/types": "^7.28.5" }, "bin": "./bin/babel-parser.js" }, "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ=="], + "@babel/runtime": ["@babel/runtime@7.28.4", "", {}, "sha512-Q/N6JNWvIvPnLDvjlE1OUBLPQHH6l3CltCEsHIujp45zQUSSh8K+gHnaEX45yAT1nyngnINhvWtzN+Nb9D8RAQ=="], + "@babel/types": ["@babel/types@7.28.5", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.28.5" } }, "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA=="], "@biomejs/biome": ["@biomejs/biome@2.3.10", "", { "optionalDependencies": { "@biomejs/cli-darwin-arm64": "2.3.10", "@biomejs/cli-darwin-x64": "2.3.10", "@biomejs/cli-linux-arm64": "2.3.10", "@biomejs/cli-linux-arm64-musl": "2.3.10", "@biomejs/cli-linux-x64": "2.3.10", "@biomejs/cli-linux-x64-musl": "2.3.10", "@biomejs/cli-win32-arm64": "2.3.10", "@biomejs/cli-win32-x64": "2.3.10" }, "bin": { "biome": "bin/biome" } }, "sha512-/uWSUd1MHX2fjqNLHNL6zLYWBbrJeG412/8H7ESuK8ewoRoMPUgHDebqKrPTx/5n6f17Xzqc9hdg3MEqA5hXnQ=="], @@ -83,6 +93,36 @@ "@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.1.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-WI0DdZ8xFSbgMjR1sFsKABJ/C5OnRrjT06JXbZKexJGrDuPTzZdDYfFlsgcCXCyf+suG5QU2e/y1Wo2V/OapLQ=="], + "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.0.4" }, "os": "darwin", "cpu": "arm64" }, 
"sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ=="], + + "@img/sharp-darwin-x64": ["@img/sharp-darwin-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-x64": "1.0.4" }, "os": "darwin", "cpu": "x64" }, "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q=="], + + "@img/sharp-libvips-darwin-arm64": ["@img/sharp-libvips-darwin-arm64@1.0.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg=="], + + "@img/sharp-libvips-darwin-x64": ["@img/sharp-libvips-darwin-x64@1.0.4", "", { "os": "darwin", "cpu": "x64" }, "sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ=="], + + "@img/sharp-libvips-linux-arm": ["@img/sharp-libvips-linux-arm@1.0.5", "", { "os": "linux", "cpu": "arm" }, "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g=="], + + "@img/sharp-libvips-linux-arm64": ["@img/sharp-libvips-linux-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA=="], + + "@img/sharp-libvips-linux-x64": ["@img/sharp-libvips-linux-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw=="], + + "@img/sharp-libvips-linuxmusl-arm64": ["@img/sharp-libvips-linuxmusl-arm64@1.0.4", "", { "os": "linux", "cpu": "arm64" }, "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA=="], + + "@img/sharp-libvips-linuxmusl-x64": ["@img/sharp-libvips-linuxmusl-x64@1.0.4", "", { "os": "linux", "cpu": "x64" }, "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw=="], + + "@img/sharp-linux-arm": ["@img/sharp-linux-arm@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm": "1.0.5" }, "os": "linux", "cpu": "arm" }, "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ=="], + + "@img/sharp-linux-arm64": ["@img/sharp-linux-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA=="], + + "@img/sharp-linux-x64": ["@img/sharp-linux-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linux-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA=="], + + "@img/sharp-linuxmusl-arm64": ["@img/sharp-linuxmusl-arm64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" }, "os": "linux", "cpu": "arm64" }, "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g=="], + + "@img/sharp-linuxmusl-x64": ["@img/sharp-linuxmusl-x64@0.33.5", "", { "optionalDependencies": { "@img/sharp-libvips-linuxmusl-x64": "1.0.4" }, "os": "linux", "cpu": "x64" }, "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw=="], + + "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.33.5", "", { "os": "win32", "cpu": "x64" }, "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg=="], + "@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.0", "", { "dependencies": { 
"@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1", "@tybys/wasm-util": "^0.10.1" } }, "sha512-Fq6DJW+Bb5jaWE69/qOE0D1TUN9+6uWhCeZpdnSBk14pjLcCWR7Q8n49PTSPHazM37JqrsdpEthXy2xn6jWWiA=="], "@oxc-minify/binding-android-arm64": ["@oxc-minify/binding-android-arm64@0.93.0", "", { "os": "android", "cpu": "arm64" }, "sha512-N3j/JoK4hXwQbnyOJoEltM8MEkddWV3XtfYimO6jsMjr5R6QdauKaSVeQHO/lSezB7SFkrMPqr6X7tBfghHiXA=="], @@ -195,18 +235,10 @@ "@types/node": ["@types/node@18.19.130", "", { "dependencies": { "undici-types": "~5.26.4" } }, "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg=="], - "@types/node-fetch": ["@types/node-fetch@2.6.13", "", { "dependencies": { "@types/node": "*", "form-data": "^4.0.4" } }, "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw=="], - - "abort-controller": ["abort-controller@3.0.0", "", { "dependencies": { "event-target-shim": "^5.0.0" } }, "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg=="], - - "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="], - "ansis": ["ansis@4.2.0", "", {}, "sha512-HqZ5rWlFjGiV0tDm3UxxgNRqsOTniqoKZu0pIAfh7TZQMGuZK+hH0drySty0si0QXj1ieop4+SkSfPZBPPkHig=="], "args-tokenizer": ["args-tokenizer@0.3.0", "", {}, "sha512-xXAd7G2Mll5W8uo37GETpQ2VrE84M181Z7ugHFGQnJZ50M2mbOv0osSZ9VsSgPfJQ+LVG0prSi0th+ELMsno7Q=="], - "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], - "bumpp": ["bumpp@10.3.2", "", { "dependencies": { "ansis": "^4.2.0", "args-tokenizer": "^0.3.0", "c12": "^3.3.2", "cac": "^6.7.14", "escalade": "^3.2.0", "jsonc-parser": "^3.3.1", "package-manager-detector": "^1.5.0", "semver": "^7.7.3", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "yaml": "^2.8.1" }, "bin": { "bumpp": "bin/bumpp.mjs" } }, "sha512-yUUkVx5zpTywLNX97MlrqtpanI7eMMwFwLntWR2EBVDw3/Pm3aRIzCoDEGHATLIiHK9PuJC7xWI4XNWqXItSPg=="], "bun-types": ["bun-types@1.3.5", "", { "dependencies": { "@types/node": "*" } }, "sha512-inmAYe2PFLs0SUbFOWSVD24sg1jFlMPxOjOSSCYqUgn4Hsc3rDc7dFvfVYjFPNHtov6kgUeulV4SxbuIV/stPw=="], @@ -217,8 +249,6 @@ "cac": ["cac@6.7.14", "", {}, "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ=="], - "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], - "chokidar": ["chokidar@5.0.0", "", { "dependencies": { "readdirp": "^5.0.0" } }, "sha512-TQMmc3w+5AxjpL8iIiwebF73dRDF4fBIieAqGn9RGCWaEVwQ6Fb2cGe31Yns0RRIzii5goJ1Y7xbMwo1TxMplw=="], "citty": ["citty@0.1.6", "", { "dependencies": { "consola": "^3.2.3" } }, "sha512-tskPPKEs8D2KPafUypv2gxwJP8h/OaJmC82QQGGDQcHvXX43xF2VDACcJVmZ0EuSxkpO9Kc4MlrA3q0+FG58AQ=="], @@ -227,70 +257,34 @@ "coffi": ["coffi@0.1.37", "", { "dependencies": { "strip-json-comments": "^5.0.3" } }, "sha512-ewO5Xis7sw7g54yI/3lJ/nNV90Er4ZnENeDORZjrs58T70MmwKFLZgevraNCz+RmB4KDKsYT1ui1wDB36iPWqQ=="], - "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], - "confbox": ["confbox@0.2.2", "", {}, 
"sha512-1NB+BKqhtNipMsov4xI/NnhCKp9XG9NamYp5PVm9klAT0fsrNPjaFICsCFhNhwZJKNh7zB/3q8qXz0E9oaMNtQ=="], "consola": ["consola@3.4.2", "", {}, "sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA=="], "defu": ["defu@6.1.4", "", {}, "sha512-mEQCMmwJu317oSz8CwdIOdwf3xMif1ttiM8LTufzc3g6kR+9Pe236twL8j3IYT1F7GfRgGcW6MWxzZjLIkuHIg=="], - "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], - "destr": ["destr@2.0.5", "", {}, "sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA=="], "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], - "dotenv": ["dotenv@17.2.3", "", {}, "sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w=="], - - "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="], "effect": ["effect@3.19.13", "", { "dependencies": { "@standard-schema/spec": "^1.0.0", "fast-check": "^3.23.1" } }, "sha512-8MZ783YuHRwHZX2Mmm+bpGxq+7XPd88sWwYAz2Ysry80sEKpftDZXs2Hg9ZyjESi1IBTNHF0oDKe0zJRkUlyew=="], - "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], - - "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], - - "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], - - "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], - "escalade": ["escalade@3.2.0", "", {}, "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA=="], - "event-target-shim": ["event-target-shim@5.0.1", "", {}, "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="], - "exsolve": ["exsolve@1.0.8", "", {}, "sha512-LmDxfWXwcTArk8fUEnOfSZpHOJ6zOMUJKOtFLFqJLoKJetuQG874Uc7/Kki7zFLzYybmZhp1M7+98pfMqeX8yA=="], "fast-check": ["fast-check@3.23.2", "", { "dependencies": { "pure-rand": "^6.1.0" } }, "sha512-h5+1OzzfCC3Ef7VbtKdcv7zsstUQwUDlYpUTvjeUsJAssPgLn7QzbboPtL5ro04Mq0rPOsMzl7q5hIbRs2wD1A=="], "fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], - "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], - - "form-data-encoder": ["form-data-encoder@1.7.2", "", {}, 
"sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A=="], - - "formdata-node": ["formdata-node@4.4.1", "", { "dependencies": { "node-domexception": "1.0.0", "web-streams-polyfill": "4.0.0-beta.3" } }, "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ=="], - - "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], - - "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], - - "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], - "giget": ["giget@2.0.0", "", { "dependencies": { "citty": "^0.1.6", "consola": "^3.4.0", "defu": "^6.1.4", "node-fetch-native": "^1.6.6", "nypm": "^0.6.0", "pathe": "^2.0.3" }, "bin": { "giget": "dist/cli.mjs" } }, "sha512-L5bGsVkxJbJgdnwyuheIunkGatUF/zssUoxxjACCseZYAVbaqdh9Tsmmlkl8vYan09H7sbvKt4pS8GqKLBrEzA=="], - "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], - - "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], - - "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], - - "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], - - "humanize-ms": ["humanize-ms@1.2.1", "", { "dependencies": { "ms": "^2.0.0" } }, "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ=="], - "jiti": ["jiti@2.6.1", "", { "bin": { "jiti": "lib/jiti-cli.mjs" } }, "sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ=="], + "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="], + "jsonc-parser": ["jsonc-parser@3.3.1", "", {}, "sha512-HUgH65KyejrUFPvHFPbqOY0rsFip3Bo5wb4ngvdi1EpCYWUQDC5V+Y7mZws+DLkr4M//zQJoanu1SP+87Dv1oQ=="], "lightningcss": ["lightningcss@1.30.2", "", { "dependencies": { "detect-libc": "^2.0.3" }, "optionalDependencies": { "lightningcss-android-arm64": "1.30.2", "lightningcss-darwin-arm64": "1.30.2", "lightningcss-darwin-x64": "1.30.2", "lightningcss-freebsd-x64": "1.30.2", "lightningcss-linux-arm-gnueabihf": "1.30.2", "lightningcss-linux-arm64-gnu": "1.30.2", "lightningcss-linux-arm64-musl": "1.30.2", "lightningcss-linux-x64-gnu": "1.30.2", "lightningcss-linux-x64-musl": "1.30.2", "lightningcss-win32-arm64-msvc": "1.30.2", "lightningcss-win32-x64-msvc": "1.30.2" } }, "sha512-utfs7Pr5uJyyvDETitgsaqSyjCb2qNRAtuqUeWIAKztsOYdcACf2KtARYXg2pSvhkt+9NfoaNY7fxjl6nuMjIQ=="], @@ 
-317,20 +311,8 @@ "lightningcss-win32-x64-msvc": ["lightningcss-win32-x64-msvc@1.30.2", "", { "os": "win32", "cpu": "x64" }, "sha512-5g1yc73p+iAkid5phb4oVFMB45417DkRevRbt/El/gKXJk4jid+vPFF/AXbxn05Aky8PapwzZrdJShv5C0avjw=="], - "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], - - "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], - - "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], - - "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], - "node-addon-api": ["node-addon-api@8.5.0", "", {}, "sha512-/bRZty2mXUIFY/xU5HLvveNHlswNJej+RnxBjOMkidWfwZzgTbPG1E3K5TOxRLOR+5hX7bSofy8yf1hZevMS8A=="], - "node-domexception": ["node-domexception@1.0.0", "", {}, "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ=="], - - "node-fetch": ["node-fetch@2.7.0", "", { "dependencies": { "whatwg-url": "^5.0.0" }, "peerDependencies": { "encoding": "^0.1.0" }, "optionalPeers": ["encoding"] }, "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A=="], - "node-fetch-native": ["node-fetch-native@1.6.7", "", {}, "sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q=="], "node-gyp-build": ["node-gyp-build@4.8.4", "", { "bin": { "node-gyp-build": "bin.js", "node-gyp-build-optional": "optional.js", "node-gyp-build-test": "build-test.js" } }, "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ=="], @@ -339,8 +321,6 @@ "ohash": ["ohash@2.0.11", "", {}, "sha512-RdR9FQrFwNBNXAr4GixM8YaRZRJ5PUWbKYbE5eOsrwAjJW0q2REGcf79oYPsLyskQCZG1PLN+S/K1V00joZAoQ=="], - "openai": ["openai@4.104.0", "", { "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", "abort-controller": "^3.0.0", "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", "node-fetch": "^2.6.7" }, "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA=="], - "oxc-minify": ["oxc-minify@0.93.0", "", { "optionalDependencies": { "@oxc-minify/binding-android-arm64": "0.93.0", "@oxc-minify/binding-darwin-arm64": "0.93.0", "@oxc-minify/binding-darwin-x64": "0.93.0", "@oxc-minify/binding-freebsd-x64": "0.93.0", "@oxc-minify/binding-linux-arm-gnueabihf": "0.93.0", "@oxc-minify/binding-linux-arm-musleabihf": "0.93.0", "@oxc-minify/binding-linux-arm64-gnu": "0.93.0", "@oxc-minify/binding-linux-arm64-musl": "0.93.0", "@oxc-minify/binding-linux-riscv64-gnu": "0.93.0", "@oxc-minify/binding-linux-s390x-gnu": "0.93.0", "@oxc-minify/binding-linux-x64-gnu": "0.93.0", "@oxc-minify/binding-linux-x64-musl": "0.93.0", "@oxc-minify/binding-wasm32-wasi": "0.93.0", "@oxc-minify/binding-win32-arm64-msvc": "0.93.0", "@oxc-minify/binding-win32-x64-msvc": "0.93.0" } }, "sha512-pwMjOGN/I+cfLVkSmECcVHROKwECNVAXCT5h/29S4f0aArIUh3CQnix1yYy7MTQ3yThNuGANjjE9jWJyT43Vbw=="], "oxc-resolver": ["oxc-resolver@11.16.0", "", { "optionalDependencies": { "@oxc-resolver/binding-android-arm-eabi": "11.16.0", "@oxc-resolver/binding-android-arm64": 
"11.16.0", "@oxc-resolver/binding-darwin-arm64": "11.16.0", "@oxc-resolver/binding-darwin-x64": "11.16.0", "@oxc-resolver/binding-freebsd-x64": "11.16.0", "@oxc-resolver/binding-linux-arm-gnueabihf": "11.16.0", "@oxc-resolver/binding-linux-arm-musleabihf": "11.16.0", "@oxc-resolver/binding-linux-arm64-gnu": "11.16.0", "@oxc-resolver/binding-linux-arm64-musl": "11.16.0", "@oxc-resolver/binding-linux-ppc64-gnu": "11.16.0", "@oxc-resolver/binding-linux-riscv64-gnu": "11.16.0", "@oxc-resolver/binding-linux-riscv64-musl": "11.16.0", "@oxc-resolver/binding-linux-s390x-gnu": "11.16.0", "@oxc-resolver/binding-linux-x64-gnu": "11.16.0", "@oxc-resolver/binding-linux-x64-musl": "11.16.0", "@oxc-resolver/binding-openharmony-arm64": "11.16.0", "@oxc-resolver/binding-wasm32-wasi": "11.16.0", "@oxc-resolver/binding-win32-arm64-msvc": "11.16.0", "@oxc-resolver/binding-win32-ia32-msvc": "11.16.0", "@oxc-resolver/binding-win32-x64-msvc": "11.16.0" } }, "sha512-I4sHGa1fZUpTQ9ftS0E0cBYbBjNnIKXRSX/trFMIJDIJ4n21dCrLAZhnJS0TSfRIRqZNFyceNZr2kablfgNyTA=="], @@ -375,8 +355,6 @@ "tinyglobby": ["tinyglobby@0.2.15", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.3" } }, "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ=="], - "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="], - "tree-kill": ["tree-kill@1.2.2", "", { "bin": { "tree-kill": "cli.js" } }, "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A=="], "tree-sitter-go": ["tree-sitter-go@0.25.0", "", { "dependencies": { "node-addon-api": "^8.3.1", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.25.0" }, "optionalPeers": ["tree-sitter"] }, "sha512-APBc/Dq3xz/e35Xpkhb1blu5UgW+2E3RyGWawZSCNcbGwa7jhSQPS8KsUupuzBla8PCo8+lz9W/JDJjmfRa2tw=="], @@ -391,6 +369,8 @@ "tree-sitter-typescript": ["tree-sitter-typescript@0.23.2", "", { "dependencies": { "node-addon-api": "^8.2.2", "node-gyp-build": "^4.8.2", "tree-sitter-javascript": "^0.23.1" }, "peerDependencies": { "tree-sitter": "^0.21.0" }, "optionalPeers": ["tree-sitter"] }, "sha512-e04JUUKxTT53/x3Uq1zIL45DoYKVfHH4CZqwgZhPg5qYROl5nQjV+85ruFzFGZxu+QeFVbRTPDRnqL9UbU4VeA=="], + "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="], + "ts-import-resolver": ["ts-import-resolver@0.1.23", "", { "peerDependencies": { "typescript": ">=4.5.0" }, "optionalPeers": ["typescript"] }, "sha512-282pgr6j6aOvP3P2I6XugDxdBobkpdMmdbWjRjGl5gjPI1p0+oTNGDh1t924t75kRlyIkF65DiwhSIUysmyHQA=="], "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], @@ -399,18 +379,16 @@ "undici-types": ["undici-types@5.26.5", "", {}, "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA=="], - "web-streams-polyfill": ["web-streams-polyfill@4.0.0-beta.3", "", {}, "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug=="], - "web-tree-sitter": ["web-tree-sitter@0.26.3", "", {}, "sha512-JIVgIKFS1w6lejxSntCtsS/QsE/ecTS00en809cMxMPxaor6MvUnQ+ovG8uTTTvQCFosSh4MeDdI5bSGw5SoBw=="], - "webidl-conversions": ["webidl-conversions@3.0.1", "", {}, "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ=="], - - "whatwg-url": ["whatwg-url@5.0.0", "", { "dependencies": { "tr46": "~0.0.3", 
"webidl-conversions": "^3.0.0" } }, "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw=="], - "yaml": ["yaml@2.8.2", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-mplynKqc1C2hTVYxd0PU2xQAc22TI1vShAYGksCCfxbn/dFwnHTNi1bvYsBTkhdUNtGIf5xNOg938rrSSYvS9A=="], "zlye": ["zlye@0.4.4", "", { "dependencies": { "picocolors": "^1.1.1" }, "peerDependencies": { "typescript": ">=4.5.0" }, "optionalPeers": ["typescript"] }, "sha512-fwpeC841X3ElOLYRMKXbwX29pitNrsm6nRNvEhDMrRXDl3BhR2i03Bkr0GNrpyYgZJuEzUsBylXAYzgGPXXOCQ=="], + "zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], + + "c12/dotenv": ["dotenv@17.2.3", "", {}, "sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w=="], + "tree-sitter-typescript/tree-sitter-javascript": ["tree-sitter-javascript@0.23.1", "", { "dependencies": { "node-addon-api": "^8.2.2", "node-gyp-build": "^4.8.2" }, "peerDependencies": { "tree-sitter": "^0.21.1" }, "optionalPeers": ["tree-sitter"] }, "sha512-/bnhbrTD9frUYHQTiYnPcxyHORIw157ERBa6dqzaKxvR/x3PC4Yzd+D1pZIMS6zNg2v3a8BZ0oK7jHqsQo9fWA=="], } } diff --git a/packages/eval/.gitignore b/packages/eval/.gitignore index 0f8888b..fab698e 100644 --- a/packages/eval/.gitignore +++ b/packages/eval/.gitignore @@ -1,3 +1,2 @@ -cache -results +runs data diff --git a/packages/eval/.python-version b/packages/eval/.python-version deleted file mode 100644 index 6324d40..0000000 --- a/packages/eval/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.14 diff --git a/packages/eval/README.md b/packages/eval/README.md index e69de29..3c28216 100644 --- a/packages/eval/README.md +++ b/packages/eval/README.md @@ -0,0 +1,116 @@ +# @supermemory/eval + +SWE-bench Lite retrieval-only evaluation harness comparing two Claude Agent SDK variants: + +- **Agent1 (ops-only)**: Read/Grep/Glob tools only +- **Agent2 (ops+search)**: Read/Grep/Glob + semantic search via `code-chunk` embeddings + +## Setup + +```bash +# From monorepo root +bun install +``` + +Required environment variables: + +```bash +ANTHROPIC_API_KEY=... # Claude API access +GOOGLE_API_KEY=... # Gemini embeddings (default) +# or +OPENAI_API_KEY=... 
# If using `--embedding-provider openai`
```

## Usage

```bash
cd packages/eval

# Full evaluation on test split
bun run src/run.ts

# Dev split, limited instances
bun run src/run.ts --split dev --max-instances 10

# Only Agent1 (ops-only)
bun run src/run.ts --skip-agent2

# Specific instance
bun run src/run.ts --instance django__django-12345

# Custom embedding dimensions (768/1536/3072)
bun run src/run.ts --embedding-dimensions 768
```

## Options

| Flag | Description | Default |
|------|-------------|---------|
| `--split <split>` | Dataset split | `test` |
| `--max-instances <n>` | Limit instances | all |
| `--max-turns <n>` | Max agent turns | 20 |
| `--max-tool-calls <n>` | Max tool calls | 50 |
| `--model <model>` | Claude model | `claude-sonnet-4-5` |
| `--skip-agent1` | Skip ops-only agent | false |
| `--skip-agent2` | Skip ops+search agent | false |
| `--instance <id>` | Run specific instance(s) | - |
| `--run-dir <dir>` | Output directory | `./runs` |
| `--embedding-provider` | `gemini` or `openai` | `gemini` |
| `--embedding-dimensions` | Gemini output dims | 1536 |

## Output

Runs output to `runs/<timestamp>/`:

```
runs/
└── 2025-01-01T12-00-00-000Z/
    ├── events/
    │   ├── django__django-12345_ops-only.jsonl
    │   └── django__django-12345_ops+search.jsonl
    ├── metrics.jsonl
    └── summary.json
```

## Metrics

- **Hit@k**: Whether oracle file appears in top-k predictions
- **MRR**: Mean Reciprocal Rank of first oracle file
- **Coverage@k**: Fraction of oracle files in top-k
- **Time-to-first-hit**: Turns/tool calls until first oracle file accessed
- **Embedding latency**: Index build + query times (Agent2 only)

## Architecture

```
src/
├── run.ts                # CLI entrypoint
└── swebench/
    ├── types.ts          # SWEbenchInstance, metrics types
    ├── dataset.ts        # HuggingFace dataset loader with caching
    ├── git.ts            # Bare clone + worktree management
    ├── score.ts          # Per-instance metric computation
    ├── aggregate.ts      # Cross-instance aggregation
    ├── run.ts            # Main evaluation loop
    ├── agent/
    │   ├── prompts.ts    # Retrieval-only system/user prompts
    │   ├── variants.ts   # Agent1/Agent2 tool configurations
    │   └── semantic_search_adapter.ts  # Gemini embeddings + MCP server
    └── observe/
        └── instrumentation.ts          # SDK hooks, event writer
```

## How it works

1. Loads SWE-bench Lite dataset (300 instances)
2. For each instance:
   - Creates git worktree at target commit
   - Runs Agent1 (ops-only) with Read/Grep/Glob
   - Builds semantic index using `code-chunk`
   - Runs Agent2 (ops+search) with additional semantic_search tool
   - Computes retrieval metrics against oracle files from patch
3. Aggregates metrics, prints summary, writes results

## Embedding cache

Semantic search indexes are cached at `~/.cache/swebench-eval/embeddings/` to avoid re-embedding repos. Cache key includes instance ID + embedding provider + dimensions.
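To make the scoring concrete, here is a minimal sketch of how the file-level retrieval metrics above can be computed. It is illustrative only and is not the harness's `score.ts`; the function names (`hitAtK`, `reciprocalRank`, `coverageAtK`, `mean`) are hypothetical, and it assumes `ranked` is the ordered list of files an agent surfaced and `oracle` is the set of files touched by the gold patch.

```typescript
// Illustrative sketch of the file-level retrieval metrics (not the harness's score.ts).
// Assumes `ranked` is an ordered list of repo-relative file paths and `oracle`
// is the set of files modified by the gold patch.

export function hitAtK(ranked: string[], oracle: Set<string>, k: number): number {
  // 1 if any oracle file appears in the top-k predictions, else 0
  return ranked.slice(0, k).some((file) => oracle.has(file)) ? 1 : 0
}

export function reciprocalRank(ranked: string[], oracle: Set<string>): number {
  // 1 / rank of the first oracle file, or 0 if none was retrieved
  const idx = ranked.findIndex((file) => oracle.has(file))
  return idx === -1 ? 0 : 1 / (idx + 1)
}

export function coverageAtK(ranked: string[], oracle: Set<string>, k: number): number {
  // Fraction of oracle files that appear in the top-k predictions
  if (oracle.size === 0) return 0
  const top = new Set(ranked.slice(0, k))
  let hits = 0
  for (const file of oracle) {
    if (top.has(file)) hits++
  }
  return hits / oracle.size
}

export function mean(values: number[]): number {
  // MRR and the @k metrics are averaged across instances with a plain mean
  return values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length
}
```

Time-to-first-hit and embedding latency, by contrast, are not derivable from a ranked list alone; they come from the per-tool-call event stream written under `events/*.jsonl`.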
diff --git a/packages/eval/package.json b/packages/eval/package.json index c54865c..f1c8394 100644 --- a/packages/eval/package.json +++ b/packages/eval/package.json @@ -2,17 +2,24 @@ "name": "@supermemory/eval", "version": "0.1.0", "private": true, - "description": "Evaluation harness for code-chunk", + "description": "SWE-bench Lite retrieval-only evaluation harness for code-chunk", "type": "module", "scripts": { "start": "bun run src/run.ts", + "eval": "bun run src/run.ts", + "eval:dev": "bun run src/run.ts --split dev", + "eval:quick": "bun run src/run.ts --max-instances 5", "type-check": "tsc --noEmit" }, "dependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.1.75", + "@anthropic-ai/sdk": "^0.71.2", "code-chunk": "workspace:*", - "openai": "^4.0.0" + "dotenv": "^16.4.0", + "zod": "^3.24.0" }, "devDependencies": { - "@types/bun": "^1.3.4" + "@types/bun": "^1.3.4", + "typescript": "^5.0.0" } } diff --git a/packages/eval/pyproject.toml b/packages/eval/pyproject.toml deleted file mode 100644 index e3232ad..0000000 --- a/packages/eval/pyproject.toml +++ /dev/null @@ -1,9 +0,0 @@ -[project] -name = "eval" -version = "0.1.0" -description = "Add your description here" -readme = "README.md" -requires-python = ">=3.14" -dependencies = [ - "chonkie[code]>=1.0.5", -] diff --git a/packages/eval/src/chunkers/ast.ts b/packages/eval/src/chunkers/ast.ts deleted file mode 100644 index 67727ce..0000000 --- a/packages/eval/src/chunkers/ast.ts +++ /dev/null @@ -1,40 +0,0 @@ -/** - * AST-aware chunker wrapper for evaluation - * - * Wraps the code-chunk library for use in the evaluation harness. - * Uses the built-in contextualizedText for better embedding quality. - */ - -import { chunk } from 'code-chunk' - -/** - * Chunk a file using AST-aware chunking and return results - * in a format compatible with the evaluation - * - * @param filepath - Path to the file - * @param code - Source code content - * @param maxNws - Maximum NWS characters per chunk (default: 1500) - */ -export async function chunkFile( - filepath: string, - code: string, - maxNws: number = 1500, -): Promise< - Array<{ - id: string - text: string - startLine: number - endLine: number - }> -> { - const chunks = await chunk(filepath, code, { - maxChunkSize: maxNws, - }) - - return chunks.map((c) => ({ - id: `${filepath}:${c.lineRange.start}-${c.lineRange.end}`, - text: c.contextualizedText, - startLine: c.lineRange.start, - endLine: c.lineRange.end, - })) -} diff --git a/packages/eval/src/chunkers/chonkie.ts b/packages/eval/src/chunkers/chonkie.ts deleted file mode 100644 index 3c0327f..0000000 --- a/packages/eval/src/chunkers/chonkie.ts +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Chonkie CodeChunker wrapper for evaluation - * - * Wraps the Chonkie Python library's CodeChunker for use in the evaluation harness. - * Calls the Python script via subprocess. 
- */ - -import { spawn } from 'node:child_process' -import { dirname, join } from 'node:path' - -// Go up from src/chunkers to package root -const PACKAGE_ROOT = join(dirname(import.meta.dir), '..') -const PYTHON_PATH = join(PACKAGE_ROOT, '.venv', 'bin', 'python') -const SCRIPT_PATH = join(import.meta.dir, 'chonkie_chunk.py') - -interface ChunkResult { - id: string - text: string - startLine: number - endLine: number -} - -/** - * Chunk a file using Chonkie's CodeChunker and return results - * in a format compatible with the evaluation - * - * @param filepath - Path to the file - * @param code - Source code content - * @param maxChunkSize - Maximum characters per chunk (default: 1500) - */ -export async function chunkFile( - filepath: string, - code: string, - maxChunkSize: number = 1500, -): Promise { - return new Promise((resolve, reject) => { - const proc = spawn( - PYTHON_PATH, - [SCRIPT_PATH, filepath, String(maxChunkSize)], - { - stdio: ['pipe', 'pipe', 'pipe'], - }, - ) - - let stdout = '' - let stderr = '' - - proc.stdout.on('data', (data) => { - stdout += data.toString() - }) - - proc.stderr.on('data', (data) => { - stderr += data.toString() - }) - - proc.on('close', (code) => { - if (code !== 0) { - reject(new Error(`Chonkie chunker failed: ${stderr}`)) - return - } - - try { - const result = JSON.parse(stdout) - if (result.error) { - reject(new Error(`Chonkie error: ${result.error}`)) - return - } - resolve(result) - } catch { - reject(new Error(`Failed to parse Chonkie output: ${stdout}`)) - } - }) - - proc.on('error', (err) => { - reject(err) - }) - - // Write code to stdin - proc.stdin.write(code) - proc.stdin.end() - }) -} diff --git a/packages/eval/src/chunkers/chonkie_chunk.py b/packages/eval/src/chunkers/chonkie_chunk.py deleted file mode 100644 index 24da1e3..0000000 --- a/packages/eval/src/chunkers/chonkie_chunk.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -""" -Chonkie CodeChunker wrapper for evaluation. - -Takes filepath, code, and max_chunk_size as arguments. -Outputs JSON array of chunks with id, text, startLine, endLine. -""" - -import json -import sys -from chonkie import CodeChunker - - -def count_nws(text: str) -> int: - """Count non-whitespace characters to match the evaluation's sizing.""" - return sum(1 for c in text if not c.isspace()) - - -def main(): - if len(sys.argv) < 3: - print("Usage: chonkie_chunk.py ", file=sys.stderr) - print("Code is read from stdin", file=sys.stderr) - sys.exit(1) - - filepath = sys.argv[1] - max_chunk_size = int(sys.argv[2]) - - # Read code from stdin to handle large files and special characters - code = sys.stdin.read() - - # Determine language from file extension - ext = filepath.rsplit(".", 1)[-1].lower() if "." 
in filepath else "" - lang_map = { - "py": "python", - "js": "javascript", - "ts": "typescript", - "tsx": "tsx", - "jsx": "javascript", - "rs": "rust", - "go": "go", - "java": "java", - "c": "c", - "cpp": "cpp", - "h": "c", - "hpp": "cpp", - "rb": "ruby", - "php": "php", - "cs": "c_sharp", - "swift": "swift", - "kt": "kotlin", - "scala": "scala", - } - - language = lang_map.get(ext, "python") # Default to python for .py files - - try: - # Initialize CodeChunker with character tokenizer to match NWS-based sizing - # Use a simple character-based token counter - chunker = CodeChunker( - tokenizer_or_token_counter=lambda x: len(x), # Character count - chunk_size=max_chunk_size, - language=language, - include_nodes=False, - ) - - chunks = chunker.chunk(code) - - # Convert to evaluation format - results = [] - lines = code.split("\n") - - for chunk in chunks: - # Find line numbers from start/end indices - start_line = code[:chunk.start_index].count("\n") - end_line = code[:chunk.end_index].count("\n") - - results.append({ - "id": f"{filepath}:{start_line}-{end_line}", - "text": chunk.text, - "startLine": start_line, - "endLine": end_line, - }) - - print(json.dumps(results)) - - except Exception as e: - print(json.dumps({"error": str(e)}), file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/packages/eval/src/chunkers/fixed.ts b/packages/eval/src/chunkers/fixed.ts deleted file mode 100644 index c685b7b..0000000 --- a/packages/eval/src/chunkers/fixed.ts +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Fixed-size chunker for evaluation baseline - * - * Simple line-based chunker that splits code into fixed-size chunks - * based on non-whitespace character count. Used as a baseline comparison - * for the AST-aware chunker. - */ - -/** - * Count non-whitespace characters in a string - */ -function countNws(text: string): number { - let count = 0 - for (let i = 0; i < text.length; i++) { - if (text.charCodeAt(i) > 32) count++ - } - return count -} - -/** - * Chunk a file using fixed-size chunking based on NWS character count - * - * @param filepath - Path to the file (used for chunk IDs) - * @param code - Source code content - * @param maxNws - Maximum NWS characters per chunk (default: 1500) - */ -export async function chunkFile( - filepath: string, - code: string, - maxNws: number = 1500, -): Promise< - Array<{ - id: string - text: string - startLine: number - endLine: number - }> -> { - const lines = code.split('\n') - const chunks: Array<{ - id: string - text: string - startLine: number - endLine: number - }> = [] - - let currentLines: string[] = [] - let currentNws = 0 - let startLine = 0 - - for (let i = 0; i < lines.length; i++) { - const line = lines[i] ?? 
'' - const lineNws = countNws(line) - - if (currentNws + lineNws > maxNws && currentLines.length > 0) { - // Flush current chunk - const text = currentLines.join('\n') - const endLine = startLine + currentLines.length - 1 - chunks.push({ - id: `${filepath}:${startLine}-${endLine}`, - text, - startLine, - endLine, - }) - - // Start new chunk - currentLines = [line] - currentNws = lineNws - startLine = i - } else { - currentLines.push(line) - currentNws += lineNws - } - } - - // Flush remaining lines - if (currentLines.length > 0) { - const text = currentLines.join('\n') - const endLine = startLine + currentLines.length - 1 - chunks.push({ - id: `${filepath}:${startLine}-${endLine}`, - text, - startLine, - endLine, - }) - } - - return chunks -} diff --git a/packages/eval/src/debug_chunks.ts b/packages/eval/src/debug_chunks.ts deleted file mode 100644 index 4afb888..0000000 --- a/packages/eval/src/debug_chunks.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { readFileSync } from 'node:fs' -import { join } from 'node:path' -import { chunk } from 'code-chunk' -import { chunkFile as chunkFixed } from './chunkers/fixed' - -// Check deepmind_tracr/tracr/craft/transformers.py -// Assume we're looking for lines 100-150 -const testFile = join( - import.meta.dir, - '../data/repoeval/repositories/function_level/deepmind_tracr/tracr/craft/transformers.py', -) -const code = readFileSync(testFile, 'utf-8') -const targetStart = 100 -const targetEnd = 150 - -console.log('File:', testFile) -console.log('Target lines:', targetStart, '-', targetEnd) -console.log('') - -function countNws(text: string): number { - let count = 0 - for (let i = 0; i < text.length; i++) { - if (text.charCodeAt(i) > 32) count++ - } - return count -} - -function overlaps( - chunkStart: number, - chunkEnd: number, - tStart: number, - tEnd: number, -): boolean { - return !(chunkEnd < tStart || chunkStart > tEnd) -} - -for (const maxSize of [1500, 1800]) { - console.log(`\n=== Max chunk size: ${maxSize} ===`) - - const astChunks = await chunk(testFile, code, { maxChunkSize: maxSize }) - const fixedChunks = await chunkFixed(testFile, code, maxSize) - - console.log('\nAST chunks:') - for (const c of astChunks) { - const overlap = overlaps( - c.lineRange.start, - c.lineRange.end, - targetStart, - targetEnd, - ) - console.log( - ` Lines ${c.lineRange.start}-${c.lineRange.end} (${countNws(c.text)} NWS) ${overlap ? '*** RELEVANT ***' : ''}`, - ) - } - - console.log('\nFixed chunks:') - for (const c of fixedChunks) { - const overlap = overlaps(c.startLine, c.endLine, targetStart, targetEnd) - console.log( - ` Lines ${c.startLine}-${c.endLine} (${countNws(c.text)} NWS) ${overlap ? '*** RELEVANT ***' : ''}`, - ) - } - - const astRelevant = astChunks.filter((c) => - overlaps(c.lineRange.start, c.lineRange.end, targetStart, targetEnd), - ) - const fixedRelevant = fixedChunks.filter((c) => - overlaps(c.startLine, c.endLine, targetStart, targetEnd), - ) - - console.log( - `\nRelevant chunks: AST=${astRelevant.length}, Fixed=${fixedRelevant.length}`, - ) - console.log( - `Total chunks: AST=${astChunks.length}, Fixed=${fixedChunks.length}`, - ) -} diff --git a/packages/eval/src/download.ts b/packages/eval/src/download.ts deleted file mode 100644 index 4e1bd61..0000000 --- a/packages/eval/src/download.ts +++ /dev/null @@ -1,149 +0,0 @@ -/** - * Download RepoEval benchmark data - * - * Downloads: - * 1. Task datasets (queries, ground truth) from Microsoft CodeT repo - * 2. 
Function-level Python repositories for chunking - */ - -import { existsSync } from 'node:fs' -import { mkdir, writeFile } from 'node:fs/promises' -import { join } from 'node:path' - -const DATA_DIR = join(import.meta.dir, '..', 'data', 'repoeval') -const DATASETS_DIR = join(DATA_DIR, 'datasets') -const REPOS_DIR = join(DATA_DIR, 'repositories', 'function_level') - -// Function-level repositories from RepoEval -const REPOS_FUNCTION = [ - 'amazon-science_patchcore-inspection', - 'deepmind_tracr', - 'facebookresearch_omnivore', - 'google_lightweight_mmm', - 'lucidrains_imagen-pytorch', - 'maxhumber_redframes', -] - -async function downloadAndExtractZip( - url: string, - destDir: string, -): Promise { - console.log(`Downloading from ${url}...`) - - const response = await fetch(url) - if (!response.ok) { - throw new Error(`Failed to download: ${response.statusText}`) - } - - const arrayBuffer = await response.arrayBuffer() - const tempZipPath = join(destDir, '_temp.zip') - - await mkdir(destDir, { recursive: true }) - await writeFile(tempZipPath, new Uint8Array(arrayBuffer)) - - // Use unzip command - const proc = Bun.spawn(['unzip', '-o', '-q', tempZipPath, '-d', destDir], { - cwd: destDir, - }) - await proc.exited - - // Clean up temp file - await Bun.spawn(['rm', tempZipPath]).exited - - console.log(`Extracted to ${destDir}`) -} - -async function downloadDatasets(): Promise { - if (existsSync(DATASETS_DIR)) { - console.log('Datasets already downloaded, skipping...') - return - } - - const datasetsUrl = - 'https://github.com/microsoft/CodeT/raw/main/RepoCoder/datasets/datasets.zip' - await downloadAndExtractZip(datasetsUrl, DATASETS_DIR) -} - -async function downloadRepositories(): Promise { - if (existsSync(REPOS_DIR)) { - console.log('Repositories already downloaded, skipping...') - return - } - - // Using the cleaned version from Veronicium's fork - const reposUrl = - 'https://github.com/Veronicium/repoeval_debug/raw/main/function_level.zip' - await downloadAndExtractZip(reposUrl, REPOS_DIR) -} - -export interface RepoEvalTask { - prompt: string - metadata: { - task_id: string - ground_truth: string - fpath_tuple: string[] - line_no: number - lineno: number - context_start_lineno: number - } -} - -export async function loadTasks( - contextLength: '1k' | '2k' | '4k' = '2k', -): Promise { - const fileName = `function_level_completion_${contextLength}_context_codex.test.jsonl` - const filePath = join(DATASETS_DIR, fileName) - - const content = await Bun.file(filePath).text() - const lines = content.trim().split('\n') - - const tasks: RepoEvalTask[] = [] - const repo2idx: Record = {} - - for (const line of lines) { - const task = JSON.parse(line) as RepoEvalTask - - // Clean up task_id format - const repo = task.metadata.task_id.replace('--', '_').split('/')[0] - if (!repo || !REPOS_FUNCTION.includes(repo)) continue - - if (!(repo in repo2idx)) { - repo2idx[repo] = 0 - } - - task.metadata.task_id = task.metadata.task_id - .replace('--', '_') - .replace('idx', String(repo2idx[repo] ?? 0)) - task.metadata.line_no = task.metadata.lineno - repo2idx[repo] = (repo2idx[repo] ?? 
0) + 1 - - tasks.push(task) - } - - return tasks -} - -export function getReposDir(): string { - return REPOS_DIR -} - -export function getRepos(): string[] { - return REPOS_FUNCTION -} - -export async function download(): Promise { - console.log('Downloading RepoEval benchmark data...\n') - - await mkdir(DATA_DIR, { recursive: true }) - - await downloadDatasets() - await downloadRepositories() - - console.log('\nDownload complete!') - console.log(`Data stored in: ${DATA_DIR}`) -} - -// Run if executed directly -if (import.meta.main) { - await download() -} diff --git a/packages/eval/src/embeddings.ts b/packages/eval/src/embeddings.ts deleted file mode 100644 index 0836f2a..0000000 --- a/packages/eval/src/embeddings.ts +++ /dev/null @@ -1,219 +0,0 @@ -/** - * OpenAI embeddings wrapper with disk caching - */ - -import { createHash } from 'node:crypto' -import { existsSync } from 'node:fs' -import { mkdir, readFile, writeFile } from 'node:fs/promises' -import { join } from 'node:path' -import OpenAI from 'openai' - -const CACHE_DIR = join(import.meta.dir, '..', 'cache', 'embeddings') -const MODEL = 'text-embedding-3-small' -const BATCH_SIZE = 100 - -let client: OpenAI | null = null - -function getClient(): OpenAI { - if (!client) { - client = new OpenAI() - } - return client -} - -/** - * Create a cache key from text content - */ -function cacheKey(text: string): string { - return createHash('sha256').update(text).digest('hex').slice(0, 16) -} - -/** - * Get cache file path for a text - */ -function cachePath(text: string): string { - const key = cacheKey(text) - // Use first 2 chars as subdirectory to avoid too many files in one dir - return join(CACHE_DIR, key.slice(0, 2), `${key}.json`) -} - -/** - * Try to load embedding from cache - */ -async function loadFromCache(text: string): Promise { - const path = cachePath(text) - if (!existsSync(path)) { - return null - } - try { - const data = await readFile(path, 'utf-8') - return JSON.parse(data) as number[] - } catch { - return null - } -} - -/** - * Save embedding to cache - */ -async function saveToCache(text: string, embedding: number[]): Promise { - const path = cachePath(text) - const dir = join(path, '..') - await mkdir(dir, { recursive: true }) - await writeFile(path, JSON.stringify(embedding)) -} - -/** - * Embed a batch of texts using OpenAI API - */ -async function embedBatch(texts: string[]): Promise { - const openai = getClient() - - // Filter out empty texts and track their indices - const nonEmptyTexts: string[] = [] - const indexMap: number[] = [] - - for (let i = 0; i < texts.length; i++) { - const text = texts[i]?.trim() ?? 
'' - if (text.length > 0) { - nonEmptyTexts.push(text) - indexMap.push(i) - } - } - - if (nonEmptyTexts.length === 0) { - // Return zero vectors for all empty inputs - return texts.map(() => new Array(1536).fill(0)) - } - - const response = await openai.embeddings.create({ - model: MODEL, - input: nonEmptyTexts, - }) - - // Sort by index to maintain order - const sorted = response.data.sort( - (a: { index: number }, b: { index: number }) => a.index - b.index, - ) - const embeddings = sorted.map((d: { embedding: number[] }) => d.embedding) - - // Map back to original indices, filling zeros for empty texts - const result: number[][] = texts.map(() => new Array(1536).fill(0)) - for (let i = 0; i < indexMap.length; i++) { - const idx = indexMap[i] - const emb = embeddings[i] - if (idx !== undefined && emb !== undefined) { - result[idx] = emb - } - } - - return result -} - -/** - * Embed texts with caching - * - * @param texts - Array of texts to embed - * @param onProgress - Optional callback for progress updates (done, total, cachedCount) - * @returns Array of embeddings (same order as input texts) - */ -export async function embedTexts( - texts: string[], - onProgress?: (done: number, total: number, cached: number) => void, -): Promise { - await mkdir(CACHE_DIR, { recursive: true }) - - const results: (number[] | null)[] = new Array(texts.length).fill(null) - const uncachedIndices: number[] = [] - const uncachedTexts: string[] = [] - - // Check cache for each text - for (let i = 0; i < texts.length; i++) { - const text = texts[i] - if (!text) continue - const cached = await loadFromCache(text) - if (cached) { - results[i] = cached - } else { - uncachedIndices.push(i) - uncachedTexts.push(text) - } - } - - const cachedCount = texts.length - uncachedTexts.length - - // Report initial state if all cached - if (onProgress && uncachedTexts.length === 0) { - onProgress(texts.length, texts.length, cachedCount) - } - - // Embed uncached texts in batches - for (let i = 0; i < uncachedTexts.length; i += BATCH_SIZE) { - const batch = uncachedTexts.slice(i, i + BATCH_SIZE) - const batchIndices = uncachedIndices.slice(i, i + BATCH_SIZE) - - const embeddings = await embedBatch(batch) - - // Save to cache and store results - for (let j = 0; j < embeddings.length; j++) { - const originalIdx = batchIndices[j] - const embedding = embeddings[j] - const text = batch[j] - if (originalIdx === undefined || !embedding || !text) continue - results[originalIdx] = embedding - await saveToCache(text, embedding) - } - - if (onProgress) { - onProgress( - cachedCount + Math.min(i + BATCH_SIZE, uncachedTexts.length), - texts.length, - cachedCount, - ) - } - } - - return results as number[][] -} - -/** - * Compute cosine similarity between two vectors - */ -export function cosineSimilarity(a: number[], b: number[]): number { - let dotProduct = 0 - let normA = 0 - let normB = 0 - - for (let i = 0; i < a.length; i++) { - const ai = a[i] ?? 0 - const bi = b[i] ?? 
0 - dotProduct += ai * bi - normA += ai * ai - normB += bi * bi - } - - return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) -} - -/** - * Find top-k most similar items - * - * @param queryEmbedding - The query embedding - * @param corpusEmbeddings - Array of corpus embeddings - * @param k - Number of top results to return - * @returns Array of { index, score } sorted by score descending - */ -export function topK( - queryEmbedding: number[], - corpusEmbeddings: number[][], - k: number, -): Array<{ index: number; score: number }> { - const scores = corpusEmbeddings.map((emb, idx) => ({ - index: idx, - score: cosineSimilarity(queryEmbedding, emb), - })) - - scores.sort((a, b) => b.score - a.score) - - return scores.slice(0, k) -} diff --git a/packages/eval/src/metrics.ts b/packages/eval/src/metrics.ts deleted file mode 100644 index a0aaac5..0000000 --- a/packages/eval/src/metrics.ts +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Retrieval metrics for evaluation - * - * Computes precision, recall, and nDCG for retrieval evaluation. - */ - -/** - * Compute precision, recall, and nDCG for a single query - * - * @param retrievedIds - Ordered list of retrieved chunk IDs - * @param relevantSet - Set of relevant (ground truth) chunk IDs - * @param k - Number of results to consider - */ -export function computeMetrics( - retrievedIds: string[], - relevantSet: Set, - k: number, -): { precision: number; recall: number; ndcg: number } { - const topK = retrievedIds.slice(0, k) - - // Precision@k: fraction of retrieved that are relevant - const relevantInTopK = topK.filter((id) => relevantSet.has(id)).length - const precision = relevantInTopK / k - - // Recall@k: fraction of relevant that are retrieved - const recall = relevantSet.size > 0 ? relevantInTopK / relevantSet.size : 0 - - // nDCG@k: normalized discounted cumulative gain - const dcg = topK.reduce((sum, id, i) => { - const rel = relevantSet.has(id) ? 1 : 0 - return sum + rel / Math.log2(i + 2) // i+2 because log2(1) = 0 - }, 0) - - // Ideal DCG: all relevant docs at top - const idealK = Math.min(k, relevantSet.size) - const idcg = Array.from({ length: idealK }).reduce( - (sum, _, i) => sum + 1 / Math.log2(i + 2), - 0, - ) - - const ndcg = idcg > 0 ? dcg / idcg : 0 - - return { precision, recall, ndcg } -} - -/** - * Aggregate metrics across multiple queries - * - * @param metrics - Array of metric objects - */ -export function aggregateMetrics( - metrics: Array<{ precision: number; recall: number; ndcg: number }>, -): { precision: number; recall: number; ndcg: number } { - if (metrics.length === 0) { - return { precision: 0, recall: 0, ndcg: 0 } - } - - const sum = metrics.reduce( - (acc, m) => ({ - precision: acc.precision + m.precision, - recall: acc.recall + m.recall, - ndcg: acc.ndcg + m.ndcg, - }), - { precision: 0, recall: 0, ndcg: 0 }, - ) - - return { - precision: sum.precision / metrics.length, - recall: sum.recall / metrics.length, - ndcg: sum.ndcg / metrics.length, - } -} diff --git a/packages/eval/src/run.ts b/packages/eval/src/run.ts index db892d1..d9b6083 100644 --- a/packages/eval/src/run.ts +++ b/packages/eval/src/run.ts @@ -1,452 +1,177 @@ +#!/usr/bin/env bun /** - * RepoEval Retrieval Evaluation Runner + * SWE-bench Lite Retrieval-Only Evaluation Harness * - * Compares AST-aware chunking vs fixed-size chunking on code retrieval. 
+ * CLI entrypoint for running retrieval-only evaluation comparing: + * - Agent1 (ops-only): Read/Grep/Glob + * - Agent2 (ops+search): Read/Grep/Glob + semantic search * * Usage: - * bun eval/run.ts - */ - -import { readdirSync, statSync } from 'node:fs' -import { mkdir, readFile, writeFile } from 'node:fs/promises' -import { join } from 'node:path' -import { chunkFile as chunkWithAST } from './chunkers/ast' -import { chunkFile as chunkWithChonkie } from './chunkers/chonkie' -import { chunkFile as chunkWithFixed } from './chunkers/fixed' -import { - download, - getRepos, - getReposDir, - loadTasks, - type RepoEvalTask, -} from './download' -import { embedTexts, topK } from './embeddings' -import { aggregateMetrics, computeMetrics } from './metrics' - -const RESULTS_DIR = join(import.meta.dir, '..', 'results') -const K_VALUES = [5, 10] // Top-k values for retrieval -const MAX_CHUNK_SIZE = 1500 // NWS characters per chunk - -// Colors for terminal output -const dim = (s: string) => `\x1b[2m${s}\x1b[0m` -const bold = (s: string) => `\x1b[1m${s}\x1b[0m` -const green = (s: string) => `\x1b[32m${s}\x1b[0m` -const yellow = (s: string) => `\x1b[33m${s}\x1b[0m` -const cyan = (s: string) => `\x1b[36m${s}\x1b[0m` - -// Status line helper - overwrites current line -function status(msg: string) { - process.stdout.write(`\r\x1b[K${dim(msg)}`) -} - -function clearStatus() { - process.stdout.write('\r\x1b[K') -} - -interface ChunkInfo { - id: string - text: string - startLine: number - endLine: number - filepath: string -} - -interface MetricsAtK { - precision: number - recall: number - ndcg: number -} - -interface QueryResult { - taskId: string - prompt: string - groundTruthLines: { start: number; end: number } - groundTruthFile: string - retrievedChunks: Array<{ id: string; score: number; rank: number }> - relevantChunkIds: string[] - metrics: Record // metrics per k value -} - -type ChunkerType = 'ast' | 'chonkie' | 'fixed' - -interface EvalResult { - chunker: ChunkerType - repo: string - summary: Record // summary per k value - queryResults: QueryResult[] - config: { kValues: number[]; maxChunkSize: number } - timestamp: string -} - -/** - * Recursively find all Python files in a directory + * bun run src/run.ts [options] + * + * Options: + * --split Dataset split (default: test) + * --max-instances Limit number of instances to process + * --max-turns Max agent turns per instance (default: 20) + * --max-tool-calls Max tool calls per agent (default: 50) + * --model Claude model to use (default: claude-sonnet-4-5) + * --skip-agent1 Skip Agent1 (ops-only) + * --skip-agent2 Skip Agent2 (ops+search) + * --instance Run specific instance(s), can be repeated + * --run-dir Output directory for runs (default: ./runs) + * --embedding-provider Embedding provider (default: gemini) + * --embedding-dimensions Output dimensions for Gemini (768/1536/3072) + * + * Environment Variables: + * ANTHROPIC_API_KEY Required for Claude API access + * GOOGLE_API_KEY Required for Gemini embeddings (default provider) + * OPENAI_API_KEY Required for OpenAI embeddings (if --embedding-provider openai) + * + * Examples: + * # Run full evaluation on test split + * bun run src/run.ts + * + * # Run on dev split with max 10 instances + * bun run src/run.ts --split dev --max-instances 10 + * + * # Run only Agent1 for debugging + * bun run src/run.ts --skip-agent2 --max-instances 5 + * + * # Run specific instance + * bun run src/run.ts --instance django__django-12345 */ -function findPythonFiles(dir: string): string[] { - const files: 
string[] = [] - function walk(currentDir: string) { - const entries = readdirSync(currentDir) - for (const entry of entries) { - const fullPath = join(currentDir, entry) - const stat = statSync(fullPath) - if (stat.isDirectory()) { - walk(fullPath) - } else if (entry.endsWith('.py')) { - files.push(fullPath) - } +import dotenv from 'dotenv' +import { type RunConfig, runEvaluation } from './swebench/run' + +// Load environment variables +dotenv.config() + +// Parse command line arguments +function parseArgs(): RunConfig { + const args = process.argv.slice(2) + const config: RunConfig = {} + const instanceIds: string[] = [] + + for (let i = 0; i < args.length; i++) { + const arg = args[i] + const next = args[i + 1] + + switch (arg) { + case '--split': + if (next === 'dev' || next === 'test') { + config.split = next + i++ + } + break + case '--max-instances': + if (next) config.maxInstances = parseInt(next, 10) + i++ + break + case '--max-turns': + if (next) config.maxTurns = parseInt(next, 10) + i++ + break + case '--max-tool-calls': + if (next) config.maxToolCalls = parseInt(next, 10) + i++ + break + case '--model': + if (next) config.model = next + i++ + break + case '--skip-agent1': + config.skipAgent1 = true + break + case '--skip-agent2': + config.skipAgent2 = true + break + case '--instance': + if (next) instanceIds.push(next) + i++ + break + case '--run-dir': + if (next) config.runDir = next + i++ + break + case '--embedding-provider': + if (next === 'gemini' || next === 'openai') { + config.embeddingProvider = next + i++ + } + break + case '--embedding-dimensions': + config.embeddingDimensions = parseInt(next!, 10) + i++ + break + case '--help': + case '-h': + console.log(` +SWE-bench Lite Retrieval-Only Evaluation Harness + +Usage: + bun run src/run.ts [options] + +Options: + --split Dataset split (default: test) + --max-instances Limit number of instances to process + --max-turns Max agent turns per instance (default: 20) + --max-tool-calls Max tool calls per agent (default: 50) + --model Claude model to use (default: claude-sonnet-4-5) + --skip-agent1 Skip Agent1 (ops-only) + --skip-agent2 Skip Agent2 (ops+search) + --instance Run specific instance(s), can be repeated + --run-dir Output directory for runs (default: ./runs) + --embedding-provider Embedding provider (default: gemini) + --embedding-dimensions Output dimensions for Gemini (768/1536/3072) + --help, -h Show this help message + +Environment Variables: + ANTHROPIC_API_KEY Required for Claude API access + GOOGLE_API_KEY Required for Gemini embeddings (default provider) + OPENAI_API_KEY Required for OpenAI embeddings (if --embedding-provider openai) +`) + process.exit(0) } } - walk(dir) - return files -} - -/** - * Check if a chunk overlaps with a line range - */ -function chunksOverlap( - chunk: { startLine: number; endLine: number }, - target: { start: number; end: number }, -): boolean { - return !(chunk.endLine < target.start || chunk.startLine > target.end) -} - -interface EmbedStats { - cached: number - total: number -} - -/** - * Run evaluation for a single repository and chunker - */ -async function evaluateRepo( - repo: string, - tasks: RepoEvalTask[], - chunkerType: ChunkerType, -): Promise<{ result: EvalResult; chunkCount: number; embedStats: EmbedStats }> { - const repoDir = join(getReposDir(), repo) - const pyFiles = findPythonFiles(repoDir) - - // Step 1: Chunk all files - status(`[${chunkerType}] chunking ${pyFiles.length} files...`) - const allChunks: ChunkInfo[] = [] - - for (const filepath of pyFiles) { - 
const code = await readFile(filepath, 'utf-8') - const relPath = filepath.replace(`${repoDir}/`, '') - - try { - let chunks: Awaited> - switch (chunkerType) { - case 'ast': - chunks = await chunkWithAST(filepath, code, MAX_CHUNK_SIZE) - break - case 'chonkie': - chunks = await chunkWithChonkie(filepath, code, MAX_CHUNK_SIZE) - break - case 'fixed': - chunks = await chunkWithFixed(filepath, code, MAX_CHUNK_SIZE) - break - } - - for (const chunk of chunks) { - allChunks.push({ - ...chunk, - filepath: relPath, - }) - } - } catch { - // Skip files that fail to parse - } + if (instanceIds.length > 0) { + config.instanceIds = instanceIds } - // Step 2: Embed all chunks - status(`[${chunkerType}] embedding ${allChunks.length} chunks...`) - const chunkTexts = allChunks.map((c) => c.text) - let embedStats: EmbedStats = { cached: 0, total: chunkTexts.length } - const chunkEmbeddings = await embedTexts( - chunkTexts, - (done, total, cached) => { - embedStats = { cached, total } - status( - `[${chunkerType}] embedding chunks ${done}/${total} (${cached} cached)`, - ) - }, - ) - - // Step 3: Embed queries - status(`[${chunkerType}] embedding ${tasks.length} queries...`) - const queryTexts = tasks.map((t) => t.prompt) - const queryEmbeddings = await embedTexts(queryTexts) - - // Step 4: For each query, retrieve top-k and compute metrics - status(`[${chunkerType}] computing metrics...`) - const queryResults: QueryResult[] = [] - const maxK = Math.max(...K_VALUES) - - for (let i = 0; i < tasks.length; i++) { - const task = tasks[i] - const queryEmb = queryEmbeddings[i] - if (!task || !queryEmb) continue - - const topKResults = topK(queryEmb, chunkEmbeddings, maxK) - - const targetFile = task.metadata.fpath_tuple.slice(1).join('/') - const targetLines = { - start: task.metadata.context_start_lineno, - end: task.metadata.line_no, - } - - const relevantChunkIds = allChunks - .filter((c) => c.filepath === targetFile && chunksOverlap(c, targetLines)) - .map((c) => c.id) - - const relevantSet = new Set(relevantChunkIds) - const retrievedIds = topKResults.map((r) => allChunks[r.index]?.id ?? '') - - const metrics: Record = {} - for (const k of K_VALUES) { - metrics[k] = computeMetrics(retrievedIds, relevantSet, k) - } - - queryResults.push({ - taskId: task.metadata.task_id, - prompt: `${task.prompt.slice(0, 200)}...`, - groundTruthLines: targetLines, - groundTruthFile: targetFile, - retrievedChunks: topKResults.map((r, rank) => ({ - id: allChunks[r.index]?.id ?? 
'', - score: r.score, - rank: rank + 1, - })), - relevantChunkIds, - metrics, - }) - } - - const summary: Record = {} - for (const k of K_VALUES) { - const metricsAtK = queryResults - .map((q) => q.metrics[k]) - .filter((m): m is MetricsAtK => m !== undefined) - summary[k] = aggregateMetrics(metricsAtK) - } - - clearStatus() - - return { - result: { - chunker: chunkerType, - repo, - summary, - queryResults, - config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE }, - timestamp: new Date().toISOString(), - }, - chunkCount: allChunks.length, - embedStats, - } -} - -/** - * Print metrics table for all k values - */ -function printMetricsTable( - summaries: Record>, -): void { - const chunkerNames = Object.keys(summaries) - - for (const k of K_VALUES) { - console.log(dim(`@${k}`)) - console.log( - `${dim('Chunker'.padEnd(12))} ${dim('nDCG'.padStart(8))} ${dim('Prec'.padStart(8))} ${dim('Recall'.padStart(8))}`, - ) - - for (const name of chunkerNames) { - const m = summaries[name]?.[k] - if (!m) continue - const ndcg = yellow(`${(m.ndcg * 100).toFixed(1)}%`.padStart(8)) - const prec = `${(m.precision * 100).toFixed(1)}%`.padStart(8) - const recall = `${(m.recall * 100).toFixed(1)}%`.padStart(8) - console.log(`${cyan(name.padEnd(12))} ${ndcg} ${prec} ${recall}`) - } - console.log('') - } + return config } +// Main async function main() { - console.log(bold('\nRepoEval Retrieval Evaluation\n')) + // Parse args first (handles --help early exit) + const config = parseArgs() - // Step 1: Download data if needed - await download() - - // Step 2: Load tasks - status('loading tasks...') - const allTasks = await loadTasks('2k') - clearStatus() - - // Group tasks by repo - const tasksByRepo = new Map() - for (const task of allTasks) { - const repo = task.metadata.task_id.split('/')[0] - if (!repo) continue - if (!tasksByRepo.has(repo)) { - tasksByRepo.set(repo, []) - } - const repoTasks = tasksByRepo.get(repo) - if (repoTasks) { - repoTasks.push(task) - } + // Check required env vars + if (!process.env.ANTHROPIC_API_KEY) { + console.error('Error: ANTHROPIC_API_KEY environment variable is required') + process.exit(1) } - // Step 3: Run evaluation for each repo and chunker - await mkdir(RESULTS_DIR, { recursive: true }) - - const allResults: EvalResult[] = [] - const repos = getRepos() - const chunkerTypes: ChunkerType[] = ['ast', 'chonkie', 'fixed'] - - // Display names for chunkers - const chunkerNames: Record = { - ast: 'AST', - chonkie: 'Chonkie', - fixed: 'Fixed', - } - - for (let repoIdx = 0; repoIdx < repos.length; repoIdx++) { - const repo = repos[repoIdx] - if (!repo) continue - const tasks = tasksByRepo.get(repo) - if (!tasks || tasks.length === 0) { - continue - } - - console.log( - `${dim(`[${repoIdx + 1}/${repos.length}]`)} ${bold(repo)} ${dim(`(${tasks.length} tasks)`)}`, + // Check embedding provider env var + const provider = config.embeddingProvider || 'gemini' + if (provider === 'gemini' && !process.env.GOOGLE_API_KEY) { + console.warn( + 'Warning: GOOGLE_API_KEY not set. Agent2 (semantic search) will be skipped.', + ) + } else if (provider === 'openai' && !process.env.OPENAI_API_KEY) { + console.warn( + 'Warning: OPENAI_API_KEY not set. 
Agent2 (semantic search) will be skipped.', ) - - const repoResults: Record< - string, - { result: EvalResult; chunkCount: number; embedStats: EmbedStats } - > = {} - - for (const chunkerType of chunkerTypes) { - const evalResult = await evaluateRepo(repo, tasks, chunkerType) - repoResults[chunkerType] = evalResult - allResults.push(evalResult.result) - } - - // Print summary line for this repo - const summaryParts = chunkerTypes.map((ct) => { - const r = repoResults[ct] - if (!r) return '' - const { chunkCount, embedStats } = r - const cachedPct = - embedStats.total > 0 - ? Math.round((embedStats.cached / embedStats.total) * 100) - : 0 - return `${cyan(chunkerNames[ct])}: ${chunkCount} ${dim(`(${cachedPct}%)`)}` - }) - console.log(` ${summaryParts.join(' ')}`) - - // Print quick metrics comparison - const k = K_VALUES[0] - if (k !== undefined) { - const metricsLine = chunkerTypes.map((ct) => { - const r = repoResults[ct] - if (!r) return '' - const ndcg = (r.result.summary[k]?.ndcg ?? 0) * 100 - return `${chunkerNames[ct]}: ${yellow(ndcg.toFixed(1))}%` - }) - console.log(` ${dim(`nDCG@${k}:`)} ${metricsLine.join(' ')}\n`) - } - } - - // Step 4: Compute overall summary - console.log(bold('Results')) - console.log(dim('─'.repeat(60))) - - // Aggregate results by chunker type - const overallByChunker: Record> = {} - for (const ct of chunkerTypes) { - const results = allResults.filter((r) => r.chunker === ct) - const name = chunkerNames[ct] - overallByChunker[name] = {} - for (const k of K_VALUES) { - const metricsAtK = results - .map((r) => r.summary[k]) - .filter((m): m is MetricsAtK => m !== undefined) - const chunkerMetrics = overallByChunker[name] - if (chunkerMetrics) { - chunkerMetrics[k] = aggregateMetrics(metricsAtK) - } - } - } - - printMetricsTable(overallByChunker) - - // Compute improvements vs Fixed baseline - const fixedOverall = overallByChunker[chunkerNames.fixed] - const computeImprovement = (a: number, b: number): string => { - if (b === 0) return 'N/A' - const improvement = ((a - b) / b) * 100 - const sign = improvement >= 0 ? '+' : '' - return improvement >= 0 - ? green(`${sign}${improvement.toFixed(1)}%`) - : `${sign}${improvement.toFixed(1)}%` } + console.log('Starting SWE-bench Lite retrieval-only evaluation...\n') - console.log(dim('vs Fixed baseline:')) - for (const k of K_VALUES) { - const parts = chunkerTypes - .filter((ct) => ct !== 'fixed') - .map((ct) => { - const overall = overallByChunker[chunkerNames[ct]] - const fixedNdcg = fixedOverall?.[k]?.ndcg ?? 0 - const overallNdcg = overall?.[k]?.ndcg ?? 
0 - return `${cyan(chunkerNames[ct])} ${computeImprovement(overallNdcg, fixedNdcg)}` - }) - console.log(` k=${k}: ${parts.join(' ')}`) + try { + await runEvaluation(config) + } catch (err) { + console.error('Evaluation failed:', err) + process.exit(1) } - - // Step 5: Save results - const timestamp = new Date().toISOString().replace(/[:.]/g, '-') - - // Save summary - const summaryPath = join(RESULTS_DIR, `summary_${timestamp}.json`) - await writeFile( - summaryPath, - JSON.stringify( - { - overall: overallByChunker, - perRepo: Object.fromEntries( - repos.map((repo) => [ - repo, - Object.fromEntries( - chunkerTypes.map((ct) => [ - ct, - allResults.find((r) => r.repo === repo && r.chunker === ct) - ?.summary, - ]), - ), - ]), - ), - config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE }, - timestamp: new Date().toISOString(), - }, - null, - 2, - ), - ) - - // Save detailed results - const detailedPath = join(RESULTS_DIR, `detailed_${timestamp}.json`) - await writeFile(detailedPath, JSON.stringify(allResults, null, 2)) - - console.log(`\n${dim(`Saved to ${summaryPath}`)}`) } -// Run if executed directly -if (import.meta.main) { - main().catch(console.error) -} +main() diff --git a/packages/eval/src/swebench/agent/prompts.ts b/packages/eval/src/swebench/agent/prompts.ts new file mode 100644 index 0000000..92a2eb1 --- /dev/null +++ b/packages/eval/src/swebench/agent/prompts.ts @@ -0,0 +1,163 @@ +/** + * Shared prompt templates for retrieval-only agents + */ + +/** + * Base system prompt for retrieval-only evaluation + * Instructs the agent to locate files without making changes + */ +const BASE_SYSTEM_PROMPT = `You are a skilled software engineer helping to locate the source files that need to be modified to address a bug or feature request. + +## Your Task + +Given a problem statement describing a bug or feature request, your goal is to identify the **most relevant source files** in the repository that would need to be modified to address the issue. + +## Important Constraints + +1. **DO NOT** propose any code changes, patches, or fixes +2. **DO NOT** write or edit any files +3. **DO NOT** run any commands that modify the repository +4. **ONLY** read files and search the codebase to locate relevant files + +## Working Directory + +You are working in a repository checkout. All file paths should be relative to the repository root. +- Use patterns like \`src/**/*.py\` or \`**/rules/*.py\` for Glob +- Use paths like \`src/\` or \`.\` for Grep path parameter +- All file paths in your output should be relative (e.g., \`src/module/file.py\`) + +## Available Tools + +You have access to these read-only tools: +- **Read**: Read file contents +- **Grep**: Search for patterns in files +- **Glob**: Find files matching a pattern +- **LS**: List directory contents + +**NOTE**: Bash, shell commands, and terminal access are NOT available. + +## Approach + +Understand the problem statement first - look for key terms, error messages, function names, or class names mentioned. + +Use the tools to navigate and search the codebase: +- **Glob** is useful for finding files by name patterns +- **Grep** is useful for searching file contents by keyword +- **Read** lets you examine file contents in detail +- **LS** shows directory structure + +Aim to identify 3-10 files that would need modification. 
+ +## Output Format + +When you have identified the relevant files, output your final answer as a JSON object: + +\`\`\`json +{ + "top_files": [ + "path/to/most/relevant/file.py", + "path/to/second/relevant/file.py" + ], + "reason": "Brief explanation of why these files are relevant" +} +\`\`\` + +List files in order of relevance (most relevant first). Aim for 3-10 files.` + +/** + * System prompt for Agent1 (ops-only) + */ +export const RETRIEVAL_ONLY_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + +/** + * System prompt for Agent2 (ops + semantic search) + * Includes information about the semantic search tool + */ +export const RETRIEVAL_WITH_SEARCH_SYSTEM_PROMPT = `${BASE_SYSTEM_PROMPT} + +## Semantic Code Search (Your Primary Tool) + +You have **mcp__semantic_search__search** - use this to quickly find relevant code: + +\`\`\` +mcp__semantic_search__search({"query": "description of what you're looking for", "top_k": 10}) +\`\`\` + +The codebase is pre-indexed. One semantic search call typically finds relevant files faster than multiple Glob/Grep calls. + +**Important**: Semantic search returns absolute file paths that you can use directly with Read. After identifying candidate files, use Read with the exact paths from the results.` + +/** + * Create the user prompt with just the problem statement + * @param problemStatement - The SWE-bench problem statement + * @param repo - Optional repo name for context + * @param hasSemanticSearch - Whether Agent2's semantic search is available + */ +export function createUserPrompt( + problemStatement: string, + repo?: string, + hasSemanticSearch = false, +): string { + const repoInfo = repo + ? `\n\nYou are working in the **${repo}** repository.\n` + : '' + + const searchGuidance = hasSemanticSearch + ? `**Recommended approach**: Start with semantic search to quickly find relevant code areas. Then use Read to examine specific files.` + : `Start by exploring the repository structure (use LS or Glob) to understand the codebase layout. Then search for relevant code using the available tools.` + + return `## Problem Statement + +${problemStatement} +${repoInfo} +--- + +Please analyze the problem and identify the source files that would need to be modified to address this issue. + +${searchGuidance} + +Remember to output your final answer as JSON with "top_files" and "reason" fields. 
Use relative paths (e.g., \`src/module/file.py\`).` +} + +/** + * Parse the agent's final output to extract top_files + * Handles various output formats the agent might use + */ +export function parseTopFiles(output: string): string[] { + // Try to find JSON in the output + const jsonMatch = output.match(/```json\s*([\s\S]*?)\s*```/) + if (jsonMatch?.[1]) { + try { + const parsed = JSON.parse(jsonMatch[1]) + if (Array.isArray(parsed.top_files)) { + return parsed.top_files + } + } catch { + // Continue to fallback + } + } + + // Try to parse raw JSON + const rawJsonMatch = output.match(/\{[\s\S]*"top_files"[\s\S]*\}/) + if (rawJsonMatch) { + try { + const parsed = JSON.parse(rawJsonMatch[0]) + if (Array.isArray(parsed.top_files)) { + return parsed.top_files + } + } catch { + // Continue to fallback + } + } + + // Fallback: extract file paths from the text + // Match common file path patterns (e.g., path/to/file.py) + const pathMatches = output.match(/[\w\-./]+\.[a-z]+/gi) || [] + const uniquePaths = [...new Set(pathMatches)].filter( + (p) => + // Filter out common non-file patterns + !p.startsWith('http') && !p.includes('...') && p.includes('/'), + ) + + return uniquePaths.slice(0, 10) +} diff --git a/packages/eval/src/swebench/agent/semantic_search_adapter.ts b/packages/eval/src/swebench/agent/semantic_search_adapter.ts new file mode 100644 index 0000000..1e2163e --- /dev/null +++ b/packages/eval/src/swebench/agent/semantic_search_adapter.ts @@ -0,0 +1,600 @@ +/** + * Semantic Search Adapter: Interface for the underlying index implementation + * This provides a pluggable boundary to integrate with code-chunk or other indexers + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { join } from 'node:path' +import { createSdkMcpServer, tool } from '@anthropic-ai/claude-agent-sdk' +import { chunk as codeChunk } from 'code-chunk' +import { z } from 'zod' +import type { SemanticSearchResult } from '../types' + +/** + * Task type for embedding generation (provider-specific optimization) + * - RETRIEVAL_DOCUMENT: For indexing documents/code chunks + * - CODE_RETRIEVAL_QUERY: For search queries optimized for code retrieval + * - RETRIEVAL_QUERY: For general search queries + */ +export type EmbeddingTaskType = + | 'RETRIEVAL_DOCUMENT' + | 'CODE_RETRIEVAL_QUERY' + | 'RETRIEVAL_QUERY' + +/** + * Embedding service interface + */ +export interface EmbeddingService { + embed( + texts: string[], + taskType?: EmbeddingTaskType, + ): Promise<{ embeddings: number[][]; tokens: number }> +} + +/** Sleep helper for retry backoff */ +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) + +/** Check if error is retryable (network issues) */ +function isRetryableError(err: unknown): boolean { + const code = (err as { code?: string })?.code + const msg = err instanceof Error ? err.message : '' + return ( + [ + 'ECONNRESET', + 'ETIMEDOUT', + 'ECONNREFUSED', + 'ENOTFOUND', + 'EAI_AGAIN', + ].includes(code ?? '') || + /socket|network|connection/i.test(msg) || + err instanceof TypeError + ) +} + +/** Normalize vector to unit length */ +function normalizeVector(vector: number[]): number[] { + const norm = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0)) + return norm === 0 ? 
vector : vector.map((v) => v / norm) +} + +/** + * Gemini embedding service with retry logic + */ +export class GeminiEmbeddingService implements EmbeddingService { + private apiKey: string + private model: string + private maxRetries: number + private dims: number + private baseUrl = 'https://generativelanguage.googleapis.com/v1beta/models' + + constructor( + apiKey: string, + model = 'gemini-embedding-001', + maxRetries = 5, + dims = 768, + ) { + this.apiKey = apiKey + this.model = model + this.maxRetries = maxRetries + this.dims = dims + } + + async embed( + texts: string[], + taskType: EmbeddingTaskType = 'RETRIEVAL_DOCUMENT', + ): Promise<{ embeddings: number[][]; tokens: number }> { + const isSingle = texts.length === 1 + const endpoint = isSingle ? 'embedContent' : 'batchEmbedContents' + const url = `${this.baseUrl}/${this.model}:${endpoint}?key=${this.apiKey}` + + const body = isSingle + ? { + model: `models/${this.model}`, + content: { parts: [{ text: texts[0] }] }, + taskType, + outputDimensionality: this.dims, + } + : { + requests: texts.map((text) => ({ + model: `models/${this.model}`, + content: { parts: [{ text }] }, + taskType, + outputDimensionality: this.dims, + })), + } + + let lastError: Error | null = null + + for (let attempt = 0; attempt < this.maxRetries; attempt++) { + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }) + + if (!response.ok) { + const { status, statusText } = response + const respBody = await response.text() + + if (status === 429 || status >= 500) { + const waitMs = 2 ** attempt * 1000 + Math.random() * 1000 + console.warn( + `[embedding] Gemini ${status} ${statusText}, retry in ${Math.round(waitMs)}ms (${attempt + 1}/${this.maxRetries})...`, + ) + await sleep(waitMs) + lastError = new Error(`Gemini ${status} ${statusText}: ${respBody}`) + continue + } + throw new Error(`Gemini ${status} ${statusText}: ${respBody}`) + } + + const data = await response.json() + let embeddings: number[][] = isSingle + ? [data.embedding?.values || []] + : data.embeddings?.map((e: { values: number[] }) => e.values) || [] + + // Normalize if not using native 3072 dimensions + if (this.dims !== 3072) { + embeddings = embeddings.map(normalizeVector) + } + + const tokens = texts.reduce( + (sum, t) => sum + Math.ceil(t.length / 4), + 0, + ) + return { embeddings, tokens } + } catch (err) { + if (isRetryableError(err) && attempt < this.maxRetries - 1) { + const waitMs = 2 ** attempt * 1000 + Math.random() * 1000 + console.warn( + `[embedding] Network error, retry in ${Math.round(waitMs)}ms (${attempt + 1}/${this.maxRetries})...`, + ) + await sleep(waitMs) + lastError = err instanceof Error ? err : new Error(String(err)) + continue + } + throw err + } + } + + throw lastError || new Error('Gemini embedding failed after retries') + } +} + +/** + * Indexed chunk with embedding + */ +interface IndexedChunk { + filepath: string + startLine: number + endLine: number + text: string + contextualizedText: string + embedding: number[] +} + +/** + * Cosine similarity between two vectors + */ +function cosineSimilarity(a: number[], b: number[]): number { + let dotProduct = 0 + let normA = 0 + let normB = 0 + + for (let i = 0; i < a.length; i++) { + dotProduct += a[i]! * b[i]! + normA += a[i]! * a[i]! + normB += b[i]! * b[i]! 
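A minimal usage sketch of the `GeminiEmbeddingService` defined here; the API key comes from the `GOOGLE_API_KEY` env var the runner checks, and the query text and logged values are placeholders.

```ts
// Hedged sketch: construct the Gemini embedding service and embed one query.
// Model name, retry count, and dimensions mirror the constructor defaults above.
const embedService = new GeminiEmbeddingService(
  process.env.GOOGLE_API_KEY ?? '', // runner warns and skips Agent2 if unset
  'gemini-embedding-001',
  5,   // maxRetries with exponential backoff + jitter
  768, // output dimensions; vectors are normalized unless 3072 is requested
)

const { embeddings, tokens } = await embedService.embed(
  ['where is the HTTP retry logic implemented?'],
  'CODE_RETRIEVAL_QUERY',
)
console.log(embeddings[0]?.length, tokens) // 768 dims; tokens are a chars/4 estimate
```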
+ } + + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) +} + +/** + * Cache metadata for serialized index + */ +interface IndexCacheMetadata { + version: number + embeddingProvider: string + embeddingDimensions: number + chunkSettings: { + maxChunkSize: number + overlapLines: number + } + createdAt: string + chunkCount: number + totalEmbedTokens: number + totalEmbedLatencyMs: number +} + +/** + * Semantic Search Index: indexes a repository and supports similarity search + * Supports caching to avoid re-embedding the same repo + */ +export class SemanticSearchIndex { + private chunks: IndexedChunk[] = [] + private embedService: EmbeddingService + private worktreePath: string + + // Metrics for fairness accounting + public indexLoadMs = 0 + public totalEmbedTokens = 0 + public totalEmbedLatencyMs = 0 + public lastQueryEmbedTokens = 0 + public lastQueryEmbedLatencyMs = 0 + + // Cache versioning (increment when chunk/embed format changes) + private static CACHE_VERSION = 1 + + constructor(worktreePath: string, embedService: EmbeddingService) { + this.worktreePath = worktreePath + this.embedService = embedService + } + + /** + * Generate cache key for an index + */ + static getCacheKey( + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + ): string { + return `${instanceId}_${embeddingProvider}_${embeddingDimensions}` + } + + /** + * Get cache file path + */ + static getCachePath(cacheDir: string, cacheKey: string): string { + return join(cacheDir, `${cacheKey}.json`) + } + + /** + * Check if a valid cache exists + */ + static cacheExists( + cacheDir: string, + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + ): boolean { + const cacheKey = SemanticSearchIndex.getCacheKey( + instanceId, + embeddingProvider, + embeddingDimensions, + ) + const cachePath = SemanticSearchIndex.getCachePath(cacheDir, cacheKey) + return existsSync(cachePath) + } + + /** + * Load index from cache + */ + static async loadFromCache( + cacheDir: string, + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + worktreePath: string, + embedService: EmbeddingService, + ): Promise { + const cacheKey = SemanticSearchIndex.getCacheKey( + instanceId, + embeddingProvider, + embeddingDimensions, + ) + const cachePath = SemanticSearchIndex.getCachePath(cacheDir, cacheKey) + + if (!existsSync(cachePath)) { + return null + } + + try { + const startTime = Date.now() + const data = JSON.parse(readFileSync(cachePath, 'utf-8')) + + // Validate cache version + if (data.metadata?.version !== SemanticSearchIndex.CACHE_VERSION) { + console.log(`[semantic-search] Cache version mismatch, will re-index`) + return null + } + + const index = new SemanticSearchIndex(worktreePath, embedService) + index.chunks = data.chunks + index.totalEmbedTokens = data.metadata.totalEmbedTokens || 0 + index.totalEmbedLatencyMs = data.metadata.totalEmbedLatencyMs || 0 + index.indexLoadMs = Date.now() - startTime + + console.log( + `[semantic-search] Loaded ${index.chunks.length} chunks from cache in ${index.indexLoadMs}ms`, + ) + + return index + } catch (err) { + console.warn(`[semantic-search] Failed to load cache: ${err}`) + return null + } + } + + /** + * Save index to cache + */ + async saveToCache( + cacheDir: string, + instanceId: string, + embeddingProvider: string, + embeddingDimensions: number, + ): Promise { + try { + // Ensure cache directory exists + if (!existsSync(cacheDir)) { + mkdirSync(cacheDir, { recursive: true }) + } + + const cacheKey = 
SemanticSearchIndex.getCacheKey( + instanceId, + embeddingProvider, + embeddingDimensions, + ) + const cachePath = SemanticSearchIndex.getCachePath(cacheDir, cacheKey) + + const metadata: IndexCacheMetadata = { + version: SemanticSearchIndex.CACHE_VERSION, + embeddingProvider, + embeddingDimensions, + chunkSettings: { + maxChunkSize: 1500, + overlapLines: 5, + }, + createdAt: new Date().toISOString(), + chunkCount: this.chunks.length, + totalEmbedTokens: this.totalEmbedTokens, + totalEmbedLatencyMs: this.totalEmbedLatencyMs, + } + + const data = { + metadata, + chunks: this.chunks, + } + + writeFileSync(cachePath, JSON.stringify(data)) + console.log(`[semantic-search] Saved index to cache: ${cachePath}`) + } catch (err) { + console.warn(`[semantic-search] Failed to save cache: ${err}`) + } + } + + /** + * Index files in the repository + */ + async index(filePaths: string[]): Promise { + const startTime = Date.now() + console.log(`[semantic-search] Indexing ${filePaths.length} files...`) + + // Chunk all files + const allChunks: { + filepath: string + text: string + contextualizedText: string + startLine: number + endLine: number + }[] = [] + + for (const filepath of filePaths) { + try { + const fullPath = join(this.worktreePath, filepath) + const content = readFileSync(fullPath, 'utf-8') + + const chunks = await codeChunk(filepath, content, { + maxChunkSize: 1500, + overlapLines: 5, + }) + + for (const c of chunks) { + allChunks.push({ + filepath, + text: c.text, + contextualizedText: c.contextualizedText, + startLine: c.lineRange.start, + endLine: c.lineRange.end, + }) + } + } catch (err) { + // Skip files that can't be chunked (binary, too large, etc.) + console.warn(`[semantic-search] Failed to chunk ${filepath}: ${err}`) + } + } + + console.log(`[semantic-search] Created ${allChunks.length} chunks`) + + // Batch embed chunks (using contextualizedText for better semantic matching) + const batchSize = 100 + for (let i = 0; i < allChunks.length; i += batchSize) { + const batch = allChunks.slice(i, i + batchSize) + const texts = batch.map((c) => c.contextualizedText) + + const embedStart = Date.now() + const { embeddings, tokens } = await this.embedService.embed( + texts, + 'RETRIEVAL_DOCUMENT', // Task type for indexing documents/code chunks + ) + const embedLatency = Date.now() - embedStart + + this.totalEmbedTokens += tokens + this.totalEmbedLatencyMs += embedLatency + + for (let j = 0; j < batch.length; j++) { + this.chunks.push({ + ...batch[j]!, + embedding: embeddings[j]!, + }) + } + + console.log( + `[semantic-search] Embedded batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(allChunks.length / batchSize)} (${tokens} tokens, ${embedLatency}ms)`, + ) + } + + this.indexLoadMs = Date.now() - startTime + console.log( + `[semantic-search] Indexing complete: ${this.chunks.length} chunks, ${this.totalEmbedTokens} tokens, ${this.indexLoadMs}ms`, + ) + } + + /** + * Search for relevant chunks given a query + */ + async search( + query: string, + topK = 10, + filters?: { filepathPattern?: string }, + ): Promise { + if (this.chunks.length === 0) { + return [] + } + + // Embed the query with CODE_RETRIEVAL_QUERY for optimal code search + const embedStart = Date.now() + const { embeddings, tokens } = await this.embedService.embed( + [query], + 'CODE_RETRIEVAL_QUERY', // Task type optimized for code retrieval queries + ) + const queryEmbedding = embeddings[0]! 
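Putting the cache and index pieces together, a hedged end-to-end sketch: load a cached index if one exists, otherwise build and cache it, then run a query. The cache directory, instance ID, worktree path, file list, and query string are all hypothetical.

```ts
// Sketch of the SemanticSearchIndex lifecycle (values are placeholders).
const embedService = new GeminiEmbeddingService(process.env.GOOGLE_API_KEY ?? '')
const cacheDir = './data/index_cache'
const instanceId = 'django__django-12345'
const worktree = '/tmp/worktrees/django__django-12345'

let index = await SemanticSearchIndex.loadFromCache(
  cacheDir, instanceId, 'gemini', 768, worktree, embedService,
)
if (!index) {
  index = new SemanticSearchIndex(worktree, embedService)
  await index.index(['django/db/models/query.py']) // normally every tracked source file
  await index.saveToCache(cacheDir, instanceId, 'gemini', 768)
}

const hits = await index.search('queryset union drops ordering', 10)
console.log(
  hits.map((h) => `${h.filepath}:${h.start_line}-${h.end_line} (${h.score.toFixed(3)})`),
)
```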
+ this.lastQueryEmbedLatencyMs = Date.now() - embedStart + this.lastQueryEmbedTokens = tokens + this.totalEmbedTokens += tokens + this.totalEmbedLatencyMs += this.lastQueryEmbedLatencyMs + + // Filter chunks if needed + let candidates = this.chunks + if (filters?.filepathPattern) { + const pattern = new RegExp(filters.filepathPattern) + candidates = candidates.filter((c) => pattern.test(c.filepath)) + } + + // Compute similarities + const scored = candidates.map((c) => ({ + chunk: c, + score: cosineSimilarity(queryEmbedding, c.embedding!), + })) + + // Sort by score descending + scored.sort((a, b) => b.score - a.score) + + // Return top-k results with absolute paths (SDK Read tool requires absolute paths) + const results = scored.slice(0, topK).map((s) => ({ + filepath: join(this.worktreePath, s.chunk.filepath), + start_line: s.chunk.startLine, + end_line: s.chunk.endLine, + score: s.score, + snippet: s.chunk.text.slice(0, 200), // Truncate for compactness + })) + + return results + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// MCP Server for semantic search tool +// ───────────────────────────────────────────────────────────────────────────── + +export interface SemanticSearchMetrics { + callCount: number + totalLatencyMs: number + totalQueryEmbedTokens: number + totalQueryEmbedLatencyMs: number +} + +export function createSemanticSearchMetrics(): SemanticSearchMetrics { + return { + callCount: 0, + totalLatencyMs: 0, + totalQueryEmbedTokens: 0, + totalQueryEmbedLatencyMs: 0, + } +} + +export function createSemanticSearchServer( + index: SemanticSearchIndex, + metrics: SemanticSearchMetrics, +) { + return createSdkMcpServer({ + name: 'semantic_search', + version: '1.0.0', + tools: [ + tool( + 'search', + 'Search the codebase for code semantically similar to the query. Returns ranked file locations with snippets.', + { + query: z + .string() + .describe( + "Natural language description of what code you're looking for", + ), + top_k: z + .number() + .min(1) + .max(50) + .default(10) + .describe('Number of results to return'), + filepath_pattern: z + .string() + .optional() + .describe('Optional regex pattern to filter file paths'), + }, + async (args) => { + const startTime = Date.now() + try { + const results = await index.search(args.query, args.top_k, { + filepathPattern: args.filepath_pattern, + }) + metrics.callCount++ + metrics.totalLatencyMs += Date.now() - startTime + metrics.totalQueryEmbedTokens += index.lastQueryEmbedTokens + metrics.totalQueryEmbedLatencyMs += index.lastQueryEmbedLatencyMs + + if (results.length === 0) { + return { + content: [ + { + type: 'text' as const, + text: 'No matching code found for the query.', + }, + ], + } + } + + const formatted = results.map((r, i) => { + const lines = + r.start_line && r.end_line + ? `L${r.start_line}-${r.end_line}` + : '' + return `${i + 1}. ${r.filepath}${lines ? `:${lines}` : ''} (score: ${r.score.toFixed(3)})\n ${r.snippet?.replace(/\n/g, '\n ')}` + }) + const fileList = results.map((r) => r.filepath).join('\n') + + return { + content: [ + { + type: 'text' as const, + text: `Found ${results.length} relevant code locations:\n\n${formatted.join('\n\n')}\n\n__FILES__\n${fileList}\n__END_FILES__`, + }, + ], + } + } catch (error) { + return { + content: [ + { + type: 'text' as const, + text: `Error during semantic search: ${error instanceof Error ? 
error.message : String(error)}`, + }, + ], + isError: true, + } + } + }, + ), + ], + }) +} diff --git a/packages/eval/src/swebench/agent/variants.ts b/packages/eval/src/swebench/agent/variants.ts new file mode 100644 index 0000000..29426fb --- /dev/null +++ b/packages/eval/src/swebench/agent/variants.ts @@ -0,0 +1,92 @@ +/** + * Agent variant definitions for retrieval-only evaluation + */ + +import type { AgentVariant } from '../types' + +/** + * Configuration for an agent variant + */ +export interface AgentConfig { + variant: AgentVariant + tools: string[] + allowedTools: string[] + maxTurns: number + maxToolCalls: number +} + +/** + * Agent1: Ops-only (Read/Grep/Glob) + */ +export const AGENT1_CONFIG: AgentConfig = { + variant: 'ops-only', + tools: ['Read', 'Grep', 'Glob'], + allowedTools: ['Read', 'Grep', 'Glob'], + maxTurns: 20, + maxToolCalls: 50, +} + +/** + * Agent2: Ops + Semantic Search + * Includes the custom semantic search tool exposed as MCP tool + */ +export const AGENT2_CONFIG: AgentConfig = { + variant: 'ops-plus-search', + tools: ['Read', 'Grep', 'Glob'], + allowedTools: [ + 'Read', + 'Grep', + 'Glob', + 'mcp__semantic_search__search', // Custom MCP tool + ], + maxTurns: 20, + maxToolCalls: 50, +} + +/** + * Tools that are explicitly denied (for logging/enforcement) + */ +export const DENIED_TOOLS = [ + 'Write', + 'Edit', + 'Bash', + 'WebFetch', + 'TodoRead', + 'TodoWrite', + 'NotebookRead', + 'NotebookEdit', +] + +/** + * Get agent config by variant + */ +export function getAgentConfig(variant: AgentVariant): AgentConfig { + return variant === 'ops-only' ? AGENT1_CONFIG : AGENT2_CONFIG +} + +/** + * Check if a tool is allowed for a variant + */ +export function isToolAllowed( + toolName: string, + config: AgentConfig, + toolCallCount: number, +): { allowed: boolean; reason?: string } { + // Check tool budget + if (toolCallCount >= config.maxToolCalls) { + return { allowed: false, reason: 'Tool budget exceeded' } + } + + // Check if tool is explicitly allowed + if (config.allowedTools.includes(toolName)) { + return { allowed: true } + } + + // Check if tool is explicitly denied + if (DENIED_TOOLS.includes(toolName)) { + return { allowed: false, reason: 'Tool is denied for retrieval-only mode' } + } + + // Default: deny unknown tools + return { allowed: false, reason: 'Tool not in allowlist' } +} diff --git a/packages/eval/src/swebench/aggregate.ts b/packages/eval/src/swebench/aggregate.ts new file mode 100644 index 0000000..b6abe86 --- /dev/null +++ b/packages/eval/src/swebench/aggregate.ts @@ -0,0 +1,312 @@ +/** + * Aggregate metrics across all instances for summary reporting + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname } from 'node:path' +import type { AgentVariant, AggregateSummary, InstanceMetrics } from './types' + +/** + * Load metrics from JSONL file + */ +export function loadMetrics(filePath: string): InstanceMetrics[] { + const content = readFileSync(filePath, 'utf-8') + return content + .trim() + .split('\n') + .filter(Boolean) + .map((line) => JSON.parse(line) as InstanceMetrics) +} + +/** + * Compute percentile + */ +function percentile(values: number[], p: number): number { + if (values.length === 0) return 0 + const sorted = [...values].sort((a, b) => a - b) + const idx = Math.ceil((p / 100) * sorted.length) - 1 + return sorted[Math.max(0, idx)]! 
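The agent configs and the `isToolAllowed` gate in `variants.ts` can be exercised as below; the import path assumes a caller inside `src/swebench/`, and the call counts are illustrative.

```ts
// Hedged sketch of the tool-gating helpers from variants.ts.
import { getAgentConfig, isToolAllowed } from './agent/variants'

const config = getAgentConfig('ops-plus-search')

// Within budget and on the allowlist -> permitted.
console.log(isToolAllowed('mcp__semantic_search__search', config, 3))
// { allowed: true }

// Explicitly denied for retrieval-only mode.
console.log(isToolAllowed('Bash', config, 3))
// { allowed: false, reason: 'Tool is denied for retrieval-only mode' }

// Budget exhausted (maxToolCalls defaults to 50 above).
console.log(isToolAllowed('Read', config, 50))
// { allowed: false, reason: 'Tool budget exceeded' }
```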
+} + +/** + * Compute mean + */ +function mean(values: number[]): number { + if (values.length === 0) return 0 + return values.reduce((a, b) => a + b, 0) / values.length +} + +/** + * Compute median + */ +function median(values: number[]): number { + return percentile(values, 50) +} + +/** + * Filter values, removing nulls + */ +function filterNulls(values: (number | null)[]): number[] { + return values.filter((v): v is number => v !== null) +} + +/** + * Sum helper + */ +function sum(values: number[]): number { + return values.reduce((a, b) => a + b, 0) +} + +/** + * Aggregate metrics for a single agent variant + */ +function aggregateForVariant( + metrics: InstanceMetrics[], + variant: AgentVariant, +): AggregateSummary['agent_summaries'][0] { + const variantMetrics = metrics.filter((m) => m.agent_variant === variant) + const n = variantMetrics.length + + if (n === 0) { + return { + agent_variant: variant, + hit_at_1_rate: 0, + hit_at_3_rate: 0, + hit_at_5_rate: 0, + hit_at_10_rate: 0, + mean_mrr: 0, + mean_coverage_at_5: 0, + mean_coverage_at_10: 0, + total_duration_ms: 0, + total_tokens: 0, + total_input_tokens: 0, + total_output_tokens: 0, + total_cost_usd: 0, + total_tool_calls: 0, + median_duration_ms: 0, + p90_duration_ms: 0, + median_tokens: 0, + p90_tokens: 0, + median_time_to_first_hit_ms: null, + median_tokens_to_first_hit: null, + } + } + + // Hit rates (using behavioral ranking) + const hit1 = variantMetrics.filter((m) => m.hit_at_1_behavioral).length / n + const hit3 = variantMetrics.filter((m) => m.hit_at_3_behavioral).length / n + const hit5 = variantMetrics.filter((m) => m.hit_at_5_behavioral).length / n + const hit10 = variantMetrics.filter((m) => m.hit_at_10_behavioral).length / n + + // MRR and coverage + const mrrValues = variantMetrics.map((m) => m.mrr_behavioral) + const cov5Values = variantMetrics.map((m) => m.coverage_at_5_behavioral) + const cov10Values = variantMetrics.map((m) => m.coverage_at_10_behavioral) + + // Duration and tokens + const durations = variantMetrics.map((m) => m.total_duration_ms) + const inputTokens = variantMetrics.map((m) => m.total_input_tokens) + const outputTokens = variantMetrics.map((m) => m.total_output_tokens) + const tokens = variantMetrics.map( + (m) => m.total_input_tokens + m.total_output_tokens, + ) + const costs = variantMetrics.map((m) => m.total_cost_usd) + const toolCalls = variantMetrics.map((m) => m.tool_calls_count) + + // Time/tokens to first hit + const timesToHit = filterNulls( + variantMetrics.map((m) => m.time_to_first_hit_ms), + ) + const tokensToHit = filterNulls( + variantMetrics.map((m) => m.tokens_to_first_hit), + ) + + return { + agent_variant: variant, + hit_at_1_rate: hit1, + hit_at_3_rate: hit3, + hit_at_5_rate: hit5, + hit_at_10_rate: hit10, + mean_mrr: mean(mrrValues), + mean_coverage_at_5: mean(cov5Values), + mean_coverage_at_10: mean(cov10Values), + // Totals + total_duration_ms: sum(durations), + total_tokens: sum(tokens), + total_input_tokens: sum(inputTokens), + total_output_tokens: sum(outputTokens), + total_cost_usd: sum(costs), + total_tool_calls: sum(toolCalls), + // Medians + median_duration_ms: median(durations), + p90_duration_ms: percentile(durations, 90), + median_tokens: median(tokens), + p90_tokens: percentile(tokens, 90), + median_time_to_first_hit_ms: + timesToHit.length > 0 ? median(timesToHit) : null, + median_tokens_to_first_hit: + tokensToHit.length > 0 ? 
median(tokensToHit) : null, + } +} + +/** + * Aggregate all metrics into a summary + */ +export function aggregateMetrics( + metrics: InstanceMetrics[], + split: string, +): AggregateSummary { + const opsOnly = aggregateForVariant(metrics, 'ops-only') + const opsPlusSearch = aggregateForVariant(metrics, 'ops-plus-search') + + // Compute deltas (Agent2 - Agent1) - use totals for cost comparison + const delta = { + hit_at_1_delta: opsPlusSearch.hit_at_1_rate - opsOnly.hit_at_1_rate, + hit_at_3_delta: opsPlusSearch.hit_at_3_rate - opsOnly.hit_at_3_rate, + hit_at_5_delta: opsPlusSearch.hit_at_5_rate - opsOnly.hit_at_5_rate, + mrr_delta: opsPlusSearch.mean_mrr - opsOnly.mean_mrr, + duration_ms_delta: + opsPlusSearch.total_duration_ms - opsOnly.total_duration_ms, + tokens_delta: opsPlusSearch.total_tokens - opsOnly.total_tokens, + cost_usd_delta: opsPlusSearch.total_cost_usd - opsOnly.total_cost_usd, + } + + return { + split, + total_instances: new Set(metrics.map((m) => m.instance_id)).size, + agent_summaries: [opsOnly, opsPlusSearch], + delta, + } +} + +/** + * Write summary to JSON file + */ +export function writeSummary( + filePath: string, + summary: AggregateSummary, +): void { + const dir = dirname(filePath) + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }) + } + + writeFileSync(filePath, JSON.stringify(summary, null, 2)) +} + +/** + * Format duration nicely + */ +function formatDuration(ms: number): string { + if (ms < 1000) return `${ms.toFixed(0)}ms` + if (ms < 60000) return `${(ms / 1000).toFixed(1)}s` + return `${(ms / 60000).toFixed(1)}m` +} + +/** + * Format token count nicely + */ +function formatTokens(tokens: number): string { + if (tokens < 1000) return `${tokens}` + if (tokens < 1000000) return `${(tokens / 1000).toFixed(1)}k` + return `${(tokens / 1000000).toFixed(2)}M` +} + +/** + * Print summary to console + */ +export function printSummary(summary: AggregateSummary): void { + const W = 70 // Total box width + const line = '='.repeat(W - 2) + + console.log(`\n+${line}+`) + console.log( + '|' + + ` SWE-bench Lite Retrieval Evaluation Summary (${summary.split})`.padEnd( + W - 2, + ) + + '|', + ) + console.log(`+${line}+`) + console.log( + `|${` Total instances: ${summary.total_instances}`.padEnd(W - 2)}|`, + ) + console.log(`+${line}+\n`) + + for (const agent of summary.agent_summaries) { + const title = ` ${agent.agent_variant.toUpperCase()} ` + const titlePad = Math.floor((W - 2 - title.length) / 2) + const header = + '='.repeat(titlePad) + title + '='.repeat(W - 2 - titlePad - title.length) + + console.log(`+${header}+`) + + // Quality metrics + console.log(`${'| QUALITY'.padEnd(W - 1)}|`) + const h1 = `${(agent.hit_at_1_rate * 100).toFixed(1)}%`.padStart(6) + const h3 = `${(agent.hit_at_3_rate * 100).toFixed(1)}%`.padStart(6) + const h5 = `${(agent.hit_at_5_rate * 100).toFixed(1)}%`.padStart(6) + const h10 = `${(agent.hit_at_10_rate * 100).toFixed(1)}%`.padStart(6) + console.log( + `| Hit@1:${h1} Hit@3:${h3} Hit@5:${h5} Hit@10:${h10}`.padEnd(W - 1) + + '|', + ) + console.log( + `${`| MRR: ${agent.mean_mrr.toFixed(3)} Coverage@5: ${(agent.mean_coverage_at_5 * 100).toFixed(1)}% Coverage@10: ${(agent.mean_coverage_at_10 * 100).toFixed(1)}%`.padEnd( + W - 1, + )}|`, + ) + + // Totals + console.log(`${'| TOTALS'.padEnd(W - 1)}|`) + const dur = formatDuration(agent.total_duration_ms).padStart(8) + const tok = formatTokens(agent.total_tokens).padStart(7) + const tokIn = formatTokens(agent.total_input_tokens) + const tokOut = formatTokens(agent.total_output_tokens) 
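End to end, the aggregation module turns a per-instance `metrics.jsonl` artifact into a summary report. A hedged sketch follows; the run directory path is hypothetical and the import path assumes a caller in `src/`.

```ts
// Hedged sketch: aggregate a metrics.jsonl run artifact and report it.
import {
  aggregateMetrics,
  loadMetrics,
  printSummary,
  writeSummary,
} from './swebench/aggregate'

const metrics = loadMetrics('./runs/2025-01-01/metrics.jsonl') // one JSON object per line
const summary = aggregateMetrics(metrics, 'test')              // split label for the report

writeSummary('./runs/2025-01-01/summary.json', summary)
printSummary(summary) // boxed console report, incl. the ops-plus-search minus ops-only delta
```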
+ console.log( + `${`| Duration:${dur} Tokens:${tok} (in: ${tokIn}, out: ${tokOut})`.padEnd( + W - 1, + )}|`, + ) + const cost = agent.total_cost_usd.toFixed(4).padStart(8) + console.log( + `${`| Cost: $${cost} Tool calls: ${agent.total_tool_calls}`.padEnd( + W - 1, + )}|`, + ) + + // First hit metrics + if (agent.median_time_to_first_hit_ms !== null) { + console.log(`${'| FIRST HIT'.padEnd(W - 1)}|`) + console.log( + `${`| Time: ${formatDuration(agent.median_time_to_first_hit_ms)} Tokens: ${agent.median_tokens_to_first_hit ?? 'N/A'}`.padEnd( + W - 1, + )}|`, + ) + } + + console.log(`+${line}+\n`) + } + + // Delta comparison + console.log(`+${'-'.repeat(W - 2)}+`) + console.log(`|${' DELTA (ops-plus-search minus ops-only)'.padEnd(W - 2)}|`) + console.log(`+${'-'.repeat(W - 2)}+`) + const d1 = `${(summary.delta.hit_at_1_delta * 100) >= 0 ? '+' : ''}${(summary.delta.hit_at_1_delta * 100).toFixed(1)}%` + const dMrr = `${summary.delta.mrr_delta >= 0 ? '+' : ''}${summary.delta.mrr_delta.toFixed(3)}` + console.log( + `${`| Quality: Hit@1 ${d1.padStart(7)} MRR ${dMrr.padStart(7)}`.padEnd( + W - 1, + )}|`, + ) + const dTok = `${summary.delta.tokens_delta >= 0 ? '+' : ''}${formatTokens(summary.delta.tokens_delta)}` + const dCost = `${summary.delta.cost_usd_delta >= 0 ? '+' : ''}$${summary.delta.cost_usd_delta.toFixed(4)}` + console.log( + `${`| Cost: ${dTok.padStart(8)} tokens ${dCost.padStart(10)}`.padEnd( + W - 1, + )}|`, + ) + const dDur = `${summary.delta.duration_ms_delta >= 0 ? '+' : ''}${formatDuration(summary.delta.duration_ms_delta)}` + console.log(`${`| Duration: ${dDur.padStart(8)}`.padEnd(W - 1)}|`) + console.log(`+${'-'.repeat(W - 2)}+\n`) +} diff --git a/packages/eval/src/swebench/dataset.ts b/packages/eval/src/swebench/dataset.ts new file mode 100644 index 0000000..85e6673 --- /dev/null +++ b/packages/eval/src/swebench/dataset.ts @@ -0,0 +1,154 @@ +/** + * SWE-bench Lite dataset loader with caching and retry logic + * Fetches from Hugging Face Dataset Viewer /rows endpoint + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join } from 'node:path' +import type { SWEbenchInstance } from './types' + +const HF_DATASET = 'princeton-nlp/SWE-bench_Lite' +const HF_API_BASE = 'https://datasets-server.huggingface.co' +const PAGE_SIZE = 100 + +// Cache directory (relative to package root) +const CACHE_DIR = join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'data', + 'swebench_lite', +) + +/** + * Ensure cache directory exists + */ +function ensureCacheDir(): void { + if (!existsSync(CACHE_DIR)) { + mkdirSync(CACHE_DIR, { recursive: true }) + } +} + +/** + * Get cache file path for a page + */ +function getCachePath(split: string, offset: number): string { + return join(CACHE_DIR, `${split}_offset${offset}_limit${PAGE_SIZE}.json`) +} + +/** + * Sleep helper for retry backoff + */ +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +/** + * Fetch a page from HF Dataset Viewer with retry + exponential backoff + */ +async function fetchPage( + split: string, + offset: number, + maxRetries = 3, +): Promise<{ rows: SWEbenchInstance[]; total: number }> { + const url = new URL(`${HF_API_BASE}/rows`) + url.searchParams.set('dataset', HF_DATASET) + url.searchParams.set('config', 'default') + url.searchParams.set('split', split) + url.searchParams.set('offset', String(offset)) + url.searchParams.set('length', String(PAGE_SIZE)) + + for (let attempt = 0; attempt < 
maxRetries; attempt++) { + try { + const response = await fetch(url.toString()) + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`) + } + const data = await response.json() + + // HF returns { features, rows: [{row_idx, row: {...}}], num_rows_total } + const rows: SWEbenchInstance[] = data.rows.map((r: any) => ({ + instance_id: r.row.instance_id, + repo: r.row.repo, + base_commit: r.row.base_commit, + problem_statement: r.row.problem_statement, + patch: r.row.patch, + test_patch: r.row.test_patch, + })) + + return { rows, total: data.num_rows_total } + } catch (err) { + const waitMs = 2 ** attempt * 1000 + console.warn( + `[dataset] Fetch failed (attempt ${attempt + 1}/${maxRetries}): ${err}. Retrying in ${waitMs}ms...`, + ) + await sleep(waitMs) + } + } + + throw new Error( + `[dataset] Failed to fetch page after ${maxRetries} attempts: split=${split}, offset=${offset}`, + ) +} + +/** + * Load a page from cache or fetch from HF + */ +async function loadPage( + split: string, + offset: number, +): Promise<{ rows: SWEbenchInstance[]; total: number }> { + ensureCacheDir() + const cachePath = getCachePath(split, offset) + + // Check cache first + if (existsSync(cachePath)) { + try { + const cached = JSON.parse(readFileSync(cachePath, 'utf-8')) + return cached + } catch { + // Cache corrupted, refetch + } + } + + // Fetch from HF + const result = await fetchPage(split, offset) + + // Write to cache + writeFileSync(cachePath, JSON.stringify(result, null, 2)) + + return result +} + +/** + * Load all instances from a split + */ +export async function loadSWEbenchLite( + split: 'dev' | 'test' = 'test', + maxInstances?: number, +): Promise { + const instances: SWEbenchInstance[] = [] + let offset = 0 + let total = Infinity + + console.log(`[dataset] Loading SWE-bench Lite split="${split}"...`) + + while (offset < total) { + const page = await loadPage(split, offset) + total = page.total + instances.push(...page.rows) + console.log( + `[dataset] Loaded ${instances.length}/${total} instances (offset=${offset})`, + ) + + if (maxInstances && instances.length >= maxInstances) { + break + } + + offset += PAGE_SIZE + } + + const result = maxInstances ? 
instances.slice(0, maxInstances) : instances + console.log( + `[dataset] Loaded ${result.length} instances from split="${split}"`, + ) + return result +} diff --git a/packages/eval/src/swebench/git.ts b/packages/eval/src/swebench/git.ts new file mode 100644 index 0000000..f20e723 --- /dev/null +++ b/packages/eval/src/swebench/git.ts @@ -0,0 +1,204 @@ +/** + * Git repository manager: bare clones + worktrees for reproducible checkout + */ + +import { spawnSync } from 'node:child_process' +import { existsSync, mkdirSync, rmSync } from 'node:fs' +import { dirname, join } from 'node:path' +import type { WorktreeInfo } from './types' + +// Cache directory for bare clones (relative to package root) +const REPOS_CACHE_DIR = join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'data', + 'repos', +) + +// Worktrees directory +const WORKTREES_DIR = join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'data', + 'worktrees', +) + +/** + * Ensure directory exists + */ +function ensureDir(dir: string): void { + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }) + } +} + +/** + * Run a git command and return stdout + */ +function git(args: string[], cwd?: string): string { + const result = spawnSync('git', args, { + cwd, + encoding: 'utf-8', + maxBuffer: 50 * 1024 * 1024, // 50MB + }) + + if (result.status !== 0) { + throw new Error( + `git ${args.join(' ')} failed: ${result.stderr || result.stdout}`, + ) + } + + return result.stdout.trim() +} + +/** + * Get the bare clone path for a repo + * e.g. "django/django" -> "/path/to/repos/django__django.git" + */ +function getBareClonePath(repo: string): string { + const safeName = `${repo.replace(/\//g, '__')}.git` + return join(REPOS_CACHE_DIR, safeName) +} + +/** + * Get the worktree path for an instance + */ +function getWorktreePath(instanceId: string): string { + const safeName = instanceId.replace(/[^a-zA-Z0-9_-]/g, '_') + return join(WORKTREES_DIR, safeName) +} + +/** + * Ensure a bare clone exists for a repo, or create/update it + */ +async function ensureBareClone(repo: string): Promise { + ensureDir(REPOS_CACHE_DIR) + const barePath = getBareClonePath(repo) + + if (existsSync(barePath)) { + // Fetch latest + console.log(`[git] Fetching updates for ${repo}...`) + try { + git(['fetch', '--all', '--prune'], barePath) + } catch (err) { + console.warn(`[git] Fetch failed, will continue with existing: ${err}`) + } + } else { + // Clone bare + const url = `https://github.com/${repo}.git` + console.log(`[git] Cloning bare ${repo} from ${url}...`) + git(['clone', '--bare', url, barePath]) + } + + return barePath +} + +/** + * Create a worktree at a specific commit + */ +export async function createWorktree( + repo: string, + commit: string, + instanceId: string, +): Promise { + const startTime = Date.now() + + // Ensure bare clone exists + const barePath = await ensureBareClone(repo) + + // Get worktree path + ensureDir(WORKTREES_DIR) + const worktreePath = getWorktreePath(instanceId) + + // Remove existing worktree if it exists + if (existsSync(worktreePath)) { + console.log(`[git] Removing existing worktree at ${worktreePath}...`) + try { + git(['worktree', 'remove', '--force', worktreePath], barePath) + } catch { + // Force remove directory if git worktree remove fails + rmSync(worktreePath, { recursive: true, force: true }) + } + } + + // Create worktree + console.log( + `[git] Creating worktree for ${instanceId} at commit ${commit}...`, + ) + git(['worktree', 'add', '--detach', 
worktreePath, commit], barePath) + + // Verify the commit + const resolvedCommit = git(['rev-parse', 'HEAD'], worktreePath) + + const checkoutMs = Date.now() - startTime + console.log( + `[git] Worktree created at ${worktreePath} (commit: ${resolvedCommit}, took ${checkoutMs}ms)`, + ) + + return { + path: worktreePath, + commit: resolvedCommit, + checkout_ms: checkoutMs, + } +} + +/** + * Remove a worktree + */ +export async function removeWorktree( + repo: string, + instanceId: string, +): Promise { + const barePath = getBareClonePath(repo) + const worktreePath = getWorktreePath(instanceId) + + if (!existsSync(worktreePath)) { + return + } + + console.log(`[git] Removing worktree at ${worktreePath}...`) + try { + git(['worktree', 'remove', '--force', worktreePath], barePath) + } catch { + // Force remove directory if git worktree remove fails + rmSync(worktreePath, { recursive: true, force: true }) + } +} + +/** + * List all files in a worktree (for indexing) + */ +export function listFiles( + worktreePath: string, + extensions?: string[], +): string[] { + let files: string[] + + try { + // Use git ls-files for tracked files + const output = git(['ls-files'], worktreePath) + files = output.split('\n').filter(Boolean) + } catch { + // Fallback: use find + const result = spawnSync('find', ['.', '-type', 'f', '-name', '*.*'], { + cwd: worktreePath, + encoding: 'utf-8', + maxBuffer: 50 * 1024 * 1024, + }) + files = result.stdout + .split('\n') + .filter(Boolean) + .map((f) => f.replace(/^\.\//, '')) + } + + // Filter by extensions if provided + if (extensions && extensions.length > 0) { + const extSet = new Set( + extensions.map((e) => (e.startsWith('.') ? e : `.${e}`)), + ) + files = files.filter((f) => { + const ext = f.slice(f.lastIndexOf('.')) + return extSet.has(ext) + }) + } + + return files +} diff --git a/packages/eval/src/swebench/observe/instrumentation.ts b/packages/eval/src/swebench/observe/instrumentation.ts new file mode 100644 index 0000000..0437874 --- /dev/null +++ b/packages/eval/src/swebench/observe/instrumentation.ts @@ -0,0 +1,494 @@ +/** + * Instrumentation layer: hooks + message parsing + ranking + usage dedupe + */ + +import { appendFileSync, existsSync, mkdirSync } from 'node:fs' +import { dirname } from 'node:path' +import { matchesOracle, normalizePath } from '../score' +import type { AgentVariant, Event, OracleFiles } from '../types' + +// ───────────────────────────────────────────────────────────────────────────── +// Event Writer (merged from events.ts) +// ───────────────────────────────────────────────────────────────────────────── + +export class EventWriter { + private filePath: string + + constructor(filePath: string) { + this.filePath = filePath + const dir = dirname(filePath) + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) + } + + private write(event: Event): void { + appendFileSync(this.filePath, `${JSON.stringify(event)}\n`) + } + + writeSessionStart( + instanceId: string, + agentVariant: AgentVariant, + model: string, + cwd: string, + allowedTools: string[], + ): void { + this.write({ + type: 'session_start', + timestamp: Date.now(), + instance_id: instanceId, + agent_variant: agentVariant, + model, + cwd, + allowed_tools: allowedTools, + }) + } + + writeSessionEnd( + instanceId: string, + agentVariant: AgentVariant, + durationMs: number, + totalCostUsd: number, + usage: { + input_tokens: number + output_tokens: number + cache_read_input_tokens?: number + }, + topFilesFinal: string[], + rankedFilesFromTools: string[], + ): void { + 
this.write({ + type: 'session_end', + timestamp: Date.now(), + instance_id: instanceId, + agent_variant: agentVariant, + duration_ms: durationMs, + total_cost_usd: totalCostUsd, + usage, + top_files_final: topFilesFinal, + ranked_files_from_tools: rankedFilesFromTools, + }) + } + + writeToolCallStart( + toolName: string, + toolUseId: string, + input: unknown, + ): void { + this.write({ + type: 'tool_call', + timestamp: Date.now(), + tool_name: toolName, + tool_use_id: toolUseId, + input, + }) + } + + writeToolCallEnd( + toolName: string, + toolUseId: string, + input: unknown, + output: unknown, + latencyMs: number, + outputChars: number, + ): void { + this.write({ + type: 'tool_call', + timestamp: Date.now(), + tool_name: toolName, + tool_use_id: toolUseId, + input, + output, + latency_ms: latencyMs, + output_chars: outputChars, + }) + } + + writeToolCallError( + toolName: string, + toolUseId: string, + input: unknown, + error: string, + latencyMs: number, + ): void { + this.write({ + type: 'tool_call', + timestamp: Date.now(), + tool_name: toolName, + tool_use_id: toolUseId, + input, + error, + latency_ms: latencyMs, + }) + } + + writeUsage( + messageId: string, + inputTokens: number, + outputTokens: number, + cacheReadInputTokens?: number, + cacheCreationInputTokens?: number, + ): void { + this.write({ + type: 'usage', + timestamp: Date.now(), + message_id: messageId, + input_tokens: inputTokens, + output_tokens: outputTokens, + cache_read_input_tokens: cacheReadInputTokens, + cache_creation_input_tokens: cacheCreationInputTokens, + }) + } +} + +/** + * Accumulated usage (deduplicated by message ID) + */ +interface AccumulatedUsage { + input_tokens: number + output_tokens: number + cache_read_input_tokens: number + cache_creation_input_tokens: number +} + +/** + * Tool call timing for latency tracking + */ +interface ToolCallTiming { + startTime: number + toolName: string + input: unknown +} + +/** + * Run context for a single agent run + */ +export interface RunContext { + instanceId: string + agentVariant: AgentVariant + worktreePath: string + oracle: OracleFiles + eventWriter: EventWriter + + // State + startTime: number + toolCallCount: number + toolCallTimings: Map<string, ToolCallTiming> // toolUseId -> timing + seenMessageIds: Set<string> + accumulatedUsage: AccumulatedUsage + + // Ranked file extraction + rankedFilesFromTools: string[] // Behavioral: first-seen order from tools + seenFilePaths: Set<string> + + // Tool output size accounting + toolOutputCharsByType: Record<string, number> + + // First hit tracking + firstHitTime: number | null + firstHitTokens: number | null + toolUseIdToMessageId: Map<string, string> // For attribution +} + +/** + * Create a new run context + */ +export function createRunContext( + instanceId: string, + agentVariant: AgentVariant, + worktreePath: string, + oracle: OracleFiles, + eventWriter: EventWriter, +): RunContext { + return { + instanceId, + agentVariant, + worktreePath, + oracle, + eventWriter, + startTime: Date.now(), + toolCallCount: 0, + toolCallTimings: new Map(), + seenMessageIds: new Set(), + accumulatedUsage: { + input_tokens: 0, + output_tokens: 0, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 0, + }, + rankedFilesFromTools: [], + seenFilePaths: new Set(), + toolOutputCharsByType: {}, + firstHitTime: null, + firstHitTokens: null, + toolUseIdToMessageId: new Map(), + } +} + +/** + * Extract file paths from tool output + */ +function extractFilePathsFromToolOutput( + toolName: string, + input: unknown, + output: unknown, +): string[] { + const paths: string[] = [] + + // Read tool:
file_path in input + if (toolName === 'Read' && typeof input === 'object' && input !== null) { + const readInput = input as { file_path?: string } + if (readInput.file_path) { + paths.push(readInput.file_path) + } + } + + // Grep tool: parse output for file paths + if (toolName === 'Grep' && typeof output === 'string') { + // Grep output format: "filepath:line:content" or just filepaths + const lines = output.split('\n') + for (const line of lines) { + const match = line.match(/^([^:]+):/) + if (match?.[1]) { + const candidate = match[1].trim() + // Skip if it's just a number (line number) or doesn't look like a path + if (/^\d+$/.test(candidate)) continue + // Must contain a path separator or file extension + if (candidate.includes('/') || candidate.includes('.')) { + paths.push(candidate) + } + } + } + } + + // Glob tool: output is typically a list of paths + if (toolName === 'Glob') { + if (typeof output === 'string') { + const lines = output.split('\n').filter(Boolean) + paths.push(...lines) + } else if (Array.isArray(output)) { + paths.push(...output.filter((p) => typeof p === 'string')) + } + } + + // Semantic search: results contain filepath + // Output can be a string, or MCP format: [{type:"text", text:"..."}] + if (toolName === 'mcp__semantic_search__search') { + let textContent = '' + + if (typeof output === 'string') { + textContent = output + } else if (Array.isArray(output)) { + // MCP response format: [{type: "text", text: "..."}] + for (const item of output) { + if (item && typeof item === 'object' && 'text' in item) { + textContent += `${(item as { text: string }).text}\n` + } + } + } else if (output && typeof output === 'object' && 'text' in output) { + textContent = (output as { text: string }).text + } + + if (textContent) { + // Extract file paths from structured __FILES__ block (added by semantic_search_tool.ts) + const filesMatch = textContent.match( + /__FILES__\n([\s\S]*?)\n__END_FILES__/, + ) + if (filesMatch?.[1]) { + const files = filesMatch[1].split('\n').filter(Boolean) + paths.push(...files) + } + } + } + + return paths +} + +/** + * Strip the worktree prefix from an absolute path to get repo-relative path + */ +function stripWorktreePrefix(path: string, worktreePath: string): string { + // Normalize both paths for comparison + const normalizedWorktree = `${worktreePath.replace(/\/+$/, '')}/` + + if (path.startsWith(normalizedWorktree)) { + return path.slice(normalizedWorktree.length) + } + + // Also handle URL-decoded paths (spaces vs %20) + const decodedPath = decodeURIComponent(path) + if (decodedPath.startsWith(normalizedWorktree)) { + return decodedPath.slice(normalizedWorktree.length) + } + + return path +} + +/** + * Record file paths from tool output and check for oracle hits + */ +export function recordFilePathsFromTool( + ctx: RunContext, + toolName: string, + _toolUseId: string, + input: unknown, + output: unknown, +): void { + const paths = extractFilePathsFromToolOutput(toolName, input, output) + + for (const rawPath of paths) { + // First strip the worktree prefix to get repo-relative path + const relativePath = stripWorktreePrefix(rawPath, ctx.worktreePath) + const normalized = normalizePath(relativePath) + if (!ctx.seenFilePaths.has(normalized)) { + ctx.seenFilePaths.add(normalized) + ctx.rankedFilesFromTools.push(normalized) + + // Check for first hit + if (ctx.firstHitTime === null && matchesOracle(normalized, ctx.oracle)) { + ctx.firstHitTime = Date.now() - ctx.startTime + // Tokens to first hit: sum usage up to the message that spawned 
this tool + // We can't perfectly compute this without tracking per-message, so we use accumulated + ctx.firstHitTokens = + ctx.accumulatedUsage.input_tokens + ctx.accumulatedUsage.output_tokens + } + } + } +} + +/** + * Pre-tool-use hook handler + */ +export function onPreToolUse( + ctx: RunContext, + toolName: string, + toolUseId: string, + input: unknown, +): void { + ctx.toolCallCount++ + ctx.toolCallTimings.set(toolUseId, { + startTime: Date.now(), + toolName, + input, + }) + + ctx.eventWriter.writeToolCallStart(toolName, toolUseId, input) +} + +/** + * Post-tool-use hook handler + */ +export function onPostToolUse( + ctx: RunContext, + toolName: string, + toolUseId: string, + input: unknown, + output: unknown, +): void { + const timing = ctx.toolCallTimings.get(toolUseId) + const latencyMs = timing ? Date.now() - timing.startTime : 0 + + // Serialize output for size accounting + const outputStr = typeof output === 'string' ? output : JSON.stringify(output) + const outputChars = outputStr.length + + // Accumulate tool output chars by type + ctx.toolOutputCharsByType[toolName] = + (ctx.toolOutputCharsByType[toolName] || 0) + outputChars + + ctx.eventWriter.writeToolCallEnd( + toolName, + toolUseId, + input, + output, + latencyMs, + outputChars, + ) + + // Extract and record file paths + recordFilePathsFromTool(ctx, toolName, toolUseId, input, output) + + ctx.toolCallTimings.delete(toolUseId) +} + +/** + * Post-tool-use-failure hook handler + */ +export function onPostToolUseFailure( + ctx: RunContext, + toolName: string, + toolUseId: string, + input: unknown, + error: string, +): void { + const timing = ctx.toolCallTimings.get(toolUseId) + const latencyMs = timing ? Date.now() - timing.startTime : 0 + + ctx.eventWriter.writeToolCallError( + toolName, + toolUseId, + input, + error, + latencyMs, + ) + ctx.toolCallTimings.delete(toolUseId) +} + +/** + * Process an assistant message to extract tool_use blocks and usage + */ +export function processAssistantMessage( + ctx: RunContext, + message: { + id?: string + content?: unknown + usage?: { + input_tokens?: number + output_tokens?: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number + } + }, +): void { + const messageId = message.id + if (!messageId) return + + // Dedupe usage by message ID + if (!ctx.seenMessageIds.has(messageId)) { + ctx.seenMessageIds.add(messageId) + + if (message.usage) { + const usage = message.usage + ctx.accumulatedUsage.input_tokens += usage.input_tokens || 0 + ctx.accumulatedUsage.output_tokens += usage.output_tokens || 0 + ctx.accumulatedUsage.cache_read_input_tokens += + usage.cache_read_input_tokens || 0 + ctx.accumulatedUsage.cache_creation_input_tokens += + usage.cache_creation_input_tokens || 0 + + ctx.eventWriter.writeUsage( + messageId, + usage.input_tokens || 0, + usage.output_tokens || 0, + usage.cache_read_input_tokens, + usage.cache_creation_input_tokens, + ) + } + } + + // Build toolUseId -> messageId mapping from content blocks + if (Array.isArray(message.content)) { + for (const block of message.content) { + if (block && typeof block === 'object' && 'type' in block) { + if (block.type === 'tool_use' && 'id' in block) { + ctx.toolUseIdToMessageId.set(block.id as string, messageId) + } + } + } + } +} + +/** + * Get total tool output chars + */ +export function getTotalToolOutputChars(ctx: RunContext): number { + return Object.values(ctx.toolOutputCharsByType).reduce((a, b) => a + b, 0) +} diff --git a/packages/eval/src/swebench/run.ts b/packages/eval/src/swebench/run.ts 
new file mode 100644 index 0000000..b5db6c2 --- /dev/null +++ b/packages/eval/src/swebench/run.ts @@ -0,0 +1,669 @@ +/** + * SWE-bench Lite retrieval-only evaluation runner + * Main runner loop that orchestrates the evaluation + */ + +import { mkdirSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { query } from '@anthropic-ai/claude-agent-sdk' +import { + createUserPrompt, + parseTopFiles, + RETRIEVAL_ONLY_SYSTEM_PROMPT, + RETRIEVAL_WITH_SEARCH_SYSTEM_PROMPT, +} from './agent/prompts' +import { + createSemanticSearchMetrics, + createSemanticSearchServer, + GeminiEmbeddingService, + SemanticSearchIndex, +} from './agent/semantic_search_adapter' +import { getAgentConfig } from './agent/variants' +import { aggregateMetrics, printSummary, writeSummary } from './aggregate' +import { loadSWEbenchLite } from './dataset' +import { createWorktree, listFiles, removeWorktree } from './git' +import { + createRunContext, + EventWriter, + onPostToolUse, + onPreToolUse, + processAssistantMessage, +} from './observe/instrumentation' +import { computeInstanceMetrics, extractOracle, writeMetrics } from './score' +import type { AgentVariant, InstanceMetrics, SWEbenchInstance } from './types' + +/** + * Configuration for the evaluation run + */ +export interface RunConfig { + split?: 'dev' | 'test' + maxInstances?: number + maxTurns?: number + maxToolCalls?: number + model?: string + runDir?: string + skipAgent1?: boolean + skipAgent2?: boolean + instanceIds?: string[] // Run specific instances only + indexExtensions?: string[] // File extensions to index for semantic search + embeddingDimensions?: number // Output dimensions for Gemini (768/1536/3072) + embeddingProvider?: 'gemini' | 'openai' +} + +const DEFAULT_CONFIG: Required< + Omit +> = { + split: 'test', + maxInstances: undefined as unknown as number, + maxTurns: 20, + maxToolCalls: 50, + model: 'claude-sonnet-4-5', + // Put runs in project root (not src/) to avoid polluting agent's Grep searches + runDir: join( + dirname(dirname(decodeURIComponent(new URL(import.meta.url).pathname))), + 'runs', + ), + skipAgent1: false, + skipAgent2: false, + indexExtensions: ['.py', '.js', '.ts', '.java', '.go', '.rs', '.rb', '.php'], + embeddingDimensions: 1536, // Gemini embedding dimensions (768/1536/3072) +} + +/** + * Run evaluation for a single agent on a single instance + */ +async function runAgent( + instance: SWEbenchInstance, + variant: AgentVariant, + worktreePath: string, + runTimestamp: string, + config: Required> & { + instanceIds?: string[] + embeddingProvider?: 'gemini' | 'openai' + }, + semanticIndex?: SemanticSearchIndex, +): Promise { + const agentConfig = getAgentConfig(variant) + const oracle = extractOracle(instance) + + // Setup output paths + const eventsPath = join( + config.runDir, + runTimestamp, + 'events', + `${instance.instance_id}_${variant}.jsonl`, + ) + const eventWriter = new EventWriter(eventsPath) + + // Create run context + const ctx = createRunContext( + instance.instance_id, + variant, + worktreePath, + oracle, + eventWriter, + ) + + // Log session start + eventWriter.writeSessionStart( + instance.instance_id, + variant, + config.model, + worktreePath, + agentConfig.allowedTools, + ) + + // Setup semantic search for Agent2 + const semanticSearchMetrics = createSemanticSearchMetrics() + const mcpServers: Record = {} + + if (variant === 'ops-plus-search' && semanticIndex) { + const semanticServer = createSemanticSearchServer( + semanticIndex, + semanticSearchMetrics, + ) + mcpServers.semantic_search = 
semanticServer + } + + // Create prompt with repo context + // Pass hasSemanticSearch=true for Agent2 to encourage semantic search usage + const hasSemanticSearch = variant === 'ops-plus-search' && !!semanticIndex + const userPrompt = createUserPrompt( + instance.problem_statement, + instance.repo, + hasSemanticSearch, + ) + + // Track tool call count for budget enforcement + let toolCallCount = 0 + + // Run the agent + let finalOutput = '' + let totalCostUsd = 0 + let totalDurationMs = 0 + + try { + // Build query options with explicit tool restriction + // SDK docs: 'tools' array = ONLY these tools available (excludes MCP!) + // SDK docs: 'allowedTools' = whitelist that includes MCP tools + const allowedBuiltinTools = ['Read', 'Grep', 'Glob', 'LS'] + + // Deny all tools that could modify or are not needed + const denyTools = [ + 'Write', + 'Edit', + 'Bash', + 'Task', + 'WebSearch', + 'WebFetch', + 'TodoRead', + 'TodoWrite', + 'NotebookRead', + 'NotebookEdit', + 'Agent', + 'MultiEdit', + ] + + // Select appropriate system prompt based on variant + const systemPrompt = + variant === 'ops-plus-search' + ? RETRIEVAL_WITH_SEARCH_SYSTEM_PROMPT + : RETRIEVAL_ONLY_SYSTEM_PROMPT + + const queryOptions: Record = { + cwd: worktreePath, // SDK uses 'cwd' not 'workingDirectory' for tool path resolution + model: config.model, + systemPrompt, + maxTurns: config.maxTurns, + disallowedTools: denyTools, + permissionMode: 'bypassPermissions', // Auto-allow for retrieval-only (no writes) + } + + // Configure tools based on variant + if (variant === 'ops-plus-search' && Object.keys(mcpServers).length > 0) { + // For Agent2 with MCP: use allowedTools (whitelist) instead of tools (restriction) + // This allows both built-in AND MCP tools + queryOptions.mcpServers = mcpServers + queryOptions.allowedTools = [ + ...allowedBuiltinTools, + 'mcp__semantic_search__search', + ] + } else { + // For Agent1: use tools array to strictly limit to built-in tools only + queryOptions.tools = allowedBuiltinTools + } + + console.log(`[runner] CWD (worktree): ${worktreePath}`) + if (queryOptions.tools) { + console.log( + `[runner] Tools (strict): [${(queryOptions.tools as string[]).join(', ')}]`, + ) + } + if (queryOptions.allowedTools) { + console.log( + `[runner] AllowedTools: [${(queryOptions.allowedTools as string[]).join(', ')}]`, + ) + } + console.log(`[runner] Denied: [${denyTools.slice(0, 5).join(', ')}...]`) + if (queryOptions.mcpServers) { + console.log(`[runner] MCP: semantic_search enabled`) + } + + const response = query({ + prompt: userPrompt, // Use simple string prompt instead of generator + options: queryOptions, + }) + + // Process streaming messages + for await (const message of response) { + // Cast message to any to handle SDK type inconsistencies with docs + const msg = message as Record + + switch (msg.type) { + case 'assistant': { + const msgContent = msg.message as { content?: unknown } | undefined + // Extract text content for final output parsing + if (typeof msgContent?.content === 'string') { + finalOutput = msgContent.content + // Log assistant thinking (truncate if long) + const preview = msgContent.content.slice(0, 150) + console.log( + ` [${variant}] thinking: ${preview}${msgContent.content.length > 150 ? '...' 
: ''}`, + ) + } else if (Array.isArray(msgContent?.content)) { + for (const block of msgContent.content) { + const b = block as { + type?: string + text?: string + name?: string + id?: string + input?: unknown + } + if (b?.type === 'text' && b.text) { + finalOutput = b.text + const preview = b.text.slice(0, 150) + console.log( + ` [${variant}] thinking: ${preview}${b.text.length > 150 ? '...' : ''}`, + ) + } else if (b?.type === 'tool_use') { + toolCallCount++ + const inputStr = JSON.stringify(b.input || {}).slice(0, 100) + console.log( + ` [${variant}] Tool[${toolCallCount}]: ${b.name}(${inputStr}${inputStr.length >= 100 ? '...' : ''})`, + ) + if (b.name && b.id) { + onPreToolUse(ctx, b.name, b.id, b.input) + } + } + } + } + // Process for usage and tool_use tracking + if (msgContent) { + processAssistantMessage( + ctx, + msgContent as { + id?: string + content?: unknown + usage?: { + input_tokens?: number + output_tokens?: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number + } + }, + ) + } + break + } + + case 'tool_result': { + // Track tool results - show brief result preview + const resultVal = msg.result + const resultStr = + typeof resultVal === 'string' + ? resultVal + : JSON.stringify(resultVal || '') + const resultPreview = resultStr.slice(0, 80) + console.log( + ` [${variant}] result: ${msg.tool_name}: ${resultPreview}${resultStr.length > 80 ? '...' : ''}`, + ) + if (msg.tool_name && msg.tool_use_id) { + onPostToolUse( + ctx, + msg.tool_name as string, + msg.tool_use_id as string, + msg.input, + msg.result, + ) + } + break + } + + case 'user': { + // Tool results come as "user" messages with tool_result content + const userMsg = msg.message as { content?: unknown[] } | undefined + if (Array.isArray(userMsg?.content)) { + for (const block of userMsg.content) { + const b = block as { + type?: string + tool_use_id?: string + content?: unknown + } + if (b?.type === 'tool_result' && b.tool_use_id) { + const resultContent = + typeof b.content === 'string' + ? b.content + : JSON.stringify(b.content || '') + const preview = resultContent.slice(0, 80) + + // Look up the tool info from when the call was made + const toolInfo = ctx.toolCallTimings.get(b.tool_use_id) + const toolName = toolInfo?.toolName || 'unknown' + const toolInput = toolInfo?.input || {} + + console.log( + ` [${variant}] result: ${toolName}: ${preview}${resultContent.length > 80 ? '...' 
: ''}`, + ) + onPostToolUse( + ctx, + toolName, + b.tool_use_id, + toolInput, + b.content, + ) + } + } + } + break + } + + case 'error': + console.error(` [${variant}] Agent error:`, msg.error) + break + + case 'result': { + totalCostUsd = (msg.total_cost_usd as number) || 0 + totalDurationMs = (msg.duration_ms as number) || 0 + // Extract token usage from result message if available + if (msg.total_input_tokens || msg.total_output_tokens) { + ctx.accumulatedUsage.input_tokens = + (msg.total_input_tokens as number) || 0 + ctx.accumulatedUsage.output_tokens = + (msg.total_output_tokens as number) || 0 + } + // Also check for usage object + const usage = msg.usage as + | { input_tokens?: number; output_tokens?: number } + | undefined + if (usage) { + ctx.accumulatedUsage.input_tokens = + usage.input_tokens || ctx.accumulatedUsage.input_tokens + ctx.accumulatedUsage.output_tokens = + usage.output_tokens || ctx.accumulatedUsage.output_tokens + } + { + const totalTokens = + ctx.accumulatedUsage.input_tokens + + ctx.accumulatedUsage.output_tokens + console.log( + ` [${variant}] Done in ${(totalDurationMs / 1000).toFixed(1)}s, cost: $${totalCostUsd.toFixed(4)}, tokens: ${totalTokens}`, + ) + } + break + } + + case 'system': { + if (msg.subtype === 'init') { + console.log(` [${variant}] Session: ${msg.session_id}`) + } + break + } + + default: + // Debug: log unknown message types + console.log( + ` [${variant}] [${msg.type}${msg.subtype ? `:${msg.subtype}` : ''}]`, + ) + } + } + } catch (err) { + console.error( + `[runner] Error running ${variant} on ${instance.instance_id}:`, + err, + ) + } + + // Parse final output for declared top_files + const topFilesFinal = parseTopFiles(finalOutput) + + // Log comparison: found vs expected + const oracleFilesArr = Array.from(ctx.oracle.files) + const foundFiles = ctx.rankedFilesFromTools.slice(0, 10) + const intersection = foundFiles.filter((f) => ctx.oracle.files.has(f)) + + console.log( + `\n +===================================================================+`, + ) + console.log( + ` | [${variant}] RESULTS |`, + ) + console.log( + ` +===================================================================+`, + ) + console.log( + ` | GOLDEN PATCH files: |`, + ) + for (const f of oracleFilesArr) { + console.log(` | - ${f.slice(0, 55).padEnd(55)} |`) + } + console.log( + ` +===================================================================+`, + ) + console.log( + ` | AGENT found files (top ${foundFiles.length}): |`, + ) + if (foundFiles.length === 0) { + console.log( + ` | (no files found) |`, + ) + } + for (const f of foundFiles) { + const match = ctx.oracle.files.has(f) ? 
'[x]' : '[ ]' + console.log(` | ${match} ${f.slice(0, 55).padEnd(55)} |`) + } + console.log( + ` +===================================================================+`, + ) + console.log( + ` | Hit: ${intersection.length}/${oracleFilesArr.length} | Tool calls: ${toolCallCount.toString().padEnd(3)} | Tokens: ${(ctx.accumulatedUsage.input_tokens + ctx.accumulatedUsage.output_tokens).toString().padEnd(8)} |`, + ) + console.log( + ` +===================================================================+\n`, + ) + + // Log session end + eventWriter.writeSessionEnd( + instance.instance_id, + variant, + totalDurationMs, + totalCostUsd, + { + input_tokens: ctx.accumulatedUsage.input_tokens, + output_tokens: ctx.accumulatedUsage.output_tokens, + cache_read_input_tokens: ctx.accumulatedUsage.cache_read_input_tokens, + }, + topFilesFinal, + ctx.rankedFilesFromTools, + ) + + // Compute metrics + const metrics = computeInstanceMetrics( + ctx, + topFilesFinal, + totalDurationMs, + totalCostUsd, + variant === 'ops-plus-search' && semanticIndex + ? { + callCount: semanticSearchMetrics.callCount, + totalQueryEmbedTokens: semanticSearchMetrics.totalQueryEmbedTokens, + totalQueryEmbedLatencyMs: + semanticSearchMetrics.totalQueryEmbedLatencyMs, + indexEmbedTokens: semanticIndex.totalEmbedTokens, + indexLoadMs: semanticIndex.indexLoadMs, + } + : undefined, + ) + + return metrics +} + +/** + * Main evaluation runner + */ +export async function runEvaluation( + config: Partial = {}, +): Promise { + const cfg: Required> & { + instanceIds?: string[] + embeddingProvider?: 'gemini' | 'openai' + } = { + ...DEFAULT_CONFIG, + ...config, + } + + // Create run directory + const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-') + const runDir = join(cfg.runDir, runTimestamp) + mkdirSync(runDir, { recursive: true }) + mkdirSync(join(runDir, 'events'), { recursive: true }) + + console.log(`[runner] Starting evaluation run at ${runDir}`) + console.log(`[runner] Config:`, cfg) + + // Load dataset + let instances = await loadSWEbenchLite(cfg.split, cfg.maxInstances) + + // Filter to specific instances if provided + if (cfg.instanceIds && cfg.instanceIds.length > 0) { + const ids = new Set(cfg.instanceIds) + instances = instances.filter((i) => ids.has(i.instance_id)) + console.log(`[runner] Filtered to ${instances.length} specific instances`) + } + + console.log(`[runner] Loaded ${instances.length} instances`) + + // Metrics output path + const metricsPath = join(runDir, 'metrics.jsonl') + const allMetrics: InstanceMetrics[] = [] + + // Process each instance + for (let i = 0; i < instances.length; i++) { + const instance = instances[i]! 
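+ // Per-instance flow: create a worktree at base_commit -> build or load the semantic index (Agent2 only) -> run each agent variant -> remove the worktree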
+ console.log( + `\n[runner] Processing instance ${i + 1}/${instances.length}: ${instance.instance_id}`, + ) + + // Checkout repo at base_commit + let worktree: + | { path: string; commit: string; checkout_ms: number } + | undefined + try { + worktree = await createWorktree( + instance.repo, + instance.base_commit, + instance.instance_id, + ) + } catch (err) { + console.error(`[runner] Failed to checkout ${instance.instance_id}:`, err) + continue + } + + try { + // Build semantic index for Agent2 (reused across both agents for fairness) + let semanticIndex: SemanticSearchIndex | undefined + if (!cfg.skipAgent2) { + const geminiKey = process.env.GOOGLE_API_KEY + if (!geminiKey) { + console.warn( + '[runner] GOOGLE_API_KEY not set, skipping semantic search indexing', + ) + } else { + const embedService = new GeminiEmbeddingService( + geminiKey, + 'gemini-embedding-001', + 5, + cfg.embeddingDimensions, + ) + console.log( + `[runner] Using Gemini embeddings (${cfg.embeddingDimensions} dimensions)`, + ) + // Check for cached index + const indexCacheDir = join(cfg.runDir, '.index_cache') + const cacheExists = SemanticSearchIndex.cacheExists( + indexCacheDir, + instance.instance_id, + 'gemini', + cfg.embeddingDimensions, + ) + + if (cacheExists) { + console.log(`[runner] Loading semantic index from cache...`) + const cached = await SemanticSearchIndex.loadFromCache( + indexCacheDir, + instance.instance_id, + 'gemini', + cfg.embeddingDimensions, + worktree.path, + embedService, + ) + if (cached) { + semanticIndex = cached + } + } + + // Index if not loaded from cache + if (!semanticIndex) { + semanticIndex = new SemanticSearchIndex(worktree.path, embedService) + + // List files to index + const files = listFiles(worktree.path, cfg.indexExtensions) + console.log( + `[runner] Indexing ${files.length} files for semantic search...`, + ) + + await semanticIndex.index(files) + + // Save to cache for future runs + await semanticIndex.saveToCache( + indexCacheDir, + instance.instance_id, + 'gemini', + cfg.embeddingDimensions, + ) + } + } + } + + // Run Agent1 (ops-only) + if (!cfg.skipAgent1) { + console.log( + `[runner] Running Agent1 (ops-only) on ${instance.instance_id}...`, + ) + const metrics1 = await runAgent( + instance, + 'ops-only', + worktree.path, + runTimestamp, + cfg, + ) + writeMetrics(metricsPath, metrics1) + allMetrics.push(metrics1) + console.log( + `[runner] Agent1 done: Hit@5=${metrics1.hit_at_5_behavioral}, MRR=${metrics1.mrr_behavioral.toFixed(3)}`, + ) + } + + // Run Agent2 (ops + semantic search) + if (!cfg.skipAgent2) { + if (!semanticIndex) { + console.warn( + `[runner] Skipping Agent2: semantic index not available (check API keys or indexing errors)`, + ) + } else { + console.log( + `[runner] Running Agent2 (ops+search) on ${instance.instance_id}...`, + ) + const metrics2 = await runAgent( + instance, + 'ops-plus-search', + worktree.path, + runTimestamp, + cfg, + semanticIndex, + ) + writeMetrics(metricsPath, metrics2) + allMetrics.push(metrics2) + console.log( + `[runner] Agent2 done: Hit@5=${metrics2.hit_at_5_behavioral}, MRR=${metrics2.mrr_behavioral.toFixed(3)}`, + ) + } + } + } finally { + // Cleanup worktree + try { + await removeWorktree(instance.repo, instance.instance_id) + } catch (err) { + console.warn(`[runner] Failed to cleanup worktree:`, err) + } + } + } + + // Aggregate and write summary + console.log('\n[runner] Computing aggregate summary...') + const summary = aggregateMetrics(allMetrics, cfg.split) + const summaryPath = join(runDir, 'summary.json') + 
writeSummary(summaryPath, summary) + printSummary(summary) + + console.log(`[runner] Evaluation complete. Results at ${runDir}`) +} diff --git a/packages/eval/src/swebench/score.ts b/packages/eval/src/swebench/score.ts new file mode 100644 index 0000000..b96c3d6 --- /dev/null +++ b/packages/eval/src/swebench/score.ts @@ -0,0 +1,183 @@ +/** + * Scoring functions for retrieval quality metrics + * Includes oracle extraction and path normalization (merged from oracle.ts) + */ + +import { appendFileSync, existsSync, mkdirSync } from 'node:fs' +import { dirname } from 'node:path' +import type { RunContext } from './observe/instrumentation' +import { getTotalToolOutputChars } from './observe/instrumentation' +import type { InstanceMetrics, OracleFiles, SWEbenchInstance } from './types' + +// ───────────────────────────────────────────────────────────────────────────── +// Oracle extraction (merged from oracle.ts) +// ───────────────────────────────────────────────────────────────────────────── + +const DIFF_HEADER_RE = /^diff --git a\/(.*?) b\/(.*)$/gm + +/** Normalize a file path: strip leading ./ or /, collapse slashes */ +export function normalizePath(path: string): string { + let n = path.trim() + while (n.startsWith('./')) n = n.slice(2) + while (n.startsWith('/')) n = n.slice(1) + n = n.replace(/\/+/g, '/') + while (n.endsWith('/')) n = n.slice(0, -1) + return n +} + +/** Extract file paths from unified diff patch */ +function extractFilesFromPatch(patch: string): Set<string> { + const files = new Set<string>() + let match = DIFF_HEADER_RE.exec(patch) + while (match !== null) { + const bPath = match[2]! + if (bPath !== '/dev/null' && bPath !== 'dev/null') { + const normalized = normalizePath(bPath) + if (normalized) files.add(normalized) + } + match = DIFF_HEADER_RE.exec(patch) + } + DIFF_HEADER_RE.lastIndex = 0 + return files +} + +/** Extract oracle files from a SWE-bench instance */ +export function extractOracle( + instance: SWEbenchInstance, + includeTestPatch = false, +): OracleFiles { + const files = extractFilesFromPatch(instance.patch) + if (includeTestPatch && instance.test_patch) { + for (const f of extractFilesFromPatch(instance.test_patch)) files.add(f) + } + return { instance_id: instance.instance_id, files } +} + +/** Check if a candidate path matches any oracle file */ +export function matchesOracle( + candidatePath: string, + oracle: OracleFiles, +): boolean { + return oracle.files.has(normalizePath(candidatePath)) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Ranking metrics +// ───────────────────────────────────────────────────────────────────────────── + +function hitAtK( + rankedFiles: string[], + oracle: OracleFiles, + k: number, +): boolean { + return rankedFiles.slice(0, k).some((f) => matchesOracle(f, oracle)) +} + +function reciprocalRank(rankedFiles: string[], oracle: OracleFiles): number { + for (let i = 0; i < rankedFiles.length; i++) { + if (matchesOracle(rankedFiles[i]!, oracle)) return 1 / (i + 1) + } + return 0 +} + +function coverageAtK( + rankedFiles: string[], + oracle: OracleFiles, + k: number, +): number { + if (oracle.files.size === 0) return 1 + const topK = new Set(rankedFiles.slice(0, k).map(normalizePath)) + let hits = 0 + for (const f of oracle.files) if (topK.has(f)) hits++ + return hits / oracle.files.size +} + +// ───────────────────────────────────────────────────────────────────────────── +// Instance metrics computation +// ───────────────────────────────────────────────────────────────────────────── + +/**
Compute all metrics for a single run */ +export function computeInstanceMetrics( + ctx: RunContext, + topFilesFinal: string[], + totalDurationMs: number, + totalCostUsd: number, + semanticSearchMetrics?: { + callCount: number + totalQueryEmbedTokens: number + totalQueryEmbedLatencyMs: number + indexEmbedTokens: number + indexLoadMs: number + }, +): InstanceMetrics { + const oracle = ctx.oracle + const rankedBehavioral = ctx.rankedFilesFromTools + const rankedDeclared = topFilesFinal + + return { + instance_id: ctx.instanceId, + agent_variant: ctx.agentVariant, + oracle_files: Array.from(oracle.files), + + // Quality metrics (behavioral ranking) + hit_at_1_behavioral: hitAtK(rankedBehavioral, oracle, 1), + hit_at_3_behavioral: hitAtK(rankedBehavioral, oracle, 3), + hit_at_5_behavioral: hitAtK(rankedBehavioral, oracle, 5), + hit_at_10_behavioral: hitAtK(rankedBehavioral, oracle, 10), + mrr_behavioral: reciprocalRank(rankedBehavioral, oracle), + coverage_at_5_behavioral: coverageAtK(rankedBehavioral, oracle, 5), + coverage_at_10_behavioral: coverageAtK(rankedBehavioral, oracle, 10), + + // Quality metrics (declared ranking) + hit_at_1_declared: hitAtK(rankedDeclared, oracle, 1), + hit_at_3_declared: hitAtK(rankedDeclared, oracle, 3), + hit_at_5_declared: hitAtK(rankedDeclared, oracle, 5), + hit_at_10_declared: hitAtK(rankedDeclared, oracle, 10), + mrr_declared: reciprocalRank(rankedDeclared, oracle), + coverage_at_5_declared: coverageAtK(rankedDeclared, oracle, 5), + coverage_at_10_declared: coverageAtK(rankedDeclared, oracle, 10), + + // Time/tokens to first hit + time_to_first_hit_ms: ctx.firstHitTime, + tokens_to_first_hit: ctx.firstHitTokens, + + // Efficiency metrics + total_duration_ms: totalDurationMs, + total_cost_usd: totalCostUsd, + total_input_tokens: ctx.accumulatedUsage.input_tokens, + total_output_tokens: ctx.accumulatedUsage.output_tokens, + tool_calls_count: ctx.toolCallCount, + + // Semantic search specific (Agent2 only) + semantic_search_calls: semanticSearchMetrics?.callCount, + embed_latency_ms: semanticSearchMetrics + ? semanticSearchMetrics.totalQueryEmbedLatencyMs + + semanticSearchMetrics.indexLoadMs + : undefined, + embed_tokens: semanticSearchMetrics + ? semanticSearchMetrics.totalQueryEmbedTokens + + semanticSearchMetrics.indexEmbedTokens + : undefined, + embed_cost_usd: semanticSearchMetrics + ? ((semanticSearchMetrics.totalQueryEmbedTokens + + semanticSearchMetrics.indexEmbedTokens) / + 1000) * + 0.00002 + : undefined, + + // Tool output size accounting + tool_output_chars_total: getTotalToolOutputChars(ctx), + tool_output_chars_by_type: { ...ctx.toolOutputCharsByType }, + + // Rankings + top_files_final: topFilesFinal, + ranked_files_from_tools: rankedBehavioral, + } +} + +/** Write metrics to JSONL file */ +export function writeMetrics(filePath: string, metrics: InstanceMetrics): void { + const dir = dirname(filePath) + if (!existsSync(dir)) mkdirSync(dir, { recursive: true }) + appendFileSync(filePath, `${JSON.stringify(metrics)}\n`) +} diff --git a/packages/eval/src/swebench/types.ts b/packages/eval/src/swebench/types.ts new file mode 100644 index 0000000..f183d13 --- /dev/null +++ b/packages/eval/src/swebench/types.ts @@ -0,0 +1,211 @@ +/** + * Core types for SWE-bench Lite retrieval-only evaluation + */ + +/** + * SWE-bench Lite instance (only fields needed for retrieval-only) + */ +export interface SWEbenchInstance { + instance_id: string + repo: string // e.g. 
"django/django" + base_commit: string + problem_statement: string + patch: string // unified diff + test_patch?: string // optional, for secondary reporting +} + +/** + * Parsed oracle: ground-truth file paths from the gold patch + */ +export interface OracleFiles { + instance_id: string + files: Set // normalized repo-relative paths +} + +/** + * Worktree checkout result + */ +export interface WorktreeInfo { + path: string + commit: string + checkout_ms: number +} + +/** + * Agent variant identifier + */ +export type AgentVariant = 'ops-only' | 'ops-plus-search' + +/** + * Semantic search result from the custom tool + */ +export interface SemanticSearchResult { + filepath: string + start_line?: number + end_line?: number + score: number + snippet?: string +} + +/** + * Per-tool-call event (logged in events.jsonl) + */ +export interface ToolCallEvent { + type: 'tool_call' + timestamp: number + tool_name: string + tool_use_id: string + input: unknown + output?: unknown + error?: string + latency_ms?: number + output_chars?: number +} + +/** + * LLM usage event (logged in events.jsonl) + */ +export interface UsageEvent { + type: 'usage' + timestamp: number + message_id: string + input_tokens: number + output_tokens: number + cache_read_input_tokens?: number + cache_creation_input_tokens?: number +} + +/** + * Session start event + */ +export interface SessionStartEvent { + type: 'session_start' + timestamp: number + instance_id: string + agent_variant: AgentVariant + model: string + cwd: string + allowed_tools: string[] +} + +/** + * Session end event + */ +export interface SessionEndEvent { + type: 'session_end' + timestamp: number + instance_id: string + agent_variant: AgentVariant + duration_ms: number + total_cost_usd: number + usage: { + input_tokens: number + output_tokens: number + cache_read_input_tokens?: number + } + top_files_final: string[] // declared from agent JSON output + ranked_files_from_tools: string[] // behavioral from tool traces +} + +/** + * Union of all event types + */ +export type Event = + | ToolCallEvent + | UsageEvent + | SessionStartEvent + | SessionEndEvent + +/** + * Per-instance per-agent metrics (one row in metrics.jsonl) + */ +export interface InstanceMetrics { + instance_id: string + agent_variant: AgentVariant + oracle_files: string[] + + // Quality metrics (behavioral ranking) + hit_at_1_behavioral: boolean + hit_at_3_behavioral: boolean + hit_at_5_behavioral: boolean + hit_at_10_behavioral: boolean + mrr_behavioral: number + coverage_at_5_behavioral: number + coverage_at_10_behavioral: number + + // Quality metrics (declared ranking) + hit_at_1_declared: boolean + hit_at_3_declared: boolean + hit_at_5_declared: boolean + hit_at_10_declared: boolean + mrr_declared: number + coverage_at_5_declared: number + coverage_at_10_declared: number + + // Time/tokens to first hit + time_to_first_hit_ms: number | null + tokens_to_first_hit: number | null + + // Efficiency metrics + total_duration_ms: number + total_cost_usd: number + total_input_tokens: number + total_output_tokens: number + tool_calls_count: number + semantic_search_calls?: number + + // Embedding metrics (Agent2 only) + embed_latency_ms?: number + embed_tokens?: number + embed_cost_usd?: number + + // Tool output size accounting + tool_output_chars_total: number + tool_output_chars_by_type: Record + + // Rankings + top_files_final: string[] + ranked_files_from_tools: string[] +} + +/** + * Aggregate summary across all instances + */ +export interface AggregateSummary { + split: string + 
total_instances: number + agent_summaries: { + agent_variant: AgentVariant + // Quality rates + hit_at_1_rate: number + hit_at_3_rate: number + hit_at_5_rate: number + hit_at_10_rate: number + mean_mrr: number + mean_coverage_at_5: number + mean_coverage_at_10: number + // Totals (sum across all instances) - for cost tracking + total_duration_ms: number + total_tokens: number + total_input_tokens: number + total_output_tokens: number + total_cost_usd: number + total_tool_calls: number + // Medians (for multi-instance benchmarks) + median_duration_ms: number + p90_duration_ms: number + median_tokens: number + p90_tokens: number + median_time_to_first_hit_ms: number | null + median_tokens_to_first_hit: number | null + }[] + delta: { + hit_at_1_delta: number + hit_at_3_delta: number + hit_at_5_delta: number + mrr_delta: number + duration_ms_delta: number + tokens_delta: number + cost_usd_delta: number + } +} diff --git a/packages/eval/uv.lock b/packages/eval/uv.lock deleted file mode 100644 index ad6e80f..0000000 --- a/packages/eval/uv.lock +++ /dev/null @@ -1,370 +0,0 @@ -version = 1 -revision = 2 -requires-python = ">=3.14" -resolution-markers = [ - "sys_platform == 'win32'", - "sys_platform != 'win32'", -] - -[[package]] -name = "anyio" -version = "4.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/16/ce/8a777047513153587e5434fd752e89334ac33e379aa3497db860eeb60377/anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0", size = 228266, upload-time = "2025-11-28T23:37:38.911Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" }, -] - -[[package]] -name = "certifi" -version = "2025.11.12" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, -] - -[[package]] -name = "chonkie" -version = "1.0.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "tokenizers" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0d/a8/c881853cf1759c27fb6e06ded72aa52eaa3e99407185cd2d32d70b55f3a4/chonkie-1.0.5.tar.gz", hash = "sha256:0396efcc8e79d25a2dd4fe6d01ac3b9c077e00472af1f04dd9b5183f89b92cfc", size = 63596, upload-time = "2025-04-22T01:18:39.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/ce/d8d8359b17761259f86df1e46a5dbebc637906f3b8cee6c02877134075d3/chonkie-1.0.5-py3-none-any.whl", hash = "sha256:81f66fc5897bf14d5c1b55d4f0735f85fec269f237929b5c6d8a6b548cfd1e10", size = 80815, upload-time = "2025-04-22T01:18:38.095Z" }, -] - -[package.optional-dependencies] -code = [ - { name = "tree-sitter" }, - { name = "tree-sitter-language-pack" 
}, -] - -[[package]] -name = "click" -version = "8.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, -] - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "eval" -version = "0.1.0" -source = { virtual = "." } -dependencies = [ - { name = "chonkie", extra = ["code"] }, -] - -[package.metadata] -requires-dist = [{ name = "chonkie", extras = ["code"], specifier = ">=1.0.5" }] - -[[package]] -name = "filelock" -version = "3.20.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a7/23/ce7a1126827cedeb958fc043d61745754464eb56c5937c35bbf2b8e26f34/filelock-3.20.1.tar.gz", hash = "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c", size = 19476, upload-time = "2025-12-15T23:54:28.027Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" }, -] - -[[package]] -name = "fsspec" -version = "2025.12.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b6/27/954057b0d1f53f086f681755207dda6de6c660ce133c829158e8e8fe7895/fsspec-2025.12.0.tar.gz", hash = "sha256:c505de011584597b1060ff778bb664c1bc022e87921b0e4f10cc9c44f9635973", size = 309748, upload-time = "2025-12-03T15:23:42.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/c7/b64cae5dba3a1b138d7123ec36bb5ccd39d39939f18454407e5468f4763f/fsspec-2025.12.0-py3-none-any.whl", hash = "sha256:8bf1fe301b7d8acfa6e8571e3b1c3d158f909666642431cc78a1b7b4dbc5ec5b", size = 201422, upload-time = "2025-12-03T15:23:41.434Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "hf-xet" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" }, - { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" }, - { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" }, - { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" }, - { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" }, - { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" }, - { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" }, - { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" }, - { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" }, - { url = 
"https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" }, - { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" }, - { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" }, - { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - -[[package]] -name = "huggingface-hub" -version = "1.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, - { name = "httpx" }, - { name = "packaging" }, - { name = 
"pyyaml" }, - { name = "shellingham" }, - { name = "tqdm" }, - { name = "typer-slim" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a7/c8/9cd2fcb670ba0e708bfdf95a1177b34ca62de2d3821df0773bc30559af80/huggingface_hub-1.2.3.tar.gz", hash = "sha256:4ba57f17004fd27bb176a6b7107df579865d4cde015112db59184c51f5602ba7", size = 614605, upload-time = "2025-12-12T15:31:42.161Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/df/8d/7ca723a884d55751b70479b8710f06a317296b1fa1c1dec01d0420d13e43/huggingface_hub-1.2.3-py3-none-any.whl", hash = "sha256:c9b7a91a9eedaa2149cdc12bdd8f5a11780e10de1f1024718becf9e41e5a4642", size = 520953, upload-time = "2025-12-12T15:31:40.339Z" }, -] - -[[package]] -name = "idna" -version = "3.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, -] - -[[package]] -name = "packaging" -version = "25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, -] - -[[package]] -name = "pyyaml" -version = "6.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, - { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, - { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, - { url = 
"https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, - { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, - { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, - { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, - { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, - { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, - { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, - { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, - { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 
839261, upload-time = "2025-09-25T21:32:51.808Z" }, - { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, - { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, - { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, -] - -[[package]] -name = "shellingham" -version = "1.5.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, -] - -[[package]] -name = "tokenizers" -version = "0.22.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" }, - { url = "https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" }, - { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" }, - { url = 
"https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" }, - { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" }, - { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" }, - { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" }, - { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" }, - { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" }, - { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" }, - { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = "2025-09-19T09:49:19.146Z" }, - { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = "2025-09-19T09:49:21.501Z" }, - { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" }, - { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" }, -] - -[[package]] 
-name = "tqdm" -version = "4.67.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, -] - -[[package]] -name = "tree-sitter" -version = "0.25.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/e3/d9526ba71dfbbe4eba5e51d89432b4b333a49a1e70712aa5590cd22fc74f/tree_sitter-0.25.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:65d3c931013ea798b502782acab986bbf47ba2c452610ab0776cf4a8ef150fc0", size = 146776, upload-time = "2025-09-25T17:37:50.898Z" }, - { url = "https://files.pythonhosted.org/packages/42/97/4bd4ad97f85a23011dd8a535534bb1035c4e0bac1234d58f438e15cff51f/tree_sitter-0.25.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bda059af9d621918efb813b22fb06b3fe00c3e94079c6143fcb2c565eb44cb87", size = 137732, upload-time = "2025-09-25T17:37:51.877Z" }, - { url = "https://files.pythonhosted.org/packages/b6/19/1e968aa0b1b567988ed522f836498a6a9529a74aab15f09dd9ac1e41f505/tree_sitter-0.25.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eac4e8e4c7060c75f395feec46421eb61212cb73998dbe004b7384724f3682ab", size = 609456, upload-time = "2025-09-25T17:37:52.925Z" }, - { url = "https://files.pythonhosted.org/packages/48/b6/cf08f4f20f4c9094006ef8828555484e842fc468827ad6e56011ab668dbd/tree_sitter-0.25.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:260586381b23be33b6191a07cea3d44ecbd6c01aa4c6b027a0439145fcbc3358", size = 636772, upload-time = "2025-09-25T17:37:54.647Z" }, - { url = "https://files.pythonhosted.org/packages/57/e2/d42d55bf56360987c32bc7b16adb06744e425670b823fb8a5786a1cea991/tree_sitter-0.25.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7d2ee1acbacebe50ba0f85fff1bc05e65d877958f00880f49f9b2af38dce1af0", size = 631522, upload-time = "2025-09-25T17:37:55.833Z" }, - { url = "https://files.pythonhosted.org/packages/03/87/af9604ebe275a9345d88c3ace0cf2a1341aa3f8ef49dd9fc11662132df8a/tree_sitter-0.25.2-cp314-cp314-win_amd64.whl", hash = "sha256:4973b718fcadfb04e59e746abfbb0288694159c6aeecd2add59320c03368c721", size = 130864, upload-time = "2025-09-25T17:37:57.453Z" }, - { url = "https://files.pythonhosted.org/packages/a6/6e/e64621037357acb83d912276ffd30a859ef117f9c680f2e3cb955f47c680/tree_sitter-0.25.2-cp314-cp314-win_arm64.whl", hash = "sha256:b8d4429954a3beb3e844e2872610d2a4800ba4eb42bb1990c6a4b1949b18459f", size = 117470, upload-time = "2025-09-25T17:37:58.431Z" }, -] - -[[package]] -name = "tree-sitter-c-sharp" -version = "0.23.1" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/22/85/a61c782afbb706a47d990eaee6977e7c2bd013771c5bf5c81c617684f286/tree_sitter_c_sharp-0.23.1.tar.gz", hash = "sha256:322e2cfd3a547a840375276b2aea3335fa6458aeac082f6c60fec3f745c967eb", size = 1317728, upload-time = "2024-11-11T05:25:32.535Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/58/04/f6c2df4c53a588ccd88d50851155945cff8cd887bd70c175e00aaade7edf/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2b612a6e5bd17bb7fa2aab4bb6fc1fba45c94f09cb034ab332e45603b86e32fd", size = 372235, upload-time = "2024-11-11T05:25:19.424Z" }, - { url = "https://files.pythonhosted.org/packages/99/10/1aa9486f1e28fc22810fa92cbdc54e1051e7f5536a5e5b5e9695f609b31e/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a8b98f62bc53efcd4d971151950c9b9cd5cbe3bacdb0cd69fdccac63350d83e", size = 419046, upload-time = "2024-11-11T05:25:20.679Z" }, - { url = "https://files.pythonhosted.org/packages/0f/21/13df29f8fcb9ba9f209b7b413a4764b673dfd58989a0dd67e9c7e19e9c2e/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:986e93d845a438ec3c4416401aa98e6a6f6631d644bbbc2e43fcb915c51d255d", size = 415999, upload-time = "2024-11-11T05:25:22.359Z" }, - { url = "https://files.pythonhosted.org/packages/ca/72/fc6846795bcdae2f8aa94cc8b1d1af33d634e08be63e294ff0d6794b1efc/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8024e466b2f5611c6dc90321f232d8584893c7fb88b75e4a831992f877616d2", size = 402830, upload-time = "2024-11-11T05:25:24.198Z" }, - { url = "https://files.pythonhosted.org/packages/fe/3a/b6028c5890ce6653807d5fa88c72232c027c6ceb480dbeb3b186d60e5971/tree_sitter_c_sharp-0.23.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7f9bf876866835492281d336b9e1f9626ab668737f74e914c31d285261507da7", size = 397880, upload-time = "2024-11-11T05:25:25.937Z" }, - { url = "https://files.pythonhosted.org/packages/47/d2/4facaa34b40f8104d8751746d0e1cd2ddf0beb9f1404b736b97f372bd1f3/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_amd64.whl", hash = "sha256:ae9a9e859e8f44e2b07578d44f9a220d3fa25b688966708af6aa55d42abeebb3", size = 377562, upload-time = "2024-11-11T05:25:27.539Z" }, - { url = "https://files.pythonhosted.org/packages/d8/88/3cf6bd9959d94d1fec1e6a9c530c5f08ff4115a474f62aedb5fedb0f7241/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:c81548347a93347be4f48cb63ec7d60ef4b0efa91313330e69641e49aa5a08c5", size = 375157, upload-time = "2024-11-11T05:25:30.839Z" }, -] - -[[package]] -name = "tree-sitter-embedded-template" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fd/a7/77729fefab8b1b5690cfc54328f2f629d1c076d16daf32c96ba39d3a3a3a/tree_sitter_embedded_template-0.25.0.tar.gz", hash = "sha256:7d72d5e8a1d1d501a7c90e841b51f1449a90cc240be050e4fb85c22dab991d50", size = 14114, upload-time = "2025-08-29T00:42:51.078Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/9d/3e3c8ee0c019d3bace728300a1ca807c03df39e66cc51e9a5e7c9d1e1909/tree_sitter_embedded_template-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fa0d06467199aeb33fb3d6fa0665bf9b7d5a32621ffdaf37fd8249f8a8050649", size = 10266, upload-time = "2025-08-29T00:42:44.148Z" }, - { url = 
"https://files.pythonhosted.org/packages/e8/ab/6d4e43b736b2a895d13baea3791dc8ce7245bedf4677df9e7deb22e23a2a/tree_sitter_embedded_template-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:fc7aacbc2985a5d7e7fe7334f44dffe24c38fb0a8295c4188a04cf21a3d64a73", size = 10650, upload-time = "2025-08-29T00:42:45.147Z" }, - { url = "https://files.pythonhosted.org/packages/9f/97/ea3d1ea4b320fe66e0468b9f6602966e544c9fe641882484f9105e50ee0c/tree_sitter_embedded_template-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a7c88c3dd8b94b3c9efe8ae071ff6b1b936a27ac5f6e651845c3b9631fa4c1c2", size = 18268, upload-time = "2025-08-29T00:42:46.03Z" }, - { url = "https://files.pythonhosted.org/packages/64/40/0f42ca894a8f7c298cf336080046ccc14c10e8f4ea46d455f640193181b2/tree_sitter_embedded_template-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:025f7ca84218dcd8455efc901bdbcc2689fb694f3a636c0448e322a23d4bc96b", size = 19068, upload-time = "2025-08-29T00:42:46.699Z" }, - { url = "https://files.pythonhosted.org/packages/d0/2a/0b720bcae7c2dd0a44889c09e800a2f8eb08c496dede9f2b97683506c4c3/tree_sitter_embedded_template-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b5dc1aef6ffa3fae621fe037d85dd98948b597afba20df29d779c426be813ee5", size = 18518, upload-time = "2025-08-29T00:42:47.694Z" }, - { url = "https://files.pythonhosted.org/packages/14/8a/d745071afa5e8bdf5b381cf84c4dc6be6c79dee6af8e0ff07476c3d8e4aa/tree_sitter_embedded_template-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d0a35cfe634c44981a516243bc039874580e02a2990669313730187ce83a5bc6", size = 18267, upload-time = "2025-08-29T00:42:48.635Z" }, - { url = "https://files.pythonhosted.org/packages/5d/74/728355e594fca140f793f234fdfec195366b6956b35754d00ea97ca18b21/tree_sitter_embedded_template-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:3e05a4ac013d54505e75ae48e1a0e9db9aab19949fe15d9f4c7345b11a84a069", size = 13049, upload-time = "2025-08-29T00:42:49.589Z" }, - { url = "https://files.pythonhosted.org/packages/d8/de/afac475e694d0e626b0808f3c86339c349cd15c5163a6a16a53cc11cf892/tree_sitter_embedded_template-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:2751d402179ac0e83f2065b249d8fe6df0718153f1636bcb6a02bde3e5730db9", size = 11978, upload-time = "2025-08-29T00:42:50.226Z" }, -] - -[[package]] -name = "tree-sitter-language-pack" -version = "0.13.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "tree-sitter" }, - { name = "tree-sitter-c-sharp" }, - { name = "tree-sitter-embedded-template" }, - { name = "tree-sitter-yaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c1/83/d1bc738d6f253f415ee54a8afb99640f47028871436f53f2af637c392c4f/tree_sitter_language_pack-0.13.0.tar.gz", hash = "sha256:032034c5e27b1f6e00730b9e7c2dbc8203b4700d0c681fd019d6defcf61183ec", size = 51353370, upload-time = "2025-11-26T14:01:04.586Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e9/38/aec1f450ae5c4796de8345442f297fcf8912c7d2e00a66d3236ff0f825ed/tree_sitter_language_pack-0.13.0-cp310-abi3-macosx_10_15_universal2.whl", hash = "sha256:0e7eae812b40a2dc8a12eb2f5c55e130eb892706a0bee06215dd76affeb00d07", size = 32991857, upload-time = "2025-11-26T14:00:51.459Z" }, - { url = "https://files.pythonhosted.org/packages/90/09/11f51c59ede786dccddd2d348d5d24a1d99c54117d00f88b477f5fae4bd5/tree_sitter_language_pack-0.13.0-cp310-abi3-manylinux2014_aarch64.whl", hash = 
"sha256:7fdacf383418a845b20772118fcb53ad245f9c5d409bd07dae16acec65151756", size = 20092989, upload-time = "2025-11-26T14:00:54.202Z" }, - { url = "https://files.pythonhosted.org/packages/72/9d/644db031047ab1a70fc5cb6a79a4d4067080fac628375b2320752d2d7b58/tree_sitter_language_pack-0.13.0-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:0d4f261fce387ae040dae7e4d1c1aca63d84c88320afcc0961c123bec0be8377", size = 19952029, upload-time = "2025-11-26T14:00:56.699Z" }, - { url = "https://files.pythonhosted.org/packages/48/92/5fd749bbb3f5e4538492c77de7bc51a5e479fec6209464ddc25be9153b13/tree_sitter_language_pack-0.13.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:78f369dc4d456c5b08d659939e662c2f9b9fba8c0ec5538a1f973e01edfcf04d", size = 19944614, upload-time = "2025-11-26T14:00:59.381Z" }, - { url = "https://files.pythonhosted.org/packages/97/59/2287f07723c063475d6657babed0d5569f4b499e393ab51354d529c3e7b5/tree_sitter_language_pack-0.13.0-cp310-abi3-win_amd64.whl", hash = "sha256:1cdbc88a03dacd47bec69e56cc20c48eace1fbb6f01371e89c3ee6a2e8f34db1", size = 16896852, upload-time = "2025-11-26T14:01:01.788Z" }, -] - -[[package]] -name = "tree-sitter-yaml" -version = "0.7.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/57/b6/941d356ac70c90b9d2927375259e3a4204f38f7499ec6e7e8a95b9664689/tree_sitter_yaml-0.7.2.tar.gz", hash = "sha256:756db4c09c9d9e97c81699e8f941cb8ce4e51104927f6090eefe638ee567d32c", size = 84882, upload-time = "2025-10-07T14:40:36.071Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/38/29/c0b8dbff302c49ff4284666ffb6f2f21145006843bb4c3a9a85d0ec0b7ae/tree_sitter_yaml-0.7.2-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:7e269ddcfcab8edb14fbb1f1d34eed1e1e26888f78f94eedfe7cc98c60f8bc9f", size = 43898, upload-time = "2025-10-07T14:40:29.486Z" }, - { url = "https://files.pythonhosted.org/packages/18/0d/15a5add06b3932b5e4ce5f5e8e179197097decfe82a0ef000952c8b98216/tree_sitter_yaml-0.7.2-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:0807b7966e23ddf7dddc4545216e28b5a58cdadedcecca86b8d8c74271a07870", size = 44691, upload-time = "2025-10-07T14:40:30.369Z" }, - { url = "https://files.pythonhosted.org/packages/72/92/c4b896c90d08deb8308fadbad2210fdcc4c66c44ab4292eac4e80acb4b61/tree_sitter_yaml-0.7.2-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f1a5c60c98b6c4c037aae023569f020d0c489fad8dc26fdfd5510363c9c29a41", size = 91430, upload-time = "2025-10-07T14:40:31.16Z" }, - { url = "https://files.pythonhosted.org/packages/89/59/61f1fed31eb6d46ff080b8c0d53658cf29e10263f41ef5fe34768908037a/tree_sitter_yaml-0.7.2-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:88636d19d0654fd24f4f242eaaafa90f6f5ebdba8a62e4b32d251ed156c51a2a", size = 92428, upload-time = "2025-10-07T14:40:31.954Z" }, - { url = "https://files.pythonhosted.org/packages/e3/62/a33a04d19b7f9a0ded780b9c9fcc6279e37c5d00b89b00425bb807a22cc2/tree_sitter_yaml-0.7.2-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1d2e8f0bb14aa4537320952d0f9607eef3021d5aada8383c34ebeece17db1e06", size = 90580, upload-time = "2025-10-07T14:40:33.037Z" }, - { url = "https://files.pythonhosted.org/packages/6c/e7/9525defa7b30792623f56b1fba9bbba361752348875b165b8975b87398fd/tree_sitter_yaml-0.7.2-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:74ca712c50fc9d7dbc68cb36b4a7811d6e67a5466b5a789f19bf8dd6084ef752", size = 90455, upload-time = "2025-10-07T14:40:33.778Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/d6/8d1e1ace03db3b02e64e91daf21d1347941d1bbecc606a5473a1a605250d/tree_sitter_yaml-0.7.2-cp310-abi3-win_amd64.whl", hash = "sha256:7587b5ca00fc4f9a548eff649697a3b395370b2304b399ceefa2087d8a6c9186", size = 45514, upload-time = "2025-10-07T14:40:34.562Z" }, - { url = "https://files.pythonhosted.org/packages/d8/c7/dcf3ea1c4f5da9b10353b9af4455d756c92d728a8f58f03c480d3ef0ead5/tree_sitter_yaml-0.7.2-cp310-abi3-win_arm64.whl", hash = "sha256:f63c227b18e7ce7587bce124578f0bbf1f890ac63d3e3cd027417574273642c4", size = 44065, upload-time = "2025-10-07T14:40:35.337Z" }, -] - -[[package]] -name = "typer-slim" -version = "0.20.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8e/45/81b94a52caed434b94da65729c03ad0fb7665fab0f7db9ee54c94e541403/typer_slim-0.20.0.tar.gz", hash = "sha256:9fc6607b3c6c20f5c33ea9590cbeb17848667c51feee27d9e314a579ab07d1a3", size = 106561, upload-time = "2025-10-20T17:03:46.642Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/dd/5cbf31f402f1cc0ab087c94d4669cfa55bd1e818688b910631e131d74e75/typer_slim-0.20.0-py3-none-any.whl", hash = "sha256:f42a9b7571a12b97dddf364745d29f12221865acef7a2680065f9bb29c7dc89d", size = 47087, upload-time = "2025-10-20T17:03:44.546Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.15.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, -]