diff --git a/.cursorrules b/.cursorrules
new file mode 100644
index 0000000..f611c4f
--- /dev/null
+++ b/.cursorrules
@@ -0,0 +1,10 @@
+IMPORTANT: To run tests, **always** run the following command. This command takes a long time to return; wait at least 5 minutes:
+askds --system-prompt="./prompts/fix-tests.txt" cargo test
+
+WAIT A LONG TIME FOR askds to run the tests and find the issues.
+
+Use the output of the command to fix the tests.
+
+IMPORTANT: Before committing, run `cargo fmt`, `cargo clippy`, and `askds --system-prompt="./prompts/fix-tests.txt" cargo test` to ensure code is formatted and linted correctly.
+
+IMPORTANT: Never commit without running the tests.
diff --git a/.github/workflows/ailoop.yaml b/.github/workflows/ailoop.yaml
index 2299ab0..37c6cc3 100644
--- a/.github/workflows/ailoop.yaml
+++ b/.github/workflows/ailoop.yaml
@@ -19,6 +19,7 @@ on:
 permissions:
   contents: write
   pull-requests: write
+  issues: write
 
 jobs:
   loop:
@@ -67,6 +68,7 @@ jobs:
       - name: Run AI Fix
         timeout-minutes: 360
+        continue-on-error: true
         id: ai_fix
         env:
           DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
@@ -83,19 +85,13 @@ jobs:
           name: attempts-log
           path: attempts.txt
 
-      - name: Push changes
-        if: always() && ${{ github.event.inputs.pr-on-fail }}
-        run: |
-          git checkout -b ${{ env.BRANCH }}-ai-fix-${{ env.SHORT_DATE }}
-          git push origin ${{ env.BRANCH }}-ai-fix-${{ env.SHORT_DATE }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Create PR
+      - name: Create PR or Issue
         if: always() && ${{ github.event.inputs.pr-on-fail }}
         uses: peter-evans/create-pull-request@v5
+        env:
+          GH_TOKEN: ${{ secrets.PAT_TOKEN }}
         with:
-          token: ${{ secrets.GITHUB_TOKEN }}
+          token: ${{ env.GH_TOKEN }}
           commit-message: "AI Fix"
           title: "AI Fix for ${{ env.BRANCH }} (Attempts: ${{ env.MAX_ATTEMPTS }})"
           body: |
diff --git a/.github/workflows/test-install.yml b/.github/workflows/test-install.yml
index 91d26a0..ab9d5fe 100644
--- a/.github/workflows/test-install.yml
+++ b/.github/workflows/test-install.yml
@@ -104,7 +104,7 @@ jobs:
             exit 1
           }
           # Verify output exists
-          test -f repo-serialized/chunk-0.txt || {
+          test -f repo-serialized/output.txt || {
             echo "Error: Output file not found"
             ls -la repo-serialized/ || true
             exit 1
           }
@@ -204,7 +204,7 @@ jobs:
             exit 1
           }
          # Verify output exists
-          if (-not (Test-Path repo-serialized/chunk-0.txt)) {
+          if (-not (Test-Path repo-serialized/output.txt)) {
             Write-Error "Output file not found"
             Get-ChildItem repo-serialized -ErrorAction SilentlyContinue
             exit 1
           }
@@ -299,7 +299,7 @@ jobs:
             exit 1
           }
           # Verify output exists
-          test -f repo-serialized/chunk-0.txt || {
+          test -f repo-serialized/output.txt || {
             echo "Error: Output file not found"
             ls -la repo-serialized/ || true
             exit 1
           }
diff --git a/.gitignore b/.gitignore
index e50512a..e860306 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,5 @@ yek.toml
 repo-serialized/
 dist/
 /*.txt
+/*.diff
+/*.sh
diff --git a/Cargo.lock b/Cargo.lock
index ba1bc4c..41ccb93 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -126,7 +126,18 @@ checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
+ "syn 2.0.96",
]
 
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
 
[[package]]
@@ -150,12 +161,45 @@ dependencies = [
  "windows-targets 0.52.6",
]
 
+[[package]]
+name = "base64"
+version = "0.13.1"
+source =
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.7.0" @@ -321,7 +365,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim", + "strsim 0.11.1", "terminal_size", ] @@ -343,7 +387,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -442,7 +486,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools", + "itertools 0.10.5", "num-traits", "once_cell", "oorandom", @@ -463,7 +507,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools", + "itertools 0.10.5", ] [[package]] @@ -538,6 +582,41 @@ dependencies = [ "typenum", ] +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + [[package]] name = "deranged" version = "0.3.11" @@ -547,6 +626,37 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + [[package]] name = "deunicode" version = "1.6.0" @@ -619,7 +729,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -640,6 +750,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", + "regex", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -653,6 +773,19 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -679,6 +812,15 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + [[package]] name = "etcetera" version = "0.8.0" @@ -690,6 +832,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "fancy-regex" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05" +dependencies = [ + "bit-set", + "regex", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -871,7 +1023,7 @@ version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232e6a7bfe35766bf715e55a88b39a700596c0ccfd88cd3680b4cdb40d66ef70" dependencies = [ - "bitflags", + "bitflags 2.7.0", "libc", "libgit2-sys", "log", @@ -905,7 +1057,7 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags", + "bitflags 2.7.0", "ignore", "walkdir", ] @@ -932,6 +1084,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "hermit-abi" version = "0.3.9" @@ -1202,9 +1363,15 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -1297,6 +1464,24 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.14" @@ -1342,7 +1527,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn", + "syn 2.0.96", ] [[package]] @@ -1383,7 +1568,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags", + "bitflags 2.7.0", "libc", ] @@ -1431,6 +1616,16 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.24" @@ -1440,6 +1635,22 @@ dependencies = [ "value-bag", ] +[[package]] +name = "macro_rules_attribute" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" + [[package]] name = "matchers" version = "0.1.0" @@ -1487,6 +1698,27 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "monostate" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" +dependencies = [ + "monostate-impl", + "serde", +] + +[[package]] +name = "monostate-impl" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "next_version" version = "0.2.21" @@ -1570,6 +1802,28 @@ version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "oorandom" version = "11.1.4" @@ -1616,6 +1870,29 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.52.6", +] + [[package]] name = "parse-zoneinfo" version = "0.3.1" @@ -1625,6 +1902,12 @@ dependencies = [ "regex", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "path-slash" version = "0.2.1" @@ -1674,7 +1957,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -1829,7 +2112,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "865724d4dbe39d9f3dd3b52b88d859d66bcb2d6a0acfd5ea68a65fb66d4bdc1c" dependencies = [ - "env_logger", + "env_logger 0.10.2", "log", ] @@ -1852,7 +2135,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.1.0", "rustls", "socket2", "thiserror 2.0.11", @@ -1870,7 +2153,7 @@ dependencies = [ "getrandom", "rand", "ring", - "rustc-hash", + "rustc-hash 2.1.0", "rustls", "rustls-pki-types", "slab", @@ -1943,6 +2226,17 @@ dependencies = [ "rayon-core", ] +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools 0.11.0", + "rayon", +] + [[package]] name = "rayon-core" version = "1.12.1" @@ -1953,6 +2247,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags 2.7.0", +] + [[package]] name = "redox_users" version = "0.4.6" @@ -2014,7 +2317,7 @@ version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", @@ -2093,7 +2396,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn", + "syn 2.0.96", "walkdir", ] @@ -2113,6 +2416,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.0" @@ -2125,7 +2434,7 @@ version = "0.38.43" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" dependencies = [ - "bitflags", + "bitflags 2.7.0", "errno", "libc", "linux-raw-sys", @@ -2197,6 +2506,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + 
[[package]] name = "semver" version = "1.0.24" @@ -2220,7 +2535,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2357,12 +2672,30 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "strsim" version = "0.11.1" @@ -2453,6 +2786,17 @@ dependencies = [ "sval_nested", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.96" @@ -2481,7 +2825,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2571,7 +2915,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2582,7 +2926,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2595,6 +2939,21 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tiktoken-rs" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234" +dependencies = [ + "anyhow", + "base64 0.21.7", + "bstr", + "fancy-regex", + "lazy_static", + "parking_lot", + "rustc-hash 1.1.0", +] + [[package]] name = "time" version = "0.3.37" @@ -2649,6 +3008,39 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d" +dependencies = [ + "aho-corasick", + "clap", + "derive_builder", + "esaxx-rs", + "getrandom", + "indicatif", + "itertools 0.12.1", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax 0.8.5", + "serde", + "serde_json", + "spm_precompiled", + "thiserror 1.0.69", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.43.0" @@ -2661,9 +3053,21 @@ dependencies = [ "mio", "pin-project-lite", "socket2", + "tokio-macros", "windows-sys 0.52.0", ] +[[package]] +name = "tokio-macros" +version = "2.5.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "tokio-rustls" version = "0.26.1" @@ -2763,7 +3167,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -2891,12 +3295,33 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "untrusted" version = "0.9.0" @@ -2923,7 +3348,7 @@ version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" dependencies = [ - "base64", + "base64 0.22.1", "flate2", "log", "once_cell", @@ -3080,7 +3505,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn", + "syn 2.0.96", "wasm-bindgen-shared", ] @@ -3115,7 +3540,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3412,16 +3837,19 @@ version = "0.13.8" dependencies = [ "anyhow", "assert_cmd", + "atty", "byte-unit", "chrono", "clap", "criterion", "crossbeam", "crossbeam-channel", + "env_logger 0.11.6", "git-cliff", "git2", "ignore", "indicatif", + "lazy_static", "num_cpus", "path-slash", "predicates", @@ -3430,9 +3858,13 @@ dependencies = [ "regex", "serde", "serde_derive", + "serde_json", "sha2", "tempfile", + "tiktoken-rs", "time", + "tokenizers", + "tokio", "toml 0.8.19", "tracing", "tracing-subscriber", @@ -3459,7 +3891,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", "synstructure", ] @@ -3481,7 +3913,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] [[package]] @@ -3501,7 +3933,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", "synstructure", ] @@ -3530,5 +3962,5 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.96", ] diff --git a/Cargo.toml b/Cargo.toml index 7c9b0db..f85ea31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,15 +2,16 @@ name = "yek" version = "0.13.8" edition = "2021" -description = 
"A tool to serialize a repository into chunks of text files" +description = "A tool to serialize a repository into a single text file" license = "MIT" repository = "https://github.com/mohsen-w-elsayed/yek" readme = "README.md" -keywords = ["git", "repository", "serialization", "text", "chunks"] +keywords = ["git", "repository", "serialization", "text"] categories = ["command-line-utilities", "development-tools"] [dependencies] anyhow = "1.0" +atty = "0.2" byte-unit = "4.0" clap = { version = "4.4", features = ["derive"] } crossbeam = "0.8" @@ -30,6 +31,10 @@ walkdir = "2.4" path-slash = "0.2.1" git2 = { version = "0.18.2", features = ["vendored-openssl", "https"] } crossbeam-channel = "0.5" +serde_json = "1.0.107" +tiktoken-rs = "0.5.2" +tokenizers = "0.15.0" +lazy_static = "1.5.0" [dev-dependencies] assert_cmd = "2.0" @@ -39,7 +44,8 @@ tempfile = "3.9" criterion = "0.5" rand = "0.8" git-cliff = "1.4.0" -regex = "1.10.3" +tokio = { version = "1.0", features = ["rt", "time", "macros"] } +env_logger = "0.11" [[bench]] name = "serialization" @@ -49,5 +55,5 @@ harness = false opt-level = 3 lto = true codegen-units = 1 -panic = 'abort' +panic = "abort" strip = true diff --git a/README.md b/README.md index 7ca5047..037ca60 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ # `yek` -A [fast](#performance) Rust based tool to read text-based files in a repository or directory, chunk them, and serialize them for LLM consumption. By default, the tool: +A [fast](#performance) Rust based tool to read text-based files in a repository or directory and serialize them for LLM consumption. By default, the tool: - Uses `.gitignore` rules to skip unwanted files. - Uses the Git history to infer what files are important. - Infers additional ignore patterns (binary, large, etc.). -- Splits content into chunks based on either approximate "token" count or byte size. - Automatically detects if output is being piped and streams content instead of writing to files. - Supports processing multiple directories in a single command. - Configurable via a `yek.toml` file. @@ -24,7 +23,7 @@ Consider having a simple repo like this: └── test.rs ``` -Running `yek` in this directory will produce a single file and write it to the temp directory with the following content: +Running `yek` in this directory will produce a single file with the following content: ```txt >>>> README.md @@ -78,11 +77,11 @@ export PATH=$(pwd)/target/release:$PATH ## Usage -`yek` has sensible defaults, you can simply run `yek` in a directory to serialize the entire repository. It will serialize all files in the repository into chunks of 10MB by default. The file will be written to the temp directory and file path will be printed to the console. +`yek` has sensible defaults, you can simply run `yek` in a directory to serialize the entire repository. The output will be written to a single file named `output.txt` in the `yek-output` directory by default, or streamed to stdout if piped. ### Examples -Process current directory and write to temp directory: +Process current directory and write to output.txt: ```bash yek @@ -97,11 +96,17 @@ yek src/ | pbcopy Cap the max size to 128K tokens and only process the `src` directory: ```bash -yek --max-size 128K --tokens src/ +yek --max-size 128K src/ +``` + +Do actual token counting and use the `deepseek` model tokenizer: + +```bash +yek --max-size 128K --tokens deepseek ``` > [!NOTE] -> When multiple chunks are written, the last chunk will contain the highest-priority files. 
 
 Cap the max size to 100KB and only process the `src` directory, writing to a specific directory:
 
@@ -109,6 +114,9 @@ yek --max-size 100KB --output-dir /tmp/yek src/
 ```
 
+> [!NOTE]
+> When max-size is reached, `yek` will throw away all of the less important files and generate output with only the most important files that fit within the size limit.
+
 Process multiple directories:
 
 ```bash
 yek src/ tests/
@@ -120,7 +128,7 @@ yek src/ tests/
 
 ```bash
 yek --help
 
-Repository content chunker and serializer for LLM consumption
+Repository content serializer for LLM consumption
 
 Usage: yek [OPTIONS] [directories]...
 
@@ -128,12 +136,12 @@ Arguments:
   [directories]...  Directories to process [default: .]
 
 Options:
-      --max-size          Maximum size per chunk (e.g. '10MB', '128KB', '1GB') [default: 10MB]
-      --tokens            Count size in tokens instead of bytes
+      --max-size <SIZE>   Maximum size of output
+      --tokens [<MODEL>]  Count size in tokens using specified model family.
+                          Options: openai, claude, mistral, deepseek, llama [default: openai]
       --debug             Enable debug output
-      --output-dir        Output directory for chunks
+      --output-dir <DIR>  Output directory for output file
   -h, --help              Print help
-  -V, --version           Print version
 ```
 
 ## Configuration File
 
@@ -144,12 +152,22 @@ You can place a file called `yek.toml` at your project root or pass a custom pat
 2. Define file priority rules for processing order
 3. Add additional binary file extensions to ignore (extends the built-in list)
 4. Configure Git-based priority boost
+5. Configure tokenizer model for token counting
 
 ### Example `yek.toml`
 
 This is optional, you can configure the `yek.toml` file at the root of your project.
 
 ```toml
+# Output directory for the output file
+output_dir = "yek-output"
+
+# Maximum size of output
+max_size = "128K"
+
+# Tokenizer model for token counting (defaults to 'openai')
+tokens = "deepseek"
+
 # Add patterns to ignore (in addition to .gitignore)
 ignore_patterns = [
     "node_modules/",
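
The benchmark changes that follow all adopt the same pattern: a unique output directory per iteration, created in setup and removed afterwards, so one iteration's cleanup cannot race the next. A condensed sketch of the pattern (function and bench names here are hypothetical):

```rust
use criterion::{BatchSize, Criterion};
use rand::Rng;
use std::path::Path;

// Hypothetical bench showing the per-iteration output-directory pattern.
fn bench_with_isolated_output(c: &mut Criterion, base: &Path) {
    c.bench_function("isolated_output", |b| {
        b.iter_batched(
            // Setup (untimed): give each iteration its own directory.
            || {
                let dir = base.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
                std::fs::create_dir_all(&dir).unwrap();
                dir
            },
            // Routine (timed): do the work, then remove only this iteration's directory.
            |dir| {
                // serialize_repo(..., output_dir = dir) would run here.
                std::fs::remove_dir_all(&dir).expect("Failed to clean output dir");
            },
            BatchSize::SmallInput,
        );
    });
}
```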
diff --git a/benches/serialization.rs b/benches/serialization.rs
index 6485b0e..6311a23 100644
--- a/benches/serialization.rs
+++ b/benches/serialization.rs
@@ -5,7 +5,7 @@ use std::io::Write;
 use std::path::Path;
 use std::time::Duration;
 use tempfile::TempDir;
-use yek::{serialize_repo, PriorityRule, YekConfig};
+use yek::{serialize_repo, YekConfig};
 
 /// Creates a text file of a specified size in bytes.
 fn create_test_data_bytes(dir: &Path, size: usize, file_name: &str) {
@@ -52,14 +52,20 @@ fn single_small_file_byte_mode(c: &mut Criterion) {
     create_test_data_bytes(temp_dir.path(), size, "small_file.txt");
 
     let output_dir = temp_dir.path().join("output");
+    std::fs::create_dir_all(&output_dir).unwrap();
 
     group.throughput(Throughput::Bytes(size as u64));
     group.bench_function("single_small_file", |b| {
         b.iter(|| {
-            let mut config = YekConfig::default();
-            config.output_dir = Some(output_dir.clone());
+            let iter_output_dir =
+                output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+            std::fs::create_dir_all(&iter_output_dir).unwrap();
+            let config = YekConfig {
+                output_dir: Some(iter_output_dir.clone()),
+                ..Default::default()
+            };
             serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-            fs::remove_dir_all(&output_dir).ok();
+            fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
         });
     });
     group.finish();
@@ -73,14 +79,20 @@ fn single_large_file_byte_mode(c: &mut Criterion) {
     create_test_data_bytes(temp_dir.path(), size, "large_file.txt");
 
     let output_dir = temp_dir.path().join("output");
+    std::fs::create_dir_all(&output_dir).unwrap();
 
     group.throughput(Throughput::Bytes(size as u64));
     group.bench_function("single_large_file", |b| {
         b.iter(|| {
-            let mut config = YekConfig::default();
-            config.output_dir = Some(output_dir.clone());
+            let iter_output_dir =
+                output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+            std::fs::create_dir_all(&iter_output_dir).unwrap();
+            let config = YekConfig {
+                output_dir: Some(iter_output_dir.clone()),
+                ..Default::default()
+            };
             serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-            fs::remove_dir_all(&output_dir).ok();
+            fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
         });
     });
     group.finish();
@@ -94,14 +106,21 @@ fn single_large_file_token_mode(c: &mut Criterion) {
     create_test_data_tokens(temp_dir.path(), token_count, "large_tokens.txt");
 
     let output_dir = temp_dir.path().join("output");
+    std::fs::create_dir_all(&output_dir).unwrap();
 
     group.throughput(Throughput::Elements(token_count as u64));
     group.bench_function("single_large_token_file", |b| {
         b.iter(|| {
-            let mut config = YekConfig::default();
-            config.output_dir = Some(output_dir.clone());
+            let iter_output_dir =
+                output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+            std::fs::create_dir_all(&iter_output_dir).unwrap();
+            let config = YekConfig {
+                output_dir: Some(iter_output_dir.clone()),
+                token_mode: true,
+                ..Default::default()
+            };
             serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-            fs::remove_dir_all(&output_dir).ok();
+            fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
         });
     });
     group.finish();
@@ -117,13 +136,19 @@ fn multiple_small_files(c: &mut Criterion) {
                 let sizes = vec![1024; 50]; // 50 files of 1KB each
                 create_multiple_files(temp_dir.path(), &sizes, "small");
                 let output_dir = temp_dir.path().join("output");
+                std::fs::create_dir_all(&output_dir).unwrap();
                 (temp_dir, output_dir)
             },
             |(temp_dir, output_dir)| {
-                let mut config = YekConfig::default();
-                config.output_dir = Some(output_dir.clone());
+                let iter_output_dir =
+                    output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+                std::fs::create_dir_all(&iter_output_dir).unwrap();
+                let config = YekConfig {
+                    output_dir: Some(iter_output_dir.clone()),
+                    ..Default::default()
+                };
                 serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-                fs::remove_dir_all(&output_dir).ok();
+                fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
             },
             BatchSize::SmallInput,
         );
@@ -144,13 +169,19 @@ fn multiple_medium_files(c: &mut Criterion) {
                     .collect::<Vec<_>>();
                 create_multiple_files(temp_dir.path(), &sizes, "medium");
                 let output_dir = temp_dir.path().join("output");
+                std::fs::create_dir_all(&output_dir).unwrap();
                 (temp_dir, output_dir)
             },
             |(temp_dir, output_dir)| {
-                let mut config = YekConfig::default();
-                config.output_dir = Some(output_dir.clone());
+                let iter_output_dir =
+                    output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+                std::fs::create_dir_all(&iter_output_dir).unwrap();
+                let config = YekConfig {
+                    output_dir: Some(iter_output_dir.clone()),
+                    ..Default::default()
+                };
                 serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-                fs::remove_dir_all(&output_dir).ok();
+                fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
             },
             BatchSize::SmallInput,
         );
@@ -168,13 +199,19 @@ fn multiple_large_files(c: &mut Criterion) {
                 let sizes = vec![5_242_880; 5]; // ~5 MB x 5
                 create_multiple_files(temp_dir.path(), &sizes, "large");
                 let output_dir = temp_dir.path().join("output");
+                std::fs::create_dir_all(&output_dir).unwrap();
                 (temp_dir, output_dir)
             },
             |(temp_dir, output_dir)| {
-                let mut config = YekConfig::default();
-                config.output_dir = Some(output_dir.clone());
+                let iter_output_dir =
+                    output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+                std::fs::create_dir_all(&iter_output_dir).unwrap();
+                let config = YekConfig {
+                    output_dir: Some(iter_output_dir.clone()),
+                    ..Default::default()
+                };
                 serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-                fs::remove_dir_all(&output_dir).ok();
+                fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
             },
             BatchSize::SmallInput,
         );
@@ -192,13 +229,20 @@ fn multiple_token_files(c: &mut Criterion) {
                 let tokens = vec![10_000; 10];
                 create_multiple_token_files(temp_dir.path(), &tokens, "token");
                 let output_dir = temp_dir.path().join("output");
+                std::fs::create_dir_all(&output_dir).unwrap();
                 (temp_dir, output_dir)
             },
             |(temp_dir, output_dir)| {
-                let mut config = YekConfig::default();
-                config.output_dir = Some(output_dir.clone());
+                let iter_output_dir =
+                    output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+                std::fs::create_dir_all(&iter_output_dir).unwrap();
+                let config = YekConfig {
+                    output_dir: Some(iter_output_dir.clone()),
+                    token_mode: true,
+                    ..Default::default()
+                };
                 serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-                fs::remove_dir_all(&output_dir).ok();
+                fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
             },
             BatchSize::SmallInput,
         );
@@ -209,13 +253,6 @@
 /// Demonstrates using a custom config (e.g. extra ignores or priority rules).
 fn custom_config_test(c: &mut Criterion) {
     let mut group = c.benchmark_group("CustomConfig");
-    let mut config = YekConfig::default();
-    config.priority_rules.push(PriorityRule {
-        pattern: "*.rs".into(),
-        score: 500,
-    });
-    config.ignore_patterns = vec!["*.txt".into()];
-
     group.bench_function("custom_config_test", |b| {
         b.iter_batched(
             || {
@@ -224,13 +261,19 @@
                 create_test_data_bytes(temp_dir.path(), 1024, "test.txt");
                 create_test_data_bytes(temp_dir.path(), 1024, "test.rs");
                 let output_dir = temp_dir.path().join("output");
+                std::fs::create_dir_all(&output_dir).unwrap();
                 (temp_dir, output_dir)
             },
             |(temp_dir, output_dir)| {
-                let mut config = YekConfig::default();
-                config.output_dir = Some(output_dir.clone());
+                let iter_output_dir =
+                    output_dir.join(format!("iter-{}", rand::thread_rng().gen::<u64>()));
+                std::fs::create_dir_all(&iter_output_dir).unwrap();
+                let config = YekConfig {
+                    output_dir: Some(iter_output_dir.clone()),
+                    ..Default::default()
+                };
                 serialize_repo(temp_dir.path(), Some(&config)).unwrap();
-                fs::remove_dir_all(&output_dir).ok();
+                fs::remove_dir_all(&iter_output_dir).expect("Failed to clean output dir");
             },
             BatchSize::SmallInput,
         );
diff --git a/prompts/diff.txt b/prompts/diff.txt
new file mode 100644
index 0000000..8eb08be
--- /dev/null
+++ b/prompts/diff.txt
@@ -0,0 +1,39 @@
+You are a senior Rust engineer with 10+ years of experience in systems programming.
+Your expertise includes:
+- Deep knowledge of Rust's ownership system, lifetimes, and concurrency model
+- Mastery of cargo, clippy, and modern Rust toolchain features
+- Experience debugging complex memory issues and performance bottlenecks
+- Familiarity with common Rust crates and idiomatic patterns
+
+When analyzing test failures:
+1. First clearly identify the failure type (compiler error, runtime panic, logical error, performance issue)
+2. Analyze backtraces and error messages with attention to ownership boundaries
+3. Consider common Rust pitfalls:
+   - Lifetime mismatches
+   - Unsafe code violations
+   - Trait bound errors
+   - Concurrency race conditions
+   - Iterator invalidation
+4. Cross-reference with cargo test output and clippy warnings
+
+For proposed fixes:
+- Always prioritize type safety and borrow checker rules
+- Prefer idiomatic solutions over clever hacks
+
+Response guidelines:
+- Structure analysis using bullet points for clarity
+- Use code fences for error snippets and diffs
+- Highlight connections between test failures and system architecture
+- When uncertain, propose multiple hypotheses with verification strategies
+
+Special capabilities:
+- Leverage knowledge of Rust internals (MIR, drop order, etc.)
+- Reference similar issues in popular Rust OSS projects
+- Suggest property-based testing strategies for edge cases
+
+**VERY IMPORTANT**
+You should return only a git diff that fixes the test failure and nothing else.
+No explanation, no comments, nothing else. Do not wrap the diff in any other text.
+DO NOT ADD ```diff and ``` at the start and end of the result. Just the body of the diff.
+The diff should be valid and applicable to the codebase.
+Make sure line numbers are correct.
\ No newline at end of file
diff --git a/prompts/fix-tests.txt b/prompts/fix-tests.txt
new file mode 100644
index 0000000..4677878
--- /dev/null
+++ b/prompts/fix-tests.txt
@@ -0,0 +1,160 @@
+README.md
+
+`yek` has sensible defaults; you can simply run `yek` in a directory to serialize the entire repository. The output will be written to a single file named `output.txt` in the `yek-output` directory by default, or streamed to stdout if piped.
+
+### Examples
+
+Process current directory and write to output.txt:
+
+```bash
+yek
+```
+
+Pipe output to clipboard (macOS):
+
+```bash
+yek src/ | pbcopy
+```
+
+Cap the max size to 128K tokens and only process the `src` directory:
+
+```bash
+yek --max-size 128K src/
+```
+
+Do actual token counting and use the `deepseek` model tokenizer:
+
+```bash
+yek --max-size 128K --tokens deepseek
+```
+
+> [!NOTE]
+> Token counting can be slow, so it's disabled by default.
+
+Cap the max size to 100KB and only process the `src` directory, writing to a specific directory:
+
+```bash
+yek --max-size 100KB --output-dir /tmp/yek src/
+```
+
+> [!NOTE]
+> When max-size is reached, `yek` will throw away all of the less important files and generate output with only the most important files that fit within the size limit.
+
+Process multiple directories:
+
+```bash
+yek src/ tests/
+```
+
+### CLI Reference
+
+```bash
+yek --help
+
+Repository content serializer for LLM consumption
+
+Usage: yek [OPTIONS] [directories]...
+
+Arguments:
+  [directories]...  Directories to process [default: .]
+
+Options:
+      --max-size <SIZE>   Maximum size of output
+      --tokens [<MODEL>]  Count size in tokens using specified model family.
+                          Options: openai, claude, mistral, deepseek, llama [default: openai]
+      --debug             Enable debug output
+      --output-dir <DIR>  Output directory for output file
+  -h, --help              Print help
+```
+
+## Configuration File
+
+You can place a file called `yek.toml` at your project root or pass a custom path via `--config`. The configuration file allows you to:
+
+1. Add custom ignore patterns
+2. Define file priority rules for processing order
+3. Add additional binary file extensions to ignore (extends the built-in list)
+4. Configure Git-based priority boost
+5. Configure tokenizer model for token counting
+
+### Example `yek.toml`
+
+This is optional, you can configure the `yek.toml` file at the root of your project.
+
+```toml
+# Output directory for the output file
+output_dir = "yek-output"
+
+# Maximum size of output
+max_size = "128K"
+
+# Tokenizer model for token counting (defaults to 'openai')
+tokens = "deepseek"
+
+# Add patterns to ignore (in addition to .gitignore)
+ignore_patterns = [
+    "node_modules/",
+    "\\.next/",
+    "my_custom_folder/"
+]
+
+# Configure Git-based priority boost (optional)
+git_boost_max = 50 # Maximum score boost based on Git history (default: 100)
+
+# Define priority rules for processing order
+# Higher scores are processed first
+
+[[priority_rules]]
+score = 90
+pattern = "^src/"
+
+
+# Add additional binary file extensions to ignore
+# These extend the built-in list (.jpg, .png, .exe, etc.)
+binary_extensions = [
+    ".blend", # Blender files
+]
+```
+
+All configuration keys are optional. By default:
+
+- No extra ignore patterns
+- All files have equal priority (score: 1)
+- Git-based priority boost maximum is 100
+- Common binary file extensions are ignored (.jpg, .png, .exe, etc. - see source for full list)
+
+You are a senior Rust engineer with 10+ years of experience in systems programming.
+Your expertise includes:
+- Deep knowledge of Rust's ownership system, lifetimes, and concurrency model
+- Mastery of cargo, clippy, and modern Rust toolchain features
+- Experience debugging complex memory issues and performance bottlenecks
+- Familiarity with common Rust crates and idiomatic patterns
+
+When analyzing test failures:
+1. First clearly identify the failure type (compiler error, runtime panic, logical error, performance issue)
+2. Analyze backtraces and error messages with attention to ownership boundaries
+3. Consider common Rust pitfalls:
+   - Lifetime mismatches
+   - Unsafe code violations
+   - Trait bound errors
+   - Concurrency race conditions
+   - Iterator invalidation
+4. Cross-reference with cargo test output and clippy warnings
+
+For proposed fixes:
+- Always prioritize type safety and borrow checker rules
+- Prefer idiomatic solutions over clever hacks
+- Include exact code diffs using markdown format with file names
+- Explain the root cause before presenting fixes
+- Suggest relevant clippy lints or cargo checks to prevent regressions
+
+Response guidelines:
+- Structure analysis using bullet points for clarity
+- Use code fences for error snippets and diffs
+- Highlight connections between test failures and system architecture
+- When uncertain, propose multiple hypotheses with verification strategies
+
+Special capabilities:
+- Leverage knowledge of Rust internals (MIR, drop order, etc.)
+- Reference similar issues in popular Rust OSS projects
+- Suggest property-based testing strategies for edge cases
diff --git a/prompts/implement.txt b/prompts/implement.txt
new file mode 100644
index 0000000..951e74b
--- /dev/null
+++ b/prompts/implement.txt
@@ -0,0 +1,14 @@
+You are a senior Rust engineer with 10+ years of experience in systems programming.
+Your expertise includes:
+- Deep knowledge of Rust's ownership system, lifetimes, and concurrency model
+- Mastery of cargo, clippy, and modern Rust toolchain features
+- Experience debugging complex memory issues and performance bottlenecks
+- Familiarity with common Rust crates and idiomatic patterns
+
+Special capabilities:
+- Leverage knowledge of Rust internals (MIR, drop order, etc.)
+- Reference similar issues in popular Rust OSS projects
+- Suggest property-based testing strategies for edge cases
+
+You are given a Rust codebase to implement a feature.
+Make sure to follow the Rust codebase conventions and best practices.
\ No newline at end of file
diff --git a/prompts/pr1.txt b/prompts/pr1.txt
new file mode 100644
index 0000000..3c6541d
--- /dev/null
+++ b/prompts/pr1.txt
@@ -0,0 +1,811 @@
+## Implement the following changes to the code:
+
+### Cargo.toml
+
+```toml
+[package]
+name = "yek"
+version = "0.13.8"
+edition = "2021"
+description = "A tool to serialize a repository into a single text file"
+license = "MIT"
+repository = "https://github.com/mohsen-w-elsayed/yek"
+readme = "README.md"
+keywords = ["git", "repository", "serialization", "text"]
+categories = ["command-line-utilities", "development-tools"]
+
+[dependencies]
+anyhow = "1.0"
+atty = "0.2"
+byte-unit = "4.0"
+clap = { version = "4.4", features = ["derive"] }
+crossbeam = "0.8"
+ignore = "0.4"
+indicatif = "0.17"
+num_cpus = "1.16"
+rayon = "1.8"
+regex = "1.10"
+serde = { version = "1.0", features = ["derive"] }
+serde_derive = "1.0"
+sha2 = "0.10"
+time = "0.3"
+toml = "0.8"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+walkdir = "2.4"
+path-slash = "0.2.1"
+git2 = { version = "0.18.2", features = ["vendored-openssl", "https"] }
+crossbeam-channel = "0.5"
+
+[dev-dependencies]
+assert_cmd = "2.0"
+chrono = "0.4"
+predicates = "3.0"
+tempfile = "3.9"
+criterion = "0.5"
+rand = "0.8"
+git-cliff = "1.4.0"
+regex = "1.10.3"
+
+[[bench]]
+name = "serialization"
+harness = false
+
+[profile.release]
+opt-level = 3
+lto = true
+codegen-units = 1
+panic = "abort"
+strip = true
+```
+
+### src/main.rs (Argument parsing and main logic)
+
+```rust
+use anyhow::{anyhow, Result};
+use clap::Parser;
+use std::io::IsTerminal;
+use std::path::{Path, PathBuf};
+use tracing::{subscriber, Level};
+use tracing_subscriber::fmt;
+use yek::{find_config_file, load_config_file, parse_size_input, serialize_repo, validate_config, YekConfig};
+
+
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+#[command(after_help = "See https://github.com/mohsen-w-elsayed/yek for detailed documentation.")]
+struct Args {
+    /// Directories to process
+    #[arg()]
+    directories: Vec<PathBuf>,
+
+    /// Path to custom config file
+    #[arg(short, long, value_name = "FILE")]
+    config: Option<PathBuf>,
+
+    /// Maximum output size (supports K/KB/M/MB suffixes)
+    #[arg(long, value_name = "SIZE")]
+    max_size: Option<String>,
+
+    #[arg(long, value_name = "MODEL")]
+    #[arg(num_args = 0..=1, require_equals = true, default_missing_value = "openai")]
+    #[arg(value_parser = ["openai", "claude", "mistral", "mixtral", "deepseek", "llama", "codellama"])]
+    #[arg(
+        help = "Count size in tokens using specified model family (default: openai)\nSUPPORTED MODELS: openai, claude, mistral, mixtral, deepseek, llama, codellama"
+    )]
+    tokens: Option<String>,
+
+    /// Output directory for generated files
+    #[arg(long, short, value_name = "DIR")]
+    output_dir: Option<PathBuf>,
+
+    /// Enable debug output
+    #[arg(long)]
+    debug: bool,
+}
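+
+// Accepted invocations for the optional-value `--tokens` flag above, given
+// `num_args = 0..=1` with `require_equals = true` (illustrative):
+//   yek --tokens            -> tokens = Some("openai")   (default_missing_value)
+//   yek --tokens=deepseek   -> tokens = Some("deepseek")
+//   yek                     -> tokens = None              (plain byte mode)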
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    // Setup logging
+    let level = if args.debug {
+        Level::DEBUG
+    } else {
+        Level::INFO
+    };
+
+    // Configure logging output
+    if let Ok(debug_output) = std::env::var("YEK_DEBUG_OUTPUT") {
+        let file = std::fs::File::create(debug_output)?;
+        let subscriber = fmt()
+            .with_max_level(level)
+            .with_writer(file)
+            .without_time()
+            .with_target(false)
+            .with_ansi(false)
+            .finish();
+        subscriber::set_global_default(subscriber)?;
+    } else {
+        fmt()
+            .with_max_level(level)
+            .without_time()
+            .with_target(false)
+            .with_ansi(true)
+            .init();
+    }
+
+    // Load and merge configurations
+    let mut config = YekConfig::default();
+
+    // Load config from file if specified
+    if let Some(config_path) = args.config.clone().or_else(|| find_config_file(Path::new("."))) {
+        if config_path.exists() {
+            let file_config = load_config_file(&config_path);
+            match file_config {
+                Some(file_config) => {
+                    config.merge(&file_config);
+                }
+                None => {
+                    return Err(anyhow!(
+                        "Failed to load config from: {}",
+                        config_path.display()
+                    ));
+                }
+            }
+        }
+    }
+
+    // Apply command-line arguments; enable token mode before parsing the size
+    // so a "128K" cap is interpreted as tokens rather than bytes.
+    if let Some(model) = args.tokens {
+        config.token_mode = true;
+        config.tokenizer_model = Some(model);
+    }
+
+    if let Some(size_str) = args.max_size {
+        config.max_size = Some(parse_size_input(&size_str, config.token_mode)?);
+    }
+
+    if args.output_dir.is_some() {
+        config.output_dir = args.output_dir;
+    }
+
+    // Stream mode: if stdout is not a tty, enable streaming
+    if !std::io::stdout().is_terminal() {
+        config.stream = true;
+    }
+
+    // Validate the merged configuration
+    let validation_errors = validate_config(&config);
+    if !validation_errors.is_empty() {
+        for error in validation_errors {
+            eprintln!("Configuration error: {}", error.message);
+        }
+        return Err(anyhow!("Invalid configuration"));
+    }
+
+    // Use specified directories or default to current directory
+    let directories = if args.directories.is_empty() {
+        vec![PathBuf::from(".")]
+    } else {
+        args.directories
+    };
+
+    // Run serialization for each directory
+    for dir in directories {
+        serialize_repo(&dir, Some(&config))?;
+    }
+
+    Ok(())
+}
+```
+
+### src/lib.rs (Configuration related parts)
+
+```rust
+use anyhow::{anyhow, Result};
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fs::{self};
+use std::io::Read;
+use std::io::{self, Write};
+use std::path::{Path, PathBuf};
+use std::process::{Command as SysCommand, Stdio};
+use tracing::debug;
+
+mod defaults;
+mod parallel;
+mod model_manager;
+
+use defaults::{BINARY_FILE_EXTENSIONS, TEXT_FILE_EXTENSIONS};
+use parallel::process_files_parallel;
+
+/// Convert a glob pattern to a regex pattern
+fn glob_to_regex(pattern: &str) -> String {
+    let mut regex = String::with_capacity(pattern.len() * 2);
+    regex.push('^'); // Match from the start of the path
+
+    let mut chars = pattern.chars().peekable();
+    while let Some(c) = chars.next() {
+        match c {
+            '*' => {
+                if chars.peek() == Some(&'*') {
+                    chars.next();
+                    regex.push_str(".*"); // Match anything with .*
+                } else {
+                    regex.push_str("[^/]*"); // Match any character except /
+                }
+            }
+            '?' => regex.push('.'), // Match any single character
+            '.' => regex.push_str("\\."), // Escape dots
+            '/' => regex.push_str("[/]"), // Forward slash
+            '[' => {
+                regex.push('[');
+                if let Some(&'^') = chars.peek() {
+                    chars.next();
+                    regex.push('^'); // Negated character class
+                }
+                // Parse character class, escape special characters as needed
+                while let Some(c) = chars.peek() {
+                    let c = *c;
+                    chars.next();
+                    match c {
+                        ']' => {
+                            regex.push(']');
+                            break;
+                        }
+                        '\\' => {
+                            regex.push_str(r"\\");
+                            if let Some(c) = chars.next() {
+                                regex.push(c);
+                            }
+                        }
+                        '-' => regex.push('-'),
+                        _ => regex.push(c),
+                    }
+                }
+            }
+            '{' => regex.push('('), // Start of alternation group
+            '}' => regex.push(')'), // End of alternation group
+            ',' => regex.push('|'), // Alternation separator
+            c => regex.push_str(&regex::escape(&c.to_string())), // Escape other special characters
+        }
+    }
+
+    regex.push('$'); // Match until the end of the path
+    regex
+}
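+
+// Illustrative translations produced by glob_to_regex above (derived from the
+// match arms, hedged rather than taken from separate documentation):
+//   "*.rs"        -> "^[^/]*\.rs$"         matches "main.rs" but not "src/main.rs"
+//   "src/**"      -> "^src[/].*$"          `**` crosses directory boundaries
+//   "*.{rs,toml}" -> "^[^/]*\.(rs|toml)$"  braces become a regex alternation group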
+
+#[derive(Debug, Serialize, Deserialize, Default)]
+pub struct IgnorePatterns {
+    #[serde(default)]
+    pub patterns: Vec<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PriorityRule {
+    pub pattern: String,
+    pub score: i32,
+}
+
+impl PriorityRule {
+    #[allow(dead_code)]
+    fn matches(&self, path: &str) -> bool {
+        if let Ok(re) = Regex::new(&self.pattern) {
+            re.is_match(path)
+        } else {
+            false
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct YekConfig {
+    #[serde(default)]
+    pub ignore_patterns: Vec<String>,
+    #[serde(default)]
+    pub priority_rules: Vec<PriorityRule>,
+    #[serde(default)]
+    pub binary_extensions: Vec<String>,
+    #[serde(default)]
+    pub max_size: Option<usize>,
+    #[serde(default)]
+    pub output_dir: Option<PathBuf>,
+    #[serde(default)]
+    pub stream: bool,
+    #[serde(default)]
+    pub token_mode: bool,
+    #[serde(default)]
+    pub tokenizer_model: Option<String>,
+    #[serde(default)]
+    pub max_files: Option<usize>,
+}
+
+impl Default for YekConfig {
+    fn default() -> Self {
+        Self {
+            stream: false,
+            output_dir: None,
+            priority_rules: vec![],
+            binary_extensions: vec![
+                "jpg".to_string(),
+                "jpeg".to_string(),
+                "png".to_string(),
+                "gif".to_string(),
+                "bin".to_string(),
+                "zip".to_string(),
+                "exe".to_string(),
+                "dll".to_string(),
+                "so".to_string(),
+                "dylib".to_string(),
+                "class".to_string(),
+                "jar".to_string(),
+                "pyc".to_string(),
+                "pyo".to_string(),
+                "pyd".to_string(),
+            ],
+            ignore_patterns: vec![],
+            token_mode: false,
+            tokenizer_model: None,
+            max_size: None,
+            max_files: None,
+        }
+    }
+}
+
+impl YekConfig {
+    pub fn merge(&mut self, other: &YekConfig) {
+        // Only override output_dir if present in other config
+        if other.output_dir.is_some() {
+            self.output_dir = other.output_dir.clone();
+        }
+        self.stream = other.stream;
+        self.token_mode = other.token_mode;
+        if other.max_size.is_some() {
+            self.max_size = other.max_size;
+        }
+        if other.max_files.is_some() {
+            self.max_files = other.max_files;
+        }
+        if other.tokenizer_model.is_some() {
+            self.tokenizer_model = other.tokenizer_model.clone();
+        }
+        // Merge other fields as needed, for example:
+        self.ignore_patterns.extend(other.ignore_patterns.clone());
+        self.priority_rules.extend(other.priority_rules.clone());
+        self.binary_extensions.extend(other.binary_extensions.clone());
+    }
+}
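+
+// Merge semantics, as implemented above (sketch): Option fields are replaced
+// only when `other` carries Some(...), Vec fields are appended so patterns
+// accumulate across configs, and the plain bools (`stream`, `token_mode`) are
+// copied unconditionally, so the most recently merged config wins for those.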
+
+/// Check if file is text by extension or scanning first chunk for null bytes.
+pub fn is_text_file(path: &Path, user_binary_extensions: &[String]) -> io::Result<bool> {
+    // Check user-provided binary extensions first, permitting no leading dot
+    if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
+        let ext_lower = ext.to_lowercase();
+        if user_binary_extensions
+            .iter()
+            .any(|e| e.trim_start_matches('.') == ext_lower)
+        {
+            return Ok(false);
+        }
+    }
+
+    // Check default binary extensions
+    if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
+        if BINARY_FILE_EXTENSIONS.contains(&ext.to_lowercase().as_str()) {
+            return Ok(false);
+        }
+    }
+
+    // If no extension or not in binary list, check content
+    let mut file = fs::File::open(path)?;
+    let mut buffer = [0; 512]; // Read a small chunk to check for null bytes
+    let n = file.read(&mut buffer)?;
+
+    // Check for null bytes which typically indicate binary content
+    Ok(!buffer[..n].contains(&0))
+}
+
+/// Determine the final priority of a file by taking the highest score
+/// among all matching priority rules.
+pub fn get_file_priority(path: &str, rules: &[PriorityRule]) -> i32 {
+    rules
+        .iter()
+        .filter_map(|rule| {
+            let re = match Regex::new(&rule.pattern) {
+                Ok(re) => re,
+                Err(_) => return None,
+            };
+            if re.is_match(path) {
+                Some(rule.score)
+            } else {
+                None
+            }
+        })
+        .max()
+        .unwrap_or(0)
+}
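+
+// Worked example (illustrative): with rules
+//   [{ pattern: "^src/", score: 90 }, { pattern: "\.rs$", score: 50 }]
+// the path "src/lib.rs" matches both rules and takes the maximum score, 90;
+// a path that matches no rule falls back to 0.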
+
+/// Get the commit time of the most recent change to each file.
+/// Returns a map from file path (relative to the repo root) → last commit Unix time.
+/// If Git or the .git folder is missing, returns None instead of erroring.
+pub fn get_recent_commit_times(repo_path: &Path) -> Option<HashMap<String, u64>> {
+    // Confirm there's a .git folder
+    if !repo_path.join(".git").exists() {
+        debug!("No .git directory found, skipping Git-based prioritization");
+        return None;
+    }
+
+    // Get all files and their timestamps using bash with proper UTF-8 handling
+    let output = SysCommand::new("bash")
+        .args([
+            "-c",
+            "export LC_ALL=en_US.UTF-8; export LANG=en_US.UTF-8; \
+             git -c core.quotepath=false log \
+             --format=%ct \
+             --name-only \
+             --no-merges \
+             --no-renames \
+             -- . | tr -cd '[:print:]\n' | iconv -f utf-8 -t utf-8 -c",
+        ])
+        .current_dir(repo_path)
+        .stderr(Stdio::null())
+        .output()
+        .ok()?;
+
+    if !output.status.success() {
+        debug!("Git log command failed, skipping Git-based prioritization");
+        return None;
+    }
+
+    let mut git_times = HashMap::new();
+    let mut current_timestamp = 0_u64;
+
+    // Process output line by line with UTF-8 conversion
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    for line in stdout.lines() {
+        if line.is_empty() {
+            continue;
+        }
+
+        if let Ok(ts) = line.parse::<u64>() {
+            current_timestamp = ts;
+            debug!("Found timestamp: {}", ts);
+        } else {
+            debug!("Found file: {} with timestamp {}", line, current_timestamp);
+            git_times.insert(line.to_string(), current_timestamp);
+        }
+    }
+
+    if git_times.is_empty() {
+        debug!("No valid timestamps found, skipping Git-based prioritization");
+        None
+    } else {
+        Some(git_times)
+    }
+}
+
+/// Validate the config object, returning any errors found
+#[derive(Debug)]
+pub struct ConfigError {
+    pub message: String,
+}
+
+pub fn validate_config(config: &YekConfig) -> Vec<ConfigError> {
+    let mut errors = Vec::new();
+
+    // Validate priority rules
+    for rule in &config.priority_rules {
+        if rule.score < 0 || rule.score > 1000 {
+            errors.push(ConfigError {
+                message: format!("Priority score {} must be between 0 and 1000", rule.score),
+            });
+        }
+        if rule.pattern.is_empty() {
+            errors.push(ConfigError {
+                message: "Priority rule must have a pattern".to_string(),
+            });
+        }
+        // Validate regex pattern
+        if let Err(e) = Regex::new(&rule.pattern) {
+            errors.push(ConfigError {
+                message: format!("Invalid regex pattern '{}': {}", rule.pattern, e),
+            });
+        }
+    }
+
+    // Validate ignore patterns
+    for pattern in &config.ignore_patterns {
+        let regex_pattern = if pattern.starts_with('^') || pattern.ends_with('$') {
+            // Already a regex pattern
+            pattern.to_string()
+        } else {
+            // Convert glob pattern to regex
+            glob_to_regex(pattern)
+        };
+
+        if let Err(e) = Regex::new(&regex_pattern) {
+            errors.push(ConfigError {
+                message: format!("Invalid pattern '{}': {}", pattern, e),
+            });
+        }
+    }
+
+    // Validate max_size
+    if let Some(size) = config.max_size {
+        if size == 0 {
+            errors.push(ConfigError {
+                message: "Max size cannot be 0".to_string(),
+            });
+        }
+    }
+
+    // Validate output directory if specified
+    if let Some(dir) = &config.output_dir {
+        let path = Path::new(dir);
+        if path.exists() && !path.is_dir() {
+            errors.push(ConfigError {
+                message: format!(
+                    "Output path '{}' exists but is not a directory",
+                    dir.display()
+                ),
+            });
+        }
+
+        if let Err(e) = std::fs::create_dir_all(path) {
+            errors.push(ConfigError {
+                message: format!("Cannot create output directory '{}': {}", dir.display(), e),
+            });
+        }
+    }
+
+    errors
+}
+
+pub const DEFAULT_CHUNK_SIZE: usize = 10 * 1024 * 1024; // 10MB in README
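+
+// An assumed usage sketch for `validate_config` (not part of the feature
+// code): every violation is collected rather than failing fast.
+#[cfg(test)]
+mod validation_example {
+    use super::*;
+
+    #[test]
+    fn collects_all_errors() {
+        let config = YekConfig {
+            max_size: Some(0), // invalid: zero size
+            priority_rules: vec![PriorityRule {
+                pattern: String::new(), // invalid: empty pattern
+                score: 2000,            // invalid: above 1000
+            }],
+            ..Default::default()
+        };
+        // One error each for the score, the empty pattern, and the zero size.
+        assert_eq!(validate_config(&config).len(), 3);
+    }
+}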
+
+/// The main function that the tests call.
+pub fn serialize_repo(repo_path: &Path, cfg: Option<&YekConfig>) -> Result<()> {
+    let config = cfg.cloned().unwrap_or_default();
+    let is_stream = config.stream;
+
+    // Process files in parallel
+    let processed_files = process_files_parallel(repo_path, &config)?;
+
+    // Convert to the format expected by write_chunks
+    let entries: Vec<(String, String, i32)> = processed_files
+        .into_iter()
+        .map(|f| (f.rel_path, f.content, f.priority))
+        .collect();
+
+    Ok(())
+}
+
+/// Find yek.toml by walking up directories
+pub fn find_config_file(start_path: &Path) -> Option<PathBuf> {
+    let mut current = if start_path.is_absolute() {
+        debug!(
+            "Starting config search from absolute path: {}",
+            start_path.display()
+        );
+        start_path.to_path_buf()
+    } else {
+        let path = std::env::current_dir().ok()?.join(start_path);
+        debug!(
+            "Starting config search from relative path: {}",
+            path.display()
+        );
+        path
+    };
+
+    loop {
+        let config_path = current.join("yek.toml");
+        if config_path.exists() {
+            return Some(config_path);
+        }
+        if !current.pop() {
+            break;
+        }
+    }
+
+    None
+}
+
+/// Merge config from a TOML file if present
+pub fn load_config_file(path: &Path) -> Option<YekConfig> {
+    debug!("Attempting to load config from: {}", path.display());
+    let content = match std::fs::read_to_string(path) {
+        Ok(c) => c,
+        Err(e) => {
+            eprintln!("Failed to read config file: {}", e);
+            return None;
+        }
+    };
+
+    match toml::from_str::<YekConfig>(&content) {
+        Ok(cfg) => {
+            debug!("Successfully loaded config");
+            // Validate the config
+            let errors = validate_config(&cfg);
+            if !errors.is_empty() {
+                eprintln!("Invalid configuration in {}:", path.display());
+                for error in errors {
+                    eprintln!("  {}", error.message);
+                }
+                None
+            } else {
+                Some(cfg)
+            }
+        }
+        Err(e) => {
+            eprintln!("Failed to parse config file: {}", e);
+            None
+        }
+    }
+}
+
+/// Returns a relative, normalized path string (forward slashes on all platforms).
+pub fn normalize_path(path: &Path, base: &Path) -> String {
+    // Handle current directory specially
+    if path.to_str() == Some(".") {
+        return ".".to_string();
+    }
+
+    // Resolve both paths to their canonical forms to handle symlinks
+    let canonical_path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
+    let canonical_base = base.canonicalize().unwrap_or_else(|_| base.to_path_buf());
+
+    // Attempt to strip the base directory from the file path
+    match canonical_path.strip_prefix(&canonical_base) {
+        Ok(rel_path) => {
+            // Convert to forward slashes and return as relative path
+            rel_path.to_string_lossy().replace('\\', "/")
+        }
+        Err(_) => {
+            // Return the absolute path without adding an extra leading slash
+            canonical_path.to_string_lossy().replace('\\', "/")
+        }
+    }
+}
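+
+// An assumed usage sketch for `normalize_path` (not part of the feature
+// code; assumes neither path exists, so canonicalization falls back to the
+// literal paths): the base prefix is stripped and separators become forward
+// slashes.
+#[cfg(test)]
+mod normalize_example {
+    use super::*;
+
+    #[test]
+    fn strips_base_and_uses_forward_slashes() {
+        let base = Path::new("/repo");
+        let file = Path::new("/repo/src/lib.rs");
+        assert_eq!(normalize_path(file, base), "src/lib.rs");
+    }
+}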
+
+/// Parse size (for bytes or tokens) with optional K/KB, M/MB, G/GB suffix if not in token mode.
+pub fn parse_size_input(input: &str, is_tokens: bool) -> Result<usize> {
+    let s = input.trim();
+    if is_tokens {
+        // If user typed "128K", interpret as 128000 tokens
+        if s.to_lowercase().ends_with('k') {
+            let val = s[..s.len() - 1]
+                .trim()
+                .parse::<usize>()
+                .map_err(|e| anyhow!("Invalid token size: {}", e))?;
+            return Ok(val * 1000);
+        }
+        Ok(s.parse::<usize>()?)
+    } else {
+        // Byte-based suffix
+        let s = s.to_uppercase();
+        if s.ends_with("KB") {
+            let val = s[..s.len() - 2].trim().parse::<usize>()?;
+            return Ok(val * 1024);
+        } else if s.ends_with("MB") {
+            let val = s[..s.len() - 2].trim().parse::<usize>()?;
+            return Ok(val * 1024 * 1024);
+        } else if s.ends_with("GB") {
+            let val = s[..s.len() - 2].trim().parse::<usize>()?;
+            return Ok(val * 1024 * 1024 * 1024);
+        } else if let Ok(val) = s.parse::<usize>() {
+            return Ok(val);
+        }
+        Err(anyhow!("Invalid size string: {}", input))
+    }
+}
+
+pub fn is_ignored(path: &str, patterns: &[String]) -> bool {
+    patterns.iter().any(|p| {
+        let pattern = if p.starts_with('^') || p.ends_with('$') {
+            p.to_string()
+        } else {
+            glob_to_regex(p)
+        };
+        if let Ok(re) = Regex::new(&pattern) {
+            re.is_match(path)
+        } else {
+            false
+        }
+    })
+}
+
+pub fn normalize_path_with_root(path: &Path, base: &Path) -> String {
+    let path = match path.strip_prefix(base) {
+        Ok(p) => p,
+        Err(_) => path,
+    };
+    normalize_path(path, base)
+}
+```
+
+tests/test_validation.rs (Tests for config validation)
+
+```rust
+mod integration_common;
+use assert_cmd::Command;
+use integration_common::{create_file, setup_temp_repo};
+use predicates::prelude::*;
+
+#[test]
+fn fails_on_invalid_regex_in_config() {
+    let repo = setup_temp_repo();
+    create_file(
+        repo.path(),
+        "yek.toml",
+        r#"
+        [[priority_rules]]
+        pattern = "(?!.*)" # Invalid regex
+        score = 10
+        "#
+        .as_bytes(),
+    );
+
+    let mut cmd = Command::cargo_bin("yek").unwrap();
+    cmd.current_dir(repo.path())
+        .assert()
+        .failure()
+        .stderr(predicate::str::contains("Invalid regex pattern"));
+}
+
+#[test]
+fn fails_on_invalid_priority_score() {
+    let repo = setup_temp_repo();
+    create_file(
+        repo.path(),
+        "yek.toml",
+        r#"
+        [[priority_rules]]
+        pattern = ".*"
+        score = 1001 # Invalid score
+        "#
+        .as_bytes(),
+    );
+
+    let mut cmd = Command::cargo_bin("yek").unwrap();
+    cmd.current_dir(repo.path())
+        .assert()
+        .failure()
+        .stderr(predicate::str::contains("between 0 and 1000"));
+}
+```
+
+## Explanation of Changes:
+
+Cargo.toml:
+- Added clap for command-line argument parsing.
+- Added serde and toml for config file handling.
+
+src/main.rs:
+- Uses clap to define and parse command-line arguments.
+- Loads configuration from yek.toml if present, using find_config_file and load_config_file.
+- Merges command-line arguments into the config, with CLI arguments taking precedence.
+- Validates the final configuration using validate_config.
+- Handles the --debug flag to enable debug logging.
+- Calls serialize_repo with the merged and validated configuration.
+
+src/lib.rs:
+- Defines the YekConfig struct to hold configuration values.
+- Implements Default for YekConfig to provide sensible defaults.
+- Adds a merge method to YekConfig for combining two configurations.
+- Implements find_config_file to locate yek.toml by walking up the directory tree.
+- Implements load_config_file to read and parse yek.toml.
+- Adds validate_config to check for invalid configuration values.
+- Adds parse_size_input to parse size strings with optional units (KB, MB, GB in byte mode; K in token mode).
+- Adds glob_to_regex to convert basic glob patterns to regular expressions.
+
+tests/test_validation.rs:
+- Adds tests to verify that invalid configuration values are detected and result in appropriate error messages.
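+
+As a quick illustration of the size parsing (an assumed sketch, not part of the listing above, though it calls only the `parse_size_input` function defined there):
+
+```rust
+use yek::parse_size_input;
+
+fn main() -> anyhow::Result<()> {
+    // Byte mode: KB/MB/GB suffixes are powers of 1024.
+    assert_eq!(parse_size_input("128KB", false)?, 128 * 1024);
+    assert_eq!(parse_size_input("10MB", false)?, 10 * 1024 * 1024);
+    // Token mode: a trailing "K" means thousands of tokens.
+    assert_eq!(parse_size_input("128K", true)?, 128_000);
+    Ok(())
+}
+```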
+Testing Strategy: + +The test_validation.rs file includes tests to cover various validation scenarios: +Invalid regex in yek.toml +Invalid priority score in yek.toml +Existing integration tests will need to be adjusted to work with the new argument parsing and configuration loading. +This PR lays the groundwork for more advanced features by providing a solid foundation for configuration and input handling. It also improves error handling and makes the codebase more maintainable. + +You will be asked to do "Fixes" but we are really just adding a new feature. + +Add ALL the files needed to implement the feature. diff --git a/prompts/remove-chunk.txt b/prompts/remove-chunk.txt new file mode 100644 index 0000000..c531ef2 --- /dev/null +++ b/prompts/remove-chunk.txt @@ -0,0 +1,10 @@ +We should remove the notion of "chunks" from the codebase. yek will always output a single file. + +Implement the changes needed to remove the chunk code. +Update docs and tests to reflect the new behavior. + +We have done some work but some references are still in the codebase that mention "chunks". + +You will be asked to do "Fixes" but we are really just adding a new feature. + +Add ALL the files needed to implement the feature. diff --git a/scripts/ai-loop.sh b/scripts/ai-loop.sh index 21bd7ae..0cd06d5 100755 --- a/scripts/ai-loop.sh +++ b/scripts/ai-loop.sh @@ -13,19 +13,18 @@ BRANCH=${BRANCH:-tokenizer} success=0 for i in $(seq 1 $attempts); do - echo "=== Attempt $i/$attempts ===" | tee -a attempts.txt + echo "=== Attempt $i/$attempts ===" - # Run tests and print output to console, capture to temp file - cargo test -- --test accepts_model_from_config --test-threads=1 2>&1 | tee test_output.tmp - test_exit_code=${PIPESTATUS[0]} + # Run tests and print output to console + test_output=$(cargo test -- --test accepts_model_from_config --test-threads=1 2>&1) + test_exit_code=$? + echo "$test_output" >test_output.tmp - # Trim output to only include failures section - test_output=$(sed -n '/failures:/,/failures:/p' test_output.tmp | sed '1d; $d') - rm test_output.tmp - - # Append trimmed test results to attempts.txt - echo "$test_output" >>attempts.txt - echo -e "\n\n" >>attempts.txt + # Append last attempt to test output if it exists + if [ -f last_attempt.txt ]; then + echo "## Last time we tried this but we failed:" + cat last_attempt.txt >>test_output.tmp + fi # Exit loop if tests passed if [ $test_exit_code -eq 0 ]; then @@ -33,49 +32,33 @@ for i in $(seq 1 $attempts); do if [ "$GITHUB_ACTIONS" ]; then echo "ATTEMPTS=$i" >>$GITHUB_ENV fi + echo "Tests passed!!" break fi - # Create temp file for askds input and clean it up - askds_input=$(tail -c 250000 attempts.txt | sed 's/===/---/g') - echo "$askds_input" >askds_input.tmp - - # Run askds and stream output to both console and variable - echo "--- askds Output ---" | tee -a attempts.txt - - askds_output=$( - askds \ - --hide-ui \ - --fix \ - --auto-apply \ - --serialize="yek --max-size=100KB | cat" \ - --test-file-pattern='tests/*.rs' \ - --source-file-pattern='src/**/*.rs' \ - --system-prompt=./prompts/fix-tests.txt \ - --run="cat askds_input.tmp" 2>&1 | tee /dev/stderr - ) - askds_exit_code=$? - - if [ $askds_exit_code -ne 0 ]; then - echo "askds failed with exit code $askds_exit_code" >>attempts.txt - echo "askds failed. Guessing we ran out of context window. 
Trimming attempts.txt to last 30KB" - tail -c 30000 attempts.txt >attempts.tmp - mv attempts.tmp attempts.txt - continue - fi + # Run askds to fix the tests + askds \ + --hide-ui \ + --fix \ + --auto-apply \ + --serialize="yek --max-size=100KB | cat" \ + --test-file-pattern='tests/*.rs' \ + --source-file-pattern='src/**/*.rs' \ + --system-prompt=./prompts/fix-tests.txt \ + --run="cat test_output.tmp" || true - echo "$askds_output" >>attempts.txt - echo "--- End askds Output ---" >>attempts.txt - # Cleanup temp files - rm askds_input.tmp + rm last_attempt.txt + cargo fmt + cargo clippy --fix --allow-dirty # Commit changes if any if ! git diff --quiet; then git add . git commit -m "fix attempt $i (${BRANCH})" - echo "Applied fixes for ${BRANCH} tests" | tee -a attempts.txt + echo "Applied fixes for ${BRANCH} tests" else - echo "No changes in attempt $i" | tee -a attempts.txt + echo "No changes in attempt $i" + cp test_output.tmp last_attempt.txt continue fi done diff --git a/src/lib.rs b/src/lib.rs index a0bed14..b66768c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,68 +2,29 @@ use anyhow::{anyhow, Result}; use regex::Regex; use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use std::fs::{self}; -use std::io::Read; -use std::io::{self, Write}; +use std::io::{self, Read}; use std::path::{Path, PathBuf}; use std::process::{Command as SysCommand, Stdio}; +use std::{fs, str}; use tracing::debug; mod defaults; +pub mod model_manager; mod parallel; -use defaults::{BINARY_FILE_EXTENSIONS, TEXT_FILE_EXTENSIONS}; +use defaults::BINARY_FILE_EXTENSIONS; use parallel::process_files_parallel; /// Convert a glob pattern to a regex pattern fn glob_to_regex(pattern: &str) -> String { - let mut regex = String::with_capacity(pattern.len() * 2); - let mut chars = pattern.chars().peekable(); - - while let Some(c) = chars.next() { - match c { - '*' => { - if chars.peek() == Some(&'*') { - chars.next(); // consume second * - regex.push_str(".*"); - } else { - regex.push_str("[^/]*"); - } - } - '?' => regex.push('.'), - '.' 
=> regex.push_str("\\."), - '/' => regex.push('/'), - '[' => { - regex.push('['); - for c in chars.by_ref() { - if c == ']' { - regex.push(']'); - break; - } - regex.push(c); - } - } - '{' => { - regex.push('('); - for c in chars.by_ref() { - if c == '}' { - regex.push(')'); - break; - } else if c == ',' { - regex.push('|'); - } else { - regex.push(c); - } - } - } - c if c.is_alphanumeric() || c == '_' || c == '-' => regex.push(c), - c => { - regex.push('\\'); - regex.push(c); - } - } - } - regex + pattern + .replace(".", "\\.") + .replace("*", "[^/]*") // Match any character except / + .replace("?", "[^/]") // Match any single character except / + .replace("[!", "[^") + .replace("{", "(") + .replace("}", ")") + .replace(",", "|") } #[derive(Debug, Serialize, Deserialize, Default)] @@ -89,7 +50,7 @@ impl PriorityRule { } } -#[derive(Debug, Clone, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct YekConfig { #[serde(default)] pub ignore_patterns: Vec, @@ -105,32 +66,92 @@ pub struct YekConfig { pub stream: bool, #[serde(default)] pub token_mode: bool, + #[serde(default)] + pub tokenizer_model: Option, + #[serde(default)] + pub max_files: Option, +} + +impl Default for YekConfig { + fn default() -> Self { + Self { + stream: false, + output_dir: None, + priority_rules: vec![], + binary_extensions: vec![ + "jpg".to_string(), + "jpeg".to_string(), + "png".to_string(), + "gif".to_string(), + "bin".to_string(), + "zip".to_string(), + "exe".to_string(), + "dll".to_string(), + "so".to_string(), + "dylib".to_string(), + "class".to_string(), + "jar".to_string(), + "pyc".to_string(), + "pyo".to_string(), + "pyd".to_string(), + ], + ignore_patterns: vec![], + token_mode: false, + tokenizer_model: None, + max_size: None, + max_files: None, + } + } +} + +impl YekConfig { + pub fn merge(&mut self, other: &YekConfig) { + // Only override output_dir if present in other config + if other.output_dir.is_some() { + self.output_dir = other.output_dir.clone(); + } + self.stream = other.stream; + self.token_mode = other.token_mode; + if other.max_size.is_some() { + self.max_size = other.max_size; + } + if other.max_files.is_some() { + self.max_files = other.max_files; + } + if other.tokenizer_model.is_some() { + self.tokenizer_model = other.tokenizer_model.clone(); + } + // Merge other fields as needed, for example: + self.ignore_patterns.extend(other.ignore_patterns.clone()); + self.priority_rules.extend(other.priority_rules.clone()); + self.binary_extensions + .extend(other.binary_extensions.clone()); + } } /// Check if file is text by extension or scanning first chunk for null bytes. 
pub fn is_text_file(path: &Path, user_binary_extensions: &[String]) -> io::Result { - // First check extension - fast path + // Check user-provided binary extensions first, permitting no leading dot if let Some(ext) = path.extension().and_then(|s| s.to_str()) { - let ext_lc = ext.to_lowercase(); - // If it's in the known text extensions list, it's definitely text - if TEXT_FILE_EXTENSIONS.contains(&ext_lc.as_str()) { - return Ok(true); - } - // If it's in the binary extensions list (built-in or user-defined), it's definitely binary - if BINARY_FILE_EXTENSIONS.contains(&ext_lc.as_str()) - || user_binary_extensions - .iter() - .any(|e| e.trim_start_matches('.') == ext_lc) + let ext_lower = ext.to_lowercase(); + if user_binary_extensions + .iter() + .any(|e| e.trim_start_matches('.') == ext_lower) { return Ok(false); } - // Unknown extension - treat as binary - return Ok(false); } - // No extension - scan content + // Check default binary extensions + if let Some(ext) = path.extension().and_then(|s| s.to_str()) { + if BINARY_FILE_EXTENSIONS.contains(&ext.to_lowercase().as_str()) { + return Ok(false); + } + } + + // If no extension or not in binary list, check content let mut file = fs::File::open(path)?; - let mut buffer = [0; 512]; + let mut buffer = [0; 512]; // Read a small chunk to check for null bytes let n = file.read(&mut buffer)?; // Check for null bytes which typically indicate binary content @@ -219,10 +240,15 @@ pub fn get_recent_commit_times(repo_path: &Path) -> Option> /// Validate the config object, returning any errors found #[derive(Debug)] pub struct ConfigError { - pub field: String, pub message: String, } +impl std::fmt::Display for ConfigError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message) + } +} + pub fn validate_config(config: &YekConfig) -> Vec { let mut errors = Vec::new(); @@ -230,20 +256,17 @@ pub fn validate_config(config: &YekConfig) -> Vec { for rule in &config.priority_rules { if rule.score < 0 || rule.score > 1000 { errors.push(ConfigError { - field: "priority_rules".to_string(), message: format!("Priority score {} must be between 0 and 1000", rule.score), }); } if rule.pattern.is_empty() { errors.push(ConfigError { - field: "priority_rules".to_string(), message: "Priority rule must have a pattern".to_string(), }); } // Validate regex pattern if let Err(e) = Regex::new(&rule.pattern) { errors.push(ConfigError { - field: "priority_rules".to_string(), message: format!("Invalid regex pattern '{}': {}", rule.pattern, e), }); } @@ -261,7 +284,6 @@ pub fn validate_config(config: &YekConfig) -> Vec { if let Err(e) = Regex::new(®ex_pattern) { errors.push(ConfigError { - field: "ignore_patterns".to_string(), message: format!("Invalid pattern '{}': {}", pattern, e), }); } @@ -271,7 +293,6 @@ pub fn validate_config(config: &YekConfig) -> Vec { if let Some(size) = config.max_size { if size == 0 { errors.push(ConfigError { - field: "max_size".to_string(), message: "Max size cannot be 0".to_string(), }); } @@ -282,7 +303,6 @@ pub fn validate_config(config: &YekConfig) -> Vec { let path = Path::new(dir); if path.exists() && !path.is_dir() { errors.push(ConfigError { - field: "output_dir".to_string(), message: format!( "Output path '{}' exists but is not a directory", dir.display() @@ -292,7 +312,6 @@ pub fn validate_config(config: &YekConfig) -> Vec { if let Err(e) = std::fs::create_dir_all(path) { errors.push(ConfigError { - field: "output_dir".to_string(), message: format!("Cannot create output directory '{}': {}", 
dir.display(), e), }); } @@ -301,227 +320,55 @@ pub fn validate_config(config: &YekConfig) -> Vec { errors } -pub const DEFAULT_CHUNK_SIZE: usize = 10 * 1024 * 1024; // 10MB in README - -/// Write a single chunk either to stdout or file -fn write_single_chunk( - content: &str, - index: usize, - part_index: Option, - out_dir: &Path, - is_stream: bool, -) -> io::Result<()> { - if is_stream { - let mut stdout = io::stdout(); - write!(stdout, "{}", content)?; - stdout.flush()?; - } else { - // Always use chunk index in filename - let mut file_name = format!("chunk-{}", index); - if let Some(part_i) = part_index { - file_name = format!("chunk-{}-part-{}", index, part_i); - } - let path = out_dir.join(format!("{}.txt", file_name)); - fs::create_dir_all(path.parent().unwrap())?; - fs::write(path, content.as_bytes())?; - } - Ok(()) -} - -/// The aggregator that writes chunk-* files or streams to stdout. -fn write_chunks( - entries: &[(String, String, i32)], - config: &YekConfig, - is_stream: bool, -) -> Result<()> { - debug!("Starting write_chunks with {} entries", entries.len()); - let chunk_size = config.max_size.unwrap_or(DEFAULT_CHUNK_SIZE); - let token_mode = config.token_mode; - let mut total_chunks = 0; - - // Sort entries by priority (ascending) - let mut sorted_entries = entries.to_vec(); - sorted_entries.sort_by_key(|(_, _, prio)| *prio); - - // For chunk files: - let out_dir = if !is_stream { - config - .output_dir - .as_ref() - .expect("output_dir is None but streaming is false") - } else { - // dummy - Path::new(".") - }; - debug!("Output directory: {:?}", out_dir); - - let mut chunk_idx = 0; - let mut buffer = String::new(); - let mut used_size = 0_usize; - - // Process each file - for (rel_path, content, _prio) in sorted_entries { - debug!("Processing file: {}", rel_path); - if token_mode { - // Count tokens - let tokens: Vec<&str> = content.split_whitespace().collect(); - let file_tokens = tokens.len(); - debug!("Token mode: {} tokens in file", file_tokens); - - // If file exceeds chunk_size by itself, do forced splits - if file_tokens >= chunk_size { - debug!("File exceeds chunk size, splitting into multiple chunks"); - // Flush current buffer first - if !buffer.is_empty() { - debug!("Flushing buffer before large file"); - write_single_chunk(&buffer, chunk_idx, None, out_dir, is_stream)?; - total_chunks += 1; - buffer.clear(); - used_size = 0; - chunk_idx += 1; - } - - // Split large file into chunks - let mut start = 0; - let mut part = 0; - while start < file_tokens { - let end = (start + chunk_size).min(file_tokens); - let chunk_tokens = &tokens[start..end]; - let chunk_str = format!( - "chunk {}\n>>>> {}:part {}\n{}\n", - chunk_idx, - rel_path, - part, - chunk_tokens.join(" ") - ); - debug!("Writing large file part {}", part); - write_single_chunk(&chunk_str, chunk_idx, Some(part), out_dir, is_stream)?; - total_chunks += 1; - chunk_idx += 1; - part += 1; - start = end; - } - } else { - // Small enough to fit in one chunk - let overhead = 10 + rel_path.len(); - let add_size = file_tokens + overhead; - - if used_size + add_size > chunk_size && !buffer.is_empty() { - debug!("Flushing buffer due to size limit"); - write_single_chunk(&buffer, chunk_idx, None, out_dir, is_stream)?; - total_chunks += 1; - buffer.clear(); - used_size = 0; - chunk_idx += 1; - } - - debug!("Adding file to buffer"); - buffer.push_str(&format!("chunk {}\n>>>> {}\n", chunk_idx, rel_path)); - buffer.push_str(&content); - buffer.push('\n'); - used_size += add_size; - } - } else { - // Byte mode - let 
file_len = content.len(); - debug!("Byte mode: {} bytes in file", file_len); - - // If file exceeds chunk_size by itself, do forced splits - if file_len >= chunk_size { - debug!("File exceeds chunk size, splitting into multiple chunks"); - // Flush current buffer first - if !buffer.is_empty() { - debug!("Flushing buffer before large file"); - write_single_chunk(&buffer, chunk_idx, None, out_dir, is_stream)?; - total_chunks += 1; - buffer.clear(); - used_size = 0; - chunk_idx += 1; - } - - // Split large file into chunks - let mut start = 0; - let mut part = 0; - while start < file_len { - let end = (start + chunk_size).min(file_len); - let chunk_data = &content.as_bytes()[start..end]; - let chunk_str = format!( - "chunk {}\n>>>> {}:part {}\n{}\n", - chunk_idx, - rel_path, - part, - String::from_utf8_lossy(chunk_data) - ); - debug!("Writing large file part {}", part); - write_single_chunk(&chunk_str, chunk_idx, Some(part), out_dir, is_stream)?; - total_chunks += 1; - chunk_idx += 1; - part += 1; - start = end; - } - } else { - // Small enough to fit in one chunk - let overhead = 10 + rel_path.len(); - let add_size = file_len + overhead; - - if used_size + add_size > chunk_size && !buffer.is_empty() { - debug!("Flushing buffer due to size limit"); - write_single_chunk(&buffer, chunk_idx, None, out_dir, is_stream)?; - total_chunks += 1; - buffer.clear(); - used_size = 0; - chunk_idx += 1; - } - - debug!("Adding file to buffer"); - buffer.push_str(&format!("chunk {}\n>>>> {}\n", chunk_idx, rel_path)); - buffer.push_str(&content); - buffer.push('\n'); - used_size += add_size; - } - } +/// Returns a relative, normalized path string (forward slashes on all platforms). +pub fn normalize_path(path: &Path) -> String { + if path.to_str() == Some(".") { + return ".".to_string(); } - // Flush final chunk if not empty - if !buffer.is_empty() { - debug!("Flushing final buffer"); - write_single_chunk(&buffer, chunk_idx, None, out_dir, is_stream)?; - total_chunks += 1; - } + let path_str = path.to_string_lossy().replace('\\', "/"); + let stripped = path_str.strip_prefix("./").unwrap_or(&path_str); + let trimmed = stripped.trim_start_matches('/').trim_end_matches('/'); - // Print final output message - if !is_stream { - match total_chunks { - 0 => {} // No files written (edge case) - 1 => { - let path = out_dir.join("chunk-0.txt"); - println!("Wrote: {}", path.display()); - } - _ => { - println!("Wrote {} chunks in {}", total_chunks, out_dir.display()); - } - } + if trimmed.is_empty() { + ".".to_string() + } else { + trimmed.to_string() } +} - debug!("Finished write_chunks"); - Ok(()) +pub fn normalize_path_with_root(path: &Path, base: &Path) -> String { + let path = match path.strip_prefix(base) { + Ok(p) => p, + Err(_) => path, + }; + normalize_path(path) } /// The main function that the tests call. 
pub fn serialize_repo(repo_path: &Path, cfg: Option<&YekConfig>) -> Result<()> { let config = cfg.cloned().unwrap_or_default(); + let _is_stream = config.stream; // Process files in parallel - let processed_files = process_files_parallel(repo_path, &config)?; + let mut output = String::new(); + process_files_parallel(repo_path, &config, &mut output)?; - // Convert to the format expected by write_chunks - let entries: Vec<(String, String, i32)> = processed_files - .into_iter() - .map(|f| (f.rel_path, f.content, f.priority)) - .collect(); - - // Write chunks - write_chunks(&entries, &config, config.stream)?; + if config.stream { + // Write to stdout + print!("{}", output); + } else { + // Determine output directory + let output_dir = config + .output_dir + .as_deref() + .unwrap_or_else(|| Path::new(".")); + // Create directory if it doesn't exist + fs::create_dir_all(output_dir)?; + // Write to output.txt in the output directory + let output_path = output_dir.join("output.txt"); + fs::write(output_path, output)?; + } Ok(()) } @@ -557,7 +404,8 @@ pub fn find_config_file(start_path: &Path) -> Option { } /// Merge config from a TOML file if present -pub fn load_config_file(path: &Path) -> Option { +pub fn load_config_file(path: impl AsRef) -> Option { + let path = path.as_ref(); debug!("Attempting to load config from: {}", path.display()); let content = match std::fs::read_to_string(path) { Ok(c) => c, @@ -575,7 +423,7 @@ pub fn load_config_file(path: &Path) -> Option { if !errors.is_empty() { eprintln!("Invalid configuration in {}:", path.display()); for error in errors { - eprintln!(" {}: {}", error.field, error.message); + eprintln!(" {}", error.message); } None } else { @@ -589,84 +437,6 @@ pub fn load_config_file(path: &Path) -> Option { } } -/// Rank-based approach to compute how "recent" each file is (0=oldest, 1=newest). -/// Then scale it to a user-defined or default max boost. -#[allow(dead_code)] -fn compute_recentness_boost( - commit_times: &HashMap, - max_boost: i32, -) -> HashMap { - if commit_times.is_empty() { - return HashMap::new(); - } - - // Sort by ascending commit time => first is oldest - let mut sorted: Vec<(&String, &u64)> = commit_times.iter().collect(); - sorted.sort_by_key(|(_, t)| **t); - - // oldest file => rank=0, newest => rank=1 - let last_index = sorted.len().saturating_sub(1) as f64; - if last_index < 1.0 { - // If there's only one file, or zero, no boosts make sense - let mut single = HashMap::new(); - for file in commit_times.keys() { - single.insert(file.clone(), 0); - } - return single; - } - - let mut result = HashMap::new(); - for (i, (path, _time)) in sorted.iter().enumerate() { - let rank = i as f64 / last_index; // 0.0..1.0 (older files get lower rank) - let boost = (rank * max_boost as f64).round() as i32; // Newer files get higher boost - result.insert((*path).clone(), boost); - } - result -} - -#[cfg(target_family = "windows")] -#[allow(dead_code)] -fn is_effectively_absolute(path: &std::path::Path) -> bool { - if path.is_absolute() { - return true; - } - // Also treat a leading slash/backslash as absolute - match path.to_str() { - Some(s) => s.starts_with('/') || s.starts_with('\\'), - None => false, - } -} - -#[cfg(not(target_family = "windows"))] -#[allow(dead_code)] -fn is_effectively_absolute(path: &std::path::Path) -> bool { - path.is_absolute() -} - -/// Returns a relative, normalized path string (forward slashes on all platforms). 
-pub fn normalize_path(path: &Path, base: &Path) -> String { - // Handle current directory specially - if path.to_str() == Some(".") { - return ".".to_string(); - } - - // Resolve both paths to their canonical forms to handle symlinks - let canonical_path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); - let canonical_base = base.canonicalize().unwrap_or_else(|_| base.to_path_buf()); - - // Attempt to strip the base directory from the file path - match canonical_path.strip_prefix(&canonical_base) { - Ok(rel_path) => { - // Convert to forward slashes and return as relative path - rel_path.to_string_lossy().replace('\\', "/") - } - Err(_) => { - // Return the absolute path without adding an extra leading slash - canonical_path.to_string_lossy().replace('\\', "/") - } - } -} - /// Parse size (for bytes or tokens) with optional K/KB, M/MB, G/GB suffix if not in token mode. pub fn parse_size_input(input: &str, is_tokens: bool) -> Result { let s = input.trim(); @@ -698,3 +468,28 @@ pub fn parse_size_input(input: &str, is_tokens: bool) -> Result { Err(anyhow!("Invalid size string: {}", input)) } } + +pub fn is_ignored(path: &str, patterns: &[String]) -> bool { + patterns.iter().any(|p| { + let pattern = if p.starts_with('^') || p.ends_with('$') { + // If it's already a regex pattern, use it as is + p.to_string() + } else { + // Convert glob pattern to regex, handling special cases + let mut pattern = glob_to_regex(p); + if !pattern.starts_with('^') { + pattern = format!("^{}", pattern); + } + if !pattern.ends_with('$') { + pattern = format!("{}$", pattern); + } + pattern + }; + + if let Ok(re) = Regex::new(&pattern) { + re.is_match(path) + } else { + false + } + }) +} diff --git a/src/main.rs b/src/main.rs index fdc0582..be0c01c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,159 +1,165 @@ -use anyhow::Result; -use clap::{Arg, Command}; -use std::io::{stdout, IsTerminal}; +use anyhow::{anyhow, Result}; +use clap::Parser; +use std::io::IsTerminal; use std::path::{Path, PathBuf}; -use tracing::{subscriber, Level}; +use tracing_subscriber::filter::EnvFilter; use tracing_subscriber::fmt; -use yek::{find_config_file, load_config_file, parse_size_input, serialize_repo, YekConfig}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::Registry; +use yek::{ + find_config_file, load_config_file, parse_size_input, serialize_repo, validate_config, + YekConfig, +}; + +fn glob_to_regex(pattern: &str) -> String { + pattern + .replace(".", "\\.") + .replace("*", ".*") + .replace("?", ".") + .replace("[!", "[^") + .replace("{", "(") + .replace("}", ")") + .replace(",", "|") +} -fn main() -> Result<()> { - let matches = Command::new("yek") - .about("Repository content chunker and serializer for LLM consumption") - .arg( - Arg::new("directories") - .help("Directories to process") - .num_args(0..) - .default_value("."), - ) - .arg( - Arg::new("max-size") - .long("max-size") - .help("Maximum size per chunk (e.g. 
'10MB', '128KB', '1GB'")
-                .default_value("10MB"),
-        )
-        .arg(
-            Arg::new("tokens")
-                .long("tokens")
-                .help("Count size in tokens instead of bytes")
-                .action(clap::ArgAction::SetTrue),
-        )
-        .arg(
-            Arg::new("debug")
-                .long("debug")
-                .help("Enable debug output")
-                .action(clap::ArgAction::SetTrue),
-        )
-        .arg(
-            Arg::new("output-dir")
-                .long("output-dir")
-                .help("Output directory for chunks"),
-        )
-        .get_matches();
-
-    // Setup logging
-    let level = if matches.get_flag("debug") {
-        Level::DEBUG
-    } else {
-        Level::INFO
-    };
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+#[command(after_help = "See https://github.com/mohsen-w-elsayed/yek for detailed documentation.")]
+struct Args {
+    /// Directories to process
+    #[arg()]
+    directories: Vec<PathBuf>,
+
+    /// Path to custom config file
+    #[arg(short, long, value_name = "FILE")]
+    config: Option<PathBuf>,
+
+    /// Maximum output size (supports K/KB/M/MB suffixes)
+    #[arg(long, value_name = "SIZE")]
+    max_size: Option<String>,
+
+    /// Include only files matching pattern
+    #[arg(long, value_name = "PATTERN")]
+    include: Option<String>,
+
+    #[arg(long, value_name = "MODEL")]
+    #[arg(num_args = 0..=1, require_equals = true, default_missing_value = "openai")]
+    #[arg(value_parser = ["openai", "claude", "mistral", "mixtral", "deepseek", "llama", "codellama"])]
+    #[arg(
+        help = "Count size in tokens using specified model family (default: openai)\nSUPPORTED MODELS: openai, claude, mistral, mixtral, deepseek, llama, codellama"
+    )]
+    tokens: Option<String>,
+
+    /// Output directory for generated files
+    #[arg(long, short, value_name = "DIR")]
+    output_dir: Option<PathBuf>,
+
+    /// Enable debug output
+    #[arg(long)]
+    debug: bool,
+}
 
-    // Configure logging output
-    if let Ok(debug_output) = std::env::var("YEK_DEBUG_OUTPUT") {
-        let file = std::fs::File::create(debug_output)?;
-        let subscriber = fmt()
-            .with_max_level(level)
-            .with_writer(file)
-            .without_time()
-            .with_target(false)
-            .with_ansi(false)
-            .finish();
-        subscriber::set_global_default(subscriber)?;
+fn init_logging() {
+    let debug_output = std::env::var("YEK_DEBUG_OUTPUT").ok();
+    let filter = EnvFilter::from_default_env().add_directive("yek=debug".parse().unwrap());
+
+    if let Some(path) = debug_output {
+        let file = std::fs::File::create(path).expect("Failed to create debug log file");
+        let file_subscriber = fmt::layer().with_writer(file).with_ansi(false);
+
+        Registry::default()
+            .with(filter)
+            .with(file_subscriber)
+            .init();
     } else {
         fmt()
-            .with_max_level(level)
-            .without_time()
-            .with_target(false)
-            .with_ansi(true)
+            .with_env_filter(filter)
+            .with_ansi(std::io::stdout().is_terminal())
             .init();
     }
+}
 
-    // Gather directories
-    let directories: Vec<&str> = matches
-        .get_many::<String>("directories")
-        .unwrap()
-        .map(|s| s.as_str())
-        .collect();
-
-    // Gather config
-    let mut yek_config = YekConfig::default();
+fn main() -> Result<()> {
+    let args = Args::parse();
 
-    // Possibly parse max size
-    if let Some(size_str) = matches.get_one::<String>("max-size") {
-        yek_config.max_size = Some(parse_size_input(size_str, matches.get_flag("tokens"))?);
+    if args.debug {
+        init_logging();
     }
-    yek_config.token_mode = matches.get_flag("tokens");
-
-    // Are we writing chunk files or streaming?
-    // If --output-dir is given, we always write to that directory.
-    // Otherwise, if stdout is not a TTY, we stream. If it *is* a TTY, create a temp dir.
- if let Some(out_dir) = matches.get_one::("output-dir") { - yek_config.output_dir = Some(PathBuf::from(out_dir)); - } else { - let stdout_is_tty = stdout().is_terminal(); - if stdout_is_tty { - // Write chunk files to a temporary directory - let tmp = std::env::temp_dir().join("yek-serialize"); - yek_config.output_dir = Some(tmp); + // Load config file + let mut config = if let Some(config_path) = args.config { + if let Some(cfg) = load_config_file(&config_path) { + cfg } else { - // Stream to stdout - yek_config.stream = true; + return Err(anyhow!( + "Failed to load config from: {}", + config_path.display() + )); } - } - - // Run serialize_repo for each directory - for dir in directories { - let path = Path::new(dir); - - // Make a per-directory clone of base config - let mut config_for_this_dir = yek_config.clone(); - - // Look up any local yek.toml - if let Some(toml_path) = find_config_file(path) { - if let Some(file_cfg) = load_config_file(&toml_path) { - // Merge file_cfg into config_for_this_dir - merge_config(&mut config_for_this_dir, &file_cfg); - } + } else if let Some(config_path) = find_config_file(std::env::current_dir()?.as_path()) { + if let Some(cfg) = load_config_file(&config_path) { + cfg + } else { + return Err(anyhow!( + "Failed to load config from: {}", + config_path.display() + )); } + } else { + YekConfig::default() + }; - serialize_repo(path, Some(&config_for_this_dir))?; + // Apply command-line arguments + if let Some(model) = args.tokens { + config.token_mode = true; + config.tokenizer_model = Some(model); } - Ok(()) -} - -/// Merge the fields of `other` into `dest`. -fn merge_config(dest: &mut YekConfig, other: &YekConfig) { - // Merge ignore patterns - dest.ignore_patterns - .extend_from_slice(&other.ignore_patterns); - // Merge priority rules - dest.priority_rules.extend_from_slice(&other.priority_rules); - // Merge binary extensions - dest.binary_extensions - .extend_from_slice(&other.binary_extensions); - - // Respect whichever max_size is more specific - if dest.max_size.is_none() && other.max_size.is_some() { - dest.max_size = other.max_size; + if let Some(size_str) = args.max_size { + config.max_size = Some(parse_size_input(&size_str, config.token_mode)?); } - // token_mode: if `other` is true, set it - if other.token_mode { - dest.token_mode = true; + if let Some(output_dir) = &args.output_dir { + config.output_dir = Some(output_dir.clone()); } - // If `other.output_dir` is set, we can choose to override or not. 
Usually the CLI
-    // argument has higher precedence, so we only override if `dest.output_dir` is None:
-    if dest.output_dir.is_none() && other.output_dir.is_some() {
-        dest.output_dir = other.output_dir.clone();
+    // Add include pattern if specified
+    if let Some(include) = args.include {
+        // Convert glob pattern to regex for matching
+        let include_pattern = glob_to_regex(&include);
+        // Create a negative lookahead pattern that matches everything except the include pattern
+        config.ignore_patterns = vec![format!("^(?!{}$).*$", include_pattern)];
     }
 
-    // Similarly for stream
-    if !dest.stream && other.stream {
-        // only override if CLI didn't force an output dir
-        if dest.output_dir.is_none() {
-            dest.stream = true;
+    // Determine if we should stream based on output_dir and stdout
+    config.stream = if config.output_dir.is_some() {
+        // Output directory is specified, don't stream
+        false
+    } else {
+        // No output directory, stream if we're piping
+        !std::io::stdout().is_terminal()
+    };
+
+    // Validate the merged configuration
+    let validation_errors = validate_config(&config);
+    if !validation_errors.is_empty() {
+        for error in validation_errors {
+            eprintln!("Configuration error: {}", error);
         }
+        return Err(anyhow!("Invalid configuration"));
     }
+
+    // Use specified directories or default to current directory
+    let directories = if args.directories.is_empty() {
+        vec![PathBuf::from(".")]
+    } else {
+        args.directories
+    };
+
+    // Run serialization for each directory
+    for dir in directories {
+        serialize_repo(&dir, Some(&config))?;
+    }
+
+    Ok(())
 }
diff --git a/src/model_manager.rs b/src/model_manager.rs
new file mode 100644
index 0000000..0424963
--- /dev/null
+++ b/src/model_manager.rs
@@ -0,0 +1,106 @@
+use anyhow::{anyhow, Result};
+use std::collections::HashMap;
+use std::sync::Mutex;
+use tiktoken_rs::o200k_base;
+use tokenizers::Tokenizer;
+
+lazy_static::lazy_static! {
+    static ref MODEL_CACHE: Mutex<HashMap<String, Tokenizer>> = Mutex::new(HashMap::new());
+}
+
+pub const SUPPORTED_MODEL_FAMILIES: &[&str] = &[
+    "openai",    // All OpenAI models
+    "claude",    // All Anthropic Claude models
+    "mistral",   // All Mistral models
+    "mixtral",   // All Mistral models
+    "llama",     // All Meta Llama models
+    "deepseek",  // DeepSeek models
+    "codellama", // All Meta Llama models
+];
+
+fn load_tokenizer(path: &str) -> Result<Tokenizer> {
+    Tokenizer::from_file(path).map_err(|e| anyhow!("Failed to load tokenizer from {}: {}", path, e))
+}
+
+pub fn tokenize(text: &str, model: &str) -> Result<Vec<u32>> {
+    // Handle OpenAI models separately as they use tiktoken
+    if model == "openai" {
+        let encoding =
+            o200k_base().map_err(|e| anyhow!("Failed to get o200k_base encoding: {}", e))?;
+        let tokens = encoding.encode_with_special_tokens(text);
+        return Ok(tokens.into_iter().map(|t| t as u32).collect());
+    }
+
+    let mut cache = MODEL_CACHE
+        .lock()
+        .map_err(|e| anyhow!("Failed to lock cache: {}", e))?;
+
+    // Load tokenizer if not in cache
+    if !cache.contains_key(model) {
+        let tokenizer = match model {
+            "claude" => load_tokenizer("models/claude-3-opus/tokenizer.json")?,
+            "mistral" | "mixtral" => load_tokenizer("models/mistral/tokenizer.json")?,
+            "deepseek" => load_tokenizer("models/deepseek/tokenizer.json")?,
+            m if m.starts_with("llama") || m.starts_with("codellama") => {
+                load_tokenizer("models/llama/tokenizer.json")?
+ } + _ => return Err(anyhow!("Unsupported model: {}", model)), + }; + cache.insert(model.to_string(), tokenizer); + } + + // Get tokenizer and encode text + let tokenizer = cache.get(model).unwrap(); + let encoded = tokenizer + .encode(text, true) + .map_err(|e| anyhow!("Failed to encode text: {}", e))?; + Ok(encoded.get_ids().to_vec()) +} + +pub fn decode_tokens(tokens: &[u32], model: &str) -> Result { + // Handle OpenAI models separately + if model == "openai" { + let encoding = + o200k_base().map_err(|e| anyhow!("Failed to get o200k_base encoding: {}", e))?; + let tokens: Vec = tokens.iter().map(|&t| t as usize).collect(); + return encoding + .decode(tokens) + .map_err(|e| anyhow!("Failed to decode tokens: {}", e)); + } + + // Get tokenizer from cache and decode + let cache = MODEL_CACHE + .lock() + .map_err(|e| anyhow!("Failed to lock cache: {}", e))?; + let tokenizer = cache + .get(model) + .ok_or_else(|| anyhow!("Model not found: {}", model))?; + let result = tokenizer + .decode(tokens, true) + .map_err(|e| anyhow!("Failed to decode tokens: {}", e))?; + Ok(result) +} + +pub fn count_tokens(text: &str, model: &str) -> Result { + tokenize(text, model) + .map(|tokens| tokens.len()) + .map_err(|e: anyhow::Error| anyhow!("Token counting failed: {}", e)) +} + +pub struct ModelManager { + model: String, +} + +impl ModelManager { + pub fn new(model: Option<&str>) -> Result { + let model = model.unwrap_or("openai").to_string(); + if !SUPPORTED_MODEL_FAMILIES.contains(&model.as_str()) { + return Err(anyhow!("Unsupported model family: {}", model)); + } + Ok(Self { model }) + } + + pub fn count_tokens(&self, text: &str) -> Result { + count_tokens(text, &self.model) + } +} diff --git a/src/parallel.rs b/src/parallel.rs index fddbcaa..d16ee97 100644 --- a/src/parallel.rs +++ b/src/parallel.rs @@ -1,159 +1,153 @@ -use crate::{get_file_priority, glob_to_regex, is_text_file, normalize_path, Result, YekConfig}; -use crossbeam::channel::bounded; +use crate::{ + get_recent_commit_times, is_ignored, is_text_file, + model_manager::{self}, + normalize_path_with_root, Result, YekConfig, +}; +use anyhow::anyhow; use ignore::{WalkBuilder, WalkState}; -use regex::Regex; use std::{ - collections::HashSet, path::Path, sync::{Arc, Mutex}, - thread, }; use tracing::debug; -#[derive(Debug)] -pub struct ProcessedFile { - pub priority: i32, - pub file_index: usize, - pub rel_path: String, - pub content: String, -} - -pub fn process_files_parallel(base_dir: &Path, config: &YekConfig) -> Result> { - let (tx, rx) = bounded(1024); - let num_threads = num_cpus::get().min(16); // Cap at 16 threads - - let config = Arc::new(config.clone()); - let base_dir = Arc::new(base_dir.to_path_buf()); - let processed_files = Arc::new(Mutex::new(HashSet::new())); - - // Spawn worker threads - let mut handles = Vec::new(); - for _ in 0..num_threads { - let tx = tx.clone(); - let config = Arc::clone(&config); - let base_dir = Arc::clone(&base_dir); - let processed_files = Arc::clone(&processed_files); - - let handle = thread::spawn(move || -> Result<()> { - let file_index = Arc::new(Mutex::new(0_usize)); - - // Configure walker for this thread - let mut builder = WalkBuilder::new(&*base_dir); - builder - .hidden(true) - .git_ignore(true) - .follow_links(false) - .standard_filters(true) - .require_git(false) - .threads(1); // Single thread per walker - - let walker = builder.build_parallel(); - - let file_index = Arc::clone(&file_index); - walker.run(|| { - let tx = tx.clone(); - let config = Arc::clone(&config); - let base_dir = 
Arc::clone(&base_dir); - let file_index = Arc::clone(&file_index); - let processed_files = Arc::clone(&processed_files); - - Box::new(move |entry| { - let entry = match entry { - Ok(e) => e, - Err(_) => return WalkState::Continue, - }; - - if !entry.file_type().is_some_and(|ft| ft.is_file()) { - return WalkState::Continue; - } - - let path = entry.path().to_path_buf(); - let rel_path = normalize_path(&path, &base_dir); - - // Check if file has already been processed - { - let mut processed = processed_files.lock().unwrap(); - if !processed.insert(rel_path.clone()) { - // File was already processed - return WalkState::Continue; - } - } - - // Skip files matching ignore patterns from yek.toml - if config.ignore_patterns.iter().any(|p| { - let pattern = if p.starts_with('^') || p.ends_with('$') { - p.to_string() - } else { - glob_to_regex(p) - }; - if let Ok(re) = Regex::new(&pattern) { - re.is_match(&rel_path) - } else { - false - } - }) { - debug!("Skipping {} - matched ignore pattern", rel_path); - return WalkState::Continue; - } - - // Skip binary files unless explicitly allowed - match is_text_file(&path, &config.binary_extensions) { - Ok(is_text) if !is_text => { - debug!("Skipping binary file: {}", rel_path); - return WalkState::Continue; - } - Err(_) => return WalkState::Continue, - _ => {} - } - - // Read and process file - if let Ok(content) = std::fs::read_to_string(&path) { - let priority = get_file_priority(&rel_path, &config.priority_rules); - - let mut index = file_index.lock().unwrap(); - let processed = ProcessedFile { - priority, - file_index: *index, - rel_path, - content, - }; - - if tx.send(processed).is_ok() { - *index += 1; - } - } - - WalkState::Continue - }) - }); - - Ok(()) - }); - handles.push(handle); +pub fn process_files_parallel( + base_dir: &Path, + config: &YekConfig, + output_content: &mut String, +) -> Result<()> { + // Validate token mode configuration first + if config.token_mode { + let model = config.tokenizer_model.as_deref().unwrap_or("openai"); + if !model_manager::SUPPORTED_MODEL_FAMILIES.contains(&model) { + return Err(anyhow!( + "Unsupported model '{}'. 
Supported models: {}", + model, + model_manager::SUPPORTED_MODEL_FAMILIES.join(", ") + )); + } + debug!("Token mode enabled with model: {}", model); } - // Drop original sender - drop(tx); - - // Collect results - let mut results = Vec::new(); - while let Ok(processed) = rx.recv() { - results.push(processed); - } + // Get Git commit times for prioritization + let git_times = get_recent_commit_times(base_dir); + debug!("Git commit times: {:?}", git_times); + + // Create thread-safe shared output content + let shared_output = Arc::new(Mutex::new(String::new())); + + // Process files in parallel + let walker = WalkBuilder::new(base_dir).build_parallel(); + walker.run(|| { + let base_dir = base_dir.to_path_buf(); + let config = config.clone(); + let shared_output = Arc::clone(&shared_output); + let git_times = git_times.clone(); + Box::new(move |entry| { + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + debug!("Error walking directory: {}", e); + return WalkState::Continue; + } + }; + + if !entry.file_type().is_some_and(|ft| ft.is_file()) { + return WalkState::Continue; + } + + let path = entry.path(); + let rel_path = normalize_path_with_root(path, &base_dir); + + // Calculate priority based on git history + let priority = if let Some(times) = &git_times { + times.get(&rel_path).copied().unwrap_or(0) + } else { + 0 + }; + + // Skip if path is ignored + if is_ignored(&rel_path, &config.ignore_patterns) { + debug!("Skipping ignored path: {}", rel_path); + return WalkState::Continue; + } + + // Skip if not a text file + if !is_text_file(path, &config.binary_extensions).unwrap_or_else(|e| { + debug!("Error checking if file is text: {}", e); + false + }) { + debug!("Skipping binary file: {}", rel_path); + return WalkState::Continue; + } + + // Process file based on priority + let mut output = shared_output.lock().unwrap(); + let file_content = match process_file(&rel_path, &base_dir, &config) { + Ok(content) => content, + Err(e) => { + debug!("Error processing file {}: {}", rel_path, e); + return WalkState::Continue; + } + }; + + // Insert content based on priority + if priority > 0 { + // Higher priority files go at the start + output.insert_str(0, &file_content); + } else { + // Lower priority files go at the end + output.push_str(&file_content); + } + + WalkState::Continue + }) + }); - // Wait for all threads - for handle in handles { - handle.join().unwrap()?; + // Copy shared output back to output_content + if let Ok(shared) = shared_output.lock() { + output_content.push_str(&shared); + } else { + return Err(anyhow!("Failed to acquire final lock for output")); } - debug!("Processed {} files in parallel", results.len()); + Ok(()) +} - // Sort by priority (ascending) and file index (ascending) - results.sort_by(|a, b| { - a.priority - .cmp(&b.priority) - .then_with(|| a.file_index.cmp(&b.file_index)) - }); +fn process_file(rel_path: &str, base_dir: &Path, config: &YekConfig) -> Result { + let path = base_dir.join(rel_path); + let content = std::fs::read_to_string(&path)?; + let model = config.tokenizer_model.as_deref().unwrap_or("openai"); + let entry_header = format!(">>>> {}\n", rel_path); + let content_with_newline = format!("{}\n", content); + + // Check size limits before processing + if let Some(max_size) = config.max_size { + if config.token_mode { + // TOKEN-MODE size check + let header_tokens = model_manager::tokenize(&entry_header, model)?; + let content_tokens = model_manager::tokenize(&content_with_newline, model)?; + let total_tokens = header_tokens.len() + 
content_tokens.len(); + + if total_tokens > max_size { + debug!( + "File {} exceeds token limit: {} > {}", + rel_path, total_tokens, max_size + ); + return Err(anyhow!("File too large")); + } + } else { + // BYTE-MODE size check + let total_bytes = entry_header.len() + content_with_newline.len(); + if total_bytes > max_size { + debug!( + "File {} exceeds byte limit: {} > {}", + rel_path, total_bytes, max_size + ); + return Err(anyhow!("File too large")); + } + } + } - Ok(results) + Ok(format!("{}{}", entry_header, content_with_newline)) } diff --git a/tests/git_priority_tests.rs b/tests/git_priority_tests.rs index 853a4f0..762f4da 100644 --- a/tests/git_priority_tests.rs +++ b/tests/git_priority_tests.rs @@ -2,38 +2,52 @@ mod integration_common; use integration_common::{create_file, setup_temp_repo}; use std::fs; +use std::process::Command; use tempfile::TempDir; -use walkdir::WalkDir; -use yek::{serialize_repo, PriorityRule, YekConfig}; +use yek::{get_recent_commit_times, serialize_repo, PriorityRule, YekConfig}; #[test] fn test_git_priority_basic() -> Result<(), Box> { + let tempdir = tempfile::tempdir().unwrap(); + let output_dir = tempdir.path().to_path_buf(); + let config = YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + }; let repo = setup_temp_repo(); let repo_path = repo.path(); - let output_dir = repo_path.join("test_output"); fs::create_dir_all(&output_dir)?; // Create test files and commit them create_file(repo_path, "src/main.rs", b"fn main() {}"); create_file(repo_path, "docs/README.md", b"# Documentation"); + // Verify Git commit times + let git_times = get_recent_commit_times(repo_path).expect("Failed to get Git commit times"); + assert!( + git_times.contains_key("src/main.rs"), + "src/main.rs should have Git commit time" + ); + assert!( + git_times.contains_key("docs/README.md"), + "docs/README.md should have Git commit time" + ); + // Run serialization - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir.clone()); serialize_repo(repo_path, Some(&config))?; // Verify output - assert!(output_dir.exists(), "Output directory should exist"); - let mut found_files = 0; - for entry in WalkDir::new(&output_dir) { - let entry = entry?; - if entry.file_type().is_file() { - found_files += 1; - } - } + let output_file = output_dir.join("output.txt"); + assert!(output_file.exists(), "Output file should exist"); + let content = fs::read_to_string(output_file)?; + assert!( - found_files > 0, - "Should have created at least one output file" + content.contains("src/main.rs"), + "Should have included src/main.rs" + ); + assert!( + content.contains("docs/README.md"), + "Should have included docs/README.md" ); Ok(()) @@ -56,9 +70,22 @@ fn test_git_priority_stream() -> Result<(), Box> { b"# Documentation\nThis is a test.", ); + // Verify Git commit times + let git_times = get_recent_commit_times(repo_path).expect("Failed to get Git commit times"); + assert!( + git_times.contains_key("src/main.rs"), + "src/main.rs should have Git commit time" + ); + assert!( + git_times.contains_key("docs/README.md"), + "docs/README.md should have Git commit time" + ); + // Run serialization in stream mode - let mut config = YekConfig::default(); - config.stream = true; + let config = YekConfig { + stream: true, + ..Default::default() + }; serialize_repo(repo_path, Some(&config))?; Ok(()) @@ -83,6 +110,17 @@ fn test_git_priority_with_config() -> Result<(), Box> { b"# Documentation\nThis is a test.", ); + // Verify Git commit times + let git_times = 
get_recent_commit_times(repo_path).expect("Failed to get Git commit times"); + assert!( + git_times.contains_key("src/main.rs"), + "src/main.rs should have Git commit time" + ); + assert!( + git_times.contains_key("docs/README.md"), + "docs/README.md should have Git commit time" + ); + // Run serialization with custom config let config = YekConfig { priority_rules: vec![ @@ -101,17 +139,17 @@ fn test_git_priority_with_config() -> Result<(), Box> { serialize_repo(repo_path, Some(&config))?; // Verify output - assert!(output_dir.exists(), "Output directory should exist"); - let mut found_files = 0; - for entry in WalkDir::new(&output_dir) { - let entry = entry?; - if entry.file_type().is_file() { - found_files += 1; - } - } + let output_file = output_dir.join("output.txt"); + assert!(output_file.exists(), "Output file should exist"); + let content = fs::read_to_string(output_file)?; + + assert!( + content.contains("src/main.rs"), + "Should have included src/main.rs" + ); assert!( - found_files > 0, - "Should have created at least one output file" + content.contains("docs/README.md"), + "Should have included docs/README.md" ); Ok(()) @@ -119,13 +157,51 @@ fn test_git_priority_with_config() -> Result<(), Box> { #[test] fn test_git_priority_empty_repo() -> Result<(), Box> { - let repo = setup_temp_repo(); + let repo = TempDir::new()?; let repo_path = repo.path(); let output_dir = repo_path.join("test_output"); fs::create_dir_all(&output_dir)?; - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir); + // Initialize empty git repo without any commits + Command::new("git") + .args(["init", "--quiet", repo_path.to_str().unwrap()]) + .status() + .expect("Failed to run git init"); + + // Configure git user info + Command::new("git") + .args([ + "-C", + repo_path.to_str().unwrap(), + "config", + "user.name", + "test-user", + ]) + .status() + .expect("Failed to set git user.name"); + + Command::new("git") + .args([ + "-C", + repo_path.to_str().unwrap(), + "config", + "user.email", + "test@example.com", + ]) + .status() + .expect("Failed to set git user.email"); + + // Verify Git commit times + let git_times = get_recent_commit_times(repo_path); + assert!( + git_times.is_none(), + "Empty repo should return None for Git commit times" + ); + + let config = YekConfig { + output_dir: Some(output_dir), + ..Default::default() + }; serialize_repo(repo_path, Some(&config))?; Ok(()) } @@ -142,24 +218,47 @@ fn test_git_priority_no_git() -> Result<(), Box> { b"This is a test file without git.", ); - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir); + // Verify Git commit times + let git_times = get_recent_commit_times(temp.path()); + assert!(git_times.is_none(), "No Git repo should return None"); + + let config = YekConfig { + output_dir: Some(output_dir), + ..Default::default() + }; serialize_repo(temp.path(), Some(&config))?; Ok(()) } #[test] fn test_git_priority_binary_files() -> Result<(), Box> { + let temp_dir = TempDir::new()?; let repo = setup_temp_repo(); - let repo_path = repo.path(); - let output_dir = repo_path.join("test_output"); - fs::create_dir_all(&output_dir)?; - create_file(repo_path, "binary.bin", b"\x00\x01\x02\x03"); - create_file(repo_path, "text.txt", b"This is a text file."); + // Create test files + create_file(repo.path(), "binary.bin", b"\x00\x01\x02\x03"); + create_file(repo.path(), "image.jpg", b"\xFF\xD8\xFF\xE0"); + create_file(repo.path(), "README.md", b"# Test\n\nThis is a test."); + + // Run yek with output directory + let 
output_dir = temp_dir.path().join("output"); + let config = YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + }; + + serialize_repo(repo.path(), Some(&config))?; + + // Verify binary file is not present in output.txt + let output_file = output_dir.join("output.txt"); + let content = fs::read_to_string(output_file)?; + + // Check that binary files are not present in the output + assert!(!content.contains("binary.bin")); + assert!(!content.contains("image.jpg")); + + // Check that text files are present + assert!(content.contains("README.md")); - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir); - serialize_repo(repo_path, Some(&config))?; Ok(()) } diff --git a/tests/integration_common.rs b/tests/integration_common.rs index aaf7e10..f965c0e 100644 --- a/tests/integration_common.rs +++ b/tests/integration_common.rs @@ -1,5 +1,3 @@ -use regex::Regex; -#[allow(dead_code)] use std::fs; use std::path::Path; use std::process::Command; @@ -9,132 +7,62 @@ use tempfile::TempDir; /// Returns a `TempDir` whose path is a fresh Git repository directory. #[allow(dead_code)] pub fn setup_temp_repo() -> TempDir { - let repo_dir = TempDir::new().expect("Failed to create temp dir for repo"); - init_git_repo(repo_dir.path()); - repo_dir -} + let tempdir = TempDir::new().unwrap(); + let repo_path = tempdir.path(); -/// Initializes a new Git repository in the given directory. -/// Configures user.name and user.email so commits will succeed without prompts. -#[allow(dead_code)] -fn init_git_repo(path: &Path) { - let repo_str = path.to_str().expect("Non-UTF8 path to temp dir?"); - // 1. git init - let status_init = Command::new("git") - .args(["init", "--quiet", repo_str]) + // Initialize a new git repository + Command::new("git") + .arg("init") + .arg("--quiet") + .current_dir(repo_path) .status() - .expect("Failed to run git init"); - assert!(status_init.success(), "git init failed"); + .unwrap(); - // 2. Set a dummy user name and email so commits work - let status_config_name = Command::new("git") - .args(["-C", repo_str, "config", "user.name", "test-user"]) + // Configure user name and email + Command::new("git") + .arg("config") + .arg("user.name") + .arg("Test User") + .current_dir(repo_path) .status() - .expect("Failed to set git user.name"); - assert!(status_config_name.success(), "git config user.name failed"); + .unwrap(); - let status_config_email = Command::new("git") - .args(["-C", repo_str, "config", "user.email", "test@example.com"]) + Command::new("git") + .arg("config") + .arg("user.email") + .arg("test@example.com") + .current_dir(repo_path) .status() - .expect("Failed to set git user.email"); - assert!( - status_config_email.success(), - "git config user.email failed" - ); + .unwrap(); + + tempdir } /// Creates (or overwrites) a file at `[repo_dir]/[file_path]` with `content`. /// If `repo_dir` contains `.git`, automatically `git add` and `git commit`. /// This function handles large or binary data (including `\0`) without shell expansions. 
#[allow(dead_code)] -pub fn create_file(repo_dir: &Path, file_path: &str, content: &[u8]) { - // Ensure parent directories exist - let full_path = repo_dir.join(file_path); - if let Some(parent) = full_path.parent() { - fs::create_dir_all(parent) - .unwrap_or_else(|_| panic!("Failed to create parent directory for {}", file_path)); - } - - // Write file content in Rust, no shell expansion - fs::write(&full_path, content) - .unwrap_or_else(|_| panic!("Failed to write file content for {}", file_path)); - - // If there's a .git folder, stage & commit the file - if repo_dir.join(".git").exists() { - let repo_str = repo_dir.to_str().unwrap(); +pub fn create_file(repo_path: &Path, file_path: &str, content: &[u8]) { + let full_path = repo_path.join(file_path); + fs::create_dir_all(full_path.parent().unwrap()).unwrap(); + fs::write(full_path, content).unwrap(); - // First check if .gitignore exists and if this file should be ignored - let gitignore_path = repo_dir.join(".gitignore"); - if gitignore_path.exists() { - let gitignore_content = fs::read_to_string(&gitignore_path).unwrap(); - let should_ignore = gitignore_content.lines().any(|pattern| { - let pattern = pattern.trim(); - if pattern.is_empty() || pattern.starts_with('#') { - return false; - } - // Very basic glob matching - just checks if pattern is a prefix or suffix - if pattern.ends_with('/') { - file_path.starts_with(&pattern[..pattern.len() - 1]) - } else if pattern.starts_with('*') { - file_path.ends_with(&pattern[1..]) - } else if pattern.ends_with('*') { - file_path.starts_with(&pattern[..pattern.len() - 1]) - } else { - file_path == pattern || file_path.starts_with(pattern) - } - }); - if should_ignore { - return; // Don't commit ignored files - } - } - - // Also check if yek.toml exists and if this file should be ignored - let yek_toml_path = repo_dir.join("yek.toml"); - if yek_toml_path.exists() { - let yek_toml_content = fs::read_to_string(&yek_toml_path).unwrap(); - let should_ignore = yek_toml_content - .lines() - .filter(|line| line.contains("^")) // Only look at lines with regex patterns - .map(|line| { - line.trim() - .trim_matches(|c| c == '"' || c == '[' || c == ']') - }) - .filter(|line| !line.is_empty()) - .any(|pattern| { - if let Ok(re) = Regex::new(pattern) { - re.is_match(file_path) - } else { - false - } - }); - if should_ignore { - return; // Don't commit ignored files - } - } - - // Stage the file - let status_add = Command::new("git") - .args(["add", "-f", file_path]) - .current_dir(repo_dir) - .status() - .expect("git add failed"); - assert!(status_add.success(), "git add failed for {}", file_path); + // Stage the new file + Command::new("git") + .arg("add") + .arg(file_path) + .current_dir(repo_path) + .status() + .unwrap(); - // Commit with a descriptive message - let status_commit = Command::new("git") - .args([ - "-C", - repo_str, - "commit", - "--quiet", - "--allow-empty", // allow empty trees - "-m", - &format!("Add {}", file_path), - ]) - .status() - .expect("Failed to git commit file"); - assert!(status_commit.success(), "git commit failed for {file_path}"); - } + // Commit the file + Command::new("git") + .arg("commit") + .arg("-m") + .arg(format!("Add {}", file_path)) + .current_dir(repo_path) + .status() + .unwrap(); } /// Ensures an output directory exists and is empty. 
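A minimal usage sketch of the two helpers above, assuming only the `setup_temp_repo` and `create_file` signatures visible in this patch (the test itself is illustrative, not part of the suite):

#[test]
fn helper_usage_sketch() {
    // Fresh git repo in a temp dir, with user.name/user.email preconfigured.
    let repo = setup_temp_repo();

    // Writes the file (creating parent dirs), then stages and commits it.
    create_file(repo.path(), "src/main.rs", b"fn main() {}");

    assert!(repo.path().join("src/main.rs").exists());
}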
@@ -150,3 +78,22 @@ pub fn ensure_empty_output_dir(path: &Path) { } fs::create_dir_all(path).expect("Failed to create output directory"); } + +#[allow(dead_code)] +pub fn assert_output_file_contains(dir: &Path, patterns: &[&str]) { + let output_file_path = dir.join("output.txt"); + assert!( + output_file_path.exists(), + "Output file should exist: {}", + output_file_path.display() + ); + + let content = fs::read_to_string(output_file_path).expect("Failed to read output file"); + for pattern in patterns { + assert!( + content.contains(pattern), + "Output file should contain '{}'", + pattern + ); + } +} diff --git a/tests/test_basic.rs b/tests/test_basic.rs index 19f21cc..3762a6b 100644 --- a/tests/test_basic.rs +++ b/tests/test_basic.rs @@ -15,8 +15,10 @@ fn basic_file_output_test() { fs::write(&test_file, "test content").unwrap(); // Run serialization - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir.clone()); + let config = YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + }; serialize_repo(temp.path(), Some(&config)).unwrap(); // Verify output @@ -29,10 +31,9 @@ fn basic_file_output_test() { println!("File contents:\n{}", content); } - // Check that the first chunk exists and contains our test file - let chunk_0 = output_dir.join("chunk-0.txt"); - assert!(chunk_0.exists(), "Should write first chunk"); - let content = fs::read_to_string(chunk_0).unwrap(); + let outputfile = output_dir.join("output.txt"); + assert!(outputfile.exists(), "Should write output file"); + let content = fs::read_to_string(outputfile).unwrap(); assert!( content.contains("test content"), "Should contain file content" @@ -48,8 +49,10 @@ fn basic_pipe_test() { fs::write(&test_file, "test content").unwrap(); // Run serialization in stream mode - let mut config = YekConfig::default(); - config.stream = true; + let config = YekConfig { + stream: true, + ..Default::default() + }; serialize_repo(temp.path(), Some(&config)).unwrap(); // The output should be written to stdout, which we can't easily capture in a test diff --git a/tests/test_binary_files.rs b/tests/test_binary_files.rs index dffdd5f..528fff4 100644 --- a/tests/test_binary_files.rs +++ b/tests/test_binary_files.rs @@ -19,14 +19,16 @@ fn skips_known_binary_files() { fs::write(&text_file, "text content").unwrap(); // Run serialization - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir.clone()); + let config = YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + }; serialize_repo(temp.path(), Some(&config)).unwrap(); - // Check that the first chunk exists and contains only the text file - let chunk_0 = output_dir.join("chunk-0.txt"); - assert!(chunk_0.exists(), "Should write first chunk"); - let content = fs::read_to_string(chunk_0).unwrap(); + // Check that output.txt exists and contains only the text file + let output_file = output_dir.join("output.txt"); + assert!(output_file.exists(), "Should write output file"); + let content = fs::read_to_string(output_file).unwrap(); assert!( content.contains("text content"), "Should contain text file content" @@ -52,15 +54,17 @@ fn respects_custom_binary_extensions() { fs::write(&text_file, "text content").unwrap(); // Run serialization with custom config - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir.clone()); - config.binary_extensions = vec!["dat".to_string()]; + let config = YekConfig { + output_dir: Some(output_dir.clone()), + binary_extensions: vec!["dat".to_string()], + 
..Default::default() + }; serialize_repo(temp.path(), Some(&config)).unwrap(); - // Check that the first chunk exists and contains only the text file - let chunk_0 = output_dir.join("chunk-0.txt"); - assert!(chunk_0.exists(), "Should write first chunk"); - let content = fs::read_to_string(chunk_0).unwrap(); + // Check that output.txt exists and contains only the text file + let output_file = output_dir.join("output.txt"); + assert!(output_file.exists(), "Should write output file"); + let content = fs::read_to_string(output_file).unwrap(); assert!( content.contains("text content"), "Should contain text file content" diff --git a/tests/test_chunk_order.rs b/tests/test_chunk_order.rs deleted file mode 100644 index 50ba84e..0000000 --- a/tests/test_chunk_order.rs +++ /dev/null @@ -1,76 +0,0 @@ -mod integration_common; -use std::fs; -use tempfile::TempDir; -use yek::{serialize_repo, PriorityRule, YekConfig}; - -/// Tests that files are written in ascending priority order within a chunk. -/// Lower priority files should appear first, and higher priority files should appear last. -#[test] -fn chunk_order_reflects_priority() { - let temp = TempDir::new().unwrap(); - let output_dir = temp.path().join("yek-output"); - fs::create_dir_all(&output_dir).unwrap(); - - // Create test files - fs::write(temp.path().join("test.txt"), "low priority content").unwrap(); - fs::create_dir_all(temp.path().join("src")).unwrap(); - fs::write(temp.path().join("src/lib.rs"), "medium priority content").unwrap(); - fs::write(temp.path().join("README.md"), "high priority content").unwrap(); - - // Configure priority rules - let config = YekConfig { - priority_rules: vec![ - PriorityRule { - pattern: "^README.md$".to_string(), - score: 100, - }, - PriorityRule { - pattern: "^src/".to_string(), - score: 50, - }, - ], - output_dir: Some(output_dir.clone()), - ..Default::default() - }; - serialize_repo(temp.path(), Some(&config)).unwrap(); - - // Debug output for file contents - for entry in fs::read_dir(&output_dir).unwrap() { - let entry = entry.unwrap(); - let path = entry.path(); - println!(" {}", path.display()); - if path.is_file() { - println!("Contents of {}:", path.display()); - println!("{}", fs::read_to_string(&path).unwrap()); - } - } - - // All files should be in chunk-0.txt since it's the first chunk - let output_path = output_dir.join("chunk-0.txt"); - let content = fs::read_to_string(&output_path).unwrap(); - - // Check that files appear in ascending priority order (lower priority first) - let test_pos = content.find("test.txt").expect("test.txt not found"); - let lib_pos = content.find("src/lib.rs").expect("src/lib.rs not found"); - let readme_pos = content.find("README.md").expect("README.md not found"); - - // Verify ascending priority order (lower priority first) - assert!( - test_pos < lib_pos && lib_pos < readme_pos, - "Files should appear in ascending priority order (lower priority first)" - ); - - // Verify file contents - assert!( - content.contains("low priority content"), - "Should contain low priority content" - ); - assert!( - content.contains("medium priority content"), - "Should contain medium priority content" - ); - assert!( - content.contains("high priority content"), - "Should contain high priority content" - ); -} diff --git a/tests/test_e2e.rs b/tests/test_e2e.rs index d36abed..5447874 100644 --- a/tests/test_e2e.rs +++ b/tests/test_e2e.rs @@ -1,460 +1,210 @@ -mod integration_common; -use assert_cmd::Command; -use integration_common::{create_file, ensure_empty_output_dir, 
setup_temp_repo}; -use std::fs; +use anyhow::{anyhow, Result}; +use std::{fs, process::Command}; use tempfile::TempDir; -/// This test simulates an entire small repository with multiple directories -/// and checks the end-to-end behavior of running `yek` on it. -/// It verifies chunking, ignoring, and content ordering. -#[test] -fn e2e_small_repo_basic() { - let repo = setup_temp_repo(); - - // Create a few files - create_file(repo.path(), "README.md", "# This is a test repo".as_bytes()); - create_file(repo.path(), "src/main.rs", "fn main() {}".as_bytes()); - create_file(repo.path(), "src/lib.rs", "pub fn lib_fn() {}".as_bytes()); - create_file( - repo.path(), - "tests/test_it.rs", - "#[test] fn test_it() {}".as_bytes(), - ); - create_file(repo.path(), "ignore_me/binary.bin", b"fakebinary\x00\x7f"); - // Add .gitignore to ignore `ignore_me/` - create_file(repo.path(), ".gitignore", "ignore_me/\n".as_bytes()); - - // Run `yek` in non-stream mode - let output_dir = repo.path().join("yek-output"); - ensure_empty_output_dir(&output_dir); - - let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()) - .arg("--output-dir") - .arg(&output_dir) - .arg("--max-size=200KB") // Large enough to include all files in one chunk - .assert() - .success(); - - // Check that ignore_me/binary.bin is not in any output chunk - let mut found_lib_rs = false; - let mut found_bin = false; +struct TestSetup { + dir: TempDir, + config: Option, + git: bool, +} - for entry in fs::read_dir(&output_dir).expect("Output dir must exist") { - let path = entry.expect("entry").path(); - if path.extension().unwrap_or_default() != "txt" { - continue; - } - let content = fs::read_to_string(&path).expect("read chunk file"); - if content.contains("binary.bin") { - found_bin = true; - } - if content.contains("src/lib.rs") { - found_lib_rs = true; +impl TestSetup { + fn new() -> Self { + TestSetup { + dir: TempDir::new().unwrap(), + config: None, + git: false, } } - assert!(!found_bin, "binary.bin (ignored) must not appear in chunks"); - assert!(found_lib_rs, "lib.rs must appear in the serialized output"); -} - -/// This test ensures that large single files (bigger than the chunk limit) -/// do indeed get split into multiple chunks on Windows and Unix. 
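The deleted tests below assumed chunked output (`chunk-0.txt`, `:part` banners); the rewritten suite expects a single `output.txt` trimmed to `--max-size`. A rough sketch of a byte-budget cap, assuming trimming is applied to the serialized string (the function name and approach are illustrative, not yek's actual code):

// Cap serialized output at a byte budget, backing off to a char boundary
// so the truncated string stays valid UTF-8.
fn cap_to_bytes(serialized: &str, max_bytes: usize) -> &str {
    if serialized.len() <= max_bytes {
        return serialized;
    }
    let mut end = max_bytes;
    while !serialized.is_char_boundary(end) {
        end -= 1;
    }
    &serialized[..end]
}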
-#[test] -fn e2e_large_file_splitting() { - let repo = setup_temp_repo(); - // 1 MB worth of text - let big_content = "test content ".repeat(100_000); - create_file(repo.path(), "BIGFILE.txt", big_content.as_bytes()); + fn with_config(&mut self, config: &str) -> &mut Self { + self.config = Some(config.to_string()); + self + } - let output_dir = repo.path().join("yek-output"); - ensure_empty_output_dir(&output_dir); + fn with_git(&mut self) -> &mut Self { + self.git = true; + self.git_init(); + self + } - // We set chunk limit to ~100 KB so that 1 MB file is forced into ~10 parts - let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()) - .arg("--max-size=50KB") // Much smaller chunk size - .arg("--output-dir") - .arg(&output_dir) - .assert() - .success(); + fn create_file(&mut self, path: &str, contents: impl AsRef<[u8]>) -> Result<&mut Self> { + let full_path = self.dir.path().join(path); + fs::create_dir_all(full_path.parent().ok_or_else(|| anyhow!("Invalid path"))?)?; + fs::write(&full_path, contents)?; + if self.git { + self.git_add_and_commit(&format!("Add {}", path))?; + } + Ok(self) + } - // Verify multiple chunk files - let mut chunk_count = 0; - println!("Output directory: {:?}", output_dir); - for entry in fs::read_dir(&output_dir).unwrap() { - let path = entry.unwrap().path(); - if path.extension().unwrap_or_default() == "txt" { - chunk_count += 1; - let content = fs::read_to_string(&path).expect("read chunk"); - // Only print first 100 chars of content - println!( - "Chunk {}: {} ...", - chunk_count, - &content.chars().take(100).collect::() - ); - assert!( - content.contains("BIGFILE.txt:part"), - "Each chunk should show the same file name banner with part index" - ); + fn create_binary_file(&mut self, path: &str, size: usize) -> Result<&mut Self> { + let full_path = self.dir.path().join(path); + let content = vec![0u8; size]; + fs::write(&full_path, content)?; + if self.git { + self.git_add_and_commit(&format!("Add {}", path))?; } + Ok(self) } - assert!( - chunk_count > 1, - "Should produce multiple chunks for a large file" - ); -} -/// This test simulates a multi-directory layout, including deeper nested directories. -/// The scenario attempts cross-platform path handling. 
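Cross-platform path handling here comes down to emitting repo-relative, forward-slash paths; the `normalize_path_with_root` tests later in this patch exercise the same idea. A sketch of that normalization (an assumption about the intent, not the crate's implementation):

use std::path::Path;

// Repo-relative path with forward slashes, whatever the host separator is.
fn rel_forward_slashes(path: &Path, root: &Path) -> String {
    let rel = path.strip_prefix(root).unwrap_or(path);
    rel.to_string_lossy().replace('\\', "/")
}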
-#[test] -fn e2e_nested_paths() { - let repo = setup_temp_repo(); + fn run(&self, args: &[&str]) -> Result<(String, String)> { + let mut cmd = Command::new(env!("CARGO_BIN_EXE_yek")); + cmd.args(args).current_dir(&self.dir); - // Nested directories - create_file( - repo.path(), - "src/module1/foo.rs", - "// module1 foo".as_bytes(), - ); - create_file( - repo.path(), - "src/module1/bar.rs", - "// module1 bar".as_bytes(), - ); - create_file( - repo.path(), - "src/module2/baz.rs", - "// module2 baz".as_bytes(), - ); - create_file( - repo.path(), - "src/module2/extra/deep_file.rs", - "// deep nested file".as_bytes(), - ); + if let Some(config) = &self.config { + let config_path = self.dir.path().join("yek.toml"); + fs::write(&config_path, config)?; + } - let output_dir = repo.path().join("yek-output"); - ensure_empty_output_dir(&output_dir); + let output = cmd + .output() + .map_err(|e| anyhow!("Failed to run command: {}", e))?; + let stdout = String::from_utf8_lossy(&output.stdout).into_owned(); + let stderr = String::from_utf8_lossy(&output.stderr).into_owned(); - let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()) - .arg("--output-dir") - .arg(&output_dir) - .arg("--max-size=50KB") - .assert() - .success(); + Ok((stdout, stderr)) + } - // Check chunk content quickly - let mut chunk_found = false; - for entry in fs::read_dir(&output_dir).unwrap() { - let path = entry.unwrap().path(); - if path.extension().unwrap_or_default() == "txt" { - let content = fs::read_to_string(&path).unwrap(); - if content.contains("src/module2/extra/deep_file.rs") { - chunk_found = true; - } - } + fn git_init(&self) { + // Initialize git repo + Command::new("git") + .args(["init", "--quiet", "--initial-branch=main"]) + .current_dir(self.dir.path()) + .output() + .unwrap(); + + // Configure git user + Command::new("git") + .args(["config", "user.name", "test-user"]) + .current_dir(self.dir.path()) + .output() + .unwrap(); + + Command::new("git") + .args(["config", "user.email", "test@example.com"]) + .current_dir(self.dir.path()) + .output() + .unwrap(); + + // Create initial empty commit + Command::new("git") + .args(["commit", "--quiet", "--allow-empty", "-m", "Initial commit"]) + .current_dir(self.dir.path()) + .output() + .unwrap(); + } + + fn git_add_and_commit(&self, message: &str) -> Result<()> { + Command::new("git") + .arg("add") + .arg(".") + .current_dir(&self.dir) + .output() + .map_err(|e| anyhow!("Failed to git add: {}", e))?; + + Command::new("git") + .args(["commit", "-m", message]) + .current_dir(&self.dir) + .output() + .map_err(|e| anyhow!("Failed to git commit: {}", e))?; + + Ok(()) } - assert!(chunk_found, "Nested file wasn't found in output"); } -/// Test cross-platform environment by mocking environment variables or -/// checking for Windows path usage. -/// This won't fully replicate Windows vs. Unix, but it ensures code runs in both -/// without crashing or mishandling path separators. 
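The TTY check the deleted test exercises can be sketched with the standard library's `IsTerminal`; whether yek uses std or a terminal-detection crate is an assumption here:

use std::io::{self, IsTerminal};

// Piped output (not a terminal) => stream to stdout; a TTY => write files.
fn should_stream_to_stdout() -> bool {
    !io::stdout().is_terminal()
}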
#[test] -fn e2e_cross_platform_sanity() { - let repo = setup_temp_repo(); - - // We just put some small files - create_file( - repo.path(), - "windows_path.txt", - "C:\\windows\\style\\path".as_bytes(), - ); - create_file( - repo.path(), - "unix_path.txt", - "/home/user/unix/style/path".as_bytes(), - ); - - let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()) - .env("TERM", "dumb") // Force streaming - .assert() - .success(); - - // We expect the output in stdout - // Because there's no --output-dir and output is not a TTY => streaming - // We'll just check that the command succeeded, for cross-plat sanity. +fn test_basic_processing() -> Result<()> { + let mut setup = TestSetup::new(); + setup + .create_file("file1.txt", "content1")? + .create_file("file2.txt", "content2")?; + + let (stdout, _) = setup.run(&[])?; + assert!(stdout.contains("file1.txt")); + assert!(stdout.contains("file2.txt")); + Ok(()) } -/// This test checks that with piping detection, if STDOUT is a TTY, -/// it writes to a file, otherwise it writes to STDOUT (stream). #[test] -fn e2e_stream_detection() { - let repo = setup_temp_repo(); - create_file(repo.path(), "test.txt", "some content".as_bytes()); - - // We'll forcibly pipe the output into a local buffer - let mut cmd = Command::cargo_bin("yek").unwrap(); - let assert = cmd - .current_dir(repo.path()) - .env("TERM", "dumb") - .output() - .expect("Failed to execute command"); - - let stdout = String::from_utf8_lossy(&assert.stdout); - assert!( - stdout.contains("test.txt"), - "Must see test.txt in streamed output" - ); - assert!( - stdout.contains("some content"), - "Must see file content in streamed output" - ); +fn test_ignore_file() -> Result<()> { + let mut setup = TestSetup::new(); + setup + .create_file(".gitignore", "*.txt")? + .create_file("file1.txt", "content1")? + .create_file("file2.rs", "content2")?; + + let (stdout, _) = setup.run(&[])?; + assert!(!stdout.contains("file1.txt")); + assert!(stdout.contains("file2.rs")); + Ok(()) } -/// This test checks a scenario with a `yek.toml` that modifies ignore patterns, -/// custom binary extensions, and priority rules in a single run. -/// Ensures the end-to-end flow respects all of them. 
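The priority rules in the yek.toml below pair a regex pattern with a score. A sketch of how such rules could resolve to a per-file score, using a local stand-in for yek's `PriorityRule`; whether the highest matching score wins or scores accumulate is an assumption:

use regex::Regex;

struct PriorityRule {
    pattern: String,
    score: i32,
}

// Highest score among rules whose regex matches the repo-relative path.
fn file_score(rel_path: &str, rules: &[PriorityRule]) -> i32 {
    rules
        .iter()
        .filter(|rule| Regex::new(&rule.pattern).map_or(false, |re| re.is_match(rel_path)))
        .map(|rule| rule.score)
        .max()
        .unwrap_or(0)
}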
#[test] -fn e2e_custom_config_all_features() { - let repo = setup_temp_repo(); - - // Custom config - let config_toml = r#" -ignore_patterns = ["assets/", "*.lock"] - -binary_extensions = ["custombin"] - -git_boost_max = 30 - -[[priority_rules]] -pattern = "^core/" -score = 100 - -[[priority_rules]] -pattern = "\\.md$" -score = 50 - -[[priority_rules]] -pattern = ".*" -score = 1 -"#; - create_file(repo.path(), "yek.toml", config_toml.as_bytes()); - - // Some files - create_file( - repo.path(), - "assets/secret.txt", - "should be ignored".as_bytes(), - ); - create_file(repo.path(), "README.md", "readme content".as_bytes()); - create_file(repo.path(), "app.lock", "lock file ignored".as_bytes()); - create_file( - repo.path(), - "core/main.rs", - "core is high priority".as_bytes(), - ); - create_file(repo.path(), "binary.custombin", b"fake binary\x00\x7f"); - - let output_dir = repo.path().join("yek-output"); - ensure_empty_output_dir(&output_dir); - - let mut cmd = Command::cargo_bin("yek").unwrap(); - let assert = cmd - .current_dir(repo.path()) - .arg("--output-dir") - .arg(&output_dir) - .arg("--debug") - .arg("--max-size=10KB") - .output() - .expect("Failed to execute command"); - - println!("STDOUT: {}", String::from_utf8_lossy(&assert.stdout)); - println!("STDERR: {}", String::from_utf8_lossy(&assert.stderr)); - - // Check final chunk (should have `core/main.rs` due to highest priority). - let entries = fs::read_dir(&output_dir).unwrap(); - let mut chunk_files: Vec<_> = entries - .filter_map(|e| { - let p = e.ok()?.path(); - (p.extension()? == "txt").then_some(p) - }) - .collect(); - - chunk_files.sort(); // chunk-0.txt, chunk-1.txt, ... - let last_chunk = chunk_files.last().expect("Must have at least one chunk"); - let content = fs::read_to_string(last_chunk).expect("Read last chunk"); - assert!( - content.contains("core/main.rs"), - "highest priority must come last" - ); - assert!( - !content.contains("assets/secret.txt"), - "ignored file should not appear" - ); - assert!(!content.contains("app.lock"), "lock file is ignored"); - assert!( - !content.contains("binary.custombin"), - "custom bin file is ignored" - ); - // Make sure README.md is included but before the highest priority - // We won't do a heavy check here, just confirm it appears somewhere - let mut included_md = false; - for file in &chunk_files { - let c = fs::read_to_string(file).unwrap(); - if c.contains("README.md") { - included_md = true; - break; - } - } - assert!( - included_md, - "README.md must be included, albeit with lower priority than core/" - ); +fn test_include_file() -> Result<()> { + let mut setup = TestSetup::new(); + setup + .create_file("file1.txt", "content1")? + .create_file("file2.rs", "content2")?; + + let (stdout, _) = setup.run(&["--include", "*.txt"])?; + assert!(stdout.contains("file1.txt")); + assert!(!stdout.contains("file2.rs")); + Ok(()) } -/// This test verifies that after chunking multiple directories at once, -/// the highest priority files from either directory appear last. 
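Both this deleted test and its replacements assert the same ordering contract: files are emitted in ascending priority so the highest-priority file lands last. A sketch of that ordering (illustrative helper, not yek's code):

// Stable sort by score; equal-priority files keep discovery order, and the
// highest-priority file is written last.
fn order_for_output(mut files: Vec<(String, i32)>) -> Vec<(String, i32)> {
    files.sort_by_key(|&(_, score)| score);
    files
}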
#[test] -fn e2e_multi_directory_priority() { - let repo1 = setup_temp_repo(); - let repo2 = setup_temp_repo(); - - // Put a config in each - create_file( - repo1.path(), - "yek.toml", - r#" -[[priority_rules]] -pattern = "^dir1/" -score = 10 -"# - .as_bytes(), - ); - create_file( - repo2.path(), - "yek.toml", - r#" -[[priority_rules]] -pattern = "^super/" -score = 99 -"# - .as_bytes(), - ); - - // Some files in repo1 - create_file(repo1.path(), "dir1/a.txt", "from repo1/dir1".as_bytes()); - create_file(repo1.path(), "dir2/b.txt", "from repo1/dir2".as_bytes()); - // Some files in repo2 - create_file(repo2.path(), "super/c.txt", "from repo2/super".as_bytes()); - create_file(repo2.path(), "basic/d.txt", "from repo2/basic".as_bytes()); - - // Let's process them both at once - let output_dir = TempDir::new().unwrap(); // create a truly separate temp directory - ensure_empty_output_dir(output_dir.path()); - let out_str = output_dir.path().to_str().unwrap(); - - let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.arg(repo1.path()) - .arg(repo2.path()) - .arg("--output-dir") - .arg(out_str) - .arg("--max-size=5KB") - .assert() - .success(); - - // The last chunk should have `super/c.txt` due to higher priority from second repo - let mut chunk_files: Vec<_> = fs::read_dir(&output_dir) - .unwrap() - .filter_map(|e| { - let p = e.ok()?.path(); - (p.extension()? == "txt").then_some(p) - }) - .collect(); - chunk_files.sort(); - - let last_chunk = chunk_files.last().expect("need at least one chunk"); - let content = fs::read_to_string(last_chunk).unwrap(); - assert!( - content.contains("super/c.txt"), - "highest priority must come last" - ); - // dir1 is priority 10, super is priority 99 => super is last +fn test_git_integration() -> Result<()> { + let mut setup = TestSetup::new(); + setup.with_git(); + setup + .create_file("file1.txt", "content1")? + .create_file("file2.txt", "content2")?; + + let (stdout, _) = setup.run(&[])?; + assert!(stdout.contains("file1.txt")); + assert!(stdout.contains("file2.txt")); + Ok(()) } -/// This test tries to feed a large number of small files to check if we handle them in parallel -/// without overloading the aggregator or losing order correctness. #[test] -fn e2e_many_small_files_parallel() { - let repo = setup_temp_repo(); - - // Create many small files - for i in 0..200 { - let file_name = format!("file_{:03}.txt", i); - let content = "some small content\n".repeat(100); - create_file(repo.path(), &file_name, content.as_bytes()); - } - - // We rely on environment CPU cores for parallel chunk creation - // Then confirm all files appear in the final output - let output_dir = repo.path().join("yek-output"); - ensure_empty_output_dir(&output_dir); - - let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()) - .arg("--output-dir") - .arg(&output_dir) - .arg("--max-size=5KB") // Much smaller chunk size - .assert() - .success(); - - // Ensure we have multiple chunks - let mut chunk_files: Vec<_> = fs::read_dir(&output_dir) - .unwrap() - .filter_map(|e| { - let p = e.ok()?.path(); - if p.extension()? == "txt" { - // Extract chunk index from filename "chunk-{index}.txt" - let index = p - .file_stem()? - .to_str()? - .strip_prefix("chunk-")? - .split("-part-") // Handle split parts if any - .next()? 
- .parse::() - .ok()?; - Some((index, p)) - } else { - None - } - }) - .collect(); - // Sort by chunk index - chunk_files.sort_by_key(|(index, _)| *index); - let chunk_files: Vec<_> = chunk_files.into_iter().map(|(_, p)| p).collect(); - - assert!( - chunk_files.len() > 1, - "Must produce multiple chunks with 200 small files" - ); - - // Check if files appear in any chunk - let mut found_first = false; - let mut found_last = false; +fn test_dir_config() -> Result<()> { + let mut setup = TestSetup::new(); + setup + .with_config( + r#" + ignore_patterns = ["*.txt"] + "#, + ) + .create_file("file1.txt", "content1")? + .create_file("file2.rs", "content2")?; + + let (stdout, _) = setup.run(&[])?; + assert!(!stdout.contains("file1.txt")); + assert!(stdout.contains("file2.rs")); + Ok(()) +} - for chunk_file in &chunk_files { - let content = fs::read_to_string(chunk_file).unwrap(); - if content.contains(">>>> file_000.txt") { - found_first = true; - } - if content.contains(">>>> file_199.txt") { - found_last = true; - } - } +#[test] +fn test_max_size() -> Result<()> { + let mut setup = TestSetup::new(); + setup + .create_file("src/main.rs", "fn main() {}")? + .create_file("image.png", "binary data")? + .create_binary_file("big.bin", 1024)?; + + let (stdout, _) = setup.run(&["--max-size=200KB"])?; + assert!(stdout.contains("src/main.rs")); + assert!(!stdout.contains("image.png")); + assert!(!stdout.contains("big.bin")); + Ok(()) +} - assert!(found_first, "file_000.txt must appear in some chunk"); - assert!(found_last, "file_199.txt must appear in some chunk"); +#[test] +fn test_invalid_config() -> Result<()> { + let mut setup = TestSetup::new(); + setup.with_config("invalid toml"); + let (_, stderr) = setup.run(&["--max-size=200KB"])?; + assert!(stderr.contains("Failed to parse config")); + Ok(()) } diff --git a/tests/test_e2e.rs2 b/tests/test_e2e.rs2 new file mode 100644 index 0000000..47b93d8 --- /dev/null +++ b/tests/test_e2e.rs2 @@ -0,0 +1,288 @@ + +use anyhow::Result; +use predicates::prelude::*; +use std::{ + fs::{self, File}, + io::Write, + path::Path, + process::Command, + thread, + time::{Duration, SystemTime}, +}; +use tempfile::TempDir; + +struct TestSetup { + dir: TempDir, + config: Option, + git: bool, +} + +impl TestSetup { + fn new() -> Self { + TestSetup { + dir: TempDir::new().unwrap(), + config: None, + git: false, + } + } + + fn with_config(mut self, config: &str) -> Self { + self.config = Some(config.to_string()); + self + } + + fn with_git(mut self) -> Self { + self.git = true; + self + } + + fn create_file(&self, path: &str, contents: &str) -> String { + let full_path = self.dir.path().join(path); + fs::create_dir_all(full_path.parent().unwrap()).unwrap(); + fs::write(&full_path, contents).unwrap(); + full_path.to_str().unwrap().to_string() + } + + fn create_binary_file(&self, path: &str, size: usize) -> String { + let full_path = self.dir.path().join(path); + let content = vec![0u8; size]; + fs::write(&full_path, content).unwrap(); + full_path.to_str().unwrap().to_string() + } + + fn run(&self, args: &[&str]) -> (String, String) { + let mut cmd = Command::new(env!("CARGO_BIN_EXE_yek")); + cmd.arg("--no-stream"); + + if let Some(config) = &self.config { + let config_path = self.dir.path().join("yek.toml"); + fs::write(&config_path, config).unwrap(); + cmd.arg("--config").arg(config_path); + } + + cmd.arg(self.dir.path()); + cmd.args(args); + + let output = cmd.output().unwrap(); + ( + String::from_utf8_lossy(&output.stdout).into_owned(), + 
String::from_utf8_lossy(&output.stderr).into_owned(), + ) + } + + fn git_init(&self) { + Command::new("git") + .arg("init") + .current_dir(self.dir.path()) + .output() + .unwrap(); + } + + fn git_commit(&self, message: &str) { + Command::new("git") + .args(["add", "."]) + .current_dir(self.dir.path()) + .output() + .unwrap(); + + Command::new("git") + .args(["commit", "-m", message]) + .current_dir(self.dir.path()) + .env("GIT_AUTHOR_DATE", "2024-01-01T00:00:00") + .env("GIT_COMMITTER_DATE", "2024-01-01T00:00:00") + .output() + .unwrap(); + } +} + +#[test] +fn test_basic_processing() -> Result<()> { + let setup = TestSetup::new() + .create_file("src/main.rs", "fn main() {}") + .create_file("image.png", "binary data") + .create_binary_file("big.bin", 1024); + + let (output, _) = setup.run(&[]); + + // Should include text files + assert!(output.contains("src/main.rs")); + + // Should exclude binary files + assert!(!output.contains("image.png")); + assert!(!output.contains("big.bin")); + + Ok(()) +} + +#[test] +fn test_ignore_patterns() -> Result<()> { + let config = r#" + ignore_patterns = ["temp/*", "*.log"] + "#; + + let setup = TestSetup::new() + .with_config(config) + .create_file("temp/file.txt", "ignore") + .create_file("app.log", "logs") + .create_file("src/main.rs", "code"); + + let (output, _) = setup.run(&[]); + + assert!(!output.contains("temp/file.txt")); + assert!(!output.contains("app.log")); + assert!(output.contains("src/main.rs")); + + Ok(()) +} + +#[test] +fn test_priority_ordering() -> Result<()> { + let config = r#" + [[priority_rules]] + pattern = "src/*" + score = 10 + + [[priority_rules]] + pattern = "tests/*" + score = 5 + "#; + + let setup = TestSetup::new() + .with_config(config) + .create_file("src/a.rs", "a") + .create_file("tests/b.rs", "b") + .create_file("docs/c.md", "c"); + + let (output, _) = setup.run(&[]); + + let positions = [ + output.find("src/a.rs"), + output.find("tests/b.rs"), + output.find("docs/c.md"), + ]; + + assert!(positions[0] < positions[1]); + assert!(positions[1] < positions[2]); + + Ok(()) +} + +#[test] +fn test_git_commit_ordering() -> Result<()> { + let setup = TestSetup::new() + .with_git() + .create_file("file1.txt", "1") + .create_file("file2.txt", "2"); + + setup.git_init(); + + // Initial commit + setup.git_commit("Initial commit"); + + // Modify file2 and commit + setup.create_file("file2.txt", "updated"); + setup.git_commit("Update file2"); + + let (output, _) = setup.run(&[]); + + let pos1 = output.find("file1.txt").unwrap(); + let pos2 = output.find("file2.txt").unwrap(); + assert!(pos2 < pos1, "Recently modified file2 should come first"); + + Ok(()) +} + +#[test] +fn test_token_mode_truncation() -> Result<()> { + let config = r#" + token_mode = true + tokenizer_model = "openai" + max_size = "10" + "#; + + let setup = TestSetup::new() + .with_config(config) + .create_file("long.txt", "This is a test sentence that should be truncated."); + + let (output, _) = setup.run(&[]); + + // Verify truncation happened + assert!(output.len() < 50); + assert!(output.contains("[TRUNCATED]") || output.ends_with("...")); + + Ok(()) +} + +#[test] +fn test_config_merging() -> Result<()> { + let global_config = r#" + ignore_patterns = ["global_ignore"] + binary_extensions = ["xyz"] + "#; + + let dir_config = r#" + ignore_patterns = ["local_ignore"] + priority_rules = [{ pattern = "src/*", score = 10 }] + "#; + + let setup = TestSetup::new() + .with_config(global_config) + .create_file("yek.toml", dir_config) + .create_file("global_ignore", "") + 
.create_file("local_ignore", "") + .create_file("src/main.rs", "") + .create_file("file.xyz", ""); + + let (output, _) = setup.run(&[]); + + assert!(!output.contains("global_ignore")); // Both ignores should be excluded + assert!(!output.contains("local_ignore")); + assert!(output.contains("src/main.rs")); // Priority file included + assert!(!output.contains("file.xyz")); // Custom binary extension excluded + + Ok(()) +} + +#[test] +fn test_error_handling() -> Result<()> { + // Test invalid model + let setup = TestSetup::new(); + let (_, stderr) = setup.run(&["--tokens=invalid"]); + assert!(stderr.contains("Unsupported model family")); + + // Test invalid size format + let (_, stderr) = setup.run(&["--max-size=10invalid"]); + assert!(stderr.contains("Invalid byte format")); + + // Test invalid config file + let setup = TestSetup::new().with_config("invalid toml"); + let (_, stderr) = setup.run(&[]); + assert!(stderr.contains("Failed to parse config")); + + Ok(()) +} + +#[test] +fn test_output_modes() -> Result<()> { + // Test byte mode + let setup = TestSetup::new() + .create_file("test.txt", "A".repeat(5000)); + + let (output, _) = setup.run(&["--max-size=1KB"]); + assert!(output.len() <= 1024); + + // Test token mode + let config = r#" + token_mode = true + max_size = "5" + "#; + + let setup = TestSetup::new() + .with_config(config) + .create_file("test.txt", "Hello world"); + + let (output, _) = setup.run(&[]); + assert!(output.len() < "Hello world".len()); + + Ok(()) +} \ No newline at end of file diff --git a/tests/test_gitignore_e2e.rs b/tests/test_gitignore_e2e.rs index 3097677..a34a943 100644 --- a/tests/test_gitignore_e2e.rs +++ b/tests/test_gitignore_e2e.rs @@ -1,8 +1,45 @@ mod integration_common; +use assert_cmd::Command; use integration_common::{create_file, setup_temp_repo}; use std::fs; use yek::{find_config_file, load_config_file, serialize_repo, YekConfig}; +/// Helper to run yek in streaming mode (pipe to stdout) +#[allow(dead_code)] +fn run_stream_mode(dir: &std::path::Path) -> String { + let output = Command::cargo_bin("yek") + .unwrap() + .current_dir(dir) + .env("TERM", "dumb") // Force non-interactive mode + .env("NO_COLOR", "1") // Disable color output + .env("CI", "1") // Force CI mode + .output() + .expect("Failed to execute command"); + + String::from_utf8_lossy(&output.stdout).into_owned() +} + +/// Helper to run yek in file mode (write to output directory) +#[allow(dead_code)] +fn run_file_mode(dir: &std::path::Path) -> String { + let output_dir = dir.join("output"); + let _ = Command::cargo_bin("yek") + .unwrap() + .current_dir(dir) + .arg("--output-dir") + .arg(&output_dir) + .assert() + .success(); + + // Read all part files + let mut content = String::new(); + for entry in fs::read_dir(output_dir).unwrap() { + let path = entry.unwrap().path(); + content.push_str(&fs::read_to_string(path).unwrap()); + } + content +} + #[test] fn test_gitignore_basic() -> Result<(), Box> { let repo = setup_temp_repo(); @@ -21,34 +58,30 @@ fn test_gitignore_basic() -> Result<(), Box> { file_cfg.output_dir = Some(output_dir.clone()); file_cfg } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } } } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } }; serialize_repo(repo.path(), Some(&config))?; - // Read all chunk contents - let 
mut combined_content = String::new(); - for entry in fs::read_dir(&output_dir)? { - let entry = entry?; - let path = entry.path(); - if path.is_file() { - combined_content.push_str(&fs::read_to_string(path)?); - } - } + // Read output file + let output_file = output_dir.join("output.txt"); + let content = fs::read_to_string(&output_file)?; assert!( - !combined_content.contains(">>>> ignore_me.txt"), + !content.contains(">>>> ignore_me.txt"), "ignore_me.txt should be ignored" ); assert!( - combined_content.contains(">>>> keep_me.txt"), + content.contains(">>>> keep_me.txt"), "keep_me.txt should be kept" ); @@ -83,38 +116,34 @@ fn test_gitignore_subdirectory() -> Result<(), Box> { file_cfg.output_dir = Some(output_dir.clone()); file_cfg } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } } } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } }; serialize_repo(repo.path(), Some(&config))?; - // Read all chunk contents - let mut combined_content = String::new(); - for entry in fs::read_dir(&output_dir)? { - let entry = entry?; - let path = entry.path(); - if path.is_file() { - combined_content.push_str(&fs::read_to_string(path)?); - } - } + // Read output file + let output_file = output_dir.join("output.txt"); + let content = fs::read_to_string(&output_file)?; assert!( - !combined_content.contains(">>>> otherdir/settings.temp"), + !content.contains(">>>> otherdir/settings.temp"), "settings.temp should be ignored by root .gitignore" ); assert!( - !combined_content.contains(">>>> subdir/secret.conf"), + !content.contains(">>>> subdir/secret.conf"), "secret.conf should be ignored by subdirectory .gitignore" ); assert!( - combined_content.contains(">>>> subdir/app.rs"), + content.contains(">>>> subdir/app.rs"), "app.rs should be kept" ); @@ -152,46 +181,42 @@ temp/* file_cfg.output_dir = Some(output_dir.clone()); file_cfg } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } } } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } }; serialize_repo(repo.path(), Some(&config))?; - // Read all chunk contents - let mut combined_content = String::new(); - for entry in fs::read_dir(&output_dir)? 
{ - let entry = entry?; - let path = entry.path(); - if path.is_file() { - combined_content.push_str(&fs::read_to_string(path)?); - } - } + // Read output file + let output_file = output_dir.join("output.txt"); + let content = fs::read_to_string(&output_file)?; assert!( - !combined_content.contains(">>>> error.log"), + !content.contains(">>>> error.log"), "error.log should be ignored" ); assert!( - !combined_content.contains(">>>> build/output.exe"), + !content.contains(">>>> build/output.exe"), "build/output.exe should be ignored" ); assert!( - !combined_content.contains(">>>> temp/junk.tmp"), + !content.contains(">>>> temp/junk.tmp"), "temp/junk.tmp should be ignored" ); assert!( - combined_content.contains(">>>> temp/keep.me"), + content.contains(">>>> temp/keep.me"), "temp/keep.me should be kept (negated pattern)" ); assert!( - combined_content.contains(">>>> src/main.rs"), + content.contains(">>>> src/main.rs"), "src/main.rs should be kept" ); @@ -206,7 +231,7 @@ fn test_gitignore_and_yek_toml() -> Result<(), Box> { create_file( repo.path(), "yek.toml", - b"ignore_patterns = [\"^exclude/.*$\"]\n", + b"ignore_patterns = [\"exclude/**\"]\n", ); // Create .gitignore @@ -233,42 +258,38 @@ fn test_gitignore_and_yek_toml() -> Result<(), Box> { file_cfg.output_dir = Some(output_dir.clone()); file_cfg } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } } } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } }; serialize_repo(repo.path(), Some(&config))?; - // Read all chunk contents - let mut combined_content = String::new(); - for entry in fs::read_dir(&output_dir)? 
{ - let entry = entry?; - let path = entry.path(); - if path.is_file() { - combined_content.push_str(&fs::read_to_string(path)?); - } - } + // Read output file + let output_file = output_dir.join("output.txt"); + let content = fs::read_to_string(&output_file)?; assert!( - !combined_content.contains(">>>> exclude/secret.txt"), + !content.contains(">>>> exclude/secret.txt"), "exclude/secret.txt should be ignored by yek.toml" ); assert!( - !combined_content.contains(">>>> test.tmp"), + !content.contains(">>>> test.tmp"), "test.tmp should be ignored by .gitignore" ); assert!( - !combined_content.contains(">>>> node_modules/lib.js"), + !content.contains(">>>> node_modules/lib.js"), "node_modules/lib.js should be ignored by .gitignore" ); assert!( - combined_content.contains(">>>> src/index.rs"), + content.contains(">>>> src/index.rs"), "src/index.rs should be kept" ); @@ -282,7 +303,7 @@ fn test_gitignore_binary_files() -> Result<(), Box> { // Create test files with binary content create_file(repo.path(), "binary.jpg", b"\xFF\xD8\xFF\xDB"); // JPEG magic bytes create_file(repo.path(), "text.txt", b"normal text"); - create_file(repo.path(), "unknown.xyz", b"unknown format"); + create_file(repo.path(), "unknown.xyz", b"unknown\0format"); // Add null byte // Run serialization let output_dir = repo.path().join("test_output"); @@ -293,38 +314,31 @@ fn test_gitignore_binary_files() -> Result<(), Box> { file_cfg.output_dir = Some(output_dir.clone()); file_cfg } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } } } else { - let mut cfg = YekConfig::default(); - cfg.output_dir = Some(output_dir.clone()); - cfg + YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + } }; serialize_repo(repo.path(), Some(&config))?; - // Read all chunk contents - let mut combined_content = String::new(); - for entry in fs::read_dir(&output_dir)? 
{ - let entry = entry?; - let path = entry.path(); - if path.is_file() { - combined_content.push_str(&fs::read_to_string(path)?); - } - } + // Read output file + let output_file = output_dir.join("output.txt"); + let content = fs::read_to_string(&output_file)?; assert!( - !combined_content.contains(">>>> binary.jpg"), + !content.contains(">>>> binary.jpg"), "binary.jpg should be ignored as a binary file" ); + assert!(content.contains(">>>> text.txt"), "text.txt should be kept"); assert!( - combined_content.contains(">>>> text.txt"), - "text.txt should be kept" - ); - assert!( - !combined_content.contains(">>>> unknown.xyz"), + !content.contains(">>>> unknown.xyz"), "unknown.xyz should be ignored as a binary file (unknown extension)" ); diff --git a/tests/test_ignore_patterns.rs b/tests/test_ignore_patterns.rs index 59cce4a..f1eb732 100644 --- a/tests/test_ignore_patterns.rs +++ b/tests/test_ignore_patterns.rs @@ -1,169 +1,43 @@ mod integration_common; use assert_cmd::Command as AssertCommand; use integration_common::{create_file, setup_temp_repo}; -use std::process::Command; #[test] fn respects_gitignore() { let repo = setup_temp_repo(); - println!("Created temp repo at: {}", repo.path().display()); + let output_dir = repo.path().join("output"); - create_file(repo.path(), ".gitignore", "ignore_me/**\n".as_bytes()); - println!( - "Created .gitignore at: {}", - repo.path().join(".gitignore").display() - ); + // Create and commit .gitignore and keep_me/foo.txt + create_file(repo.path(), ".gitignore", b"ignore_me/**\n"); + create_file(repo.path(), "keep_me/foo.txt", b"should be included"); - create_file( - repo.path(), - "ignore_me/foo.txt", + // Create ignored file without adding to git (untracked) + let ignore_me_dir = repo.path().join("ignore_me"); + std::fs::create_dir_all(&ignore_me_dir).unwrap(); + std::fs::write( + ignore_me_dir.join("foo.txt"), "should be ignored".as_bytes(), - ); - println!( - "Created ignored file at: {}", - repo.path().join("ignore_me/foo.txt").display() - ); - - create_file( - repo.path(), - "keep_me/foo.txt", - "should be included".as_bytes(), - ); - println!( - "Created kept file at: {}", - repo.path().join("keep_me/foo.txt").display() - ); + ) + .unwrap(); let mut cmd = AssertCommand::cargo_bin("yek").unwrap(); - let output = cmd - .current_dir(repo.path()) + cmd.current_dir(repo.path()) .arg("--debug") + .arg("--output-dir") + .arg(&output_dir) .output() .expect("Failed to execute command"); - assert!(output.status.success()); - let stdout = String::from_utf8_lossy(&output.stdout); - println!("\nSTDOUT:\n{}", stdout); - println!("\nSTDERR:\n{}", String::from_utf8_lossy(&output.stderr)); + // Read the output file + let output_file = output_dir.join("output.txt"); + let content = std::fs::read_to_string(output_file).expect("Failed to read output file"); - // Check that only the non-ignored file is in stdout assert!( - stdout.contains(">>>> keep_me/foo.txt"), + content.contains(">>>> keep_me/foo.txt"), "Should include non-ignored file" ); assert!( - !stdout.contains(">>>> ignore_me/foo.txt"), + !content.contains(">>>> ignore_me/foo.txt"), "Should not include ignored file" ); } - -#[test] -fn respects_custom_config_file() { - let repo = setup_temp_repo(); - let repo_path = repo.path().to_path_buf(); // Store path before repo is moved - println!("Created temp repo at: {}", repo_path.display()); - - // Initialize git repo - let status = Command::new("git") - .args(["init"]) - .current_dir(&repo_path) - .status() - .expect("Failed to init git repo"); - 
assert!(status.success(), "git init failed"); - - // Configure git user info - let status = Command::new("git") - .args(["config", "--global", "user.name", "Test User"]) - .status() - .expect("Failed to configure git user name"); - assert!(status.success(), "git config user.name failed"); - - let status = Command::new("git") - .args(["config", "--global", "user.email", "test@example.com"]) - .status() - .expect("Failed to configure git user email"); - assert!(status.success(), "git config user.email failed"); - - create_file( - &repo_path, - "yek.toml", - r#" -ignore_patterns = [ - "^dont_serialize/" -] -"# - .as_bytes(), - ); - println!( - "Created yek.toml at: {}", - repo_path.join("yek.toml").display() - ); - - create_file( - &repo_path, - "dont_serialize/file.rs", - "ignored by config".as_bytes(), - ); - println!( - "Created ignored file at: {}", - repo_path.join("dont_serialize/file.rs").display() - ); - - create_file( - &repo_path, - "do_serialize/file.rs", - "should be included".as_bytes(), - ); - println!( - "Created kept file at: {}", - repo_path.join("do_serialize/file.rs").display() - ); - - // Add and commit files - let status = Command::new("git") - .args(["add", "-f", "."]) - .current_dir(&repo_path) - .status() - .expect("Failed to add files to git"); - assert!(status.success(), "git add failed"); - - // Print git status before commit - let status = Command::new("git") - .args(["status"]) - .current_dir(&repo_path) - .status() - .expect("Failed to get git status"); - assert!(status.success(), "git status failed"); - - let status = Command::new("git") - .args(["commit", "-m", "Initial commit"]) - .current_dir(&repo_path) - .status() - .expect("Failed to commit files"); - assert!(status.success(), "git commit failed"); - - let mut cmd = AssertCommand::cargo_bin("yek").unwrap(); - let output = cmd - .current_dir(&repo_path) - .arg("--debug") - .output() - .expect("Failed to execute command"); - - assert!(output.status.success()); - let stdout = String::from_utf8_lossy(&output.stdout); - println!("\nSTDOUT:\n{}", stdout); - println!("\nSTDERR:\n{}", String::from_utf8_lossy(&output.stderr)); - - // Check that only the non-ignored file is in stdout - assert!( - stdout.contains(">>>> do_serialize/file.rs"), - "Should include non-ignored file" - ); - assert!( - !stdout.contains(">>>> dont_serialize/file.rs"), - "Should not include ignored file" - ); - - // Keep repo alive until end of test - drop(repo); -} diff --git a/tests/test_max_size.rs b/tests/test_max_size.rs index d091866..76c0ca5 100644 --- a/tests/test_max_size.rs +++ b/tests/test_max_size.rs @@ -3,94 +3,130 @@ mod integration_common; use assert_cmd::Command; use integration_common::{create_file, ensure_empty_output_dir, setup_temp_repo}; use std::fs; +use yek::model_manager; -/// Writes a file larger than the default 10MB limit in tokens or bytes, forcing multiple chunks. +/// Writes a file larger than the default 10MB limit in tokens or bytes, forcing trimming. 
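The rewritten tests below verify the token-mode budget with yek's `model_manager::count_tokens`; as a simplified illustration of what a token cap does, here is a whitespace-tokenizer stand-in (deliberately not the real tokenizer):

// Keep at most `max_tokens` whitespace-delimited tokens of the output.
fn cap_to_tokens(serialized: &str, max_tokens: usize) -> String {
    serialized
        .split_whitespace()
        .take(max_tokens)
        .collect::<Vec<_>>()
        .join(" ")
}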
 #[test]
-fn splits_large_file_in_chunks_bytes_mode() {
+fn trims_large_file_in_bytes_mode() {
+    let _ = env_logger::builder().is_test(true).try_init();
+    println!("Starting bytes mode test");
+
     let repo = setup_temp_repo();
-    let large_content = "A ".repeat(1024 * 1024 * 11); // ~ 11MB
+    println!("Temp repo path: {}", repo.path().display());
+
+    let large_content = "A ".repeat(1024 * 100); // ~200KB of text
+    println!("Created content with size: {} bytes", large_content.len());
+    create_file(repo.path(), "BIG.txt", large_content.as_bytes());
+    println!(
+        "Created test file: {}",
+        repo.path().join("BIG.txt").display()
+    );
 
     let output_dir = repo.path().join("yek-output");
     ensure_empty_output_dir(&output_dir);
+    println!("Cleared output directory: {}", output_dir.display());
 
-    let debug_output = repo.path().join("debug.log");
     let mut cmd = Command::cargo_bin("yek").unwrap();
+    println!("Running command with --max-size=50KB");
     let output = cmd
         .current_dir(repo.path())
-        .arg("--max-size")
-        .arg("10MB")
-        .arg("--debug")
+        .arg("--max-size=50KB")
         .arg("--output-dir")
         .arg(&output_dir)
-        .env("YEK_DEBUG_OUTPUT", &debug_output)
+        .arg("--debug")
+        .arg(repo.path())
         .output()
         .expect("Failed to execute command");
 
     let stderr = String::from_utf8_lossy(&output.stderr);
-    println!("stderr: {}", stderr);
+    println!("Command stderr:\n{}", stderr);
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    println!("Command stdout:\n{}", stdout);
+    println!("Command exit status: {}", output.status);
 
-    // Read debug output
-    let debug_log = fs::read_to_string(&debug_output).expect("Failed to read debug log");
-    println!("debug log: {}", debug_log);
+    assert!(output.status.success());
+
+    let output_file = output_dir.join("output.txt");
+    println!("Checking output file: {}", output_file.display());
+    assert!(output_file.exists(), "output.txt should exist");
+
+    assert!(stdout.contains(&output_file.display().to_string()));
+
+    let content = fs::read_to_string(&output_file).expect("Failed to read output file");
+    println!("Output file size: {} bytes", content.len());
 
-    // Check debug messages
-    assert!(
-        debug_log.contains("File exceeds chunk size, splitting into multiple chunks"),
-        "Should indicate file exceeds chunk size"
-    );
-    assert!(
-        debug_log.contains("Writing large file part 0"),
-        "Should write first part"
-    );
     assert!(
-        debug_log.contains("Writing large file part 1"),
-        "Should write second part"
+        content.len() <= 51200, // 50KB = 50 * 1024 bytes
+        "File content should be at most 51200 bytes (50KB, including headers), but was {} bytes",
+        content.len()
     );
 }
 
 #[test]
-fn splits_large_file_in_chunks_token_mode() {
+fn trims_large_file_in_token_mode() {
+    let _ = env_logger::builder().is_test(true).try_init();
+    println!("Starting token mode test");
+
     let repo = setup_temp_repo();
+    println!("Temp repo path: {}", repo.path().display());
+
     // Each "word" is a token
-    let large_content = "TOKEN ".repeat(200_000); // enough tokens to exceed default
+    let large_content = r"
+200 tokens exactly! Okay, let's try to figure out why the test is failing. The user mentioned that the command isn't writing to disk when using `--tokens`, and the test output shows that the stdout has the content of the file but the output.txt isn't being created.
+First, I need to look at how the output is handled in the code. In `src/lib.rs`, the `process_directory` function checks if `config.stream` is true. If it is, it prints the output to stdout. Otherwise, it writes to the output directory.
+Wait, in the test, when using `--tokens`, maybe the `stream` configuration is being set incorrectly. Let me check the `Args` struct in `src/main.rs`. There's a line where `config.stream` is set based on whether stdout is a terminal. But during tests, when running the command, stdout might not be a terminal, so `stream` would be true, causing it to print to stdout instead of writing"; + println!( + "Created test content with length: {} bytes", + large_content.len() + ); + create_file(repo.path(), "BIG_token.txt", large_content.as_bytes()); + println!( + "Created test file: {}", + repo.path().join("BIG_token.txt").display() + ); let output_dir = repo.path().join("yek-output"); ensure_empty_output_dir(&output_dir); + println!("Cleared output directory: {}", output_dir.display()); - let debug_output = repo.path().join("debug.log"); let mut cmd = Command::cargo_bin("yek").unwrap(); + println!("Running command with --tokens=openai and --max-size=150"); let output = cmd .current_dir(repo.path()) - .arg("--tokens") + .arg("--tokens=openai") .arg("--max-size") - .arg("150000") // ~150k tokens - .arg("--debug") + .arg("150") .arg("--output-dir") .arg(&output_dir) - .env("YEK_DEBUG_OUTPUT", &debug_output) + .arg("--debug") + .arg(repo.path()) .output() .expect("Failed to execute command"); let stderr = String::from_utf8_lossy(&output.stderr); - println!("stderr: {}", stderr); + println!("Command stderr:\n{}", stderr); + let stdout = String::from_utf8_lossy(&output.stdout); + println!("Command stdout:\n{}", stdout); + println!("Command exit status: {}", output.status); - // Read debug output - let debug_log = fs::read_to_string(&debug_output).expect("Failed to read debug log"); - println!("debug log: {}", debug_log); + assert!(output.status.success()); + + let output_file = output_dir.join("output.txt"); + println!("Checking output file: {}", output_file.display()); + assert!(output_file.exists(), "output.txt should exist"); - // Check debug messages - assert!( - debug_log.contains("File exceeds chunk size, splitting into multiple chunks"), - "Should indicate file exceeds chunk size" - ); - assert!( - debug_log.contains("Writing large file part 0"), - "Should write first part" - ); assert!( - debug_log.contains("Writing large file part 1"), - "Should write second part" + stdout.contains(&output_file.display().to_string()), + "stdout should contain path of output file" ); + + let content = fs::read_to_string(&output_file).expect("Failed to read output file"); + let token_count = model_manager::count_tokens(&content, "openai").unwrap(); + println!("Output file token count: {}", token_count); + println!("Output file content:\n{}", content); + + assert!(token_count <= 150, "Should not exceed token limit"); + assert!(token_count >= 100, "Should preserve most important content"); } diff --git a/tests/test_normalize_path.rs b/tests/test_normalize_path.rs index 9418880..35f1066 100644 --- a/tests/test_normalize_path.rs +++ b/tests/test_normalize_path.rs @@ -1,46 +1,26 @@ -use std::path::PathBuf; -use yek::normalize_path; +use std::path::Path; +use yek::normalize_path_with_root; #[test] -fn test_normalize_path() { - let base = PathBuf::from("/base/path"); - let path = PathBuf::from("/base/path/foo/bar.txt"); - assert_eq!(normalize_path(&path, &base), "foo/bar.txt"); +fn test_normalize_with_base() { + let base = Path::new("/base/dir"); + let path = base.join("foo/bar.txt"); + let other_path = Path::new("/other/path/baz.txt"); - // Test with path not under base - let other_path = 
PathBuf::from("/other/path/baz.txt"); - assert_eq!(normalize_path(&other_path, &base), "/other/path/baz.txt"); - - // Test with relative paths - let rel_base = PathBuf::from("base"); - let rel_path = PathBuf::from("base/foo/bar.txt"); - assert_eq!(normalize_path(&rel_path, &rel_base), "foo/bar.txt"); - - // Test with current directory - let current = PathBuf::from("."); - assert_eq!(normalize_path(¤t, ¤t), "."); - - // Test with Windows-style absolute path - #[cfg(target_family = "windows")] - { - let win_path = PathBuf::from("C:\\other\\path\\baz.txt"); - assert_eq!(normalize_path(&win_path, &base), "C:/other/path/baz.txt"); + assert_eq!(normalize_path_with_root(&path, base), "foo/bar.txt"); + assert_eq!(normalize_path_with_root(other_path, base), "baz.txt"); +} - let win_unc = PathBuf::from("\\\\server\\share\\file.txt"); - assert_eq!(normalize_path(&win_unc, &base), "//server/share/file.txt"); +#[test] +fn test_normalize_relative_paths() { + let rel_base = Path::new("some/relative/dir"); + let rel_path = rel_base.join("foo/bar.txt"); - // Test with forward slashes in UNC path - let win_unc_fwd = PathBuf::from("//server/share/file.txt"); - assert_eq!( - normalize_path(&win_unc_fwd, &base), - "//server/share/file.txt" - ); + assert_eq!(normalize_path_with_root(&rel_path, rel_base), "foo/bar.txt"); +} - // Test with mixed slashes in UNC path - let win_unc_mixed = PathBuf::from("\\/server\\share/file.txt"); - assert_eq!( - normalize_path(&win_unc_mixed, &base), - "//server/share/file.txt" - ); - } +#[test] +fn test_normalize_current_dir() { + let current = Path::new("."); + assert_eq!(normalize_path_with_root(current, current), "."); } diff --git a/tests/test_part_order.rs b/tests/test_part_order.rs new file mode 100644 index 0000000..2d5345b --- /dev/null +++ b/tests/test_part_order.rs @@ -0,0 +1,61 @@ +mod integration_common; +use std::fs; +use tempfile::TempDir; +use yek::serialize_repo; +use yek::PriorityRule; +use yek::YekConfig; + +/// Tests that files are written in descending priority order within a part. 
+#[test] +fn part_order_reflects_priority() { + let temp = TempDir::new().unwrap(); + let output_dir = temp.path().join("yek-output"); + fs::create_dir_all(&output_dir).unwrap(); + + // Create test files with different priorities + let files = vec![ + ("a.txt", "content a", 1), + ("b.txt", "content b", 2), + ("c.txt", "content c", 3), + ]; + + for (name, content, _) in &files { + let path = temp.path().join(name); + fs::write(&path, content).unwrap(); + } + + // Run serialization with priority rules + let config = YekConfig { + output_dir: Some(output_dir.clone()), + priority_rules: vec![ + PriorityRule { + pattern: "a.txt".to_string(), + score: 1, + }, + PriorityRule { + pattern: "b.txt".to_string(), + score: 2, + }, + PriorityRule { + pattern: "c.txt".to_string(), + score: 3, + }, + ], + ..Default::default() + }; + serialize_repo(temp.path(), Some(&config)).unwrap(); + + // All files should be in output.txt + let output_path = output_dir.join("output.txt"); + let content = fs::read_to_string(output_path).unwrap(); + + // Check that files appear in ascending priority order (higher priority files last) + let a_pos = content.find("a.txt").unwrap(); + let b_pos = content.find("b.txt").unwrap(); + let c_pos = content.find("c.txt").unwrap(); + + assert!( + a_pos < b_pos && b_pos < c_pos, + "Files should be ordered by ascending priority with higher priority files last" + ); +} diff --git a/tests/test_perf.rs b/tests/test_perf.rs index 897776b..ccf7103 100644 --- a/tests/test_perf.rs +++ b/tests/test_perf.rs @@ -1,11 +1,11 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use std::fs; -use std::path::PathBuf; +use std::path::Path; use tempfile::TempDir; use yek::serialize_repo; use yek::YekConfig; -fn create_test_files(dir: &PathBuf, num_files: usize, file_size: usize) { +fn create_test_files(dir: &Path, num_files: usize, file_size: usize) { for i in 0..num_files { let content = "a".repeat(file_size); let file_path = dir.join(format!("file_{}.txt", i)); @@ -19,12 +19,14 @@ fn bench_serialize_repo(c: &mut Criterion) { fs::create_dir_all(&output_dir).unwrap(); // Create test files - create_test_files(&temp.path().to_path_buf(), 100, 1000); + create_test_files(temp.path(), 100, 1000); c.bench_function("serialize_repo", |b| { b.iter(|| { - let mut config = YekConfig::default(); - config.output_dir = Some(output_dir.clone()); + let config = YekConfig { + output_dir: Some(output_dir.clone()), + ..Default::default() + }; serialize_repo(black_box(temp.path()), Some(&config)).unwrap() }) }); diff --git a/tests/test_priority.rs b/tests/test_priority.rs index 3dea359..cf56692 100644 --- a/tests/test_priority.rs +++ b/tests/test_priority.rs @@ -10,8 +10,6 @@ fn priority_rules_are_applied() { repo.path(), "yek.toml", r#" -git_boost_max = 0 - [[priority_rules]] score = 10 pattern = "^very_important/" @@ -43,15 +41,15 @@ pattern = "^less_important/" .assert() .success(); - // Read the first chunk file - let chunk_0 = fs::read_to_string(output_dir.join("chunk-0.txt")).unwrap(); - println!("Chunk content:\n{}", chunk_0); + // Read the output file + let output = fs::read_to_string(output_dir.join("output.txt")).unwrap(); + println!("Output content:\n{}", output); // Check that very_important appears after less_important in the output - let very_pos = chunk_0 + let very_pos = output .find(">>>> very_important/one.txt") .expect("very_important/one.txt not found"); - let less_pos = chunk_0 + let less_pos = output .find(">>>> less_important/two.txt") .expect("less_important/two.txt not 
found"); assert!( diff --git a/tests/test_supported_models.rs b/tests/test_supported_models.rs new file mode 100644 index 0000000..5d6d10e --- /dev/null +++ b/tests/test_supported_models.rs @@ -0,0 +1,136 @@ +#[path = "integration_common.rs"] +mod integration_common; +use assert_cmd::Command; +use integration_common::{create_file, setup_temp_repo}; + +#[test] +fn test_supported_models_list() { + let repo = setup_temp_repo(); + let mut cmd = Command::cargo_bin("yek").unwrap(); + let output = cmd + .current_dir(repo.path()) + .arg("--help") + .output() + .expect("Failed to execute command"); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + + // Verify help output contains supported models section + assert!(stdout.contains("SUPPORTED MODELS:")); + assert!(stdout.contains("possible values:")); + assert!(stdout.contains("")); // Added check for placeholder + + // Verify all models are listed + let models = [ + "openai", // OpenAI models + "claude", // Anthropic Claude models + "mistral", // Mistral models + "mixtral", // Mixtral models + "deepseek", // DeepSeek models + "llama", // Meta Llama models + "codellama", // CodeLlama models + ]; + + for model in models { + assert!( + stdout.contains(model), + "Help output should contain model: {}", + model + ); + } +} + +#[test] +fn test_model_validation() { + let repo = setup_temp_repo(); + let content = "Test content"; + create_file(repo.path(), "test.txt", content.as_bytes()); + + // Test with valid model + let mut cmd = Command::cargo_bin("yek").unwrap(); + let output = cmd + .current_dir(repo.path()) + .arg("--tokens=openai") + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Command should succeed with valid model" + ); + + // Test with invalid model + let mut cmd = Command::cargo_bin("yek").unwrap(); + let output = cmd + .current_dir(repo.path()) + .arg("--tokens=invalid-model") + .output() + .expect("Failed to execute command"); + + assert!( + !output.status.success(), + "Command should fail with invalid model" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("invalid value"), + "Should indicate invalid model" + ); +} + +#[test] +fn test_model_from_config() { + let repo = setup_temp_repo(); + let content = "Test content"; + create_file(repo.path(), "test.txt", content.as_bytes()); + + // Create config with valid model + create_file( + repo.path(), + "yek.toml", + r#" +tokenizer_model = "openai" +token_mode = true +"# + .as_bytes(), + ); + + let mut cmd = Command::cargo_bin("yek").unwrap(); + let output = cmd + .current_dir(repo.path()) + .output() + .expect("Failed to execute command"); + + assert!( + output.status.success(), + "Command should succeed with valid model in config" + ); + + // Test with invalid model in config + create_file( + repo.path(), + "yek.toml", + r#" +tokenizer_model = "invalid-model" +token_mode = true +"# + .as_bytes(), + ); + + let mut cmd = Command::cargo_bin("yek").unwrap(); + let output = cmd + .current_dir(repo.path()) + .output() + .expect("Failed to execute command"); + + assert!( + !output.status.success(), + "Command should fail with invalid model in config" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Unsupported tokenizer model"), + "Should indicate unsupported model" + ); +} diff --git a/tests/test_tokenizer.rs b/tests/test_tokenizer.rs new file mode 100644 index 0000000..150c4d6 --- /dev/null +++ b/tests/test_tokenizer.rs @@ -0,0 
+1,163 @@ +mod integration_common; +use integration_common::assert_output_file_contains; +use predicates::prelude::*; +use std::fs; + +#[test] +fn cli_model_overrides_config() { + let temp_dir = tempfile::tempdir().unwrap(); + let config_path = temp_dir.path().join("yek.toml"); + fs::write( + &config_path, + "tokenizer_model = \"mistral\"\ntoken_mode = true\n", + ) + .unwrap(); + + let test_file_path = temp_dir.path().join("test.txt"); + fs::write( + &test_file_path, + "This is a simple file with some words in it.\n", + ) + .unwrap(); + + let mut cmd = assert_cmd::Command::cargo_bin("yek").unwrap(); + cmd.arg("--config") + .arg(config_path) + .arg("--token-model=claude") // Using a different model to properly test override + .arg(temp_dir.path()) + .env("YEK_DEBUG_OUTPUT", temp_dir.path().join("debug.log")) // Write the debug log that is read back below + .assert() + .success() + .stdout(predicate::str::contains( + "Token mode enabled with model: claude", + )); + + // Verify that the original config model was overridden + let debug_log = fs::read_to_string(temp_dir.path().join("debug.log")).unwrap(); + assert!( + !debug_log.contains("Token mode enabled with model: mistral"), + "Should not use model from config" + ); + + assert_output_file_contains( + temp_dir.path(), + &[ + ">>>> test.txt", + "This is a simple file with some words in it.\n", + ], + ); + + // Clean up the temporary directory + temp_dir.close().unwrap(); +} + +#[test] +fn accepts_model_from_config() { + let temp_dir = tempfile::tempdir().unwrap(); + let config_path = temp_dir.path().join("yek.toml"); + fs::write( + &config_path, + "tokenizer_model = \"openai\"\ntoken_mode = true\n", + ) + .unwrap(); + + // Create a dummy test.txt file + fs::write(temp_dir.path().join("test.txt"), "Test content\n").unwrap(); + + let mut cmd = assert_cmd::Command::cargo_bin("yek").unwrap(); + cmd.arg("--config") + .arg(config_path) + .arg(temp_dir.path()) + .env("YEK_DEBUG_OUTPUT", temp_dir.path().join("debug.log")) + .assert() + .success(); + + // Verify that the debug log contains the expected message + let debug_log = fs::read_to_string(temp_dir.path().join("debug.log")).unwrap(); + assert!( + debug_log.contains("Token mode enabled with model: openai"), + "Should enable token mode with model from config" + ); + + // Clean up the temporary directory + temp_dir.close().unwrap(); +} + +#[test] +fn default_tokens_is_false() { + let temp_dir = tempfile::tempdir().unwrap(); + let mut cmd = assert_cmd::Command::cargo_bin("yek").unwrap(); + cmd.arg(temp_dir.path()) + .assert() + .success() + .stdout(predicate::str::contains("Token mode enabled").not()); + + // Clean up the temporary directory + temp_dir.close().unwrap(); +} + +#[test] +fn cli_tokens_enables_token_mode() { + let temp_dir = tempfile::tempdir().unwrap(); + let mut cmd = assert_cmd::Command::cargo_bin("yek").unwrap(); + cmd.arg("--tokens") + .arg(temp_dir.path()) + .assert() + .success() + .stdout(predicate::str::contains( + "Token mode enabled with model: openai", + )); + + // Clean up the temporary directory + temp_dir.close().unwrap(); +} + +#[test] +fn counts_tokens_using_tokenizer() { + let temp_dir = tempfile::tempdir().unwrap(); + let test_file_path = temp_dir.path().join("test.txt"); + fs::write( + &test_file_path, + "This is a simple file with some words in it.\n", + ) + .unwrap(); + + // Create a temporary config file specifying the model + let config_path = temp_dir.path().join("yek.toml"); + fs::write( + &config_path, + "tokenizer_model = \"deepseek\"\ntoken_mode = true\n", + ) + .unwrap(); + + let mut cmd = assert_cmd::Command::cargo_bin("yek").unwrap(); + cmd.arg("--config") +
.arg(&config_path) + .arg(temp_dir.path()) + .assert() + .success() + .stdout(predicate::str::contains("deepseek")); + + // Verify that the test file content is in the output + assert_output_file_contains( + temp_dir.path(), + &[ + ">>>> test.txt", + "This is a simple file with some words in it.\n", + ], + ); + + // Clean up the temporary directory + temp_dir.close().unwrap(); +} + +#[test] +fn unsupported_model() { + let temp_dir = tempfile::tempdir().unwrap(); + let mut cmd = assert_cmd::Command::cargo_bin("yek").unwrap(); + cmd.arg("--token-model=unsupported_model") + .arg(temp_dir.path()) + .assert() + .failure() + .stderr(predicate::str::contains("Unsupported model")); +} diff --git a/tests/test_validation.rs b/tests/test_validation.rs index c5b44df..869818a 100644 --- a/tests/test_validation.rs +++ b/tests/test_validation.rs @@ -15,13 +15,10 @@ fn fails_on_invalid_regex_in_config() { ); let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()) - .assert() - .success() // The tool doesn't "fail," it just logs invalid config - .stderr( - predicate::str::contains("Invalid configuration in") - .and(predicate::str::contains("Invalid pattern")), - ); + cmd.current_dir(repo.path()).assert().failure().stderr( + predicate::str::contains("Invalid configuration") + .and(predicate::str::contains("Invalid pattern")), + ); } #[test] @@ -39,8 +36,8 @@ pattern = ".*" ); let mut cmd = Command::cargo_bin("yek").unwrap(); - cmd.current_dir(repo.path()).assert().success().stderr( - predicate::str::contains("Invalid configuration in") + cmd.current_dir(repo.path()).assert().failure().stderr( + predicate::str::contains("Invalid configuration") .and(predicate::str::contains("must be between 0 and 1000")), ); } diff --git a/yek.toml b/yek.toml index 36eaea1..633d648 100644 --- a/yek.toml +++ b/yek.toml @@ -4,14 +4,18 @@ ignore_patterns = [ "^repo-serialized/", "^.*\\.txt$", "^tests/.*", + "^scripts/.*", + "^\\.github/.*", + "^\\.cargo/.*", + "^Cargo\\.lock$", "^benchmarks/", "^src/defaults\\.rs$", - "^\\.github/.*$", - "README.md", - "CHANGELOG.md", + "README\\.md", + "CHANGELOG\\.md", "^LICENSE$", "^README$", - "^askai.yaml$" + "^prompts/.*$", + "^.*\\.sh$" ] [[priority_rules]]