lich-5-docs/generate_docs.py at main · Nisugi/lich-5-docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Lich5 Documentation Generator
Main script for generating YARD-compatible documentation for Lich5 Ruby code
"""

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

import argparse
import json
import logging
import re
import time
import hashlib
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional

from providers import get_provider, ProviderFactory, get_parallel_workers

# Import config (optional - falls back to defaults if not available)
# HAS_CONFIG records whether the import worked; on failure the two imported
# names are bound to None so that later references do not raise NameError.
try:
    from config import ConfigManager, get_config
    HAS_CONFIG = True
except ImportError:
    HAS_CONFIG = False
    ConfigManager = None
    get_config = None

# Import validation (optional - falls back to skipping validation)
# Same pattern as above: HAS_VALIDATION is the flag, the names become None.
try:
    from validation import YARDValidator, ValidationResult
    HAS_VALIDATION = True
except ImportError:
    HAS_VALIDATION = False
    YARDValidator = None
    ValidationResult = None

# Configure logging
# Root logger at INFO with "<time> - <level> - <message>" records; note that
# logger.debug(...) calls in this module are therefore not shown by default.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by every method below.
logger = logging.getLogger(__name__)


class Lich5DocumentationGenerator:
    """Main documentation generator for Lich5 Ruby code"""

    def __init__(self, provider_name: Optional[str] = None, output_dir: Optional[str] = None,
                 incremental: bool = True, force_rebuild: bool = False,
                 parallel_workers: Optional[int] = None,
                 output_structure: str = 'flat', source_root: Optional[Path] = None):
        """
        Initialize the documentation generator

        Creates the output directory, instantiates the LLM provider and loads
        the manifest left behind by a previous run (if any).

        Args:
            provider_name: LLM provider to use (defaults to the LLM_PROVIDER env var or 'openai')
            output_dir: Output directory for documentation (defaults to 'output/latest')
            incremental: Enable incremental processing (skip already documented files)
            force_rebuild: Force reprocessing of all files even if already documented;
                when True, incremental mode is switched off
            parallel_workers: Number of parallel workers (None = ask get_parallel_workers
                for the provider's value)
            output_structure: 'flat' (all files in one dir) or 'mirror' (preserve source structure)
            source_root: Root directory of source files (required for mirror structure).
                A plain string is accepted and converted to a Path.
        """
        self.provider_name = provider_name or os.environ.get('LLM_PROVIDER', 'openai')
        self.incremental = incremental and not force_rebuild
        self.force_rebuild = force_rebuild
        self.output_structure = output_structure
        # Other methods call .resolve() on source_root, so normalise str -> Path
        # here. Falsy values (None, "") keep meaning "no source root".
        self.source_root = Path(source_root) if source_root else None

        if self.output_structure == 'mirror' and self.source_root is None:
            # Without a root the path helpers quietly use the flat layout;
            # say so instead of leaving the caller to wonder.
            logger.warning("output_structure='mirror' requires source_root; using flat structure")

        # Thread safety - use RLock (reentrant) to allow nested acquisitions
        self.manifest_lock = threading.RLock()
        self.file_lock = threading.RLock()

        # Get parallel workers from config or use provided value
        if parallel_workers is None:
            self.parallel_workers = get_parallel_workers(self.provider_name)
        else:
            self.parallel_workers = parallel_workers

        # Set up output directory ('latest' is the default so that incremental
        # runs keep finding the same manifest)
        self.output_dir = Path(output_dir) if output_dir else Path('output') / 'latest'
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Initialize provider
        logger.info(f"Initializing {self.provider_name} provider")
        self.provider = get_provider(self.provider_name)

        # Track documentation
        self.documentation = {}
        self.failed_files = []

        # Load existing manifest for incremental processing
        self.manifest_file = self.output_dir / 'manifest.json'
        self.manifest = self.load_manifest()

        logger.info("Documentation generator initialized")
        logger.info(f"Provider: {self.provider_name}")
        logger.info(f"Output directory: {self.output_dir}")
        logger.info(f"Incremental mode: {self.incremental}")
        if self.incremental and self.manifest.get('processed_files'):
            logger.info(f"Found {len(self.manifest['processed_files'])} already processed files")

    def get_output_file_path(self, file_path: Path) -> Path:
        """
        Get the output file path based on output structure setting

        Args:
            file_path: Source file path

        Returns:
            Path to output file (either flat or mirrored structure)
        """
        if self.output_structure == 'mirror' and self.source_root:
            # Mirror directory structure
            try:
                # Ensure both paths are resolved to absolute paths for comparison
                file_path_resolved = file_path.resolve()
                source_root_resolved = self.source_root.resolve()

                # Get relative path from source root
                relative_path = file_path_resolved.relative_to(source_root_resolved)
                # Build mirrored path in documented directory
                output_path = self.output_dir / 'documented' / relative_path
                return output_path
            except ValueError as e:
                # File is not under source_root, fall back to flat
                logger.warning(f"File {file_path} not under source root {self.source_root}, using flat structure")
                logger.debug(f"  ValueError: {e}")
                return self.output_dir / 'documented' / file_path.name
        else:
            # Flat structure - all files in documented directory
            return self.output_dir / 'documented' / file_path.name

    def load_manifest(self) -> dict:
        """Load the manifest file tracking processed files"""
        if self.manifest_file.exists():
            try:
                with open(self.manifest_file, 'r') as f:
                    manifest = json.load(f)
                logger.info(f"Loaded manifest with {len(manifest.get('processed_files', []))} processed files")
                return manifest
            except Exception as e:
                logger.warning(f"Failed to load manifest: {e}")
                return {'processed_files': {}, 'failed_files': [], 'timestamp': datetime.now().isoformat()}
        return {'processed_files': {}, 'failed_files': [], 'timestamp': datetime.now().isoformat()}

    def save_manifest(self):
        """Save the manifest file (thread-safe)"""
        with self.manifest_lock:
            try:
                with open(self.manifest_file, 'w') as f:
                    json.dump(self.manifest, f, indent=2, default=str)
            except Exception as e:
                logger.error(f"Failed to save manifest: {e}")

    def compute_code_hash(self, content: str) -> str:
        """
        Compute hash of Ruby code excluding YARD comments
        This allows us to detect actual code changes vs documentation changes
        """
        lines = content.split('\n')
        code_lines = []
        in_yard_comment = False

        for line in lines:
            stripped = line.strip()

            # Skip YARD comment blocks
            if stripped.startswith('#') and any(tag in stripped for tag in ['@param', '@return', '@example', '@note', '@see', '@yield']):
                continue
            # Skip regular comment lines that look like documentation
            elif stripped.startswith('#') and len(stripped) > 1 and stripped[1] == ' ':
                # But keep shebang and encoding comments
                if stripped.startswith('#!') or 'coding:' in stripped or 'encoding:' in stripped:
                    code_lines.append(line)
            else:
                # Include actual code lines
                code_lines.append(line)

        # Compute hash of the actual code
        code_content = '\n'.join(code_lines)
        return hashlib.sha256(code_content.encode('utf-8')).hexdigest()[:16]

    def is_file_processed(self, file_path: Path) -> bool:
        """Check if a file has already been processed and hasn't changed"""
        if not self.incremental:
            return False

        relative_path = str(file_path)
        if relative_path in self.manifest.get('processed_files', {}):
            logger.info(f"  File found in manifest: {file_path.name} (key: {relative_path})")
            # Check if output file actually exists in committed documented/ directory
            # Use same logic as get_output_file_path but check repo root documented/
            if self.output_structure == 'mirror' and self.source_root:
                try:
                    file_path_resolved = file_path.resolve()
                    source_root_resolved = self.source_root.resolve()
                    relative_path_from_source = file_path_resolved.relative_to(source_root_resolved)
                    # Check in repo root documented/ directory (committed files)
                    output_file = Path('documented') / relative_path_from_source
                    logger.debug(f"  Checking: {output_file} (exists: {output_file.exists()})")
                except ValueError as e:
                    output_file = Path('documented') / file_path.name
                    logger.debug(f"  ValueError in path resolution: {e}, using flat: {output_file}")
            else:
                output_file = Path('documented') / file_path.name
                logger.debug(f"  Using flat structure: {output_file}")

            if not output_file.exists():
                logger.info(f"  Output file missing, reprocessing: {file_path.name}")
                logger.debug(f"    Looked for: {output_file.absolute()}")
                return False

            # Check if source file has changed by comparing hashes
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    current_content = f.read()
                current_hash = self.compute_code_hash(current_content)

                stored_info = self.manifest['processed_files'][relative_path]
                stored_hash = stored_info.get('content_hash')

                # Log hash comparison for debugging
                logger.info(f"  Hash check for {file_path.name}:")
                logger.info(f"    Manifest key: {relative_path}")
                logger.info(f"    Stored hash:  {stored_hash}")
                logger.info(f"    Current hash: {current_hash}")
                logger.info(f"    Match: {current_hash == stored_hash}")

                if current_hash != stored_hash:
                    logger.info(f"  Source file changed, reprocessing: {file_path.name}")
                    return False
                else:
                    logger.info(f"  Skipping (unchanged): {file_path.name}")
                    return True

            except Exception as e:
                logger.warning(f"  Error checking file hash, reprocessing: {e}")
                return False
        else:
            logger.info(f"  File NOT in manifest: {file_path.name} (key: {relative_path})")

        return False

    def mark_file_processed(self, file_path: Path, success: bool = True, content: str = None,
                            validation_status: str = None):
        """Mark a file as processed in the manifest with content hash (thread-safe)

        Args:
            file_path: Path to the file
            success: Whether processing was successful
            content: Optional content for hash computation
            validation_status: Validation result ('passed', 'warnings', 'failed', 'skipped')
        """
        with self.manifest_lock:
            relative_path = str(file_path)
            if success:
                if 'processed_files' not in self.manifest:
                    self.manifest['processed_files'] = {}

                # Compute hash of the source file (without comments)
                content_hash = None
                if content:
                    content_hash = self.compute_code_hash(content)
                else:
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content_hash = self.compute_code_hash(f.read())
                    except Exception as e:
                        logger.warning(f"Could not compute hash for {file_path}: {e}")

                entry = {
                    'timestamp': datetime.now().isoformat(),
                    'provider': self.provider_name,
                    'content_hash': content_hash,
                    'file_name': file_path.name
                }

                # Add validation status if provided
                if validation_status:
                    entry['validation_status'] = validation_status

                self.manifest['processed_files'][relative_path] = entry
            else:
                if 'failed_files' not in self.manifest:
                    self.manifest['failed_files'] = []
                if relative_path not in self.manifest['failed_files']:
                    self.manifest['failed_files'].append(relative_path)

            # Save manifest after each file (in case of interruption)
            self.save_manifest()

    def strip_yard_comments(self, content: str) -> str:
        """
        Remove existing YARD documentation while preserving inline code comments

        This prevents duplicate documentation when regenerating docs for files
        that already have partial YARD coverage.

        Strips:
        - Lines with YARD tags (@param, @return, @example, etc.)
        - Description comment lines that are part of YARD doc blocks

        Preserves:
        - Inline comments within code (e.g., "# Calculate total")
        - Special directives (e.g., # encoding:, # rubocop:, # :nodoc:)
        - Shebang lines (#!/usr/bin/env ruby)

        Args:
            content: Original Ruby source code

        Returns:
            Content with YARD documentation removed
        """
        lines = content.split('\n')
        result = []
        i = 0

        while i < len(lines):
            line = lines[i]
            stripped = line.strip()

            # Keep shebang, encoding, and other special directives
            if stripped.startswith('#!') or 'encoding:' in stripped or 'coding:' in stripped:
                result.append(line)
                i += 1
                continue

            # Keep rubocop directives and :nodoc:
            if '# rubocop:' in line or '# :nodoc:' in line or '# @!visibility' in line:
                result.append(line)
                i += 1
                continue

            # Check if this is a YARD tag line
            is_yard_tag = stripped.startswith('#') and any(
                tag in stripped for tag in [
                    '@param', '@return', '@example', '@raise', '@yield',
                    '@note', '@see', '@api', '@deprecated', '@since',
                    '@version', '@attr', '@attr_reader', '@attr_writer'
                ]
            )

            if is_yard_tag:
                # Skip this line and any continuation lines (part of the YARD block)
                i += 1
                # Also skip following YARD tag lines and example code
                while i < len(lines):
                    next_stripped = lines[i].strip()
                    # Continue skipping if it's:
                    # - A YARD tag line
                    # - Part of an @example block (indented code after @example)
                    # - A comment line that's part of the doc block
                    if next_stripped.startswith('#'):
                        # Check if it's another YARD tag or example code
                        has_tag = any(tag in next_stripped for tag in [
                            '@param', '@return', '@example', '@raise', '@yield',
                            '@note', '@see', '@api', '@deprecated', '@since',
                            '@version', '@attr', '@attr_reader', '@attr_writer'
                        ])
                        if has_tag or next_stripped.startswith('#   '):  # Example code (indented)
                            i += 1
                            continue
                    break
                continue

            # Check if this is a description comment line above a definition
            # (part of a YARD doc block without explicit tags)
            if stripped.startswith('#') and not stripped.startswith('##'):
                # Look ahead to see if this is followed by a definition
                j = i + 1
                is_yard_description = False

                # Skip blank comment lines and other description lines
                while j < len(lines):
                    next_line = lines[j].strip()

                    # If we hit a blank line, keep looking
                    if not next_line:
                        j += 1
                        continue

                    # If we hit a YARD tag, this is part of a YARD block
                    if next_line.startswith('#') and any(tag in next_line for tag in [
                        '@param', '@return', '@example', '@raise', '@yield', '@note'
                    ]):
                        is_yard_description = True
                        break

                    # If we hit another comment, it might be more description
                    if next_line.startswith('#'):
                        j += 1
                        if j - i > 10:  # Don't look too far ahead
                            break
                        continue

                    # If we hit a definition (class, module, def, attr_*), this is YARD
                    if any(next_line.startswith(kw) for kw in [
                        'class ', 'module ', 'def ', 'attr_reader', 'attr_writer', 'attr_accessor'
                    ]):
                        is_yard_description = True

                    # Stop looking
                    break

                if is_yard_description:
                    # Skip this description line
                    i += 1
                    continue

            # Keep this line (it's code or an inline comment)
            result.append(line)
            i += 1

        return '\n'.join(result)

    def create_documentation_prompt(self, file_name: str, content: str) -> tuple[str, str]:
        """
        Create prompts for documentation generation

        Args:
            file_name: Name of the file being documented
            content: Ruby source code (should already have YARD comments stripped)

        Returns:
            (system_prompt, user_prompt) tuple
        """
        system_prompt = """You are an expert Ruby documentation specialist.
Your task is to generate YARD-compatible documentation for Ruby code.
You will return JSON with documentation comments and their anchor points."""

        # Add line numbers to help AI identify exact lines
        numbered_lines = []
        for i, line in enumerate(content.split('\n'), start=1):
            numbered_lines.append(f"{i:4d}: {line}")
        numbered_content = '\n'.join(numbered_lines)

        user_prompt = f"""Analyze this Ruby file from the Lich5 project: **{file_name}**

```ruby
{numbered_content}
```

Generate **YARD-compatible** documentation following the lich-5 documentation style guide.
The line numbers are shown at the start of each line (e.g., "  15: def method_name").

**CRITICAL RULES - READ CAREFULLY:**

1. **WHAT TO DOCUMENT**
   - Document all public classes, modules, methods, and non-trivial constants
   - Add `@api private` to methods that are public but internal (not called from .lic scripts)
   - Skip already-documented code (existing YARD tags present)
   - If there's nothing to document, return an empty JSON array: []

2. **WHAT TO SKIP (no documentation needed)**
   - Trivial one-line delegation methods
   - `attr_reader` / `attr_accessor` / `attr_writer` declarations
   - Aliases where the target method is already documented
   - Constants with self-evident names and values (e.g., `MAX_RETRIES = 3`)
   - Private methods marked with `private` keyword, `# :nodoc:`, or `# @!visibility private`
     (unless they need `@api private` as they are public-but-internal)

3. **TAG ORDER** — always use this order when multiple tags appear:
   1. `@param` (one per parameter, in parameter order)
   2. `@return`
   3. `@example`
   4. `@note`
   5. `@raise`
   6. `@see`
   7. `@since`
   8. `@deprecated`
   9. `@api`

4. **TYPES ARE REQUIRED**
   - Every `@param` and `@return` MUST include a type annotation in [brackets]
   - Use `[void]` when a method has no meaningful return value
   - Common types: [String], [Integer], [Boolean], [Array<String>], [Hash], [nil], [String, nil]
   - Game-specific types: [String] for item/container nouns, [Regexp] for patterns, [OpenStruct] for settings

5. **PARAMETER NAME RULES**
   - @param tags MUST exactly match the method's parameter names
   - For block parameters (`&block`): DO NOT add a @param tag — document block behavior
     in the method description or with @yield/@yieldparam tags instead
     WRONG: @param block [Proc] ...
     WRONG: @param &block [Proc] ...
     RIGHT: omit @param for &block entirely; describe it in the summary or use @yield
   - For splat parameters (*args): Use the name without the asterisk
     WRONG: @param *messages [Array]
     RIGHT: @param messages [Array]
   - Parameter names must match what's in the def statement exactly (no & or * symbols)

6. **METHOD REFERENCE SYNTAX**
   - Class/module methods (def self.method): use `.method_name` in @see and {{.method_name}} inline
   - Instance methods (def method): use `#method_name` in @see and {{#method_name}} inline

7. **DO NOT USE THESE TAGS**
   - `@author` — use git blame instead
   - `@version` — use `@since` instead
   - `@todo` — use GitHub issues instead
   - `@abstract` — Ruby has no abstract methods

8. **VALIDATION BEFORE RETURNING**
   - Double-check each @param name matches the actual method parameter
   - Remove the & and * symbols from @param names
   - Confirm tag order matches rule 3 above
   - Ensure types are present on every @param and @return

Documentation structure:
1. For classes/modules:
   - Brief description on first line
   - Longer description if needed (document the contract, not implementation details)
   - `@see` cross-references to related classes/modules if relevant

2. For methods:
   - Brief description (what it does, not how)
   - Tags in order: @param, @return, @example (encouraged for consumer-facing), @note, @raise, @see
   - @example format: the tag line contains an optional title, code goes on the NEXT line(s) indented with "#   "
     WRONG: @example DRCI.get_item?(\\"sword\\")
     RIGHT: @example Get from default storage\\n#   DRCI.get_item?(\\"sword\\")

3. For constants (pattern arrays, non-trivial values):
   - Brief description
   - `@example` showing what strings the pattern matches (for Regexp arrays)
   - `@see` to cross-reference paired constants and methods that use them

Return a JSON array where each entry contains:
- "line_number": The line number to insert before (1-indexed, counting from line 1)
- "anchor": A snippet of the line for validation (e.g., "class GameObj", "def initialize")
- "indent": The indentation level (number of spaces before the line)
- "comment": The YARD comment block as a single string with \\n for newlines

Example output format:
```json
[
  {{
    "line_number": 15,
    "anchor": "class GameObj",
    "indent": 0,
    "comment": "# Represents a game object in the world.\\n#\\n# @see #noun The object's primary identifier"
  }},
  {{
    "line_number": 23,
    "anchor": "def initialize",
    "indent": 2,
    "comment": "# Initializes a new game object.\\n# @param id [String] unique object ID\\n# @param noun [String] object noun (e.g., \\"sword\\", \\"backpack\\")\\n# @return [GameObj]"
  }}
]
```

IMPORTANT:
- Return ONLY the JSON array, no other text
- Line numbers should match the file (1-indexed)
- Anchors should be concise (just the key part like "def method_name" or "class ClassName")
- @param names MUST NOT include & or * symbols (use "block" not "&block", use "args" not "*args")
- Verify @param names match the actual method parameters exactly
- CRITICAL: In the "comment" field, you MUST escape all special characters:
  * Double quotes MUST be escaped: use \\" not "
  * Backslashes MUST be escaped: use \\\\ not \\
  * Example code MUST have escaped quotes: Feat[\\"name\\"] not Feat["name"]
  * Line breaks use \\n (already escaped)
- Your JSON MUST be valid and parseable - test it mentally before returning
"""

        return system_prompt, user_prompt

    def process_file(self, file_path: Path) -> Optional[str]:
        """
        Document one Ruby file via the JSON comment/anchor round trip

        Steps: read the file, strip existing YARD comments, ask the provider
        for a JSON list of comments, then insert those comments into the
        stripped source. The outcome is recorded in ``self.documentation``
        (keyed by file name) on success or ``self.failed_files`` on failure.

        Args:
            file_path: Path to Ruby file

        Returns:
            Documented source text, or None if the response could not be
            parsed or any step raised.
        """
        source_name = file_path.name
        logger.info(f"Processing: {source_name}")

        try:
            with open(file_path, 'r', encoding='utf-8') as source:
                original_content = source.read()

            line_total = len(original_content.split('\n'))
            logger.info(f"  Lines: {line_total}, Characters: {len(original_content)}")

            # Existing YARD comments are removed first; the LLM regenerates
            # everything, which avoids duplicated doc blocks
            stripped_content = self.strip_yard_comments(original_content)
            dropped = line_total - len(stripped_content.split('\n'))
            if dropped > 0:
                logger.info(f"  Stripped {dropped} lines of existing YARD documentation")

            system_prompt, user_prompt = self.create_documentation_prompt(source_name, stripped_content)

            logger.info(f"  Requesting documentation from {self.provider_name}...")
            response = self.provider.generate(user_prompt, system_prompt)
            comments = self.extract_comments_json(response)

            if comments is None:
                # Nothing parseable came back - log it and keep a copy for inspection
                logger.error("  No comments extracted from response")
                response_size = len(response)
                logger.error(f"  AI response length: {response_size} characters")
                if response_size < 1000:
                    logger.error(f"  Full AI response: {response}")
                else:
                    logger.error(f"  AI response (first 500): {response[:500]}")
                    logger.error(f"  AI response (last 500): {response[-500:]}")

                dump_path = self.output_dir / f"{file_path.stem}_failed_response.txt"
                with open(dump_path, 'w', encoding='utf-8') as dump:
                    dump.write(f"Failed to parse JSON for: {source_name}\n")
                    dump.write(f"AI Response Length: {response_size} characters\n")
                    dump.write("=" * 80 + "\n")
                    dump.write(response)
                logger.info(f"  Saved failed response to: {dump_path.name}")

                self.failed_files.append(source_name)
                return None

            if len(comments) > 0:
                logger.info(f"  Extracted {len(comments)} documentation entries")
                # Comments go into the stripped text, never the original
                documented_code = self.insert_comments(stripped_content, comments)
            else:
                # An empty list is a valid answer: the file has nothing to document
                logger.info("  No documentation needed (file contains only requires/imports)")
                documented_code = stripped_content

            self.documentation[source_name] = {
                'original': original_content,
                'documented': documented_code,
                'timestamp': datetime.now().isoformat()
            }

            logger.info(f"  ✅ Successfully documented {source_name}")
            return documented_code

        except Exception as e:
            logger.error(f"  ❌ Failed to process {source_name}: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            self.failed_files.append(source_name)
            return None

    def sanitize_json_escapes(self, json_text: str) -> str:
        r"""
        Repair backslash escapes that JSON does not allow

        JSON only accepts \", \\, \/, \b, \f, \n, \r, \t and \uXXXX. Any other
        backslash pair (typically regex fragments such as \d, \s, \w written by
        the AI) gets its backslash doubled so it parses as a literal backslash.

        Args:
            json_text: Raw JSON string that may contain invalid escapes

        Returns:
            The text with valid escapes untouched and invalid ones double-escaped
        """
        def fix_escape(found):
            # Groups 1 and 2 are the two valid forms; group 3 is everything else
            unknown = found.group(3)
            if unknown is None:
                return found.group(0)
            return '\\\\' + unknown

        # Alternatives are ordered so a complete \uXXXX wins over a lone \u.
        # A backslash with nothing after it never matches and is left as-is.
        # DOTALL lets the catch-all also cover a backslash followed by a newline.
        escape_pair = r'\\(?:(u[0-9A-Fa-f]{4})|(["\\/bfnrt])|(.))'
        return re.sub(escape_pair, fix_escape, json_text, flags=re.DOTALL)

    def clean_json_concatenation(self, json_text: str) -> str:
        """
        Merge "a" + "b" style string concatenation into single JSON strings

        LLMs sometimes emit JavaScript/Python-style concatenation inside JSON,
        either on one line or spread over several indented lines:
        "text1"
            + "text2"
            + "text3"

        JSON has no + operator, so every such chain is rewritten as one string
        holding the joined contents.

        Args:
            json_text: Raw JSON that may contain string concatenation

        Returns:
            Cleaned JSON with concatenations resolved
        """
        # One quoted segment; the capture group is the text between the quotes.
        # Backslash pairs are consumed together so \" does not end the segment.
        segment_re = re.compile(r'"((?:[^"\\]|\\.)*)"')

        # A quoted segment followed by one or more `+ "segment"` continuations.
        # The \s* around + absorbs the newlines and indentation of multi-line chains.
        chain_re = re.compile(r'"(?:[^"\\]|\\.)*"(?:\s*\+\s*"(?:[^"\\]|\\.)*")+')

        def merge_chain(chain):
            """Join every segment of one matched chain into a single string"""
            # Segment contents are already JSON-escaped, so they are joined verbatim
            pieces = segment_re.findall(chain.group(0))
            return '"' + ''.join(pieces) + '"'

        return chain_re.sub(merge_chain, json_text)

    def extract_comments_json(self, response: str) -> Optional[List[Dict[str, Any]]]:
        """
        Extract JSON array of comments from LLM response

        Tries direct JSON parsing first (for structured outputs), then falls back
        to regex-based extraction strategies. Both the bare array [...] and the
        wrapped {"comments": [...]} shapes are accepted at every stage, and each
        candidate is parsed untouched before the heuristic repairs are applied.

        Args:
            response: Raw text returned by the LLM

        Returns:
            List of comment entries with anchor, indent, and comment fields
            (may be empty), or None if no strategy produced a list
        """
        def unwrap(data):
            """Return the comment list carried by parsed JSON, or None"""
            if isinstance(data, dict):
                data = data.get("comments")
            return data if isinstance(data, list) else None

        # Strategy 0: Try direct JSON parse first (for structured output responses)
        # This handles both wrapped {"comments": [...]} and direct [...] formats
        try:
            data = json.loads(response.strip())
        except json.JSONDecodeError:
            data = None  # Fall through to extraction strategies
        comments = unwrap(data)
        if comments is not None:
            shape = "wrapped" if isinstance(data, dict) else "array"
            logger.debug(f"Direct JSON parse succeeded ({shape} format)")
            return comments

        extraction_attempts = []

        # Strategy 1: Try to find JSON code blocks first
        json_blocks = re.findall(r'```json\s*(.*?)```', response, re.DOTALL)
        if json_blocks:
            extraction_attempts.append(('json code block', json_blocks[0].strip()))

        # Strategy 2: Try to find JSON array directly (greedy match)
        json_match = re.search(r'\[\s*\{.*\}\s*\]', response, re.DOTALL)
        if json_match:
            extraction_attempts.append(('greedy array match', json_match.group(0)))

        # Strategy 3: Try to find JSON array (non-greedy)
        json_match_ng = re.search(r'\[\s*\{.*?\}\s*\]', response, re.DOTALL)
        if json_match_ng and json_match_ng.group(0) not in [a[1] for a in extraction_attempts]:
            extraction_attempts.append(('non-greedy array match', json_match_ng.group(0)))

        # Strategy 4: Last resort - assume entire response is JSON
        if response.strip():
            extraction_attempts.append(('raw response', response.strip()))

        # Try each extraction strategy
        for strategy_name, json_text in extraction_attempts:
            sanitized = json_text
            try:
                try:
                    # The repairs below are heuristics that can damage text which
                    # is already valid JSON (e.g. a string value of "+"), so
                    # parse the candidate untouched first.
                    data = json.loads(json_text)
                except json.JSONDecodeError:
                    # Step 1: Clean up string concatenation (LLMs sometimes use + operators)
                    cleaned = self.clean_json_concatenation(json_text)

                    # Step 2: Sanitize invalid escape sequences
                    sanitized = self.sanitize_json_escapes(cleaned)

                    data = json.loads(sanitized)
            except json.JSONDecodeError as e:
                logger.error(f"Strategy '{strategy_name}' failed to parse JSON: {e}")
                logger.error(f"  Error at position {e.pos}: {sanitized[max(0, e.pos-50):e.pos+50]}")
                continue
            except Exception as e:
                logger.debug(f"Strategy '{strategy_name}' failed with error: {e}")
                continue

            # Accept the wrapped format here too, not only on the direct-parse path
            comments = unwrap(data)
            if comments is None:
                logger.debug(f"Strategy '{strategy_name}' found non-list JSON, skipping")
                continue

            # Empty arrays are valid - file may have nothing to document
            logger.debug(f"Strategy '{strategy_name}' successfully extracted {len(comments)} comment entries")
            return comments

        # All strategies failed
        logger.error(f"Failed to parse JSON response with all {len(extraction_attempts)} strategies")
        logger.error(f"Response preview (first 500 chars): {response[:500]}")
        logger.error(f"Response preview (last 500 chars): {response[-500:]}")
        return None

    def soft_match_anchor(self, anchor: str, line: str) -> bool:
        """
        Soft match anchor against line using Ruby-specific pattern matching

        The anchor's leading keyword selects one matching rule: class/module
        definitions, method definitions, attr_* declarations, upper-case
        constants, or @/@@ variables. Anything else falls back to checking that
        every whitespace-separated token of the anchor occurs in the line.

        Args:
            anchor: The anchor string (e.g., "def initialize", "class GameObj")
            line: The line of code to match against

        Returns:
            True if anchor matches line using Ruby syntax patterns, else False
            (always a real bool, never a match object)
        """
        anchor_stripped = anchor.strip()

        # Pattern 1: Class/Module definitions
        # Anchor: "class GameObj" or "module Lich"
        if anchor_stripped.startswith(('class ', 'module ')):
            keyword, name = anchor_stripped.split(None, 1)
            name = name.split('(')[0].strip()  # Remove any params
            return re.search(rf'^\s*{keyword}\s+{re.escape(name)}\b', line) is not None

        # Pattern 2: Method definitions (instance or class methods)
        # Anchor: "def method_name" or "def self.method" or "def ClassName.method"
        if anchor_stripped.startswith('def '):
            method_sig = anchor_stripped[4:].split('(')[0].strip()

            # Base method name is whatever follows the last dot (the whole
            # signature when there is no qualifier)
            method_name = method_sig.split('.')[-1]

            # Flexible matching: anchor "def method" should match:
            # - def method
            # - def self.method
            # - def ClassName.method
            # And anchor "def self.method" should also match all of those
            if method_name == '[]':
                # Special case: array access operator
                pattern = r'\bdef\s+(?:(?:self|\w+)\.)?\[\]'
            else:
                # Regular method, might have ?, !, or = suffix
                pattern = rf'\bdef\s+(?:(?:self|\w+)\.)?{re.escape(method_name)}[?!=]?'

            if re.search(pattern, line):
                return True

            # Fallback: exact match of full signature
            return f'def {method_sig}' in line

        # Pattern 3: Attribute readers/writers/accessors
        # Anchor: "attr_reader :mana" or "attr_accessor"
        if anchor_stripped.startswith('attr_'):
            parts = anchor_stripped.split()
            attr_type = parts[0]  # attr_reader, attr_accessor, etc.
            if len(parts) == 1:
                return attr_type in line
            # Only the first symbol is compared. In a list such as
            # "attr_reader :a, :b" it carries a trailing comma, which would
            # defeat the \b boundary below, so drop it.
            symbol = parts[1].lstrip(':').rstrip(',')
            # attr_type comes from the LLM; escape it so stray regex
            # metacharacters cannot raise re.error
            return re.search(rf'{re.escape(attr_type)}\s+:{re.escape(symbol)}\b', line) is not None

        # Pattern 4: Constants (all caps with =)
        # Anchor: "CONSTANT_NAME" or "CONSTANT_NAME ="
        if anchor_stripped.replace('_', '').replace('=', '').strip().isupper():
            const_name = anchor_stripped.split('=')[0].strip()
            return re.search(rf'\b{re.escape(const_name)}\s*=', line) is not None

        # Pattern 5: Class variables (@@var) or instance variables (@var)
        # Anchor: "@@variable" or "@variable" ('@' also covers the '@@' prefix)
        if anchor_stripped.startswith('@'):
            var_name = anchor_stripped.split()[0].split('=')[0].strip()
            return re.search(rf'{re.escape(var_name)}\s*(=|\|\|=)', line) is not None

        # Fallback: Token-based matching (original approach)
        # Remove params and clean up
        tokens = anchor_stripped.split('(')[0].strip().split()
        if not tokens:
            return False

        # Check if all key tokens appear in the line
        return all(token in line for token in tokens)

    def find_insertion_line(self, lines: List[str], line_number: int, anchor: str,
                           inserted_at_lines: set) -> Optional[int]:
        """
        Find the correct line to insert comment using progressive matching

        Strategy:
        1. Try exact match at expected line number
        2. Try soft match at expected line number
        3. Search entire file (nearby lines first, then rest of file)
           Methods/classes are unique, so safe to search whole file

        Args:
            lines: Source code lines
            line_number: Expected line number (1-indexed from AI)
            anchor: Anchor string for validation
            inserted_at_lines: Set of already-used line indices

        Returns:
            0-indexed line number to insert before, or None if not found
        """
        # Convert to 0-indexed
        expected_idx = line_number - 1

        # Bounds check
        if expected_idx < 0 or expected_idx >= len(lines):
            logger.warning(f"Line number {line_number} out of bounds (file has {len(lines)} lines)")
            return None

        # Skip if already inserted at this line
        if expected_idx in inserted_at_lines:
            logger.debug(f"Line {line_number} already has a comment, skipping")
            return None

        # Strategy 1: Exact match at expected line
        if anchor in lines[expected_idx]:
            logger.debug(f"Exact match at line {line_number}")
            return expected_idx

        # Strategy 2: Soft match at expected line
        if self.soft_match_anchor(anchor, lines[expected_idx]):
            logger.debug(f"Soft match at line {line_number} for anchor: {anchor[:30]}")
            return expected_idx

        # Strategy 3: Search entire file (methods/classes are unique in a file)
        # Start with nearby lines first, then expand outward
        search_order = []

        # Get line_offset from config
        line_offset = 5  # default
        if HAS_CONFIG:
            try:
                config = get_config()
                line_offset = config.anchor_matching.line_offset
            except Exception:
                pass

        # First check nearby lines (±line_offset)
        for offset in range(-line_offset, line_offset + 1):
            if offset == 0:
                continue
            idx = expected_idx + offset
            if 0 <= idx < len(lines):
                search_order.append(idx)

        # Then check rest of file
        for idx in range(len(lines)):
            if idx != expected_idx and idx not in search_order: