-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_docs.py
More file actions
1557 lines (1296 loc) · 63 KB
/
Copy pathgenerate_docs.py
File metadata and controls
1557 lines (1296 loc) · 63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Lich5 Documentation Generator
Main script for generating YARD-compatible documentation for Lich5 Ruby code
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
import argparse
import json
import logging
import re
import time
import hashlib
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
from providers import get_provider, ProviderFactory, get_parallel_workers
# Import config (optional - falls back to defaults if not available)
try:
from config import ConfigManager, get_config
HAS_CONFIG = True
except ImportError:
HAS_CONFIG = False
ConfigManager = None
get_config = None
# Import validation (optional - falls back to skipping validation)
try:
from validation import YARDValidator, ValidationResult
HAS_VALIDATION = True
except ImportError:
HAS_VALIDATION = False
YARDValidator = None
ValidationResult = None
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class Lich5DocumentationGenerator:
"""Main documentation generator for Lich5 Ruby code"""
def __init__(self, provider_name: Optional[str] = None, output_dir: Optional[str] = None,
             incremental: bool = True, force_rebuild: bool = False, parallel_workers: int = None,
             output_structure: str = 'flat', source_root: Optional[Path] = None):
    """
    Configure the generator: provider, output location, locks and manifest.

    Args:
        provider_name: LLM provider to use; when falsy, the LLM_PROVIDER
            environment variable is consulted and 'openai' is the last resort
        output_dir: Directory that receives generated output
            (defaults to 'output/latest'); created if missing
        incremental: Skip files the manifest already lists as processed
        force_rebuild: Reprocess everything; overrides ``incremental``
        parallel_workers: Worker count; None asks ``get_parallel_workers``
            for a value based on the provider name
        output_structure: 'flat' (one directory) or 'mirror'
            (preserve the source tree below ``source_root``)
        source_root: Root of the source tree, used by the mirror structure
    """
    # --- plain settings -------------------------------------------------
    self.provider_name = provider_name or os.environ.get('LLM_PROVIDER', 'openai')
    self.force_rebuild = force_rebuild
    # A forced rebuild always disables incremental skipping.
    self.incremental = incremental and not force_rebuild
    self.output_structure = output_structure
    self.source_root = source_root

    # --- locks ----------------------------------------------------------
    # Reentrant locks, so a method holding one may call another method
    # that acquires the same lock again.
    self.manifest_lock = threading.RLock()
    self.file_lock = threading.RLock()

    # --- worker count ---------------------------------------------------
    self.parallel_workers = (
        parallel_workers
        if parallel_workers is not None
        else get_parallel_workers(self.provider_name)
    )

    # --- output directory -----------------------------------------------
    self.output_dir = Path(output_dir) if output_dir else Path('output') / 'latest'
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # --- provider ---------------------------------------------------------
    logger.info(f"Initializing {self.provider_name} provider")
    self.provider = get_provider(self.provider_name)

    # --- per-run bookkeeping ----------------------------------------------
    self.documentation = {}
    self.failed_files = []

    # --- manifest (drives incremental processing) -------------------------
    self.manifest_file = self.output_dir / 'manifest.json'
    self.manifest = self.load_manifest()

    logger.info(f"Documentation generator initialized")
    logger.info(f"Provider: {self.provider_name}")
    logger.info(f"Output directory: {self.output_dir}")
    logger.info(f"Incremental mode: {self.incremental}")
    if self.incremental and self.manifest.get('processed_files'):
        already_done = len(self.manifest['processed_files'])
        logger.info(f"Found {already_done} already processed files")
def get_output_file_path(self, file_path: Path) -> Path:
    """
    Work out where the documented copy of a source file is written.

    Args:
        file_path: Source file path

    Returns:
        ``<output_dir>/documented/<relative path>`` when the mirror structure
        is active and the file lies under ``source_root``; otherwise
        ``<output_dir>/documented/<file name>``
    """
    documented_dir = self.output_dir / 'documented'
    flat_target = documented_dir / file_path.name

    mirroring = self.output_structure == 'mirror' and self.source_root
    if not mirroring:
        # Flat structure - everything lands directly in documented/
        return flat_target

    try:
        # Resolve both sides so relative and absolute spellings compare equal.
        inside_root = file_path.resolve().relative_to(self.source_root.resolve())
    except ValueError as e:
        # relative_to() raises ValueError when the file is outside source_root.
        logger.warning(f"File {file_path} not under source root {self.source_root}, using flat structure")
        logger.debug(f" ValueError: {e}")
        return flat_target

    return documented_dir / inside_root
def load_manifest(self) -> dict:
    """
    Read the manifest that tracks processed files.

    Returns:
        The parsed manifest, or a fresh empty manifest when the file does
        not exist or cannot be read/parsed
    """
    def fresh_manifest() -> dict:
        # Built on demand so the timestamp reflects the moment of creation.
        return {'processed_files': {}, 'failed_files': [], 'timestamp': datetime.now().isoformat()}

    if not self.manifest_file.exists():
        return fresh_manifest()

    try:
        with open(self.manifest_file, 'r') as f:
            loaded = json.load(f)
        logger.info(f"Loaded manifest with {len(loaded.get('processed_files', []))} processed files")
        return loaded
    except Exception as e:
        # A broken manifest only costs a full re-run, so start over.
        logger.warning(f"Failed to load manifest: {e}")
        return fresh_manifest()
def save_manifest(self):
    """
    Save the manifest file (thread-safe, atomic replace).

    The manifest is serialized to a sibling temporary file and then moved
    over the real file with ``os.replace``. Writing in place would truncate
    the existing manifest as soon as it is opened, so a serialization error
    or an interruption mid-write would leave a corrupt file and the next
    run would lose its incremental state.

    Failures are logged, not raised: saving is best-effort bookkeeping.
    """
    with self.manifest_lock:
        tmp_file = self.manifest_file.with_name(self.manifest_file.name + '.tmp')
        try:
            with open(tmp_file, 'w') as f:
                json.dump(self.manifest, f, indent=2, default=str)
            # Atomic on POSIX and Windows when source and target share a directory.
            os.replace(tmp_file, self.manifest_file)
        except Exception as e:
            logger.error(f"Failed to save manifest: {e}")
            # Best-effort cleanup of the partial temp file; the previous
            # manifest (if any) is still intact at this point.
            try:
                tmp_file.unlink()
            except OSError:
                pass
def compute_code_hash(self, content: str) -> str:
    """
    Fingerprint Ruby source while ignoring documentation comments.

    Lines dropped before hashing:
    - comment lines mentioning one of the YARD tags listed below
    - comment lines of the form "# text" (hash, one space, text), unless
      they carry a coding/encoding directive

    Everything else (code, "#" alone, "#text" without a space, shebangs)
    is hashed, so editing docs alone does not change the result.

    Args:
        content: Ruby source text

    Returns:
        First 16 hex characters of the SHA-256 of the retained lines
    """
    doc_tags = ('@param', '@return', '@example', '@note', '@see', '@yield')

    def counts_as_code(raw_line: str) -> bool:
        text = raw_line.strip()
        if not text.startswith('#'):
            return True
        if any(tag in text for tag in doc_tags):
            return False
        if len(text) > 1 and text[1] == ' ':
            # Prose-style comment: only directives survive.
            return text.startswith('#!') or 'coding:' in text or 'encoding:' in text
        return True

    retained = [raw_line for raw_line in content.split('\n') if counts_as_code(raw_line)]
    digest = hashlib.sha256('\n'.join(retained).encode('utf-8'))
    return digest.hexdigest()[:16]
def is_file_processed(self, file_path: Path) -> bool:
    """
    Decide whether a source file can be skipped in incremental mode.

    A file is skipped only when all of these hold:
    - incremental mode is on
    - the manifest has an entry keyed by ``str(file_path)``
    - the documented copy exists under ``documented/`` relative to the
      current working directory (mirrored or flat, per settings)
    - the stored content hash equals the hash of the current source

    Args:
        file_path: Source file to check

    Returns:
        True to skip the file, False to (re)process it
    """
    if not self.incremental:
        return False

    manifest_key = str(file_path)
    if manifest_key not in self.manifest.get('processed_files', {}):
        logger.info(f" File NOT in manifest: {file_path.name} (key: {manifest_key})")
        return False

    logger.info(f" File found in manifest: {file_path.name} (key: {manifest_key})")

    # The documented copy is looked up in documented/ relative to the
    # working directory (the committed output), not under self.output_dir.
    committed_root = Path('documented')
    if self.output_structure == 'mirror' and self.source_root:
        try:
            inside_root = file_path.resolve().relative_to(self.source_root.resolve())
            # Check in repo root documented/ directory (committed files)
            output_file = committed_root / inside_root
            logger.debug(f" Checking: {output_file} (exists: {output_file.exists()})")
        except ValueError as e:
            output_file = committed_root / file_path.name
            logger.debug(f" ValueError in path resolution: {e}, using flat: {output_file}")
    else:
        output_file = committed_root / file_path.name
        logger.debug(f" Using flat structure: {output_file}")

    if not output_file.exists():
        logger.info(f" Output file missing, reprocessing: {file_path.name}")
        logger.debug(f" Looked for: {output_file.absolute()}")
        return False

    # Compare the stored hash with the hash of the source as it is now.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            current_hash = self.compute_code_hash(f.read())
        stored_hash = self.manifest['processed_files'][manifest_key].get('content_hash')

        # Log hash comparison for debugging
        logger.info(f" Hash check for {file_path.name}:")
        logger.info(f" Manifest key: {manifest_key}")
        logger.info(f" Stored hash: {stored_hash}")
        logger.info(f" Current hash: {current_hash}")
        logger.info(f" Match: {current_hash == stored_hash}")

        if current_hash == stored_hash:
            logger.info(f" Skipping (unchanged): {file_path.name}")
            return True
        logger.info(f" Source file changed, reprocessing: {file_path.name}")
        return False
    except Exception as e:
        # Any doubt means reprocess rather than skip.
        logger.warning(f" Error checking file hash, reprocessing: {e}")
        return False
def mark_file_processed(self, file_path: Path, success: bool = True, content: str = None,
                        validation_status: str = None):
    """Record the outcome for a file in the manifest and persist it (thread-safe)

    Args:
        file_path: Path to the file; ``str(file_path)`` is the manifest key
        success: True records a processed entry, False records a failure
        content: Source text to hash; when empty or None the file is read
        validation_status: Validation result ('passed', 'warnings', 'failed', 'skipped');
            stored only when truthy
    """
    with self.manifest_lock:
        manifest_key = str(file_path)

        if not success:
            # Failures are a de-duplicated list of keys.
            failures = self.manifest.setdefault('failed_files', [])
            if manifest_key not in failures:
                failures.append(manifest_key)
        else:
            processed = self.manifest.setdefault('processed_files', {})

            # Hash of the source (comments excluded) for later change detection.
            content_hash = None
            if content:
                content_hash = self.compute_code_hash(content)
            else:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content_hash = self.compute_code_hash(f.read())
                except Exception as e:
                    # Entry is still recorded, with a None hash.
                    logger.warning(f"Could not compute hash for {file_path}: {e}")

            record = {
                'timestamp': datetime.now().isoformat(),
                'provider': self.provider_name,
                'content_hash': content_hash,
                'file_name': file_path.name
            }
            if validation_status:
                record['validation_status'] = validation_status
            processed[manifest_key] = record

        # Persist after every file so an interrupted run keeps its progress.
        self.save_manifest()
def strip_yard_comments(self, content: str) -> str:
    """
    Drop existing YARD documentation but leave ordinary comments in place

    Regenerating docs for a file that already has YARD coverage would
    otherwise produce duplicate documentation.

    Removed:
    - Comment lines containing a YARD tag (@param, @return, @example, ...),
      plus the comment lines that directly follow them
    - Description comments that lead into a tagged block or a definition
      (class, module, def, attr_*)

    Kept:
    - Comments followed by ordinary code (e.g., "# Calculate total")
    - Lines with encoding:/coding:, # rubocop:, # :nodoc:, # @!visibility
    - Shebang lines (#!/usr/bin/env ruby)
    - Comment lines starting with "##"

    Args:
        content: Original Ruby source code

    Returns:
        Content with YARD documentation removed
    """
    block_tags = (
        '@param', '@return', '@example', '@raise', '@yield',
        '@note', '@see', '@api', '@deprecated', '@since',
        '@version', '@attr', '@attr_reader', '@attr_writer',
    )
    # The look-ahead deliberately recognises a smaller tag set.
    lookahead_tags = ('@param', '@return', '@example', '@raise', '@yield', '@note')
    definition_starts = ('class ', 'module ', 'def ', 'attr_reader', 'attr_writer', 'attr_accessor')

    def mentions(text: str, tags) -> bool:
        return any(tag in text for tag in tags)

    source_lines = content.split('\n')
    total = len(source_lines)

    def leads_into_docs(start: int) -> bool:
        """Report whether the comment at ``start`` heads a YARD block or a definition."""
        probe = start + 1
        while probe < total:
            ahead = source_lines[probe].strip()
            if not ahead:
                # Blank lines do not end the search.
                probe += 1
                continue
            if ahead.startswith('#'):
                if mentions(ahead, lookahead_tags):
                    return True
                # Possibly more description; keep going, but not forever.
                probe += 1
                if probe - start > 10:
                    return False
                continue
            # First non-comment, non-blank line decides.
            return ahead.startswith(definition_starts)
        return False

    kept = []
    pos = 0
    while pos < total:
        raw = source_lines[pos]
        text = raw.strip()

        # Shebang, encoding and tool directives are always preserved.
        preserved = (
            text.startswith('#!') or 'encoding:' in text or 'coding:' in text
            or '# rubocop:' in raw or '# :nodoc:' in raw or '# @!visibility' in raw
        )
        if preserved:
            kept.append(raw)
            pos += 1
            continue

        is_comment = text.startswith('#')

        if is_comment and mentions(text, block_tags):
            # Drop the tag line and every directly following comment line
            # that is either another tag or starts with "# ".
            pos += 1
            while pos < total:
                follower = source_lines[pos].strip()
                continues_block = follower.startswith('#') and (
                    mentions(follower, block_tags) or follower.startswith('# ')
                )
                if not continues_block:
                    break
                pos += 1
            continue

        if is_comment and not text.startswith('##') and leads_into_docs(pos):
            # Untagged description belonging to a doc block.
            pos += 1
            continue

        # Code, or a comment that is not documentation.
        kept.append(raw)
        pos += 1

    return '\n'.join(kept)
def create_documentation_prompt(self, file_name: str, content: str) -> tuple[str, str]:
    """
    Build the system and user prompts sent to the LLM

    The source is embedded with a right-aligned, 1-based line number in
    front of every line so the model can refer to exact lines.

    Args:
        file_name: Name of the file being documented
        content: Ruby source code (should already have YARD comments stripped)

    Returns:
        (system_prompt, user_prompt) tuple
    """
    # Number every line ("   1: ...") before embedding it in the prompt.
    numbered_content = '\n'.join(
        f"{number:4d}: {text}"
        for number, text in enumerate(content.split('\n'), start=1)
    )

    system_prompt = """You are an expert Ruby documentation specialist.
Your task is to generate YARD-compatible documentation for Ruby code.
You will return JSON with documentation comments and their anchor points."""

    # Doubled braces and doubled backslashes below are f-string / Python
    # escapes; the model sees single braces and single backslashes.
    user_prompt = f"""Analyze this Ruby file from the Lich5 project: **{file_name}**
```ruby
{numbered_content}
```
Generate **YARD-compatible** documentation following the lich-5 documentation style guide.
The line numbers are shown at the start of each line (e.g., " 15: def method_name").
**CRITICAL RULES - READ CAREFULLY:**
1. **WHAT TO DOCUMENT**
- Document all public classes, modules, methods, and non-trivial constants
- Add `@api private` to methods that are public but internal (not called from .lic scripts)
- Skip already-documented code (existing YARD tags present)
- If there's nothing to document, return an empty JSON array: []
2. **WHAT TO SKIP (no documentation needed)**
- Trivial one-line delegation methods
- `attr_reader` / `attr_accessor` / `attr_writer` declarations
- Aliases where the target method is already documented
- Constants with self-evident names and values (e.g., `MAX_RETRIES = 3`)
- Private methods marked with `private` keyword, `# :nodoc:`, or `# @!visibility private`
(unless they need `@api private` as they are public-but-internal)
3. **TAG ORDER** — always use this order when multiple tags appear:
1. `@param` (one per parameter, in parameter order)
2. `@return`
3. `@example`
4. `@note`
5. `@raise`
6. `@see`
7. `@since`
8. `@deprecated`
9. `@api`
4. **TYPES ARE REQUIRED**
- Every `@param` and `@return` MUST include a type annotation in [brackets]
- Use `[void]` when a method has no meaningful return value
- Common types: [String], [Integer], [Boolean], [Array<String>], [Hash], [nil], [String, nil]
- Game-specific types: [String] for item/container nouns, [Regexp] for patterns, [OpenStruct] for settings
5. **PARAMETER NAME RULES**
- @param tags MUST exactly match the method's parameter names
- For block parameters (`&block`): DO NOT add a @param tag — document block behavior
in the method description or with @yield/@yieldparam tags instead
WRONG: @param block [Proc] ...
WRONG: @param &block [Proc] ...
RIGHT: omit @param for &block entirely; describe it in the summary or use @yield
- For splat parameters (*args): Use the name without the asterisk
WRONG: @param *messages [Array]
RIGHT: @param messages [Array]
- Parameter names must match what's in the def statement exactly (no & or * symbols)
6. **METHOD REFERENCE SYNTAX**
- Class/module methods (def self.method): use `.method_name` in @see and {{.method_name}} inline
- Instance methods (def method): use `#method_name` in @see and {{#method_name}} inline
7. **DO NOT USE THESE TAGS**
- `@author` — use git blame instead
- `@version` — use `@since` instead
- `@todo` — use GitHub issues instead
- `@abstract` — Ruby has no abstract methods
8. **VALIDATION BEFORE RETURNING**
- Double-check each @param name matches the actual method parameter
- Remove the & and * symbols from @param names
- Confirm tag order matches rule 3 above
- Ensure types are present on every @param and @return
Documentation structure:
1. For classes/modules:
- Brief description on first line
- Longer description if needed (document the contract, not implementation details)
- `@see` cross-references to related classes/modules if relevant
2. For methods:
- Brief description (what it does, not how)
- Tags in order: @param, @return, @example (encouraged for consumer-facing), @note, @raise, @see
- @example format: the tag line contains an optional title, code goes on the NEXT line(s) indented with "# "
WRONG: @example DRCI.get_item?(\\"sword\\")
RIGHT: @example Get from default storage\\n# DRCI.get_item?(\\"sword\\")
3. For constants (pattern arrays, non-trivial values):
- Brief description
- `@example` showing what strings the pattern matches (for Regexp arrays)
- `@see` to cross-reference paired constants and methods that use them
Return a JSON array where each entry contains:
- "line_number": The line number to insert before (1-indexed, counting from line 1)
- "anchor": A snippet of the line for validation (e.g., "class GameObj", "def initialize")
- "indent": The indentation level (number of spaces before the line)
- "comment": The YARD comment block as a single string with \\n for newlines
Example output format:
```json
[
{{
"line_number": 15,
"anchor": "class GameObj",
"indent": 0,
"comment": "# Represents a game object in the world.\\n#\\n# @see #noun The object's primary identifier"
}},
{{
"line_number": 23,
"anchor": "def initialize",
"indent": 2,
"comment": "# Initializes a new game object.\\n# @param id [String] unique object ID\\n# @param noun [String] object noun (e.g., \\"sword\\", \\"backpack\\")\\n# @return [GameObj]"
}}
]
```
IMPORTANT:
- Return ONLY the JSON array, no other text
- Line numbers should match the file (1-indexed)
- Anchors should be concise (just the key part like "def method_name" or "class ClassName")
- @param names MUST NOT include & or * symbols (use "block" not "&block", use "args" not "*args")
- Verify @param names match the actual method parameters exactly
- CRITICAL: In the "comment" field, you MUST escape all special characters:
* Double quotes MUST be escaped: use \\" not "
* Backslashes MUST be escaped: use \\\\ not \\
* Example code MUST have escaped quotes: Feat[\\"name\\"] not Feat["name"]
* Line breaks use \\n (already escaped)
- Your JSON MUST be valid and parseable - test it mentally before returning
"""
    return system_prompt, user_prompt
def process_file(self, file_path: Path) -> Optional[str]:
    """
    Document one Ruby file via the JSON-based workflow

    Steps: read the file, strip existing YARD comments, ask the provider
    for a JSON list of comments, and insert those comments into the
    stripped source. On an unparseable response the raw text is saved to
    ``<output_dir>/<stem>_failed_response.txt``.

    Args:
        file_path: Path to Ruby file

    Returns:
        Documented source text, or None if failed (the file name is then
        appended to ``self.failed_files``)
    """
    logger.info(f"Processing: {file_path.name}")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        # One more line than there are newline characters.
        line_total = original_content.count('\n') + 1
        logger.info(f" Lines: {line_total}, Characters: {len(original_content)}")

        # Existing YARD docs are removed first; the LLM regenerates them all,
        # which avoids duplicated documentation.
        stripped_content = self.strip_yard_comments(original_content)
        removed_lines = line_total - (stripped_content.count('\n') + 1)
        if removed_lines > 0:
            logger.info(f" Stripped {removed_lines} lines of existing YARD documentation")

        system_prompt, user_prompt = self.create_documentation_prompt(file_path.name, stripped_content)

        logger.info(f" Requesting documentation from {self.provider_name}...")
        result = self.provider.generate(user_prompt, system_prompt)

        comments = self.extract_comments_json(result)
        if comments is None:
            # Nothing parseable came back - log it and keep the raw response.
            logger.error(f" No comments extracted from response")
            logger.error(f" AI response length: {len(result)} characters")
            if len(result) < 1000:
                logger.error(f" Full AI response: {result}")
            else:
                logger.error(f" AI response (first 500): {result[:500]}")
                logger.error(f" AI response (last 500): {result[-500:]}")

            failed_response_file = self.output_dir / f"{file_path.stem}_failed_response.txt"
            with open(failed_response_file, 'w', encoding='utf-8') as f:
                f.write(f"Failed to parse JSON for: {file_path.name}\n")
                f.write(f"AI Response Length: {len(result)} characters\n")
                f.write("="*80 + "\n")
                f.write(result)
            logger.info(f" Saved failed response to: {failed_response_file.name}")

            self.failed_files.append(file_path.name)
            return None

        if len(comments) > 0:
            logger.info(f" Extracted {len(comments)} documentation entries")
            # Insert into the stripped text, not the original, to avoid duplicates.
            documented_code = self.insert_comments(stripped_content, comments)
        else:
            # An empty list is a valid answer: nothing in the file needs docs.
            logger.info(f" No documentation needed (file contains only requires/imports)")
            documented_code = stripped_content

        self.documentation[file_path.name] = {
            'original': original_content,
            'documented': documented_code,
            'timestamp': datetime.now().isoformat()
        }
        logger.info(f" ✅ Successfully documented {file_path.name}")
        return documented_code
    except Exception as e:
        # Per-file boundary: one bad file must not abort the whole run.
        logger.error(f" ❌ Failed to process {file_path.name}: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        self.failed_files.append(file_path.name)
        return None
def sanitize_json_escapes(self, json_text: str) -> str:
    r"""
    Repair backslash escapes that JSON does not allow

    Valid JSON escapes: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX
    Common invalid escapes from AI: \d, \s, \w, \x, etc. (regex patterns)

    The text is scanned left to right. A backslash followed by a legal
    escape is kept as is; a backslash followed by anything else gets its
    backslash doubled so it parses as a literal backslash. A lone
    backslash at the very end of the text is left untouched.

    Args:
        json_text: Raw JSON string that may contain invalid escapes

    Returns:
        Sanitized JSON string with invalid escapes fixed
    """
    # First alternative: a legal escape (no capture group).
    # Second alternative: backslash + any other character, captured.
    escape_pair = re.compile(r'\\(?:["\\/bfnrt]|u[0-9A-Fa-f]{4})|\\(.)', re.DOTALL)

    def repair(found):
        offender = found.group(1)
        if offender is None:
            # Legal escape - pass through unchanged.
            return found.group(0)
        # Illegal escape - turn "\x" into "\\x".
        return '\\\\' + offender

    return escape_pair.sub(repair, json_text)
def clean_json_concatenation(self, json_text: str) -> str:
    """
    Collapse "a" + "b" string concatenation into a single JSON string

    LLMs sometimes emit JavaScript/Python-style concatenation, inline or
    spread over several lines:
        "text1"
        + "text2"
        + "text3"
    JSON has no such operator, so each chain is replaced by one string
    literal holding the joined contents.

    Args:
        json_text: Raw JSON that may contain string concatenation

    Returns:
        Cleaned JSON with concatenations resolved
    """
    # One quoted string (escapes allowed), then one or more repetitions of
    # optional whitespace/newlines, "+", optional whitespace, quoted string.
    chain_pattern = r'"(?:[^"\\]|\\.)*"(?:\s*\+\s*"(?:[^"\\]|\\.)*")+'
    # Captures the inside of each quoted segment of a chain.
    segment_pattern = r'"((?:[^"\\]|\\.)*)"'

    def merge(chain):
        # Contents are already JSON-escaped, so they can be joined verbatim.
        pieces = re.findall(segment_pattern, chain.group(0))
        return '"' + ''.join(pieces) + '"'

    return re.sub(chain_pattern, merge, json_text)
def extract_comments_json(self, response: str) -> Optional[List[Dict[str, Any]]]:
    """
    Extract JSON array of comments from LLM response

    Tries direct JSON parsing first (for structured outputs), then falls back
    to regex-based extraction strategies. Every strategy accepts both a bare
    array ``[...]`` and the wrapped form ``{"comments": [...]}``; anything
    that does not yield a list is rejected, so callers only ever receive a
    list or None.

    Args:
        response: Raw text returned by the LLM

    Returns:
        List of comment entries (possibly empty), or None when no strategy
        produced a JSON list
    """
    def unwrap(parsed):
        """Return the comment list inside parsed JSON, or None if there is none."""
        if isinstance(parsed, dict) and "comments" in parsed:
            parsed = parsed["comments"]
        return parsed if isinstance(parsed, list) else None

    # Strategy 0: the whole response is already valid JSON.
    try:
        data = json.loads(response.strip())
    except json.JSONDecodeError:
        data = None  # Fall through to extraction strategies
    direct = unwrap(data)
    if direct is not None:
        if isinstance(data, dict):
            logger.debug("Direct JSON parse succeeded (wrapped format)")
        else:
            logger.debug("Direct JSON parse succeeded (array format)")
        return direct

    extraction_attempts = []

    # Strategy 1: fenced ```json code block
    json_blocks = re.findall(r'```json\s*(.*?)```', response, re.DOTALL)
    if json_blocks:
        extraction_attempts.append(('json code block', json_blocks[0].strip()))

    # Strategy 2: first "[{" through last "}]" (greedy)
    json_match = re.search(r'\[\s*\{.*\}\s*\]', response, re.DOTALL)
    if json_match:
        extraction_attempts.append(('greedy array match', json_match.group(0)))

    # Strategy 3: first "[{" through first "}]" (non-greedy), unless already queued
    json_match_ng = re.search(r'\[\s*\{.*?\}\s*\]', response, re.DOTALL)
    if json_match_ng and json_match_ng.group(0) not in [a[1] for a in extraction_attempts]:
        extraction_attempts.append(('non-greedy array match', json_match_ng.group(0)))

    # Strategy 4: last resort - the raw response, run through the cleaners
    if response.strip():
        extraction_attempts.append(('raw response', response.strip()))

    for strategy_name, json_text in extraction_attempts:
        # Bound up front so the error handler below can always slice it.
        sanitized = json_text
        try:
            # Step 1: Clean up string concatenation (LLMs sometimes use + operators)
            cleaned = self.clean_json_concatenation(json_text)
            # Step 2: Sanitize invalid escape sequences
            sanitized = self.sanitize_json_escapes(cleaned)
            comments = unwrap(json.loads(sanitized))
            if comments is None:
                logger.debug(f"Strategy '{strategy_name}' found non-list JSON, skipping")
                continue
            # Empty arrays are valid - file may have nothing to document
            logger.debug(f"Strategy '{strategy_name}' successfully extracted {len(comments)} comment entries")
            return comments
        except json.JSONDecodeError as e:
            logger.error(f"Strategy '{strategy_name}' failed to parse JSON: {e}")
            logger.error(f" Error at position {e.pos}: {sanitized[max(0, e.pos-50):e.pos+50]}")
            continue
        except Exception as e:
            logger.debug(f"Strategy '{strategy_name}' failed with error: {e}")
            continue

    # All strategies failed
    logger.error(f"Failed to parse JSON response with all {len(extraction_attempts)} strategies")
    logger.error(f"Response preview (first 500 chars): {response[:500]}")
    logger.error(f"Response preview (last 500 chars): {response[-500:]}")
    return None
def soft_match_anchor(self, anchor: str, line: str) -> bool:
    """
    Soft match anchor against line using Ruby-specific pattern matching

    The anchor is classified by how it starts (class/module, def, attr_*,
    all-caps constant, @/@@ variable) and compared against the line with a
    pattern suited to that construct. Anchors that fit none of these fall
    back to checking that every whitespace-separated token of the anchor
    occurs somewhere in the line.

    Args:
        anchor: The anchor string (e.g., "def initialize", "class GameObj")
        line: The line of code to match against

    Returns:
        True if anchor matches line using Ruby syntax patterns, False
        otherwise (always a real bool, never a match object or None)
    """
    anchor_stripped = anchor.strip()

    # Pattern 1: Class/Module definitions
    # Anchor: "class GameObj" or "module Lich"
    if anchor_stripped.startswith(('class ', 'module ')):
        keyword, name = anchor_stripped.split(None, 1)
        name = name.split('(')[0].strip()  # Remove any params
        pattern = rf'^\s*{keyword}\s+{re.escape(name)}\b'
        return re.search(pattern, line) is not None

    # Pattern 2: Method definitions (instance or class methods)
    # Anchor: "def method_name" or "def self.method" or "def ClassName.method"
    if anchor_stripped.startswith('def '):
        method_sig = anchor_stripped[4:].split('(')[0].strip()
        # Base method name is whatever follows the last dot (if any)
        method_name = method_sig.split('.')[-1]
        if not method_name:
            # Without a name the pattern below would accept almost any
            # "def" line, so treat a nameless anchor as a non-match.
            return False

        # The qualifier is optional on the line regardless of what the
        # anchor used, so "def method", "def self.method" and
        # "def ClassName.method" all match one another.
        qualifier = r'(?:(?:self|\w+)\.)?'
        if method_name == '[]':
            # Special case: array access operator
            name_pattern = r'\[\]'
        else:
            # Ruby allows a trailing ?, ! or = on method names. The
            # negative lookahead stops a short anchor from matching a
            # longer identifier ("def init" vs "def initialize").
            name_pattern = rf'{re.escape(method_name)}[?!=]?(?!\w)'
        if re.search(rf'\bdef\s+{qualifier}{name_pattern}', line):
            return True

        # Fallback: the full signature as written (covers qualifiers the
        # pattern above does not model, e.g. "Foo::Bar.baz"), with the
        # same end-of-name boundary.
        fallback = rf'def {re.escape(method_sig)}[?!=]?(?!\w)'
        return re.search(fallback, line) is not None

    # Pattern 3: Attribute readers/writers/accessors
    # Anchor: "attr_reader :mana" or "attr_accessor"
    if anchor_stripped.startswith('attr_'):
        parts = anchor_stripped.split()
        attr_type = parts[0]  # attr_reader, attr_accessor, etc.
        if len(parts) == 1:
            return attr_type in line
        # Only the first symbol is compared. Drop the leading ':' and the
        # ',' that follows it when the anchor lists several symbols.
        symbol = parts[1].lstrip(':').rstrip(',')
        pattern = rf'{re.escape(attr_type)}\s+:{re.escape(symbol)}\b'
        return re.search(pattern, line) is not None

    # Pattern 4: Constants (all caps with =)
    # Anchor: "CONSTANT_NAME" or "CONSTANT_NAME ="
    if anchor_stripped.replace('_', '').replace('=', '').strip().isupper():
        const_name = anchor_stripped.split('=')[0].strip()
        pattern = rf'\b{re.escape(const_name)}\s*='
        return re.search(pattern, line) is not None

    # Pattern 5: Class variables (@@var) or instance variables (@var)
    # Anchor: "@@variable" or "@variable"
    if anchor_stripped.startswith('@'):
        var_name = anchor_stripped.split()[0].split('=')[0].strip()
        pattern = rf'{re.escape(var_name)}\s*(=|\|\|=)'
        return re.search(pattern, line) is not None

    # Fallback: Token-based matching
    # Remove params, then require every remaining token to appear in the line
    tokens = anchor_stripped.split('(')[0].split()
    if not tokens:
        return False
    return all(token in line for token in tokens)
def find_insertion_line(self, lines: List[str], line_number: int, anchor: str,
inserted_at_lines: set) -> Optional[int]:
"""
Find the correct line to insert comment using progressive matching
Strategy:
1. Try exact match at expected line number
2. Try soft match at expected line number
3. Search entire file (nearby lines first, then rest of file)
Methods/classes are unique, so safe to search whole file
Args:
lines: Source code lines
line_number: Expected line number (1-indexed from AI)
anchor: Anchor string for validation
inserted_at_lines: Set of already-used line indices
Returns:
0-indexed line number to insert before, or None if not found
"""
# Convert to 0-indexed
expected_idx = line_number - 1
# Bounds check
if expected_idx < 0 or expected_idx >= len(lines):
logger.warning(f"Line number {line_number} out of bounds (file has {len(lines)} lines)")
return None
# Skip if already inserted at this line
if expected_idx in inserted_at_lines:
logger.debug(f"Line {line_number} already has a comment, skipping")
return None
# Strategy 1: Exact match at expected line
if anchor in lines[expected_idx]:
logger.debug(f"Exact match at line {line_number}")
return expected_idx
# Strategy 2: Soft match at expected line
if self.soft_match_anchor(anchor, lines[expected_idx]):
logger.debug(f"Soft match at line {line_number} for anchor: {anchor[:30]}")
return expected_idx
# Strategy 3: Search entire file (methods/classes are unique in a file)
# Start with nearby lines first, then expand outward
search_order = []
# Get line_offset from config
line_offset = 5 # default
if HAS_CONFIG:
try:
config = get_config()
line_offset = config.anchor_matching.line_offset
except Exception:
pass
# First check nearby lines (±line_offset)
for offset in range(-line_offset, line_offset + 1):
if offset == 0:
continue
idx = expected_idx + offset
if 0 <= idx < len(lines):
search_order.append(idx)
# Then check rest of file
for idx in range(len(lines)):
if idx != expected_idx and idx not in search_order: