Adds a new WRITEME with improved validation and error messaging. #7526

Open · wants to merge 1 commit into main
106 changes: 106 additions & 0 deletions .tools/readmes/cache.py
@@ -0,0 +1,106 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Cache implementation for WRITEME to speed up repeated runs.
"""

import logging
import os
import pickle
from pathlib import Path
from typing import Any, Optional

logger = logging.getLogger(__name__)

# Cache directory relative to the readmes directory
CACHE_DIR = Path(__file__).parent / ".cache"


def get_cache_enabled() -> bool:
"""Check if caching is enabled via environment variable."""
return os.environ.get("USE_METADATA_CACHE", "0") == "1"


def ensure_cache_dir() -> None:
"""Ensure the cache directory exists."""
if not CACHE_DIR.exists():
CACHE_DIR.mkdir(exist_ok=True)
logger.debug(f"Created cache directory: {CACHE_DIR}")


def get_cache_path(key: str) -> Path:
"""Get the cache file path for a given key."""
# Create a filename-safe version of the key
safe_key = key.replace("/", "_").replace(":", "_")
return CACHE_DIR / f"{safe_key}.pickle"


def save_to_cache(key: str, data: Any) -> bool:
"""
Save data to cache.

Args:
key: Cache key
data: Data to cache (must be pickle-able)

Returns:
bool: True if successfully cached, False otherwise
"""
if not get_cache_enabled():
return False

try:
ensure_cache_dir()
cache_path = get_cache_path(key)

with open(cache_path, "wb") as f:
pickle.dump(data, f)

logger.debug(f"Cached data for key: {key}")
return True
except Exception as e:
logger.warning(f"Failed to cache data for key {key}: {e}")
return False


def load_from_cache(key: str) -> Optional[Any]:
"""
Load data from cache.

Args:
key: Cache key

Returns:
The cached data or None if not found or caching disabled
"""
if not get_cache_enabled():
return None

cache_path = get_cache_path(key)

if not cache_path.exists():
return None

try:
with open(cache_path, "rb") as f:
data = pickle.load(f)

logger.debug(f"Loaded data from cache for key: {key}")
return data
except Exception as e:
logger.warning(f"Failed to load cache for key {key}: {e}")
return None


def clear_cache() -> None:
"""Clear all cached data."""
if CACHE_DIR.exists():
for cache_file in CACHE_DIR.glob("*.pickle"):
try:
cache_file.unlink()
except Exception as e:
logger.warning(f"Failed to delete cache file {cache_file}: {e}")

logger.info("Cache cleared")
251 changes: 251 additions & 0 deletions .tools/readmes/deep_validator.py
@@ -0,0 +1,251 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Deep validator module for WRITEME to check for issues in the codebase.
This version performs a more thorough check for duplicate snippet tags by
directly scanning the files in the repository.
"""

import concurrent.futures
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from aws_doc_sdk_examples_tools.doc_gen import DocGen

logger = logging.getLogger(__name__)


class ValidationError(Exception):
    """Raised when deep validation finds problems, such as duplicate snippet tags."""


def find_snippet_tags_in_file(file_path: Path) -> List[Tuple[str, int]]:
"""
Find all snippet tags in a file by directly parsing the file content.

Args:
file_path: Path to the file to check

Returns:
List of tuples containing (tag, line_number)
"""
if not file_path.exists():
return []

try:
with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
lines = f.readlines()
except Exception as e:
logger.warning(f"Error reading file {file_path}: {e}")
return []

# Common snippet tag patterns
patterns = [
# Standard snippet tag format
r'snippet-start:\s*\[([^\]]+)\]',
r'snippet-end:\s*\[([^\]]+)\]',
# Alternative formats
r'SNIPPET\s+START\s+\[([^\]]+)\]',
r'SNIPPET\s+END\s+\[([^\]]+)\]',
r'//\s*SNIPPET:\s*([^\s]+)',
r'#\s*SNIPPET:\s*([^\s]+)',
r'<!--\s*SNIPPET:\s*([^\s]+)\s*-->',
        # Catch-all for other tag-like formats; the lookahead skips the
        # "start"/"end" keywords already captured by the patterns above
        r'snippet[:\-_](?!start\b|end\b)([a-zA-Z0-9_\-]+)',
        # Common AWS SDK snippet formats without brackets (bracketed forms
        # are already captured by the standard patterns above)
        r'//\s*snippet-start:\s*([^\s\[]+)',
        r'#\s*snippet-start:\s*([^\s\[]+)',
        r'<!--\s*snippet-start:\s*([^\s\[]+)\s*-->',
        r'//\s*snippet-end:\s*([^\s\[]+)',
        r'#\s*snippet-end:\s*([^\s\[]+)',
        r'<!--\s*snippet-end:\s*([^\s\[]+)\s*-->',
]

results = []
for i, line in enumerate(lines, 1):
for pattern in patterns:
matches = re.findall(pattern, line, re.IGNORECASE)
for match in matches:
results.append((match, i))

return results


def scan_directory_for_snippet_tags(
root_dir: Path,
extensions: Optional[List[str]] = None,
max_workers: int = 10
) -> Dict[str, List[Tuple[str, int, str]]]:
"""
Scan a directory recursively for files containing snippet tags.
Uses parallel processing for faster scanning.

Args:
root_dir: Root directory to scan
extensions: Optional list of file extensions to check
max_workers: Maximum number of parallel workers

Returns:
Dictionary mapping snippet tags to lists of (file_path, line_number, context)
"""
if extensions is None:
# Default extensions to check
extensions = [
'.py', '.java', '.js', '.ts', '.cs', '.cpp', '.c', '.go', '.rb',
'.php', '.swift', '.kt', '.rs', '.abap', '.md', '.html', '.xml'
]

# Find all files with the specified extensions
files_to_scan = []
for root, _, files in os.walk(root_dir):
for file in files:
if any(file.endswith(ext) for ext in extensions):
files_to_scan.append(Path(root) / file)

# Process files in parallel
tag_to_locations = defaultdict(list)

    def process_file(file_path):
        try:
            relative_path = file_path.relative_to(root_dir)
            tags = find_snippet_tags_in_file(file_path)
            if not tags:
                return {}

            # Read the file once so every tag hit can carry a little context
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    lines = f.readlines()
            except Exception:
                lines = []

            file_tags = defaultdict(list)
            for tag, line_number in tags:
                if lines:
                    start_line = max(0, line_number - 2)
                    end_line = min(len(lines), line_number + 1)
                    context = ''.join(lines[start_line:end_line]).strip()
                else:
                    context = "<context unavailable>"
                file_tags[tag].append((str(relative_path), line_number, context))

            return file_tags
        except Exception as e:
            logger.warning(f"Error processing file {file_path}: {e}")
            return {}

# Use ThreadPoolExecutor for parallel processing
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_file = {executor.submit(process_file, file): file for file in files_to_scan}

for future in concurrent.futures.as_completed(future_to_file):
file_results = future.result()
for tag, locations in file_results.items():
tag_to_locations[tag].extend(locations)

return tag_to_locations


def check_duplicate_snippet_tags_deep(doc_gen: DocGen) -> List[Tuple[str, List[Dict[str, Any]]]]:
"""
Deep check for duplicate snippet tags in the codebase.
This function scans all files directly to find snippet tags.

Args:
doc_gen: The DocGen instance containing snippets

Returns:
List of tuples containing (tag, [location_details]) for duplicate tags
"""
logger.info("Starting deep scan for duplicate snippet tags...")

# Scan the repository directly for snippet tags
root_dir = doc_gen.root
tag_locations = scan_directory_for_snippet_tags(root_dir)

# Find tags that appear in multiple files
duplicates = []
for tag, locations in tag_locations.items():
# Group locations by file path
files = {}
for file_path, line_number, context in locations:
if file_path not in files:
files[file_path] = []
files[file_path].append({"line": line_number, "context": context})

# If the tag appears in multiple files, it's a duplicate
if len(files) > 1:
duplicate_info = []
for file_path, occurrences in files.items():
duplicate_info.append({
"file": file_path,
"occurrences": occurrences
})
duplicates.append((tag, duplicate_info))

logger.info(f"Deep scan complete. Found {len(duplicates)} duplicate tags.")
return duplicates


def format_duplicate_report(duplicates: List[Tuple[str, List[Dict[str, Any]]]]) -> str:
"""
Format a detailed report of duplicate snippet tags.

Args:
duplicates: List of duplicate tag information

Returns:
Formatted report as a string
"""
if not duplicates:
return "No duplicate snippet tags found."

report = [f"Found {len(duplicates)} duplicate snippet tags:"]

for tag, locations in duplicates:
report.append(f"\nTag: '{tag}' found in {len(locations)} files:")

for location in locations:
file_path = location["file"]
occurrences = location["occurrences"]

report.append(f" File: {file_path}")
for occurrence in occurrences:
line = occurrence.get("line", "unknown")
context = occurrence.get("context", "").replace("\n", " ").strip()
                if context:
                    # Add an ellipsis only when the context was actually truncated
                    ellipsis = "..." if len(context) > 60 else ""
                    context = f" - Context: {context[:60]}{ellipsis}"
report.append(f" Line {line}{context}")

return "\n".join(report)


def validate_snippets_deep(doc_gen: DocGen, strict: bool = False) -> bool:
"""
Deep validation of snippets in the codebase.

Args:
doc_gen: The DocGen instance containing snippets
strict: If True, raise an exception for validation errors

Returns:
True if validation passed, False otherwise
"""
validation_passed = True

# Check for duplicate snippet tags using the deep method
duplicates = check_duplicate_snippet_tags_deep(doc_gen)
if duplicates:
validation_passed = False
report = format_duplicate_report(duplicates)
print("\n=== DUPLICATE SNIPPET TAGS (DEEP SCAN) ===")
print(report)

        # Raise an error if strict validation is enabled
if strict:
raise ValidationError("Validation failed: duplicate snippet tags found")
else:
print("No duplicate snippet tags found in deep scan.")

return validation_passed
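
For reviewers, a minimal sketch of how the deep validator might be driven from a WRITEME entry point. DocGen.from_root is an assumption about aws_doc_sdk_examples_tools, and the import path is illustrative; only doc_gen.root is consulted by the deep scan, so adjust to however WRITEME actually builds its DocGen.

from pathlib import Path

from aws_doc_sdk_examples_tools.doc_gen import DocGen

import deep_validator  # assumed import; the module lives at .tools/readmes/deep_validator.py

doc_gen = DocGen.from_root(Path("."))  # assumed constructor; only .root is used here

try:
    passed = deep_validator.validate_snippets_deep(doc_gen, strict=True)
except deep_validator.ValidationError as err:
    print(f"Deep validation failed: {err}")
    raise SystemExit(1)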