generate_docs.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import fnmatch
import json
import logging
import os
import re
from pathlib import Path

import requests

# Configuration
LM_STUDIO_API_URL = "http://localhost:1234/v1/chat/completions"  # Update this with your actual LLM server URL
MODEL_ID = "your-model-id"  # Replace with your actual model ID
PROJECT_ROOT = Path(__file__).parent.resolve()
DOCS_ROOT = PROJECT_ROOT / "docs"

# Define file extensions and directories relevant to your stacks
SOURCE_CODE_EXTENSIONS = [".php", ".js", ".jsx", ".ts", ".tsx", ".json", ".prisma", ".yml"]
MAX_FILE_SIZE_MB = 10  # Maximum file size in megabytes to process
IGNORE_PATTERNS = [
    "node_modules", ".git", ".env", "vendor", "storage", "public", ".next", "dist",
    "*.test.js", "*.test.ts", "*.spec.js", "*.spec.ts", "*.md", "*.css", "generate_docs.py",
]
DEPENDENCY_FILES = ["composer.json", "package.json", "requirements.txt", "Pipfile", "Pipfile.lock"]

PROMPT_TEMPLATE = """
# Note: The following dependency information is for contextual understanding only and should not be included in the final documentation.
# Dependencies:
{dependencies_info}
You are a skilled technical writer, tasked with creating well-organized and accessible documentation for the provided source code. The documentation should cater to both beginners and experienced developers, covering the following sections:
1. **Overview**: Summarize the code's purpose and functionality in a concise, clear manner.
2. **In-Depth Explanation**: Detail the core functions, classes, and methods. Explain the logic, design decisions, and possible alternatives, using code snippets where necessary.
3. **Usage Examples**: Provide practical examples to demonstrate the code in action. Include step-by-step guides for beginners and advanced tips for seasoned developers.
4. **Dependencies**: List all external dependencies and tools used in the source code.
5. **Best Practices and Optimization**: Recommend best practices for using and extending the code. Suggest optimization techniques and potential areas for improvement.
6. **Common Pitfalls and Troubleshooting**: Highlight common issues and offer troubleshooting advice.
7. **Sensitive Data Masking**: Ensure any sensitive information in the code is obscured, replacing it with placeholders like `[MASKED]`.
Here is the source code:
**File:** {file_path}
```
{content}
```
**Formatting**: Use Markdown, with clear headings, code blocks, and bullet points. Include code snippets to support explanations and examples.
**Tone and Style**: Maintain a conversational yet professional tone. Avoid unnecessary jargon; explain technical terms clearly.
**Quality Assurance**: Ensure the documentation is accurate, complete, and that code snippets are functional and relevant.
"""
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_gitignore_patterns(root_path: Path) -> list:
    """Load ignore patterns from all .gitignore files within the root path."""
    patterns = IGNORE_PATTERNS[:]
    for root, _, files in os.walk(root_path):
        for file in files:
            if file == ".gitignore":
                gitignore_path = Path(root) / file
                logging.info(f"Loading patterns from {gitignore_path}...")
                with open(gitignore_path, 'r', encoding='utf-8') as f:
                    patterns.extend(line.strip() for line in f if line.strip() and not line.startswith('#'))
    return patterns
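
# Note: patterns collected here are applied globally rather than relative to
# the .gitignore that declared them, so an entry in a nested .gitignore will
# ignore matching names anywhere in the tree.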

def should_ignore(file_path: Path, ignore_patterns: list) -> bool:
    """Check if the file path matches any ignore pattern."""
    for pattern in ignore_patterns:
        # fnmatch handles both literal names ("node_modules") and glob
        # patterns ("*.test.js"), which a plain equality check would miss.
        if fnmatch.fnmatch(file_path.name, pattern.strip('/')):
            logging.info(f"Ignoring path due to pattern '{pattern}': {file_path}")
            return True
    return False

def is_source_code_file(file_path: Path) -> bool:
    """Check if the file is a source code file based on its extension."""
    return file_path.suffix in SOURCE_CODE_EXTENSIONS

def mask_secrets(code: str) -> str:
    """Preprocess the code to mask potential secrets."""
    # Example patterns for common secrets; adjust as needed
    patterns = [
        # JWT tokens (commonly found in Authorization headers)
        r"Bearer\s[a-zA-Z0-9\-_.]+",  # Example: Bearer <token>
        r"eyJhbGciOiJIUzI1NiIsIn[\w-]+",  # HS256 JWT header prefix (partially matched)
        r"([a-zA-Z0-9]{20,}\.[a-zA-Z0-9]{20,}\.[a-zA-Z0-9]{20,})",  # General JWT pattern
        # General API keys (long alphanumeric strings)
        r"[a-zA-Z0-9]{32,}",  # General API key pattern (long alphanumeric strings)
        r"[A-Za-z0-9-_]{40,}",  # Extended general API key pattern
        # API keys in single or double quotes
        r"'[A-Za-z0-9-_]{20,}'",  # API key in single quotes
        r'"[A-Za-z0-9-_]{20,}"',  # API key in double quotes
        # Other potential secret patterns
        r"(?:apikey|api_key|apiToken|api-token|secret|authToken|auth_token|token)[\s:]*['\"]?[a-zA-Z0-9-_]{20,}['\"]?",  # Various labels for secrets
        # r"\b[A-Za-z0-9-_]{20,}\b",  # Generic long alphanumeric strings (could be passwords or tokens)
        # Mask secrets assigned with quotes
        r'\b(?:password|pass|secret|api[_-]?key|token|auth[_-]?key)\b\s*=\s*[\'"][^\'"]*[\'"]',
        # Mask secrets assigned without quotes
        r'\b(?:password|pass|secret|api[_-]?key|token|auth[_-]?key)\b\s*=\s*\S+',
    ]
    masked_code = code
    for pattern in patterns:
        masked_code = re.sub(pattern, '[MASKED]', masked_code, flags=re.IGNORECASE)
    return masked_code
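
# Illustrative example of the masking behaviour (the broad patterns above
# deliberately over-match, so expect false positives on long hashes or IDs):
#     mask_secrets('api_key = "sk_live_abcdefghij1234567890"')  # -> '[MASKED]'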

def replace_spaces_and_tabs(text: str) -> str:
    """Collapse runs of four or more spaces/tabs into a single tab to shrink the prompt."""
    return re.sub(r'[ \t]{4,}', '\t', text)

def generate_documentation(prompt: str) -> str:
    """Generate documentation from the given prompt using the local LLM server."""
    headers = {"Content-Type": "application/json"}
    data = {
        "model": MODEL_ID,
        "messages": [
            {
                "role": "system",
                "content": '<Assistant_Info> Mr_AI, created by Its-Satyajit, has knowledge up to Today. It answers questions based on this knowledge and clarifies its cutoff date when relevant. It cannot access URLs, links, or videos, and asks users to provide relevant text or images when needed. Mr_AI approaches controversial topics with care, avoiding labeling information as sensitive or objective. It excels at solving complex problems by breaking them down into simpler components and using step-by-step reasoning. </Assistant_Info> <Task_Handling> Mr_AI handles both simple and complex tasks with precision. For complex problems, it divides the task into smaller, manageable steps and solves each systematically. When working on coding, logical reasoning, or problem-solving, Mr_AI uses detailed, step-by-step analysis to ensure clarity and accuracy. It assists with tasks involving popular opinions regardless of its own views. For obscure topics, it warns about possible hallucinations and advises users to verify citations since it lacks access to databases or search engines. </Task_Handling> <Interaction_Style> Mr_AI avoids unnecessary apologies and filler phrases like "Certainly" or "Absolutely." It provides concise answers but switches to detailed responses for more intricate tasks or when asked for specifics. For coding, math, and problem-solving, Mr_AI follows a clear, methodical process, explaining each step. Code explanations are provided only if requested. For lengthy tasks, it breaks them into parts, seeking user feedback as it completes each section. Users can give feedback via the thumbs-down button if dissatisfied. </Interaction_Style> <Image_Handling> Mr_AI is "face-blind" and cannot recognize individuals from images. It discusses named individuals but never implies facial recognition. For any image-based task, Mr_AI summarizes the content and repeats instructions before proceeding. </Image_Handling> <Language_and_Tone> Mr_AI communicates in the user\'s language and follows these guidelines for tasks like analysis, coding, problem-solving, creative writing, teaching, and more. It prioritizes clarity and logical structure, especially for complex or abstract inquiries, ensuring responses are as accurate and systematic as possible. </Language_and_Tone>',
            },
            {"role": "user", "content": prompt}
        ],
        "temperature": 1,
        "max_tokens": 5000,  # Generous limit to accommodate detailed documentation
    }
    try:
        logging.info("Sending request to LLM server...")
        # Local models can be slow; set a generous timeout rather than hanging forever
        response = requests.post(LM_STUDIO_API_URL, headers=headers, json=data, timeout=600)
        response.raise_for_status()
        completion = response.json().get("choices", [{}])[0].get("message", {}).get("content", "")
        logging.info("Documentation generated successfully.")
        return completion
    except requests.RequestException as e:
        logging.error(f"Error generating documentation: {e}")
        return ""

def get_output_file_name(file_path: Path) -> Path:
    """Generate an output file name based on the input file path, preserving folder structure."""
    relative_path = file_path.relative_to(PROJECT_ROOT)
    # Append '-documentation' before the .md extension
    base_name = str(relative_path.with_suffix('')) + '-documentation.md'
    return DOCS_ROOT / base_name
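
# Example mapping (illustrative path):
#     src/controllers/UserController.php -> docs/src/controllers/UserController-documentation.md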

def load_dependency_file(file_path: Path) -> dict:
    """Load dependency information from a given file."""
    if file_path.exists():
        with open(file_path, 'r', encoding='utf-8') as f:
            if file_path.suffix == ".json":
                return json.load(f)
            elif file_path.name == "requirements.txt":
                return {"requirements.txt": [line.strip() for line in f if line.strip()]}
            elif file_path.name in ["Pipfile", "Pipfile.lock"]:
                return {"Pipfile": [line.strip() for line in f if line.strip()]}
    return {}
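
# Note: Pipfile and Pipfile.lock are read as raw lines rather than parsed as
# TOML/JSON; that is good enough for prompt context, but not for real
# dependency resolution.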

def get_dependencies_info() -> str:
    """Generate a formatted string of dependencies from known files."""
    dependencies_info = ""
    for dep_file in DEPENDENCY_FILES:
        file_path = PROJECT_ROOT / dep_file
        if file_path.exists():
            data = load_dependency_file(file_path)
            if file_path.suffix == ".json":
                if "dependencies" in data:
                    dependencies_info += "### Node Dependencies\n\n"
                    for dep, version in data["dependencies"].items():
                        dependencies_info += f"- **{dep}**: {version}\n"
                if "require" in data:
                    dependencies_info += "\n### Laravel Dependencies\n\n"
                    for dep, version in data["require"].items():
                        dependencies_info += f"- **{dep}**: {version}\n"
                if "devDependencies" in data:
                    dependencies_info += "\n### Node Dev Dependencies\n\n"
                    for dev_dep, version in data["devDependencies"].items():
                        dependencies_info += f"- **{dev_dep}**: {version}\n"
                if "require-dev" in data:
                    dependencies_info += "\n### Laravel Dev Dependencies\n\n"
                    for dep, version in data["require-dev"].items():
                        dependencies_info += f"- **{dep}**: {version}\n"
            elif file_path.name == "requirements.txt":
                dependencies_info += "### Python Dependencies (requirements.txt)\n\n"
                for requirement in data.get("requirements.txt", []):
                    dependencies_info += f"- {requirement}\n"
            elif file_path.name in ["Pipfile", "Pipfile.lock"]:
                dependencies_info += f"### Python Dependencies ({file_path.name})\n\n"
                for requirement in data.get("Pipfile", []):
                    dependencies_info += f"- {requirement}\n"
    return dependencies_info
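
# Sample of the string this produces (package names and versions illustrative):
#     ### Node Dependencies
#
#     - **react**: ^18.2.0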

def file_is_too_large(file_path: Path) -> bool:
    """Check if the file size exceeds the maximum allowed size."""
    try:
        file_size_mb = file_path.stat().st_size / (1024 * 1024)  # Size in megabytes
        return file_size_mb > MAX_FILE_SIZE_MB
    except OSError as e:  # FileNotFoundError is a subclass of OSError
        logging.error(f"Error checking file size for {file_path}: {e}")
        return False

def process_file(file_path: Path, ignore_patterns: list, force: bool) -> None:
    """Process each file to generate or update documentation."""
    if should_ignore(file_path, ignore_patterns) or not is_source_code_file(file_path):
        logging.info(f"Skipping ignored or non-source file: {file_path}")
        return
    if file_is_too_large(file_path):
        logging.info(f"Skipping file larger than {MAX_FILE_SIZE_MB} MB: {file_path}")
        return
    output_file_path = get_output_file_name(file_path)
    # Check if documentation needs to be regenerated
    if not force and output_file_path.exists():
        try:
            doc_mtime = output_file_path.stat().st_mtime
            file_mtime = file_path.stat().st_mtime
            if file_mtime <= doc_mtime:
                logging.info(f"Documentation already up-to-date for {file_path}. Skipping.")
                return
            else:
                logging.info(f"File modified after documentation creation. Re-generating for {file_path}.")
        except OSError as e:
            logging.warning(f"Error checking modification times: {e}")
            logging.info(f"Proceeding to re-generate documentation for {file_path}.")
    else:
        logging.info(f"Processing file: {file_path}")
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except UnicodeDecodeError:
        logging.error(f"Error reading file {file_path}: File is not UTF-8 encoded. Skipping.")
        return
    except IOError as e:
        logging.error(f"Error reading file {file_path}: {e}")
        return
    masked_content = mask_secrets(content)
    relative_file_path = file_path.relative_to(PROJECT_ROOT)
    dependencies = get_dependencies_info()
    prompt = PROMPT_TEMPLATE.format(
        content=replace_spaces_and_tabs(masked_content),
        file_path=relative_file_path,
        dependencies_info=dependencies,
    )
    documentation = generate_documentation(prompt)
    if documentation:
        output_file_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(documentation)
            logging.info(f"Documentation saved to {output_file_path}")
        except IOError as e:
            logging.error(f"Error writing file {output_file_path}: {e}")

def generate_docs(file: Path = None, force: bool = False):
    """Generate documentation for all files or a single file."""
    ignore_patterns = load_gitignore_patterns(PROJECT_ROOT)
    if file:
        file_path = Path(file).resolve()
        if file_path.is_file():
            logging.info(f"Generating documentation for: {file_path}")
            process_file(file_path, ignore_patterns, force)
        else:
            logging.error(f"File not found: {file_path}")
    else:
        logging.info("Generating documentation for all source code files...")
        for root, dirs, files in os.walk(PROJECT_ROOT):
            logging.info(f"Processing directory: {root}")
            # Prune ignored directories in place so os.walk skips them entirely
            dirs[:] = [d for d in dirs if not should_ignore(Path(root) / d, ignore_patterns)]
            for file in files:
                file_path = Path(root) / file
                process_file(file_path, ignore_patterns, force)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate or update documentation from source code.")
    parser.add_argument('--file', type=str, help="Specify a single file to generate documentation for.")
    parser.add_argument('--force', action='store_true', help="Force re-create documentation even if it already exists.")
    args = parser.parse_args()

    logging.info("Starting documentation generation...")
    generate_docs(file=args.file, force=args.force)
    logging.info("Documentation generation completed.")