diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..60dd06d --- /dev/null +++ b/.gitignore @@ -0,0 +1,69 @@ +*node_modules/ +settings.py +*.sqlite3 +.project +.pydevproject +.settings +.vscode +doctrees +bin/ +etc/ +pyvenv.cfg +share/ +.gzipper/ +.venv + +__pycache__/ +*.py[cod] +.pytest_cache/ +.ipynb_checkpoints +*.ipynb +notebooks/ + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +build/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build + diff --git a/README.md b/README.md index 842d647..ac694d1 100644 --- a/README.md +++ b/README.md @@ -121,3 +121,51 @@ This single .md file contains all the content from your input files, with a tabl (Content of styles.css) ``` +## Terminal + +To use Auto MD from the command line, run the following command: + +```bash +python terminal.py [options] +``` + +### Options + +- `-i`, `--input_paths`: Input file(s), folder(s), zip file(s) or GitHub repo URL(s) (required) +- `-o`, `--output_path`: Output file or folder (required) +- `-s`, `--single_file`: Combine all input files into a single output file (default: False) +- `-d`, `--repo_depth`: Depth for GitHub repository cloning (default: None) +- `-m`, `--include_metadata`: Include metadata in output (default: False) +- `-c`, `--include_toc`: Include table of contents in output (default: False) +- `-v`, `--verbose`: Enable verbose output (default: False) +- `-ig`, `--gitignore`: Path to a .gitignore file to use for excluding files (default: None) +- `-ip`, `--ignore_paths`: Space-separated 
list of paths to ignore (default: None) + +### Examples + +1. Process a single text file and save the output to a single Markdown file: + +```bash +python terminal.py -i input.txt -o output.md -s +``` + +2. Process a folder and save each file as a separate Markdown file: + +```bash +python terminal.py -i input_folder -o output_folder +``` + +3. Process a GitHub repository and save the output to a single Markdown file, excluding files listed in a .gitignore file: + +```bash +python terminal.py -i https://github.com/user/repo.git -o output.md -s -ig /path/to/.gitignore +``` + +4. Ignore some paths + +```bash +python terminal.py -i input_folder -o output_folder -ip docs tests +``` + +This will not include the `docs` and `tests` path in the processing + diff --git a/auto-md/file_processor.py b/auto-md/file_processor.py index 5969e45..6174950 100644 --- a/auto-md/file_processor.py +++ b/auto-md/file_processor.py @@ -9,77 +9,191 @@ from markdown_formatter import format_as_markdown, generate_toc TEXT_EXTENSIONS = { - '.txt', '.md', '.markdown', '.mdown', '.mkdn', '.mkd', '.mdwn', '.mdtxt', '.mdtext', '.text', - '.html', '.htm', '.xhtml', '.shtml', - '.css', '.scss', '.sass', '.less', - '.py', '.pyw', '.pyc', '.pyo', '.pyd', - '.js', '.jsx', '.ts', '.tsx', - '.yaml', '.yml', - '.json', '.jsonl', '.json5', - '.xml', '.xsl', '.xslt', '.svg', - '.csv', '.tsv', - '.rst', '.rest', - '.ini', '.cfg', '.conf', '.config', - '.log', '.log.1', '.log.2', - '.bat', '.cmd', '.sh', '.bash', '.zsh', '.fish', - '.sql', '.mysql', '.pgsql', '.sqlite', - '.php', '.phtml', '.php3', '.php4', '.php5', '.phps', - '.rb', '.rbw', '.rake', '.gemspec', - '.lua', '.luac', - '.pl', '.pm', '.t', '.pod', - '.go', '.gop', - '.java', '.class', '.jar', - '.cs', '.csx', '.vb', - '.c', '.h', '.cpp', '.hpp', '.cc', '.hh', '.cxx', '.hxx', - '.swift', '.kt', '.kts', - '.r', '.rdata', '.rds', '.rda', - '.m', '.mm', - '.tex', '.ltx', '.latex', '.bib', - '.asm', '.s', - '.f', '.for', '.f90', '.f95', '.f03', '.f08', 
def clean_text(text: str) -> str:
    """Normalize *text* to single-spaced, ASCII-only content.

    Every run of whitespace (newlines, tabs, Unicode spaces) is collapsed to a
    single space, every non-ASCII character is dropped, and leading/trailing
    whitespace is trimmed.

    Args:
        text: Raw text read from an input file.

    Returns:
        The cleaned text; an empty string if nothing but whitespace or
        non-ASCII characters was present.
    """
    # Collapse whitespace first so Unicode spaces (e.g. NBSP) survive as a
    # plain space instead of being deleted by the non-ASCII filter below.
    collapsed = re.sub(r"\s+", " ", text)
    ascii_only = re.sub(r"[^\x00-\x7F]+", "", collapsed)
    # Removing a non-ASCII run that sat between two spaces leaves "  " behind;
    # collapse once more so the result is consistently single-spaced.
    return re.sub(r" {2,}", " ", ascii_only).strip()
def process_folder(
    folder_path: str,
    output_dir: str,
    single_file: bool,
    combined_content: List[str],
    all_files: List[str],
    include_metadata: bool,
    include_toc: bool,
    toc_entries: Dict[str, str],
    gitignore_patterns: Optional[List[str]] = None,
    is_verbose: bool = False,
):
    """Process every text file under *folder_path*, honouring ignore patterns.

    The folder is walked recursively. Files whose suffix is in
    ``TEXT_EXTENSIONS`` are passed to ``process_file``; ``.zip`` archives are
    extracted next to themselves into a ``temp_<name>`` directory, processed
    recursively, and the directory is removed afterwards. Any path for which
    ``should_exclude`` returns true (relative to *folder_path*) is skipped.

    Args:
        folder_path: Root directory to walk.
        output_dir: Directory handed to ``process_file`` for per-file output.
        single_file: Forwarded to ``process_file``.
        combined_content: Mutated in place; each truthy ``process_file``
            result is appended.
        all_files: Mutated in place; each processed file path is appended
            before the file is processed.
        include_metadata: Forwarded to ``process_file``.
        include_toc: Forwarded to ``process_file``.
        toc_entries: Forwarded to ``process_file``.
        gitignore_patterns: Patterns passed to ``should_exclude``. ``None``
            (the default) means no extra patterns.
        is_verbose: When true, print each file path as it is processed.
    """
    patterns = [] if gitignore_patterns is None else gitignore_patterns
    root = Path(folder_path)
    logging.info(f"Processing folder: {folder_path}")

    for path in root.rglob("*"):
        if not path.is_file():
            continue

        suffix = path.suffix.lower()
        is_text = suffix in TEXT_EXTENSIONS
        if not is_text and suffix != ".zip":
            continue

        # Apply the ignore rules to archives as well as text files, so a zip
        # inside an ignored directory is not extracted and processed.
        if should_exclude(path.relative_to(root), patterns):
            continue

        if is_text:
            all_files.append(str(path))
            if is_verbose:
                print(" - Processing file", str(path))
            result = process_file(
                str(path),
                output_dir,
                single_file,
                all_files,
                include_metadata,
                include_toc,
                toc_entries,
            )
            if result:
                combined_content.append(result)
            continue

        temp_extract_to = path.parent / f"temp_{path.name}"
        try:
            extract_zip(str(path), str(temp_extract_to))
            process_folder(
                str(temp_extract_to),
                output_dir,
                single_file,
                combined_content,
                all_files,
                include_metadata,
                include_toc,
                toc_entries,
                patterns,
                is_verbose,  # keep verbose output inside nested archives
            )
        finally:
            # Always remove the extraction directory, even if processing fails.
            shutil.rmtree(temp_extract_to, ignore_errors=True)
def should_exclude(path: Path, gitignore_patterns: List[str]) -> bool:
    """Return True if *path* matches an ignore pattern or lies inside ``.git``.

    Only a subset of the gitignore syntax is supported:

    * A pattern without an inner or leading slash (``build``, ``build/``,
      ``*.log``) is matched, with shell-style wildcards, against every
      component of *path*.
    * A pattern with a leading or inner slash (``/build``, ``docs/_build``)
      is anchored: its segments are matched against the leading components
      of *path*.
    * Negation (``!pattern``) and ``**`` spanning several directories are
      not interpreted specially.

    Args:
        path: Path relative to the folder the patterns apply to.
        gitignore_patterns: Raw patterns, e.g. from ``process_gitignore`` or
            the command line.

    Returns:
        True when the path should be skipped, False otherwise.
    """
    # Imported here so this function works without a new module-level import.
    from fnmatch import fnmatchcase

    parts = path.parts
    if ".git" in parts:
        return True

    for raw_pattern in gitignore_patterns:
        pattern = raw_pattern.strip()
        segments = [segment for segment in pattern.split("/") if segment]
        if not segments:
            continue

        # A slash anywhere except the very end anchors the pattern to the root
        # of the relative path; a trailing slash only marks a directory.
        anchored = "/" in pattern.rstrip("/")
        if anchored:
            if len(parts) >= len(segments) and all(
                fnmatchcase(part, segment)
                for part, segment in zip(parts, segments)
            ):
                return True
        elif any(fnmatchcase(part, segments[0]) for part in parts):
            return True
    return False


def process_gitignore(filepath: str) -> List[str]:
    """Read a gitignore-style file and return its patterns.

    Blank lines and comment lines (first non-blank character ``#``) are
    dropped; surrounding whitespace is stripped from every pattern.

    Args:
        filepath: Location of the ignore file (anything ``open`` accepts).

    Returns:
        The patterns in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filepath, "r", encoding="utf-8") as gitignore_file:
        stripped_lines = (line.strip() for line in gitignore_file)
        # Test the stripped line so indented comments are recognised too.
        return [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith("#")
        ]
*global_ignore_patterns, + *process_gitignore(gitignore_path), + ] + process_folder( + str(path), + str(output_dir), + single_file, + combined_content, + all_files, + include_metadata, + include_toc, + toc_entries, + gitignore_patterns, + is_verbose, + ) elif path.suffix.lower() in TEXT_EXTENSIONS: all_files.append(str(path)) - result = process_file(str(path), str(output_dir), single_file, all_files, include_metadata, include_toc, - toc_entries) + result = process_file( + str(path), + str(output_dir), + single_file, + all_files, + include_metadata, + include_toc, + toc_entries, + ) if result: combined_content.append(result) - elif path.suffix.lower() == '.zip': + elif path.suffix.lower() == ".zip": extract_to = Path(temp_folder) / path.stem extract_zip(str(path), str(extract_to)) - process_folder(str(extract_to), str(output_dir), single_file, combined_content, all_files, include_metadata, - include_toc, toc_entries) + process_folder( + str(extract_to), + str(output_dir), + single_file, + combined_content, + all_files, + include_metadata, + include_toc, + toc_entries, + gitignore_patterns, + ) shutil.rmtree(extract_to, ignore_errors=True) elif item_path.startswith("https://github.com"): - repo_name = Path(item_path).name.replace('.git', '') + repo_name = Path(item_path).name.replace(".git", "") repo_temp_folder = Path(temp_folder) / repo_name clone_git_repo(item_path, str(repo_temp_folder), depth=repo_depth) - process_folder(str(repo_temp_folder), str(output_dir), single_file, combined_content, all_files, - include_metadata, include_toc, toc_entries) + process_folder( + str(repo_temp_folder), + str(output_dir), + single_file, + combined_content, + all_files, + include_metadata, + include_toc, + toc_entries, + gitignore_patterns, + ) shutil.rmtree(repo_temp_folder, ignore_errors=True) if single_file and combined_content: @@ -175,11 +396,11 @@ def process_input(input_paths: List[str], output_path: str, temp_folder: str, si if include_toc: toc = 
def _build_parser():
    """Create the argument parser for the command-line interface."""
    parser = argparse.ArgumentParser(description="Process text files.")
    parser.add_argument(
        "-i",
        "--input_paths",
        nargs="+",
        required=True,
        help="Input directories, text files, zip files, or GitHub repos.",
    )
    parser.add_argument(
        "-o", "--output_path", required=True, help="Output directory or file."
    )
    parser.add_argument(
        "-t",
        "--temp_folder",
        default="temp",
        help="Temporary folder for extracting zip files and cloning GitHub repos.",
    )
    parser.add_argument(
        "-s",
        "--single_file",
        action="store_true",
        help="Combine all processed files into a single output file.",
    )
    parser.add_argument(
        "-d",
        "--repo_depth",
        type=int,
        help="Depth for cloning GitHub repos. Useful for large repos.",
    )
    parser.add_argument(
        "-m",
        "--include_metadata",
        action="store_true",
        help="Include metadata (file path and title) in the output.",
    )
    parser.add_argument(
        "-c",
        "--include_toc",
        action="store_true",
        help="Include a table of contents at the beginning of the output.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Print verbose output.",
    )
    parser.add_argument(
        "-ig",
        "--gitignore",
        type=str,
        help="Path to a .gitignore file to ignore certain files and directories.",
    )
    parser.add_argument(
        "-ip",
        "--ignore_paths",
        nargs="+",
        default=[],
        help="Additional paths to ignore.",
    )
    return parser


def main(argv=None):
    """Run the command-line interface.

    Parses the arguments, calls ``process_input`` with them, and prints where
    the output was written.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour when run as a script.

    Raises:
        SystemExit: With code 2 on invalid arguments (raised by argparse),
            code 0 after ``--help``, or code 1 when processing fails.
    """
    args = _build_parser().parse_args(argv)

    try:
        output_dir = process_input(
            args.input_paths,
            args.output_path,
            args.temp_folder,
            args.single_file,
            args.repo_depth,
            args.include_metadata,
            args.include_toc,
            args.verbose,
            args.gitignore,
            args.ignore_paths,
        )
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure via exit code.
        print(f"An error occurred: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"Processed files saved to: {output_dir}")


if __name__ == "__main__":
    main()