From ed5419d8870837440a8304ec8579bdc0affc6330 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 2 Feb 2026 20:24:11 +0530 Subject: [PATCH 1/4] Add offline archive creation script and documentation Adds tools for creating a complete offline archive of the CockroachDB documentation that can be viewed locally in a browser. - create_full_archive.py: Main script that builds the offline archive - OFFLINE_ARCHIVE_README.md: Usage documentation Features: - Supports all doc versions (v23.1 - v26.1) - Localizes Google Fonts and jQuery for offline use - Fixes paths for relative navigation - Preserves version switcher functionality - Hides online-only elements (search, AI widgets) - Creates distributable ZIP archive --- src/current/OFFLINE_ARCHIVE_README.md | 117 ++++ src/current/create_full_archive.py | 777 ++++++++++++++++++++++++++ 2 files changed, 894 insertions(+) create mode 100644 src/current/OFFLINE_ARCHIVE_README.md create mode 100644 src/current/create_full_archive.py diff --git a/src/current/OFFLINE_ARCHIVE_README.md b/src/current/OFFLINE_ARCHIVE_README.md new file mode 100644 index 00000000000..5db80036f02 --- /dev/null +++ b/src/current/OFFLINE_ARCHIVE_README.md @@ -0,0 +1,117 @@ +# Offline Archive Creator for CockroachDB Documentation + +This tool creates a complete offline archive of the CockroachDB documentation that can be viewed locally in a browser without an internet connection. 
+ +## Prerequisites + +- Python 3.8+ +- Required packages: `requests`, `beautifulsoup4` +- Built Jekyll site in `_site/docs/` + +```bash +pip install requests beautifulsoup4 +``` + +## Usage + +### Basic Usage + +Generate the archive directory only: + +```bash +python3 create_full_archive.py +``` + +### Generate Archive with ZIP + +Generate the archive and create a distributable ZIP file: + +```bash +python3 create_full_archive.py --zip +``` + +### Custom Options + +```bash +# Specify stable version +python3 create_full_archive.py --stable-version v26.1 --zip + +# Custom source and output directories +python3 create_full_archive.py --site-dir /path/to/_site/docs --output-dir /path/to/output +``` + +## Output + +| Output | Description | +|--------|-------------| +| `complete_archive/` | Directory containing the full offline site | +| `complete_archive.zip` | ZIP file for distribution (with `--zip` flag) | + +## What the Script Does + +1. **Copies all content** - All version directories (v23.1 - v26.1), cockroachcloud, releases, advisories, molt, stable +2. **Copies assets** - CSS, JS, images, fonts, sidebar data +3. **Localizes external resources** (requires network access while the script runs): + - Downloads Google Fonts for offline use + - Replaces CDN jQuery with local copy + - Downloads navgoco navigation library +4. **Fixes paths for offline viewing**: + - Converts absolute paths to relative + - Fixes version switcher links + - Removes query parameters from image URLs +5. **Applies offline-specific fixes**: + - Hides search (it requires an online connection) + - Hides AI assistant widgets + - Preserves version switcher functionality + - Adds archived version banner +6. **Cleans artifacts** - Removes macOS `__MACOSX` directories, `.DS_Store`, and `._*` files + +## Included Versions + +The archive includes documentation for: +- v23.1, v23.2 +- v24.1, v24.2, v24.3 +- v25.1, v25.2, v25.3, v25.4 +- v26.1 +- CockroachDB Cloud +- MOLT migration tools +- Releases and Advisories + +## Testing the Archive + +1. 
Run the script: + ```bash + python3 create_full_archive.py + ``` + +2. Open in browser: + ```bash + open complete_archive/index.html + ``` + +3. Verify: + - Sidebar navigation works + - Version switcher displays (without dropdown arrows) + - Links between pages work + - Images load correctly + - No console errors for missing resources + +## Archive Size + +- Uncompressed: ~4 GB +- Compressed ZIP: ~1.5 GB + +## Offline Fixes Applied + +The script applies several fixes to make the documentation work offline: + +| Fix | Description | +|-----|-------------| +| Home URL | Replaces `/` with `index.html` in sidebar JSON | +| Version arrows | Hides dropdown arrows (non-functional offline) | +| Version links | Converts to relative paths (`../v25.3/page.html`) | +| Version switcher | Preserves visibility (not removed by JS/CSS) | +| Asset paths | Converts absolute `/docs/...` to relative `../...` | +| Google Fonts | Downloads and localizes font files | +| jQuery | Uses local copy instead of CDN | +| macOS artifacts | Removes `__MACOSX` directories, `.DS_Store`, and `._*` files | diff --git a/src/current/create_full_archive.py b/src/current/create_full_archive.py new file mode 100644 index 00000000000..a5de7a2c7fc --- /dev/null +++ b/src/current/create_full_archive.py @@ -0,0 +1,777 @@ +#!/usr/bin/env python3 +""" +Full Website Archive Creator for CockroachDB Documentation + +Creates a complete offline archive from _site/docs that can be opened locally +in a browser. Matches the structure and patterns of complete_test_archive_latest.zip. 
+ +Usage: + python3 create_full_archive.py + +Output: + - Creates 'complete_archive/' directory with the full offline site + - Optionally creates 'complete_archive.zip' for distribution +""" +import re +import shutil +import requests +import os +import sys +from pathlib import Path +from urllib.parse import urlparse +from bs4 import BeautifulSoup +import json +from datetime import datetime +import argparse + +# Configuration +SCRIPT_DIR = Path(__file__).parent +SITE_DIR = SCRIPT_DIR / "_site" / "docs" +OUTPUT_DIR = SCRIPT_DIR / "complete_archive" + +# All versions to include in the archive +ALL_VERSIONS = [ + "v23.1", "v23.2", "v24.1", "v24.2", "v24.3", + "v25.1", "v25.2", "v25.3", "v25.4", "v26.1" +] + +# Current stable version (update as needed) +STABLE_VERSION = "v25.3" + +# Directories to copy entirely +ASSET_DIRS = ["css", "js", "images", "fonts", "_internal"] + +# Top-level content directories to include +CONTENT_DIRS = ["cockroachcloud", "releases", "advisories", "molt", "stable"] + +# Top-level HTML files to include +TOP_LEVEL_FILES = ["index.html", "404.html", "search.html"] + +# Google Fonts URL for offline download +FONTS_CSS_URL = ( + "https://fonts.googleapis.com/css2?" 
+ "family=Poppins:wght@400;600&" + "family=Source+Code+Pro&" + "family=Source+Sans+Pro:wght@300;400;600;700&" + "display=swap" +) + + +class FullArchiveCreator: + def __init__(self, site_dir=None, output_dir=None, stable_version=None): + self.site_dir = Path(site_dir) if site_dir else SITE_DIR + self.output_dir = Path(output_dir) if output_dir else OUTPUT_DIR + self.stable_version = stable_version or STABLE_VERSION + self.processed_files = 0 + self.error_count = 0 + + def log(self, message, level="INFO"): + """Log a message with timestamp and level""" + timestamp = datetime.now().strftime("%H:%M:%S") + prefix = { + "INFO": "[*]", + "SUCCESS": "[+]", + "WARNING": "[!]", + "ERROR": "[-]", + "DEBUG": "[.]" + }.get(level, "[*]") + print(f"[{timestamp}] {prefix} {message}") + + def clean_output_dir(self): + """Remove existing output directory if it exists""" + if self.output_dir.exists(): + self.log(f"Removing existing {self.output_dir}") + shutil.rmtree(self.output_dir) + self.output_dir.mkdir(parents=True) + self.log(f"Created output directory: {self.output_dir}", "SUCCESS") + + def copy_asset_dirs(self): + """Copy all asset directories (css, js, images, fonts, _internal)""" + self.log("Copying asset directories...") + for asset_dir in ASSET_DIRS: + src = self.site_dir / asset_dir + if src.exists(): + dst = self.output_dir / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f" Copied {asset_dir}/", "SUCCESS") + else: + self.log(f" {asset_dir}/ not found, skipping", "WARNING") + + def copy_tree_ignore_broken_symlinks(self, src, dst): + """Copy directory tree, ignoring broken/circular symlinks""" + dst.mkdir(parents=True, exist_ok=True) + + for item in src.iterdir(): + src_item = src / item.name + dst_item = dst / item.name + + try: + if src_item.is_symlink(): + # Check if symlink is valid + try: + src_item.resolve(strict=True) + # Valid symlink - copy target or recreate link + target = os.readlink(src_item) + # Skip circular symlinks (symlink 
pointing to parent dir) + if item.name in target or target.startswith('..'): + continue + os.symlink(target, dst_item) + except (OSError, RuntimeError): + # Broken/circular symlink - skip it + continue + elif src_item.is_dir(): + self.copy_tree_ignore_broken_symlinks(src_item, dst_item) + elif src_item.is_file(): + shutil.copy2(src_item, dst_item) + except Exception as e: + # Skip any problematic items + continue + + def copy_version_dirs(self): + """Copy all version directories with their content""" + self.log("Copying version directories...") + copied_count = 0 + for version in ALL_VERSIONS: + src = self.site_dir / version + if src.exists(): + dst = self.output_dir / version + self.copy_tree_ignore_broken_symlinks(src, dst) + file_count = len(list(dst.rglob("*.html"))) + self.log(f" Copied {version}/ ({file_count} HTML files)", "SUCCESS") + copied_count += 1 + else: + self.log(f" {version}/ not found, skipping", "WARNING") + self.log(f"Copied {copied_count} version directories", "SUCCESS") + + def copy_content_dirs(self): + """Copy content directories (cockroachcloud, releases, advisories, etc.)""" + self.log("Copying content directories...") + for content_dir in CONTENT_DIRS: + src = self.site_dir / content_dir + if src.exists() or src.is_symlink(): + dst = self.output_dir / content_dir + try: + if src.is_symlink(): + # Resolve the symlink and copy the actual content + # (zip files don't handle symlinks well across platforms) + target = os.readlink(src) + resolved_src = self.site_dir / target + if resolved_src.exists(): + self.copy_tree_ignore_broken_symlinks(resolved_src, dst) + self.log(f" Copied {content_dir}/ (from {target})", "SUCCESS") + else: + self.log(f" {content_dir} symlink target {target} not found", "WARNING") + else: + self.copy_tree_ignore_broken_symlinks(src, dst) + self.log(f" Copied {content_dir}/", "SUCCESS") + except Exception as e: + self.log(f" Error copying {content_dir}: {e}", "WARNING") + else: + self.log(f" {content_dir}/ not found, 
skipping", "WARNING") + + def copy_top_level_files(self): + """Copy top-level HTML files""" + self.log("Copying top-level files...") + for filename in TOP_LEVEL_FILES: + src = self.site_dir / filename + if src.exists(): + dst = self.output_dir / filename + shutil.copy2(src, dst) + self.log(f" Copied {filename}", "SUCCESS") + else: + self.log(f" {filename} not found, skipping", "WARNING") + + def get_file_depth(self, file_path): + """Calculate the depth of a file relative to output directory""" + try: + rel_path = file_path.relative_to(self.output_dir) + return len(rel_path.parent.parts) + except ValueError: + return 0 + + def get_relative_prefix(self, file_path): + """Get the relative path prefix (../) for a file""" + depth = self.get_file_depth(file_path) + return "../" * depth if depth > 0 else "" + + def fix_docs_home_url(self, content): + """Fix 1: Replace '/' with 'index.html' in sidebar nav JSON""" + content = re.sub( + r'"urls":\s*\[\s*"/"\s*\]', + '"urls": ["index.html"]', + content + ) + return content + + def get_arrow_fix_css(self): + """Fix 2: CSS to hide ALL arrows from version-switcher""" + return ''' +/* Remove ALL arrows from version-switcher */ +#version-switcher .arrow, +#version-switcher .tier-1 .arrow, +#version-switcher .tier-1 a .arrow { + display: none !important; + visibility: hidden !important; +} +#version-switcher .nav li > a > span:after, +#version-switcher .nav li > a > span::after, +#version-switcher li > a > span:after, +#version-switcher li > a > span::after, +#version-switcher .tier-1 > a > span:after, +#version-switcher .tier-1 > a > span::after, +#version-switcher .version-name:after, +#version-switcher .version-name::after { + content: none !important; + display: none !important; +} +''' + + def fix_version_links(self, content, file_depth): + """Fix 3: Fix version switcher links to use correct relative paths""" + prefix = "../" * file_depth if file_depth > 0 else "" + + def fix_version_link(match): + full_match = match.group(0) + 
href = match.group(1) + classes = match.group(2) + + # Extract version and page from href + version_match = re.match(r'^(?:\.\./)*(?:/docs/)?(v\d+\.\d+)/(.+)$', href) + if version_match: + version = version_match.group(1) + page = version_match.group(2) + if not page.endswith('.html'): + page = page + '.html' + return f'href="{prefix}{version}/{page}"{classes}' + + # No version directory - check if it's a relative path without version + clean_href = re.sub(r'^(?:\.\./)+', '', href) + if clean_href and not clean_href.startswith(('http', '#', 'mailto', 'javascript')): + if not clean_href.endswith('.html') and '.' not in clean_href.split('/')[-1]: + clean_href = clean_href + '.html' + return f'href="{prefix}{clean_href}"{classes}' + + return full_match + + # Fix version--mobile and version--desktop links + content = re.sub( + r'href="([^"]+)"(\s+[^>]*class="[^"]*version--(?:mobile|desktop)[^"]*")', + fix_version_link, + content + ) + + return content + + def preserve_version_switcher(self, content): + """Fix 4: Preserve version-switcher (don't remove via JS or CSS)""" + # Change JS to only remove feedback-widget, not version-switcher + content = re.sub( + r"\$\('\.version-switcher,\s*#version-switcher,\s*\.feedback-widget'\)\.remove\(\);", + "$('.feedback-widget').remove(); // version-switcher preserved for offline", + content + ) + + # Change CSS to only hide feedback-widget, not version-switcher + content = re.sub( + r'\.version-switcher,\s*#version-switcher,\s*\.feedback-widget,', + '.feedback-widget,', + content + ) + + return content + + def fix_asset_paths(self, content, prefix): + """Fix 5: Convert absolute asset paths to relative""" + # Fix /docs/images/ paths + content = re.sub( + r'content:\s*url\(/docs/images/([^)]+)\)', + f'content:url({prefix}images/\\1)', + content + ) + + # Fix other absolute image paths in CSS + content = re.sub( + r'url\(["\']?/docs/images/([^)"\']+)["\']?\)', + f'url({prefix}images/\\1)', + content + ) + + return content + + def 
get_offline_styles(self, prefix): + """Get CSS styles for offline mode - comprehensive version matching reference archive""" + arrow_fix = self.get_arrow_fix_css() + return f'''''' + + def get_nav_dependencies(self, prefix): + """Get navigation JS dependencies to inject""" + return f''' + + +''' + + def get_nav_init_script(self): + """Get navigation initialization script""" + return '''''' + + def process_html_file(self, file_path): + """Process a single HTML file with all offline fixes""" + try: + content = file_path.read_text(encoding='utf-8') + prefix = self.get_relative_prefix(file_path) + depth = self.get_file_depth(file_path) + + # Apply all fixes + content = self.fix_docs_home_url(content) + content = self.fix_version_links(content, depth) + content = self.preserve_version_switcher(content) + content = self.fix_asset_paths(content, prefix) + + # Fix various path patterns + # Remove /docs/ prefix from paths + content = re.sub(r'(href|src)="/docs/([^"]+)"', r'\1="\2"', content) + content = re.sub(r'(href|src)="docs/([^"]+)"', r'\1="\2"', content) + # Fix bare /docs/ link (navbar brand) - match reference archive pattern + content = re.sub(r'href="/docs/"', 'href=""', content) + content = re.sub(r'href="/docs"', 'href=""', content) + # Fix action attributes for search forms + content = re.sub(r'action="/docs/search"', f'action="{prefix}search.html"', content) + + # Fix stable -> actual stable version + content = re.sub(r'(href|src)="stable/', f'\\1="{self.stable_version}/', content) + content = re.sub(r'(href|src)="/stable/', f'\\1="{self.stable_version}/', content) + + # Fix absolute paths to relative for assets + for asset in ASSET_DIRS: + content = re.sub( + rf'(src|href)="/{asset}/([^"]+)"', + rf'\1="{prefix}{asset}/\2"', + content + ) + content = re.sub( + rf'(src|href)="{asset}/([^"]+)"', + rf'\1="{prefix}{asset}/\2"', + content + ) + + # Replace Google Fonts CDN with local + content = re.sub( + r']+fonts\.googleapis\.com[^>]+>', + f'', + content + ) + + # 
Fix 1 (additional): CSS @import paths - convert absolute to relative + content = re.sub( + r'@import url\(["\']?/docs/([^)"\']+)["\']?\)', + f'@import url({prefix}\\1)', + content + ) + content = re.sub( + r'@import url\(["\']?docs/([^)"\']+)["\']?\)', + f'@import url({prefix}\\1)', + content + ) + + # Fix 3: Clean up double-slash paths and remaining /docs/ references + content = re.sub(r'(href|src)="\.\.//docs/', r'\1="../', content) + content = re.sub(r'(href|src)="\.\.//+', r'\1="../', content) + content = re.sub(r'(href|src)="//+([^/])', r'\1="\2', content) + + # Fix 4: Strip query parameters from local image URLs + content = re.sub( + r'(src="[^"?]+\.(png|jpg|jpeg|svg|gif|webp))\?[^"]*"', + r'\1"', + content + ) + content = re.sub( + r'(href="[^"?]+\.(png|jpg|jpeg|svg|gif|webp|css))\?[^"]*"', + r'\1"', + content + ) + + # Fix 6: Replace CDN jQuery with local version + content = re.sub( + r']+cdn\.jsdelivr\.net[^>]+jquery[^>]*>', + f'', + content + ) + + # Inject navigation dependencies before + nav_deps = self.get_nav_dependencies(prefix) + offline_styles = self.get_offline_styles(prefix) + content = re.sub( + r'', + f'{nav_deps}\n{offline_styles}\n', + content, + flags=re.IGNORECASE + ) + + # Inject navigation init script before + nav_init = self.get_nav_init_script() + content = re.sub( + r'', + f'{nav_init}\n', + content, + flags=re.IGNORECASE + ) + + # Final cleanup: normalize any remaining double slashes in paths (but not in URLs) + content = re.sub(r'(href|src)="([^"]*[^:])//+([^"]*)"', r'\1="\2/\3"', content) + + # Write processed content + file_path.write_text(content, encoding='utf-8') + self.processed_files += 1 + + except Exception as e: + self.log(f"Error processing {file_path}: {e}", "ERROR") + self.error_count += 1 + + def process_all_html_files(self): + """Process all HTML files in the output directory""" + self.log("Processing HTML files...") + html_files = list(self.output_dir.rglob("*.html")) + total = len(html_files) + + for i, file_path in 
enumerate(html_files, 1): + if i % 100 == 0: + self.log(f" Progress: {i}/{total} ({i*100//total}%)") + self.process_html_file(file_path) + + self.log(f"Processed {self.processed_files} HTML files, {self.error_count} errors", "SUCCESS") + + def download_google_fonts(self): + """Download and localize Google Fonts""" + self.log("Downloading Google Fonts...") + + fonts_dir = self.output_dir / "fonts" + fonts_dir.mkdir(exist_ok=True) + css_dir = self.output_dir / "css" + css_dir.mkdir(exist_ok=True) + + try: + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} + css_response = requests.get(FONTS_CSS_URL, headers=headers, timeout=10) + css_response.raise_for_status() + css_content = css_response.text + + # Find all font URLs + font_urls = set(re.findall(r"url\((https://fonts\.gstatic\.com/[^\)]+)\)", css_content)) + + for url in font_urls: + try: + font_response = requests.get(url, headers=headers, timeout=10) + font_response.raise_for_status() + + parsed = urlparse(url) + font_path = parsed.path.lstrip("/") + dst = fonts_dir / font_path + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_bytes(font_response.content) + + css_content = css_content.replace(url, f"../fonts/{font_path}") + + except Exception as e: + self.log(f" Failed to download font: {e}", "WARNING") + + (css_dir / "google-fonts.css").write_text(css_content, encoding="utf-8") + self.log("Google Fonts localized", "SUCCESS") + + except Exception as e: + self.log(f"Error downloading fonts: {e}", "ERROR") + # Create fallback CSS + fallback = '''/* Fallback fonts */ +body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif; } +code, pre { font-family: Consolas, Monaco, "Courier New", monospace; }''' + (css_dir / "google-fonts.css").write_text(fallback) + + def ensure_nav_assets(self): + """Ensure required navigation JS assets exist""" + self.log("Ensuring navigation assets...") + + js_dir = self.output_dir / "js" + 
js_dir.mkdir(exist_ok=True) + css_dir = self.output_dir / "css" + css_dir.mkdir(exist_ok=True) + + assets = [ + ("jquery.min.js", "https://code.jquery.com/jquery-3.6.3.min.js", js_dir), + ("jquery.cookie.min.js", "https://cdnjs.cloudflare.com/ajax/libs/jquery-cookie/1.4.1/jquery.cookie.min.js", js_dir), + ("jquery.navgoco.min.js", "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.js", js_dir), + ("jquery.navgoco.css", "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.css", css_dir), + ] + + for name, url, dest_dir in assets: + dest_file = dest_dir / name + if not dest_file.exists(): + try: + self.log(f" Downloading {name}...") + response = requests.get(url, timeout=10) + response.raise_for_status() + dest_file.write_bytes(response.content) + self.log(f" Downloaded {name}", "SUCCESS") + except Exception as e: + self.log(f" Failed to download {name}: {e}", "ERROR") + else: + self.log(f" {name} already exists", "SUCCESS") + + def add_archived_banner(self): + """Add archived version banner to index.html""" + self.log("Adding archived banner to index.html...") + + index_path = self.output_dir / "index.html" + if not index_path.exists(): + self.log("index.html not found", "WARNING") + return + + content = index_path.read_text(encoding='utf-8') + + banner_css = '''''' + + banner_html = '''
+

+This is an archived version of the CockroachDB documentation. +View the latest documentation +

+
''' + + content = content.replace('', banner_css + '\n') + content = content.replace('', '\n' + banner_html) + + index_path.write_text(content, encoding='utf-8') + self.log("Added archived banner", "SUCCESS") + + def clean_macos_artifacts(self): + """Remove macOS artifacts from the archive""" + self.log("Cleaning macOS artifacts...") + + # Remove __MACOSX directories + for macosx_dir in self.output_dir.rglob("__MACOSX"): + shutil.rmtree(macosx_dir) + self.log(f" Removed {macosx_dir}", "SUCCESS") + + # Remove .DS_Store files + for ds_store in self.output_dir.rglob(".DS_Store"): + ds_store.unlink() + + # Remove ._ files + for dot_file in self.output_dir.rglob("._*"): + dot_file.unlink() + + self.log("Cleaned macOS artifacts", "SUCCESS") + + def create_zip_archive(self): + """Create a zip archive of the output directory""" + self.log("Creating zip archive...") + + zip_path = self.output_dir.with_suffix('.zip') + if zip_path.exists(): + zip_path.unlink() + + # Use shutil to create the zip + shutil.make_archive( + str(self.output_dir), + 'zip', + self.output_dir.parent, + self.output_dir.name + ) + + # Get zip size + zip_size = zip_path.stat().st_size / (1024 * 1024) + self.log(f"Created {zip_path.name} ({zip_size:.1f} MB)", "SUCCESS") + + def build(self, create_zip=False): + """Main build process""" + print("\n" + "=" * 60) + print("COCKROACHDB FULL WEBSITE ARCHIVE CREATOR") + print("=" * 60) + + self.log(f"Source: {self.site_dir}") + self.log(f"Output: {self.output_dir}") + self.log(f"Stable version: {self.stable_version}") + self.log(f"Versions: {', '.join(ALL_VERSIONS)}") + + if not self.site_dir.exists(): + self.log(f"Source directory not found: {self.site_dir}", "ERROR") + self.log("Run 'jekyll build' first to generate _site", "ERROR") + return False + + # Build steps + self.clean_output_dir() + self.copy_asset_dirs() + self.copy_version_dirs() + self.copy_content_dirs() + self.copy_top_level_files() + self.ensure_nav_assets() + self.download_google_fonts() + 
self.process_all_html_files() + self.add_archived_banner() + self.clean_macos_artifacts() + + if create_zip: + self.create_zip_archive() + + # Summary + print("\n" + "=" * 60) + self.log("ARCHIVE CREATED SUCCESSFULLY!", "SUCCESS") + self.log(f"Output: {self.output_dir}") + self.log(f"Files processed: {self.processed_files}") + self.log(f"Errors: {self.error_count}") + print("=" * 60) + print(f"\nTo test: open {self.output_dir}/index.html in your browser") + + return True + + +def main(): + parser = argparse.ArgumentParser(description="Create full website archive from _site") + parser.add_argument("--site-dir", help="Path to _site/docs directory") + parser.add_argument("--output-dir", help="Output directory for archive") + parser.add_argument("--stable-version", default=STABLE_VERSION, help="Stable version (default: v25.3)") + parser.add_argument("--zip", action="store_true", help="Create zip archive") + args = parser.parse_args() + + creator = FullArchiveCreator( + site_dir=args.site_dir, + output_dir=args.output_dir, + stable_version=args.stable_version + ) + + success = creator.build(create_zip=args.zip) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() From b2d83df6114b230c6f63e0aba28f948485072270 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Fri, 20 Feb 2026 23:39:13 +0530 Subject: [PATCH 2/4] Address review comments on create_full_archive.py - Replace blind regex HTML surgery with BeautifulSoup for safe, structured href/src rewriting (addresses reviewer concern about brittle regex manipulation) - Add _make_session() with urllib3 Retry for robust external downloads with backoff on 429/5xx responses (addresses reviewer concern about flaky downloads) - Fix copy_tree_ignore_broken_symlinks() to copy resolved symlink targets instead of recreating symlinks, making archives portable across platforms - Add _rewrite_url() helper to centralize /docs/ to relative path conversion, including /docs/ to ../index.html for Docs Home links - Rewrite 
add_archived_banner() with BeautifulSoup instead of string replace - Patch renderSidebar baseUrl: "/docs" to "" and replace href computation with archive-root-aware function using known subdirs detection so sidebar links work in file:// context regardless of where the archive is placed on disk --- src/current/create_full_archive.py | 406 ++++++++++++++--------------- 1 file changed, 197 insertions(+), 209 deletions(-) diff --git a/src/current/create_full_archive.py b/src/current/create_full_archive.py index a5de7a2c7fc..6efa3c944de 100644 --- a/src/current/create_full_archive.py +++ b/src/current/create_full_archive.py @@ -23,6 +23,8 @@ import json from datetime import datetime import argparse +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry # Configuration SCRIPT_DIR = Path(__file__).parent @@ -36,7 +38,7 @@ ] # Current stable version (update as needed) -STABLE_VERSION = "v25.3" +STABLE_VERSION = "v26.1" # Directories to copy entirely ASSET_DIRS = ["css", "js", "images", "fonts", "_internal"] @@ -77,6 +79,19 @@ def log(self, message, level="INFO"): }.get(level, "[*]") print(f"[{timestamp}] {prefix} {message}") + @staticmethod + def _make_session(retries=3, backoff=0.5): + """Create a requests Session with automatic retry and exponential backoff.""" + session = requests.Session() + retry = Retry( + total=retries, + backoff_factor=backoff, + status_forcelist=[429, 500, 502, 503, 504], + ) + session.mount("https://", HTTPAdapter(max_retries=retry)) + session.mount("http://", HTTPAdapter(max_retries=retry)) + return session + def clean_output_dir(self): """Remove existing output directory if it exists""" if self.output_dir.exists(): @@ -98,7 +113,7 @@ def copy_asset_dirs(self): self.log(f" {asset_dir}/ not found, skipping", "WARNING") def copy_tree_ignore_broken_symlinks(self, src, dst): - """Copy directory tree, ignoring broken/circular symlinks""" + """Copy directory tree, resolving symlinks to their targets (portable across 
platforms).""" dst.mkdir(parents=True, exist_ok=True) for item in src.iterdir(): @@ -107,24 +122,20 @@ def copy_tree_ignore_broken_symlinks(self, src, dst): try: if src_item.is_symlink(): - # Check if symlink is valid try: - src_item.resolve(strict=True) - # Valid symlink - copy target or recreate link - target = os.readlink(src_item) - # Skip circular symlinks (symlink pointing to parent dir) - if item.name in target or target.startswith('..'): - continue - os.symlink(target, dst_item) + resolved = src_item.resolve(strict=True) except (OSError, RuntimeError): - # Broken/circular symlink - skip it + # Broken or circular symlink — skip continue + if resolved.is_dir(): + self.copy_tree_ignore_broken_symlinks(resolved, dst_item) + else: + shutil.copy2(resolved, dst_item) elif src_item.is_dir(): self.copy_tree_ignore_broken_symlinks(src_item, dst_item) elif src_item.is_file(): shutil.copy2(src_item, dst_item) - except Exception as e: - # Skip any problematic items + except Exception: continue def copy_version_dirs(self): @@ -152,15 +163,13 @@ def copy_content_dirs(self): dst = self.output_dir / content_dir try: if src.is_symlink(): - # Resolve the symlink and copy the actual content - # (zip files don't handle symlinks well across platforms) - target = os.readlink(src) - resolved_src = self.site_dir / target - if resolved_src.exists(): - self.copy_tree_ignore_broken_symlinks(resolved_src, dst) - self.log(f" Copied {content_dir}/ (from {target})", "SUCCESS") - else: - self.log(f" {content_dir} symlink target {target} not found", "WARNING") + try: + resolved = src.resolve(strict=True) + except (OSError, RuntimeError): + self.log(f" {content_dir} symlink is broken, skipping", "WARNING") + continue + self.copy_tree_ignore_broken_symlinks(resolved, dst) + self.log(f" Copied {content_dir}/ (resolved symlink)", "SUCCESS") else: self.copy_tree_ignore_broken_symlinks(src, dst) self.log(f" Copied {content_dir}/", "SUCCESS") @@ -194,11 +203,56 @@ def get_relative_prefix(self, 
file_path): depth = self.get_file_depth(file_path) return "../" * depth if depth > 0 else "" - def fix_docs_home_url(self, content): - """Fix 1: Replace '/' with 'index.html' in sidebar nav JSON""" + def _rewrite_url(self, url, prefix): + """Rewrite a URL for offline use: strip /docs/ prefix and make it relative.""" + if not url or re.match(r'^(https?://|mailto:|javascript:|data:|#)', url): + return url + + # Strip /docs/ prefix + if url.startswith('/docs/'): + url = url[6:] + if not url: # was '/docs/' exactly → root index + url = 'index.html' + elif url in ('/docs', '/docs/'): + return f'{prefix}index.html' + + # /stable/ → actual stable version + url = re.sub(r'^/?stable/', f'{self.stable_version}/', url) + + # Strip remaining leading slash + url = url.lstrip('/') + + # Strip query strings from static asset URLs + url = re.sub( + r'^([^?]+\.(png|jpg|jpeg|svg|gif|webp|woff2?|ttf|eot|css|js))\?.*$', + r'\1', url + ) + + # Add .html extension to version page paths that lack an extension + for version in ALL_VERSIONS: + if url.startswith(f'{version}/'): + page = url[len(f'{version}/'):] + last_seg = page.split('/')[-1] + if last_seg and '.' 
not in last_seg: + url = f'{version}/{page}.html' + break + + # Prefix with relative path depth (skip if already relative) + if url and not url.startswith(('../', './')): + url = f'{prefix}{url}' + + return url + + def fix_asset_paths(self, content, prefix): + """Fix CSS url() expressions: convert absolute /docs/images/ paths to relative.""" content = re.sub( - r'"urls":\s*\[\s*"/"\s*\]', - '"urls": ["index.html"]', + r'content:\s*url\(/docs/images/([^)]+)\)', + f'content:url({prefix}images/\\1)', + content + ) + content = re.sub( + r'url\(["\']?/docs/images/([^)"\']+)["\']?\)', + f'url({prefix}images/\\1)', content ) return content @@ -226,78 +280,6 @@ def get_arrow_fix_css(self): } ''' - def fix_version_links(self, content, file_depth): - """Fix 3: Fix version switcher links to use correct relative paths""" - prefix = "../" * file_depth if file_depth > 0 else "" - - def fix_version_link(match): - full_match = match.group(0) - href = match.group(1) - classes = match.group(2) - - # Extract version and page from href - version_match = re.match(r'^(?:\.\./)*(?:/docs/)?(v\d+\.\d+)/(.+)$', href) - if version_match: - version = version_match.group(1) - page = version_match.group(2) - if not page.endswith('.html'): - page = page + '.html' - return f'href="{prefix}{version}/{page}"{classes}' - - # No version directory - check if it's a relative path without version - clean_href = re.sub(r'^(?:\.\./)+', '', href) - if clean_href and not clean_href.startswith(('http', '#', 'mailto', 'javascript')): - if not clean_href.endswith('.html') and '.' 
not in clean_href.split('/')[-1]: - clean_href = clean_href + '.html' - return f'href="{prefix}{clean_href}"{classes}' - - return full_match - - # Fix version--mobile and version--desktop links - content = re.sub( - r'href="([^"]+)"(\s+[^>]*class="[^"]*version--(?:mobile|desktop)[^"]*")', - fix_version_link, - content - ) - - return content - - def preserve_version_switcher(self, content): - """Fix 4: Preserve version-switcher (don't remove via JS or CSS)""" - # Change JS to only remove feedback-widget, not version-switcher - content = re.sub( - r"\$\('\.version-switcher,\s*#version-switcher,\s*\.feedback-widget'\)\.remove\(\);", - "$('.feedback-widget').remove(); // version-switcher preserved for offline", - content - ) - - # Change CSS to only hide feedback-widget, not version-switcher - content = re.sub( - r'\.version-switcher,\s*#version-switcher,\s*\.feedback-widget,', - '.feedback-widget,', - content - ) - - return content - - def fix_asset_paths(self, content, prefix): - """Fix 5: Convert absolute asset paths to relative""" - # Fix /docs/images/ paths - content = re.sub( - r'content:\s*url\(/docs/images/([^)]+)\)', - f'content:url({prefix}images/\\1)', - content - ) - - # Fix other absolute image paths in CSS - content = re.sub( - r'url\(["\']?/docs/images/([^)"\']+)["\']?\)', - f'url({prefix}images/\\1)', - content - ) - - return content - def get_offline_styles(self, prefix): """Get CSS styles for offline mode - comprehensive version matching reference archive""" arrow_fix = self.get_arrow_fix_css() @@ -419,112 +401,114 @@ def get_nav_init_script(self): ''' def process_html_file(self, file_path): - """Process a single HTML file with all offline fixes""" + """Process a single HTML file for offline use via BeautifulSoup.""" try: content = file_path.read_text(encoding='utf-8') prefix = self.get_relative_prefix(file_path) - depth = self.get_file_depth(file_path) - - # Apply all fixes - content = self.fix_docs_home_url(content) - content = 
self.fix_version_links(content, depth) - content = self.preserve_version_switcher(content) - content = self.fix_asset_paths(content, prefix) - - # Fix various path patterns - # Remove /docs/ prefix from paths - content = re.sub(r'(href|src)="/docs/([^"]+)"', r'\1="\2"', content) - content = re.sub(r'(href|src)="docs/([^"]+)"', r'\1="\2"', content) - # Fix bare /docs/ link (navbar brand) - match reference archive pattern - content = re.sub(r'href="/docs/"', 'href=""', content) - content = re.sub(r'href="/docs"', 'href=""', content) - # Fix action attributes for search forms - content = re.sub(r'action="/docs/search"', f'action="{prefix}search.html"', content) - - # Fix stable -> actual stable version - content = re.sub(r'(href|src)="stable/', f'\\1="{self.stable_version}/', content) - content = re.sub(r'(href|src)="/stable/', f'\\1="{self.stable_version}/', content) - - # Fix absolute paths to relative for assets - for asset in ASSET_DIRS: - content = re.sub( - rf'(src|href)="/{asset}/([^"]+)"', - rf'\1="{prefix}{asset}/\2"', - content + + soup = BeautifulSoup(content, 'html.parser') + + # Rewrite href attributes on all tags + for tag in soup.find_all(href=True): + tag['href'] = self._rewrite_url(tag['href'], prefix) + + # Rewrite src attributes on all tags + for tag in soup.find_all(src=True): + tag['src'] = self._rewrite_url(tag['src'], prefix) + + # Fix form actions for search + for form in soup.find_all('form', action=True): + if '/search' in form['action']: + form['action'] = f'{prefix}search.html' + + # Localize Google Fonts link tags + for link in soup.find_all('link', rel='stylesheet'): + if 'fonts.googleapis.com' in link.get('href', ''): + link['href'] = f'{prefix}css/google-fonts.css' + + # Localize CDN jQuery script tags + for script in soup.find_all('script', src=True): + src = script.get('src', '') + if 'cdn.jsdelivr.net' in src and 'jquery' in src.lower(): + script['src'] = f'{prefix}js/jquery.min.js' + + # Fix inline script content (version-switcher, 
nav JSON, archive root detection) + for script in soup.find_all('script'): + txt = script.string + if not txt: + continue + # Preserve version-switcher: only remove feedback-widget + if '.version-switcher' in txt and 'feedback-widget' in txt: + txt = txt.replace( + "$('.version-switcher, #version-switcher, .feedback-widget').remove();", + "$('.feedback-widget').remove();" + ) + # Fix sidebar baseUrl: Jekyll sets it to "/docs", making every sidebar link + # resolve to file:///docs/... in a local file:// archive. + # Fix: set baseUrl to "" and patch the href generation to use archive-root + # detection instead of counting from the filesystem root. + if 'baseUrl: "/docs"' in txt: + txt = txt.replace('baseUrl: "/docs"', 'baseUrl: ""') + # Replace .attr("href", ...) with archive-root-aware relative path computation. + # Detects the archive root by finding a path component followed by a known + # archive subdirectory, then computes depth only relative to that root — + # works correctly regardless of where the archive is placed on disk. 
+ txt = txt.replace( + '.attr("href", urls[0] || "#")', + '.attr("href", (function(u){' + 'if(!u||u==="#"||/^https?:/.test(u))return u||"#";' + 'if(!u.startsWith("/"))return u;' + 'var pp=window.location.pathname.split("/");' + 'var idx=-1;' + 'var kd=["cockroachcloud","molt","releases","advisories","stable","_internal","css","js","images","fonts"];' + 'for(var i=0;i tags (safe: CSS content, not HTML structure) + for style in soup.find_all('style'): + if style.string: + style.string = self.fix_asset_paths(style.string, prefix) + + # Fix CSS url() in inline style attributes + for tag in soup.find_all(style=True): + tag['style'] = self.fix_asset_paths(tag['style'], prefix) + + # Inject offline nav assets and styles into + if soup.head is not None: + soup.head.append( + BeautifulSoup(self.get_nav_dependencies(prefix), 'html.parser') ) - content = re.sub( - rf'(src|href)="{asset}/([^"]+)"', - rf'\1="{prefix}{asset}/\2"', - content + soup.head.append( + BeautifulSoup(self.get_offline_styles(prefix), 'html.parser') ) - # Replace Google Fonts CDN with local - content = re.sub( - r']+fonts\.googleapis\.com[^>]+>', - f'', - content - ) - - # Fix 1 (additional): CSS @import paths - convert absolute to relative - content = re.sub( - r'@import url\(["\']?/docs/([^)"\']+)["\']?\)', - f'@import url({prefix}\\1)', - content - ) - content = re.sub( - r'@import url\(["\']?docs/([^)"\']+)["\']?\)', - f'@import url({prefix}\\1)', - content - ) - - # Fix 3: Clean up double-slash paths and remaining /docs/ references - content = re.sub(r'(href|src)="\.\.//docs/', r'\1="../', content) - content = re.sub(r'(href|src)="\.\.//+', r'\1="../', content) - content = re.sub(r'(href|src)="//+([^/])', r'\1="\2', content) - - # Fix 4: Strip query parameters from local image URLs - content = re.sub( - r'(src="[^"?]+\.(png|jpg|jpeg|svg|gif|webp))\?[^"]*"', - r'\1"', - content - ) - content = re.sub( - r'(href="[^"?]+\.(png|jpg|jpeg|svg|gif|webp|css))\?[^"]*"', - r'\1"', - content - ) - - # Fix 6: 
Replace CDN jQuery with local version - content = re.sub( - r']+cdn\.jsdelivr\.net[^>]+jquery[^>]*>', - f'', - content - ) - - # Inject navigation dependencies before - nav_deps = self.get_nav_dependencies(prefix) - offline_styles = self.get_offline_styles(prefix) - content = re.sub( - r'', - f'{nav_deps}\n{offline_styles}\n', - content, - flags=re.IGNORECASE - ) - - # Inject navigation init script before - nav_init = self.get_nav_init_script() - content = re.sub( - r'', - f'{nav_init}\n', - content, - flags=re.IGNORECASE - ) - - # Final cleanup: normalize any remaining double slashes in paths (but not in URLs) - content = re.sub(r'(href|src)="([^"]*[^:])//+([^"]*)"', r'\1="\2/\3"', content) - - # Write processed content - file_path.write_text(content, encoding='utf-8') + # Inject nav init script before + if soup.body is not None: + soup.body.append( + BeautifulSoup(self.get_nav_init_script(), 'html.parser') + ) + + file_path.write_text(str(soup), encoding='utf-8') self.processed_files += 1 except Exception as e: @@ -553,9 +537,10 @@ def download_google_fonts(self): css_dir = self.output_dir / "css" css_dir.mkdir(exist_ok=True) + session = self._make_session() try: headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} - css_response = requests.get(FONTS_CSS_URL, headers=headers, timeout=10) + css_response = session.get(FONTS_CSS_URL, headers=headers, timeout=10) css_response.raise_for_status() css_content = css_response.text @@ -564,7 +549,7 @@ def download_google_fonts(self): for url in font_urls: try: - font_response = requests.get(url, headers=headers, timeout=10) + font_response = session.get(url, headers=headers, timeout=10) font_response.raise_for_status() parsed = urlparse(url) @@ -605,12 +590,13 @@ def ensure_nav_assets(self): ("jquery.navgoco.css", "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.css", css_dir), ] + session = self._make_session() for name, url, dest_dir in assets: dest_file = 
dest_dir / name if not dest_file.exists(): try: self.log(f" Downloading {name}...") - response = requests.get(url, timeout=10) + response = session.get(url, timeout=10) response.raise_for_status() dest_file.write_bytes(response.content) self.log(f" Downloaded {name}", "SUCCESS") @@ -628,9 +614,9 @@ def add_archived_banner(self): self.log("index.html not found", "WARNING") return - content = index_path.read_text(encoding='utf-8') + soup = BeautifulSoup(index_path.read_text(encoding='utf-8'), 'html.parser') - banner_css = '''''' +''', 'html.parser') - banner_html = '''
+ banner_div = BeautifulSoup('''

This is an archived version of the CockroachDB documentation. View the latest documentation

-
''' +
''', 'html.parser') - content = content.replace('', banner_css + '\n') - content = content.replace('', '\n' + banner_html) + if soup.head is not None: + soup.head.append(banner_style) + if soup.body is not None: + soup.body.insert(0, banner_div) - index_path.write_text(content, encoding='utf-8') + index_path.write_text(str(soup), encoding='utf-8') self.log("Added archived banner", "SUCCESS") def clean_macos_artifacts(self): From 33085a4bc495803e77fa5b8988a7b77b8d84e467 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Thu, 26 Feb 2026 23:31:24 +0530 Subject: [PATCH 3/4] Add CI smoke test for create_full_archive.py - Add test_full_archive_smoke.py: builds a tiny _site/docs fixture and runs FullArchiveCreator.build() with network calls stubbed out, then verifies: output dir + index.html created, no internal /stable/ paths remain in HTML, nav assets present, and zip is created when requested - Add .github/workflows/test-full-archive.yml CI job that runs test_full_archive_smoke.py on every push/PR touching create_full_archive.py --- .github/workflows/test-full-archive.yml | 35 +++++ src/current/test_full_archive_smoke.py | 176 ++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 .github/workflows/test-full-archive.yml create mode 100644 src/current/test_full_archive_smoke.py diff --git a/.github/workflows/test-full-archive.yml b/.github/workflows/test-full-archive.yml new file mode 100644 index 00000000000..d8611b1334a --- /dev/null +++ b/.github/workflows/test-full-archive.yml @@ -0,0 +1,35 @@ +name: Smoke test full archive script + +on: + push: + branches: + - feat/offline-archive-scripts + paths: + - "src/current/create_full_archive.py" + - "src/current/test_full_archive_smoke.py" + pull_request: + paths: + - "src/current/create_full_archive.py" + - "src/current/test_full_archive_smoke.py" + +jobs: + smoke-test: + runs-on: ubuntu-latest + defaults: + run: + working-directory: src/current + + steps: + - uses: actions/checkout@v4 + + - uses: 
actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: pip install beautifulsoup4 requests + + - name: Run smoke tests + # Stubs out network calls (Google Fonts, nav asset downloads) so the + # test works fully offline using a tiny _site/docs fixture. + run: python3 test_full_archive_smoke.py diff --git a/src/current/test_full_archive_smoke.py b/src/current/test_full_archive_smoke.py new file mode 100644 index 00000000000..78a44f1bf6c --- /dev/null +++ b/src/current/test_full_archive_smoke.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +""" +Smoke test for create_full_archive.py. + +Creates a minimal _site/docs fixture, runs FullArchiveCreator.build(), and +verifies key invariants without requiring a full Jekyll build or network access. + +Run from src/current/: + python3 test_full_archive_smoke.py +""" +import re +import shutil +import sys +import tempfile +from pathlib import Path +from unittest.mock import patch + +SCRIPT_DIR = Path(__file__).parent +STABLE_VERSION = "v26.1" + +# Minimal HTML page with a /docs/-prefixed href and a Google Fonts link +FIXTURE_HTML = """\ + + + + Test Page + + + + + Overview + CockroachCloud + + +""" + +FIXTURE_CSS = "body { font-family: Arial; }" + + +def _build_fixture(site_dir: Path): + """Populate a minimal _site/docs tree.""" + version_dir = site_dir / STABLE_VERSION + version_dir.mkdir(parents=True) + (version_dir / "index.html").write_text(FIXTURE_HTML, encoding="utf-8") + + css_dir = site_dir / "css" + css_dir.mkdir() + (css_dir / "styles.css").write_text(FIXTURE_CSS, encoding="utf-8") + + js_dir = site_dir / "js" + js_dir.mkdir() + (js_dir / "jquery.min.js").write_text("/* jquery stub */", encoding="utf-8") + (js_dir / "jquery.cookie.min.js").write_text("/* cookie stub */", encoding="utf-8") + (js_dir / "jquery.navgoco.min.js").write_text("/* navgoco stub */", encoding="utf-8") + + css_navgoco = site_dir / "css" / "jquery.navgoco.css" + css_navgoco.write_text("/* navgoco css stub 
*/", encoding="utf-8") + + (site_dir / "index.html").write_text(FIXTURE_HTML, encoding="utf-8") + + +def _make_creator(site_dir: Path, output_dir: Path): + sys.path.insert(0, str(SCRIPT_DIR)) + from create_full_archive import FullArchiveCreator + sys.path.pop(0) + return FullArchiveCreator( + site_dir=str(site_dir), + output_dir=str(output_dir), + stable_version=STABLE_VERSION, + ) + + +def test_output_dir_created(site_dir, output_dir): + """Output directory is created and index.html is present.""" + creator = _make_creator(site_dir, output_dir) + + # Stub out network calls so the test works offline + with patch.object(creator, "download_google_fonts", return_value=None), \ + patch.object(creator, "ensure_nav_assets", return_value=None): + creator.build(create_zip=False) + + assert output_dir.exists(), "FAIL: output directory was not created" + assert (output_dir / "index.html").exists(), "FAIL: index.html not in output" + print(" PASS: output directory created and index.html present") + + +def test_no_stable_artifacts(site_dir, output_dir): + """No internal /stable/ path remains in processed HTML (external https:// links are allowed).""" + creator = _make_creator(site_dir, output_dir) + + with patch.object(creator, "download_google_fonts", return_value=None), \ + patch.object(creator, "ensure_nav_assets", return_value=None): + creator.build(create_zip=False) + + # Match href/src that start with a relative or absolute *internal* path containing /stable/ + # External URLs (https://) are intentional (e.g. the archived banner link) and excluded. 
+ stable_pattern = re.compile(r'(?:href|src)=["\'](?!https?://)[^"\']*?/stable/') + for html_file in output_dir.rglob("*.html"): + content = html_file.read_text(encoding="utf-8", errors="replace") + match = stable_pattern.search(content) + assert not match, ( + f"FAIL: internal /stable/ artifact found in {html_file}: {match.group()}" + ) + print(" PASS: no internal /stable/ artifacts in processed HTML") + + +def test_nav_assets_present(site_dir, output_dir): + """Navigation JS assets exist in the output directory.""" + creator = _make_creator(site_dir, output_dir) + + with patch.object(creator, "download_google_fonts", return_value=None), \ + patch.object(creator, "ensure_nav_assets", return_value=None): + creator.build(create_zip=False) + + # ensure_nav_assets is stubbed, but assets were copied from fixture + assert (output_dir / "js" / "jquery.min.js").exists(), ( + "FAIL: jquery.min.js missing from output" + ) + print(" PASS: nav assets present in output") + + +def test_zip_created(site_dir, output_dir): + """ZIP archive is created when --zip flag is used.""" + creator = _make_creator(site_dir, output_dir) + + with patch.object(creator, "download_google_fonts", return_value=None), \ + patch.object(creator, "ensure_nav_assets", return_value=None): + creator.build(create_zip=True) + + zip_path = output_dir.with_suffix(".zip") + assert zip_path.exists(), f"FAIL: expected zip at {zip_path}" + assert zip_path.stat().st_size > 0, "FAIL: zip file is empty" + print(" PASS: complete_archive.zip created and non-empty") + + +def main(): + print("Running full archive smoke tests...") + failures = [] + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + site_dir = tmp_path / "_site" / "docs" + _build_fixture(site_dir) + + tests = [ + ("output dir created + index.html present", + lambda: test_output_dir_created(site_dir, tmp_path / "out1")), + ("no /stable/ artifacts in output HTML", + lambda: test_no_stable_artifacts(site_dir, tmp_path / "out2")), + ("nav 
assets present", + lambda: test_nav_assets_present(site_dir, tmp_path / "out3")), + ("zip created", + lambda: test_zip_created(site_dir, tmp_path / "out4")), + ] + + for name, fn in tests: + try: + fn() + except AssertionError as e: + print(f" {e}") + failures.append(name) + except Exception as e: + print(f" ERROR in '{name}': {e}") + failures.append(name) + + if failures: + print(f"\nFAILED: {len(failures)} test(s): {', '.join(failures)}") + sys.exit(1) + else: + print(f"\nAll {len(tests)} smoke tests passed.") + + +if __name__ == "__main__": + main() From 3c5df413e970add4dae4f0addf628ce19e627bd0 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Sat, 23 May 2026 00:11:25 +0530 Subject: [PATCH 4/4] Add CLI flags for self-serve archive creation and operating runbook - Add --add-version and --stable CLI flags so new versions can be added without editing the script - Output directory renamed to complete_test_archive/ to match escrow format - Zip output renamed to cockroachdb-docs.zip with automatic __MACOSX exclusion and verification - Add operating.md runbook with step-by-step instructions, verification checklist, and troubleshooting guide Co-Authored-By: Claude Opus 4.6 --- src/current/create_full_archive.py | 99 ++++++++++++---- src/current/operating.md | 178 +++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+), 25 deletions(-) create mode 100644 src/current/operating.md diff --git a/src/current/create_full_archive.py b/src/current/create_full_archive.py index 6efa3c944de..ce2c4a25344 100644 --- a/src/current/create_full_archive.py +++ b/src/current/create_full_archive.py @@ -3,14 +3,22 @@ Full Website Archive Creator for CockroachDB Documentation Creates a complete offline archive from _site/docs that can be opened locally -in a browser. Matches the structure and patterns of complete_test_archive_latest.zip. +in a browser. Produces a complete_test_archive/ directory matching the +structure required for IBM Escrow delivery. 
Usage: - python3 create_full_archive.py + # Add a new version and set it as stable: + python3 create_full_archive.py --add-version v26.2 --stable v26.2 --zip + + # Use all defaults (versions and stable from script constants): + python3 create_full_archive.py --zip + + # Custom output directory: + python3 create_full_archive.py --add-version v26.2 --stable v26.2 --output-dir my_archive --zip Output: - - Creates 'complete_archive/' directory with the full offline site - - Optionally creates 'complete_archive.zip' for distribution + - Creates 'complete_test_archive/' directory with the full offline site + - With --zip, creates 'cockroachdb-docs.zip' excluding macOS artifacts """ import re import shutil @@ -29,7 +37,7 @@ # Configuration SCRIPT_DIR = Path(__file__).parent SITE_DIR = SCRIPT_DIR / "_site" / "docs" -OUTPUT_DIR = SCRIPT_DIR / "complete_archive" +OUTPUT_DIR = SCRIPT_DIR / "complete_test_archive" # All versions to include in the archive ALL_VERSIONS = [ @@ -60,10 +68,11 @@ class FullArchiveCreator: - def __init__(self, site_dir=None, output_dir=None, stable_version=None): + def __init__(self, site_dir=None, output_dir=None, stable_version=None, versions=None): self.site_dir = Path(site_dir) if site_dir else SITE_DIR self.output_dir = Path(output_dir) if output_dir else OUTPUT_DIR self.stable_version = stable_version or STABLE_VERSION + self.versions = list(versions) if versions else list(ALL_VERSIONS) self.processed_files = 0 self.error_count = 0 @@ -142,7 +151,7 @@ def copy_version_dirs(self): """Copy all version directories with their content""" self.log("Copying version directories...") copied_count = 0 - for version in ALL_VERSIONS: + for version in self.versions: src = self.site_dir / version if src.exists(): dst = self.output_dir / version @@ -229,7 +238,7 @@ def _rewrite_url(self, url, prefix): ) # Add .html extension to version page paths that lack an extension - for version in ALL_VERSIONS: + for version in self.versions: if 
url.startswith(f'{version}/'): page = url[len(f'{version}/'):] last_seg = page.split('/')[-1] @@ -681,25 +690,40 @@ def clean_macos_artifacts(self): self.log("Cleaned macOS artifacts", "SUCCESS") def create_zip_archive(self): - """Create a zip archive of the output directory""" - self.log("Creating zip archive...") + """Create cockroachdb-docs.zip excluding macOS artifacts""" + self.log("Creating zip archive (excluding macOS artifacts)...") + import subprocess - zip_path = self.output_dir.with_suffix('.zip') + zip_path = self.output_dir.parent / "cockroachdb-docs.zip" if zip_path.exists(): zip_path.unlink() - # Use shutil to create the zip - shutil.make_archive( - str(self.output_dir), - 'zip', - self.output_dir.parent, - self.output_dir.name + result = subprocess.run( + [ + "zip", "-r", "-q", str(zip_path), self.output_dir.name, + "-x", "**/__MACOSX/*", "-x", "**/.DS_Store", "-x", "**/._*", + ], + cwd=str(self.output_dir.parent), + capture_output=True, text=True, ) + if result.returncode != 0: + self.log(f"zip failed: {result.stderr}", "ERROR") + return - # Get zip size zip_size = zip_path.stat().st_size / (1024 * 1024) self.log(f"Created {zip_path.name} ({zip_size:.1f} MB)", "SUCCESS") + # Verify no macOS artifacts leaked in + check = subprocess.run( + ["unzip", "-l", str(zip_path)], + capture_output=True, text=True, + ) + macos_count = check.stdout.count("__MACOSX") + if macos_count > 0: + self.log(f"WARNING: {macos_count} __MACOSX entries found in zip!", "WARNING") + else: + self.log("Verified: 0 __MACOSX entries in zip", "SUCCESS") + def build(self, create_zip=False): """Main build process""" print("\n" + "=" * 60) @@ -709,7 +733,7 @@ def build(self, create_zip=False): self.log(f"Source: {self.site_dir}") self.log(f"Output: {self.output_dir}") self.log(f"Stable version: {self.stable_version}") - self.log(f"Versions: {', '.join(ALL_VERSIONS)}") + self.log(f"Versions: {', '.join(self.versions)}") if not self.site_dir.exists(): self.log(f"Source directory not 
found: {self.site_dir}", "ERROR") @@ -744,17 +768,42 @@ def build(self, create_zip=False): def main(): - parser = argparse.ArgumentParser(description="Create full website archive from _site") - parser.add_argument("--site-dir", help="Path to _site/docs directory") - parser.add_argument("--output-dir", help="Output directory for archive") - parser.add_argument("--stable-version", default=STABLE_VERSION, help="Stable version (default: v25.3)") - parser.add_argument("--zip", action="store_true", help="Create zip archive") + parser = argparse.ArgumentParser( + description="Create full offline archive of CockroachDB docs for IBM Escrow delivery.", + epilog="Example: python3 create_full_archive.py --add-version v26.2 --stable v26.2 --zip", + ) + parser.add_argument("--site-dir", help="Path to _site/docs directory (default: src/current/_site/docs)") + parser.add_argument("--output-dir", help="Output directory name (default: complete_test_archive)") + parser.add_argument( + "--add-version", action="append", default=[], metavar="VERSION", + help="Add a version to the archive (e.g. --add-version v26.2). 
Can be repeated.", + ) + parser.add_argument( + "--stable", dest="stable_version", default=None, + help="Set the stable version (default: highest version in the list)", + ) + parser.add_argument("--zip", action="store_true", help="Create cockroachdb-docs.zip for delivery") args = parser.parse_args() + # Build the version list: start from defaults, add any new versions + versions = list(ALL_VERSIONS) + for v in args.add_version: + if not v.startswith("v"): + v = f"v{v}" + if v not in versions: + versions.append(v) + versions.sort(key=lambda x: [int(n) for n in x.lstrip("v").split(".")]) + + # Determine stable version + stable = args.stable_version or versions[-1] + if not stable.startswith("v"): + stable = f"v{stable}" + creator = FullArchiveCreator( site_dir=args.site_dir, output_dir=args.output_dir, - stable_version=args.stable_version + stable_version=stable, + versions=versions, ) success = creator.build(create_zip=args.zip) diff --git a/src/current/operating.md b/src/current/operating.md new file mode 100644 index 00000000000..0a244779b9c --- /dev/null +++ b/src/current/operating.md @@ -0,0 +1,178 @@ +# IBM Escrow Archive - Operating Runbook + +## Overview + +IBM Escrow requires a quarterly zip of the CockroachDB documentation delivered within 14 days of each new version release. The deliverable is `cockroachdb-docs.zip` containing a `complete_test_archive/` directory with all supported doc versions, viewable offline in a browser. + +## Prerequisites + +- Python 3.12+ with `beautifulsoup4` and `requests` installed +- The Jekyll site must be built (`_site/docs/` exists with the new version) +- Internet access (script downloads Google Fonts for offline use; falls back to system fonts if offline) +- ~2 GB free disk space + +## Quick Start + +```bash +cd /path/to/docs/src/current + +# 1. 
Fetch the archive script +git fetch origin feat/offline-archive-scripts +git show "origin/feat/offline-archive-scripts:src/current/create_full_archive.py" > create_full_archive.py + +# 2. Run it with the new version +python3 create_full_archive.py --add-version v26.2 --stable v26.2 --zip + +# 3. Deliverable is ready +ls -lh cockroachdb-docs.zip +``` + +That's it. The script handles everything: copying all versions, fixing navigation for offline use, downloading fonts, cleaning macOS artifacts, and producing the final zip. + +## CLI Reference + +``` +python3 create_full_archive.py [OPTIONS] + +Options: + --add-version VERSION Add a doc version to the archive. Repeatable. + Example: --add-version v26.2 + --stable VERSION Set the stable/default version. + Defaults to the highest version in the list. + --zip Produce cockroachdb-docs.zip (required for delivery). + --site-dir PATH Path to _site/docs/ if non-standard. + --output-dir PATH Output directory name (default: complete_test_archive). +``` + +### Examples + +```bash +# Standard quarterly delivery (add new version, set as stable) +python3 create_full_archive.py --add-version v26.2 --stable v26.2 --zip + +# Add multiple new versions at once +python3 create_full_archive.py --add-version v26.2 --add-version v26.3 --stable v26.3 --zip + +# Rebuild with all defaults (no new version) +python3 create_full_archive.py --zip +``` + +## Step-by-Step Walkthrough + +### 1. Ensure the site is built + +The script reads from `src/current/_site/docs/`. Confirm the new version directory exists: + +```bash +ls _site/docs/v26.2/ | head -5 +``` + +If not built, run `make build` or `jekyll build` from the repo root first. + +### 2. Fetch the archive script + +The script lives on the `feat/offline-archive-scripts` branch: + +```bash +git fetch origin feat/offline-archive-scripts +git show "origin/feat/offline-archive-scripts:src/current/create_full_archive.py" > create_full_archive.py +``` + +### 3. 
Run the archive creator + +```bash +python3 create_full_archive.py --add-version v26.2 --stable v26.2 --zip +``` + +The script runs these steps automatically: +1. Creates `complete_test_archive/` directory +2. Copies asset directories (css, js, images, fonts, _internal) +3. Copies all version directories (v23.1 through the new version) +4. Copies content directories (cockroachcloud, releases, advisories, molt, stable) +5. Copies top-level HTML files +6. Downloads jQuery and navgoco for offline navigation +7. Downloads and localizes Google Fonts +8. Processes all HTML files (rewrites URLs for offline use) +9. Adds "archived version" banner to index.html +10. Cleans macOS artifacts (__MACOSX, .DS_Store, ._ files) +11. Creates `cockroachdb-docs.zip` excluding macOS artifacts +12. Verifies 0 __MACOSX entries in the zip + +### 4. Verify the archive + +```bash +# Quick check: no macOS artifacts +unzip -l cockroachdb-docs.zip | grep -c __MACOSX # should be 0 + +# Open in browser +mkdir -p /tmp/verify-archive +unzip -q cockroachdb-docs.zip -d /tmp/verify-archive +open /tmp/verify-archive/complete_test_archive/index.html +``` + +Browser verification checklist: +- [ ] Home page loads with sidebar and archived banner +- [ ] Sidebar navigation expands/collapses +- [ ] New version pages load (e.g., v26.2/install-cockroachdb.html) +- [ ] Older version pages load (e.g., v24.3/install-cockroachdb.html) +- [ ] CSS, fonts, and images render correctly +- [ ] No search bar or AI widgets visible (they are online-only) +- [ ] Console (Cmd+Option+J) shows no 404 errors for archive resources + +Expected benign console errors (not issues): +- `file: URLs are treated as unique security origins` -- standard browser security for local files +- `"[object Object]" is not valid JSON` -- navgoco cookie state, sidebar still works +- `Failed to load resource: net::ERR_INVALID_URL` -- external resource stripped for offline +- `Cannot read properties of null (reading 'error')` -- HubSpot form loader, 
irrelevant offline + +### 5. Deliver + +Upload `cockroachdb-docs.zip` to the escrow delivery location. + +### 6. Clean up + +```bash +rm -f create_full_archive.py +rm -rf complete_test_archive/ +rm -rf /tmp/verify-archive/ +``` + +## Archive Structure + +``` +cockroachdb-docs.zip + complete_test_archive/ + index.html # Home page with archived banner + 404.html + search.html + css/ # Stylesheets + google-fonts.css + js/ # jQuery, navgoco, site JS + fonts/ # Localized Google Fonts (Poppins, Source Sans/Code Pro) + images/ # All site images + _internal/ # Sidebar HTML files per version + cockroachcloud/ # CockroachDB Cloud docs + releases/ # Release notes + advisories/ # Technical advisories + molt/ # MOLT migration docs + stable/ # Symlink target (= latest stable version) + v23.1/ # Versioned docs + v23.2/ + ... + v26.2/ # Latest version +``` + +## Troubleshooting + +| Problem | Solution | +|---------|----------| +| `ModuleNotFoundError: bs4` | `pip3 install beautifulsoup4 requests` | +| `_site/docs/ not found` | Run `make build` or `jekyll build` first | +| Script fails with Python 3.14 syntax errors | Use Python 3.12 or 3.13 | +| Fonts fall back to system fonts | Check internet connectivity; archive still works | +| Zip is too small (< 1 GB) | Verify all version dirs exist in `_site/docs/` | + +## Version History + +| Delivery Date | Stable Version | Versions Included | Size | +|---------------|---------------|-------------------|------| +| 2026-05-22 | v26.2 | v23.1 -- v26.2 (11 versions) | 1.7 GB |