3 changes: 3 additions & 0 deletions .gitignore
@@ -156,3 +156,6 @@ cython_debug/
utilities/benchmarks/*.txt
*.lprof
*.mprof

# Local databases
*.db
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -1,4 +1,31 @@
# Changelog
## [Unreleased]
### Performance
- Parallel ACL processing using multiprocessing
- SQLite-based object cache for incremental processing: only new/changed objects are processed on subsequent runs
- Group membership resolution optimized from polynomial to linear complexity via reverse index lookups (see the sketch below)
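
A reverse index maps each member DN to the groups that list it, so resolving an object's memberships becomes a single lookup instead of a scan over every group. A minimal sketch of the idea (illustrative names, not the codebase's API):

```
from collections import defaultdict

def build_member_index(groups):
    """One pass over all groups: member DN -> SIDs of the groups listing it."""
    index = defaultdict(list)
    for group in groups:
        for member_dn in group.get("member", []):
            index[member_dn.upper()].append(group["objectsid"])
    return index

def resolve_memberships(dn, index):
    """Direct group memberships via a single O(1) lookup."""
    return index.get(dn.upper(), [])
```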

### Added
- `--workers N` option to control number of parallel workers (default: ~90% of CPU cores)
- `--no-cache` flag to disable caching for a run
- `--cache-file` option for custom cache database path
- `--cache-stats` to display cache statistics and exit
- `--context-from` to load SID/domain context from previous run's cache for ACL resolution
- Real-time progress indicators with ETA and objects/second processing rate
- CPU core utilization info in logs
- Benchmarking utilities for performance testing

### Fixed
- Crash on malformed certificate data (invalid base64 in `cacertificate` attribute)
- Crash when certificate chain building encounters null certificates
- Cache operations now properly check for null when `--no-cache` flag is used

### Changed
- Refactored the parser pipeline to a streaming input architecture with generator-based file reading (sketched below)
- Cache storage and commit now happen after the JSON output is written, so they do not block it
- Improved type hints in ADDS class for better code clarity
- Enhanced debug logging messages for object properties
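
Generator-based reading yields one logical record at a time instead of loading whole log files into memory. A minimal sketch of the pattern (illustrative; not the pipeline's actual interface):

```
def stream_records(path):
    """Yield blank-line-delimited records one at a time (constant memory)."""
    record = []
    with open(path, "r", errors="replace") as f:
        for line in f:
            if line.strip():
                record.append(line.rstrip("\n"))
            elif record:
                yield record
                record = []
    if record:
        yield record
```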

## [0.4.20] - 12/16/2025
### Fixes
- Fix [#46](https://github.com/coffeegist/bofhound/issues/46) which caused well-known SIDs (groups) to be missing from bofhound output
33 changes: 33 additions & 0 deletions README.md
@@ -108,6 +108,39 @@ Parse Havoc loot logs (will change default input path to `/opt/havoc/data/loot`)
bofhound --parser havoc --zip
```

## Performance & Cache

This branch introduces a SQLite-backed object cache and parallel-ready ACL processing to speed up repeated runs on large datasets.

- Caching is enabled by default. When a cache exists, incremental filtering is automatic: objects already seen (keyed by SID/DN) are skipped (sketched below).
- To disable caching (and incremental behavior), pass `--no-cache` or remove the cache file.
- Cache location defaults to `bofhound_cache.db` in the output folder; override with `--cache-file PATH`.
- ACL processing accepts a worker count via `--workers` (auto-detected if unspecified).
- Inspect the cache with `--cache-stats`.
- Reuse SID/domain context from an earlier run with `--context-from` (example below, after the main examples).
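
Conceptually, the incremental filter keys each parsed object by SID (falling back to DN) and compares a hash of its attributes against the cached copy, passing through only new or changed objects. A minimal sketch, assuming a plain dict in place of the SQLite store (`object_key`, `object_hash`, and `filter_changed` are illustrative names):

```
import hashlib
import json

def object_key(obj):
    # Prefer the SID; fall back to the distinguished name
    return obj.get("objectsid") or obj.get("distinguishedname")

def object_hash(obj):
    # Stable digest of the object's attributes
    return hashlib.sha256(json.dumps(obj, sort_keys=True).encode()).hexdigest()

def filter_changed(objects, seen):
    """Yield only objects that are new or whose attributes changed."""
    for obj in objects:
        key, digest = object_key(obj), object_hash(obj)
        if seen.get(key) != digest:
            seen[key] = digest
            yield obj
```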

Examples (Windows PowerShell):

```
# First run: build cache and output JSON
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out

# Incremental run: skip unchanged objects (automatic when cache exists)
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out

# Disable cache entirely
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out --no-cache

# Use a specific cache file and worker count
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out --cache-file out\my_cache.db --workers 6

# Show cache stats and exit
.\venv\Scripts\bofhound.exe -i samples/ --parser ldapsearch --output out --cache-stats
```
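
When late-arriving data (e.g., certificate objects) is parsed in a separate run, `--context-from` can point at an earlier run's cache file or output folder so SIDs in ACLs still resolve. An illustrative invocation (paths are examples):

```
# Second run over late data, reusing the first run's SID/domain context
.\venv\Scripts\bofhound.exe -i certs/ --parser ldapsearch --output out2 --context-from out
```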

# ldapsearch
Specify `*,ntsecuritydescriptor` as the attributes to return so that ACL edges can be parsed. You are missing a ton of data if you don't include this in your `ldapsearch` queries!
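
For example, assuming the TrustedSec ldapsearch BOF's `ldapsearch <filter> <attributes>` syntax (adjust for your tooling):

```
ldapsearch (objectclass=*) *,ntsecuritydescriptor
```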

203 changes: 201 additions & 2 deletions bofhound/__main__.py
@@ -11,6 +11,7 @@
from bofhound import console
from bofhound.ad.helpers import PropertiesLevel
from bofhound.logger import logger
from bofhound.cache import ObjectCache

app = typer.Typer(
add_completion=False,
@@ -44,6 +45,31 @@ def main(
help="Compress the JSON output files into a zip archive"
),
quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress banner"),
no_cache: bool = typer.Option(
False, "--no-cache",
help="Disable object caching (cache is enabled by default)",
rich_help_panel="Performance Options"
),
cache_file: str = typer.Option(
None, "--cache-file",
help="Custom path to cache database (default: bofhound_cache.db in output folder)",
rich_help_panel="Performance Options"
),
context_from: str = typer.Option(
None, "--context-from",
help="Load SID/domain context from a previous run's cache file for ACL resolution. Use when processing late data (e.g., certificates) separately.",
rich_help_panel="Performance Options"
),
workers: int = typer.Option(
None, "--workers",
help='Number of worker processes for parallel ACL parsing. Default: ~90% of CPU cores (auto-detected). Check your system: python -c "import os; print(f\'CPU cores: {os.cpu_count()}\')"',
rich_help_panel="Performance Options"
),
cache_stats: bool = typer.Option(
False, "--cache-stats",
help="Display cache statistics and exit",
rich_help_panel="Performance Options"
),
mythic_server: str = typer.Option(
"127.0.0.1", "--mythic-server", help="IP or hostname of Mythic server to connect to",
rich_help_panel="Mythic Options"
@@ -73,9 +99,84 @@ def main(
else:
logging.getLogger().setLevel(logging.INFO)

# Handle cache stats display
if cache_stats:
# Determine cache file path
if not cache_file:
cache_file = f"{output_folder}/bofhound_cache.db"

import os
if not os.path.exists(cache_file):
console.print(f"[yellow]Cache file not found: {cache_file}[/yellow]")
console.print("Run bofhound first to create the cache.")
sys.exit(0)

try:
with ObjectCache(cache_file) as cache:
stats = cache.get_statistics()
console.print("\n[bold cyan]Cache Statistics[/bold cyan]")
console.print(f"Cache file: {cache_file}")
console.print(f"Version: {stats['cache_version']}")
console.print(f"Total objects: {stats['total_objects']:,}")
console.print("\n[bold]Objects by type:[/bold]")
for obj_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True):
console.print(f" {obj_type}: {count:,}")
console.print(f"\nCache size: {stats['file_size_mb']} MB")
console.print(f"Created: {stats.get('created_at', 'Unknown')}")
console.print(f"Last accessed: {stats.get('last_accessed', 'Unknown')}")
except Exception as e:
console.print(f"[red]Error reading cache: {e}[/red]")
sys.exit(1)
return

if not quiet:
banner()

# Auto-detect worker count if not specified
import os
cpu_count = os.cpu_count() or 4

if workers is None:
# Use ~90% of cores by default - leave headroom for OS and other processes
workers = max(1, int(cpu_count * 0.9))
elif workers < 1:
console.print("[red]Error: --workers must be at least 1[/red]")
sys.exit(1)
elif workers > cpu_count:
console.print(f"[yellow]Warning: {workers} workers exceeds CPU count ({cpu_count})[/yellow]")
console.print("This may reduce performance. Recommended: --workers {}".format(int(cpu_count * 0.9)))

# Log worker and CPU info in one line
worker_pct = int((workers / cpu_count) * 100)
logger.info(f"Using {workers}/{cpu_count} CPU cores ({worker_pct}%)")

# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Initialize cache (enabled by default unless --no-cache)
cache = None
if not no_cache:
# Determine cache file path
if not cache_file:
cache_file = f"{output_folder}/bofhound_cache.db"

cache_exists = os.path.exists(cache_file)

try:
cache = ObjectCache(cache_file)
if cache_exists:
stats = cache.get_statistics()
logger.info(f"Found existing cache: {cache_file} ({stats['total_objects']:,} objects)")
logger.info("Only new/changed objects will be processed (incremental mode)")
logger.info("To disable caching, use --no-cache or delete/rename the cache file")
else:
logger.info(f"Creating new cache: {cache_file}")
except Exception as e:
logger.error(f"Failed to initialize cache: {e}")
sys.exit(1)
else:
logger.info("Caching disabled (--no-cache)")

# default to Cobalt logfile naming format
data_source = None

@@ -120,18 +221,57 @@ def main(
ad = ADDS()
broker = LocalBroker()
pipeline = ParsingPipelineFactory.create_pipeline(parser_type=parser_type)

# Load context from external cache if specified
if context_from:
import os
context_cache_path = context_from

# If directory, look for cache file inside it
if os.path.isdir(context_from):
context_cache_path = os.path.join(context_from, 'bofhound_cache.db')

if not os.path.exists(context_cache_path):
logger.error(f"Context cache file not found: {context_cache_path}")
sys.exit(1)

try:
logger.info(f"Loading context from: {context_cache_path}")
with ObjectCache(context_cache_path) as context_cache:
ctx_stats = context_cache.get_context_statistics()
if ctx_stats['sid_mappings'] == 0:
logger.warning("Context cache has no SID mappings - ACL resolution may be incomplete")
else:
ad.load_context_from_cache(context_cache)
except Exception as e:
logger.error(f"Failed to load context: {e}")
sys.exit(1)

with console.status("", spinner="aesthetic") as status:
results = pipeline.process_data_source(
data_source,
progress_callback=lambda id: status.update(f"Processing {id}")
progress_callback=lambda id: status.update(f"Processing {id}"),
num_workers=workers
)

ldap_objects = results.get_ldap_objects()
local_objects = results.get_local_group_memberships() + results.get_sessions() + \
results.get_privileged_sessions() + results.get_registry_sessions()
logger.info("Parsed %d LDAP objects", len(ldap_objects))
logger.info("Parsed %d local group/session objects", len(local_objects))

# Apply cache filtering (automatic when cache exists)
if cache and ldap_objects:
original_count = len(ldap_objects)
stats = cache.get_statistics()
if stats['total_objects'] > 0:
logger.info("Filtering against cache (%d existing objects)...", stats['total_objects'])
ldap_objects = cache.get_changed_objects(ldap_objects)
logger.info("After cache filter: %d new/changed, %d skipped",
len(ldap_objects), original_count - len(ldap_objects))
else:
logger.info("Cache is empty - all objects will be processed")

logger.info("Sorting parsed objects by type...")

ad.import_objects(ldap_objects)
@@ -159,9 +299,53 @@ def main(
logger.info("Parsed %d Registry Sessions", len(broker.registry_sessions))
logger.info("Parsed %d Local Group Memberships", len(broker.local_group_memberships))

ad.process()
ad.process(num_workers=workers)
ad.process_local_objects(broker)

# Store processed objects in cache
if cache:
logger.info("Updating cache with processed objects...")
all_objects = (ad.users + ad.groups + ad.computers + ad.domains +
ad.ous + ad.gpos + ad.containers + ad.aiacas + ad.rootcas +
ad.enterprisecas + ad.certtemplates + ad.issuancepolicies +
ad.ntauthstores + ad.trustaccounts + ad.schemas)
# Note: ad.unknown_objects are raw dicts, not BloodHoundObject instances, so they can't be cached

stored_count = 0
for obj in all_objects:
try:
cache.store_object(obj)
stored_count += 1
except Exception as e:
logger.debug(f"Failed to cache object {getattr(obj, 'ObjectIdentifier', 'unknown')}: {e}")

# Store SID mappings for context in future runs
logger.debug("Storing SID mappings in cache...")
sid_mappings = []
dn_mappings = []
for sid, obj in ad.SID_MAP.items():
if hasattr(obj, '_entry_type') and hasattr(obj, 'Properties'):
name = obj.Properties.get('name', '')
dn = obj.Properties.get('distinguishedname', '')
domain = obj.Properties.get('domain', '')
obj_type = obj._entry_type
sid_mappings.append((sid, name, obj_type, domain))
if dn:
dn_mappings.append((dn, sid, obj_type))

if sid_mappings:
cache.store_sid_mappings_bulk(sid_mappings)
if dn_mappings:
cache.store_dn_mappings_bulk(dn_mappings)

# Store domain mappings
for dc, domain_sid in ad.DOMAIN_MAP.items():
cache.store_domain_mapping(dc, domain_sid)

# Store schema GUIDs
if ad.ObjectTypeGuidMap:
cache.store_schema_guids_bulk(ad.ObjectTypeGuidMap)

#
# Write out the BloodHound JSON files
#
@@ -184,6 +368,16 @@ def main(
zip_files=zip_files
)

if cache:
cache.commit()

# Log context statistics
ctx_stats = cache.get_context_statistics()
logger.info(f"Cache updated successfully ({stored_count:,} objects stored)")
logger.debug(f"Context stored: {ctx_stats['sid_mappings']} SID mappings, "
f"{ctx_stats['domain_mappings']} domain mappings, "
f"{ctx_stats['schema_guids']} schema GUIDs")

#
# Upload files to BloodHound CE
#
@@ -201,6 +395,11 @@ def main(
uploader.close_upload_job()
logger.info("Files uploaded to BloodHound server")

# Close cache
if cache:
cache.close()
logger.info("Cache closed")


def banner():
"""Display the bofhound banner."""