3 changes: 3 additions & 0 deletions .gitignore
@@ -156,3 +156,6 @@ cython_debug/
utilities/benchmarks/*.txt
*.lprof
*.mprof

# Local databases
*.db
27 changes: 27 additions & 0 deletions CHANGELOG.md
@@ -1,4 +1,31 @@
# Changelog
## [Unreleased]
### Performance
- Parallel ACL processing using multiprocessing
- SQLite-based object cache for incremental processing: only new/changed objects are processed on subsequent runs
- Group membership resolution optimized from polynomial to linear complexity via reverse index lookups (see the sketch below)
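
A reverse index maps each member DN to the groups that list it, so resolving an object's memberships becomes a single lookup instead of a scan over every group. A minimal sketch of the idea (illustrative names, not the codebase's API):

```
from collections import defaultdict

def build_member_index(groups):
    """One pass over all groups: member DN -> SIDs of the groups listing it."""
    index = defaultdict(list)
    for group in groups:
        for member_dn in group.get("member", []):
            index[member_dn.upper()].append(group["objectsid"])
    return index

def resolve_memberships(dn, index):
    """Direct group memberships via a single O(1) lookup."""
    return index.get(dn.upper(), [])
```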

### Added
- `--workers N` option to control number of parallel workers (default: ~90% of CPU cores)
- `--no-cache` flag to disable caching for a run
- `--cache-file` option for custom cache database path
- `--cache-stats` to display cache statistics and exit
- `--context-from` to load SID/domain context from previous run's cache for ACL resolution
- Real-time progress indicators with ETA and objects/second processing rate
- CPU core utilization info in logs
- Benchmarking utilities for performance testing

### Fixed
- Crash on malformed certificate data (invalid base64 in `cacertificate` attribute)
- Crash when certificate chain building encounters null certificates
- Cache operations now properly check for null when `--no-cache` flag is used

### Changed
- Refactored the parser pipeline to a streaming input architecture with generator-based file reading (sketched below)
- Cache storage and commit now happen after the JSON output is written, so they do not block it
- Improved type hints in ADDS class for better code clarity
- Enhanced debug logging messages for object properties
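
Generator-based reading yields one logical record at a time instead of loading whole log files into memory. A minimal sketch of the pattern (illustrative; not the pipeline's actual interface):

```
def stream_records(path):
    """Yield blank-line-delimited records one at a time (constant memory)."""
    record = []
    with open(path, "r", errors="replace") as f:
        for line in f:
            if line.strip():
                record.append(line.rstrip("\n"))
            elif record:
                yield record
                record = []
    if record:
        yield record
```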

## [0.4.20] - 12/16/2025
### Fixes
- Fix [#46](https://github.com/coffeegist/bofhound/issues/46) which caused well-known SIDs (groups) to be missing from bofhound output
33 changes: 33 additions & 0 deletions README.md
@@ -108,6 +108,39 @@ Parse Havoc loot logs (will change default input path to `/opt/havoc/data/loot`)
bofhound --parser havoc --zip
```

## Performance & Cache

This branch introduces a SQLite-backed object cache and parallel-ready ACL processing to speed up repeated runs on large datasets.

- Caching is enabled by default. When a cache exists, incremental filtering is automatic: objects already seen (keyed by SID/DN) are skipped (sketched below).
- To disable caching (and incremental behavior), pass `--no-cache` or remove the cache file.
- Cache location defaults to `bofhound_cache.db` in the output folder; override with `--cache-file PATH`.
- ACL processing accepts a worker count via `--workers` (auto-detected if unspecified).
- Inspect the cache with `--cache-stats`.
- Reuse SID/domain context from an earlier run with `--context-from` (example below, after the main examples).
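
Conceptually, the incremental filter keys each parsed object by SID (falling back to DN) and compares a hash of its attributes against the cached copy, passing through only new or changed objects. A minimal sketch, assuming a plain dict in place of the SQLite store (`object_key`, `object_hash`, and `filter_changed` are illustrative names):

```
import hashlib
import json

def object_key(obj):
    # Prefer the SID; fall back to the distinguished name
    return obj.get("objectsid") or obj.get("distinguishedname")

def object_hash(obj):
    # Stable digest of the object's attributes
    return hashlib.sha256(json.dumps(obj, sort_keys=True).encode()).hexdigest()

def filter_changed(objects, seen):
    """Yield only objects that are new or whose attributes changed."""
    for obj in objects:
        key, digest = object_key(obj), object_hash(obj)
        if seen.get(key) != digest:
            seen[key] = digest
            yield obj
```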

Examples (Windows PowerShell):

```
# First run: build cache and output JSON
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out

# Incremental run: skip unchanged objects (automatic when cache exists)
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out

# Disable cache entirely
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out --no-cache

# Use a specific cache file and worker count
.\venv\Scripts\bofhound.exe -i samples/ -p All --parser ldapsearch --output out --cache-file out\my_cache.db --workers 6

# Show cache stats and exit
.\venv\Scripts\bofhound.exe -i samples/ --parser ldapsearch --output out --cache-stats
```
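
When late-arriving data (e.g., certificate objects) is parsed in a separate run, `--context-from` can point at an earlier run's cache file or output folder so SIDs in ACLs still resolve. An illustrative invocation (paths are examples):

```
# Second run over late data, reusing the first run's SID/domain context
.\venv\Scripts\bofhound.exe -i certs/ --parser ldapsearch --output out2 --context-from out
```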

# ldapsearch
Specify `*,ntsecuritydescriptor` as the attributes to return so that ACL edges can be parsed. You are missing a ton of data if you don't include this in your `ldapsearch` queries!
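
For example, assuming the TrustedSec ldapsearch BOF's `ldapsearch <filter> <attributes>` syntax (adjust for your tooling):

```
ldapsearch (objectclass=*) *,ntsecuritydescriptor
```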

203 changes: 201 additions & 2 deletions bofhound/__main__.py
@@ -11,6 +11,7 @@
from bofhound import console
from bofhound.ad.helpers import PropertiesLevel
from bofhound.logger import logger
from bofhound.cache import ObjectCache

app = typer.Typer(
add_completion=False,
@@ -44,6 +45,31 @@ def main(
help="Compress the JSON output files into a zip archive"
),
quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress banner"),
no_cache: bool = typer.Option(
False, "--no-cache",
help="Disable object caching (cache is enabled by default)",
rich_help_panel="Performance Options"
),
cache_file: str = typer.Option(
None, "--cache-file",
help="Custom path to cache database (default: bofhound_cache.db in output folder)",
rich_help_panel="Performance Options"
),
context_from: str = typer.Option(
None, "--context-from",
help="Load SID/domain context from a previous run's cache file for ACL resolution. Use when processing late data (e.g., certificates) separately.",
rich_help_panel="Performance Options"
),
workers: int = typer.Option(
None, "--workers",
help='Number of worker processes for parallel ACL parsing. Default: ~90% of CPU cores (auto-detected). Check your system: python -c "import os; print(f\'CPU cores: {os.cpu_count()}\')"',
rich_help_panel="Performance Options"
),
cache_stats: bool = typer.Option(
False, "--cache-stats",
help="Display cache statistics and exit",
rich_help_panel="Performance Options"
),
mythic_server: str = typer.Option(
"127.0.0.1", "--mythic-server", help="IP or hostname of Mythic server to connect to",
rich_help_panel="Mythic Options"
@@ -73,9 +99,84 @@ def main(
else:
logging.getLogger().setLevel(logging.INFO)

# Handle cache stats display
if cache_stats:
# Determine cache file path
if not cache_file:
cache_file = f"{output_folder}/bofhound_cache.db"

import os
if not os.path.exists(cache_file):
console.print(f"[yellow]Cache file not found: {cache_file}[/yellow]")
console.print("Run bofhound first to create the cache.")
sys.exit(0)

try:
with ObjectCache(cache_file) as cache:
stats = cache.get_statistics()
console.print("\n[bold cyan]Cache Statistics[/bold cyan]")
console.print(f"Cache file: {cache_file}")
console.print(f"Version: {stats['cache_version']}")
console.print(f"Total objects: {stats['total_objects']:,}")
console.print("\n[bold]Objects by type:[/bold]")
for obj_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True):
console.print(f" {obj_type}: {count:,}")
console.print(f"\nCache size: {stats['file_size_mb']} MB")
console.print(f"Created: {stats.get('created_at', 'Unknown')}")
console.print(f"Last accessed: {stats.get('last_accessed', 'Unknown')}")
except Exception as e:
console.print(f"[red]Error reading cache: {e}[/red]")
sys.exit(1)
return

if not quiet:
banner()

# Auto-detect worker count if not specified
import os
cpu_count = os.cpu_count() or 4

if workers is None:
# Use ~90% of cores by default - leave headroom for OS and other processes
workers = max(1, int(cpu_count * 0.9))
elif workers < 1:
console.print("[red]Error: --workers must be at least 1[/red]")
sys.exit(1)
elif workers > cpu_count:
console.print(f"[yellow]Warning: {workers} workers exceeds CPU count ({cpu_count})[/yellow]")
console.print("This may reduce performance. Recommended: --workers {}".format(int(cpu_count * 0.9)))

# Log worker and CPU info in one line
worker_pct = int((workers / cpu_count) * 100)
logger.info(f"Using {workers}/{cpu_count} CPU cores ({worker_pct}%)")

# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Initialize cache (enabled by default unless --no-cache)
cache = None
if not no_cache:
# Determine cache file path
if not cache_file:
cache_file = f"{output_folder}/bofhound_cache.db"

cache_exists = os.path.exists(cache_file)

try:
cache = ObjectCache(cache_file)
if cache_exists:
stats = cache.get_statistics()
logger.info(f"Found existing cache: {cache_file} ({stats['total_objects']:,} objects)")
logger.info("Only new/changed objects will be processed (incremental mode)")
logger.info("To disable caching, use --no-cache or delete/rename the cache file")
else:
logger.info(f"Creating new cache: {cache_file}")
except Exception as e:
logger.error(f"Failed to initialize cache: {e}")
sys.exit(1)
else:
logger.info("Caching disabled (--no-cache)")

# default to Cobalt logfile naming format
data_source = None

@@ -120,18 +221,57 @@ def main(
ad = ADDS()
broker = LocalBroker()
pipeline = ParsingPipelineFactory.create_pipeline(parser_type=parser_type)

# Load context from external cache if specified
if context_from:
import os
context_cache_path = context_from

# If directory, look for cache file inside it
if os.path.isdir(context_from):
context_cache_path = os.path.join(context_from, 'bofhound_cache.db')

if not os.path.exists(context_cache_path):
logger.error(f"Context cache file not found: {context_cache_path}")
sys.exit(1)

try:
logger.info(f"Loading context from: {context_cache_path}")
with ObjectCache(context_cache_path) as context_cache:
ctx_stats = context_cache.get_context_statistics()
if ctx_stats['sid_mappings'] == 0:
logger.warning("Context cache has no SID mappings - ACL resolution may be incomplete")
else:
ad.load_context_from_cache(context_cache)
except Exception as e:
logger.error(f"Failed to load context: {e}")
sys.exit(1)

with console.status("", spinner="aesthetic") as status:
results = pipeline.process_data_source(
data_source,
progress_callback=lambda id: status.update(f"Processing {id}")
progress_callback=lambda id: status.update(f"Processing {id}"),
num_workers=workers
)

ldap_objects = results.get_ldap_objects()
local_objects = results.get_local_group_memberships() + results.get_sessions() + \
results.get_privileged_sessions() + results.get_registry_sessions()
logger.info("Parsed %d LDAP objects", len(ldap_objects))
logger.info("Parsed %d local group/session objects", len(local_objects))

# Apply cache filtering (automatic when cache exists)
if cache and ldap_objects:
original_count = len(ldap_objects)
stats = cache.get_statistics()
if stats['total_objects'] > 0:
logger.info("Filtering against cache (%d existing objects)...", stats['total_objects'])
ldap_objects = cache.get_changed_objects(ldap_objects)
logger.info("After cache filter: %d new/changed, %d skipped",
len(ldap_objects), original_count - len(ldap_objects))
else:
logger.info("Cache is empty - all objects will be processed")

logger.info("Sorting parsed objects by type...")

ad.import_objects(ldap_objects)
@@ -159,9 +299,53 @@ def main(
logger.info("Parsed %d Registry Sessions", len(broker.registry_sessions))
logger.info("Parsed %d Local Group Memberships", len(broker.local_group_memberships))

ad.process()
ad.process(num_workers=workers)
ad.process_local_objects(broker)

# Store processed objects in cache
if cache:
logger.info("Updating cache with processed objects...")
all_objects = (ad.users + ad.groups + ad.computers + ad.domains +
ad.ous + ad.gpos + ad.containers + ad.aiacas + ad.rootcas +
ad.enterprisecas + ad.certtemplates + ad.issuancepolicies +
ad.ntauthstores + ad.trustaccounts + ad.schemas)
# Note: ad.unknown_objects are raw dicts, not BloodHoundObject instances, so they can't be cached

stored_count = 0
for obj in all_objects:
try:
cache.store_object(obj)
stored_count += 1
except Exception as e:
logger.debug(f"Failed to cache object {getattr(obj, 'ObjectIdentifier', 'unknown')}: {e}")

# Store SID mappings for context in future runs
logger.debug("Storing SID mappings in cache...")
sid_mappings = []
dn_mappings = []
for sid, obj in ad.SID_MAP.items():
if hasattr(obj, '_entry_type') and hasattr(obj, 'Properties'):
name = obj.Properties.get('name', '')
dn = obj.Properties.get('distinguishedname', '')
domain = obj.Properties.get('domain', '')
obj_type = obj._entry_type
sid_mappings.append((sid, name, obj_type, domain))
if dn:
dn_mappings.append((dn, sid, obj_type))

if sid_mappings:
cache.store_sid_mappings_bulk(sid_mappings)
if dn_mappings:
cache.store_dn_mappings_bulk(dn_mappings)

# Store domain mappings
for dc, domain_sid in ad.DOMAIN_MAP.items():
cache.store_domain_mapping(dc, domain_sid)

# Store schema GUIDs
if ad.ObjectTypeGuidMap:
cache.store_schema_guids_bulk(ad.ObjectTypeGuidMap)

#
# Write out the BloodHound JSON files
#
@@ -184,6 +368,16 @@ def main(
zip_files=zip_files
)

if cache:
cache.commit()

# Log context statistics
ctx_stats = cache.get_context_statistics()
logger.info(f"Cache updated successfully ({stored_count:,} objects stored)")
logger.debug(f"Context stored: {ctx_stats['sid_mappings']} SID mappings, "
f"{ctx_stats['domain_mappings']} domain mappings, "
f"{ctx_stats['schema_guids']} schema GUIDs")

#
# Upload files to BloodHound CE
#
@@ -201,6 +395,11 @@ def main(
uploader.close_upload_job()
logger.info("Files uploaded to BloodHound server")

# Close cache
if cache:
cache.close()
logger.info("Cache closed")


def banner():
"""Display the bofhound banner."""