diff --git a/scripts/__init__.py b/scripts/__init__.py index 1c28744..5e59001 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -1,11 +1,30 @@ """ scripts package - AdGuard Home Blocklist Compiler +A high-performance blocklist compiler that merges 80+ DNS blocklists into a single, +deduplicated output optimized for AdGuard Home. + Modules: - downloader: Download blocklists with ETag/Last-Modified caching - cleaner: Clean and validate rules for AGH compatibility - compiler: Modifier-aware deduplication - pipeline: Main processing pipeline + downloader: Async blocklist downloader with ETag/Last-Modified caching + cleaner: Rule filtering and validation for AdGuard Home compatibility + compiler: Format compression and modifier-aware deduplication engine + pipeline: Main processing pipeline orchestrator + +Example: + >>> from scripts.pipeline import process_files + >>> stats = process_files("lists/_raw", "lists/merged.txt") """ -__version__ = "1.0.0" +from typing import Final + +__version__: Final[str] = "1.4.0" +__author__: Final[str] = "MissionWAR" + +__all__ = [ + "__version__", + "__author__", + "downloader", + "cleaner", + "compiler", + "pipeline", +] diff --git a/scripts/cleaner.py b/scripts/cleaner.py index 4e3d98b..6b878c1 100644 --- a/scripts/cleaner.py +++ b/scripts/cleaner.py @@ -5,7 +5,7 @@ This module filters blocklist rules to keep only those compatible with AdGuard Home. It's the first stage of the pipeline, running BEFORE the compiler. -CRITICAL UNDERSTANDING - DNS vs Browser Blocking: +Critical Understanding - DNS vs Browser Blocking: AdGuard Home is a DNS-level blocker, NOT a browser extension. This means: - DNS only sees domain names, not URLs, request types, or page content @@ -19,13 +19,13 @@ If we stripped the modifiers, we'd get ||ads.example.com^ which blocks EVERYTHING from that domain - a much more aggressive rule than intended! This could break sites. -DESIGN DECISION - Discard, Don't Strip: +Design Decision - Discard, Don't Strip: Rules with unsupported modifiers are COMPLETELY DISCARDED, not stripped. This prevents false positives and unexpected site breakage. A smaller, more accurate blocklist is better than a larger, overly-aggressive one. -KEY OPERATIONS: +Key Operations: 1. Remove comments (# and ! lines) 2. Discard cosmetic/element-hiding rules (##, #@#, #$#, etc.) 3. Discard rules with browser-only modifiers @@ -33,86 +33,181 @@ """ import re -from typing import NamedTuple - - -# ============================================================================ -# MODIFIER DEFINITIONS (based on official AdGuard DNS filtering syntax docs) -# ============================================================================ - -# Modifiers that are browser-only and NOT supported by AGH -# If a rule contains ANY of these, the ENTIRE RULE should be discarded -UNSUPPORTED_MODIFIERS = frozenset({ - # Content type modifiers (browser-only) - "script", "image", "stylesheet", "font", "media", "object", - "subdocument", "xmlhttprequest", "websocket", "webrtc", - "ping", "other", +from typing import Final, NamedTuple, TypedDict + + +# ============================================================================= +# MODIFIER DEFINITIONS +# ============================================================================= +# Based on official AdGuard DNS filtering syntax documentation. +# https://adguard-dns.io/kb/general/dns-filtering-syntax/ + +# Modifiers that are browser-only and NOT supported by AdGuard Home. 
+# If a rule contains ANY of these, the ENTIRE RULE should be discarded. +# +# Grouped by category for easier maintenance: +UNSUPPORTED_MODIFIERS: Final[frozenset[str]] = frozenset({ + # ------------------------------------------------------------------------- + # Content type modifiers (browser-only, DNS can't see content types) + # ------------------------------------------------------------------------- + "script", # JavaScript files + "image", # Images (png, jpg, etc.) + "stylesheet", # CSS files + "font", # Web fonts + "media", # Audio/video content + "object", # Flash/plugins (legacy) + "subdocument", # Iframes + "xmlhttprequest", # AJAX requests + "websocket", # WebSocket connections + "webrtc", # WebRTC connections + "ping", # Navigator.sendBeacon() + "other", # Other content types + + # ------------------------------------------------------------------------- # Shorthand content types - "css", "js", - # Third-party/first-party - "third-party", "3p", "first-party", "1p", - # Document modifiers - "document", "doc", "popup", "all", - # Network/redirect modifiers - "network", "redirect", "redirect-rule", "empty", "mp4", - # Request modification - "csp", "permissions", "header", "removeparam", "removeheader", - "replace", "hls", "jsonprune", - # Exception modifiers - "genericblock", "generichide", "elemhide", "specifichide", - "jsinject", "urlblock", "content", "extension", - # Domain restriction (would make rule domain-specific, not useful for DNS-wide) - "domain", - # Matching modifiers - "match-case", "strict-first-party", "strict-third-party", - # Stealth mode - "stealth", - # App-specific - "app", - # Method restrictions (browser-only) - "method", + # ------------------------------------------------------------------------- + "css", # Alias for stylesheet + "js", # Alias for script + + # ------------------------------------------------------------------------- + # Third-party/first-party (requires page context) + # ------------------------------------------------------------------------- + "third-party", # Requests from different domain + "3p", # Shorthand for third-party + "first-party", # Requests from same domain + "1p", # Shorthand for first-party + + # ------------------------------------------------------------------------- + # Document modifiers (page-level blocking) + # ------------------------------------------------------------------------- + "document", # Block entire document + "doc", # Alias for document + "popup", # Block popups + "all", # Match all content types + + # ------------------------------------------------------------------------- + # Network/redirect modifiers (require HTTP-level access) + # ------------------------------------------------------------------------- + "network", # Network requests + "redirect", # Redirect to resource + "redirect-rule", # Conditional redirect + "empty", # Return empty response + "mp4", # Return empty MP4 + + # ------------------------------------------------------------------------- + # Request modification (HTTP header manipulation) + # ------------------------------------------------------------------------- + "csp", # Content Security Policy injection + "permissions", # Permissions Policy injection + "header", # HTTP header modification + "removeparam", # Remove URL parameters + "removeheader", # Remove HTTP headers + "replace", # Replace response content + "hls", # HLS playlist modification + "jsonprune", # JSON response modification + + # ------------------------------------------------------------------------- + # 
Exception modifiers (browser extension exceptions) + # ------------------------------------------------------------------------- + "genericblock", # Disable generic blocking + "generichide", # Disable generic hiding + "elemhide", # Disable element hiding + "specifichide", # Disable specific hiding + "jsinject", # Disable JS injection + "urlblock", # Disable URL blocking + "content", # Disable content blocking + "extension", # Disable extension rules + + # ------------------------------------------------------------------------- + # Domain restriction (page-level, not useful for DNS-wide blocking) + # ------------------------------------------------------------------------- + "domain", # Only apply on specific domains + + # ------------------------------------------------------------------------- + # Matching modifiers (case sensitivity, strict party) + # ------------------------------------------------------------------------- + "match-case", # Case-sensitive matching + "strict-first-party", # Strict first-party check + "strict-third-party", # Strict third-party check + + # ------------------------------------------------------------------------- + # Other browser-only features + # ------------------------------------------------------------------------- + "stealth", # Stealth mode settings + "app", # App-specific rules + "method", # HTTP method restrictions }) -# ============================================================================ +# ============================================================================= # REGEX PATTERNS -# ============================================================================ +# ============================================================================= -# Cosmetic/element-hiding rule patterns (DISCARD entirely) -# These include: ## #@# #?# #$# #$?# #@?# #@$# etc. -COSMETIC_PATTERN = re.compile( +#: Cosmetic/element-hiding rule patterns (DISCARD entirely) +#: These include: ## #@# #?# #$# #$?# #@?# #@$# etc. +COSMETIC_PATTERN: Final[re.Pattern[str]] = re.compile( r"#[@$?%]*#|" # Standard element hiding: ## #@# #?# #$# etc. r"#[@$?%]*\?#|" # Extended CSS: #?# #@?# etc. r"\$#|" # Snippet injection: $# r"#%#|" # Scriptlet injection: #%# - r"\[adblock", # Adblock header + r"\[adblock", # Adblock header: [Adblock Plus ...] re.IGNORECASE ) -# Pattern to detect if a line is likely a comment -COMMENT_PATTERN = re.compile(r"^\s*[#!]") +#: Pattern to detect if a line is a comment (starts with # or !) +COMMENT_PATTERN: Final[re.Pattern[str]] = re.compile(r"^\s*[#!]") -# Trailing inline comment: match "# comment" preceded by whitespace -TRAILING_COMMENT_PATTERN = re.compile(r"\s+#\s+.*$") +#: Trailing inline comment: match " # comment" (space before #) +#: Example: "||example.com^ # block ads" → "||example.com^" +TRAILING_COMMENT_PATTERN: Final[re.Pattern[str]] = re.compile(r"\s+#\s+.*$") -# Pattern to extract modifier section from ABP rule -# Matches: $modifier1,modifier2,... at end of rule -MODIFIER_PATTERN = re.compile(r"\$([^$]+)$") +#: Pattern to extract modifier section from ABP rule +#: Matches: $modifier1,modifier2,... 
at end of rule +MODIFIER_PATTERN: Final[re.Pattern[str]] = re.compile(r"\$([^$]+)$") -# ============================================================================ +# ============================================================================= # DATA STRUCTURES -# ============================================================================ +# ============================================================================= class CleanResult(NamedTuple): - """Result of cleaning a single line.""" - line: str | None # Cleaned line, or None if discarded - discarded: bool # True if line was discarded - reason: str | None # Reason for discard (for logging) + """ + Result of cleaning a single line. + + Attributes: + line: Cleaned line, or None if discarded + discarded: True if line was discarded + reason: Reason for discard (for logging/stats), or None if kept + + Example: + >>> result = CleanResult("||example.com^", False, None) + >>> result.discarded + False + """ + line: str | None + discarded: bool + reason: str | None class CleanStats(NamedTuple): - """Statistics from cleaning operation.""" + """ + Statistics from cleaning operation. + + Attributes: + total_lines: Total lines processed + kept_lines: Lines kept after cleaning + comments_removed: Comment lines removed + cosmetic_removed: Cosmetic/element-hiding rules removed + unsupported_modifier_removed: Rules with unsupported modifiers removed + empty_removed: Empty lines removed + invalid_removed: Invalid/malformed lines removed + trimmed: Lines that had whitespace trimmed + + Example: + >>> stats = CleanStats(100, 80, 10, 5, 3, 2, 0, 15) + >>> stats.kept_lines + 80 + """ total_lines: int kept_lines: int comments_removed: int @@ -120,15 +215,43 @@ class CleanStats(NamedTuple): unsupported_modifier_removed: int empty_removed: int invalid_removed: int - trimmed: int # Lines that had whitespace trimmed + trimmed: int -# ============================================================================ +class CleanStatsDict(TypedDict): + """TypedDict for internal stats tracking with type safety.""" + total: int + kept: int + comments: int + cosmetic: int + unsupported_modifier: int + empty: int + invalid: int + trimmed: int + + +# ============================================================================= # CLEANING FUNCTIONS -# ============================================================================ +# ============================================================================= def is_comment(line: str) -> bool: - """Check if line is a comment (starts with # or !).""" + """ + Check if line is a comment (starts with # or !). + + Args: + line: The line to check + + Returns: + True if the line is a comment + + Example: + >>> is_comment("# This is a comment") + True + >>> is_comment("! Another comment style") + True + >>> is_comment("||example.com^") + False + """ return bool(COMMENT_PATTERN.match(line)) @@ -138,6 +261,18 @@ def is_cosmetic_rule(line: str) -> bool: These rules can't be processed by AdGuard Home (DNS-level blocker) and should be completely discarded. + + Args: + line: The line to check + + Returns: + True if the line is a cosmetic rule + + Example: + >>> is_cosmetic_rule("example.com##.ad-banner") + True + >>> is_cosmetic_rule("||example.com^") + False """ return bool(COSMETIC_PATTERN.search(line)) @@ -146,10 +281,20 @@ def strip_trailing_comment(line: str) -> str: """ Remove trailing inline comments to reduce file size. 
- Example: - "||example.com^ # block ads" -> "||example.com^" + Only strips comments that are preceded by whitespace, to avoid + accidentally stripping URL fragments or modifier values. - Be careful not to strip URL fragments or rule modifiers. + Args: + line: The line to process + + Returns: + Line with trailing comment removed + + Example: + >>> strip_trailing_comment("||example.com^ # block ads") + '||example.com^' + >>> strip_trailing_comment("||example.com^#fragment") # No space, kept + '||example.com^#fragment' """ # Don't process lines that might have # in modifiers if "$" in line and "#" in line.split("$")[-1]: @@ -166,17 +311,28 @@ def extract_modifiers(rule: str) -> set[str]: """ Extract modifier names from an ABP-style rule. + Handles modifiers with values (key=value) and negation (~modifier). + + Args: + rule: The ABP rule to parse + + Returns: + Set of modifier names (lowercase, without ~ prefix or =value suffix) + Example: - "||example.com^$script,third-party" -> {"script", "third-party"} - "||example.com^$important" -> {"important"} - "||example.com^" -> set() + >>> extract_modifiers("||example.com^$script,third-party") + {'script', 'third-party'} + >>> extract_modifiers("||example.com^$important") + {'important'} + >>> extract_modifiers("||example.com^") + set() """ match = MODIFIER_PATTERN.search(rule) if not match: return set() modifier_string = match.group(1) - modifiers = set() + modifiers: set[str] = set() for part in modifier_string.split(","): # Handle modifiers with values: client=192.168.1.1, dnsrewrite=example.com @@ -196,6 +352,18 @@ def has_unsupported_modifiers(modifiers: set[str]) -> bool: If ANY unsupported modifier is found, the rule should be DISCARDED (not stripped) to avoid false positives and breakage. + + Args: + modifiers: Set of modifier names to check + + Returns: + True if any modifier is unsupported + + Example: + >>> has_unsupported_modifiers({'script', 'important'}) + True # 'script' is unsupported + >>> has_unsupported_modifiers({'important', 'client'}) + False # Both are supported """ return bool(modifiers & UNSUPPORTED_MODIFIERS) @@ -204,8 +372,23 @@ def clean_line(line: str) -> tuple[CleanResult, bool]: """ Clean a single rule line. + Performs all cleaning operations: strip whitespace, remove comments, + discard cosmetic rules, and check for unsupported modifiers. 
+ + Args: + line: The raw line to clean + Returns: - (CleanResult, was_trimmed) - was_trimmed is True if whitespace was removed + Tuple of (CleanResult, was_trimmed): + - CleanResult: The cleaning result with line/discarded/reason + - was_trimmed: True if whitespace was removed from the line + + Example: + >>> result, trimmed = clean_line(" ||example.com^ ") + >>> result.line + '||example.com^' + >>> trimmed + True """ original = line @@ -233,10 +416,11 @@ def clean_line(line: str) -> tuple[CleanResult, bool]: was_trimmed = True # For ABP-style rules, check modifiers - if line.startswith("||") or line.startswith("@@||"): + # Use tuple form for single startswith call (faster than OR) + if line.startswith(("||", "@@||")): modifiers = extract_modifiers(line) if modifiers and has_unsupported_modifiers(modifiers): - # DISCARD entire rule (as per user's requirement) + # DISCARD entire rule (as per design decision) # This prevents false positives like blocking google.com when # the original rule was "$third-party" (third-party connections only) return CleanResult(None, True, "unsupported_modifier"), False @@ -256,11 +440,26 @@ def clean_lines(lines: list[str]) -> tuple[list[str], CleanStats]: """ Clean a list of lines. + Processes each line through the cleaning pipeline and collects statistics. + + Args: + lines: List of raw lines to clean + Returns: - (cleaned_lines, stats) + Tuple of (cleaned_lines, stats): + - cleaned_lines: List of valid, cleaned lines + - stats: CleanStats with counts of removed line types + + Example: + >>> lines = ["# comment", "||example.com^", "bad.com##.ad"] + >>> cleaned, stats = clean_lines(lines) + >>> len(cleaned) + 1 + >>> stats.comments_removed + 1 """ - cleaned = [] - stats = { + cleaned: list[str] = [] + stats: CleanStatsDict = { "total": 0, "kept": 0, "comments": 0, @@ -290,7 +489,7 @@ def clean_lines(lines: list[str]) -> tuple[list[str], CleanStats]: else: stats["invalid"] += 1 else: - cleaned.append(result.line) + cleaned.append(result.line) # type: ignore[arg-type] stats["kept"] += 1 return cleaned, CleanStats( @@ -309,12 +508,18 @@ def clean_file(input_path: str, output_path: str | None = None) -> CleanStats: """ Clean a single file. + Reads the input file, cleans all lines, and writes to output. + Args: input_path: Path to input file output_path: Path to output file (defaults to in-place modification) - + Returns: - CleanStats + CleanStats with counts of removed line types + + Example: + >>> stats = clean_file("raw.txt", "cleaned.txt") + >>> print(f"Kept {stats.kept_lines} of {stats.total_lines} lines") """ from pathlib import Path @@ -334,9 +539,9 @@ def clean_file(input_path: str, output_path: str | None = None) -> CleanStats: return stats -# ============================================================================ -# MAIN -# ============================================================================ +# ============================================================================= +# CLI INTERFACE +# ============================================================================= if __name__ == "__main__": import sys diff --git a/scripts/compiler.py b/scripts/compiler.py index 7dce313..c1cfcd4 100644 --- a/scripts/compiler.py +++ b/scripts/compiler.py @@ -5,14 +5,14 @@ This module is the core of the blocklist merging pipeline. It takes cleaned rules from multiple blocklists and produces a minimal, deduplicated output file. -DESIGN GOALS: +Core Goals (in priority order): 1. Maximum blocking coverage - Every domain that should be blocked, IS blocked 2. 
Minimum rule count - Smaller lists = faster loading, less memory in AdGuard Home 3. Only output blocking rules - No whitelist/exception rules (@@) in output -KEY INSIGHT - FORMAT COMPRESSION: +Key Insight - Format Compression: Instead of handling hosts, plain domains, and ABP rules separately, we CONVERT - everything to ABP format during parsing: + everything to ABP format during parsing:: 0.0.0.0 ads.example.com → ||ads.example.com^ ads.example.com → ||ads.example.com^ @@ -22,87 +22,164 @@ If we have ||example.com^, then ||sub.example.com^ becomes redundant regardless of whether it came from a hosts file or an ABP list. -MODIFIER-AWARE PRUNING: +Modifier-Aware Pruning: Not all subdomain rules can be pruned! AdGuard Home modifiers change behavior: - - $important → Child with $important must NOT be pruned by parent without it - - $badfilter → Never prune by a $badfilter parent (it disables rules, not blocks) - - $dnsrewrite → Never prune (has custom DNS response behavior) - - $denyallow → Never prune (excludes specific domains) - - $dnstype → Only prune if parent blocks ALL types - - $client/$ctag → Parent with restrictions can't prune unrestricted child - -WHITELIST HANDLING: + ============ ================================================================ + Modifier Behavior + ============ ================================================================ + $important Child with $important must NOT be pruned by parent without it + $badfilter Never prune by a $badfilter parent (it disables rules, not blocks) + $dnsrewrite Never prune (has custom DNS response behavior) + $denyallow Never prune (excludes specific domains) + $dnstype Only prune if parent blocks ALL types + $client/$ctag Parent with restrictions can't prune unrestricted child + ============ ================================================================ + +Whitelist Handling: @@rules (whitelist/exception rules) are used ONLY to remove conflicting blocking rules. The @@rules themselves are NOT output. This keeps the output file simple. - -See docs/LOGIC.md for detailed examples of each pruning rule. """ import re from dataclasses import dataclass from functools import lru_cache from pathlib import Path +from sys import intern +from typing import Final import tldextract -# Pre-configure tldextract for better performance (no updates check) +# ============================================================================= +# TYPE ALIASES +# ============================================================================= +# These make complex type signatures more readable throughout the codebase. + +#: A parsed ABP rule entry: (original_rule, modifiers_frozenset, is_wildcard) +RuleEntry = tuple[str, frozenset[str], bool] + +#: A TLD wildcard entry: (original_rule, modifiers_frozenset) +WildcardEntry = tuple[str, frozenset[str]] + +# ============================================================================= +# CONFIGURATION CONSTANTS +# ============================================================================= +# Named constants improve readability and make tuning easier. 
+ +#: LRU cache size for domain extraction (covers most unique domains in a run) +LRU_CACHE_SIZE: Final[int] = 65536 + +#: Pre-allocated empty frozenset to avoid repeated allocations +EMPTY_FROZENSET: Final[frozenset[str]] = frozenset() + +# Pre-configure tldextract for better performance (no online updates check) _tld_extract = tldextract.TLDExtract(suffix_list_urls=None) -# ============================================================================ +# ============================================================================= # REGEX PATTERNS -# ============================================================================ +# ============================================================================= +# Pre-compiled patterns for performance. Each pattern is documented with +# examples of what it matches. -# ABP pattern: ||[*.]domain^ (including IP addresses) -ABP_DOMAIN_PATTERN = re.compile( +#: ABP pattern: ||[*.]domain^ (including IP addresses) +#: Examples: ||example.com^, ||*.example.com^, @@||example.com^$important +ABP_DOMAIN_PATTERN: Final[re.Pattern[str]] = re.compile( r"^(@@)?\|\|" # Start: || or @@|| (group 1: exception marker) r"(\*\.)?" # Optional *. wildcard (group 2) - r"([^^\$|*\s]+)" # Domain/IP (group 3) + r"([^^$|*\s]+)" # Domain/IP (group 3) r"\^" # Separator ) -# Hosts format: IP domain [domain2 ...] -HOSTS_PATTERN = re.compile( +#: Hosts format: IP domain [domain2 ...] +#: Examples: 0.0.0.0 example.com, 127.0.0.1 ads.example.com tracking.example.com +HOSTS_PATTERN: Final[re.Pattern[str]] = re.compile( r"^([\d.:a-fA-F]+)\s+" # IP address (IPv4 or IPv6) r"(.+)$" # Rest of line (domains) ) -# Valid domain/IP for hosts -HOSTS_DOMAIN_PATTERN = re.compile(r"^[a-zA-Z0-9][\w.-]*$") +#: Valid domain/IP for hosts file entries +#: Examples: example.com, sub.example.com, my-domain.co.uk +HOSTS_DOMAIN_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^[a-zA-Z0-9][\w.-]*$" +) -# Plain domain (simple domain name, no special chars except . and -) -PLAIN_DOMAIN_PATTERN = re.compile( +#: Plain domain (simple domain name, no special chars except . and -) +#: Examples: example.com, sub.example.com (NOT: ||example.com^, 0.0.0.0 example.com) +PLAIN_DOMAIN_PATTERN: Final[re.Pattern[str]] = re.compile( r"^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?" r"(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$" ) -# Local/blocking IPs recognized in hosts format -BLOCKING_IPS = frozenset({ +# ============================================================================= +# DOMAIN CONSTANTS +# ============================================================================= + +#: Local/blocking IPs recognized in hosts format +#: These indicate the entry is meant to block the domain, not redirect it. 
+BLOCKING_IPS: Final[frozenset[str]] = frozenset({ "0.0.0.0", "127.0.0.1", "::1", "::0", "::", "0:0:0:0:0:0:0:0", "0:0:0:0:0:0:0:1", }) -# Local hostnames to skip -LOCAL_HOSTNAMES = frozenset({ +#: Local hostnames to skip (these appear in hosts files but shouldn't be blocked) +LOCAL_HOSTNAMES: Final[frozenset[str]] = frozenset({ "localhost", "localhost.localdomain", "local", "broadcasthost", "ip6-localhost", "ip6-loopback", "ip6-localnet", "ip6-mcastprefix", "ip6-allnodes", "ip6-allrouters", "ip6-allhosts", }) -# Modifiers with special behavior that should never be pruned -SPECIAL_BEHAVIOR_MODIFIERS = frozenset({"badfilter", "dnsrewrite", "denyallow"}) +# ============================================================================= +# MODIFIER CONSTANTS +# ============================================================================= -# Modifiers that restrict who is blocked -CLIENT_RESTRICTION_MODIFIERS = frozenset({"client", "ctag"}) +#: Modifiers with special behavior that should never be pruned. +#: These modifiers have effects that can't be covered by a parent rule. +SPECIAL_BEHAVIOR_MODIFIERS: Final[frozenset[str]] = frozenset({ + "badfilter", # Disables other rules (meta-modifier) + "dnsrewrite", # Custom DNS response (e.g., redirect to specific IP) + "denyallow", # Excludes specific domains from blocking +}) + +#: Modifiers that restrict who is blocked (client-specific rules). +#: A parent with these can't prune a child without them. +CLIENT_RESTRICTION_MODIFIERS: Final[frozenset[str]] = frozenset({ + "client", # Block only for specific client IP + "ctag", # Block only for specific client tag +}) -# ============================================================================ +# ============================================================================= # DATA STRUCTURES -# ============================================================================ +# ============================================================================= @dataclass(slots=True) class CompileStats: - """Statistics from compilation.""" + """ + Statistics from the compilation process. + + This dataclass tracks all metrics during rule compilation, + providing insight into how many rules were kept, pruned, or transformed. + + Attributes: + total_input: Total number of input lines processed + total_output: Total number of rules written to output + abp_kept: ABP-style rules kept in output + other_kept: Other rules (regex, etc.) kept in output + abp_subdomain_pruned: Subdomain rules pruned by parent rules + tld_wildcard_pruned: Rules pruned by TLD wildcards (e.g., ||*.autos^) + duplicate_pruned: Exact duplicate rules removed + whitelist_conflict_pruned: Rules removed due to whitelist conflicts + local_hostname_pruned: Local hostnames (localhost, etc.) 
skipped + formats_compressed: Hosts/plain domains converted to ABP format + malformed_discarded: Malformed rules (e.g., ||^) discarded + + Example: + >>> stats = CompileStats() + >>> stats.total_input = 1000 + >>> stats.abp_kept = 500 + >>> print(f"Kept {stats.abp_kept} of {stats.total_input}") + Kept 500 of 1000 + """ total_input: int = 0 total_output: int = 0 @@ -110,61 +187,108 @@ class CompileStats: abp_kept: int = 0 other_kept: int = 0 - # Pruning + # Pruning counts abp_subdomain_pruned: int = 0 tld_wildcard_pruned: int = 0 duplicate_pruned: int = 0 whitelist_conflict_pruned: int = 0 local_hostname_pruned: int = 0 - formats_compressed: int = 0 # Hosts/plain domains converted to ABP format - malformed_discarded: int = 0 # Malformed rules (e.g., ||^) discarded + formats_compressed: int = 0 + malformed_discarded: int = 0 -# ============================================================================ +# ============================================================================= # HELPER FUNCTIONS -# ============================================================================ +# ============================================================================= def normalize_domain(domain: str) -> str: - """Normalize domain to lowercase, stripped.""" - return domain.lower().strip().rstrip(".") + """ + Normalize a domain to lowercase, stripped of whitespace and trailing dots. + + Uses sys.intern() to deduplicate domain strings in memory, which also + speeds up dictionary lookups (pointer comparison vs string comparison). + + Args: + domain: The domain string to normalize + + Returns: + Normalized and interned domain string + + Example: + >>> normalize_domain(" Example.COM. ") + 'example.com' + """ + return intern(domain.lower().strip().rstrip(".")) -def extract_abp_info(rule: str) -> tuple[str | None, frozenset, bool, bool]: +def extract_abp_info(rule: str) -> tuple[str | None, frozenset[str], bool, bool]: """ Extract domain, modifiers, exception status, and wildcard status from ABP rule. + Args: + rule: An ABP-style rule string + Returns: - (domain, modifiers, is_exception, is_wildcard) + A tuple of (domain, modifiers, is_exception, is_wildcard): + - domain: The extracted domain, or None if parsing failed + - modifiers: Frozenset of modifier names (lowercase) + - is_exception: True if this is a whitelist rule (@@) + - is_wildcard: True if this is a wildcard rule (||*.domain^) + + Example: + >>> extract_abp_info("||example.com^$important") + ('example.com', frozenset({'important'}), False, False) + >>> extract_abp_info("@@||*.example.com^") + ('example.com', frozenset(), True, True) """ match = ABP_DOMAIN_PATTERN.match(rule) if not match: - return None, frozenset(), False, False + return None, EMPTY_FROZENSET, False, False # Groups: (1) @@ exception, (2) *. wildcard, (3) domain is_exception = match.group(1) is not None is_wildcard = match.group(2) is not None domain = normalize_domain(match.group(3)) - # Extract modifiers - modifiers = set() - if "$" in rule: - mod_part = rule.split("$", 1)[1] - for mod in mod_part.split(","): - mod_name = mod.split("=")[0].strip().lower() - if mod_name.startswith("~"): - mod_name = mod_name[1:] - if mod_name: - modifiers.add(mod_name) + # Extract modifiers from $modifier1,modifier2,... 
+ # Fast path: no $ means no modifiers (common case) + if "$" not in rule: + return domain, EMPTY_FROZENSET, is_exception, is_wildcard - return domain, frozenset(modifiers), is_exception, is_wildcard + modifiers: set[str] = set() + mod_part = rule.split("$", 1)[1] + for mod in mod_part.split(","): + mod_name = mod.split("=")[0].strip().lower() + # Handle negation prefix (e.g., ~third-party) + if mod_name.startswith("~"): + mod_name = mod_name[1:] + if mod_name: + modifiers.add(mod_name) + + return domain, frozenset(modifiers) if modifiers else EMPTY_FROZENSET, is_exception, is_wildcard def extract_hosts_info(rule: str) -> tuple[str | None, list[str]]: """ Extract IP and domains from hosts-style rule. + Args: + rule: A hosts-style rule string (e.g., "0.0.0.0 example.com") + Returns: - (ip, [domains]) or (None, []) if not valid hosts + A tuple of (ip, domains): + - ip: The IP address, or None if not a valid hosts rule + - domains: List of domain names (may be empty) + + Note: + Only "blocking" IPs (0.0.0.0, 127.0.0.1, etc.) are recognized. + Real IPs like 8.8.8.8 are ignored as they indicate redirects, not blocks. + + Example: + >>> extract_hosts_info("0.0.0.0 example.com ads.example.com") + ('0.0.0.0', ['example.com', 'ads.example.com']) + >>> extract_hosts_info("8.8.8.8 dns.google") # Real IP, not blocking + (None, []) """ match = HOSTS_PATTERN.match(rule) if not match: @@ -173,11 +297,11 @@ def extract_hosts_info(rule: str) -> tuple[str | None, list[str]]: ip = match.group(1) rest = match.group(2) - # Only process blocking IPs + # Only process blocking IPs (0.0.0.0, 127.x.x.x, ::, etc.) if ip not in BLOCKING_IPS and not ip.startswith("0.") and not ip.startswith("127."): return None, [] - domains = [] + domains: list[str] = [] for part in rest.split(): # Stop at comments if part.startswith("#"): @@ -190,35 +314,89 @@ def extract_hosts_info(rule: str) -> tuple[str | None, list[str]]: return ip, domains -@lru_cache(maxsize=65536) +@lru_cache(maxsize=LRU_CACHE_SIZE) def _extract_domain_parts(domain: str) -> tuple[str, str, str]: - """Cached tldextract extraction. Returns (subdomain, domain, suffix).""" + """ + Cached tldextract extraction. + + Uses LRU cache to avoid repeated expensive tldextract calls for the same domain. + + Args: + domain: Full domain to parse + + Returns: + Tuple of (subdomain, domain, suffix) + + Example: + >>> _extract_domain_parts("sub.example.co.uk") + ('sub', 'example', 'co.uk') + """ ext = _tld_extract(domain) return ext.subdomain, ext.domain, ext.suffix def get_tld(domain: str) -> str | None: - """Get the TLD (suffix) of a domain.""" + """ + Get the TLD (suffix) of a domain. + + Args: + domain: The domain to extract TLD from + + Returns: + The TLD string, or None if not found + + Example: + >>> get_tld("example.com") + 'com' + >>> get_tld("example.co.uk") + 'co.uk' + """ _, _, suffix = _extract_domain_parts(domain) return suffix if suffix else None def get_registered_domain(domain: str) -> str | None: - """Get registered domain (domain.tld) from full domain.""" + """ + Get registered domain (domain.tld) from full domain. 
+ + Args: + domain: Full domain including subdomains + + Returns: + The registered domain (e.g., "example.com"), or None if not found + + Example: + >>> get_registered_domain("sub.example.com") + 'example.com' + >>> get_registered_domain("deep.sub.example.co.uk") + 'example.co.uk' + """ _, dom, suffix = _extract_domain_parts(domain) if suffix and dom: return f"{dom}.{suffix}" return None -@lru_cache(maxsize=65536) +@lru_cache(maxsize=LRU_CACHE_SIZE) def walk_parent_domains(domain: str) -> tuple[str, ...]: """ Walk up the domain hierarchy to find all parent domains. - Example: "a.b.example.com" -> ("b.example.com", "example.com") - - Returns tuple for hashability (caching). + Args: + domain: The domain to find parents for + + Returns: + Tuple of parent domains, from most specific to least specific. + Returns empty tuple for apex domains (no parents). + + Note: + Returns tuple (not list) for hashability, enabling LRU caching. + + Example: + >>> walk_parent_domains("a.b.example.com") + ('b.example.com', 'example.com') + >>> walk_parent_domains("example.com") # Apex domain + () """ subdomain, dom, suffix = _extract_domain_parts(domain) if not suffix or not dom: @@ -230,8 +408,9 @@ def walk_parent_domains(domain: str) -> tuple[str, ...]: return () parts = subdomain.split(".") - parents = [] - # Build parents from most specific to least + parents: list[str] = [] + + # Build parents from most specific to least specific for i in range(1, len(parts) + 1): if i == len(parts): parents.append(registered) @@ -246,15 +425,34 @@ def should_prune_by_modifiers(child_mods: frozenset[str], parent_mods: frozenset """ Determine if a child rule is redundant given the parent's modifiers. - Returns True if child can be safely pruned (parent covers it). + This function implements the modifier-aware pruning logic that ensures + we don't incorrectly remove rules with special behavior. - Key rules: - - $badfilter parent: Never prune (it disables rules, doesn't block) - - $important child: Keep if parent lacks $important (child takes priority) - - $dnsrewrite/$denyallow/$badfilter child: Never prune (special behavior) - - $dnstype mismatch: Child blocking ALL types not covered by parent blocking ONE type - - $client/$ctag parent: Child without restrictions blocks more broadly + Args: + child_mods: Modifiers on the child (subdomain) rule + parent_mods: Modifiers on the parent rule + + Returns: + True if child can be safely pruned (parent covers it), False otherwise + + Pruning Rules: + 1. $badfilter parent → Never prune (it disables rules, doesn't block) + 2. $important child → Keep if parent lacks $important + 3. $dnsrewrite/$denyallow/$badfilter child → Never prune (special behavior) + 4. $dnstype mismatch → Child blocking ALL types not covered by parent blocking ONE + 5. 
$client/$ctag parent → Child without restrictions blocks more broadly + + Example: + >>> should_prune_by_modifiers(frozenset(), frozenset()) + True + >>> should_prune_by_modifiers(frozenset({'important'}), frozenset()) + False # Child's $important takes priority """ + # Fast path: no modifiers on either side (most common case ~90%+) + # This avoids all the set operations below + if not child_mods and not parent_mods: + return True + # $badfilter disables rules, it doesn't block anything if "badfilter" in parent_mods: return False @@ -283,19 +481,38 @@ def should_prune_by_modifiers(child_mods: frozenset[str], parent_mods: frozenset return True -# ============================================================================ +# ============================================================================= # MAIN COMPILATION -# ============================================================================ +# ============================================================================= def compile_rules( lines: list[str], output_file: str, ) -> CompileStats: """ - Compile and deduplicate rules with two-phase approach. + Compile and deduplicate rules with format compression. - Phase 1: Collect all rules and build lookup structures - Phase 2: Filter and deduplicate + This is the main entry point for the compiler. It processes input lines through + multiple phases to produce a minimal, deduplicated output file. + + Args: + lines: List of rule strings to compile + output_file: Path to write the compiled output + + Returns: + CompileStats with metrics about the compilation process + + Pipeline Phases: + 1. **Parse & Compress**: Parse all rules, converting hosts/plain to ABP format + 2. **Build Lookups**: Create efficient lookup structures for pruning + 3. **Prune**: Remove redundant subdomain and whitelist-conflicted rules + 4. **Output**: Write deduplicated rules atomically + + Example: + >>> lines = ["||example.com^", "||sub.example.com^", "0.0.0.0 other.example.com"] + >>> stats = compile_rules(lines, "output.txt") + >>> print(f"Reduced {stats.total_input} to {stats.total_output} rules") + Reduced 3 to 1 rules """ stats = CompileStats() @@ -304,25 +521,27 @@ def compile_rules( # ========================================================================= # ABP blocking rules: domain -> (original_rule, modifiers, is_wildcard) - abp_rules: dict[str, tuple[str, frozenset, bool]] = {} - abp_wildcards: dict[str, tuple[str, frozenset]] = {} # TLD wildcards: tld -> rule + abp_rules: dict[str, RuleEntry] = {} + abp_wildcards: dict[str, WildcardEntry] = {} # TLD wildcards: tld -> rule # Whitelisted domains (from @@rules) allow_domains: set[str] = set() - # Other rules (regex, partial matches, etc.) - other_rules: list[str] = [] + # Other rules (regex, partial matches, etc.) 
- use set for inline dedup + other_rules: set[str] = set() for line in lines: stats.total_input += 1 - line = line.strip() - if not line: + + # Early exit for empty lines (walrus operator avoids assignment for empty) + if not (line := line.strip()): continue - # ===================================================================== + # ----------------------------------------------------------------- # ABP-style rules (highest priority) - # ===================================================================== - if line.startswith("||") or line.startswith("@@||"): + # ----------------------------------------------------------------- + # Use tuple form for single startswith call (faster than OR) + if line.startswith(("||", "@@||")): domain, modifiers, is_exception, is_wildcard = extract_abp_info(line) if not domain: @@ -369,9 +588,9 @@ def compile_rules( # Can't extract domain from non-ABP exceptions, skip for now continue - # ===================================================================== + # ----------------------------------------------------------------- # Hosts-style rules - COMPRESS TO ABP FORMAT - # ===================================================================== + # ----------------------------------------------------------------- ip, domains = extract_hosts_info(line) if ip and domains: for domain in domains: @@ -383,34 +602,37 @@ def compile_rules( # Convert to ABP format: 0.0.0.0 example.com → ||example.com^ abp_rule = f"||{domain}^" if domain not in abp_rules: - abp_rules[domain] = (abp_rule, frozenset(), False) + abp_rules[domain] = (abp_rule, EMPTY_FROZENSET, False) stats.formats_compressed += 1 else: stats.duplicate_pruned += 1 continue - # ===================================================================== + # ----------------------------------------------------------------- # Plain domain - COMPRESS TO ABP FORMAT - # ===================================================================== + # ----------------------------------------------------------------- if PLAIN_DOMAIN_PATTERN.match(line): domain = normalize_domain(line) if domain and domain not in LOCAL_HOSTNAMES: # Convert to ABP format: example.com → ||example.com^ abp_rule = f"||{domain}^" if domain not in abp_rules: - abp_rules[domain] = (abp_rule, frozenset(), False) - stats.formats_compressed += 1 # Reusing stat for both hosts and plain + abp_rules[domain] = (abp_rule, EMPTY_FROZENSET, False) + stats.formats_compressed += 1 else: stats.duplicate_pruned += 1 else: stats.local_hostname_pruned += 1 continue - # ===================================================================== - # Other (regex, etc.) - # ===================================================================== + # ----------------------------------------------------------------- + # Other (regex, etc.) - inline duplicate check with set + # ----------------------------------------------------------------- if line.startswith("/") or "|" in line or "*" in line: - other_rules.append(line) + if line not in other_rules: + other_rules.add(line) + else: + stats.duplicate_pruned += 1 continue # ========================================================================= @@ -446,7 +668,8 @@ def is_covered_by_abp(domain: str) -> bool: return False def is_whitelisted(domain: str) -> bool: - """Check if domain is whitelisted. + """ + Check if domain is whitelisted. A domain is whitelisted if: 1. 
It's directly in allow_domains (@@||domain^) @@ -469,7 +692,7 @@ def is_whitelisted(domain: str) -> bool: # PHASE 3: Prune ABP subdomain rules # ========================================================================= - pruned_abp: dict[str, tuple[str, frozenset, bool]] = {} + pruned_abp: dict[str, RuleEntry] = {} for domain, (rule, modifiers, is_wildcard) in abp_rules.items(): # Skip if whitelisted @@ -519,20 +742,8 @@ def is_whitelisted(domain: str) -> bool: else: pruned_abp[domain] = (rule, modifiers, is_wildcard) - # ========================================================================= - # PHASE 4: Deduplicate other rules (regex, partial matches, etc.) - # ========================================================================= - - seen_other: set[str] = set() - kept_other: list[str] = [] - for rule in other_rules: - if rule not in seen_other: - seen_other.add(rule) - kept_other.append(rule) - else: - stats.duplicate_pruned += 1 - - # NOTE: Whitelist/exception rules (@@) are intentionally NOT output. + # NOTE: other_rules is already deduplicated (used set during parse) + # Whitelist/exception rules (@@) are intentionally NOT output. # They were only used internally to remove conflicting blocking rules. # The final output contains only blocking rules. @@ -560,8 +771,8 @@ def is_whitelisted(domain: str) -> bool: f.write(rule + "\n") stats.abp_kept += 1 - # Other rules (regex, partial matches, etc.) - for rule in kept_other: + # Other rules (regex, partial matches, etc.) - already deduplicated + for rule in other_rules: f.write(rule + "\n") stats.other_kept += 1 @@ -573,9 +784,9 @@ def is_whitelisted(domain: str) -> bool: return stats -# ============================================================================ -# MAIN -# ============================================================================ +# ============================================================================= +# CLI INTERFACE +# ============================================================================= if __name__ == "__main__": import sys @@ -606,4 +817,3 @@ def is_whitelisted(domain: str) -> bool: print(f" Duplicates: {stats.duplicate_pruned:,}") print(f" Whitelist conflicts: {stats.whitelist_conflict_pruned:,}") print(f" Local hostnames: {stats.local_hostname_pruned:,}") - diff --git a/scripts/downloader.py b/scripts/downloader.py index a8ad0ca..e8944d2 100644 --- a/scripts/downloader.py +++ b/scripts/downloader.py @@ -5,8 +5,21 @@ Downloads blocklists with ETag/Last-Modified caching and concurrent fetching. Falls back to cached files if download fails. 
+Features: + - Async downloads with aiohttp for high concurrency + - ETag/Last-Modified caching to avoid re-downloading unchanged files + - Automatic retry with exponential backoff + - Graceful fallback to cached files on error + - Progress tracking and detailed statistics + Usage: python -m scripts.downloader --sources sources.txt --outdir data/ --cache .cache + +Example: + >>> from scripts.downloader import fetch_all + >>> import asyncio + >>> results = asyncio.run(fetch_all(urls, output_dir, cache_dir, 8, 30, 3)) + >>> print(f"Downloaded {sum(r.success for r in results)} of {len(results)} sources") """ import argparse @@ -16,31 +29,74 @@ import sys import time from pathlib import Path -from typing import NamedTuple +from typing import Final, NamedTuple import aiohttp import aiofiles -# Default configuration -DEFAULT_TIMEOUT = 30 # seconds per request -DEFAULT_RETRIES = 3 # attempts before giving up -DEFAULT_CONCURRENCY = 8 # simultaneous connections +# ============================================================================= +# CONFIGURATION CONSTANTS +# ============================================================================= + +#: Default timeout per HTTP request in seconds +DEFAULT_TIMEOUT: Final[int] = 30 + +#: Default number of retry attempts before giving up +DEFAULT_RETRIES: Final[int] = 3 + +#: Default number of simultaneous HTTP connections +DEFAULT_CONCURRENCY: Final[int] = 8 -# State file for ETag/Last-Modified tracking -STATE_FILE = "state.json" +#: State file name for ETag/Last-Modified tracking +STATE_FILE: Final[str] = "state.json" +# ============================================================================= +# DATA STRUCTURES +# ============================================================================= + class FetchResult(NamedTuple): - """Result of a single fetch operation.""" + """ + Result of a single fetch operation. + + Attributes: + url: The URL that was fetched + success: True if fetch succeeded (includes cache fallback) + changed: True if content changed (False for 304 Not Modified) + error: Error message if something went wrong, None otherwise + + Example: + >>> result = FetchResult("https://example.com/list.txt", True, True, None) + >>> result.success + True + """ url: str success: bool changed: bool error: str | None = None +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + def url_to_filename(url: str) -> str: - """Generate a safe, unique filename from a URL.""" + """ + Generate a safe, unique filename from a URL. + + Uses SHA256 hash for uniqueness and extracts domain for readability. + + Args: + url: The source URL + + Returns: + Safe filename like "example_com_a1b2c3d4.txt" + + Example: + >>> url_to_filename("https://example.com/blocklist.txt") + 'example_com_a1b2c3d4e5f6g7h8.txt' + """ # Use SHA256 hash for uniqueness, take first 16 chars url_hash = hashlib.sha256(url.encode()).hexdigest()[:16] # Extract domain for readability @@ -53,8 +109,25 @@ def url_to_filename(url: str) -> str: def load_sources(sources_file: str) -> list[str]: - """Load URLs from sources file, skipping comments and empty lines.""" - urls = [] + """ + Load URLs from sources file, skipping comments and empty lines. + + Args: + sources_file: Path to the sources.txt file + + Returns: + List of URLs to fetch + + Note: + Lines starting with # are treated as comments. + Inline comments (after #) are also stripped. 
+ + Example: + >>> urls = load_sources("config/sources.txt") + >>> len(urls) + 80 + """ + urls: list[str] = [] path = Path(sources_file) if not path.exists(): print(f"ERROR: Sources file not found: {sources_file}", file=sys.stderr) @@ -72,8 +145,21 @@ def load_sources(sources_file: str) -> list[str]: return urls -def load_state(cache_dir: Path) -> dict: - """Load state.json containing ETag/Last-Modified cache.""" +def load_state(cache_dir: Path) -> dict[str, dict[str, str]]: + """ + Load state.json containing ETag/Last-Modified cache. + + Args: + cache_dir: Directory containing the state file + + Returns: + State dictionary mapping URLs to their cached headers + + Example: + >>> state = load_state(Path(".cache")) + >>> state.get("https://example.com/list.txt", {}).get("etag") + '"abc123"' + """ state_path = cache_dir / STATE_FILE if state_path.exists(): try: @@ -84,8 +170,17 @@ def load_state(cache_dir: Path) -> dict: return {} -def save_state(cache_dir: Path, state: dict) -> None: - """Save state.json atomically.""" +def save_state(cache_dir: Path, state: dict[str, dict[str, str]]) -> None: + """ + Save state.json atomically. + + Uses a temporary file and atomic rename to prevent corruption if + the process is interrupted during write. + + Args: + cache_dir: Directory to save state file in + state: State dictionary to save + """ state_path = cache_dir / STATE_FILE temp_path = state_path.with_suffix(".tmp") try: @@ -96,31 +191,50 @@ def save_state(cache_dir: Path, state: dict) -> None: print(f"Warning: Could not save state.json: {e}", file=sys.stderr) +# ============================================================================= +# ASYNC FETCH FUNCTIONS +# ============================================================================= + async def fetch_url( session: aiohttp.ClientSession, url: str, output_dir: Path, cache_dir: Path, - state: dict, + state: dict[str, dict[str, str]], timeout: int, retries: int, ) -> FetchResult: """ Fetch a single URL with ETag/Last-Modified caching. + Uses conditional requests (If-None-Match, If-Modified-Since) to avoid + re-downloading unchanged content. Falls back to cached files on error. + + Args: + session: aiohttp session for connection pooling + url: URL to fetch + output_dir: Directory to save fetched files + cache_dir: Directory for cached files and state + state: Mutable state dict for tracking ETags + timeout: Request timeout in seconds + retries: Number of retry attempts + Returns: FetchResult with success/changed status + + Note: + This function modifies `state` in-place when new ETags are received. """ filename = url_to_filename(url) output_path = output_dir / filename cache_path = cache_dir / filename - # Get cached headers + # Get cached headers for conditional request url_state = state.get(url, {}) etag = url_state.get("etag") last_modified = url_state.get("last_modified") - headers = {} + headers: dict[str, str] = {} if etag: headers["If-None-Match"] = etag if last_modified: @@ -180,7 +294,7 @@ async def fetch_url( await f.write(content) # Update state with new ETag/Last-Modified - new_state = {} + new_state: dict[str, str] = {} if "ETag" in response.headers: new_state["etag"] = response.headers["ETag"] if "Last-Modified" in response.headers: @@ -231,7 +345,27 @@ async def fetch_all( timeout: int, retries: int, ) -> list[FetchResult]: - """Fetch all URLs concurrently with rate limiting.""" + """ + Fetch all URLs concurrently with rate limiting. 
+ + Uses a semaphore to control maximum concurrent connections, + preventing overwhelming the network or servers. + + Args: + urls: List of URLs to fetch + output_dir: Directory to save fetched files + cache_dir: Directory for cached files and state + concurrency: Maximum concurrent connections + timeout: Request timeout in seconds per URL + retries: Number of retry attempts per URL + + Returns: + List of FetchResult for each URL + + Example: + >>> results = await fetch_all(urls, Path("data"), Path(".cache"), 8, 30, 3) + >>> success_count = sum(r.success for r in results) + """ # Ensure directories exist output_dir.mkdir(parents=True, exist_ok=True) cache_dir.mkdir(parents=True, exist_ok=True) @@ -255,7 +389,7 @@ async def fetch_with_semaphore(url: str) -> FetchResult: results = await asyncio.gather(*tasks, return_exceptions=True) # Handle exceptions in results - final_results = [] + final_results: list[FetchResult] = [] for i, result in enumerate(results): if isinstance(result, Exception): final_results.append(FetchResult(urls[i], success=False, changed=False, error=str(result))) @@ -268,8 +402,17 @@ async def fetch_with_semaphore(url: str) -> FetchResult: return final_results +# ============================================================================= +# CLI INTERFACE +# ============================================================================= + def main() -> int: - """Main entry point.""" + """ + Main entry point for CLI usage. + + Returns: + Exit code (0 for success, 1 for failure) + """ parser = argparse.ArgumentParser(description="Fetch blocklist sources with caching") parser.add_argument("--sources", required=True, help="Path to sources.txt file") parser.add_argument("--outdir", required=True, help="Output directory for fetched files") diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 475b1db..18fa519 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -1,44 +1,100 @@ #!/usr/bin/env python3 """ -pipeline.py +pipeline.py - Main Processing Pipeline for Blocklist Compilation -Main processing pipeline for blocklist compilation. +This is the orchestrator that ties together the cleaning and compilation stages. +It reads raw blocklist files, processes them through the pipeline, and outputs +a unified, deduplicated blocklist. + +Pipeline Stages: + 1. **Read**: Load all .txt files from input directory + 2. **Clean**: Remove comments, cosmetic rules, unsupported modifiers + 3. **Compile**: Compress formats, deduplicate, prune subdomains + 4. **Write**: Output merged blocklist with statistics Usage: python -m scripts.pipeline -Pipeline stages: -1. Read all files from input_dir -2. Clean each rule (remove comments, cosmetic, unsupported modifiers) -3. Compile (compress formats, deduplicate, prune subdomains) -4. Write merged output +Example: + >>> from scripts.pipeline import process_files + >>> stats = process_files("lists/_raw", "lists/merged.txt") + >>> print(f"Output {stats['lines_output']:,} rules") """ +import json import sys import time from pathlib import Path +from typing import TypedDict from scripts.cleaner import clean_line from scripts.compiler import compile_rules -def process_files(input_dir: str, output_file: str) -> dict[str, int]: +# ============================================================================= +# DATA STRUCTURES +# ============================================================================= + +class PipelineStats(TypedDict): + """ + Statistics collected during pipeline execution. 
+ + Provides detailed metrics about each stage of processing, + useful for monitoring and debugging. + """ + files_processed: int + lines_raw: int + lines_clean: int + lines_output: int + comments_removed: int + cosmetic_removed: int + unsupported_removed: int + empty_removed: int + trimmed: int + abp_subdomain_pruned: int + tld_wildcard_pruned: int + duplicate_pruned: int + whitelist_conflict_pruned: int + local_hostname_pruned: int + formats_compressed: int + abp_kept: int + other_kept: int + + +# ============================================================================= +# PIPELINE FUNCTIONS +# ============================================================================= + +def process_files(input_dir: str, output_file: str) -> PipelineStats: """ Run the full pipeline on input directory. + Orchestrates the complete blocklist compilation process: + 1. Reads all .txt files from input_dir + 2. Cleans each line (removes comments, cosmetic, unsupported modifiers) + 3. Compiles all lines (compresses formats, deduplicates) + 4. Writes output and returns statistics + Args: input_dir: Directory containing raw blocklist files output_file: Path to output merged list - + Returns: - Statistics dictionary + PipelineStats with detailed metrics from all stages + + Raises: + FileNotFoundError: If input_dir doesn't exist + + Example: + >>> stats = process_files("lists/_raw", "lists/merged.txt") + >>> print(f"Reduced {stats['lines_raw']:,} to {stats['lines_output']:,}") """ input_path = Path(input_dir) if not input_path.is_dir(): raise FileNotFoundError(f"Input directory not found: {input_dir}") - stats = { + stats: PipelineStats = { "files_processed": 0, "lines_raw": 0, "lines_clean": 0, @@ -54,20 +110,24 @@ def process_files(input_dir: str, output_file: str) -> dict[str, int]: "whitelist_conflict_pruned": 0, "local_hostname_pruned": 0, "formats_compressed": 0, + "abp_kept": 0, + "other_kept": 0, } # ========================================================================= - # Stage 1: Read and clean all files + # STAGE 1: Read and clean all files # ========================================================================= print("📖 Stage 1: Reading and cleaning files...") stage1_start = time.time() all_cleaned: list[str] = [] + files = sorted(input_path.glob("*.txt")) - # Process .txt files - for file in sorted(input_path.glob("*.txt")): + # Process files (deterministic order for reproducibility) + for file in files: stats["files_processed"] += 1 + # Read and clean each file with open(file, encoding="utf-8-sig", errors="replace") as f: for line in f: stats["lines_raw"] += 1 @@ -79,16 +139,17 @@ def process_files(input_dir: str, output_file: str) -> dict[str, int]: stats["trimmed"] += 1 if result.discarded: - if result.reason == "comment": - stats["comments_removed"] += 1 - elif result.reason == "cosmetic": - stats["cosmetic_removed"] += 1 - elif result.reason == "unsupported_modifier": - stats["unsupported_removed"] += 1 - elif result.reason == "empty": - stats["empty_removed"] += 1 + match result.reason: + case "comment": + stats["comments_removed"] += 1 + case "cosmetic": + stats["cosmetic_removed"] += 1 + case "unsupported_modifier": + stats["unsupported_removed"] += 1 + case "empty": + stats["empty_removed"] += 1 else: - all_cleaned.append(result.line) + all_cleaned.append(result.line) # type: ignore[arg-type] stats["lines_clean"] = len(all_cleaned) stage1_time = time.time() - stage1_start @@ -96,13 +157,14 @@ def process_files(input_dir: str, output_file: str) -> dict[str, int]: print(f" Kept 

     # =========================================================================
-    # Stage 2: Compile and deduplicate
+    # STAGE 2: Compile and deduplicate
     # =========================================================================
     print("\n⚙️  Stage 2: Compiling and deduplicating...")
     stage2_start = time.time()

     compile_stats = compile_rules(all_cleaned, output_file)

+    # Transfer compilation stats
     stats["lines_output"] = compile_stats.total_output
     stats["abp_subdomain_pruned"] = compile_stats.abp_subdomain_pruned
     stats["tld_wildcard_pruned"] = compile_stats.tld_wildcard_pruned
@@ -111,7 +173,7 @@
     stats["local_hostname_pruned"] = compile_stats.local_hostname_pruned
     stats["formats_compressed"] = compile_stats.formats_compressed

-    # Add format breakdown
+    # Format breakdown
     stats["abp_kept"] = compile_stats.abp_kept
     stats["other_kept"] = compile_stats.other_kept

@@ -121,8 +183,16 @@
     return stats


-def print_summary(stats: dict[str, int]) -> None:
-    """Print formatted summary."""
+def print_summary(stats: PipelineStats) -> None:
+    """
+    Print formatted summary of pipeline execution.
+
+    Displays a comprehensive breakdown of what was processed,
+    what was removed at each stage, and the final output.
+
+    Args:
+        stats: PipelineStats from process_files()
+    """
     print("\n" + "=" * 60)
     print("📊 PIPELINE SUMMARY")
     print("=" * 60)
@@ -158,18 +228,63 @@
     print(f"   Local hostnames:    {stats['local_hostname_pruned']:>10,}")

     print(f"\n📦 Output breakdown:")
-    print(f"   ABP rules:        {stats.get('abp_kept', 0):>10,} (incl. {stats.get('formats_compressed', 0):,} compressed)")
-    print(f"   Other rules:      {stats.get('other_kept', 0):>10,}")
+    print(f"   ABP rules:        {stats['abp_kept']:>10,} (incl. {stats['formats_compressed']:,} compressed)")
+    print(f"   Other rules:      {stats['other_kept']:>10,}")


+def save_stats_json(stats: PipelineStats, output_path: str, total_time: float) -> None:
+    """
+    Save pipeline statistics to a JSON file.
+
+    Args:
+        stats: Pipeline statistics dictionary
+        output_path: Path to write JSON file
+        total_time: Total execution time in seconds
+    """
+    output = {
+        "version": "1.4.0",
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "execution_time_seconds": round(total_time, 2),
+        "statistics": dict(stats),
+    }
+
+    path = Path(output_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(output, f, indent=2)
+
+
+# =============================================================================
+# CLI INTERFACE
+# =============================================================================
+
 def main() -> int:
-    """Main entry point."""
-    if len(sys.argv) < 3:
-        print("Usage: python -m scripts.pipeline <input_dir> <output_file>")
+    """
+    Main entry point for CLI usage.
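+
+    Expects two positional arguments plus an optional --json-stats flag.
+    A typical invocation looks like this (paths shown are illustrative):
+
+        python -m scripts.pipeline lists/_raw lists/merged.txt --json-stats stats.json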
+
+    Returns:
+        Exit code (0 for success, 1 for error, 2 for usage error)
+    """
+    # Parse arguments
+    args = sys.argv[1:]
+    json_stats_path: str | None = None
+
+    # Check for --json-stats flag
+    if "--json-stats" in args:
+        idx = args.index("--json-stats")
+        if idx + 1 < len(args):
+            json_stats_path = args[idx + 1]
+            args = args[:idx] + args[idx + 2:]
+        else:
+            print("Error: --json-stats requires a path argument", file=sys.stderr)
+            return 2
+
+    if len(args) < 2:
+        print("Usage: python -m scripts.pipeline <input_dir> <output_file> [--json-stats <path>]")
         return 2

-    input_dir = sys.argv[1]
-    output_file = sys.argv[2]
+    input_dir = args[0]
+    output_file = args[1]

     try:
         print("🚀 Starting blocklist pipeline...")
@@ -181,6 +296,12 @@
         print_summary(stats)

         print(f"\n⏱️  Total time: {total_time:.1f}s")
+
+        # Save JSON stats if requested
+        if json_stats_path:
+            save_stats_json(stats, json_stats_path, total_time)
+            print(f"📊 Stats saved to: {json_stats_path}")
+
         print("✅ Pipeline completed successfully!")

         return 0
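+
+        # Illustrative only: downstream tooling can consume the JSON report written
+        # by save_stats_json(). A minimal sketch, assuming the run above was invoked
+        # with --json-stats stats.json:
+        #
+        #     import json
+        #     with open("stats.json", encoding="utf-8") as f:
+        #         report = json.load(f)
+        #     print(report["statistics"]["lines_output"], "rules in the final list")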