diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 08bc663..835f9bd 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -4,12 +4,29 @@ updates:
     directory: "/"
     schedule:
       interval: "weekly"
+      day: "monday"
     commit-message:
       prefix: "ci"
+    labels:
+      - "dependencies"
+      - "github-actions"
+    groups:
+      actions:
+        patterns:
+          - "*"
+
   # Python dependencies
   - package-ecosystem: "pip"
     directory: "/"
     schedule:
       interval: "weekly"
+      day: "monday"
     commit-message:
       prefix: "deps"
+    labels:
+      - "dependencies"
+      - "python"
+    groups:
+      python-deps:
+        patterns:
+          - "*"
diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml
index e6f9dd5..af9a31c 100644
--- a/.github/workflows/update.yml
+++ b/.github/workflows/update.yml
@@ -12,6 +12,7 @@ env:
 
 permissions:
   contents: write
+  actions: write
 
 concurrency:
   group: blocklist-update
@@ -24,14 +25,14 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 1
 
       - name: Setup Python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
         with:
-          python-version: "3.12"
+          python-version: "3.14"
           cache: pip
 
       - name: Install Dependencies
@@ -39,13 +40,14 @@ jobs:
 
       - name: Get Cache Key
         id: cache-key
-        run: echo "week=$(date -u +%Y-%V)" >> $GITHUB_OUTPUT
+        run: echo "date=$(date -u +'%Y-%m-%d_%H%M')" >> $GITHUB_OUTPUT
 
       - name: Restore Cache
-        uses: actions/cache@v5
+        id: cache-restore
+        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
         with:
           path: .cache
-          key: blocklists-${{ steps.cache-key.outputs.week }}
+          key: blocklists-${{ steps.cache-key.outputs.date }}
           restore-keys: blocklists-
 
       - name: Fetch Sources
@@ -87,7 +89,7 @@ jobs:
           echo "Validation passed: $RULES rules"
 
       - name: Publish Release
-        uses: softprops/action-gh-release@v2
+        uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2.5.0
         with:
           tag_name: latest
           name: Merged Blocklist
@@ -110,9 +112,28 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Cleanup Cache
-        if: always()
-        run: find .cache -mtime +7 -delete 2>/dev/null || true
+      - name: Save Cache
+        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
+        if: always() && steps.fetch.outcome == 'success'
+        with:
+          path: .cache
+          key: blocklists-${{ steps.cache-key.outputs.date }}
+
+      - name: Cleanup Old Caches
+        if: success()
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh cache list --repo ${{ github.repository }} \
+            --key blocklists- \
+            --sort created_at \
+            --order desc \
+            --limit 100 \
+            --json key \
+            --jq '.[3:] | .[].key' | while read -r key; do
+            echo "Deleting cache: $key"
+            gh cache delete "$key" --repo ${{ github.repository }} || true
+          done
 
       - name: Summary
         if: always()
@@ -127,4 +148,5 @@ jobs:
           | Sources | ${{ steps.fetch.outputs.count || 'N/A' }} |
           | Fetch Time | ${{ steps.fetch.outputs.time || '?' }} |
           | Compile Time | ${{ steps.compile.outputs.time || '?' }} |
+          | Cache | ${{ steps.cache-restore.outputs.cache-matched-key || 'New' }} |
           EOF
diff --git a/pyproject.toml b/pyproject.toml
index f46915d..a42d333 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,17 +2,16 @@
 name = "blocklist-merger"
 version = "1.0.0"
 description = "AdGuard Home blocklist compiler with intelligent deduplication"
-requires-python = ">=3.11"
+requires-python = ">=3.14"
 dependencies = [
-    "requests>=2.31.0",
-    "tldextract>=5.1.0",
-    "aiohttp>=3.9.0",
-    "aiofiles>=23.2.0",
+    "tldextract>=5.3.0",
+    "aiohttp>=3.13.0",
+    "aiofiles>=25.1.0",
 ]
 
 [project.optional-dependencies]
 dev = [
-    "pytest>=7.4.0",
+    "pytest>=9.0.0",
 ]
 
 [project.scripts]
diff --git a/scripts/cleaner.py b/scripts/cleaner.py
index 73cef32..4e3d98b 100644
--- a/scripts/cleaner.py
+++ b/scripts/cleaner.py
@@ -32,8 +32,6 @@
 4. Keep hosts, plain domains, and ABP rules with supported modifiers
 """
 
-from __future__ import annotations
-
 import re
 from typing import NamedTuple
 
@@ -42,17 +40,6 @@
 # MODIFIER DEFINITIONS (based on official AdGuard DNS filtering syntax docs)
 # ============================================================================
 
-# Modifiers supported by AdGuard Home DNS filtering
-SUPPORTED_MODIFIERS = frozenset({
-    "important",   # Increases rule priority
-    "badfilter",   # Disables matching rules
-    "dnsrewrite",  # Rewrites DNS responses
-    "denyallow",   # Excludes domains from blocking
-    "client",      # Restricts to specific clients
-    "dnstype",     # Filters by DNS record type
-    "ctag",        # Client tags (keeping for completeness, though rare in public lists)
-})
-
 # Modifiers that are browser-only and NOT supported by AGH
 # If a rule contains ANY of these, the ENTIRE RULE should be discarded
 UNSUPPORTED_MODIFIERS = frozenset({
@@ -84,7 +71,6 @@
     "app",
 
     # Method restrictions (browser-only)
     "method",
 
-    # Any other modifiers not in supported list
 })
@@ -106,17 +92,13 @@
 # Pattern to detect if a line is likely a comment
 COMMENT_PATTERN = re.compile(r"^\s*[#!]")
 
-# Pattern to extract trailing inline comment (be careful not to match URLs)
-# Only match # comments that are clearly at end of rule, not in URLs
+# Trailing inline comment: match "# comment" preceded by whitespace
 TRAILING_COMMENT_PATTERN = re.compile(r"\s+#\s+.*$")
 
 # Pattern to extract modifier section from ABP rule
 # Matches: $modifier1,modifier2,... at end of rule
 MODIFIER_PATTERN = re.compile(r"\$([^$]+)$")
 
-# Pattern to validate basic ABP format
-ABP_RULE_PATTERN = re.compile(r"^@@?\|\|[^\s|^]+\^")
-
 # ============================================================================
 # DATA STRUCTURES
 # ============================================================================
@@ -260,6 +242,7 @@ def clean_line(line: str) -> tuple[CleanResult, bool]:
         return CleanResult(None, True, "unsupported_modifier"), False
 
     # Handle rules with just $ and modifiers (no pattern)
+    # e.g., "$script,third-party" without a domain prefix
     if line.startswith("$") or ("|" not in line and "$" in line):
         modifiers = extract_modifiers(line)
         if modifiers and has_unsupported_modifiers(modifiers):
diff --git a/scripts/compiler.py b/scripts/compiler.py
index 4430291..2293ddc 100644
--- a/scripts/compiler.py
+++ b/scripts/compiler.py
@@ -39,10 +39,7 @@
 See docs/LOGIC.md for detailed examples of each pruning rule.
""" -from __future__ import annotations - import re -import sys from dataclasses import dataclass from functools import lru_cache from pathlib import Path @@ -56,12 +53,11 @@ # REGEX PATTERNS # ============================================================================ -# ABP domain pattern: ||domain^ or ||*.domain^ -# Also matches IP addresses like ||100.48.203.212^ +# ABP pattern: ||[*.]domain^ (including IP addresses) ABP_DOMAIN_PATTERN = re.compile( - r"^(@@)?\|\|" # Start with || or @@|| (capture @@ for exception check) - r"(\*\.)?" # Optional *. for wildcard - r"([^^\$|*\s]+)" # Domain/IP (anything except ^$|* or whitespace) + r"^(@@)?\|\|" # Start: || or @@|| (group 1: exception marker) + r"(\*\.)?" # Optional *. wildcard (group 2) + r"([^^\$|*\s]+)" # Domain/IP (group 3) r"\^" # Separator ) @@ -80,9 +76,10 @@ r"(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$" ) -# Local/blocking IPs in hosts format +# Local/blocking IPs recognized in hosts format BLOCKING_IPS = frozenset({ - "0.0.0.0", "127.0.0.1", "::1", "::0", "::","0:0:0:0:0:0:0:0", "0:0:0:0:0:0:0:1", + "0.0.0.0", "127.0.0.1", "::1", "::0", "::", + "0:0:0:0:0:0:0:0", "0:0:0:0:0:0:0:1", }) # Local hostnames to skip @@ -92,6 +89,12 @@ "ip6-mcastprefix", "ip6-allnodes", "ip6-allrouters", "ip6-allhosts", }) +# Modifiers with special behavior that should never be pruned +SPECIAL_BEHAVIOR_MODIFIERS = frozenset({"badfilter", "dnsrewrite", "denyallow"}) + +# Modifiers that restrict who is blocked +CLIENT_RESTRICTION_MODIFIERS = frozenset({"client", "ctag"}) + # ============================================================================ # DATA STRUCTURES @@ -136,7 +139,7 @@ def extract_abp_info(rule: str) -> tuple[str | None, frozenset, bool, bool]: if not match: return None, frozenset(), False, False - # Group 1: @@ (exception marker), Group 2: *. (wildcard), Group 3: domain + # Groups: (1) @@ exception, (2) *. wildcard, (3) domain is_exception = match.group(1) is not None is_wildcard = match.group(2) is not None domain = normalize_domain(match.group(3)) @@ -238,7 +241,7 @@ def walk_parent_domains(domain: str) -> tuple[str, ...]: return tuple(parents) -def should_prune_by_modifiers(child_mods: frozenset, parent_mods: frozenset) -> bool: +def should_prune_by_modifiers(child_mods: frozenset[str], parent_mods: frozenset[str]) -> bool: """ Determine if a child rule is redundant given the parent's modifiers. @@ -260,7 +263,7 @@ def should_prune_by_modifiers(child_mods: frozenset, parent_mods: frozenset) -> return False # Special behavior modifiers are never redundant - if child_mods & {"badfilter", "dnsrewrite", "denyallow"}: + if child_mods & SPECIAL_BEHAVIOR_MODIFIERS: return False # Handle $dnstype: parent blocking ALL types covers child blocking specific type, @@ -273,7 +276,7 @@ def should_prune_by_modifiers(child_mods: frozenset, parent_mods: frozenset) -> return False # Child blocks ALL types, parent only blocks one type # $client/$ctag restrict WHO is blocked. Unrestricted child is more general. 
-    if (parent_mods & {"client", "ctag"}) and not (child_mods & {"client", "ctag"}):
+    if (parent_mods & CLIENT_RESTRICTION_MODIFIERS) and not (child_mods & CLIENT_RESTRICTION_MODIFIERS):
         return False
 
     return True
@@ -303,9 +306,8 @@ def compile_rules(
     abp_rules: dict[str, tuple[str, frozenset, bool]] = {}
     abp_wildcards: dict[str, tuple[str, frozenset]] = {}  # TLD wildcards: tld -> rule
 
-    # Exception rules
-    allow_rules: list[str] = []
-    allow_domains: set[str] = set()  # Domains covered by @@rules
+    # Whitelisted domains (from @@rules)
+    allow_domains: set[str] = set()
 
     # Other rules (regex, partial matches, etc.)
     other_rules: list[str] = []
@@ -327,7 +329,6 @@ def compile_rules(
                 continue
 
             if is_exception:
-                allow_rules.append(line)
                 # Track whitelisted domains for conflict removal
                 if is_wildcard:
                     # @@||*.example.com^ - covers all subdomains
@@ -361,9 +362,9 @@ def compile_rules(
                     stats.duplicate_pruned += 1
                 continue
 
-        # Other exception rules
+        # Other exception rules (non-ABP format like /regex/)
         if line.startswith("@@"):
-            allow_rules.append(line)
+            # Can't extract domain from non-ABP exceptions, skip for now
             continue
 
         # =====================================================================
@@ -415,13 +416,11 @@ def compile_rules(
     # =========================================================================
 
     # All ABP blocking domains (for subdomain checks)
-    abp_blocking_domains: set[str] = set()
-    for domain, (rule, mods, is_wc) in abp_rules.items():
-        if not is_wc:  # Don't add wildcard keys like "*.example.com"
-            abp_blocking_domains.add(domain)
-        else:
-            # For wildcards, add the base domain for coverage checking
-            abp_blocking_domains.add(domain[2:])  # Remove "*."
+    # Use set comprehension for efficiency
+    abp_blocking_domains: set[str] = {
+        domain if not is_wc else domain[2:]  # Remove "*." prefix for wildcards
+        for domain, (_, _, is_wc) in abp_rules.items()
+    }
 
     # TLD wildcards
     tld_wildcards: set[str] = set(abp_wildcards.keys())
@@ -445,11 +444,21 @@ def is_covered_by_abp(domain: str) -> bool:
         return False
 
     def is_whitelisted(domain: str) -> bool:
-        """Check if domain is whitelisted."""
+        """Check if domain is whitelisted.
+
+        A domain is whitelisted if:
+        1. It's directly in allow_domains (@@||domain^)
+        2. Any parent domain is whitelisted (@@||parent^ covers subdomains)
+        3. A wildcard whitelist covers it (@@||*.parent^)
+        """
         if domain in allow_domains:
             return True
-        # Check wildcard whitelists
+        # Check parent domains and wildcard whitelists
         for parent in walk_parent_domains(domain):
+            # @@||parent^ whitelists parent AND all subdomains
+            if parent in allow_domains:
+                return True
+            # @@||*.parent^ whitelists all subdomains of parent
             if f"*.{parent}" in allow_domains:
                 return True
         return False
@@ -567,6 +576,8 @@ def is_whitelisted(domain: str) -> bool:
 # ============================================================================
 
 if __name__ == "__main__":
+    import sys
+
     if len(sys.argv) < 3:
         print("Usage: python -m scripts.compiler <input_dir> <output_file>")
         sys.exit(1)
@@ -591,6 +602,6 @@ def is_whitelisted(domain: str) -> bool:
     print(f"  ABP subdomains:      {stats.abp_subdomain_pruned:,}")
     print(f"  TLD wildcards:       {stats.tld_wildcard_pruned:,}")
     print(f"  Duplicates:          {stats.duplicate_pruned:,}")
-    print(f"  Whitelist conflicts:{stats.whitelist_conflict_pruned:,}")
+    print(f"  Whitelist conflicts: {stats.whitelist_conflict_pruned:,}")
     print(f"  Local hostnames:     {stats.local_hostname_pruned:,}")
 
diff --git a/scripts/downloader.py b/scripts/downloader.py
index 68ff9d7..a8ad0ca 100644
--- a/scripts/downloader.py
+++ b/scripts/downloader.py
@@ -8,7 +8,6 @@
 Usage:
     python -m scripts.downloader --sources sources.txt --outdir data/ --cache .cache
 """
-from __future__ import annotations
 
 import argparse
 import asyncio
@@ -24,9 +23,9 @@
 
 # Default configuration
-DEFAULT_TIMEOUT = 30
-DEFAULT_RETRIES = 3
-DEFAULT_CONCURRENCY = 8
+DEFAULT_TIMEOUT = 30      # seconds per request
+DEFAULT_RETRIES = 3       # attempts before giving up
+DEFAULT_CONCURRENCY = 8   # simultaneous connections
 
 # State file for ETag/Last-Modified tracking
 STATE_FILE = "state.json"
diff --git a/scripts/pipeline.py b/scripts/pipeline.py
index 658c386..06ee5cf 100644
--- a/scripts/pipeline.py
+++ b/scripts/pipeline.py
@@ -13,7 +13,6 @@
 3. Compile (compress formats, deduplicate, prune subdomains)
 4. Write merged output
 """
-from __future__ import annotations
 
 import sys
 import time