17 changes: 17 additions & 0 deletions .github/dependabot.yml
@@ -4,12 +4,29 @@ updates:
directory: "/"
schedule:
interval: "weekly"
day: "monday"
commit-message:
prefix: "ci"
labels:
- "dependencies"
- "github-actions"
groups:
actions:
patterns:
- "*"

# Python dependencies
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
day: "monday"
commit-message:
prefix: "deps"
labels:
- "dependencies"
- "python"
groups:
python-deps:
patterns:
- "*"
42 changes: 32 additions & 10 deletions .github/workflows/update.yml
@@ -12,6 +12,7 @@ env:

permissions:
contents: write
actions: write

concurrency:
group: blocklist-update
@@ -24,28 +25,29 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 1

- name: Setup Python
uses: actions/setup-python@v6
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: "3.12"
python-version: "3.14"
cache: pip

- name: Install Dependencies
run: pip install -q .

- name: Get Cache Key
id: cache-key
run: echo "week=$(date -u +%Y-%V)" >> $GITHUB_OUTPUT
run: echo "date=$(date -u +'%Y-%m-%d_%H%M')" >> $GITHUB_OUTPUT

- name: Restore Cache
uses: actions/cache@v5
id: cache-restore
uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: .cache
key: blocklists-${{ steps.cache-key.outputs.week }}
key: blocklists-${{ steps.cache-key.outputs.date }}
restore-keys: blocklists-
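# Keys are unique per run, so restore-keys falls back to the most recently saved blocklists-* cache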

- name: Fetch Sources
@@ -87,7 +89,7 @@ jobs:
echo "Validation passed: $RULES rules"

- name: Publish Release
uses: softprops/action-gh-release@v2
uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2.5.0
with:
tag_name: latest
name: Merged Blocklist
@@ -110,9 +112,28 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Cleanup Cache
if: always()
run: find .cache -mtime +7 -delete 2>/dev/null || true
- name: Save Cache
uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
if: always() && steps.fetch.outcome == 'success'
with:
path: .cache
key: blocklists-${{ steps.cache-key.outputs.date }}

- name: Cleanup Old Caches
if: success()
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
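# Keep the 3 newest blocklists-* caches; jq '.[3:]' selects the older entries for deletion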
gh cache list --repo ${{ github.repository }} \
--key blocklists- \
--sort created_at \
--order desc \
--limit 100 \
--json key \
--jq '.[3:] | .[].key' | while read -r key; do
echo "Deleting cache: $key"
gh cache delete "$key" --repo ${{ github.repository }} || true
done

- name: Summary
if: always()
@@ -127,4 +148,5 @@ jobs:
| Sources | ${{ steps.fetch.outputs.count || 'N/A' }} |
| Fetch Time | ${{ steps.fetch.outputs.time || '?' }} |
| Compile Time | ${{ steps.compile.outputs.time || '?' }} |
| Cache | ${{ steps.cache-restore.outputs.cache-matched-key || 'New' }} |
EOF
11 changes: 5 additions & 6 deletions pyproject.toml
@@ -2,17 +2,16 @@
name = "blocklist-merger"
version = "1.0.0"
description = "AdGuard Home blocklist compiler with intelligent deduplication"
requires-python = ">=3.11"
requires-python = ">=3.14"
dependencies = [
"requests>=2.31.0",
"tldextract>=5.1.0",
"aiohttp>=3.9.0",
"aiofiles>=23.2.0",
"tldextract>=5.3.0",
"aiohttp>=3.13.0",
"aiofiles>=25.1.0",
]

[project.optional-dependencies]
dev = [
"pytest>=7.4.0",
"pytest>=9.0.0",
]

[project.scripts]
21 changes: 2 additions & 19 deletions scripts/cleaner.py
@@ -32,8 +32,6 @@
4. Keep hosts, plain domains, and ABP rules with supported modifiers
"""

from __future__ import annotations

import re
from typing import NamedTuple

@@ -42,17 +40,6 @@
# MODIFIER DEFINITIONS (based on official AdGuard DNS filtering syntax docs)
# ============================================================================

# Modifiers supported by AdGuard Home DNS filtering
SUPPORTED_MODIFIERS = frozenset({
"important", # Increases rule priority
"badfilter", # Disables matching rules
"dnsrewrite", # Rewrites DNS responses
"denyallow", # Excludes domains from blocking
"client", # Restricts to specific clients
"dnstype", # Filters by DNS record type
"ctag", # Client tags (keeping for completeness, though rare in public lists)
})

# Modifiers that are browser-only and NOT supported by AGH
# If a rule contains ANY of these, the ENTIRE RULE should be discarded
UNSUPPORTED_MODIFIERS = frozenset({
@@ -84,7 +71,6 @@
"app",
# Method restrictions (browser-only)
"method",
# Any other modifiers not in supported list
})


@@ -106,17 +92,13 @@
# Pattern to detect if a line is likely a comment
COMMENT_PATTERN = re.compile(r"^\s*[#!]")

# Pattern to extract trailing inline comment (be careful not to match URLs)
# Only match # comments that are clearly at end of rule, not in URLs
# Trailing inline comment: match "# comment" preceded by whitespace
TRAILING_COMMENT_PATTERN = re.compile(r"\s+#\s+.*$")

# Pattern to extract modifier section from ABP rule
# Matches: $modifier1,modifier2,... at end of rule
MODIFIER_PATTERN = re.compile(r"\$([^$]+)$")
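# e.g. "||ads.example.com^$important,dnstype=AAAA" -> captures "important,dnstype=AAAA" (illustrative)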

# Pattern to validate basic ABP format
ABP_RULE_PATTERN = re.compile(r"^@@?\|\|[^\s|^]+\^")


# ============================================================================
# DATA STRUCTURES
@@ -260,6 +242,7 @@ def clean_line(line: str) -> tuple[CleanResult, bool]:
return CleanResult(None, True, "unsupported_modifier"), False

# Handle rules with just $ and modifiers (no pattern)
# e.g., "$script,third-party" without a domain prefix
if line.startswith("$") or ("|" not in line and "$" in line):
modifiers = extract_modifiers(line)
if modifiers and has_unsupported_modifiers(modifiers):
71 changes: 41 additions & 30 deletions scripts/compiler.py
@@ -39,10 +39,7 @@
See docs/LOGIC.md for detailed examples of each pruning rule.
"""

from __future__ import annotations

import re
import sys
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
@@ -56,12 +53,11 @@
# REGEX PATTERNS
# ============================================================================

# ABP domain pattern: ||domain^ or ||*.domain^
# Also matches IP addresses like ||100.48.203.212^
# ABP pattern: ||[*.]domain^ (including IP addresses)
ABP_DOMAIN_PATTERN = re.compile(
r"^(@@)?\|\|" # Start with || or @@|| (capture @@ for exception check)
r"(\*\.)?" # Optional *. for wildcard
r"([^^\$|*\s]+)" # Domain/IP (anything except ^$|* or whitespace)
r"^(@@)?\|\|" # Start: || or @@|| (group 1: exception marker)
r"(\*\.)?" # Optional *. wildcard (group 2)
r"([^^\$|*\s]+)" # Domain/IP (group 3)
r"\^" # Separator
)
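# Illustrative matches (groups 1-3):
#   "||example.com^"         -> (None, None, "example.com")
#   "@@||*.ads.example.net^" -> ("@@", "*.", "ads.example.net")
#   "||10.0.0.1^$important"  -> (None, None, "10.0.0.1")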

@@ -80,9 +76,10 @@
r"(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$"
)

# Local/blocking IPs in hosts format
# Local/blocking IPs recognized in hosts format
BLOCKING_IPS = frozenset({
"0.0.0.0", "127.0.0.1", "::1", "::0", "::","0:0:0:0:0:0:0:0", "0:0:0:0:0:0:0:1",
"0.0.0.0", "127.0.0.1", "::1", "::0", "::",
"0:0:0:0:0:0:0:0", "0:0:0:0:0:0:0:1",
})

# Local hostnames to skip
@@ -92,6 +89,12 @@
"ip6-mcastprefix", "ip6-allnodes", "ip6-allrouters", "ip6-allhosts",
})

# Modifiers with special behavior that should never be pruned
SPECIAL_BEHAVIOR_MODIFIERS = frozenset({"badfilter", "dnsrewrite", "denyallow"})

# Modifiers that restrict who is blocked
CLIENT_RESTRICTION_MODIFIERS = frozenset({"client", "ctag"})


# ============================================================================
# DATA STRUCTURES
@@ -136,7 +139,7 @@ def extract_abp_info(rule: str) -> tuple[str | None, frozenset, bool, bool]:
if not match:
return None, frozenset(), False, False

# Group 1: @@ (exception marker), Group 2: *. (wildcard), Group 3: domain
# Groups: (1) @@ exception, (2) *. wildcard, (3) domain
is_exception = match.group(1) is not None
is_wildcard = match.group(2) is not None
domain = normalize_domain(match.group(3))
@@ -238,7 +241,7 @@ def walk_parent_domains(domain: str) -> tuple[str, ...]:
return tuple(parents)


def should_prune_by_modifiers(child_mods: frozenset, parent_mods: frozenset) -> bool:
def should_prune_by_modifiers(child_mods: frozenset[str], parent_mods: frozenset[str]) -> bool:
"""
Determine if a child rule is redundant given the parent's modifiers.

@@ -260,7 +263,7 @@ def should_prune_by_modifiers(child_mods: frozenset, parent_mods: frozenset) ->
return False

# Special behavior modifiers are never redundant
if child_mods & {"badfilter", "dnsrewrite", "denyallow"}:
if child_mods & SPECIAL_BEHAVIOR_MODIFIERS:
return False

# Handle $dnstype: parent blocking ALL types covers child blocking specific type,
@@ -273,7 +276,7 @@ def should_prune_by_modifiers(child_mods: frozenset, parent_mods: frozenset) ->
return False # Child blocks ALL types, parent only blocks one type

# $client/$ctag restrict WHO is blocked. Unrestricted child is more general.
if (parent_mods & {"client", "ctag"}) and not (child_mods & {"client", "ctag"}):
if (parent_mods & CLIENT_RESTRICTION_MODIFIERS) and not (child_mods & CLIENT_RESTRICTION_MODIFIERS):
return False

return True
@@ -303,9 +306,8 @@ def compile_rules(
abp_rules: dict[str, tuple[str, frozenset, bool]] = {}
abp_wildcards: dict[str, tuple[str, frozenset]] = {} # TLD wildcards: tld -> rule

# Exception rules
allow_rules: list[str] = []
allow_domains: set[str] = set() # Domains covered by @@rules
# Whitelisted domains (from @@rules)
allow_domains: set[str] = set()

# Other rules (regex, partial matches, etc.)
other_rules: list[str] = []
Expand All @@ -327,7 +329,6 @@ def compile_rules(
continue

if is_exception:
allow_rules.append(line)
# Track whitelisted domains for conflict removal
if is_wildcard:
# @@||*.example.com^ - covers all subdomains
@@ -361,9 +362,9 @@ def compile_rules(
stats.duplicate_pruned += 1
continue

# Other exception rules
# Other exception rules (non-ABP format like /regex/)
if line.startswith("@@"):
allow_rules.append(line)
# Can't extract domain from non-ABP exceptions, skip for now
continue

# =====================================================================
Expand Down Expand Up @@ -415,13 +416,11 @@ def compile_rules(
# =========================================================================

# All ABP blocking domains (for subdomain checks)
abp_blocking_domains: set[str] = set()
for domain, (rule, mods, is_wc) in abp_rules.items():
if not is_wc: # Don't add wildcard keys like "*.example.com"
abp_blocking_domains.add(domain)
else:
# For wildcards, add the base domain for coverage checking
abp_blocking_domains.add(domain[2:]) # Remove "*."
# Use set comprehension for efficiency
abp_blocking_domains: set[str] = {
domain if not is_wc else domain[2:] # Remove "*." prefix for wildcards
for domain, (_, _, is_wc) in abp_rules.items()
}

# TLD wildcards
tld_wildcards: set[str] = set(abp_wildcards.keys())
Expand All @@ -445,11 +444,21 @@ def is_covered_by_abp(domain: str) -> bool:
return False

def is_whitelisted(domain: str) -> bool:
"""Check if domain is whitelisted."""
"""Check if domain is whitelisted.

A domain is whitelisted if:
1. It's directly in allow_domains (@@||domain^)
2. Any parent domain is whitelisted (@@||parent^ covers subdomains)
3. A wildcard whitelist covers it (@@||*.parent^)
"""
if domain in allow_domains:
return True
# Check wildcard whitelists
# Check parent domains and wildcard whitelists
for parent in walk_parent_domains(domain):
# @@||parent^ whitelists parent AND all subdomains
if parent in allow_domains:
return True
# @@||*.parent^ whitelists all subdomains of parent
if f"*.{parent}" in allow_domains:
return True
return False
@@ -567,6 +576,8 @@ def is_whitelisted(domain: str) -> bool:
# ============================================================================

if __name__ == "__main__":
import sys

if len(sys.argv) < 3:
print("Usage: python -m scripts.compiler <input_file> <output_file>")
sys.exit(1)
@@ -591,6 +602,6 @@ def is_whitelisted(domain: str) -> bool:
print(f" ABP subdomains: {stats.abp_subdomain_pruned:,}")
print(f" TLD wildcards: {stats.tld_wildcard_pruned:,}")
print(f" Duplicates: {stats.duplicate_pruned:,}")
print(f" Whitelist conflicts:{stats.whitelist_conflict_pruned:,}")
print(f" Whitelist conflicts: {stats.whitelist_conflict_pruned:,}")
print(f" Local hostnames: {stats.local_hostname_pruned:,}")
