Skip to content

Commit e6065d2

Browse files
committed
Add a linter for the PSL and use it on Travis CI
1 parent 5bece2d commit e6065d2

29 files changed

+676
-16
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
linter/log

.travis.yml

+33-16
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,38 @@
11
language: c
2-
compiler:
3-
- gcc
4-
# Change this to your needs
2+
compiler: gcc
3+
addons:
4+
apt:
5+
packages:
6+
- python3
7+
- autoconf
8+
- automake
9+
- autopoint
10+
- libtool
11+
- gettext
12+
- libidn11
13+
- libidn11-dev
14+
- libidn2-0
15+
- libidn2-0-dev
16+
- libicu48
17+
- libicu-dev
18+
- libunistring0
19+
- libunistring-dev
20+
521
script:
6-
- DIR=`pwd`
7-
- git clone https://github.com/rockdaboot/libpsl
8-
- cd libpsl
9-
- echo "EXTRA_DIST =" >gtk-doc.make
10-
- echo "CLEANFILES =" >>gtk-doc.make
11-
- autoreconf --install --force --symlink
12-
- OPTIONS="--with-psl-file=$DIR/public_suffix_list.dat --with-psl-testfile=$DIR/tests/test_psl.txt"
22+
- DIR=`pwd`
23+
- cd linter
24+
- ./pslint_selftest.sh
25+
- ./pslint.py ../public_suffix_list.dat
26+
- cd $DIR
27+
- git clone --depth=1 --branch newfmt https://github.com/rockdaboot/libpsl
28+
- cd libpsl
29+
- echo "EXTRA_DIST =" >gtk-doc.make
30+
- echo "CLEANFILES =" >>gtk-doc.make
31+
- autoreconf --install --force --symlink
32+
- OPTIONS="--with-psl-file=$DIR/public_suffix_list.dat --with-psl-testfile=$DIR/tests/tests.txt"
1333
# Test PSL data with libicu (IDNA2008 UTS#46)
14-
- ./configure -C --enable-runtime=libicu --enable-builtin=libicu $OPTIONS && make clean && make check -j4
34+
- ./configure -C --enable-runtime=libicu --enable-builtin=libicu $OPTIONS && make clean && make check -j4
1535
# TEST PSL data with libidn2 (IDNA2008)
16-
- ./configure -C --enable-runtime=libidn2 --enable-builtin=libidn2 $OPTIONS && make clean && make check -j4
36+
# - ./configure -C --enable-runtime=libidn2 --enable-builtin=libidn2 $OPTIONS && make clean && make check -j4
1737
# TEST PSL data with libidn (IDNA2003)
18-
- ./configure -C --enable-runtime=libidn --enable-builtin=libidn $OPTIONS && make clean && make check -j4
19-
before_install:
20-
- sudo apt-get -qq update
21-
- sudo apt-get -q install autoconf automake autopoint libtool gettext libidn11 libidn11-dev libidn2-0 libidn2-0-dev libicu48 libicu-dev libunistring0 libunistring-dev
38+
# - ./configure -C --enable-runtime=libidn --enable-builtin=libidn $OPTIONS && make clean && make check -j4

linter/README.md

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
This directory contains a linter for the Public Suffix List.
2+
3+
Before you commit any changes to the PSL, please use the
4+
linter to check the syntax.
5+
6+
Usage
7+
=====
8+
9+
(from the repo's main directory)
10+
11+
$ linter/pslint.py public_suffix_list.dat
12+
13+
$? is set to 0 on success, else it is set to 1.
14+
15+
16+
Selftest
17+
========
18+
19+
Every change to pslint.py should be followed by a self-test run.
20+
21+
```
22+
$ cd linter
23+
$ ./pslint_selftest.sh
24+
test_allowedchars: OK
25+
test_dots: OK
26+
test_duplicate: OK
27+
test_exception: OK
28+
test_punycode: OK
29+
test_section1: OK
30+
test_section2: OK
31+
test_section3: OK
32+
test_section4: OK
33+
test_spaces: OK
34+
test_wildcard: OK
35+
```

linter/pslint.py

+271
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-#
3+
#
4+
# PSL linter written in python
5+
#
6+
# Copyright 2016 Tim Rühsen (tim dot ruehsen at gmx dot de). All rights reserved.
7+
#
8+
# Permission is hereby granted, free of charge, to any person obtaining a
9+
# copy of this software and associated documentation files (the "Software"),
10+
# to deal in the Software without restriction, including without limitation
11+
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
12+
# and/or sell copies of the Software, and to permit persons to whom the
13+
# Software is furnished to do so, subject to the following conditions:
14+
#
15+
# The above copyright notice and this permission notice shall be included in
16+
# all copies or substantial portions of the Software.
17+
#
18+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24+
# DEALINGS IN THE SOFTWARE.
25+
26+
import sys
27+
import codecs
28+
29+
# Shared linter state. warning(), error(), check_order() and lint_psl()
# read and update these names through `global` declarations.

# Counters bumped by warning() and error(); main() derives the exit status
# from `errors`.
warnings = 0
errors = 0

# Number of the input line being checked; lint_psl() increments it per line
# and warning()/error() prefix every message with it.
nline = 0

# Text of the offending line that warning()/error() append to their message
# (omitted from the message when empty or None).
orig_line = ""

# Module-level default only: lint_psl() works on its own local `line`.
line = ""

# When set, the next check_order() call returns without checking and resets
# the flag.
skip_order_check = False
35+
36+
def warning(msg):
    """Print a warning for the current input line and count it.

    The message is prefixed with the module-level line number `nline` and,
    when the module-level `orig_line` is non-empty, followed by that line
    in single quotes.

    msg -- text describing the problem
    """
    global warnings

    quoted = ""
    if orig_line:
        quoted = ": '" + orig_line + "'"

    print('%d: warning: %s%s' % (nline, msg, quoted))
    warnings += 1
40+
41+
def error(msg):
    """Print an error for the current input line and count it.

    The message is prefixed with the module-level line number `nline` and,
    when the module-level `orig_line` is non-empty, followed by that line
    in single quotes.

    msg -- text describing the problem
    """
    global errors

    quoted = ""
    if orig_line:
        quoted = ": '" + orig_line + "'"

    print('%d: error: %s%s' % (nline, msg, quoted))
    errors += 1
    # Left disabled as in the original; enabling it would make the next
    # check_order() call return without checking.
    # skip_order_check = True
46+
47+
def print_psl(list):
    """Print each domain of a group in its normal, dot-separated form.

    list -- iterable of domains, each a sequence of labels stored in
            reversed order (TLD first); one domain is printed per line.
            (The parameter name shadows the builtin; it is kept so the
            signature stays unchanged.)
    """
    for reversed_labels in list:
        # Labels are stored TLD-first, so flip them back before joining.
        forward = map(str, reversed(reversed_labels))
        print(".".join(forward))
50+
51+
def psl_key(s):
    """Return the sort rank of a label based on its first character.

    Wildcards ('*') rank 0, exceptions ('!') rank 1 and everything else,
    including the empty string, ranks 2.

    s -- label (or single character) to classify
    """
    # startswith() instead of s[0]: an empty label (e.g. from a rule with a
    # leading dot) must not raise IndexError.
    if s.startswith('*'):
        return 0
    if s.startswith('!'):
        return 1
    return 2
57+
58+
def check_order(group):
    """Warn if a group of rules is not in the expected order, then empty it.

    group -- list of rules, each a list of labels in reversed order (TLD
             first). The list is always cleared in place before returning,
             so the caller can reuse it for the next group.

    The check is skipped for groups with fewer than two rules, or when the
    module-level `skip_order_check` flag is set; in both cases the flag is
    reset to False.
    """
    global skip_order_check

    try:
        if skip_order_check or len(group) < 2:
            skip_order_check = False
            return

        # every rule of a group is expected to share the same TLD
        first_tld = group[0][0]
        if not all(labels[0] == first_tld for labels in group):
            warning('Domain group TLD is not consistent')

        # order by number of labels, then wildcard/exception/plain rank of
        # the leftmost label, then label-by-label (labels are reversed)
        def sort_key(labels):
            return (len(labels), psl_key(labels[-1][0]), labels)

        expected = list(group)
        expected.sort(key=sort_key)

        if expected != group:
            warning('Incorrectly sorted group of domains')
            print(" " + str(group))
            print(" " + str(expected))
            print("Correct sorting would be:")
            print_psl(expected)

    finally:
        # clear the caller's list in place, whatever happened above
        del group[:]
83+
84+
85+
def lint_psl(infile):
    """Parse a Public Suffix List and report syntax problems.

    Findings are reported through warning() and error(), which print them
    and update the module-level counters. Nothing is returned.

    infile -- iterable yielding the lines of the list as text (an open file
              object works); it is read completely before checking starts.
    """
    global orig_line, nline

    PSL_FLAG_EXCEPTION = (1<<0)
    PSL_FLAG_WILDCARD = (1<<1)
    PSL_FLAG_ICANN = (1<<2)  # entry of ICANN section
    PSL_FLAG_PRIVATE = (1<<3)  # entry of PRIVATE section
    PSL_FLAG_PLAIN = (1<<4)  # just used for PSL syntax checking

    # Both dicts are keyed by the rule text with any '*.' / '!' prefix removed.
    line2number = {}  # rule -> line number of its latest occurrence
    line2flag = {}  # rule -> flags of its latest occurrence
    # Rules collected for the ordering check. That check is currently
    # disabled (the check_order() calls below are commented out), so the
    # list is only filled, never examined.
    group = []
    section = 0  # 0 outside of any section, else PSL_FLAG_ICANN / PSL_FLAG_PRIVATE
    icann_sections = 0
    private_sections = 0

    lines = [line.strip('\n') for line in infile]

    for line in lines:
        nline += 1

        # check for leading/trailing whitespace
        stripped = line.strip()
        if stripped != line:
            # make tabs and carriage returns visible in the message
            line = line.replace('\t', '\\t')
            line = line.replace('\r', '^M')
            orig_line = line
            warning('Leading/Trailing whitespace')
        orig_line = line
        line = stripped

        # empty line (end of sorted domain group)
        if not line:
            # check_order(group)
            continue

        # check for section begin/end
        if line[0:2] == "//":
            # check_order(group)

            if section == 0:
                if line == "// ===BEGIN ICANN DOMAINS===":
                    section = PSL_FLAG_ICANN
                    icann_sections += 1
                elif line == "// ===BEGIN PRIVATE DOMAINS===":
                    section = PSL_FLAG_PRIVATE
                    private_sections += 1
                elif line[3:11] == "===BEGIN":
                    error('Unexpected begin of unknown section')
                elif line[3:9] == "===END":
                    error('End of section without previous begin')
            elif section == PSL_FLAG_ICANN:
                if line == "// ===END ICANN DOMAINS===":
                    section = 0
                elif line[3:11] == "===BEGIN":
                    error('Unexpected begin of section: ')
                elif line[3:9] == "===END":
                    error('Unexpected end of section')
            elif section == PSL_FLAG_PRIVATE:
                if line == "// ===END PRIVATE DOMAINS===":
                    section = 0
                elif line[3:11] == "===BEGIN":
                    error('Unexpected begin of section')
                elif line[3:9] == "===END":
                    error('Unexpected end of section')

            continue  # processing of comments ends here

        # No rule must be outside of a section
        if section == 0:
            error('Rule outside of section')

        group.append(list(reversed(line.split('.'))))

        # decode UTF-8 input into unicode, needed only for python 2.x;
        # on python 3 the encode() call only serves to detect undecodable
        # input bytes
        try:
            if sys.version_info[0] < 3:
                line = line.decode('utf-8')
            else:
                line.encode('utf-8')
        except (UnicodeDecodeError, UnicodeEncodeError):
            orig_line = None
            error('Invalid UTF-8 character')
            continue

        # each rule must be lowercase (or more exactly: not uppercase and not titlecase)
        if line != line.lower():
            error('Rule must be lowercase')

        # strip a single leading wildcard
        flags = section
        if line[0:2] == '*.':
            flags |= PSL_FLAG_WILDCARD
            line = line[2:]

        # startswith() instead of line[0]: a rule consisting only of '*.'
        # is empty at this point and must be reported below, not crash here.
        if line.startswith('!'):
            flags |= PSL_FLAG_EXCEPTION
            line = line[1:]
        else:
            flags |= PSL_FLAG_PLAIN

        # wildcard and exception must not combine
        if flags & PSL_FLAG_WILDCARD and flags & PSL_FLAG_EXCEPTION:
            error('Combination of wildcard and exception')
            continue

        labels = line.split('.')

        # an exception rule needs an earlier wildcard rule for its parent domain
        if flags & PSL_FLAG_EXCEPTION and len(labels) > 1:
            domain = ".".join(labels[1:])
            if domain not in line2flag or not line2flag[domain] & PSL_FLAG_WILDCARD:
                error('Exception without previous wildcard')

        for label in labels:
            if not label:
                error('Leading/trailing or multiple dot')
                continue

            if label[0:4] == 'xn--':
                error('Punycode found')
                continue

            if '--' in label:
                error('Double minus found')
                continue

            # allowed are a-z,0-9,- and unicode >= 128 (maybe that can be finetuned a bit !?)
            # reported at most once per label
            if any(not c.isalnum() and c != '-' and ord(c) < 128 for c in label):
                error('Illegal character')

        if line in line2flag:
            # Found existing entry:
            #   Combination of exception and plain rule is contradictory
            #     !foo.bar + foo.bar
            #   Doublette, since *.foo.bar implies foo.bar:
            #     foo.bar + *.foo.bar
            #   Allowed:
            #     !foo.bar + *.foo.bar
            # NOTE(review): the check below reports every repeated key,
            # which seems to include the "Allowed" combination - confirm.
            error('Found doublette/ambiguity (previous line was %d)' % line2number[line])

        line2number[line] = nline
        line2flag[line] = flags

    # the remaining findings concern the whole file, not a single line
    orig_line = None

    if section == PSL_FLAG_ICANN:
        error('ICANN section not closed')
    elif section == PSL_FLAG_PRIVATE:
        error('PRIVATE section not closed')

    if icann_sections < 1:
        warning('No ICANN section found')
    elif icann_sections > 1:
        warning('%d ICANN sections found' % icann_sections)

    if private_sections < 1:
        warning('No PRIVATE section found')
    elif private_sections > 1:
        warning('%d PRIVATE sections found' % private_sections)
251+
252+
def usage():
    """Print the command-line synopsis and terminate with exit status 1.

    Raises SystemExit(1); control never returns to the caller.
    """
    print('usage: %s PSLfile' % sys.argv[0])
    print('or %s - # To read PSL from STDIN' % sys.argv[0])
    # sys.exit() is always available, whereas the bare exit() helper is
    # installed by the site module and is missing e.g. under "python -S".
    sys.exit(1)
257+
258+
259+
def main():
    """Check the syntax of the PSL file named by the last command-line argument.

    An argument of '-' reads the list from standard input. Without any
    argument the usage text is printed and the process exits with status 1.

    Returns 1 if lint_psl() reported at least one error, else 0; the value
    is meant to be passed to sys.exit().
    """
    if len(sys.argv) < 2:
        usage()

    source = sys.argv[-1]
    if source == '-':
        # Decode stdin exactly like a file argument (UTF-8, undecodable
        # bytes kept as surrogate escapes) instead of with the locale's
        # codec, so invalid bytes are reported by the linter rather than
        # raising UnicodeDecodeError. closefd=False leaves the descriptor
        # itself open.
        try:
            infile = open(sys.stdin.fileno(), 'r', encoding='utf-8',
                          errors="surrogateescape", closefd=False)
        except (AttributeError, OSError, ValueError):
            # stdin was replaced by an object without a real descriptor
            infile = sys.stdin
    else:
        infile = open(source, 'r', encoding='utf-8', errors="surrogateescape")

    with infile:
        lint_psl(infile)

    return 1 if errors else 0


if __name__ == '__main__':
    sys.exit(main())

linter/pslint_selftest.sh

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/bin/sh
#
# Self-test for pslint.py.
#
# Runs pslint.py on every *.input file in this directory and compares the
# output with the matching *.expected file. Prints "<name>: OK" or
# "<name>: FAILED" (followed by the diff) per test and exits with 1 if any
# test failed, else 0. Logs of failed tests are kept in ./log.

# The relative paths below assume this directory, so do not depend on the
# caller's working directory.
cd "$(dirname "$0")" || exit 1

rc=0
rm -rf log
mkdir -p log

# add CR if missing, it won't possibly survive git
# (note: "sed -i" and "\r" in the replacement rely on GNU sed)
sed -i -e 's/^e.example.com$/e.example.com\r/g' test_spaces.input

# Iterate over the glob directly instead of parsing "ls" output, and strip
# only the ".input" suffix so names containing dots keep working.
for input in *.input; do
  # without any match the pattern stays literal - skip it
  [ -e "$input" ] || continue
  name=${input%.input}

  # printf instead of "echo -n", whose behaviour is not portable in sh
  printf '%s: ' "$name"
  ./pslint.py "$input" >"log/$name.log" 2>&1
  if diff -u "$name.expected" "log/$name.log" >"log/$name.diff"; then
    echo OK
    rm "log/$name.diff" "log/$name.log"
  else
    echo FAILED
    cat "log/$name.diff"
    rc=1
  fi
done

# remove CR, to not appear as changed to git
sed -i -e 's/^e.example.com\r$/e.example.com/g' test_spaces.input

# keep the log directory only when there is something to look at
if [ "$rc" -eq 0 ]; then
  rmdir log
fi

exit "$rc"

0 commit comments

Comments
 (0)