Skip to content

Commit 86a8679

Browse files
author
Jim Priest
committed
Add excluded urls option
* refactor from pull request comments to compile regex
* add exclude test
1 parent 8838f8c commit 86a8679

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

pylinkvalidator/models.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ def should_download(self, url_split):
173173
url = url_split.geturl()
174174

175175
for exclude_url in self.excluded_urls:
176-
if re.search(exclude_url, url):
176+
if exclude_url.search(url):
177177
return False
178178

179179
for ignored_prefix in self.ignored_prefixes:
@@ -215,7 +215,8 @@ def _parse_config(self):
215215
self.ignored_prefixes = self.options.ignored_prefixes.split(',')
216216

217217
if self.options.excluded_urls:
218-
self.excluded_urls = self.options.excluded_urls.split(',')
218+
self.excluded_urls = [re.compile(pattern) for pattern in self.options.excluded_urls.split(',')]
219+
219220

220221
if self.options.workers:
221222
self.worker_size = self.options.workers

pylinkvalidator/tests.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -331,6 +331,13 @@ def test_run_once(self):
331331
self.assertEqual(8, len(site.pages))
332332
self.assertEqual(0, len(site.error_pages))
333333

334+
def test_exclude(self):
335+
site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"])
336+
337+
# exclude /sub/ directory = 4 pages linked on the index
338+
self.assertEqual(4, len(site.pages))
339+
self.assertEqual(0, len(site.error_pages))
340+
334341
def test_depth_0(self):
335342
site = self._run_crawler_plain(
336343
ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")

0 commit comments

Comments
 (0)