"""
Loads feeds linked from pages and collects information on the contained content.
"""

import logging
from time import mktime
from datetime import datetime

import feedparser

from checks.abstract_checker import AbstractChecker


class Checker(AbstractChecker):
    """Loads the feeds referenced by the crawled pages and analyses them."""

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)
        self.feeds = {}

    def depends_on_results(self):
        return ['html_head']

    def run(self):
        assert 'html_head' in self.previous_results

        # First gather the feed URLs from every page, de-duplicated.
        for url in self.config.urls:
            self.collect_feeds(url)

        # Then load and analyse each feed exactly once.
        for feed_url in self.feeds:
            self.feeds[feed_url] = self.analyse_feed(feed_url)

        return self.feeds

    def collect_feeds(self, url):
        """
        Collects the feed URLs referenced by one page.
        The assumption is that in most cases the URLs of a site will
        reference the same feeds, so each feed is only stored once.
        """
        head = self.previous_results['html_head'][url]
        assert 'link_rss_atom' in head
        assert isinstance(head['link_rss_atom'], list)

        result = {
            'feeds': [],
            'exception': None,
        }

        for feed_url in head['link_rss_atom']:
            result['feeds'].append(feed_url)
            if feed_url not in self.feeds:
                self.feeds[feed_url] = {}

        return result

    def analyse_feed(self, feed_url):
        result = {
            'exception': None,
            'title': None,
            'latest_entry': None,
            'first_entry': None,
            'average_interval': None,
            'num_entries': None,
        }

        logging.debug("Loading feed %s", feed_url)
        data = feedparser.parse(feed_url)

        if 'bozo_exception' in data:
            # Store the parser exception as a string to keep the result serializable.
            result['exception'] = str(data['bozo_exception'])

        # feedparser exposes the HTTP status code as an integer when the
        # feed was fetched from a URL.
        status = data.get('status')
        if status is not None and status not in (200, 301, 302):
            result['exception'] = 'Server responded with status %s' % status

        if 'feed' in data:
            result['title'] = data['feed'].get('title')
        if 'entries' in data:
            result['num_entries'] = len(data['entries'])
            result['latest_entry'] = self.find_latest_entry(data['entries'])
            result['first_entry'] = self.find_first_entry(data['entries'])
            if (result['num_entries'] > 1
                    and result['first_entry'] is not None
                    and result['latest_entry'] is not None
                    and result['first_entry'] < result['latest_entry']):
                timespan = result['latest_entry'] - result['first_entry']
                result['average_interval'] = round(timespan.total_seconds() / (result['num_entries'] - 1))

        return result

    def find_latest_entry(self, entries):
        max_date = None

        for entry in entries:
            published = entry.get('published_parsed')
            if published is None:
                # Entries without a parsable publication date are skipped.
                continue
            timestamp = mktime(published)
            if max_date is None or timestamp > max_date:
                max_date = timestamp

        if max_date is None:
            return None
        return datetime.fromtimestamp(max_date)

    def find_first_entry(self, entries):
        min_date = None

        for entry in entries:
            published = entry.get('published_parsed')
            if published is None:
                # Entries without a parsable publication date are skipped.
                continue
            timestamp = mktime(published)
            if min_date is None or timestamp < min_date:
                min_date = timestamp

        if min_date is None:
            return None
        return datetime.fromtimestamp(min_date)
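
For context, a minimal sketch of how this checker might be driven, assuming a config object that exposes urls and a previous_results dict produced by the html_head checker. The DummyConfig class, the example URLs and the module path in the import are hypothetical and only illustrate the expected input shape.

# Hypothetical driver for the checker above; the module path is an assumption.
from checks.load_feeds import Checker

class DummyConfig:
    # Stand-in for the real config object; only 'urls' is used here.
    urls = ['https://example.org/']

previous_results = {
    'html_head': {
        # One entry per page URL, with the feed links found in its <head>.
        'https://example.org/': {
            'link_rss_atom': ['https://example.org/feed.xml'],
        },
    },
}

checker = Checker(DummyConfig(), previous_results=previous_results)
results = checker.run()
# 'results' maps each feed URL to its title, entry count, first and latest
# entry dates, and the average publishing interval in seconds.
print(results)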