
Commit 3b9ead3

Load feeds and gather info (#103)
1 parent 3063a44 commit 3b9ead3

File tree

4 files changed: +181 -1 lines changed

Dockerfile (+1 -1)

@@ -6,7 +6,7 @@ RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories
     apk update && \
     apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
     pip3 install --upgrade pip && \
-    pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
+    pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 feedparser==5.2.1 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
     apk del python3-dev build-base

 ADD cli.py /
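
The only substantive change above is adding feedparser==5.2.1 to the pinned pip install list; the rest of the RUN chain is untouched context. A minimal sketch for verifying the dependency inside the built image (the script name and invocation are assumptions, not part of the commit):

    # check_feedparser.py -- hypothetical smoke test, e.g. run as
    # docker run <image> python3 /check_feedparser.py
    import feedparser

    # The Dockerfile pins feedparser==5.2.1, so the import should
    # resolve to exactly that version.
    assert feedparser.__version__ == '5.2.1', feedparser.__version__
    print('feedparser', feedparser.__version__, 'is available')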

checks/__init__.py (+2)

@@ -16,6 +16,7 @@
 from checks import http_and_https
 from checks import hyperlinks
 from checks import page_content
+from checks import load_feeds
 from checks import load_in_browser
 from checks import url_reachability
 from checks import url_canonicalization
@@ -45,6 +46,7 @@ def perform_checks(input_url):
         ('frameset', frameset),
         ('hyperlinks', hyperlinks),
         ('generator', generator),
+        ('load_feeds', load_feeds),
         ('load_in_browser', load_in_browser),
     ]

checks/load_feeds.py (new file, +104)

@@ -0,0 +1,104 @@
+"""
+Loads feeds linked from pages and collects information on the contained content
+"""
+
+import logging
+from time import mktime
+from datetime import datetime
+
+import feedparser
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+    def __init__(self, config, previous_results=None):
+        super().__init__(config, previous_results)
+        self.feeds = {}
+
+    def depends_on_results(self):
+        return ['html_head']
+
+    def run(self):
+        assert 'html_head' in self.previous_results
+
+        for url in self.config.urls:
+            self.collect_feeds(url)
+
+        for feed_url in self.feeds:
+            self.feeds[feed_url] = self.analyse_feed(feed_url)
+
+        return self.feeds
+
+    def collect_feeds(self, url):
+        """
+        This collects the feeds from all urls.
+        The assumption is that in most cases the urls will reference the same
+        feeds.
+        """
+        head = self.previous_results['html_head'][url]
+        assert 'link_rss_atom' in head
+        assert isinstance(head['link_rss_atom'], list)
+
+        for feed_url in head['link_rss_atom']:
+            if feed_url not in self.feeds:
+                self.feeds[feed_url] = {}
+
+        result = {
+            'feeds': [],
+            'exception': None,
+        }
+
+        return result
+
+
+    def analyse_feed(self, feed_url):
+        result = {
+            'exception': None,
+            'title': None,
+            'latest_entry': None,
+            'first_entry': None,
+            'average_interval': None,
+            'num_entries': None,
+        }
+
+        logging.debug("Loading feed %s" % feed_url)
+        data = feedparser.parse(feed_url)
+
+        if 'bozo_exception' in data:
+            result['exception'] = data['bozo_exception']
+
+        if data['headers'].get('status') not in ('200', '301', '302'):
+            result['exception'] = 'Server responded with status %s' % data['headers'].get('status')
+
+        if 'feed' in data:
+            result['title'] = data['feed'].get('title')
+        if 'entries' in data:
+            result['num_entries'] = len(data['entries'])
+            result['latest_entry'] = self.find_latest_entry(data['entries'])
+            result['first_entry'] = self.find_first_entry(data['entries'])
+            if result['num_entries'] > 1 and result['first_entry'] < result['latest_entry']:
+                result['average_interval'] = round((result['latest_entry'] - result['first_entry']).total_seconds() / (result['num_entries'] - 1))
+
+        return result
+
+
+    def find_latest_entry(self, entries):
+        max_date = None
+
+        for entry in entries:
+            timestamp = mktime(entry.get('published_parsed'))
+            if max_date is None or timestamp > max_date:
+                max_date = timestamp
+
+        return datetime.fromtimestamp(max_date)
+
+
+    def find_first_entry(self, entries):
+        min_date = None
+
+        for entry in entries:
+            timestamp = mktime(entry.get('published_parsed'))
+            if min_date is None or timestamp < min_date:
+                min_date = timestamp
+
+        return datetime.fromtimestamp(min_date)
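
The checker leans entirely on feedparser's result dict: bozo_exception flags a malformed feed, headers carries the HTTP status, and feed.title plus entries[].published_parsed supply the metadata. A standalone sketch of that contract (the URL is an example, not from the commit):

    # Minimal sketch of the feedparser fields Checker.analyse_feed reads.
    import feedparser

    data = feedparser.parse('http://example.com/feed.xml')  # example URL
    if data.get('bozo_exception'):  # present only when parsing hit a problem
        print('parse problem:', data['bozo_exception'])
    print('title:', data['feed'].get('title'))
    for entry in data['entries']:
        # published_parsed is a time.struct_time; mktime() converts it to a
        # local-time Unix timestamp in find_latest_entry/find_first_entry
        print(entry.get('published_parsed'))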

checks/load_feeds_test.py (new file, +74)

@@ -0,0 +1,74 @@
+import httpretty
+from httpretty import httprettified
+import unittest
+
+from checks import html_head, page_content
+from checks import load_feeds
+from checks.config import Config
+from datetime import datetime
+
+@httprettified
+class TestFeed(unittest.TestCase):
+
+    def test_feed_rss2(self):
+        """
+        Checks RSS 2.0
+        """
+
+        feed = """<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <title>Liftoff News</title>
+            <link>http://liftoff.msfc.nasa.gov/</link>
+            <description>Liftoff to Space Exploration.</description>
+            <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
+            <item>
+              <title>Star City</title>
+              <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
+              <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
+              <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
+            </item>
+            <item>
+              <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>
+              <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
+              <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
+            </item>
+          </channel>
+        </rss>
+        """
+
+        feed_url = 'http://example.com/feed.xml'
+        httpretty.register_uri(httpretty.GET, feed_url,
+                               body=feed,
+                               adding_headers={
+                                   "Content-type": "application/rss+xml",
+                               })
+
+        # mocking a previous result from some page
+        results = {
+            'html_head': {
+                'http://example.com/': {
+                    'link_rss_atom': ['http://example.com/feed.xml']
+                }
+            }
+        }
+        config = Config(urls=['http://example.com/'])
+        checker = load_feeds.Checker(config=config, previous_results=results)
+
+        result = checker.run()
+        print(result)
+
+        self.assertEqual(result, {
+            'http://example.com/feed.xml': {
+                'exception': None,
+                'title': 'Liftoff News',
+                'latest_entry': datetime(2003, 6, 3, 10, 39, 21),
+                'first_entry': datetime(2003, 5, 30, 12, 6, 42),
+                'average_interval': 340359,
+                'num_entries': 2,
+            }
+        })
+
+
+if __name__ == '__main__':
+    unittest.main()
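
The expected average_interval can be checked by hand: the two pubDates are 3 days, 22 hours, 32 minutes and 39 seconds apart, i.e. 340359 seconds, divided by num_entries - 1 = 1. Note that the asserted datetimes sit one hour ahead of the GMT pubDates, since mktime() and datetime.fromtimestamp() convert through local time; the assertion evidently assumes a UTC+1 test environment.

    # Worked check of the 'average_interval' asserted above.
    from datetime import datetime

    latest = datetime(2003, 6, 3, 9, 39, 21)   # "Star City" pubDate (GMT)
    first = datetime(2003, 5, 30, 11, 6, 42)   # eclipse item pubDate (GMT)
    gap = (latest - first).total_seconds()     # 340359.0 seconds
    print(round(gap / (2 - 1)))                # 340359, matching the test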
