Add MementoBear
MementoBear is a bear that detects links that have
not been archived yet.

Closes coala#1703
refeed committed Jun 5, 2017
1 parent 35ed4b6 commit f8ed7d0
Showing 5 changed files with 459 additions and 0 deletions.
1 change: 1 addition & 0 deletions bear-requirements.txt
@@ -13,6 +13,7 @@ html-linter~=0.3.0
isort~=4.2
language-check~=1.0
lxml==3.6.0
memento_client~=0.5.3
munkres3~=1.0
mypy-lang~=0.4.6
nbformat~=4.1
140 changes: 140 additions & 0 deletions bears/general/MementoBear.py
@@ -0,0 +1,140 @@
import requests

from bears.general.InvalidLinkBear import InvalidLinkBear

from coalib.settings.Setting import typed_dict
from coalib.settings.Setting import typed_list
from coalib.results.Result import Result
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY

from dependency_management.requirements.PipRequirement import PipRequirement

from memento_client import MementoClient

from urllib.parse import urlparse


class MementoBear(InvalidLinkBear):
    DEFAULT_TIMEOUT = 15
    LANGUAGES = {'All'}
    REQUIREMENTS = {PipRequirement('memento_client', '0.5.3')}
    AUTHORS = {'The coala developers'}
    AUTHORS_EMAILS = {'coala-devel@googlegroups.com'}
    LICENSE = 'AGPL-3.0'
    CAN_DETECT = {'Documentation'}
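    # Links into the Wayback Machine itself never need archiving again.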
    DEFAULT_IGNORE = [
        'http://web.archive.org/**',
    ]

    def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
                              link_ignore_list):
        for link, line_number, link_context in self.extract_links_from_file(
                file, link_ignore_regex, link_ignore_list):

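            # Pick a timeout for this link: an exact host match in
            # network_timeout wins, then the '*' wildcard, then the default.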
            host = urlparse(link).netloc
            code = InvalidLinkBear.get_status_code(
                link,
                network_timeout.get(host)
                if host in network_timeout
                else network_timeout.get('*')
                if '*' in network_timeout
                else self.DEFAULT_TIMEOUT)
            if code and 200 <= code < 400:
                yield line_number + 1, link, code, link_context

    @staticmethod
    def check_archive(mc, link):
        """
        Check whether the link is archived or not.

        :param mc:   A `memento_client.MementoClient` instance.
        :param link: The link (str) that will be checked.
        :return:     Boolean, `True` means the link has been archived.
        """
        try:
            mc.get_memento_info(link)['mementos']
        except KeyError:
            return False
        return True

    @staticmethod
    def get_redirect_urls(link):
        urls = []

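        # requests follows the whole redirect chain; resp.history holds
        # each intermediate response, whose .url is the address that
        # issued the redirect.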
        resp = requests.head(link, allow_redirects=True)
        for redirect in resp.history:
            urls.append(redirect.url)

        return urls

    def run(self, filename, file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
            link_ignore_regex: str=r'([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str)=DEFAULT_IGNORE,
            follow_redirects: bool=True):
"""
Find links in any text file and check if they are archived.
Link is considered valid if the link has been archived by any services
in memento_client.
This bear can automatically fix redirects.
Warning: This bear will make HEAD requests to all URLs mentioned in
your codebase, which can potentially be destructive. As an example,
this bear would naively just visit the URL from a line that goes like
`do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
all your data.
:param network_timeout: A dict mapping URLs and timeout to be
used for that URL. All the URLs that have
the same host as that of URLs provided
will be passed that timeout. It can also
contain a wildcard timeout entry with key
'*'. The timeout of all the websites not
in the dict will be the value of the key
'*'.
:param link_ignore_regex: A regex for urls to ignore.
:param link_ignore_list: Comma separated url globs to ignore.
:param follow_redirects: Set to true to check all redirect urls.
"""
        self._mc = MementoClient()

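        # Reduce the user-supplied URLs to their hosts (netloc) so that the
        # lookups in analyze_links_in_file can match by host; the '*'
        # wildcard entry is kept as-is.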
        network_timeout = {urlparse(url).netloc
                           if url != '*' else '*': timeout
                           for url, timeout in network_timeout.items()}

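        # DEFAULT_IGNORE is always appended, so archive links are never
        # flagged even when the user overrides link_ignore_list.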
        if link_ignore_list != self.DEFAULT_IGNORE:
            link_ignore_list.extend(self.DEFAULT_IGNORE)

        for (line_number, link,
             code, context) in self.analyze_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            status = MementoBear.check_archive(self._mc, link)
            if not status:
                yield Result.from_values(
                    self,
                    ('This link is not archived yet, visit '
                     'https://web.archive.org/save/%s to get it archived.'
                     % link),
                    file=filename,
                    line=line_number,
                    severity=RESULT_SEVERITY.INFO
                )

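            # Each URL on the redirect chain should be archived as well,
            # since the link in the file keeps pointing at the old address.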
            if follow_redirects and 300 <= code < 400:  # HTTP status 30x
                redirect_urls = MementoBear.get_redirect_urls(link)

                for url in redirect_urls:
                    status = MementoBear.check_archive(self._mc, url)
                    if not status:
                        yield Result.from_values(
                            self,
                            ('This link redirects to %s, which is not '
                             'archived yet; visit '
                             'https://web.archive.org/save/%s to get it '
                             'archived.'
                             % (url, url)),
                            file=filename,
                            line=line_number,
                            severity=RESULT_SEVERITY.INFO
                        )
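
For reference, a minimal sketch of how the archive check could be exercised
on its own, assuming coala-bears and memento_client are installed (the URL
here is illustrative, not part of this commit):

from memento_client import MementoClient

from bears.general.MementoBear import MementoBear

mc = MementoClient()
# Prints True if any Memento-compliant archive already holds a snapshot
# of the link.
print(MementoBear.check_archive(mc, 'https://example.com'))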