diff --git a/.gitignore b/.gitignore
index bd7d7c8..f92a2cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 *.pyc
+*.json
+*.xml
 pylinkvalidator.egg-info/
 dist/
 build/
+.idea/
diff --git a/README.rst b/README.rst
index c7a9730..569bce2 100644
--- a/README.rst
+++ b/README.rst
@@ -171,7 +171,7 @@ usage examples.
 These options change the output of the crawler.

   -f FORMAT, --format=FORMAT
-                        Format of the report: plain (default)
+                        Format of the report: plain (default), json, junit
   -o OUTPUT, --output=OUTPUT
                         Path of the file where the report will be printed.
   -W WHEN, --when=WHEN
diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py
index 73a0666..531f0f8 100644
--- a/pylinkvalidator/crawler.py
+++ b/pylinkvalidator/crawler.py
@@ -525,6 +525,7 @@ def _get_links(self, elements, attribute, base_url_split,
         for element in elements:
             if attribute in element.attrs:
                 url = element[attribute]
+                target = element.attrs.get('target', None)

                 if not self.worker_config.strict_mode:
                     url = url.strip()
@@ -540,7 +541,7 @@
                 link = Link(
                     type=unicode(element.name), url_split=abs_url_split,
                     original_url_split=original_url_split,
-                    source_str=unicode(element))
+                    source_str=unicode(element), target=target)
                 links.append(link)

         return links
@@ -658,7 +659,8 @@ def process_links(self, page_crawl):
                 continue

             page_status = self.page_statuses.get(url_split, None)
-            page_source = PageSource(source_url_split, link.source_str)
+            page_source = PageSource(
+                source_url_split, link.source_str, link.target)

             if not page_status:
                 # We never encountered this url before
diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py
index 228ef11..e10fe03 100644
--- a/pylinkvalidator/models.py
+++ b/pylinkvalidator/models.py
@@ -81,6 +81,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
 FORMAT_PLAIN = "plain"
 FORMAT_HTML = "html"
 FORMAT_JSON = "json"
+FORMAT_JUNIT = "junit"


 WHEN_ALWAYS = "always"
@@ -133,7 +134,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):

 Link = namedtuple_with_defaults(
     "Link",
-    ["type", "url_split", "original_url_split", "source_str"])
+    ["type", "url_split", "original_url_split", "source_str", "target"])


 PageCrawl = namedtuple_with_defaults(
@@ -149,7 +150,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):


 PageSource = namedtuple_with_defaults(
-    "PageSource", ["origin", "origin_str"])
+    "PageSource", ["origin", "origin_str", "target"])


 ContentCheck = namedtuple_with_defaults(
@@ -582,8 +583,9 @@ def _build_parser(self):

         output_group.add_option(
             "-f", "--format", dest="format", action="store",
-            default=FORMAT_PLAIN, choices=[FORMAT_PLAIN],
-            help="Format of the report: plain")
+            default=FORMAT_PLAIN,
+            choices=[FORMAT_PLAIN, FORMAT_JSON, FORMAT_JUNIT],
+            help="Format of the report: plain (default), json, junit")

         output_group.add_option(
             "-o", "--output", dest="output", action="store", default=None,
diff --git a/pylinkvalidator/reporter.py b/pylinkvalidator/reporter.py
index 19f9715..9878c68 100644
--- a/pylinkvalidator/reporter.py
+++ b/pylinkvalidator/reporter.py
@@ -4,15 +4,23 @@
 from __future__ import unicode_literals, absolute_import, print_function

 import codecs
+import json
 import re
 import smtplib
 import sys

 from email.mime.text import MIMEText

+from junit_xml import TestSuite, TestCase
+
 from pylinkvalidator.compat import StringIO
 from pylinkvalidator.models import (
-    REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_PLAIN)
+    FORMAT_JSON,
+    FORMAT_JUNIT,
+    FORMAT_PLAIN,
+    REPORT_TYPE_ALL,
+    REPORT_TYPE_ERRORS,
+)


 PLAIN_TEXT = "text/plain"
@@ -54,6 +62,10 @@ def report(site, config, total_time, logger=None):
     try:
         if config.options.format == FORMAT_PLAIN:
             _write_plain_text_report(site, config, output_files, total_time)
+        elif config.options.format == FORMAT_JSON:
+            _write_json_report(site, config, output_files, total_time)
+        elif config.options.format == FORMAT_JUNIT:
+            _write_junit_report(site, config, output_files, total_time)
     except Exception:
         if logger:
             logger.exception("An exception occurred while writing the report")
@@ -72,6 +84,113 @@ def _write_plain_text_report(site, config, output_files, total_time):
         _write_plain_text_report_single(site, config, output_files, total_time)


+def _write_junit_report(site, config, output_files, total_time):
+    pages = site.pages
+    test_cases = []
+
+    for url_split, resource in pages.items():
+        origins = [source.origin.geturl() for source in resource.sources]
+        if resource.status == 200:
+            test_case = TestCase(
+                name=resource.url_split.geturl(),
+                classname=url_split.hostname,
+                elapsed_sec=resource.response_time,
+                stdout=str(resource.status),
+                status="passed"
+            )
+        else:
+            stderr_message = "Link found on:\n{}".format("\n".join(origins))
+            test_case = TestCase(
+                name=resource.url_split.geturl(),
+                classname=url_split.hostname,
+                elapsed_sec=resource.response_time,
+                stderr=stderr_message,
+                status="failed"
+            )
+            if resource.exception:
+                message = str(resource.exception)
+            else:
+                message = "Expected 200 OK but got {}".format(resource.status)
+            test_case.add_failure_info(
+                message=message, failure_type="UnexpectedStatusCode")
+        test_cases.append(test_case)
+    test_suite = TestSuite("pylinkvalidator test suite", test_cases)
+    oprint(TestSuite.to_xml_string([test_suite]), files=output_files)
+    print_summary(site, config, total_time)
+
+
+def _write_json_report(site, config, output_files, total_time):
+    start_urls = ",".join(start_url_split.geturl() for start_url_split in
+                          site.start_url_splits)
+
+    total_urls = len(site.pages)
+    total_errors = len(site.error_pages)
+
+    if not site.is_ok:
+        global_status = "ERROR"
+        error_summary = "with {0} error(s) ".format(total_errors)
+    else:
+        global_status = "SUCCESS"
+        error_summary = ""
+
+    meta = {
+        "total_urls": total_urls,
+        "total_errors": total_errors,
+        "total_time": total_time,
+        "start_urls": start_urls,
+        "global_status": global_status,
+        "error_summary": error_summary
+    }
+    try:
+        avg_response_time = site.get_average_response_time()
+        avg_process_time = site.get_average_process_time()
+        meta.update({"avg_response_time": avg_response_time})
+        meta.update({"avg_process_time": avg_process_time})
+    except Exception:
+        from traceback import print_exc
+        print_exc()
+
+    pages = {}
+
+    if config.options.report_type == REPORT_TYPE_ERRORS:
+        pages = site.error_pages
+    elif config.options.report_type == REPORT_TYPE_ALL:
+        pages = site.pages
+
+    res_pages = []
+
+    for url_split, resource in pages.items():
+        details = {
+            'link': resource.url_split.geturl(),
+            'fragment': url_split.fragment,
+            'hostname': url_split.hostname,
+            'netloc': url_split.netloc,
+            'is_local': resource.is_local,
+            'is_html': resource.is_html,
+            'is_ok': resource.is_ok,
+            'is_timeout': resource.is_timeout,
+            'process_time': resource.process_time,
+            'response_time': resource.response_time,
+            'status': resource.status,
+            'path': url_split.path,
+            'port': url_split.port,
+            'query': url_split.query,
+            'scheme': url_split.scheme,
+            'origins': [source.origin.geturl() for source in resource.sources],
+            'sources':
+                [source.origin_str for source in resource.sources],
+            'targets': [source.target for source in resource.sources]
+        }
+        res_pages.append(details)
+
+    res = {
+        "meta": meta,
+        "pages": res_pages
+    }
+    oprint(json.dumps(res, sort_keys=True, indent=4, separators=(',', ': ')),
+           files=output_files)
+    print_summary(site, config, total_time)
+
+
 def _write_plain_text_report_multi(site, config, output_files, total_time):
     total_urls = len(site.pages)
     total_errors = len(site.error_pages)
@@ -163,6 +282,42 @@ def _write_plain_text_report_single(site, config, output_files, total_time):
     _print_details(pages.values(), output_files, config)


+def print_summary(site, config, total_time, indent=2):
+    total_urls = len(site.pages)
+    total_errors = len(site.error_pages)
+
+    if not site.is_ok:
+        global_status = "ERROR"
+        error_summary = "with {0} error(s) ".format(total_errors)
+    else:
+        global_status = "SUCCESS"
+        error_summary = ""
+
+    print("{0} Crawled {1} urls {2}in {3:.2f} seconds".format(
+        global_status, total_urls, error_summary, total_time))
+
+    pages = {}
+
+    if config.options.report_type == REPORT_TYPE_ERRORS:
+        pages = site.error_pages
+    elif config.options.report_type == REPORT_TYPE_ALL:
+        pages = site.pages
+
+    initial_indent = " " * indent
+    for page in pages.values():
+        print("\n{2}{0}: {1}".format(
+            page.get_status_message(), page.url_split.geturl(),
+            initial_indent))
+        for content_message in page.get_content_messages():
+            print("{1} {0}".format(content_message, initial_indent))
+        for source in page.sources:
+            print("{1} from {0} target={2}".format(
+                source.origin.geturl(), initial_indent, source.target))
+            if config.options.show_source:
+                print("{1} {0}".format(
+                    source.origin_str, initial_indent))
+
+
 def _print_details(page_iterator, output_files, config, indent=2):
     initial_indent = " " * indent
     for page in page_iterator:
@@ -174,11 +329,12 @@ def _print_details(page_iterator, output_files, config, indent=2):
             oprint("{1} {0}".format(content_message, initial_indent),
                    files=output_files)
         for source in page.sources:
-            oprint("{1} from {0}".format(
-                source.origin.geturl(), initial_indent), files=output_files)
+            oprint("{1} from {0} target={2}".format(
+                source.origin.geturl(), initial_indent, source.target),
+                files=output_files)
             if config.options.show_source:
                 oprint("{1} {0}".format(
-                    truncate(source.origin_str), initial_indent),
+                    source.origin_str, initial_indent),
                     files=output_files)
diff --git a/requirements.txt b/requirements.txt
index bd3ba7e..3024e4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-beautifulsoup4>=4.2.0
\ No newline at end of file
+beautifulsoup4>=4.2.0
+junit-xml>=1.8
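
A minimal, standalone sketch of the junit-xml calls used by
_write_junit_report above, useful for eyeballing the XML that the new junit
format emits without running a crawl. It assumes junit-xml>=1.8 as pinned in
requirements.txt; the URLs, timings, and statuses are invented for
illustration.

    from junit_xml import TestCase, TestSuite

    # A crawled page that returned 200 OK maps to a passing test case.
    ok = TestCase(
        name="http://example.com/",   # the crawled URL
        classname="example.com",      # its hostname, as in the patch
        elapsed_sec=0.12,
        stdout="200",
        status="passed")

    # A broken link maps to a failing test case: the pages that linked to
    # it go to stderr, and the status code goes into the failure message.
    broken = TestCase(
        name="http://example.com/missing",
        classname="example.com",
        elapsed_sec=0.08,
        stderr="Link found on:\nhttp://example.com/",
        status="failed")
    broken.add_failure_info(
        message="Expected 200 OK but got 404",
        failure_type="UnexpectedStatusCode")

    suite = TestSuite("pylinkvalidator test suite", [ok, broken])
    print(TestSuite.to_xml_string([suite]))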
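
The JSON report is straightforward to consume downstream, for example as a
CI gate. A hedged sketch, assuming the crawler was run with
"pylinkvalidate.py -f json -o report.json"; the key names (meta, pages,
global_status, origins, and so on) come from the dicts built in
_write_json_report above.

    import json
    import sys

    # Load the report written by "pylinkvalidate.py -f json -o report.json".
    with open("report.json") as f:
        report = json.load(f)

    meta = report["meta"]
    print("Crawled {0} urls with {1} error(s) in {2:.2f} seconds".format(
        meta["total_urls"], meta["total_errors"], meta["total_time"]))

    # List each broken page and the pages that linked to it.
    for page in report["pages"]:
        if not page["is_ok"]:
            print("BROKEN {0} (status={1}), linked from: {2}".format(
                page["link"], page["status"], ", ".join(page["origins"])))

    # A non-zero exit code fails the CI job when any link is broken.
    sys.exit(0 if meta["global_status"] == "SUCCESS" else 1)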