import glob
import os
import re
from decimal import Decimal

import distro

from ...util.constants import LOG
from ..constants import UNKNOWN_RESULT
from .oltpbench.config_parser import parse_config_file
from .oltpbench.res_parser import parse_res_file
from .oltpbench.summary_parser import parse_summary_file


def parse_oltpbench_data(results_dir):
    """
    Collect the information needed to send to the performance storage service
    from the files produced by OLTPBench.

    Parameters
    ----------
    results_dir : str
        The directory where the OLTPBench results were stored.

    Returns
    -------
    metadata : dict
        The metadata of the OLTPBench test.
    timestamp : int
        When the test was run, in milliseconds.
    benchmark_type : str
        The benchmark type (e.g., tpcc).
    parameters : dict
        The parameters that were used to run the test.
    metrics : dict
        The metrics gathered from the result of the test.
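
    Examples
    --------
    A minimal sketch of the expected call pattern; 'oltp_result' is a
    hypothetical directory produced by a prior OLTPBench run.

    >>> metadata, timestamp, benchmark_type, parameters, metrics = (
    ...     parse_oltpbench_data('oltp_result'))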
"""
env_metadata = _parse_jenkins_env_vars()
files_metadata, timestamp, benchmark_type, parameters, metrics = parse_oltpbench_files(
results_dir)
metadata = {**env_metadata, **files_metadata}
return metadata, timestamp, benchmark_type, parameters, metrics


def parse_microbenchmark_data(artifact_processor_comparison):
    """
    Collect the information needed to send to the performance storage service
    from the files produced by the microbenchmark.

    Args:
        artifact_processor_comparison (dict): The comparison object generated
            by the artifact processor.

    Returns:
        metadata (dict): The metadata of the microbenchmark test.
        test_suite (str): The name of the test suite.
        test_name (str): The name of the specific benchmark test.
        metrics (dict): The metrics gathered from the result of the test.
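
    Example (hypothetical comparison object; assumes the Jenkins environment
    variables used by parse_standard_metadata are set):
        >>> comparison = {'suite': 'runner', 'test': 'simple_insert',
        ...               'throughput': 1000.0, 'status': 'PASS'}
        >>> metadata, suite, name, metrics = parse_microbenchmark_data(comparison)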
"""
metadata = parse_standard_metadata()
test_suite, test_name, metrics = parse_microbenchmark_comparison(
artifact_processor_comparison)
return metadata, test_suite, test_name, metrics


def parse_standard_metadata():
    """
    Gather the standard metadata information from Jenkins and the DBMS.

    Returns
    -------
    metadata : dict
        The metadata obtained from Jenkins and the DBMS.

    Warnings
    --------
    Underlying implementation is hacky right now.
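
    Examples
    --------
    A sketch of the merged result; the top-level keys below follow from
    _parse_jenkins_env_vars and _parse_db_metadata.

    >>> metadata = parse_standard_metadata()  # requires Jenkins env vars
    >>> sorted(metadata.keys())
    ['environment', 'github', 'jenkins', 'noisepage']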
"""
return {**_parse_jenkins_env_vars(), **_parse_db_metadata()}


def _parse_jenkins_env_vars():
    """
    Parse environment variables from Jenkins and the OS.

    Returns
    -------
    metadata : dict
        Metadata about the Jenkins environment.
        WARNING: Note that cpu_socket is a completely garbage value.
        TODO(WAN): I'd remove cpu_socket except I'm afraid of breakages.
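
    Examples
    --------
    A sketch with hypothetical values for the environment variables that
    Jenkins normally provides (BUILD_ID, GIT_BRANCH, GIT_COMMIT).

    >>> os.environ.update({'BUILD_ID': '42', 'GIT_BRANCH': 'main',
    ...                    'GIT_COMMIT': 'abc123'})
    >>> _parse_jenkins_env_vars()['jenkins']['jenkins_job_id']
    '42'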
"""
# TODO find a way to get the socket number of
os_cpu_socket = 'true'
metadata = {
'jenkins': {
'jenkins_job_id': os.environ['BUILD_ID'],
},
'github': {
'git_branch': os.environ['GIT_BRANCH'],
'git_commit_id': os.environ['GIT_COMMIT'],
},
'environment': {
'os_version': ' '.join(distro.linux_distribution()),
'cpu_number': os.cpu_count(),
'cpu_socket': os_cpu_socket
}
}
return metadata


def parse_oltpbench_files(results_dir):
    """
    Parse information from the config and summary files generated by OLTPBench.

    Parameters
    ----------
    results_dir : str
        The directory where OLTPBench results are stored.

    Returns
    -------
    metadata : dict
        An object containing metadata information.
    timestamp : int
        The timestamp when the benchmark was created, in milliseconds.
        TODO(WAN): wtf is this?
    benchmark_type : str
        The benchmark that was run (e.g., tatp, noop).
    parameters : dict
        Information about the parameters with which the test was run.
    metrics : dict
        The summary measurements that were gathered from the test.
"""
def hack_rename(old_glob_target, new_name):
"""
Wan wants to avoid a rabbit hole of refactoring.
Therefore the new OLTPBench files are being renamed to match old expectations here.
"""
matches = glob.glob(old_glob_target)
assert len(matches) == 1
os.rename(matches[0], new_name)
hack_rename(f'{results_dir}/*.results.csv', f'{results_dir}/oltpbench.res')
hack_rename(f'{results_dir}/*.raw.csv', f'{results_dir}/oltpbench.csv')
hack_rename(f'{results_dir}/*.samples.csv', f'{results_dir}/oltpbench.samples')
hack_rename(f'{results_dir}/*.summary.json', f'{results_dir}/oltpbench.summary')
hack_rename(f'{results_dir}/*.params.json', f'{results_dir}/oltpbench.params')
hack_rename(f'{results_dir}/*.metrics.json', f'{results_dir}/oltpbench.metrics')
hack_rename(f'{results_dir}/*.config.xml', f'{results_dir}/oltpbench.expconfig')
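    # After the renames above, results_dir should contain exactly one of each:
    # oltpbench.res, oltpbench.csv, oltpbench.samples, oltpbench.summary,
    # oltpbench.params, oltpbench.metrics, and oltpbench.expconfig.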
    config_parameters = parse_config_file(results_dir + '/oltpbench.expconfig')
    metadata, timestamp, benchmark_type, summary_parameters, metrics = parse_summary_file(
        results_dir + '/oltpbench.summary')
    metrics['incremental_metrics'] = parse_res_file(results_dir + '/oltpbench.res')
    parameters = {**summary_parameters, **config_parameters}
    return metadata, timestamp, benchmark_type, parameters, metrics


def parse_microbenchmark_comparison(artifact_processor_comparison):
    """
    Extract the test suite, test name, and relevant metrics from the
    comparison object generated by the artifact processor.
    """
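    # Example (hypothetical input): given
    #     {'suite': 'runner', 'test': 'simple_insert',
    #      'throughput': 1000.0, 'status': 'PASS', 'git_branch': 'main'}
    # this returns ('runner', 'simple_insert',
    #     {'throughput': 1000.0, 'status': 'PASS'});
    # keys outside metrics_fields, such as 'git_branch', are dropped.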
    metrics_fields = [
        'throughput', 'stdev_throughput', 'tolerance', 'status', 'iterations',
        'ref_throughput', 'num_results'
    ]
    test_suite = artifact_processor_comparison.get('suite')
    test_name = artifact_processor_comparison.get('test')
    metrics = {}
    for key, value in artifact_processor_comparison.items():
        if key in metrics_fields:
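            # Round floats and Decimals to 15 decimal places; pass other
            # values (e.g., the status string) through unchanged.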
            metrics[key] = round(value, 15) if isinstance(
                value, (float, Decimal)) else value
    return test_suite, test_name, metrics


def _parse_db_metadata():
    """
    Parse metadata from the DBMS.

    Returns
    -------
    metadata : dict
        A dictionary containing metadata about the database.

    Warnings
    --------
    Giant hack that hardcodes the version number.
    The intended fallback is UNKNOWN_RESULT, although the current
    hardcoded implementation never fails.
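
    Examples
    --------
    >>> _parse_db_metadata()
    {'noisepage': {'db_version': '1.0.0'}}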
"""
return {'noisepage': {'db_version': '1.0.0'}}