diff --git a/.github/workflows/python-lint-and-test.yaml b/.github/workflows/python-lint-and-test.yaml
index 20f325645..5d7cd1c2b 100644
--- a/.github/workflows/python-lint-and-test.yaml
+++ b/.github/workflows/python-lint-and-test.yaml
@@ -10,12 +10,14 @@ on:
       - main-fixes
       - pre-release
       - dev
+      - feature-software-testing
   pull_request:
     branches:
       - main
       - main-fixes
      - pre-release
      - dev
+      - feature-software-testing
 
 jobs:
   build:
diff --git a/.gitignore b/.gitignore
index a464fa511..7f8d2ff68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ dbg/
 site
 dist/
 .cache/
+*.asm
+*.bin
+*.dat
diff --git a/consfuzz.py b/consfuzz.py
new file mode 100755
index 000000000..beb5a36e5
--- /dev/null
+++ b/consfuzz.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+"""
+File: Command Line Interface to Contract-based Software Fuzzer (ConSFuzz)
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+import sys
+from consfuzz.cli import main
+
+if __name__ == '__main__':
+    exit_code = main()
+    sys.exit(exit_code)
diff --git a/consfuzz/README.md b/consfuzz/README.md
new file mode 100644
index 000000000..ac767ee3d
--- /dev/null
+++ b/consfuzz/README.md
@@ -0,0 +1,122 @@
+# Software Leakage Fuzzer
+
+Note: This module is at the experimental stage of development and its interfaces
+may (and likely will) change in the future.
+
+This module leverages a leakage model to detect side-channel information leaks
+in software binaries. The leakage model is the same as the one used by the hardware fuzzer,
+and it is assumed to be already tested against the target CPU. The software fuzzer uses
+this model to collect contract traces for the target binary.
+
+The software fuzzer takes as input a target binary and a grammar describing the format of
+the binary's inputs. The grammar must specify which parts of the input are public and which
+are private.
+FIXME: the current prototype doesn't actually use a grammar, but instead assumes
+that the target binary takes two files as input: one for public data and one for private data.
+
+The goal of the software fuzzer is to identify cases where contract traces depend on
+the private data, which is a sign of information leakage. To this end, the fuzzer checks
+traces for the non-interference property: if two executions of the binary with different
+private values but identical public data produce different traces, then the binary is
+leaking information.
+
+The fuzzer operates in three stages:
+
+## THE BELOW IS NOT YET IMPLEMENTED (SEE "ACTUAL EXAMPLE" BELOW)
+
+## Stage 1: Public Input Generation
+
+The fuzzer uses AFL++ to generate a set of public inputs that cover a wide range of execution paths
+in the target binary.
+
+Example:
+```
+./consfuzz.py pub_gen -c config.yaml -w ~/consfuzz-results/ -t 60 --target-cov 5 -- /usr/bin/openssl enc -e -aes256 -out enc.bin -in @@ -pbkdf2 -pass @#
+```
+
+## Stage 2: [NAME TBD]
+
+The second stage combines generation of secret inputs (fully random) and tracing of the binary.
+The tracing is done for each pair of public and secret inputs, and the traces are
+collected in a directory. The underlying tracing engine is the DynamoRIO-based backend of Revizor
+(see `rvzr/model_dynamorio/backend`).
+
+Example:
+```
+./consfuzz.py stage2 -c config.yaml -w ~/consfuzz-results/ -n 10 -- /usr/bin/openssl enc -e -aes256 -out enc.bin -in @@ -pbkdf2 -pass @#
+```
+
+## Stage 3: Leakage Analysis & Reporting
+
+The third stage analyzes the traces collected in the previous stage and reports
+the results.
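+
+At its core, this stage is a pairwise comparison of contract traces. The sketch below
+illustrates the idea; it is a simplified illustration, not the actual implementation,
+and it assumes that each trace is a list of `(pc, mem_accesses)` pairs:
+
+```python
+def find_leaks(ref_trace, trace):
+    """Compare two traces that were produced with identical public inputs."""
+    leaks = []
+    for (ref_pc, ref_mem), (pc, mem) in zip(ref_trace, trace):
+        if ref_pc != pc:  # control flow depends on the secret
+            leaks.append((ref_pc, "I"))
+            break  # traces diverged; stop at the first I-type leak
+        if ref_mem != mem:  # accessed addresses depend on the secret
+            leaks.append((pc, "D"))
+    return leaks
+```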
+
+Example:
+```
+./consfuzz.py report -c config.yaml -w ~/consfuzz-results/ -b /usr/bin/openssl
+```
+
+## Stage 4: Triaging
+
+A line in the report might look something like this:
+
+```yaml
+{
+    "seq": {
+        "D": {
+            "lib/blockciphermodes.c:281": {
+                "4456491": [
+                    "/home/ubuntu/results/stage2/id:000022,src:000014,time:302,execs:289,op:havoc,rep:1/002.trace:155734:155734",
+```
+
+To triage a violation, you can use `consfuzz inspect` to either:
+
+1. Inspect it with GDB
+1. Print a use-def graph, as both a textual version and a `.dot` graph
+
+### GDB Script
+
+As an example for the above violation, you can run:
+
+```
+./consfuzz.py inspect -c consfuzz.yaml --violation "D" /home/ubuntu/results/stage2/id:000022,src:000014,time:302,execs:289,op:havoc,rep:1/002.trace:155734:155734
+```
+
+This will:
+1. analyze the trace to get the relevant PC
+2. generate a debug trace (`.dbgtrace`)
+    * the original command is taken from the `.log` file, which should be generated by the fuzzer
+3. find the same instruction in the debug trace
+4. generate a gdb script to reach the desired speculative instruction
+
+Finally, a `gdb` command is printed that can be used to spawn a shell at the leakage point.
+This gdb command sources a custom GDB plugin (`triage/plugin.py`), which makes the `spec`
+command available (see `help spec`).
+
+Subsequent invocations can use `--skip-tracing` to avoid rebuilding the debug trace.
+
+### Use-Def Chains
+
+By adding `--usedef` to the command, the tool will also generate two files:
+
+* `/home/ubuntu/results/stage2/[...]/002.usedef` contains a textual representation of the reverse use-def analysis
+* `/home/ubuntu/results/stage2/[...]/002.usedef.dot` contains a graphviz representation of the same analysis
+
+Such chains can be _very_ long. To make the analysis easier, you can:
+
+* Add a **baseline** with `--baseline`: this adds a baseline trace that is used to prune values that are identical in both traces
+    * `--baseline='auto'` simply selects the corresponding `000.trace` in the same folder
+* Add **declassified symbols** and **key expansions** that cause the analysis to stop earlier. This is done by
+    1. adding `--binary ` to the command
+    2. adding the corresponding source_code_file:line entry in the dedicated list inside `config.yaml`
+
+## ACTUAL EXAMPLE
+
+```
+echo 0 | sudo tee /proc/sys/kernel/randomize_va_space
+
+./consfuzz.py pub_gen -c dbg/consfuzz.yaml -w ~/results/ -t 10 --target-cov 50 -- ~/eval-rvzr-sw/drivers/bearssl/bearssl -k @# -i ~/eval-rvzr-sw/drivers/bearssl/test/iv.bin -o enc.bin @@
+./consfuzz.py stage2 -c dbg/consfuzz.yaml -w ~/results/ -n 2 -- ~/eval-rvzr-sw/drivers/bearssl/bearssl -k @# -i ~/eval-rvzr-sw/drivers/bearssl/test/iv.bin -o enc.bin @@
+./consfuzz.py report -c dbg/consfuzz.yaml -w ~/results -b ~/eval-rvzr-sw/drivers/bearssl/bearssl
+```
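+
+For reference, a minimal `consfuzz.yaml` might look as follows (all keys are defined in
+`consfuzz/config.py`; the values below are illustrative, and only `working_dir` and
+`afl_seed_dir` are strictly required):
+
+```yaml
+working_dir: ~/results
+archive_dir: ~/archives
+afl_seed_dir: ~/seeds
+model_root: ~/.local/dynamorio/
+afl_root: ~/.local/afl/
+afl_exec_timeout_ms: 100
+report_verbosity: 3
+coverage: false
+```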
diff --git a/consfuzz/__init__.py b/consfuzz/__init__.py
new file mode 100644
index 000000000..5e7bcf7b2
--- /dev/null
+++ b/consfuzz/__init__.py
@@ -0,0 +1,7 @@
+# flake8: noqa
+# pylint: skip-file
+
+from .config import *
+from .fuzzer import *
+
+__version__ = "0.0.1"
diff --git a/consfuzz/cli.py b/consfuzz/cli.py
new file mode 100644
index 000000000..d73a659a7
--- /dev/null
+++ b/consfuzz/cli.py
@@ -0,0 +1,240 @@
+"""
+File: Function definitions for using Contract-based Software Fuzzer (ConSFuzz) as command-line tool
+(Note: the actual CLI is accessed via consfuzz.py)
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+
+from typing import Any
+import os
+from argparse import ArgumentParser
+
+from typing_extensions import get_args
+
+from .config import Config, FuzzingStages
+from .fuzzer import FuzzerCore
+from .triager import LeakageInspector
+
+CMD_HELP =\
+    "Command to execute (e.g., 'openssl enc -e -aes256 -out enc.bin -in @@ -pbkdf2 -pass @#').\n" \
+    "NOTE: use '@@' as a placeholder for generated public argument and\n" \
+    "'@#' for generated private argument"
+
+
+def _parse_args() -> Any:  # pylint: disable=r0915
+    parser = ArgumentParser(add_help=True)
+    subparsers = parser.add_subparsers(dest='subparser_name', help="Subcommand to run")
+
+    parser.add_argument(
+        "--help-config",
+        action='store_true',
+        help="Print a help message for the configuration file format and defaults.",
+    )
+
+    # ==============================================================================================
+    # Common arguments
+    common_parser = ArgumentParser(add_help=False)
+    common_parser.add_argument(
+        "-c",
+        "--config",
+        type=str,
+        required=False,
+        help="Path to the configuration file (YAML) that will be used during fuzzing.",
+    )
+
+    # ==============================================================================================
+    # All Phases Together: Public input generation, secret input generation, and reporting
+    all_phases = subparsers.add_parser('fuzz', add_help=True, parents=[common_parser])
+    all_phases.add_argument(
+        "-t",
+        "--timeout",
+        type=int,
+        default=10,
+        help="Fuzzing timeout, in seconds (default: 10)",
+    )
+    all_phases.add_argument(
+        "-n",
+        "--num-sec-inputs",
+        type=int,
+        default=10,
+        help="Number of secret inputs to generate per public input (default: 10)",
+    )
+
+    # everything after '--' is saved into 'target_cmd' argument
+    all_phases.add_argument(
+        "target_cmd",
+        nargs="+",
+        help=CMD_HELP,
+    )
+
+    # ==============================================================================================
+    # Phase 1: Public input generation (AFL++ interface)
+    pub_gen = subparsers.add_parser('pub_gen', add_help=True, parents=[common_parser])
+    pub_gen.add_argument(
+        "-t",
+        "--timeout",
+        type=int,
+        default=10,
+        help="Fuzzing timeout, in seconds (default: 10)",
+    )
+    # TODO: target-cov is not used yet, but it will be used in the future to control the coverage
+    # pub_gen.add_argument(
+    #     "--target-cov",
+    #     type=int,
+    #     default=10,
+    #     help="Target coverage to achieve, in percentage (default: 10)",
+    # )
+
+    # everything after '--' is saved into 'target_cmd' argument
+    pub_gen.add_argument(
+        "target_cmd",
+        nargs="+",
+        help=CMD_HELP,
+    )
+
+    # ==============================================================================================
+    # Phase 2: Secret input generation and collection of contract traces
+    stage2 = subparsers.add_parser('stage2', add_help=True, parents=[common_parser])
+    stage2.add_argument(
+        "-n",
+        "--num-sec-inputs",
+        type=int,
+        default=10,
+        help="Number of secret inputs to generate per public input (default: 10)",
+    )
+
+    # everything after '--' is saved into 'target_cmd' argument
+    stage2.add_argument(
+        "target_cmd",
+        nargs="+",
+        help=CMD_HELP,
+    )
+
+    # ==============================================================================================
+    # Phase 3: Analysis of traces and reporting of leaks
+    report = subparsers.add_parser('report', add_help=True, parents=[common_parser])
+    report.add_argument(
+        "-b",
+        "--target-binary",
+        type=str,
+        required=True,
+        help="Path to the target binary to be fuzzed (e.g., '/usr/bin/openssl')",
+    )
+
+    # ==============================================================================================
+    # Phase 4: Leak inspection/triaging
+    inspect = subparsers.add_parser('inspect', add_help=True, parents=[common_parser])
+    inspect.add_argument("file_and_line",
+                         type=str,
+                         help="A string as found in the report")
+    inspect.add_argument("-b",
+                         "--binary",
+                         type=str,
+                         help="Path of the binary to read the debug symbols from",
+                         default=None)
+    inspect.add_argument("-v",
+                         "--violation",
+                         type=str,
+                         help="Which violation has been found at the specified line",
+                         choices=["I", "D", "C"])
+    inspect.add_argument("--skip-tracing",
+                         action="store_true",
+                         help="Avoid regenerating traces")
+    inspect.add_argument("--usedef",
+                         action="store_true",
+                         help="Generate a use-def analysis for this violation")
+    inspect.add_argument("--debug-trace",
+                         action="store_true",
+                         help="The provided file path is already a debug trace")
+    inspect.add_argument("--baseline",
+                         type=str,
+                         default="none",
+                         help="Select a second trace as baseline for differential analysis. " +
+                              "Specify 'auto' for automatic baseline and 'none' for no differential analysis.")
+
+
+    args = parser.parse_args()
+
+    # Custom check for subparser name
+    if not args.subparser_name and not args.help_config:
+        parser.print_help()
+        return None
+
+    return args
+
+
+def _validate_args(args: Any) -> bool:
+    """
+    Validate the command-line arguments, beyond the basic checks done by argparse.
+    :param args: parsed CLI arguments
+    :return: True if paths are valid, False otherwise
+    """
+    if args.subparser_name == 'report':
+        # check if target_binary exists
+        if not args.target_binary or not os.path.exists(args.target_binary):
+            print(f"ERROR: Target binary '{args.target_binary}' not found")
+            return False
+
+    return True
+
+
+def main() -> int:
+    """ Main function for the CLI """
+
+    # pylint: disable=too-many-return-statements,too-many-branches
+    # NOTE: disabling is justified here, as this function is the main entry point
+    # and it naturally has many branches due to different subcommands
+
+    args = _parse_args()
+    if args is None:
+        return 1
+    if not _validate_args(args):
+        return 1
+
+    # Config help requested
+    if args.help_config:
+        print(Config.help())
+        return 0
+
+    assert args.subparser_name in get_args(FuzzingStages)
+    config = Config(args.config, args.subparser_name)
+    fuzzer = FuzzerCore(config)
+
+    # Start the fuzzer in the mode requested by the user
+    if args.subparser_name == 'pub_gen':
+        return fuzzer.generate_public_inputs(
+            cmd=args.target_cmd,
+            target_cov=0,  # TODO: will be replaced with args.target_cov when implemented
+            timeout_s=args.timeout,
+        )
+    if args.subparser_name == 'stage2':
+        return fuzzer.stage2(
+            cmd=args.target_cmd,
+            num_sec_inputs=args.num_sec_inputs,
+        )
+    if args.subparser_name == 'report':
+        return fuzzer.report(target_binary=args.target_binary)
+    if args.subparser_name == 'fuzz':
+        return fuzzer.all(
+            cmd=args.target_cmd,
+            target_cov=0,  # TODO: will be replaced with args.target_cov when implemented
+            timeout_s=args.timeout,
+            num_sec_inputs=args.num_sec_inputs,
+        )
+
+    if args.subparser_name == 'inspect':
+        inspector = LeakageInspector(config.get_inspector_config())
+        return inspector.inspect(
+            file_and_line=args.file_and_line,
+            violation=args.violation,
+            baseline=args.baseline,
+            binary=args.binary,
+            skip_tracing=args.skip_tracing,
+            usedef=args.usedef,
+            debug_trace=args.debug_trace
+        )
+
+    print("ERROR: Unknown subcommand")
+    return 1
diff --git a/consfuzz/config.py b/consfuzz/config.py
new file mode 100644
index 000000000..121db7ff7
--- /dev/null
+++ b/consfuzz/config.py
@@ -0,0 +1,371 @@
+"""
+File: Global ConSFuzz configuration.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+
+from typing import Dict, Optional, Literal, Any, Final, List
+import os
+import pathlib
+import shutil
+
+import yaml
+from typing_extensions import assert_never
+
+from .triage.config import LeakageInspectorConfig
+
+FuzzingStages = Literal["fuzz", "pub_gen", "stage2", "report", "inspect"]
+YAMLData = Dict[str, Any]
+ReportVerbosity = Literal[1, 2, 3]
+
+
+# ==================================================================================================
+# Service Classes
+# ==================================================================================================
+class _ConfigException(SystemExit):
+    """ Custom exception class for configuration errors. """
+
+    def __init__(self, var: str, message: str) -> None:
+        super().__init__(f"[ERROR] Invalid value of config variable {var}\nIssue: {message}\n")
+
+
+class _WorkingDirManager:
+    """
+    Helper class for managing the working directory.
+    It ensures that the working directory is created and preserved properly.
+    """
+
+    def __init__(self, config: Config) -> None:
+        self.config = config
+
+    def set_working_dirs(self, stage: FuzzingStages) -> None:
+        """
+        Ensure that the working directory is set up correctly.
+
+        Algorithm:
+        1. If the directory does not exist, throw an exception.
+        2. If the directory is empty, do nothing.
+        3. If the directory is not empty:
+            - If `force_working_dir_overwrite` is set, remove the contents of the subdirectory
+              corresponding to the given stage.
+            - If `archive_dir` is set, archive the contents of the subdirectory corresponding
+              to the given stage into `archive_dir` and remove the contents of the subdirectory.
+            - If `archive_dir` is not set, throw an exception.
+        """
+        assert self.config.working_dir is not None, \
+            "working_dir must be checked before calling this method."
+
+        # Throw an exception if the working directory does not exist
+        if not pathlib.Path(self.config.working_dir).is_dir():
+            raise _ConfigException(
+                "working_dir",
+                f"Working directory {self.config.working_dir} does not exist. "
+                "Please create it before running the fuzzer.",
+            )
+
+        # Empty working directory? No risk of overwriting anything
+        if not os.listdir(self.config.working_dir):
+            if stage == "fuzz":
+                os.makedirs(self.config.stage1_wd, exist_ok=True)
+                os.makedirs(self.config.stage2_wd, exist_ok=True)
+                os.makedirs(self.config.stage3_wd, exist_ok=True)
+            return
+
+        # Inspector doesn't need a working dir
+        # TODO: For now the inspector creates files in-place in the stage2 folder
+        if stage == "inspect":
+            return
+
+        # Identify the target directory for the given stage
+        if stage == "fuzz":
+            stage_dir = self.config.working_dir
+        elif stage == "pub_gen":
+            stage_dir = self.config.stage1_wd
+        elif stage == "stage2":
+            stage_dir = self.config.stage2_wd
+        elif stage == "report":
+            stage_dir = self.config.stage3_wd
+        else:
+            assert_never(stage)
+
+        # Stage directory does not exist? Create it
+        if not os.path.exists(stage_dir):
+            os.makedirs(stage_dir, exist_ok=True)
+            return
+
+        # Stage directory exists, but is empty? We're good to go
+        if not os.listdir(stage_dir):
+            return
+
+        # If force overwrite is set, remove the contents of the target directory
+        if self.config.force_working_dir_overwrite:
+            print(f"[INFO] Directory {stage_dir} is not empty; removing its contents.")
+            self._reset_dirs(stage_dir, stage)
+            os.makedirs(stage_dir, exist_ok=True)
+            return
+
+        # If archive directory is not set and force overwrite is not set, raise an exception
+        if self.config.archive_dir is None:
+            raise _ConfigException(
+                "archive_dir",
+                "Working directory is not empty and force_working_dir_overwrite is not set. "
" + "Please set archive_dir to preserve the contents of the working directory.", + ) + + # Archive based on the stage + self._archive(stage_dir, stage, self.config.working_dir, self.config.archive_dir) + self._reset_dirs(stage_dir, stage) + + def _archive(self, source_dir: str, target_name: str, working_dir: str, + archive_dir: str) -> None: + """ Archive the contents of source_dir to the archive directory """ + # Ensure that archives have unique names + primary_timestamp = int(pathlib.Path(working_dir).stat().st_mtime) + archive_name = f"{primary_timestamp}_{target_name}" + + # Ensure that different per-stage archives from the same work dir have unique names + if source_dir != working_dir: + secondary_timestamp = int(pathlib.Path(source_dir).stat().st_mtime) + archive_name += f"_{secondary_timestamp}" + + archive_path = archive_dir + "/" + archive_name + + # Create the archive + shutil.make_archive(archive_path, 'gztar', str(source_dir)) + print(f"[INFO] Archived {working_dir} to {archive_path}.tar.gz.") + + def _reset_dirs(self, stage_dir: str, stage: FuzzingStages) -> None: + shutil.rmtree(stage_dir) + os.makedirs(stage_dir, exist_ok=True) + if stage == "fuzz": + os.makedirs(self.config.stage1_wd, exist_ok=True) + os.makedirs(self.config.stage2_wd, exist_ok=True) + os.makedirs(self.config.stage3_wd, exist_ok=True) + + +# ================================================================================================== +# Main Configuration Class +# ================================================================================================== +class Config: + """ + Class responsible for storing global fuzzing configuration. + + Note: This class is expected to be instantiated only once by `rvzr-sw.py` and passed to all + other modules by reference. + """ + # pylint: disable=too-few-public-methods,too-many-instance-attributes + # NOTE: disabling is justified here, as this class is a configuration holder + + __config_instantiated: bool = False + """ Class-local flag that allows us to detect attempts to instantiate Config more than once. """ + + _internal_opts: Final[List[str]] = ["stage1_wd", "stage2_wd", "stage3_wd", "_inspector_config"] + _help: str = "" + _inspector_config: LeakageInspectorConfig = LeakageInspectorConfig() + + # ============================================================================================== + # Fuzzing directories + working_dir: Optional[str] = None + _help += """\n\n working_dir (None) + Working directory for the fuzzer. It will contain all fuzzing artifacts as well as + log files and fuzzing reports. """ + + archive_dir: Optional[str] = None + _help += """\n\n archive_dir (None) + Directory where the fuzzing artifacts from previous runs will be archived. + If the working directory is non-empty and `force_working_dir_overwrite` is False, + the contents of the working_dir will be moved into archive_dir into a timestamped archive. """ + + force_working_dir_overwrite: bool = False + _help += """\n\n force_working_dir_overwrite (False) + Flag indicating whether the fuzzer should overwrite the working directory + if it already exists. + * If set to True, the fuzzer will remove the contents of + the working directory before starting. + * If set to False, the fuzzer will refuse to run if the working directory is not empty and + the `archive_dir` is not set. 
""" + + # internal working directories for each stage of the fuzzing process + # (cannot be set directly from the config YAML file) + stage1_wd: str + stage2_wd: str + stage3_wd: str + + # ============================================================================================== + # Fuzzing parameters + contract_observation_clause: str = "ct" + _help += """\n\n contract_observation_clause (ct)""" + contract_execution_clause: str = "seq" + _help += """\n\n contract_execution_clause (seq)""" + + # ============================================================================================== + # DR backend parameters + model_root: str = "~/.local/dynamorio/" + _help += """\n\n model_root (~/.local/dynamorio/) + Path to the directory containing the installation of the leakage model. """ + + # ============================================================================================== + # AFL++ parameters + afl_root: str = "~/.local/afl/" + _help += """\n\n afl_root (~/.local/afl/) + th to the directory containing the installation of AFL++. """ + + afl_seed_dir: Optional[str] = None + _help += """\n\n afl_seed_dir (None) + Path to the directory containing the seed corpus for AFL++. """ + + afl_exec_timeout_ms: int = 100 + _help += """\n\n afl_exec_timeout_ms (100) + Timeout for AFL++ execution, in milliseconds. """ + + # afl_qemu_mode: bool = False + # """ Flag indicating whether AFL++ should be run in QEMU mode. """ + + # ============================================================================================== + # Reporting parameters + report_verbosity: ReportVerbosity = 3 + _help += """\n\n report_verbosity (3) + Verbosity level for the report: + * 1 - only lines of code with leaks; + * 2 - also include PC of the instructions that cause the leaks; + * 3 - also include the file names of the traces that contain the leaks """ + + report_allowlist: Optional[str] = None + _help += """\n\n report_allowlist (None) + Path to a file containing a list of allowed lines of code, in the format: + : + If set, the report will only include lines of code that are not in this list. + This is useful for filtering out known leaks or false positives. """ + + # ============================================================================================== + # Coverage-related parameters + coverage: bool = False + _help += """\n\n coverage (True) + Flag indicating whether the fuzzer should collect coverage information. + If set to True, the fuzzer will execute an additional run in Stage 2 where it will run + the target binary with the generated public-private input pairs and collect + coverage information. This information will be later used to build a coverage model + for the complete fuzzing campaign, and it will be summarized in the final report. 
+ """ + + llvm_cov_cmd: str = "llvm-cov" + llvm_profdata_cmd: str = "llvm-profdata" + + def __init__(self, config_yaml: str, stage: FuzzingStages) -> None: + if Config.__config_instantiated: + raise RuntimeError("Config class should be instantiated only once.") + Config.__config_instantiated = True + + # Parse the config YAML file and ensure that it is set up correctly + yaml_data = self._parse_yaml(config_yaml) + self._set_from_yaml(yaml_data) + self._inspector_config.parse(yaml_data.get('inspector', {})) + self._validate_config() + + # Ensure that the working directory is managed properly + wd_manager = _WorkingDirManager(self) + wd_manager.set_working_dirs(stage) + + @classmethod + def help(cls) -> str: + """ + Return a help string describing all configuration options. + :return: Help string + """ + help_str = "ConSFuzz Configuration Options:\n" + help_str += cls._help + return help_str + + def get_inspector_config(self) -> LeakageInspectorConfig: + return self._inspector_config + + def _parse_yaml(self, config_yaml: str) -> YAMLData: + """ + Parse the YAML configuration file. + :param config_yaml: Path to the YAML configuration file + :return: Parsed configuration data as a dictionary + """ + if not os.path.exists(config_yaml): + raise SystemExit(f"[ERROR] Config YAML file {config_yaml} does not exist.") + with open(config_yaml, 'r') as file: + config_data = yaml.safe_load(file) + if not isinstance(config_data, dict): + raise SystemExit(f"[ERROR] YAML file {config_yaml} isn't a valid ConSFuzz config file.") + return config_data + + def _set_from_yaml(self, yaml_data: YAMLData) -> None: + """ + Set configuration values from the parsed YAML data. + :param yaml_data: Parsed configuration data as a dictionary + """ + self.working_dir = yaml_data.get("working_dir", None) + if self.working_dir is None: + raise _ConfigException("working_dir", + "working_dir is a required field in the config file.") + + self.working_dir = str(pathlib.Path(self.working_dir).expanduser()) + self.stage1_wd = os.path.join(self.working_dir, "stage1") + self.stage2_wd = os.path.join(self.working_dir, "stage2") + self.stage3_wd = os.path.join(self.working_dir, "stage3") + + self.archive_dir = yaml_data.get("archive_dir", None) + if self.archive_dir is not None: + self.archive_dir = str(pathlib.Path(self.archive_dir).expanduser()) + + self.force_working_dir_overwrite = yaml_data.get("force_working_dir_overwrite", + self.force_working_dir_overwrite) + + self.model_root = yaml_data.get("model_root", self.model_root) + if not self.model_root.startswith("/"): + self.model_root = str(pathlib.Path(self.model_root).expanduser()) + + self.afl_root = yaml_data.get("afl_root", self.afl_root) + if not self.afl_root.startswith("/"): + self.afl_root = str(pathlib.Path(self.afl_root).expanduser()) + + self.afl_seed_dir = yaml_data.get("afl_seed_dir", self.afl_seed_dir) + if self.afl_seed_dir is not None: + self.afl_seed_dir = str(pathlib.Path(self.afl_seed_dir).expanduser()) + + self.afl_exec_timeout_ms = yaml_data.get("afl_exec_timeout_ms", self.afl_exec_timeout_ms) + + self.contract_observation_clause = yaml_data.get("contract_observation_clause", + self.contract_observation_clause) + self.contract_execution_clause = yaml_data.get("contract_execution_clause", + self.contract_execution_clause) + + self.report_verbosity = yaml_data.get("report_verbosity", self.report_verbosity) + self.report_allowlist = yaml_data.get("report_allowlist", self.report_allowlist) + + self.coverage = yaml_data.get("coverage", self.coverage) + 
+        self.llvm_cov_cmd = yaml_data.get("llvm_cov_cmd", self.llvm_cov_cmd)
+        self.llvm_profdata_cmd = yaml_data.get("llvm_profdata_cmd", self.llvm_profdata_cmd)
+
+        # check for attempts to set internal config variables
+        for opt in self._internal_opts:
+            if opt in yaml_data:
+                raise _ConfigException(
+                    opt, f"Option {opt} is for internal use only and should not be set in"
+                    " the user config; use working_dir instead.")
+
+    def _validate_config(self) -> None:
+        """
+        Validate the configuration values.
+        """
+        if not pathlib.Path(self.model_root).expanduser().is_dir():
+            raise _ConfigException("model_root", f"{self.model_root} does not exist.")
+        if not pathlib.Path(self.afl_root).expanduser().is_dir():
+            raise _ConfigException("afl_root", f"{self.afl_root} does not exist.")
+        if self.afl_seed_dir is None:
+            raise _ConfigException("afl_seed_dir", "Seed directory is not set.")
+        if not pathlib.Path(self.afl_seed_dir).expanduser().is_dir():
+            raise _ConfigException("afl_seed_dir", f"{self.afl_seed_dir} does not exist.")
+
+        if self.coverage and not shutil.which(self.llvm_cov_cmd):
+            raise _ConfigException("llvm_cov_cmd", f"command {self.llvm_cov_cmd} not found.")
+        if self.coverage and not shutil.which(self.llvm_profdata_cmd):
+            raise _ConfigException("llvm_profdata_cmd",
                                   f"command {self.llvm_profdata_cmd} not found.")
diff --git a/consfuzz/fuzzer.py b/consfuzz/fuzzer.py
new file mode 100644
index 000000000..f2eb59d54
--- /dev/null
+++ b/consfuzz/fuzzer.py
@@ -0,0 +1,91 @@
+"""
+File: Implementation of the high-level fuzzing logic for the software leakage fuzzer.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, List
+
+from .pub_gen import PubGen
+from .sec_gen import SecGen
+from .tracer import Tracer
+from .reporter import Reporter
+
+if TYPE_CHECKING:
+    from .config import Config
+
+
+class FuzzerCore:
+    """
+    Class responsible for orchestrating the fuzzing process.
+    """
+    _config: Config
+    _working_dir: str
+
+    def __init__(self, config: Config) -> None:
+        self._config = config
+
+    def all(self, cmd: List[str], target_cov: int, timeout_s: int, num_sec_inputs: int) -> int:
+        """
+        Run all fuzzing stages: public input generation, private input generation, and reporting.
+
+        :param cmd: Command to run the target binary, with placeholders for public (@@)
+                    and private (@#) inputs
+        :param target_cov: Target coverage to achieve
+        :param timeout_s: Timeout for the fuzzing process
+        :param num_sec_inputs: Number of secret (private) inputs to generate for each public input
+        :return: 0 if successful, 1 if error occurs
+        """
+        if self.generate_public_inputs(cmd, target_cov, timeout_s) != 0:
+            return 1
+        print("\n")  # Print a newline for better readability in the console output
+
+        if self.stage2(cmd, num_sec_inputs) != 0:
+            return 1
+
+        return self.report(cmd[0])
+
+    def generate_public_inputs(self, cmd: List[str], target_cov: int, timeout_s: int) -> int:
+        """
+        Fuzzing Stage 1:
+        Generate public inputs with PubGen.
+
+        :param cmd: Command to run the target binary, with placeholders for public (@@)
+                    and private (@#) inputs
+        :param target_cov: Target coverage to achieve
+        :param timeout_s: Timeout for the fuzzing process
+        :return: 0 if the target coverage or timeout is reached, 1 if error occurs
+        """
+        pub_gen = PubGen(self._config)
+        return pub_gen.generate(cmd, target_cov, timeout_s)
+
+    def stage2(self, cmd: List[str], num_sec_inputs: int) -> int:
+        """
+        Fuzzing Stage 2:
+        Generate private inputs, and collect contract traces for each public-private input pair.
+
+        :param cmd: Command to run the target binary, with placeholders for public (@@)
+                    and private (@#) inputs
+        :param num_sec_inputs: Number of secret (private) inputs to generate for each public input
+        :return: 0 if successful, 1 if error occurs
+        """
+        sec_gen = SecGen(self._config)
+        tracer = Tracer(self._config)
+
+        sec_gen.generate(num_sec_inputs)
+        return tracer.collect_traces(cmd)
+
+    def report(self, target_binary: str) -> int:
+        """
+        Fuzzing Stage 3:
+        Analyze the target binary for software leakage and generate a report.
+
+        :param target_binary: Path to the target binary
+        :return: 0 if successful, 1 if error occurs
+        """
+        reporter = Reporter(self._config)
+        reporter.analyze()
+        reporter.generate_report(target_binary)
+        reporter.process_coverage(target_binary)
+        return 0
diff --git a/consfuzz/logger.py b/consfuzz/logger.py
new file mode 100644
index 000000000..4b36ebc35
--- /dev/null
+++ b/consfuzz/logger.py
@@ -0,0 +1,58 @@
+"""
+File: Module responsible for logging.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+
+
+class Logger:
+    """
+    Logger class for managing logging operations.
+    """
+
+    def __init__(self, name: str) -> None:
+        self._name = name
+
+    def critical(self, message: str) -> None:
+        """Log a critical message with the logger's name and exit."""
+        print(f"[CRITICAL] {self._name}: {message}")
+        raise SystemExit(1)
+
+    def error(self, message: str) -> None:
+        """Log an error message with the logger's name."""
+        print(f"[ERROR] {self._name}: {message}")
+
+    def info(self, message: str) -> None:
+        """Log an informational message with the logger's name."""
+        print(f"[INFO] {self._name}: {message}")
+
+
+class Color:
+    """
+    ANSI color codes.
+    """
+    PURPLE = '\033[1;35;48m'
+    CYAN = '\033[1;36;48m'
+    BOLD = '\033[1;37;48m'
+    BLUE = '\033[1;34;48m'
+    GREEN = '\033[1;32;48m'
+    YELLOW = '\033[1;33;48m'
+    RED = '\033[1;31;48m'
+    BLACK = '\033[1;30;48m'
+    UNDERLINE = '\033[4;37;48m'
+    END = '\033[1;37;0m'
+
+
+def printc(color: str, text: str) -> None:
+    """
+    Print a colored string.
+    """
+    print(color + str(text) + Color.END)
+
+
+def getc(color: str, text: str) -> str:
+    """
+    Return a colored version of a string.
+    """
+    return color + str(text) + Color.END
diff --git a/consfuzz/pub_gen.py b/consfuzz/pub_gen.py
new file mode 100644
index 000000000..2f110db67
--- /dev/null
+++ b/consfuzz/pub_gen.py
@@ -0,0 +1,85 @@
+"""
+File: Module responsible for generation of diverse public inputs for the target binary.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Final, List
+
+import os
+import sys
+import subprocess
+
+if TYPE_CHECKING:
+    from .config import Config
+
+
+class PubGen:
+    """
+    Class responsible for generating public inputs for the target binary using AFL++.
+ """ + _config: Config + _wd: Final[str] # Working directory for AFL++ + + _afl_bin: Final[str] # Path to the AFL++ binary + _libcompcov: Final[str] # Path to the libcompcov.so library + + def __init__(self, config: Config) -> None: + self._config = config + self._wd = config.stage1_wd + self._afl_bin = os.path.join(config.afl_root, "afl-fuzz") + self._libcompcov = os.path.join(config.afl_root, "libcompcov.so") + + def generate(self, cmd: List[str], target_cov: int, timeout_s: int) -> int: + """ + Generate public inputs for the target binary invoked with the given command. + The generation continues until either the target coverage is achieved or + the timeout is reached. + + :param cmd: Command to run the target binary, with placeholders for public (@@) + and private (@#) inputs + :param target_cov: Target coverage to achieve + :param timeout_s: Timeout for the fuzzing process + :return: 0 if the target coverage or timeout is reached, 1 if error occurs + """ + return self._start_afl_fuzz(cmd, target_cov, timeout_s) + + def _start_afl_fuzz(self, cmd: List[str], _: int, timeout_s: int) -> int: + """ + Starts the AFL++ fuzzing process. + """ + assert self._config.afl_seed_dir is not None, "AFL seed directory not set." + + # configure the AFL++ environment + env = os.environ.copy() + env["AFL_COMPCOV_LEVEL"] = "2" + env["AFL_PRELOAD"] = self._libcompcov + env["AFL_KEEP_TRACES"] = "1" + env["AFL_SKIP_CPUFREQ"] = "1" + + afl_flags = [ + "-V", + str(timeout_s), "-c", cmd[0], "-i", self._config.afl_seed_dir, "-o", self._wd, "-t", + str(self._config.afl_exec_timeout_ms) + ] + + cmd = [self._afl_bin] + afl_flags + ["--"] + cmd + # print(cmd, flush=True) + + try: + subprocess.check_call(cmd, timeout=timeout_s, env=env, shell=False) + except subprocess.TimeoutExpired: + # ignore timeout errors + # it just means a clock mismatch between AFL and this function + pass + except subprocess.CalledProcessError as e: + print(f"[AFL ERROR]: {e}") + return 1 + finally: + # Workaround: AFL++ corrupts the terminal output under some environments; + # Force cursor restoration to mitigate this issue. + sys.stdout.write('\033[?25h') # ANSI escape to show cursor + sys.stdout.flush() + + return 0 diff --git a/consfuzz/reporter.py b/consfuzz/reporter.py new file mode 100644 index 000000000..74aeef1b0 --- /dev/null +++ b/consfuzz/reporter.py @@ -0,0 +1,578 @@ +""" +File: Module responsible for Stage 3 of the fuzzing process: analysis of the collected traces + and reporting of the results. + +Copyright (C) Microsoft Corporation +SPDX-License-Identifier: MIT +""" +from __future__ import annotations +from typing import TYPE_CHECKING, List, Tuple, Optional, Dict, Iterator, NewType, Literal, \ + Final, Union + +import os +import json +from copy import deepcopy +from subprocess import run +from elftools.elf.elffile import ELFFile # type: ignore +from typing_extensions import assert_never +from tqdm import tqdm + +from rvzr.model_dynamorio.trace_decoder import TraceDecoder, TraceEntryType + +if TYPE_CHECKING: + from .config import Config, ReportVerbosity + +# ================================================================================================== +# Local type definitions +# ================================================================================================== +PC = NewType('PC', int) +""" Program Counter, used to identify instructions in the trace. """ + +TraceFileName = NewType('TraceFileName', str) +""" Name of the trace file, used to link leaks back the trace file they were found in. 
""" + +LeakType = Literal['I', 'D'] +""" Type of the leak: + 'I' for instruction leaks (e.g., secret dependent branch), + 'D' for data leaks (e.g., secret dependent memory access). +""" + +TraceLine = NewType('TraceLine', int) +""" Line number in the trace file, used to locate the leak in the original trace file. """ + +LeakyInstr = Tuple[PC, LeakType, TraceLine, TraceLine] +""" A tuple representing a leaky instruction: + * First element is the program counter (PC) of the instruction, + * Second element is the line number in the trace file where + the instruction was found, + * Third element is the line number in the reference trace file + (000.trace, which is the same for all leaks), + * Fourth element is the type of the leak (see LeakType). +""" + +LinesInTracePair = NewType('LinesInTracePair', str) +""" A string representing a location of a leak in a trace pair. + It is in the format "trace_file_name:line_number_in_trace:line_number_in_reference", + where: + * trace_file_name is the name of the trace file where + the leak was found, + * line_number_in_trace is the line number in the trace file where + the leak was found, + * line_number_in_reference is the line number in the reference trace file + (000.trace, which is the same for all leaks). +""" + +LeakageMap = Dict[ + LeakType, + Dict[ + PC, + List[LinesInTracePair], + ], +] +""" Map of leaks found in the traces, indexed by leak type and PC. + The value is a list of trace file names where the leak was found. +""" + +CodeLine = NewType('CodeLine', str) +""" Location of a line in the source code, used to group leaks by code lines. + It is a string in the format "filename:line_number", where + * filename is the name of the source file, + * line_number is the line number in the source file. +""" + +LeakageLineMapVrb3 = Dict[ + LeakType, + Dict[ + CodeLine, + Dict[ + PC, + List[LinesInTracePair], + ], + ], +] +""" Map of unique leaky lines of code, indexed by leak type and code line. + The value is a map of PCs where the leak was found, and a list of locations + where the leak was found in the trace files. +""" + +LeakageLineMapVrb2 = Dict[ + LeakType, + Dict[ + CodeLine, + List[PC], + ], +] +""" A variant of LeakageLineMap for the lower verbosity level (verbosity 2). """ + +LeakageLineMapVrb1 = Dict[ + LeakType, + List[CodeLine], +] +""" A variant of LeakageLineMap for the lowest verbosity level (verbosity 1). """ + +LeakageLineMap = Union[ + LeakageLineMapVrb3, + LeakageLineMapVrb2, + LeakageLineMapVrb1, +] + + +# ================================================================================================== +# Classes representing parsed traces and their elements +# ================================================================================================== +class _TracedInstruction: + pc: Final[PC] + mem_accesses: List[int] + lit: Final[TraceLine] + + def __init__(self, pc: int, lit: int) -> None: + self.pc = PC(pc) + self.mem_accesses = [] + self.lit = TraceLine(lit) + + def __eq__(self, value: object) -> bool: + assert isinstance(value, _TracedInstruction) + return self.pc == value.pc and self.mem_accesses == value.mem_accesses + + +class _Trace: + """ + A trace of a contract execution, containing a list of instructions executed + during the execution and their memory accesses. 
+ """ + file_name: Final[TraceFileName] + + def __init__(self, file_name: str) -> None: + self.file_name = TraceFileName(file_name) + self.instructions: List[_TracedInstruction] = [] + + def __len__(self) -> int: + return len(self.instructions) + + def __iter__(self) -> Iterator[_TracedInstruction]: + return iter(self.instructions) + + def __getitem__(self, item: int) -> _TracedInstruction: + return self.instructions[item] + + def append(self, instruction: _TracedInstruction) -> None: + """ Append a new instruction to the trace. """ + self.instructions.append(instruction) + + +# ================================================================================================== +# Trace parsing and leakage analysis +# ================================================================================================== +class _Analyser: + """ + Class responsible for checking the collected contract traces for violations of the + non-interference property. + """ + trace_decoder: TraceDecoder + + def __init__(self) -> None: + self.trace_decoder = TraceDecoder() + + def build_leakage_map(self, stage2_dir: str) -> LeakageMap: + """ + Analyse all leaks stored in the given directory after a completed fuzzing campaign. + """ + leakage_map: LeakageMap = {'I': {}, 'D': {}} + input_groups = os.listdir(stage2_dir) + + # Iterate over all input groups + # (i.e., groups of traces collected from the same public input) + inputs: Dict[str, List[str]] = {} + for input_group in input_groups: + input_group_dir = os.path.join(stage2_dir, input_group) + + # Get a reference trace for the given group; we will use it to check that + # all other traces are the same + reference_trace_file = os.path.join(input_group_dir, "000.trace") + if not os.path.exists(reference_trace_file) or \ + os.path.exists(reference_trace_file.replace(".trace", ".failed")): + # Ignore faulty traces + # TODO: add to failed list + continue + + inputs[reference_trace_file] = [] + + # Compare the reference trace with all other traces in the group + for trace_file in os.listdir(input_group_dir): + # skip non-trace files and the reference trace itself + if not trace_file.endswith(".trace"): + continue + if trace_file == "000.trace": + continue + + # parse the trace file and extract a list of leaky instructions + trace_file = os.path.join(input_group_dir, trace_file) + if os.path.exists(trace_file.replace(".trace", ".failed")): + # Ignore faulty traces + # TODO: add to failed list + continue + inputs[reference_trace_file].append(trace_file) + + # Initialize a progress bar to track the progress of the analysis + progress_bar = tqdm( + total=sum(len(traces) for ref_file, traces in inputs.items()), + colour='green', + ) + + # Collect traces for each pair and check for leaks + for reference_trace_file, trace_files in inputs.items(): + reference_trace = self._parse_trace_file(reference_trace_file) + + for trace_file in trace_files: + progress_bar.update() + trace = self._parse_trace_file(trace_file) + leaky_instructions = self._identify_leaks(reference_trace, trace) + + # nothing to do if there are no leaky instructions + if not leaky_instructions: + continue + + # add the leaky instructions to the global map + self._update_global_map(leakage_map, leaky_instructions, trace_file) + + progress_bar.close() + return leakage_map + + def _parse_trace_file(self, trace_file: str) -> _Trace: + trace = _Trace(trace_file) + raw_traces, _ = self.trace_decoder.decode_trace_file(trace_file) + assert len(raw_traces) > 0, f"No trace found for {trace_file}" + + for i, entry 
+            type_ = TraceEntryType(entry.type)
+            if type_ == TraceEntryType.ENTRY_PC:
+                trace.append(_TracedInstruction(entry.addr, i + 1))
+            elif type_ in (TraceEntryType.ENTRY_READ, TraceEntryType.ENTRY_WRITE,
+                           TraceEntryType.ENTRY_IND):
+                trace[-1].mem_accesses.append(entry.addr)
+
+        return trace
+
+    def _identify_leaks(self, ref_trace: _Trace, target_trace: _Trace) -> List[LeakyInstr]:
+        """
+        Check the given set of contract traces for violations of the non-interference property
+        and return a list of addresses of instructions that violate the property (i.e., are leaky).
+
+        The function walks through the two traces in lockstep, comparing each instruction.
+        At each step, three options are possible:
+        1. If the PCs of the instructions and their memory accesses match,
+           then the instruction is not leaky. Move to the next instruction.
+        2. If the PCs of the instructions match, but their memory accesses differ,
+           then the instruction has a D-type leak. Record it and move to the next instruction.
+        3. If the PCs of the instructions differ, then the instruction has an I-type leak.
+           Record the previous instruction as a leak and rewind to the merge point.
+           FIXME: the rewind is not implemented yet; instead, the function terminates after
+           the first I-type leak is found.
+
+        :param ref_trace: Reference trace to compare against
+        :param target_trace: Trace to check for leaks
+        :return: List of addresses of leaky instructions
+        """
+        if ref_trace == target_trace:
+            return []
+
+        # Initialize the variables to track the leaky instructions and the current entry
+        leaky_instructions: List[LeakyInstr] = []
+        curr_ref_entry: _TracedInstruction
+        curr_tgt_entry: _TracedInstruction
+        prev_entry: Optional[_TracedInstruction] = None
+        entry_id: int = 0
+        end_id: int = min(len(ref_trace), len(target_trace))
+
+        # Iterate through the traces until the end of the shorter trace
+        while entry_id < end_id:
+            curr_ref_entry = ref_trace[entry_id]
+            curr_tgt_entry = target_trace[entry_id]
+
+            # I-type leak: the PC of the instruction differs
+            if curr_ref_entry.pc != curr_tgt_entry.pc:
+                # Record the previous instruction as a leak
+                if prev_entry is not None:
+                    leak: LeakyInstr = (prev_entry.pc, 'I', prev_entry.lit, prev_entry.lit)
+                    leaky_instructions.append(leak)
+                # Rewind to the merge point
+                # FIXME: the rewind is not implemented yet; instead, we terminate
+                return leaky_instructions
+
+            # D-type leak: the PC of the instruction matches, but memory accesses differ
+            if curr_ref_entry.mem_accesses != curr_tgt_entry.mem_accesses:
+                # Record the current instruction as a leak
+                leak = (curr_tgt_entry.pc, 'D', curr_tgt_entry.lit, curr_ref_entry.lit)
+                leaky_instructions.append(leak)
+
+            # Move to the next instruction
+            prev_entry = curr_ref_entry
+            entry_id += 1
+
+        return leaky_instructions
+
+    def _update_global_map(self, leakage_map: LeakageMap, leaky_instructions: List[LeakyInstr],
+                           source: str) -> None:
+        """
+        Update the global leakage map with the given address and trace file.
+ """ + for leaky_instr in leaky_instructions: + # Unpack the leaky instruction tuple + per_type_map = leakage_map[leaky_instr[1]] + pc = leaky_instr[0] + reference_lit = leaky_instr[2] + target_lit = leaky_instr[3] + + # If the PC is not in the map, create a new entry + if pc not in per_type_map: + per_type_map[pc] = [] + + # Create a new leakage location and append it to the map + leakage_location = LinesInTracePair(f"{source}:{target_lit}:{reference_lit}") + per_type_map[pc].append(leakage_location) + + +# ================================================================================================== +# Reporting of the analysis results +# ================================================================================================== +class _ReportPrinter: + """ + Class responsible for printing the analysis results to a report file. + """ + + def __init__(self, target_binary: str, config: Config) -> None: + self._config = config + with open(target_binary, "rb") as f: + self._elf_data = ELFFile(f) + self.dwarf_info = self._elf_data.get_dwarf_info() + + def final_report(self, leakage_map: LeakageMap, report_file: str) -> None: + """ Print the global map of leaks to the trace log """ + leakage_line_map = self._group_by_code_line(leakage_map, self._config.report_verbosity) + leakage_line_map = self._filter_allowlist(leakage_line_map) + self._write_report(report_file, leakage_line_map) + + def _write_report(self, report_file: str, leakage_line_map: LeakageLineMap) -> None: + """ + Write the report to the given file in a json format: + { + "seq": { + "I": { + "file:line": { + "0x12345678": ["trace1:10:20", "trace2:15:25"], + ... + }, + ... + }, + "D": { + ... + } + } + } + """ + report_dict = {'seq': leakage_line_map} + with open(report_file, "w") as f: + json.dump(report_dict, f, indent=4, sort_keys=True) + + def _group_by_code_line(self, leakage_map: LeakageMap, + verbosity: ReportVerbosity) -> LeakageLineMap: + """ + Transform a LeakageMap object into a LeakageLineMap object by + grouping all instructions that map to the same line in the source code and filtering + them based on the verbosity level. + + Use DWARF information to get the source code line for each instruction address. + + :param leakage_map: Map of leaks found in the traces, indexed by leak type and PC. + :param verbosity: Amount of information to include in the report + (see Config.report_verbosity for details). + :return: Map of unique leaks, grouped by source code line. 
+ """ + if verbosity == 1: + return self._group_by_code_line_vrb1(leakage_map) + if verbosity == 2: + return self._group_by_code_line_vrb2(leakage_map) + if verbosity == 3: + return self._group_by_code_line_vrb3(leakage_map) + assert_never(verbosity) + + def _group_by_code_line_vrb3(self, leakage_map: LeakageMap) -> LeakageLineMapVrb3: + leakage_line_map: LeakageLineMapVrb3 = {'I': {}, 'D': {}} + for type_ in leakage_map: + per_type_map = leakage_map[type_] + for pc in per_type_map: + # get the source code line for the instruction address + source_code_line = self._decode_addr(pc) + + # create a new entry in the leakage line map if it does not exist + if source_code_line not in leakage_line_map[type_]: + leakage_line_map[type_][source_code_line] = {} + + # create a new entry for the PC if it does not exist + if pc not in leakage_line_map[type_][source_code_line]: + leakage_line_map[type_][source_code_line][pc] = [] + + # append the trace locations to the map + leakage_line_map[type_][source_code_line][pc].extend(per_type_map[pc]) + + return leakage_line_map + + def _group_by_code_line_vrb2(self, leakage_map: LeakageMap) -> LeakageLineMapVrb2: + leakage_line_map: LeakageLineMapVrb2 = {'I': {}, 'D': {}} + for type_ in leakage_map: + per_type_map = leakage_map[type_] + for pc in per_type_map: + # get the source code line for the instruction address + source_code_line = self._decode_addr(pc) + + # create a new entry in the leakage line map if it does not exist + if source_code_line not in leakage_line_map[type_]: + leakage_line_map[type_][source_code_line] = [] + + # append the PC to the map + leakage_line_map[type_][source_code_line].append(pc) + return leakage_line_map + + def _group_by_code_line_vrb1(self, leakage_map: LeakageMap) -> LeakageLineMapVrb1: + leakage_line_map: LeakageLineMapVrb1 = {'I': [], 'D': []} + for type_ in leakage_map: + per_type_map = leakage_map[type_] + for pc in per_type_map: + # get the source code line for the instruction address + source_code_line = self._decode_addr(pc) + + # append the source code line to the map if it does not exist + if source_code_line not in leakage_line_map[type_]: + leakage_line_map[type_].append(source_code_line) + return leakage_line_map + + def _filter_allowlist(self, leakage_line_map: LeakageLineMap) -> LeakageLineMap: + """ + Filter the leakage line map by the allowlist of source code lines. + The allowlist is a list of source code lines that should be included in the report. + """ + allowlist_file = self._config.report_allowlist + if not allowlist_file: + return leakage_line_map + + # Read the allowlist file and create a set of allowed source code lines + with open(allowlist_file, "r") as f: + allowlist_lines = {line.strip() for line in f if line.strip()} + + # Filter the leakage line map by the allowlist + filtered_leakage_line_map: LeakageLineMap = deepcopy(leakage_line_map) + for type_ in leakage_line_map: + per_type_map = leakage_line_map[type_] + for code_line in per_type_map: + if code_line in allowlist_lines: + filtered_per_type_map = filtered_leakage_line_map[type_] + if isinstance(filtered_per_type_map, list): # Verbosity 1 + filtered_per_type_map.remove(code_line) + continue + if isinstance(filtered_per_type_map, dict): # Verbosity 2 or 3 + filtered_per_type_map.pop(code_line) + + return filtered_leakage_line_map + + def _decode_addr(self, address: int) -> CodeLine: + # Go over all the line programs in the DWARF information, looking for + # one that describes the given address. 
+        for CU in self.dwarf_info.iter_CUs():
+            # First, look at line programs to find the file/line for the address
+            line = self.dwarf_info.line_program_for_CU(CU)
+            if not line:
+                continue
+            delta = 1 if line.header.version < 5 else 0
+            prevstate = None
+            for entry in line.get_entries():
+                # We're interested in those entries where a new state is assigned
+                if entry.state is None:
+                    continue
+                # Looking for a range of addresses in two consecutive states that
+                # contain the required address.
+                if prevstate and int(prevstate.address) <= address < int(entry.state.address):
+                    filename = line['file_entry'][prevstate.file - delta].name.decode()
+                    line = prevstate.line
+                    return CodeLine(f"{filename}:{line}")
+                if entry.state.end_sequence:
+                    # For the state with `end_sequence`, `address` means the address
+                    # of the first byte after the target machine instruction
+                    # sequence and other information is meaningless. We clear
+                    # prevstate so that it's not used in the next iteration. Address
+                    # info is used in the above comparison to see if we need to use
+                    # the line information for the prevstate.
+                    prevstate = None
+                else:
+                    prevstate = entry.state
+        return CodeLine("undefined:0")
+
+
+# ==================================================================================================
+# Coverage
+# ==================================================================================================
+def _build_cov_report(config: Config, target_binary: str) -> None:
+    """
+    Simple function to invoke llvm-profdata to merge all collected coverage files
+    and then invoke llvm-cov to generate a coverage report.
+    """
+    stage2_wd = config.stage2_wd
+    stage3_wd = config.stage3_wd
+    profdata = config.llvm_profdata_cmd
+    cov = config.llvm_cov_cmd
+
+    # merge all reports
+    profdata_cmd = f"{profdata} merge -sparse {stage2_wd}/*/*.profraw " \
+                   f"-o {os.path.join(stage3_wd, 'merged.profdata')}"
+    run(profdata_cmd, check=True, shell=True)
+
+    # generate the coverage report (txt)
+    cov_cmd = f"{cov} report {target_binary} -instr-profile " \
+              f"{os.path.join(stage3_wd, 'merged.profdata')} > " \
+              f"{os.path.join(stage3_wd, 'coverage_report.txt')}"
+    run(cov_cmd, check=True, shell=True)
+
+    # generate another, more detailed coverage report (html)
+    cov_html_cmd = f"{cov} show {target_binary} -instr-profile " \
+                   f"{os.path.join(stage3_wd, 'merged.profdata')} " \
+                   f"-format html -output-dir {os.path.join(stage3_wd, 'cov_html')}"
+    run(cov_html_cmd, check=True, shell=True)
+
+
+# ==================================================================================================
+# Public interface to the analysis and reporting module
+# ==================================================================================================
+class Reporter:
+    """
+    Class responsible for processing the collected contract traces, detecting leaks exposed in them,
+    and building a final report with the results of the analysis.
+    """
+    _leakage_map: Optional[LeakageMap] = None
+
+    def __init__(self, config: Config) -> None:
+        self._config = config
+
+    def analyze(self) -> None:
+        """
+        Analyze the results of the fuzzing campaign and identify the uncovered
+        leaks in the target binary.
+        """
+        analyser = _Analyser()
+        self._leakage_map = analyser.build_leakage_map(self._config.stage2_wd)
+
+    def generate_report(self, target_binary: str) -> None:
+        """
+        Generate a report of the analysis.
+        """
+        assert self._leakage_map is not None, "No leakage map found. Did you run analyze()?"
diff --git a/consfuzz/sec_gen.py b/consfuzz/sec_gen.py
new file mode 100644
index 000000000..8b9076820
--- /dev/null
+++ b/consfuzz/sec_gen.py
@@ -0,0 +1,103 @@
+"""
+File: Module responsible for generation of secret (private) inputs for the target binary.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Final
+
+import os
+
+if TYPE_CHECKING:
+    from .config import Config
+
+CONF_SIZE: Final[int] = 0x10  # Size of the config data in bytes
+
+
+class SecGen:
+    """
+    Class responsible for generating secret (private) inputs for the target binary.
+    """
+
+    def __init__(self, config: Config) -> None:
+        self._config = config
+
+    def _generate_from_reference(self, wd: str, reference_input: str, num_sec_inputs: int) -> int:
+        """
+        Given a reference input, generate more inputs that will contain the same public data,
+        but the secret (private) data will be randomly generated
+        (though the size of the secret data will be the same).
+
+        The input file contains three sections: config data, private data, and public data.
+        * The config data is always 16 bytes long, and it is copied from the reference input.
+          The first byte of the config data is the fraction (in 1/256 units) of the remaining
+          data that is private, and it thus determines the layout of the remaining data.
+          E.g., if the value of the first config byte is 1 and the file size is 1040 bytes, then
+            data_size = 1040 - 16 = 1024 bytes, which is split into public and private data
+            priv_size = (1 / 256) * 1024 = 4 bytes, and
+            pub_size = (255 / 256) * 1024 = 1020 bytes.
+        * The private data has size priv_size. This is the region that will be randomized.
+        * The public data has size pub_size. This region will be copied from the
+          reference input.
+        :param wd: Working directory where the generated inputs will be stored
+        :param reference_input: Path to the reference input file
+        :param num_sec_inputs: Number of secret (private) inputs to generate
+        :return: 0 if successful, 1 if the reference input is invalid or an error occurs
+        """
+        # Read the reference input to determine the sizes of public and private data
+        with open(reference_input, 'rb') as f:
+            ref_data = f.read()
+
+        if len(ref_data) < CONF_SIZE + 2:  # Public and private data must be present
+            return 1
+
+        data_size = len(ref_data) - CONF_SIZE
+        priv_size = (ref_data[0] * data_size) // 256
+        pub_size = data_size - priv_size
+        if len(ref_data) < (CONF_SIZE + pub_size):
+            return 1
+
+        # Copy the reference input to the working directory
+        new_ref_name = os.path.join(wd, "000.bin")
+        with open(new_ref_name, 'wb') as dest_file:
+            dest_file.write(ref_data)
+
+        # Generate the secret inputs, preserving the config|private|public layout of the
+        # reference input (000.bin is the reference itself, so we generate
+        # num_sec_inputs - 1 new files)
+        config_data = ref_data[:CONF_SIZE]
+        pub_data = ref_data[CONF_SIZE + priv_size:CONF_SIZE + priv_size + pub_size]
+        for i in range(1, num_sec_inputs):
+            priv_data = os.urandom(priv_size)
+            dest_path = os.path.join(wd, f"{i:03}.bin")
+            with open(dest_path, 'wb') as dest_file:
+                dest_file.write(config_data + priv_data + pub_data)
+
+        return 0
+
+    def generate(self, num_sec_inputs: int) -> int:
+        """
+        Generate public-equivalent inputs for each reference input produced in Stage 1 by AFL++.
+        The inputs will contain the same public data, but the secret (private) data will be
+        randomly generated (though the size of the secret data will be the same).
+        The inputs will be stored in the stage 2 working directory.
+        :param num_sec_inputs: Number of secret (private) inputs to generate for each reference
+        :return: 0 if successful, 1 if an error occurs
+        :raises FileNotFoundError: If the AFL++ working directory does not exist
+        :raises OSError: If there is an error creating directories or files
+        """
+        afl_dir = self._config.stage1_wd + "/default/queue/"
+        ref_inputs = [f for f in os.listdir(afl_dir) if os.path.isfile(os.path.join(afl_dir, f))]
+        for ref_input in ref_inputs:
+            # Create a directory for each reference input
+            ref_input_path = os.path.join(afl_dir, ref_input)
+            dest_dir = os.path.join(self._config.stage2_wd, ref_input)
+            os.makedirs(dest_dir, exist_ok=True)
+
+            # Try generating more public-equivalent inputs from the reference input
+            if self._generate_from_reference(dest_dir, ref_input_path, num_sec_inputs) == 0:
+                continue
+
+            # If we failed, remove the (still empty) directory
+            os.rmdir(dest_dir)
+
+        return 0
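To make the layout described above concrete, the following sketch hand-builds a well-formed 1040-byte input. All values are illustrative; real reference inputs are produced by AFL++ in Stage 1.

```python
import os

CONF_SIZE = 0x10
ratio = 1                                # first config byte: ratio/256 of the data is private
data_size = 1024
priv_size = (ratio * data_size) // 256   # 4 bytes
pub_size = data_size - priv_size         # 1020 bytes

config = bytes([ratio]) + b"\x00" * (CONF_SIZE - 1)
private = os.urandom(priv_size)
public = b"A" * pub_size

# Layout: config | private | public
with open("example.bin", "wb") as f:
    f.write(config + private + public)
```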
diff --git a/consfuzz/tracer.py b/consfuzz/tracer.py
new file mode 100644
index 000000000..77cc2b03a
--- /dev/null
+++ b/consfuzz/tracer.py
@@ -0,0 +1,211 @@
+"""
+File: Module responsible for collecting contract traces
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, List, Final
+
+import os
+import subprocess
+from enum import Enum
+from tqdm import tqdm
+
+from rvzr.model_dynamorio.trace_decoder import TraceDecoder
+from .logger import Logger
+
+if TYPE_CHECKING:
+    from .config import Config
+
+
+class _ExecOutcome(Enum):
+    """
+    Outcome of an execution of the tracer. The program can either exit without errors, throw
+    an (architectural) exception, or hit an unexpected failure in the instrumentation.
+    """
+    SUCCESS = 0
+    PROGRAM_EXCEPTION = 1
+    INSTR_EXCEPTION = 2
+    COV_EXCEPTION = 3
+    UNKNOWN = 4
+
+
+class Tracer:
+    """
+    Class responsible for executing the target binary on the leakage model and retrieving the
+    collected contract traces.
+    """
+
+    _drrun_cmd: Final[str]
+    _log: Final[Logger]
+
+    def __init__(self, config: Config) -> None:
+        self._log = Logger("Tracer")
+
+        self._config = config
+        self._drrun_cmd = f"{config.model_root}/drrun " \
+                          f"-c {config.model_root}/libdr_model.so " \
+                          f"--tracer {config.contract_observation_clause} " \
+                          f"--speculator {config.contract_execution_clause} " \
+                          "--instrumented-func wrapper --trace-output {trace_file} -- {cmd}"
+        self._coverage_cmd = "LLVM_PROFILE_FILE={cov_file} {cmd}"
+
+    def collect_traces(self, cmd: List[str]) -> int:
+        """
+        Iterate over all previously-generated public-private input pairs and collect contract
+        traces for each pair.
+
+        :param cmd: Command to run the target binary, with placeholders for public (@@)
+                    and private (@#) inputs
+        :return: 0 if successful, 1 if an error occurs
+        """
+        # Check if the traces are deterministic; abort if they are not
+        if not self._check_determinism(self._config.stage2_wd, cmd):
+            self._log.error("The target binary produces non-deterministic traces. "
+                            "Tracing aborted.")
+            return 1
+
+        # Get a list of input groups
+        input_group_dirs = []
+        for input_group in os.listdir(self._config.stage2_wd):
+            input_group_dir = os.path.join(self._config.stage2_wd, input_group)
+            if not os.path.isdir(input_group_dir):
+                continue
+            input_group_dirs.append(input_group_dir)
+
+        # Iterate over all input groups and collect traces
+        inputs: List[str] = []
+        for input_group_dir in input_group_dirs:
+            # Get a list of all inputs
+            for input_name in os.listdir(input_group_dir):
+                if ".bin" not in input_name:
+                    continue
+                if "log" in input_name or "trace" in input_name or "failed" in input_name:
+                    continue
+                input_path = os.path.join(input_group_dir, input_name)
+                inputs.append(input_path)
+
+        # Initialize a progress bar to track the progress of the tracing process
+        progress_bar = tqdm(total=len(inputs))
+
+        # Process each input
+        for input_ in inputs:
+            # Expand the command with the input file
+            expanded_cmd = self._expand_target_cmd(cmd, input_)
+
+            # Execute the target binary and collect traces
+            outcome = self._execute(expanded_cmd, input_, self._config.coverage)
+            if outcome == _ExecOutcome.INSTR_EXCEPTION:
+                # The trace is corrupted, i.e., the error was produced by a bug in the
+                # instrumentation rather than by the target program.
+                # NOTE: we intentionally ignore errors in the target program, as many files
+                # generated by AFL++ are invalid, which leads to errors during execution; this
+                # is expected and does not affect the correctness of the fuzzing process.
+                # Mark this test as failed by creating an empty .failed file
+                with open(f"{input_}.failed", "w"):
+                    pass
+
+            progress_bar.update()
+
+        progress_bar.close()
+        return 0
+
+    def _expand_target_cmd(self, cmd: List[str], input_: str) -> str:
+        """
+        Replace the input placeholder (@@) in the command with the actual input file and
+        join the command into a single string.
+        """
+        expanded_cmd = [s if s != "@@" else input_ for s in cmd]
+        return " ".join(expanded_cmd)
+
+    def _execute(self, expanded_str: str, input_name: str, enable_cov: bool) -> _ExecOutcome:
+        """
+        Execute the target binary on the leakage model with the given public and private inputs.
+
+        If `enable_cov` is True, the command will also collect coverage information.
+
+        :param expanded_str: Command to run the target binary, with public and private inputs
+        :param input_name: Base name for the output files (trace and log)
+        :param enable_cov: Whether to collect coverage information
+        :return: The outcome of the execution (either success or the reason for failure)
+        """
+        # NOTE: removesuffix, not strip - str.strip would remove characters, not the suffix
+        base = input_name.removesuffix(".bin")
+        trace_file = f"{base}.trace"
+        log_file = f"{base}.log"
+
+        complete_cmd = self._drrun_cmd.format(cmd=expanded_str, trace_file=trace_file)
+        try:
+            with open(log_file, "a") as f:
+                f.write("$> " + complete_cmd + "\n")
+                subprocess.check_call(complete_cmd, shell=True, stdout=f, stderr=f)
+        except subprocess.CalledProcessError:
+            if TraceDecoder().is_trace_corrupted(trace_file):
+                return _ExecOutcome.INSTR_EXCEPTION
+            return _ExecOutcome.PROGRAM_EXCEPTION
+
+        if not enable_cov:
+            return _ExecOutcome.SUCCESS
+
+        # If coverage is enabled, run the command again with coverage collection
+        cov_file = f"{base}.profraw"
+        coverage_cmd = self._coverage_cmd.format(cov_file=cov_file, cmd=expanded_str)
+        try:
+            subprocess.check_call(
+                coverage_cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except subprocess.CalledProcessError:
+            self._log.error(f"Error executing coverage command: {coverage_cmd}")
+            return _ExecOutcome.COV_EXCEPTION
+        return _ExecOutcome.SUCCESS
+
+    def _check_determinism(self, wd: str, cmd: List[str]) -> bool:
+        """
+        Check if the traces are deterministic by running the target binary multiple times
+        with the same inputs and comparing the outputs.
+        :param wd: Working directory containing the input pairs
+        :param cmd: Command to run the target binary, with placeholders for public (@@)
+                    and private (@#) inputs
+        :return: True if the traces are deterministic, False otherwise
+        :raise: AssertionError if no valid inputs are found
+        """
+        # find an arbitrary input in the working directory that does not produce an error
+        # and construct a command to run it
+        found: bool = False
+        expanded_cmd = ""
+        input_group_dir = ""
+        for input_group in os.listdir(wd):
+            input_group_dir = os.path.join(wd, input_group)
+            if not os.path.isdir(input_group_dir):
+                continue
+
+            # check if the directory contains a 000.bin file
+            ref_input = os.path.join(input_group_dir, "000.bin")
+            if not os.path.isfile(ref_input):
+                continue
+
+            # try running the target binary with the reference input
+            expanded_cmd = self._expand_target_cmd(cmd, ref_input)
+            err = self._execute(expanded_cmd, ref_input, False)
+            if err in (_ExecOutcome.INSTR_EXCEPTION, _ExecOutcome.PROGRAM_EXCEPTION):
+                # if the target binary throws an exception, skip this input group
+                continue
+            found = True
+            break
+        if not found:
+            raise AssertionError("No valid inputs found in the working directory; aborting")
+
+        # execute the target binary twice and collect traces
+        for i in (0, 1):
+            pair_name = os.path.join(input_group_dir, f"determinism_check_{i}")
+            err = self._execute(expanded_cmd, pair_name, False)
+            assert err not in (_ExecOutcome.INSTR_EXCEPTION, _ExecOutcome.PROGRAM_EXCEPTION)
+
+        # compare the traces
+        with open(os.path.join(input_group_dir, "determinism_check_0.trace"), "rb") as f0, \
+                open(os.path.join(input_group_dir, "determinism_check_1.trace"), "rb") as f1:
+            trace_0_content = f0.read()
+            trace_1_content = f1.read()
+            if trace_0_content != trace_1_content:
+                return False
+
+        return True
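To illustrate how `Tracer` assembles a single invocation, here is a rough sketch of the template expansion. The contract clauses and paths are placeholders, not values from a real config:

```python
drrun_template = ("{root}/drrun -c {root}/libdr_model.so "
                  "--tracer {obs} --speculator {spec} "
                  "--instrumented-func wrapper --trace-output {trace_file} -- {cmd}")

# What _expand_target_cmd does: substitute the public-input placeholder and join
cmd = ["/usr/bin/openssl", "enc", "-e", "-aes256", "-out", "enc.bin", "-in", "@@"]
expanded = " ".join(s if s != "@@" else "stage2/ref0/001.bin" for s in cmd)

print(drrun_template.format(root="rvzr/model_dynamorio/backend",
                            obs="<observation_clause>", spec="<execution_clause>",
                            trace_file="stage2/ref0/001.trace", cmd=expanded))
```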
diff --git a/consfuzz/triage/__init__.py b/consfuzz/triage/__init__.py
new file mode 100644
index 000000000..6073f240f
--- /dev/null
+++ b/consfuzz/triage/__init__.py
@@ -0,0 +1,10 @@
+# flake8: noqa
+# pylint: skip-file
+
+import os
+
+def get_plugin_path() -> str:
+    """
+    Return the path of the gdb plugin
+    """
+    return os.path.abspath(os.path.dirname(__file__))
diff --git a/consfuzz/triage/config.py b/consfuzz/triage/config.py
new file mode 100644
index 000000000..e461f6477
--- /dev/null
+++ b/consfuzz/triage/config.py
@@ -0,0 +1,52 @@
+"""
+File: Configuration part specific to the inspector.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+
+from typing import Dict, List, Optional, Tuple, Any
+
+class LeakageInspectorConfig:
+    """
+    Configuration part specific to the inspector.
+    """
+    declassified: List[str]
+    known_syms: Dict[str, List[int]]  # symbol name -> [start_address, size]
+    key: List[str]
+    dont_follow: List[str]
+    follow_mem_uses: bool
+
+    def __init__(self):
+        self.declassified = []
+        self.known_syms = {}
+        self.key = []
+        # NOTE: These are used as size registers for AVX instructions. Since they are not
+        # logged, they create a lot of noise. We decided to silence them, although potentially
+        # these could also be a source of differential use-def differences.
+        self.dont_follow = ["K0", "K1", "K2", "K3", "K4", "K5", "K6", "K7"]
+        # If the addresses are different, of course the values can differ, hence we stop the
+        # analysis there.
+        self.follow_mem_uses = False
+
+    def parse(self, yaml_data: Dict[str, Any]) -> None:
+        """
+        Parse the values from a dictionary.
+        """
+        self.declassified = yaml_data.get("declassified", self.declassified)
+        self.known_syms = yaml_data.get("known_syms", self.known_syms)
+        self.key = yaml_data.get("key", self.key)
+        self.dont_follow = yaml_data.get("dont_follow", self.dont_follow)
+        self.follow_mem_uses = yaml_data.get("follow_mem_uses", self.follow_mem_uses)
+
+    def get_sym_annotation(self, address: int) -> Optional[Tuple[str, int]]:
+        """
+        If the address is in the range of a known symbol (defined in the config), get the
+        symbol name and offset.
+        """
+        for sym_name, sym_address in self.known_syms.items():
+            start = sym_address[0]
+            size = sym_address[1]
+            if start <= address < start + size:
+                return sym_name, address - start
+        return None
diff --git a/consfuzz/triage/plugin.py b/consfuzz/triage/plugin.py
new file mode 100644
index 000000000..2fee82617
--- /dev/null
+++ b/consfuzz/triage/plugin.py
@@ -0,0 +1,492 @@
+"""
+File: GDB plugin that can be used to navigate traces in GDB.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+
+import gdb
+
+# FIXME!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+import sys
+sys.path.append("/home/alvise/venv-revizor-2/lib/python3.12/site-packages")
+sys.path.append(".")
+sys.path.append("..")
+
+from typing import Any, List, Optional
+from rvzr.model_dynamorio.trace_decoder import TraceDecoder, DebugTraceEntryType
+
+# ------------------------------------------------------------------------------
+# Trace Helpers
+# ------------------------------------------------------------------------------
+
+_glob_trace = None
+_cached_lines = {}
+
+_glob_cur_line = None
+_glob_cur_spec_level = None
+_glob_cur_spec_context = None
+
+class SpecWinInfo:
+    """
+    Store relevant information about a specific speculative window.
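+    A window is identified by the first speculatively-executed instruction
+    (first_pc/first_line; these are None for the architectural part of the trace) and
+    by the last instruction reached (target_pc/target_line); target_count records how
+    many times target_pc was hit, so that breakpoints can be ignored the right number
+    of times when replaying the window.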
+ """ + first_pc: Optional[int] + first_line: Optional[int] + target_pc: int + target_line: int + target_count: int + nesting: int + + def __init__(self, first_pc: Optional[int], first_line: Optional[int], target_pc: int, target_line: int, target_count: int, nesting: int): + self.first_pc = first_pc + self.target_pc = target_pc + self.target_count = target_count + self.nesting = nesting + self.first_line = first_line + self.target_line = target_line + + def __str__(self): + prefix = " " * self.nesting + if self.first_pc is None: + return f"{prefix} └─ last architectural: {hex(self.target_pc)} (#{self.target_count}) [line: {self.target_line}]" + + # Check if we're at the beginning of this window + global _glob_cur_line + if _glob_cur_line == self.first_line: + return f"{prefix} └─ last: {hex(self.first_pc)} [line: {self.first_line}]\n" + else: + s = f"{prefix} ├─ speculated to: {hex(self.first_pc)} [line: {self.first_line}]\n" + s += f"{prefix} └─ last: {hex(self.target_pc)} (#{self.target_count}) [line: {self.target_line}]" + return s + + +def _is_arch(entry: Any) -> bool: + """ + Is the entry architectural (debug trace entries only) + """ + return entry.nesting_level == 0 + +def _is_spec(entry: Any) -> bool: + """ + Is the entry speculative (debug trace entries only) + """ + return entry.nesting_level != 0 + +def _build_spec_info(line): + """ + Return a list of relevant information for each (nested) speculation window that leads to the + instruction at line `line` in the trace. + """ + spec_windows: list[SpecWinInfo] = [] + + print(f" • Analyzing debug trace from line {line}...", flush=True) + cur_idx = line + prev_nesting = None + + while cur_idx > 0: + # Visit trace in reverse order + entry = _glob_trace[cur_idx] + + if DebugTraceEntryType(entry.type) == DebugTraceEntryType.ENTRY_REG_DUMP: + # Found new instruction + cur_pc = entry.regs.pc + cur_nesting = entry.nesting_level + if len(spec_windows) > 0: + prev_nesting = spec_windows[-1].nesting + + if prev_nesting is None or cur_nesting < prev_nesting: + # Found start of new speculation window + start_spec_pc = cur_pc if _is_spec(entry) else None + start_spec_line = cur_idx if _is_spec(entry) else None + spec_windows.append(SpecWinInfo(start_spec_pc, start_spec_line, cur_pc, cur_idx, 1, cur_nesting)) + elif cur_nesting == prev_nesting: + # Update the current speculation window + if spec_windows[-1].first_pc is not None: + spec_windows[-1].first_pc = cur_pc + spec_windows[-1].first_line = cur_idx + if cur_pc == spec_windows[-1].target_pc: + spec_windows[-1].target_count += 1 + + cur_idx -= 1 + + return spec_windows + +def _get_line_info(line: int): + """ + Return the relevant information for a given line (take from the cache if already computed). 
+ """ + if line in _cached_lines.keys(): + spec_windows = _cached_lines[line] + else: + spec_windows = _build_spec_info(line) + _cached_lines[line] = spec_windows + + return spec_windows + +# ------------------------------------------------------------------------------ +# GDB Helpers +# ------------------------------------------------------------------------------ + +class Printing: + """ + Customize the amount of information printed when running commands + """ + @staticmethod + def setup(): + gdb.execute("set print frame-arguments presence") + + @staticmethod + def restore(): + gdb.execute("set print frame-arguments all") + +class BpManager: + """ + Breakpoints management + """ + @staticmethod + def add(bp: str | int) -> int: + if isinstance(bp, int): + result = gdb.execute(f"b *{hex(bp)}", to_string=True) + else: + result = gdb.execute(f"b {bp}", to_string=True) + + if result.startswith("Breakpoint"): + bp_num = int(result.split(" ")[1]) + else: + print(result) + raise ValueError("Cannot add breakpoint") + + return bp_num + + @staticmethod + def delete(n: int): + gdb.execute(f"dis {n}") + gdb.execute(f"del {n}") + + @staticmethod + def ignore(bp: int, count: int): + gdb.execute(f"ignore {bp} {count}") + + +class GdbExec: + @staticmethod + def run(): + gdb.execute("run", to_string=True) + + @staticmethod + def cont(): + gdb.execute("continue", to_string=True) + + @staticmethod + def backtrace(): + gdb.execute("bt") + + @staticmethod + def jump(target_pc: int): + gdb.execute(f"jump *{hex(target_pc)}", to_string=True) + +# ------------------------------------------------------------------------------ +# GDB Commands +# ------------------------------------------------------------------------------ +class SpecPrefixCommand (gdb.Command): + "Spec command." + + def __init__ (self): + super (SpecPrefixCommand, self).__init__ ("spec", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_NONE, prefix=True) + +SpecPrefixCommand() + + +class SpecSourceCommand (gdb.Command): + """ + Load a debug trace generate by consfuzz. + """ + def __init__ (self): + super (SpecSourceCommand, self).__init__ ("spec source", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + self._decoder = TraceDecoder() + + def invoke (self, arg, from_tty): + # Parse the debug trace + print(f" • Decoding debug trace... {arg}", flush=True) + + _, dbg_traces = self._decoder.decode_trace_file(arg) + if len(dbg_traces) != 1: + print(" • Error! 
Not a debug trace.") + return + global _glob_trace + _glob_trace = dbg_traces[0] + + print(" • Done!") + +SpecSourceCommand () + + +def spec_goto(context: List[SpecWinInfo], depth: int = 0, stop_at_first: bool = False): + Printing.setup() + + # Restart the program + bp = BpManager.add("wrapper") + GdbExec.run() + BpManager.delete(bp) + + # Simulate all windows + relevant_context = context[depth:] + for cur_n_win, win in enumerate(reversed(relevant_context)): + # Check if we need to open a speculation window + if win.first_pc is not None: + # Break on the first speculative instruction + bp = BpManager.add(win.first_pc) + # Jump to the first speculative instruction (stops at previous breakpoint) + GdbExec.jump(win.first_pc) + # Disable entrypoint (in case it's visited again in the rest of the trace) + BpManager.delete(bp) + + # Check if we need to stop at the first instruction of the current window + if stop_at_first: + if cur_n_win == len(relevant_context) -1: + break + + # Check if this window has only one instruction + if win.first_line != win.target_line: + # Break on last instruction of the window + bp = BpManager.add(win.target_pc) + # Continue until the right number of occurrences have been found + BpManager.ignore(bp, win.target_count - 1) + GdbExec.cont() + # Disable breakpoint (in case we encounter this instruction again later in the trace) + BpManager.delete(bp) + # Print Backtrace + # print(f"------------------- Reached {hex(win.target_pc)} ----------------------") + # GdbExec.backtrace() + + Printing.restore() + + +class SpecGotoCommand (gdb.Command): + """ + Execute the program until we reach the instruction corresponding to the given line. + """ + def __init__ (self): + super (SpecGotoCommand, self).__init__ ("spec goto", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + + def invoke (self, arg, from_tty): + # Check validity of state and inputs + if _glob_trace is None: + print ("Error: no trace is sourced") + return + try: + arg = int(arg) + except: + print ("Error: command expects a line number") + return + if arg > len(_glob_trace): + print ("Error: invalid line for current trace") + return + + global _glob_cur_line + _glob_cur_line = arg + global _glob_cur_spec_context + _glob_cur_spec_context = _get_line_info(arg) + global _glob_cur_spec_level + _glob_cur_spec_level = 0 + + spec_goto(_glob_cur_spec_context) + +SpecGotoCommand () + + +class SpecBtCommand (gdb.Command): + """ + Get information about speculation backtrace for the currently selected line. + """ + def __init__ (self): + super (SpecBtCommand, self).__init__ ("spec bt", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + + def invoke (self, arg, from_tty): + global _glob_cur_spec_context + global _glob_cur_spec_level + + # Check validity of state and inputs + if _glob_cur_spec_context is None or _glob_cur_spec_level is None: + print ("Error: no line selected - use 'spec goto' to select one") + return + + # Print spec info + for win in reversed(_glob_cur_spec_context[_glob_cur_spec_level:]): + print(win) + +SpecBtCommand () + + +class SpecUpCommand (gdb.Command): + """ + Goto "up" int the speculative backtrace, i.e. to the last instruction before speculation. 
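+# Typical triage flow (illustrative): after loading a trace with `spec source` and
+# selecting a line with `spec goto`, use `spec bt` to list the nested speculation
+# windows, `spec up`/`spec down` to move between windows, and `spec prev`/`spec next`
+# to step within the current window.
+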
+ """ + def __init__ (self): + super (SpecUpCommand, self).__init__ ("spec up", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + + def invoke (self, arg, from_tty): + global _glob_cur_line + global _glob_cur_spec_context + global _glob_cur_spec_level + + # Check validity of state and inputs + if _glob_cur_line is None or _glob_cur_spec_context is None or _glob_cur_spec_level is None: + print ("Error: no line selected - use 'spec goto' to select one") + return + + # Check at what point of the window we are + cur_win = _glob_cur_spec_context[_glob_cur_spec_level] + is_win_start = (_glob_cur_line == cur_win.first_line) + + if is_win_start: + # Goto end of previous window + if _glob_cur_spec_level >= len(_glob_cur_spec_context) - 1: + print ("Already at top level") + return + + _glob_cur_spec_level += 1 + _glob_cur_line = _glob_cur_spec_context[_glob_cur_spec_level].target_line + spec_goto(_glob_cur_spec_context, _glob_cur_spec_level) + else: + # Goto start of the current window + cur_line = _glob_cur_spec_context[_glob_cur_spec_level].first_line + if cur_line == None: + print ("Hit start of speculation!") + return + + _glob_cur_line = cur_line + spec_goto(_glob_cur_spec_context, _glob_cur_spec_level, stop_at_first=True) + + +SpecUpCommand() + + +class SpecDownCommand (gdb.Command): + """ + Goto "down" int the speculative backtrace, i.e. to the last instruction of the next window. + """ + def __init__ (self): + super (SpecDownCommand, self).__init__ ("spec down", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + + def invoke (self, arg, from_tty): + global _glob_cur_line + global _glob_cur_spec_context + global _glob_cur_spec_level + + # Check validity of state and inputs + if _glob_cur_line is None or _glob_cur_spec_context is None or _glob_cur_spec_level is None: + print ("Error: no line selected - use 'spec goto' to select one") + return + + # Check at what point of the window we are + cur_win = _glob_cur_spec_context[_glob_cur_spec_level] + is_win_end = (_glob_cur_line == cur_win.target_line) + + if is_win_end: + # Goto start of next window + if _glob_cur_spec_level == 0: + print ("Already at bottom level") + return + + _glob_cur_spec_level -= 1 + _glob_cur_line = _glob_cur_spec_context[_glob_cur_spec_level].first_line + spec_goto(_glob_cur_spec_context, _glob_cur_spec_level, stop_at_first=True) + else: + # Goto end of the current window + _glob_cur_line = _glob_cur_spec_context[_glob_cur_spec_level].target_line + spec_goto(_glob_cur_spec_context, _glob_cur_spec_level) + + +SpecDownCommand() + + +class SpecPrevCommand (gdb.Command): + """ + Goto the previous instruction in the trace that belongs to the same speculation window + """ + def __init__ (self): + super (SpecPrevCommand, self).__init__ ("spec prev", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + + def invoke (self, arg, from_tty): + global _glob_cur_line + global _glob_cur_spec_context + global _glob_cur_spec_level + + while True: + if _glob_cur_line == 0: + print("Already at first instruction of the trace!") + return + if _glob_cur_line == _glob_cur_spec_context[_glob_cur_spec_level].first_line: + print("Already at first instruction of the current window!") + print("Use 'spec up' to go to the previous window") + return + # Get cur nesting level + cur_nesting = _glob_trace[_glob_cur_line].nesting_level + # Get previous instruction + _glob_cur_line -= 1 + entry = _glob_trace[_glob_cur_line] + if DebugTraceEntryType(entry.type) == DebugTraceEntryType.ENTRY_REG_DUMP: + if entry.nesting_level > cur_nesting: + # Ignore unrelated 
windows + continue + # Found new instruction + tmp_context = _get_line_info(_glob_cur_line) + spec_goto(tmp_context) + break + +SpecPrevCommand() + + +class SpecNextCommand (gdb.Command): + """ + Goto the next instruction in the trace that belongs to the same speculation window + """ + def __init__ (self): + super (SpecNextCommand, self).__init__ ("spec next", + gdb.COMMAND_SUPPORT, + gdb.COMPLETE_FILENAME) + + def invoke (self, arg, from_tty): + global _glob_cur_line + global _glob_cur_spec_context + global _glob_cur_spec_level + + while True: + if _glob_cur_line == len(_glob_trace) - 1: + print("Already at last instruction of the trace!") + return + if _glob_cur_line == _glob_cur_spec_context[_glob_cur_spec_level].target_line: + print("Already at last instruction of the current window!") + print("Use 'spec down' to enter to the next window") + return + # Get cur nesting level + cur_nesting = _glob_trace[_glob_cur_line].nesting_level + # Get next instruction + _glob_cur_line += 1 + entry = _glob_trace[_glob_cur_line] + if DebugTraceEntryType(entry.type) == DebugTraceEntryType.ENTRY_REG_DUMP: + if entry.nesting_level > cur_nesting: + # Ignore unrelated windows + continue + # Found new instruction + tmp_context = _get_line_info(_glob_cur_line) + spec_goto(tmp_context) + break + +SpecNextCommand() diff --git a/consfuzz/triage/regs.py b/consfuzz/triage/regs.py new file mode 100644 index 000000000..6e05b0800 --- /dev/null +++ b/consfuzz/triage/regs.py @@ -0,0 +1,320 @@ +""" +File: Register ID to/from Register Name mappings for DynamoRIO. + +Copyright (C) Microsoft Corporation +SPDX-License-Identifier: MIT +""" + +from typing import Dict, Final, List +from .shared_types import * + + +# Map regiter names back to registers. +REVERSE_REGS: Dict[RegName, RegId] = {} + +# Map register id to register names, taken from the DynamoRIO headers. +REGS: List[RegName] = [] + +# Register used for AVX instruction size +OPMASKS: Final[List[RegName]] = ["K0", + "K1", + "K2", + "K3", + "K4", + "K5", + "K6", + "K7"] + +def init_reg_map(): + """ + Initialize the value of the register maps. 
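+    NOTE: the order of the names below must match the register ids used by the
+    DynamoRIO backend, since REVERSE_REGS is built by enumerating this list.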
+ """ + global REGS + global REVERSE_REGS + + REGS.extend([ "NULL", + "RAX", + "RCX", + "RDX", + "RBX", + "RSP", + "RBP", + "RSI", + "RDI", + "R8", + "R9", + "R10", + "R11", + "R12", + "R13", + "R14", + "R15", + "EAX", + "ECX", + "EDX", + "EBX", + "ESP", + "EBP", + "ESI", + "EDI", + "R8D", + "R9D", + "R10D", + "R11D", + "R12D", + "R13D", + "R14D", + "R15D", + "AX", + "CX", + "DX", + "BX", + "SP", + "BP", + "SI", + "DI", + "R8W", + "R9W", + "R10W", + "R11W", + "R12W", + "R13W", + "R14W", + "R15W", + "AL", + "CL", + "DL", + "BL", + "AH", + "CH", + "DH", + "BH", + "R8L", + "R9L", + "R10L", + "R11L", + "R12L", + "R13L", + "R14L", + "R15L", + "SPL", + "BPL", + "SIL", + "DIL", + "MM0", + "MM1", + "MM2", + "MM3", + "MM4", + "MM5", + "MM6", + "MM7", + "XMM0", + "XMM1", + "XMM2", + "XMM3", + "XMM4", + "XMM5", + "XMM6", + "XMM7", + "XMM8", + "XMM9", + "XMM10", + "XMM11", + "XMM12", + "XMM13", + "XMM14", + "XMM15", + "XMM16", + "XMM17", + "XMM18", + "XMM19", + "XMM20", + "XMM21", + "XMM22", + "XMM23", + "XMM24", + "XMM25", + "XMM26", + "XMM27", + "XMM28", + "XMM29", + "XMM30", + "XMM31", + ]) + REGS.extend(["RESERVED_XMM"]*32) + REGS.extend([ + "ST0", + "ST1", + "ST2", + "ST3", + "ST4", + "ST5", + "ST6", + "ST7", + "DR_SEG_ES", + "DR_SEG_CS", + "DR_SEG_SS", + "DR_SEG_DS", + "DR_SEG_FS", + "DR_SEG_GS", + "DR0", + "DR1", + "DR2", + "DR3", + "DR4", + "DR5", + "DR6", + "DR7", + "DR8", + "DR9", + "DR10", + "DR11", + "DR12", + "DR13", + "DR14", + "DR15", + "CR0", + "CR1", + "CR2", + "CR3", + "CR4", + "CR5", + "CR6", + "CR7", + "CR8", + "CR9", + "CR10", + "CR11", + "CR12", + "CR13", + "CR14", + "CR15", + "INVALID", + "YMM0", + "YMM1", + "YMM2", + "YMM3", + "YMM4", + "YMM5", + "YMM6", + "YMM7", + "YMM8", + "YMM9", + "YMM10", + "YMM11", + "YMM12", + "YMM13", + "YMM14", + "YMM15", + "YMM16", + "YMM17", + "YMM18", + "YMM19", + "YMM20", + "YMM21", + "YMM22", + "YMM23", + "YMM24", + "YMM25", + "YMM26", + "YMM27", + "YMM28", + "YMM29", + "YMM30", + "YMM31", + ]) + REGS.extend(["RESERVED_YMM"]*32) + REGS.extend(["ZMM0", + "ZMM1", + "ZMM2", + "ZMM3", + "ZMM4", + "ZMM5", + "ZMM6", + "ZMM7", + "ZMM8", + "ZMM9", + "ZMM10", + "ZMM11", + "ZMM12", + "ZMM13", + "ZMM14", + "ZMM15", + "ZMM16", + "ZMM17", + "ZMM18", + "ZMM19", + "ZMM20", + "ZMM21", + "ZMM22", + "ZMM23", + "ZMM24", + "ZMM25", + "ZMM26", + "ZMM27", + "ZMM28", + "ZMM29", + "ZMM30", + "ZMM31", + ]) + REGS.extend(["RESERVED_ZMM"]*32) + REGS.extend(["K0", + "K1", + "K2", + "K3", + "K4", + "K5", + "K6", + "K7"]) + REGS.extend(["RESERVED_OPMASK"]*8) + REGS.extend([ + "BND0", + "BND1", + "BND2", + "BND3", + ]) + + # Map each name back to an ID. + for idx, val in enumerate(REGS): + REVERSE_REGS[val] = idx + + +def strip_alias(reg_name: RegName): + """ + Reduce different names that represent portions of the same register to a single name. 
+ """ + if reg_name in ["RAX", "EAX", "AX", "AL"]: + return "RAX" + elif reg_name in ["RBX", "EBX", "BX", "BL"]: + return "RBX" + elif reg_name in ["RCX", "ECX", "CX", "CL"]: + return "RCX" + elif reg_name in ["RDX", "EDX", "DX", "DL"]: + return "RDX" + elif reg_name in ["RDI", "EDI", "DI", "DIL"]: + return "RDI" + elif reg_name in ["RSI", "ESI", "SI", "SIL"]: + return "RSI" + elif reg_name in ["RSP", "ESP", "SP", "SPL"]: + return "RSP" + elif reg_name in ["RBP", "EBP", "BP", "BPL"]: + return "RBP" + elif reg_name in ["R8", "R8D", "R8W", "R8L"]: + return "R8" + elif reg_name in ["R9", "R9D", "R9W", "R9L"]: + return "R9" + elif reg_name in ["R10", "R10D", "R10W", "R10L"]: + return "R10" + elif reg_name in ["R11", "R11D", "R11W", "R11L"]: + return "R11" + + return reg_name + + +def reg_id_to_stripped_name(reg_idx: RegId): + """ + Get the name for a reg id after stripping the alias, treating registers that alias as + the same register. + """ + return strip_alias(REGS[reg_idx]) diff --git a/consfuzz/triage/rvzr_trace.py b/consfuzz/triage/rvzr_trace.py new file mode 100644 index 000000000..d739690bf --- /dev/null +++ b/consfuzz/triage/rvzr_trace.py @@ -0,0 +1,266 @@ +""" +File: Structured parsing for revizor's debug traces. + +Copyright (C) Microsoft Corporation +SPDX-License-Identifier: MIT +""" + +from typing import Any, Dict, Final, List, Optional + +from rvzr.model_dynamorio.trace_decoder import DebugTraceEntryType + +from .shared_types import * +from .regs import REGS, strip_alias + +# Entries that are relevant to an instruction. +_INST_ENTRIES: Final[List[DebugTraceEntryType]] = [ + DebugTraceEntryType.ENTRY_REG_DUMP, + DebugTraceEntryType.ENTRY_READ, + DebugTraceEntryType.ENTRY_WRITE, + DebugTraceEntryType.ENTRY_LOC, + DebugTraceEntryType.ENTRY_REG_DUMP_EXTENDED, + DebugTraceEntryType.ENTRY_USE_DEF, +] +# Entries that indicate the instruction information has ended. +_INST_TERMINATORS: Final[List[DebugTraceEntryType]] = [ + DebugTraceEntryType.ENTRY_REG_DUMP, + DebugTraceEntryType.ENTRY_EOT, + DebugTraceEntryType.ENTRY_EXCEPTION, + DebugTraceEntryType.ENTRY_CHECKPOINT, + DebugTraceEntryType.ENTRY_ROLLBACK, + DebugTraceEntryType.ENTRY_ROLLBACK_STORE, +] + + +class ParsedInst: + """ + Class holding all the information related to a specific instruction that can be parsed from + a debug trace. + """ + entries: Dict[DebugTraceEntryType, Any] + regs: Dict[str, int] + start: TraceLineNum + + def __init__(self) -> None: + self.entries = {} + self.regs = {} + + def digest(self, entry: Any) -> bool: + """ + Add information coming from one line of the debug trace to this instruction. + Returns false if the entry was not "digested", i.e. it was not added to the instruction. + """ + t = DebugTraceEntryType(entry.type) + + if t in _INST_ENTRIES: + # Collect all reads and writes in two lists. 
diff --git a/consfuzz/triage/rvzr_trace.py b/consfuzz/triage/rvzr_trace.py
new file mode 100644
index 000000000..d739690bf
--- /dev/null
+++ b/consfuzz/triage/rvzr_trace.py
@@ -0,0 +1,266 @@
+"""
+File: Structured parsing for revizor's debug traces.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+
+from typing import Any, Dict, Final, List, Optional
+
+from rvzr.model_dynamorio.trace_decoder import DebugTraceEntryType
+
+from .shared_types import *
+from .regs import REGS, strip_alias
+
+# Entries that are relevant to an instruction.
+_INST_ENTRIES: Final[List[DebugTraceEntryType]] = [
+    DebugTraceEntryType.ENTRY_REG_DUMP,
+    DebugTraceEntryType.ENTRY_READ,
+    DebugTraceEntryType.ENTRY_WRITE,
+    DebugTraceEntryType.ENTRY_LOC,
+    DebugTraceEntryType.ENTRY_REG_DUMP_EXTENDED,
+    DebugTraceEntryType.ENTRY_USE_DEF,
+]
+# Entries that indicate the instruction information has ended.
+_INST_TERMINATORS: Final[List[DebugTraceEntryType]] = [
+    DebugTraceEntryType.ENTRY_REG_DUMP,
+    DebugTraceEntryType.ENTRY_EOT,
+    DebugTraceEntryType.ENTRY_EXCEPTION,
+    DebugTraceEntryType.ENTRY_CHECKPOINT,
+    DebugTraceEntryType.ENTRY_ROLLBACK,
+    DebugTraceEntryType.ENTRY_ROLLBACK_STORE,
+]
+
+
+class ParsedInst:
+    """
+    Class holding all the information related to a specific instruction that can be parsed from
+    a debug trace.
+    """
+    entries: Dict[DebugTraceEntryType, Any]
+    regs: Dict[str, int]
+    start: TraceLineNum
+
+    def __init__(self) -> None:
+        self.entries = {}
+        self.regs = {}
+
+    def digest(self, entry: Any) -> bool:
+        """
+        Add information coming from one line of the debug trace to this instruction.
+        Returns False if the entry was not "digested", i.e. it was not added to the instruction.
+        """
+        t = DebugTraceEntryType(entry.type)
+
+        if t in _INST_ENTRIES:
+            # Collect all reads and writes in two lists.
+            if t in [DebugTraceEntryType.ENTRY_READ, DebugTraceEntryType.ENTRY_WRITE]:
+                if t not in self.entries:
+                    # Create the list of reads/writes if it is not there
+                    self.entries[t] = []
+                self.entries[t].append(entry)
+            else:
+                # Simply add the entry in a map
+                self.entries[t] = entry
+
+            # Parse registers
+            if t == DebugTraceEntryType.ENTRY_REG_DUMP:
+                self.regs["RAX"] = entry.regs.xax
+                self.regs["RBX"] = entry.regs.xbx
+                self.regs["RCX"] = entry.regs.xcx
+                self.regs["RDX"] = entry.regs.xdx
+                self.regs["RSI"] = entry.regs.xsi
+                self.regs["RDI"] = entry.regs.xdi
+            elif t == DebugTraceEntryType.ENTRY_REG_DUMP_EXTENDED:
+                self.regs["RSP"] = entry.regs_2.rsp
+                self.regs["RBP"] = entry.regs_2.rbp
+                self.regs["R8"] = entry.regs_2.r8
+                self.regs["R9"] = entry.regs_2.r9
+                self.regs["R10"] = entry.regs_2.r10
+                self.regs["R11"] = entry.regs_2.r11
+
+            return True
+
+        return False
+
+    def get_pc(self) -> int:
+        """
+        Return the PC of this instruction.
+        """
+        return self.entries[DebugTraceEntryType.ENTRY_REG_DUMP].regs.pc
+
+    def get_loc(self) -> str:
+        """
+        Return the location string (module+offset).
+        """
+        if DebugTraceEntryType.ENTRY_LOC in self.entries:
+            loc_info = self.entries[DebugTraceEntryType.ENTRY_LOC].loc
+            module = ''.join([x.decode('utf-8') for x in loc_info.module_name])
+            return module + '+' + str(loc_info.offset)
+        return 'unknown+0x0'
+
+    def get_reg_val(self, reg_id: RegId) -> Optional[int]:
+        """
+        Find the value of a register, if it's among the ones logged by the tracer.
+        NOTE: XMM and other special registers are never logged by the debug tracer, but we
+        still follow them.
+        """
+        reg_name = strip_alias(REGS[reg_id])
+
+        reg_val = None
+        if reg_name in self.regs:
+            reg_val = self.regs[reg_name]
+
+        return reg_val
+
+    def get_reg_uses(self) -> list[RegId]:
+        """
+        Return the ids of the registers used directly by this instruction.
+        """
+        return [x for x in self.entries[DebugTraceEntryType.ENTRY_USE_DEF].def_use.reg_use
+                if x != 0]
+
+    def get_mem_uses(self) -> list[RegId]:
+        """
+        Return the ids of the registers used to compute a memory address in this instruction.
+        """
+        return [x for x in self.entries[DebugTraceEntryType.ENTRY_USE_DEF].def_use.mem_use
+                if x != 0]
+
+    def get_mem_reads(self) -> list[tuple[MemAddr, int]]:
+        """
+        Return a list of (address, value) pairs for each memory location read by this
+        instruction.
+        """
+        if DebugTraceEntryType.ENTRY_READ not in self.entries:
+            return []
+        return [(x.mem.address, x.mem.value)
+                for x in self.entries[DebugTraceEntryType.ENTRY_READ]]
+
+    def get_uses(self, regs: bool, mem: bool) -> list[tuple[Use, int]]:
+        """
+        Get a list of (use, value) pairs for the registers/memory locations used by this
+        instruction.
+        """
+        uses = []
+        # Mem uses
+        if mem:
+            for address, val in self.get_mem_reads():
+                uses.append((Use(UseType.MEM, address), val))
+        # Reg uses
+        if regs:
+            for reg_id in self.get_reg_uses():
+                reg_val = self.get_reg_val(reg_id)
+                uses.append((Use(UseType.REG, reg_id), reg_val))
+
+        return uses
+
+
+class TraceState:
+    """
+    Associates a trace with a cursor that is aware of the currently-parsed instruction.
+    This avoids grouping all the entries into parsed instructions upfront, allowing creation
+    of ParsedInst objects on-demand.
+    """
+    trace: list[Any]
+    cur_idx: TraceLineNum
+    cur_entry: Any
+    cur_inst: ParsedInst
+
+    def __init__(self, trace: list[Any], idx: TraceLineNum = 0) -> None:
+        self.trace = trace
+        self.cur_idx = idx
+        self.cur_entry = self.trace[idx]
+        self.cur_inst = ParsedInst()
+
+    def seek(self, lineno: TraceLineNum) -> None:
+        """
+        Move cursor to a specific line of the trace.
+        """
+        self.cur_idx = lineno
+        self.cur_entry = self.trace[self.cur_idx]
+
+    def _prev_entry(self) -> None:
+        """
+        Move cursor to previous trace line.
+        """
+        self.seek(self.cur_idx - 1)
+
+    def _next_entry(self) -> None:
+        """
+        Move cursor to next trace line.
+        """
+        self.seek(self.cur_idx + 1)
+
+    def prev_entry(self) -> None:
+        """
+        Move cursor to the previous trace line in the same speculative window
+        (skips nested speculation windows that happened before the current entry).
+        """
+        cur_nesting = self.cur_entry.nesting_level
+        self._prev_entry()
+        while self.cur_entry.nesting_level > cur_nesting:
+            self._prev_entry()
+
+    def next_entry(self) -> None:
+        """
+        Move cursor to the next trace line in the same speculative window
+        (skips nested speculation windows that happened after the current entry).
+        """
+        cur_nesting = self.cur_entry.nesting_level
+        self._next_entry()
+        while self.cur_entry.nesting_level > cur_nesting:
+            self._next_entry()
+
+    def parse_current(self) -> ParsedInst:
+        """
+        Find what instruction is related to the current line and parse all relevant
+        information for that instruction.
+        """
+        # Find the start of the instruction
+        while DebugTraceEntryType(self.cur_entry.type) != DebugTraceEntryType.ENTRY_REG_DUMP:
+            self.prev_entry()
+
+        first_line = self.cur_idx
+
+        # Digest first entry
+        self.cur_inst = ParsedInst()
+        self.cur_inst.digest(self.cur_entry)
+        self.next_entry()
+
+        # Digest other entries until we find a terminator
+        while DebugTraceEntryType(self.cur_entry.type) not in _INST_TERMINATORS:
+            self.cur_inst.digest(self.cur_entry)
+            self.next_entry()
+
+        # Move back to the first entry
+        self.seek(first_line)
+        self.start = first_line
+        return self.cur_inst
+
+    def find_last_def(self, use: Use, until: TraceLineNum) -> Optional[TraceLineNum]:
+        """
+        Find the most recent instruction in the trace before `until` that defined `use`
+        (`use.addr` is a register id if `use.use_type` is REG, otherwise it's a memory address).
+        Returns the corresponding trace line, or None if no previous definition is found.
+        """
+        cur_nesting = self.trace[until].nesting_level
+
+        # Start from `until`
+        idx = until
+        while idx > 0:
+            # Go to previous line
+            idx -= 1
+            e = self.trace[idx]
+            # Only consider entries of the same spec window or architectural entries
+            if e.nesting_level > cur_nesting:
+                continue
+            cur_nesting = e.nesting_level
+
+            # If we're looking for memory defs, check memory stores.
+            if use.use_type == UseType.MEM and \
+                    DebugTraceEntryType(e.type) == DebugTraceEntryType.ENTRY_WRITE:
+                if use.addr == e.mem.address:
+                    return idx
+            # If we're looking for register defs, check USE_DEF entries.
+            elif use.use_type == UseType.REG and \
+                    DebugTraceEntryType(e.type) == DebugTraceEntryType.ENTRY_USE_DEF:
+                if use.addr in e.def_use.reg_def:
+                    return idx
+
+        # Reached the start of the trace.
+        return None
diff --git a/consfuzz/triage/shared_types.py b/consfuzz/triage/shared_types.py
new file mode 100644
index 000000000..d739f584d
--- /dev/null
+++ b/consfuzz/triage/shared_types.py
@@ -0,0 +1,42 @@
+"""
+File: Types shared between different modules of the inspector.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+
+from enum import Enum
+
+
+type RegId = int
+type RegName = str
+type MemAddr = int
+type TraceLineNum = int
+
+class UseType(Enum):
+    """
+    Indicates a register or memory use.
+    """
+    MEM = 0
+    REG = 1
+
+
+class Use:
+    """
+    Indicates a use of a specific register/memory value.
+    """
+    use_type: UseType
+    addr: RegId | MemAddr
+
+    def __init__(self, use_type: UseType, addr: RegId | MemAddr) -> None:
+        self.use_type = use_type
+        self.addr = addr
+
+    def __eq__(self, other):
+        if type(other) is type(self):
+            return (self.use_type == other.use_type) and (self.addr == other.addr)
+        return False
+
+    def __hash__(self):
+        return hash((self.use_type, self.addr))
diff --git a/consfuzz/triage/symbol_server.py b/consfuzz/triage/symbol_server.py
new file mode 100644
index 000000000..5c0b8359e
--- /dev/null
+++ b/consfuzz/triage/symbol_server.py
@@ -0,0 +1,153 @@
+"""
+File: SymbolServer and subclasses, used to print source locations from raw PCs.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+
+from typing import Optional
+
+from elftools.elf.elffile import ELFFile
+from pygdbmi.gdbcontroller import GdbController
+
+
+class SymbolServer:
+    """
+    Superclass for getting location information from a binary.
+    """
+    def __init__(self, binary: str) -> None:
+        pass
+
+    def get_location(self, address: int) -> Optional[str]:
+        """
+        Given a PC in the binary, return a string representing the source location and
+        the corresponding source code.
+        """
+        return None
+
+
+class GdbSymbolServer(SymbolServer):
+    """
+    Implement symbol name retrieval using GDB.
+    NOTE: this is significantly slower than parsing the ELF binary, but it provides much
+    more information even for code with missing symbols.
+    """
+    def __init__(self, binary: str) -> None:
+        # Start gdb process
+        self.gdbmi = GdbController()
+        # Load binary
+        _ = self.gdbmi.write(f'file {binary}')
+
+    def _gdb_exec(self, cmd: str) -> str:
+        """
+        Execute a gdb command and return what gdb printed as a result.
+        """
+        response = self.gdbmi.write(cmd)
+        return response[1]['payload'].strip()
+
+    def get_func_name(self, address: int) -> str:
+        """
+        Return function name and assembly code corresponding to a program's PC
+        """
+        # Get function name from GDB
+        payload = self._gdb_exec(f'info sym {hex(address)}')
+
+        # Format as <symbol>+<offset>
+        splitted = payload.split(' ')
+        loc = splitted[0] + '+' + splitted[2]
+        # Disassemble one instruction at that location
+        payload = self._gdb_exec(f'x/1i {loc}')
+        asm = ':'.join(payload.split(':')[1:])
+
+        return loc.strip() + " " + asm.strip()
+
+    def get_location(self, address: int) -> str:
+        """
+        Return a string with the source location (file:line) and the corresponding source
+        code if available, or function_name+offset and disassembly if no source code is
+        available for that location (e.g. library code with no symbols).
+        """
+        # Try to get source location from gdb
+        payload = self._gdb_exec(f'info line *{hex(address)}')
+
+        # If not available, get at least function_name+offset and assembly
+        if "No line number" in payload:
+            return self.get_func_name(address)
+        else:
+            # Format as <file>:<line>
+            splitted = payload.split(' ')
+            loc = splitted[3].replace('\"', '') + ':' + splitted[1]
+            # Get source code from GDB
+            payload = self._gdb_exec(f'list {loc.strip()},{loc.strip()}')
+            code = " " + ' '.join(payload.split(' ')[1:])
+
+            return loc.strip() + " " + code.strip()
+
+
+class ElfSymbolServer(SymbolServer):
+    """
+    Implement symbol name retrieval using the debug symbols embedded in the binary.
+    NOTE: this is significantly faster than using GDB, but some symbols might not be
+    available.
+    """
+    def __init__(self, binary: str) -> None:
+        with open(binary, "rb") as f:
+            self._elf_data = ELFFile(f)
+            self.dwarf_info = self._elf_data.get_dwarf_info()
+
+    def get_location(self, address: int) -> Optional[str]:
+        """
+        Find the DWARF information in the ELF binary corresponding to a given PC.
+        The corresponding source code is always empty.
+        """
+        # Go over all the line programs in the DWARF information, looking for
+        # one that describes the given address.
+        for CU in self.dwarf_info.iter_CUs():
+            # First, look at line programs to find the file/line for the address
+            line = self.dwarf_info.line_program_for_CU(CU)
+            if not line:
+                continue
+            delta = 1 if line.header.version < 5 else 0
+            prevstate = None
+            for entry in line.get_entries():
+                # We're interested in those entries where a new state is assigned
+                if entry.state is None:
+                    continue
+                # Looking for a range of addresses in two consecutive states that
+                # contain the required address.
+                if prevstate and prevstate.address <= address < entry.state.address:
+                    filename = line['file_entry'][prevstate.file - delta].name.decode()
+                    lineno = prevstate.line
+                    return f"{filename}:{lineno}"
+                if entry.state.end_sequence:
+                    # For the state with `end_sequence`, `address` means the address
+                    # of the first byte after the target machine instruction
+                    # sequence and other information is meaningless. We clear
+                    # prevstate so that it's not used in the next iteration. Address
+                    # info is used in the above comparison to see if we need to use
+                    # the line information for the prevstate.
+                    prevstate = None
+                else:
+                    prevstate = entry.state
+
+        # if we're here, we didn't find a symbol
+        return None
+
+
+class CombinedSymbolServer(SymbolServer):
+    """
+    Get the symbol location from the ELF binary when available, or fall back to the
+    GDB server if needed.
+    """
+    def __init__(self, binary: str) -> None:
+        # Fast
+        self.elf_server = ElfSymbolServer(binary)
+        # Slow
+        self.gdb_server = GdbSymbolServer(binary)
+
+    def get_location(self, address: int) -> Optional[str]:
+        result = self.elf_server.get_location(address)
+        if result is None:
+            result = self.gdb_server.get_func_name(address)
+
+        return result
diff --git a/consfuzz/triage/use_def_graph.py b/consfuzz/triage/use_def_graph.py
new file mode 100644
index 000000000..ce9df7894
--- /dev/null
+++ b/consfuzz/triage/use_def_graph.py
@@ -0,0 +1,197 @@
+"""
+File: Implementation of the Use-Def Graph, used for nicer printing (and potentially graph analysis).
+ +Copyright (C) Microsoft Corporation +SPDX-License-Identifier: MIT +""" + +from typing import Dict, Optional +from enum import Enum + +from .regs import REGS +from .shared_types import * + +#--------------------------------------------------------------------------------------------------- +# Node and Edge Types +#--------------------------------------------------------------------------------------------------- + +class UseDefEdge: + """ + Edge between an instruction and all of its users (i.e. instructions that use either a + register value or a memory location defined by this instruction). + """ + dst: TraceLineNum + use: Use + + def __init__(self, dst: TraceLineNum, use: Use) -> None: + self.dst = dst + self.use = use + + def label(self) -> str: + return hex(self.use.addr) if self.use.use_type == UseType.MEM else REGS[self.use.addr] + + +class UseDefNode: + """ + Node representing an instruction in the trace. + """ + line: TraceLineNum + trimmed: bool + + def __init__(self, line: TraceLineNum): + self.line = line + self.trimmed = False + + def name(self) -> str: + if self.line > 0: + return f"node_{self.line}" + else: + return f"terminator_{self.line * -1}" + +class TerminatorNodeType(Enum): + """ + Final nodes of the reverse use-def graph. + """ + DECLASSIFIED = 0 + FIRST_USE = 1 + TRIMMED_BY_DIFF = 2 + NO_FOLLOW = 3 + KNOWN_SYMBOL = 4 + KEY = 5 + +class TerminatorNode(UseDefNode): + """ + Node representing the end of a branch of the use-def chain. + """ + _type: TerminatorNodeType + + def __init__(self, _type: TerminatorNodeType, line: TraceLineNum): + super().__init__(line) + self._type = _type + +#--------------------------------------------------------------------------------------------------- +# Graph Implementation +#--------------------------------------------------------------------------------------------------- + +class UseDefGraph: + nodes: Dict[TraceLineNum, UseDefNode] + edges: Dict[TraceLineNum, list[UseDefEdge]] + _terminator_idx: int + head: TraceLineNum + + def __init__(self): + self.nodes = {} + self.edges = {} + self._terminator_idx = -1 + self.head = None + + #----------------------------------------------------------------------------------------------- + # Graph Constructions + #----------------------------------------------------------------------------------------------- + + def get_or_create(self, line: TraceLineNum) -> UseDefNode: + """ + Get the node corresponding to a trace line, or create if it does not exist. + """ + if line not in self.nodes.keys(): + self.nodes[line] = UseDefNode(line) + if self.head is None: + self.head = line + return self.nodes[line] + + def add_terminator(self, terminator_type: TerminatorNodeType) -> TraceLineNum: + """ + Create a terminator node and return its unique id. + """ + node = TerminatorNode(terminator_type, self._terminator_idx) + # NOTE: This is a bit of a hack: to identify nodes in the dictionary, + # we typically use the line number, but for terminators we might not + # have an associated line. We use unique negative numbers for that. 
+ self.nodes[self._terminator_idx] = node + self._terminator_idx -= 1 + return node.line + + def link(self, src: TraceLineNum, dst: TraceLineNum, use: Use): + if src not in self.edges.keys(): + self.edges[src] = [] + self.edges[src].append(UseDefEdge(dst, use)) + + def trim(self, node_id: TraceLineNum) -> None: + if node_id not in self.nodes.keys(): + self.nodes[node_id] = UseDefNode(node_id) + self.nodes[node_id].trimmed = True + + #----------------------------------------------------------------------------------------------- + # Graph Printing + #----------------------------------------------------------------------------------------------- + + def print_recursive(self, node=None, prefix=""): + if node is None: + node = self.nodes[self.head] + + print(prefix + str(node.line)) + prefix += " " + + if node.trimmed: + print(prefix + " Trimmed") + return + if node.line not in self.edges.keys(): + print(prefix + " END!") + return + + for e in self.edges[node.line]: + use_str = hex(e.use.addr) if e.use.use_type == UseType.MEM else REGS[e.use.addr] + print(prefix + "Use of " + use_str) + + self.print_recursive(self.nodes[e.dst], prefix + " ") + + def _draw_recursive(self, node: UseDefNode, parent:UseDefNode, edge: UseDefEdge, dot_file, visited): + node_name = node.name() + is_terminator = isinstance(node, TerminatorNode) or (node.line not in self.edges.keys()) + already_visited = node.line in visited + + if node.trimmed: + return + if isinstance(node, TerminatorNode) and node._type in [TerminatorNodeType.NO_FOLLOW]: + return + + if not already_visited: + # Draw node + dot_file.write(node_name) + if is_terminator: + dot_file.write(f" [shape=\"rectangle\"") + if isinstance(node, TerminatorNode): + dot_file.write(f",label=\"{node._type.name}\"") + if node._type == TerminatorNodeType.DECLASSIFIED: + dot_file.write(",style=\"filled\",fillcolor=\"orange\"") + if node._type == TerminatorNodeType.KNOWN_SYMBOL: + dot_file.write(",style=\"filled\",fillcolor=\"yellow\"") + if node._type == TerminatorNodeType.FIRST_USE: + dot_file.write(",style=\"filled\",fillcolor=\"cyan\"") + if node._type == TerminatorNodeType.KEY: + dot_file.write(",style=\"filled\",fillcolor=\"red\"") + dot_file.write(f"]") + dot_file.write("\n") + visited.add(node.line) + + # Draw edge + if parent is not None: + parent_name = parent.name() + dot_file.write(f"{node_name} -> {parent_name} [label=\"{edge.label()}\"]\n") + + if is_terminator: + return + if already_visited: + return + + # Visit children + for e in self.edges[node.line]: + self._draw_recursive(self.nodes[e.dst], parent=node, edge=e, dot_file=dot_file, visited=visited) + + def draw(self, dot_file: str) -> None: + init = self.nodes[self.head] + f = open(dot_file, "w") + f.write("digraph \"usedef\" {\n") + self._draw_recursive(init, parent=None, edge=None, dot_file=f, visited=set()) + f.write("\n}") + f.close() diff --git a/consfuzz/triage/use_def_tracker.py b/consfuzz/triage/use_def_tracker.py new file mode 100644 index 000000000..1f33539e3 --- /dev/null +++ b/consfuzz/triage/use_def_tracker.py @@ -0,0 +1,305 @@ +""" +File: Implementation of differential origin tracking for leaked values using use-def chains. 
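+The tracker walks use-def chains backwards over multiple traces in lockstep, trimming
+branches whose values are identical across the traces (differential pruning).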
+ +Copyright (C) Microsoft Corporation +SPDX-License-Identifier: MIT +""" + +from typing import Any, Dict, List, Optional + +from .regs import REGS, strip_alias, init_reg_map +from .rvzr_trace import TraceState, ParsedInst +from .shared_types import * +from .use_def_graph import UseDefGraph, TerminatorNodeType, TerminatorNode, UseDefNode, UseDefEdge +from .symbol_server import SymbolServer +from .config import LeakageInspectorConfig + +# -------------------------------------------------------------------------------------------------- +# Local Types +# -------------------------------------------------------------------------------------------------- + +# Values used by an instruction. +UsesDict = Dict[Use, int] +# List of values for the same use across different traces. +MergedUsesDict = Dict[Use, List[Optional[int]]] + +def _merge_uses(uses_dicts: List[UsesDict]) -> MergedUsesDict: + """ + Merge uses from multiple traces. + This will return a dictionary that has as keys the union of all the keys (sorted by type) + and as values an array of values where each position corresponds to a separate trace. + If a use is not present in one of the traces, it's corresponding value in the array is None. + """ + merged = {} + total = len(uses_dicts) + + for idx, uses in enumerate(uses_dicts): + for use, val in uses: + if use not in merged.keys(): + # If there's no entry, create a list as long as the number of dicts to merge + merged[use] = [None]*total + # Add the value at the index corresponding to the current dict being merged + merged[use][idx] = val + + return merged + +def print_use(use: Use) -> str: + return hex(use.addr) if use.use_type == UseType.MEM else REGS[use.addr] + +class DifferentialTraceState: + """ + Hold the state of multiple traces together (for differential analysis). + """ + states: list[TraceState] + + def __init__(self, states: list[TraceState]) -> None: + self.states = states + + +# -------------------------------------------------------------------------------------------------- +# Tracker implementation +# -------------------------------------------------------------------------------------------------- + +class UseDefTracker: + """ + Class that implements reverse use-def exploration for multiple + traces at a time. + """ + _visited: list[list[int, int]] + _out_file: str + _symbol_server: SymbolServer + _prefix: str + _graph: UseDefGraph + _config: LeakageInspectorConfig + + def __init__(self, out_file: str, config: LeakageInspectorConfig, symbol_server: SymbolServer) -> None: + self._out_file = out_file + self._symbol_server = symbol_server + self._visited = [] + self._prefix = "" + self._out_file = open(out_file, "w") + self._graph = UseDefGraph() + self._config = config + + # Initialize the list of registers + init_reg_map() + + def _set_prefix(self, prefix: str) -> None: + self._prefix = prefix + + def _print(self, x: Any) -> None: + """ + Print to the selected out file. + """ + print(str(self._prefix) + str(x), file=self._out_file) + + def _get_loc(self, instr: ParsedInst) -> str: + """ + Return a formatted string representing the source location of an instruction. + """ + loc = self._symbol_server.get_location(instr.get_pc()) + if loc is None: + return instr.get_loc() + return loc + + def _get_short_descr(self, instr: ParsedInst, line: int) -> str: + """ + Return a formatted string representing an instruction in the trace. 
+ """ + return f"{hex(instr.get_pc())} (line: {line}) {self._get_loc(instr)}" + + def _get_defs(self, use: Use, cur_state: DifferentialTraceState) -> DifferentialTraceState: + """ + For a given use, find the trace line corresponding to the last definition of that + register or memory location in all the traces of `cur_state`. + """ + defs = [] + + # Get def of corresponding register/memory location for all traces. + for s in cur_state.states: + def_line = s.find_last_def(use, until=s.cur_idx) + if def_line: + defs.append(TraceState(s.trace, def_line)) + else: + defs.append(None) + # Group into a single DifferentialState. + return DifferentialTraceState(defs) + + def _step_def_use_chain(self, diff_state: DifferentialTraceState, follow_regs: bool, follow_mem: bool) -> list[DifferentialTraceState]: + """ + Go "up" one step in the def use chain for multiple traces at the same time. This returns a + DifferentialTraceState for each register/memory location used by the current state. + """ + # Parse the current instruction of each of the parallel traces. + cur_insts = [s.parse_current() for s in diff_state.states] + cur_line = diff_state.states[0].cur_idx + # Print current instruction + self._print(self._get_short_descr(cur_insts[0], cur_line)) + + # Check if the current instruction is a sink. + loc = self._get_loc(cur_insts[0]) + if any(loc.endswith(x) for x in self._config.declassified): + self._print(" END: Found Declassified") + self._graph.nodes[cur_line] = TerminatorNode(TerminatorNodeType.DECLASSIFIED, cur_line) + return [] + if any(loc.endswith(x) for x in self._config.key): + self._print(" END: Found Declassified") + self._graph.nodes[cur_line] = TerminatorNode(TerminatorNodeType.KEY, cur_line) + return [] + + + # Group together uses of the same register/memory location from different traces. + uses = [i.get_uses(regs=follow_regs, mem=follow_mem) for i in cur_insts] + merged = _merge_uses(uses) + + # Apply filters. + to_follow = [] + trimmed_by_diff = 0 + for use, vals in merged.items(): + + if use.use_type == UseType.REG and strip_alias(REGS[use.addr]) in self._config.get().dont_follow: + # We avoid following some registers that are not logged and are known to + # cause overtainting (i.e. AVX K registers). + self._print(" Use of " + print_use(use)) + self._print(" SKIP: No-follow register ") + idx = self._graph.add_terminator(TerminatorNodeType.NO_FOLLOW) + self._graph.link(cur_line, idx, use) + continue + + elif use.use_type == UseType.MEM and self._config.get_sym_annotation(use.addr) is not None: + name, offset = self._config.get_sym_annotation(use.addr) + self._print(" Use of " + print_use(use)) + self._print(f" END: Annotated symbol: {name}+{offset}") + idx = self._graph.add_terminator(TerminatorNodeType.KNOWN_SYMBOL) + self._graph.link(cur_line, idx, use) + continue + + elif all(v is None for v in vals): + # If the value is unknown for _all_ traces, it means that the tracer + # doesn't log it logged: don't remove it. + self._print(" Use of " + print_use(use)) + self._print(" INFO: Untracked value, visiting") + pass + + elif any(v is None for v in vals): + # If a value is only used in some of the traces (e.g. a memory location that is + # only read in one of the two tarces), we can't continue differentially: + # stop backwards tracking for this use. 
+ self._print(" Use of " + print_use(use)) + self._print(" SKIP: Use only appears in some of the traces") + idx = self._graph.add_terminator(TerminatorNodeType.TRIMMED_BY_DIFF) + self._graph.link(cur_line, idx, use) + continue + + elif len(vals) > 1 and all(v == vals[0] for v in vals): + # If all the traces agree on a value, we can skip tracking for this use. + self._print(" Use of " + print_use(use)) + self._print(" SKIP: Trimmed by differential tracking") + idx = self._graph.add_terminator(TerminatorNodeType.TRIMMED_BY_DIFF) + trimmed_by_diff += 1 + self._graph.link(cur_line, idx, use) + continue + + to_follow.append(use) + + if trimmed_by_diff == len(merged.items()): + self._graph.trim(cur_line) + + # Gef the def of each use + next_states = [] + for use in to_follow: + defs = self._get_defs(use, diff_state) + + # If there's no previous definition, we reached the top. + if any(d is None for d in defs.states): + self._print(" Use of " + print_use(use)) + self._print(" END: First use") + idx = self._graph.add_terminator(TerminatorNodeType.FIRST_USE) + self._graph.link(cur_line, idx, use) + break + + defs.states[0].parse_current() + node2 = self._graph.get_or_create(defs.states[0].cur_idx) + if not node2.trimmed: + next_states.append(defs) + self._graph.link(cur_line, node2.line, use) + + return next_states + + def follow_def_use_chain_recursive(self, diff_states: list[DifferentialTraceState], follow_regs: bool, follow_mem: bool) -> None: + """ + Recursively explore the def-use chain starting from a set of states. + Only for the first step, we might want to follow only memory uses (for D-type violations) + or only register uses (for I-type violations). + """ + idx = 0 + prefix = self._prefix + for diff_state in diff_states: + cur_lines = [s.cur_idx for s in diff_state.states] + # Cache results to avoid recomputing stuff. + if cur_lines in self._visited: + self._print(" └─ Skipping (already visited)") + continue + if all(x != None for x in cur_lines): + self._visited.append(cur_lines) + + # Check if it's the last state. + if idx == len(diff_states) - 1: + cur_prefix = prefix + " └─" + else: + cur_prefix = prefix + " ├─" + + # Perform one reverse step in the def-use chain. + self._set_prefix(cur_prefix) + next_list = self._step_def_use_chain(diff_state, follow_regs, follow_mem) + if idx == len(diff_states) - 1: + next_prefix = prefix + " " + else: + next_prefix = prefix + " │" + + # Follow all the uses recursively. + self._set_prefix(next_prefix) + self.follow_def_use_chain_recursive(next_list, follow_regs=True, follow_mem=True) + idx += 1 + + def analyze(self, raw_trace1: list[Any], line1: TraceLineNum, + raw_trace2: Optional[list[Any]], line2: Optional[TraceLineNum], + violation: str) -> UseDefGraph: + # Initialize trace(s) + trace1 = TraceState(raw_trace1, line1) + trace2 = None + if raw_trace2 is not None: + trace2 = TraceState(raw_trace2, line2) + + if violation == "D": + # MEM violation: get all MEM uses + init_state = DifferentialTraceState([trace1]) + if trace2 is not None: + init_state.states.append(trace2) + self.follow_def_use_chain_recursive(diff_states=[init_state], follow_mem=True, follow_regs=False) + + elif violation == "I": + # PC violation: 1. go to previous instruction + # NOTE: if a trace has two different PCs it means that the control-flow + # instruction immediately preceding them had a different outcome. + trace1.prev_entry() + init_state = DifferentialTraceState([trace1]) + if trace2 is not None: + trace2.prev_entry() + init_state.states.append(trace2) + # 2. 
+            self.follow_def_use_chain_recursive(diff_states=[init_state], follow_mem=False, follow_regs=True)
+
+        elif violation == "C":
+            # INDCALL violation: get all REG uses
+            init_state = DifferentialTraceState([trace1])
+            if trace2 is not None:
+                init_state.states.append(trace2)
+            self.follow_def_use_chain_recursive(diff_states=[init_state], follow_mem=False, follow_regs=True)
+
+        else:
+            self._out_file.close()
+            assert False, "Unknown violation type"
+
+        self._out_file.close()
+        return self._graph
diff --git a/consfuzz/triager.py b/consfuzz/triager.py
new file mode 100755
index 000000000..0b8652e0b
--- /dev/null
+++ b/consfuzz/triager.py
@@ -0,0 +1,313 @@
+"""
+File: Inspect a single reported leakage.
+
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+
+import os
+import subprocess as sp
+from typing import Any, List, Optional, Tuple
+
+from rvzr.model_dynamorio.trace_decoder import TraceDecoder, TraceEntryType, DebugTraceEntryType
+
+from .triage import get_plugin_path
+from .triage.config import LeakageInspectorConfig
+from .triage.use_def_tracker import UseDefTracker
+from .triage.shared_types import TraceLineNum
+from .triage.symbol_server import SymbolServer, CombinedSymbolServer
+
+_TRACING_FLAGS = "--log-level 5 --debug-trace-output {dbg_trace_file} "
+
+type InstPc = int
+
+
+def _parse_traces_info(file_and_line: str, baseline: str) -> List[Tuple[str, TraceLineNum]]:
+    # Parse trace path and line from the cmdline arguments
+    splitted = file_and_line.split(':')
+    trace1 = ':'.join(splitted[:-2])
+    line1 = int(splitted[-2])
+
+    traces = [(trace1, line1)]
+
+    if baseline != 'none':
+        # Generate trace path and line for the baseline we should compare against (needed for
+        # differential analysis)
+        if baseline == 'auto':
+            input_name = os.path.basename(trace1)
+            ext = '.' + trace1.split('.')[-1]
+            trace2 = trace1.replace(input_name, '000' + ext)  # 000.trace or 000.dbgtrace
+        else:
+            trace2 = baseline
+
+        line2 = int(splitted[-1])
+        traces.append((trace2, line2))
+
+    return traces
+
+
+class LeakageInspector:
+    """
+    Extract information from the report for leakage analysis.
+    """
+    _decoder: TraceDecoder
+    _config: LeakageInspectorConfig
+
+    leak_trace: Optional[list[Any]]
+    debug_trace: Optional[list[Any]]
+
+    def __init__(self, config: LeakageInspectorConfig):
+        self._decoder = TraceDecoder()
+        self._config = config
+        self.leak_trace = None
+        self.debug_trace = None
+
+    #---------------------------------------------------------------------------
+    # Internal Helpers
+    #---------------------------------------------------------------------------
+    def _count_occurrences(self, trace: list[Any], pc: InstPc, until: TraceLineNum,
+                           entry_type: TraceEntryType) -> int:
+        """
+        Count occurrences of `pc` in `trace` before line `until` in a leakage trace.
+        """
+        count = 0
+        for entry in trace[:until]:
+            type_ = TraceEntryType(entry.type)
+            if type_ == entry_type and entry.addr == pc:
+                count += 1
+        return count
+
+    def _count_dbg_occurrences(self, trace: list[Any], pc: InstPc, until: TraceLineNum,
+                               only_arch: bool = False) -> int:
+        """
+        Count occurrences of `pc` in `trace` before line `until` in a debug trace.
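+        With `only_arch=True`, speculative entries (i.e. entries with nesting_level != 0)
+        are skipped, so only architectural executions of `pc` are counted.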
+ """ + count = 0 + for entry in trace[:until]: + type_ = DebugTraceEntryType(entry.type) + if only_arch and self._is_spec(entry): + continue + if type_ == DebugTraceEntryType.ENTRY_REG_DUMP and entry.regs.pc == pc: + count +=1 + return count + + + def _is_arch(self, entry: Any) -> bool: + """ + Is the entry architectural (debug trace entries only) + """ + return entry.nesting_level == 0 + + def _is_spec(self, entry: Any) -> bool: + """ + Is the entry speculative (debug trace entries only) + """ + return entry.nesting_level != 0 + + def _get_original_cmd(self, trace_file: str) -> str: + """ + Get the original command that was ran to produce a given trace. + NOTE: This requires that the original DynamoRIO command is logged in a separate `.log` + file that has the same name of the trace file (minus the extension). + """ + log_file = trace_file.replace(".trace", ".log").replace(".dbgtrace", ".log") + print(f" • Reading original command from logfile {log_file}", flush=True) + with open(log_file, "r") as log: + for l in log: + if l.startswith("$> "): + return l.replace("$> ", "").replace("\n","").strip() + + raise ValueError(f"Could not find command that produced {trace_file}") + + def _run_dbg_tracer(self, trace_file: str, regenerate_trace: bool) -> str: + """ + Produce a debug trace for a given test case. + Returns the file name of the trace. + """ + cmd = self._get_original_cmd(trace_file) + + # Add debug flags to command + dbg_trace_f = trace_file.replace(".trace", ".dbgtrace") + dbg_flags = _TRACING_FLAGS.format(dbg_trace_file=dbg_trace_f) + cmd = cmd.replace("libdr_model.so", f"libdr_model.so {dbg_flags} ") + # Output debug trace in human-readable format + # cmd += " > {dbg_trace_file}.asm".format(dbg_trace_file=dbg_trace_f) + # Run debug command + if regenerate_trace: + print(f"{cmd}\n", flush=True) + sp.check_call(cmd, shell=True) + + return dbg_trace_f + + #--------------------------------------------------------------------------- + # Leak trace analysis + #--------------------------------------------------------------------------- + def find_leak_pc(self, trace_file: str, trace_line: TraceLineNum) -> tuple[InstPc, TraceEntryType, int]: + """ + Given a trace file and a corresponding line, returns the PC of the intruction logged + at that line and the number of occurrences of that PC before that line. + This uniquely identifies an instruction in the trace. 
+ """ + print(f" • Decoding leak trace {trace_file}...", flush=True) + traces, _ = self._decoder.decode_trace_file(trace_file) + assert len(traces) == 1 + trace = traces[0] + self.leak_trace = trace + + # Find last pc right before trace_line + print(" • Finding leak PC...", flush=True) + cur_line = trace_line + while cur_line > 0: + cur_line -= 1 + entry = trace[cur_line] + + # For 'I' violations, we want to get the PC of the previous instruction + # For 'D' violations, we want to get the PC of the load/store instruction + # For indirect call violations, we get the target PC + entry_type = TraceEntryType(entry.type) + if entry_type in [TraceEntryType.ENTRY_PC, TraceEntryType.ENTRY_IND]: + leak_pc = entry.addr + n_occurrences = self._count_occurrences(trace, pc=leak_pc, until=cur_line, entry_type=entry_type) + n_occurrences += 1 # count also the last occurrence that we just found + print(f" • Found address {hex(leak_pc)} (occurrence #{n_occurrences})", flush=True) + return (leak_pc, entry_type, n_occurrences) + + raise ValueError(f"No instruction found for trace line {trace_line}") + + #--------------------------------------------------------------------------- + # Debug trace analysis + #--------------------------------------------------------------------------- + def find_dbg_line(self, trace_file: str, leak_pc: InstPc, entry_type: TraceEntryType, + leak_pc_count: int, regenerate_trace: bool) -> tuple[str, TraceLineNum]: + """ + Find the line that contains the `leak_pc_count`-th occurrence of `leak_pc` in the debug trace. + """ + print(" • Collecting debug trace...", flush=True) + dbg_trace_f = self._run_dbg_tracer(trace_file, regenerate_trace) + # Parse the debug trace + print(" • Decoding debug trace...", flush=True) + _, dbg_traces = self._decoder.decode_trace_file(dbg_trace_f) + assert len(dbg_traces) == 1 + dbg_trace = dbg_traces[0] + self.debug_trace = dbg_trace + + print(" • Analyzing debug trace...", flush=True) + last_xcpt: Optional[Any] = None + last_valid: TraceLineNum = 0 + last_lineno: TraceLineNum = 0 + n_found = 0 + + for entry in dbg_trace: + # NOTE: In some configurations (e.g. with --poison-value) the tracer will continue + # execution when it encouters a fault on a speculative path. When running in GDB, + # we cannot execute faulty instructions, so need to record the last _valid_ instruction. + if last_xcpt and entry.nesting_level < last_xcpt.nesting_level: + # Flush last_xcpt if we exited the corresponding speculation window + last_xcpt = None + if DebugTraceEntryType(entry.type) == DebugTraceEntryType.ENTRY_EXCEPTION: + if not last_xcpt: + # Record exceptions + last_xcpt = entry + + elif DebugTraceEntryType(entry.type) == DebugTraceEntryType.ENTRY_REG_DUMP: + if not last_xcpt: + # If there's not pending exception at this speculation level, + # update the last valid instruction + last_valid = last_lineno + + # Check if we found the PC we were looking for + if entry_type == TraceEntryType.ENTRY_PC: + if entry.regs.pc == leak_pc: + n_found += 1 + if n_found == leak_pc_count: + print(f"Done! Found leak at line {last_valid-1}") + return dbg_trace_f, last_valid-1 + + elif DebugTraceEntryType(entry.type) == DebugTraceEntryType.ENTRY_IND: + # Check if we found the indcall we were looking for + if entry_type == TraceEntryType.ENTRY_IND: + if entry.ind.target == leak_pc: + n_found += 1 + if n_found == leak_pc_count: + print(f"Done! 
+                            return dbg_trace_f, last_valid-1
+
+            last_lineno += 1
+
+        raise IndexError(f"Could not find occurrence {leak_pc_count} of pc {hex(leak_pc)} in {dbg_trace_f}")
+
+    #---------------------------------------------------------------------------
+    # GDB
+    #---------------------------------------------------------------------------
+    def generate_gdb_script(self, trace_file: str, trace_line: TraceLineNum) -> str:
+        """
+        Generate a GDB script that can follow the trace speculatively until the leakage point.
+        """
+        gdb_string = f"source {get_plugin_path()}/plugin.py"
+        gdb_string += "\nspec source " + trace_file
+        gdb_string += "\nspec goto " + str(trace_line)
+        gdb_string += "\nspec bt"
+        return gdb_string
+
+    #---------------------------------------------------------------------------
+    # Public interface
+    #---------------------------------------------------------------------------
+    def inspect(self, file_and_line: str,
+                violation: str,
+                baseline: str,
+                binary: Optional[str],
+                skip_tracing: bool,
+                usedef: bool,
+                debug_trace: bool) -> None:
+        traces = _parse_traces_info(file_and_line, baseline)
+
+        dbg_traces = []
+
+        # Create a GDB script to reach the specified line
+        for idx, (trace, line) in enumerate(traces):
+            # If the specified line refers to the leak trace, we need to find the corresponding
+            # line in the _debug_ trace.
+            if not debug_trace:
+                # Find the target PC in the leak trace
+                pc, leak_type, n_occurrences = self.find_leak_pc(trace, line)
+                regenerate_trace = not skip_tracing
+                # Generate the debug trace and find the corresponding line
+                dbg_trace, dbg_line = self.find_dbg_line(trace, pc, leak_type, n_occurrences, regenerate_trace)
+            else:
+                dbg_trace = trace
+                dbg_line = line
+                _, decoded = self._decoder.decode_trace_file(dbg_trace)
+                assert len(decoded) == 1
+                self.debug_trace = decoded[0]
+
+            # Create the gdb script to reach the target line
+            script = self.generate_gdb_script(dbg_trace, dbg_line)
+            script_name = f"spec_{idx}.gdb"
+            with open(script_name, "w") as f:
+                f.write(script)
+            # Print the gdb command (user can copy-paste in separate terminal)
+            program_cmd = self._get_original_cmd(trace).split(" -- ")[1]
+            print(f"\n====== GDB Command:\ngdb -x {script_name} --args {program_cmd}\n======")
+
+            # Append the already parsed trace, used later for use-def analysis
+            raw_dbg_trace = self.debug_trace
+            dbg_traces.append((raw_dbg_trace, dbg_line))
+
+        if usedef:
+            # Create output file
+            trace1, line1 = traces[0]
+            use_def_file = trace1.replace('.trace', '.usedef').replace('.dbgtrace', '.usedef')
+            print(f"\n====== Printing use-def information at {use_def_file}")
+            # Setup symbol server
+            symbols = SymbolServer("") if binary is None else CombinedSymbolServer(binary)  # GdbSymbolServer(binary)
+            # Print textual representation of the use-def chain to a file
+            tracker = UseDefTracker(use_def_file, self._config, symbols)
+
+            if len(dbg_traces) == 1:
+                dbg_traces.append((None, None))
+
+            graph = tracker.analyze(dbg_traces[0][0], dbg_traces[0][1], dbg_traces[1][0], dbg_traces[1][1], violation)
+            # Print use-def graph to a dot file
+            dot_file = use_def_file + ".dot"
+            print(f"\n====== Printing graph at {dot_file}")
+            graph.draw(dot_file)
diff --git a/pyproject.toml b/pyproject.toml
index 70f2d5aa9..b5add552c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,8 @@ dependencies = [
     "setuptools",
    "cffi",
     "types-cffi",
+    "tqdm",
+    "types-tqdm"
 ]
 
 maintainers = [{name = "Oleksii Oleksenko", email = ""}]
diff --git a/rvzr/model_dynamorio/backend/dispatcher.cpp b/rvzr/model_dynamorio/backend/dispatcher.cpp
index 142edabec..b4562a115 100644
--- a/rvzr/model_dynamorio/backend/dispatcher.cpp
+++ b/rvzr/model_dynamorio/backend/dispatcher.cpp
@@ -18,6 +18,8 @@
 #include "cli.hpp"
 #include "dispatcher.hpp"
+#include "dr_ir_instr.h"
+#include "dr_ir_opnd.h"
 #include "factory.hpp"
 #include "observables.hpp"
 #include "util.hpp"
@@ -40,7 +42,7 @@ extern std::unique_ptr<Dispatcher> glob_dispatcher; // NOLINT
 static pc_t instruction_dispatch(dr_mcontext_t *mc, void *dc, const Dispatcher *dispatcher,
                                  instr_obs_t instr)
 {
-    dispatcher->logger->log_instruction(instr, mc, dispatcher->speculator->get_nesting_level());
+    dispatcher->logger->log_instruction(instr, mc, dc, dispatcher->speculator->get_nesting_level());
     dispatcher->tracer->observe_instruction(instr, mc, dc);
     const pc_t next_pc = dispatcher->speculator->handle_instruction(instr, mc, dc);
     return next_pc;
diff --git a/rvzr/model_dynamorio/backend/factory.cpp b/rvzr/model_dynamorio/backend/factory.cpp
index a0e06ac39..4e64f0b78 100644
--- a/rvzr/model_dynamorio/backend/factory.cpp
+++ b/rvzr/model_dynamorio/backend/factory.cpp
@@ -110,7 +110,7 @@ unique_ptr<Logger> create_logger(const string &out_path, int level, bool print)
 
     // Sanitize log level
     if (level >= Logger::log_level_t::LOG_MAX) {
-        level = Logger::log_level_t::LOG_MAX - 1;
+        level = Logger::log_level_t::LOG_MAX;
     } else if (level < 0) {
         level = 0;
     }
diff --git a/rvzr/model_dynamorio/backend/include/logger.hpp b/rvzr/model_dynamorio/backend/include/logger.hpp
index 40b1d19ca..5b869959e 100644
--- a/rvzr/model_dynamorio/backend/include/logger.hpp
+++ b/rvzr/model_dynamorio/backend/include/logger.hpp
@@ -29,7 +29,8 @@ class Logger
         LOG_INSTRUCTIONS = 1, // Report PC, registers, memory operations and exceptions
         LOG_SPEC = 2,         // Also report rollbacks and checkpoints
         LOG_DISASM = 3,       // Also report module_name+offset of each instruction
-        LOG_MAX = 4,
+        LOG_DEF_USE = 4,      // Also report DEFs/USEs for an instruction
+        LOG_MAX = 5,
     };
 
     /// @param logs_path Path of the file where to dump the binary logs
@@ -51,7 +52,8 @@ class Logger
 
     /// @brief log the PC and registers of the current instruction, and whether it is speculative or
    /// not
-    void log_instruction(instr_obs_t instr, dr_mcontext_t *mc, unsigned int nesting_level);
+    void log_instruction(instr_obs_t instr, dr_mcontext_t *mc, void *dc,
+                         unsigned int nesting_level);
     /// @brief log a memory operation, including the value that is currently stored at the address
     void log_mem_access(bool is_write, void *address, uint64_t size);
     /// @brief log an exception
@@ -64,6 +66,8 @@ class Logger
     void log_rollback_store(uint64_t addr, uint64_t val, size_t size, uint64_t nesting_level);
     /// @brief log end of trace
     void log_eot();
+    /// @brief log multi-way branch target
+    void log_mbr(uint64_t source, uint64_t target);
 
 private:
     static constexpr const unsigned buf_sz = 8 * 1024;
diff --git a/rvzr/model_dynamorio/backend/include/types/debug_trace.hpp b/rvzr/model_dynamorio/backend/include/types/debug_trace.hpp
index f5290d15f..c5f1fbe68 100644
--- a/rvzr/model_dynamorio/backend/include/types/debug_trace.hpp
+++ b/rvzr/model_dynamorio/backend/include/types/debug_trace.hpp
@@ -22,6 +22,8 @@ enum class debug_trace_entry_type_t : uint8_t {
     ENTRY_ROLLBACK = 7,
     ENTRY_ROLLBACK_STORE = 8,
     ENTRY_REG_DUMP_EXTENDED = 9,
+    ENTRY_DEF_USE = 10,
+    ENTRY_IND = 11,
 };
 
 /// @brief Pretty-printer for trace_entry_type_t
@@ -48,12 +50,25 @@ static constexpr const char *to_string(const debug_trace_entry_type_t &type)
&type) return "ROLLBACK"; case debug_trace_entry_type_t::ENTRY_REG_DUMP_EXTENDED: return "REG_DUMP2"; + case debug_trace_entry_type_t::ENTRY_DEF_USE: + return "DEF_USE"; + case debug_trace_entry_type_t::ENTRY_IND: + return "IND"; } return "UNKNOWN"; } struct debug_trace_entry_t { + // Size of the initial padding + static constexpr const unsigned PADDING_SIZE = 8; + // Entries have a fixed size + static constexpr const unsigned TOTAL_SIZE = 64; + // Calculate how many characters of the module name fit in an entry + static constexpr const unsigned MAX_LOC_LEN = TOTAL_SIZE - PADDING_SIZE - sizeof(uint64_t); + // Calculate how many defined or used registers fit in an entry + static constexpr const unsigned MAX_REGS_NUM = ((TOTAL_SIZE - PADDING_SIZE) / 4) / 2; + // What does this entry contain debug_trace_entry_type_t type; // Nested speculation (0 is architectural) @@ -92,7 +107,7 @@ struct debug_trace_entry_t { // ENTRY_LOC (module name and offset, for disassembly) struct { uint64_t offset; - std::array module_name; // NOLINT + std::array module_name; } loc; // ENTRY_EXCEPTION struct { @@ -117,6 +132,18 @@ struct debug_trace_entry_t { size_t size; uint64_t nesting_level; } rollback_store; + // ENTRY_DEF_USE + struct { + uint16_t reg_def[MAX_REGS_NUM]; // NOLINT + uint16_t mem_def[MAX_REGS_NUM]; // NOLINT + uint16_t reg_use[MAX_REGS_NUM]; // NOLINT + uint16_t mem_use[MAX_REGS_NUM]; // NOLINT + } def_use; + // ENTRY_IND + struct { + uint64_t source; + uint64_t target; + } ind; }; /// @param Declare a marker to identify traces of this type @@ -169,11 +196,13 @@ struct debug_trace_entry_t { case debug_trace_entry_type_t::ENTRY_EOT: out << "---- END OF TRACE ----\n"; break; + case debug_trace_entry_type_t::ENTRY_CHECKPOINT: out << " rollback_pc: " << std::hex << checkpoint.rollback_pc; out << " (storelog_sz: " << std::dec << checkpoint.cur_store_log_size; out << " window_sz: " << std::dec << checkpoint.cur_window_size << ")"; break; + case debug_trace_entry_type_t::ENTRY_ROLLBACK: out << " rollback_pc: " << std::hex << rollback.rollback_pc; out << " (nesting: " << std::dec << rollback.nesting << ")"; @@ -185,6 +214,7 @@ struct debug_trace_entry_t { out << " (sz: " << std::dec << rollback_store.size; out << " nesting: " << std::dec << rollback_store.nesting_level << ")"; break; + case debug_trace_entry_type_t::ENTRY_REG_DUMP_EXTENDED: out << " rsp: 0x" << std::hex << regs_2.rsp; out << " rbp: 0x" << std::hex << regs_2.rbp; @@ -194,8 +224,50 @@ struct debug_trace_entry_t { out << " r10: 0x" << std::hex << regs_2.r10; out << " r11: 0x" << std::hex << regs_2.r11; break; + + case debug_trace_entry_type_t::ENTRY_DEF_USE: { + out << "REG_DEFS = ["; + char delimiter = ' '; + for (const auto ®_id : def_use.reg_def) { + if (reg_id == 0) + break; + out << delimiter << " " << std::dec << (int)reg_id; + delimiter = ','; + } + out << "]; REG_USES = ["; + delimiter = ' '; + for (const auto ®_id : def_use.reg_use) { + if (reg_id == 0) + break; + out << delimiter << " " << std::dec << (int)reg_id; + delimiter = ','; + } + out << "]; MEM_DEFS = ["; + delimiter = ' '; + for (const auto ®_id : def_use.mem_def) { + if (reg_id == 0) + break; + out << delimiter << " " << std::dec << (int)reg_id; + delimiter = ','; + } + out << "]; MEM_USES = ["; + delimiter = ' '; + for (const auto ®_id : def_use.mem_use) { + if (reg_id == 0) + break; + out << delimiter << " " << std::dec << (int)reg_id; + delimiter = ','; + } + out << "];"; + break; } + case debug_trace_entry_type_t::ENTRY_IND: + out << std::hex << ind.source << " --> 
" << std::hex << ind.target; + break; + } out << "\n"; } }; + +static_assert(sizeof(debug_trace_entry_t) == debug_trace_entry_t::TOTAL_SIZE); diff --git a/rvzr/model_dynamorio/backend/logger.cpp b/rvzr/model_dynamorio/backend/logger.cpp index 802b6d75a..e0f963af4 100644 --- a/rvzr/model_dynamorio/backend/logger.cpp +++ b/rvzr/model_dynamorio/backend/logger.cpp @@ -18,6 +18,7 @@ #include #include "logger.hpp" +#include "types/debug_trace.hpp" // ================================================================================================= // Local helper functions @@ -79,7 +80,8 @@ void Logger::close() { log.clear(); } // Logging methods // ================================================================================================= -void Logger::log_instruction(instr_obs_t instr, dr_mcontext_t *mc, unsigned int nesting_level) +void Logger::log_instruction(instr_obs_t instr, dr_mcontext_t *mc, void *dc, + unsigned int nesting_level) { if (not is_enabled()) return; @@ -126,6 +128,81 @@ void Logger::log_instruction(instr_obs_t instr, dr_mcontext_t *mc, unsigned int std::move(module_name.begin(), module_name.end(), loc_entry.loc.module_name.begin()); log.push_back(loc_entry); } + + if (log_level >= LOG_DEF_USE) { + // Decode the instruction + instr_noalloc_t noalloc; + instr_noalloc_init(dc, &noalloc); + instr_t *cur_instr = instr_from_noalloc(&noalloc); + byte *next_pc = decode(dc, (byte *)instr.pc, cur_instr); + DR_ASSERT_MSG(next_pc != nullptr, + "[ERROR] cond_speculator: Failed to decode instruction\n"); + + debug_trace_entry_t def_use_entry = {}; + def_use_entry.type = debug_trace_entry_type_t::ENTRY_DEF_USE; + def_use_entry.nesting_level = cur_nesting_level; + + // Log source registers (uses) + int num_srcs = instr_num_srcs(cur_instr); + DR_ASSERT_MSG(num_srcs < debug_trace_entry_t::MAX_REGS_NUM, + "Too many source registers to log DEF-USE"); + int cur_src_reg_idx = 0; + int cur_src_mem_idx = 0; + for (int i = 0; i < num_srcs; i++) { + opnd_t src = instr_get_src(cur_instr, i); + + if (opnd_is_reg(src)) { + reg_id_t reg = opnd_get_reg(src); + def_use_entry.def_use.reg_use[cur_src_reg_idx] = (uint16_t)reg; + cur_src_reg_idx += 1; + } else if (opnd_is_memory_reference(src)) { + reg_id_t base = opnd_get_base(src); + reg_id_t index = opnd_get_index(src); + if (base != DR_REG_NULL) { + def_use_entry.def_use.mem_use[cur_src_mem_idx] = (uint16_t)base; + cur_src_mem_idx += 1; + } + if (index != DR_REG_NULL) { + def_use_entry.def_use.mem_use[cur_src_mem_idx] = (uint16_t)index; + cur_src_mem_idx += 1; + } + } + DR_ASSERT_MSG(cur_src_reg_idx < debug_trace_entry_t::MAX_REGS_NUM && + cur_src_mem_idx < debug_trace_entry_t::MAX_REGS_NUM, + "Too many source registers to log DEF-USE"); + } + + // Log destination registers (defs) + int num_dsts = instr_num_dsts(cur_instr); + int cur_dst_reg_idx = 0; + int cur_dst_mem_idx = 0; + DR_ASSERT_MSG(num_dsts < debug_trace_entry_t::MAX_REGS_NUM, + "Too many destination registers to log DEF-USE"); + for (int i = 0; i < num_dsts; i++) { + opnd_t dst = instr_get_dst(cur_instr, i); + if (opnd_is_reg(dst)) { + reg_id_t reg = opnd_get_reg(dst); + def_use_entry.def_use.reg_def[cur_dst_reg_idx] = (uint16_t)reg; + cur_dst_reg_idx += 1; + } else if (opnd_is_memory_reference(dst)) { + reg_id_t base = opnd_get_base(dst); + reg_id_t index = opnd_get_index(dst); + if (base != DR_REG_NULL) { + def_use_entry.def_use.mem_def[cur_dst_mem_idx] = (uint16_t)base; + cur_dst_mem_idx += 1; + } + if (index != DR_REG_NULL) { + def_use_entry.def_use.mem_def[cur_dst_mem_idx] = 
(uint16_t)index;
+                    cur_dst_mem_idx += 1;
+                }
+            }
+            DR_ASSERT_MSG(cur_dst_reg_idx < debug_trace_entry_t::MAX_REGS_NUM &&
+                              cur_dst_mem_idx < debug_trace_entry_t::MAX_REGS_NUM,
+                          "Too many destination registers to log DEF-USE");
+        }
+
+        log.push_back(def_use_entry);
+    }
 }
 
 void Logger::log_mem_access(bool is_write, void *address, uint64_t size)
@@ -224,3 +301,16 @@ void Logger::log_eot()
 
     log.push_back({.type = debug_trace_entry_type_t::ENTRY_EOT});
 }
+
+void Logger::log_mbr(uint64_t source, uint64_t target)
+{
+    if (not is_enabled())
+        return;
+
+    log.push_back({.type = debug_trace_entry_type_t::ENTRY_IND,
+                   .nesting_level = cur_nesting_level,
+                   .ind{
+                       .source = source,
+                       .target = target,
+                   }});
+}
diff --git a/rvzr/model_dynamorio/backend/tracers/ind.cpp b/rvzr/model_dynamorio/backend/tracers/ind.cpp
index 089ed4af9..ab38126a5 100644
--- a/rvzr/model_dynamorio/backend/tracers/ind.cpp
+++ b/rvzr/model_dynamorio/backend/tracers/ind.cpp
@@ -95,20 +95,22 @@ void TracerInd::observe_instruction(instr_obs_t instr, dr_mcontext_t *mc, void *dc)
 
     instr_noalloc_t noalloc;
     const auto &mbr_info = get_mbr_info(instr, mc, dc, &noalloc);
-    // Skip if not a branch
+    // Skip if it's not an MBR
     if (not mbr_info)
         return;
 
-    // Log source
+    // Trace source
     trace.push_back({
-        .addr = mbr_info->src,
+        .addr = instr.pc,
         .size = 0,
         .type = trace_entry_type_t::ENTRY_PC,
     });
-    // Log destination
+    // Trace target
     trace.push_back({
         .addr = mbr_info->target,
         .size = 0,
         .type = trace_entry_type_t::ENTRY_IND,
     });
+    // Log indirect call
+    logger.log_mbr(mbr_info->src, mbr_info->target);
 }
diff --git a/rvzr/model_dynamorio/trace_decoder.py b/rvzr/model_dynamorio/trace_decoder.py
index 5e9d4ebd2..3044d3ed8 100644
--- a/rvzr/model_dynamorio/trace_decoder.py
+++ b/rvzr/model_dynamorio/trace_decoder.py
@@ -73,6 +73,8 @@ class DebugTraceEntryType(Enum):
     ENTRY_ROLLBACK = 7
     ENTRY_ROLLBACK_STORE = 8
     ENTRY_REG_DUMP_EXTENDED = 9
+    ENTRY_DEF_USE = 10
+    ENTRY_IND = 11
 
 
 _DEBUG_TRACE_ENTRY_T: Final[str] = "struct debug_trace_entry_t"
@@ -141,6 +143,18 @@ class DebugTraceEntryType(Enum):
         size_t size;
         uint64_t nesting_level;
     } rollback_store;
+    // ENTRY_DEF_USE
+    struct {
+        uint16_t reg_def[7]; // NOLINT
+        uint16_t mem_def[7]; // NOLINT
+        uint16_t reg_use[7]; // NOLINT
+        uint16_t mem_use[7]; // NOLINT
+    } def_use;
+    // ENTRY_IND
+    struct {
+        uint64_t source;
+        uint64_t target;
+    } ind;
 };
 };
 """
diff --git a/tests/consfuzz/unit_config.py b/tests/consfuzz/unit_config.py
new file mode 100644
index 000000000..95734f2f9
--- /dev/null
+++ b/tests/consfuzz/unit_config.py
@@ -0,0 +1,263 @@
+"""
+Copyright (C) Microsoft Corporation
+SPDX-License-Identifier: MIT
+"""
+# pylint: disable=missing-function-docstring
+# pylint: disable=missing-class-docstring
+# pylint: disable=protected-access
+
+import os
+import tempfile
+import shutil
+import unittest
+from unittest.mock import patch, mock_open
+from io import StringIO
+
+import yaml
+from consfuzz.config import Config, _ConfigException
+
+
+class TestConfig(unittest.TestCase):
+
+    # ==============================================================================================
+    # Helper methods
+
+    def setUp(self) -> None:
+        self._reset_config_instantiation()
+
+        # Create temporary directories for testing
+        self.temp_dir = tempfile.mkdtemp()
+        self.working_dir = os.path.join(self.temp_dir, "working")
+        self.archive_dir = os.path.join(self.temp_dir, "archive")
+        self.model_root = os.path.join(self.temp_dir, "model")
+        self.afl_root = os.path.join(self.temp_dir, "afl")
+        self.afl_seed_dir = 
os.path.join(self.temp_dir, "seeds") + + # Create the required directories + os.makedirs(self.working_dir) + os.makedirs(self.archive_dir) + os.makedirs(self.model_root) + os.makedirs(self.afl_root) + os.makedirs(self.afl_seed_dir) + + # Basic valid config + self.valid_config = f""" +working_dir: {self.working_dir} +archive_dir: {self.archive_dir} +model_root: {self.model_root} +afl_root: {self.afl_root} +afl_seed_dir: {self.afl_seed_dir} +""" + + def tearDown(self) -> None: + # Clean up temporary directories + shutil.rmtree(self.temp_dir, ignore_errors=True) + self._reset_config_instantiation() + + def _reset_config_instantiation(self) -> None: + # Helper method to reset the Config instantiation flag + Config._Config__config_instantiated = False # type: ignore + + # ============================================================================================== + # Tests + + def test_config_single_instantiation(self) -> None: + # Test that Config can only be instantiated once + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=self.valid_config)): + Config("config.yaml", "fuzz") + with self.assertRaises(RuntimeError): + Config("config.yaml", "fuzz") + + def test_config_nonexistent_yaml(self) -> None: + # Test that missing config file raises SystemExit + with self.assertRaises(SystemExit): + Config("nonexistent.yaml", "fuzz") + + def test_config_invalid_yaml(self) -> None: + # Test that invalid YAML content raises SystemExit + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data="invalid: yaml: content")): + with self.assertRaises(yaml.scanner.ScannerError): # type: ignore + Config("config.yaml", "fuzz") + self._reset_config_instantiation() + + with patch("builtins.open", mock_open(read_data="non-dictionary content")): + with self.assertRaises(SystemExit): + Config("config.yaml", "fuzz") + + def test_config_missing_required_fields(self) -> None: + # Test that missing required fields raises _ConfigException + with patch("os.path.exists", return_value=True): + # working_dir + config_data = "some_other_field: value" + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException) as cm: + Config("config.yaml", "fuzz") + self.assertIn("working_dir", str(cm.exception)) + + def test_config_empty_working_dir(self) -> None: + # Test configuration with empty working directory + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=self.valid_config)): + config = Config("config.yaml", "fuzz") + self.assertEqual(config.working_dir, self.working_dir) + self.assertTrue(os.path.exists(config.stage1_wd)) + self.assertTrue(os.path.exists(config.stage2_wd)) + self.assertTrue(os.path.exists(config.stage3_wd)) + + def test_config_nonexistent_working_dir(self) -> None: + # Test that nonexistent working directory raises exception + config_data = f""" +working_dir: /nonexistent/directory +model_root: {self.model_root} +afl_root: {self.afl_root} +afl_seed_dir: {self.afl_seed_dir} +""" + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException): + Config("config.yaml", "fuzz") + + @patch('sys.stdout', new_callable=StringIO) + def test_config_force_overwrite(self, mock_stdout: StringIO) -> None: + # Test force_working_dir_overwrite functionality + # Create some files in working directory + test_file = os.path.join(self.working_dir, 
"test.txt") + with open(test_file, "w") as f: + f.write("test") + + config_data = f""" +working_dir: {self.working_dir} +force_working_dir_overwrite: true +model_root: {self.model_root} +afl_root: {self.afl_root} +afl_seed_dir: {self.afl_seed_dir} +""" + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + Config("config.yaml", "fuzz") + # Check that the working directory is empty + self.assertEqual(len(os.listdir(self.working_dir)), 3) # Only stage dirs + + output = mock_stdout.getvalue() + self.assertIn("removing", output) + + @patch('sys.stdout', new_callable=StringIO) + def test_config_archive_functionality(self, mock_stdout: StringIO) -> None: + # Test archiving functionality when working dir is not empty + # Create a file in working directory + test_file = os.path.join(self.working_dir, "test.txt") + with open(test_file, "w") as f: + f.write("test content") + + # We need a functional os.path.exits here, so we cannot mock it + # Thus, we will create a temporary file for the config.yaml + config_file = os.path.join(self.temp_dir, "config.yaml") + with open(config_file, "w") as f: + f.write(self.valid_config) + Config(config_file, "fuzz") + + output = mock_stdout.getvalue() + self.assertIn("Archived", output) + + # Check that archive was created + archives = os.listdir(self.archive_dir) + self.assertEqual(len(archives), 1) + self.assertTrue(archives[0].endswith(".tar.gz")) + + def test_config_no_archive_no_force(self) -> None: + # Test that exception is raised when working dir is not empty without archive or force + # Create a file in working directory + test_file = os.path.join(self.working_dir, "test.txt") + with open(test_file, "w") as f: + f.write("test") + + config_data = f""" +working_dir: {self.working_dir} +model_root: {self.model_root} +afl_root: {self.afl_root} +afl_seed_dir: {self.afl_seed_dir} +""" + + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException): + Config("config.yaml", "fuzz") + + def test_config_invalid_model_root(self) -> None: + # Test that invalid model_root raises exception + config_data = f""" +working_dir: {self.working_dir} +model_root: /nonexistent/model +afl_root: {self.afl_root} +afl_seed_dir: {self.afl_seed_dir} +""" + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException) as cm: + Config("config.yaml", "fuzz") + self.assertIn("model_root", str(cm.exception)) + + def test_config_invalid_afl_root(self) -> None: + # Test that invalid afl_root raises exception + config_data = f""" +working_dir: {self.working_dir} +model_root: {self.model_root} +afl_root: /nonexistent/afl +afl_seed_dir: {self.afl_seed_dir} +""" + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException) as cm: + Config("config.yaml", "fuzz") + self.assertIn("afl_root", str(cm.exception)) + + def test_config_missing_afl_seed_dir(self) -> None: + # Test that missing afl_seed_dir raises exception + config_data = f""" +working_dir: {self.working_dir} +model_root: {self.model_root} +afl_root: {self.afl_root} +""" + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException) as cm: + Config("config.yaml", "fuzz") + 
self.assertIn("afl_seed_dir", str(cm.exception)) + + def test_config_internal_option_rejection(self) -> None: + # Test that internal options cannot be set via YAML + config_data = f""" +working_dir: {self.working_dir} +stage1_wd: /some/path +model_root: {self.model_root} +afl_root: {self.afl_root} +afl_seed_dir: {self.afl_seed_dir} +""" + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=config_data)): + with self.assertRaises(_ConfigException) as cm: + Config("config.yaml", "fuzz") + self.assertIn("internal use only", str(cm.exception)) + + def test_config_stage_directories(self) -> None: + # Test different stage directory behaviors + # Test stage2 with existing empty directory + os.makedirs(os.path.join(self.working_dir, "stage2")) + + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=self.valid_config)): + config = Config("config.yaml", "stage2") + self.assertEqual(os.listdir(config.stage2_wd), []) + + self._reset_config_instantiation() + + # Test pub_gen stage + # We need a functional os.path.exits here, so we cannot mock it + # Thus, we will create a temporary file for the config.yaml + config_file = os.path.join(self.temp_dir, "config.yaml") + with open(config_file, "w") as f: + f.write(self.valid_config) + config = Config(config_file, "pub_gen") + self.assertEqual(os.listdir(config.stage1_wd), []) diff --git a/tests/runtests.sh b/tests/runtests.sh index 5572d4df9..16c7a2c37 100755 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -1,7 +1,8 @@ #!/usr/bin/env bash AVAILABLE_STAGES=("type_check" "code_style_check" "core_unit_tests" "package_install_test" - "km_tests" "arch_unit_tests" "acceptance_tests") + "km_tests" "arch_unit_tests" "acceptance_tests" + "consfuzz_type_check" "consfuzz_style_check" "consfuzz_unit_test") function parse_args() { POSITIONAL_ARGS=() @@ -201,6 +202,52 @@ function acceptance_tests() { fi } +function consfuzz_type_check() { + local enable_strict=$1 + + echo "" + echo "===== Consfuzz MyPy =====" + cd $SCRIPT_DIR/.. || exit + MYPYPATH=rvzr/ python3 -m mypy --strict consfuzz/*.py \ + --no-warn-unused-ignores --untyped-calls-exclude=elftools + cd - >/dev/null || exit + + if [ "$enable_strict" = true ]; then + echo "" + cd $SCRIPT_DIR/.. || exit + echo "===== STRICT CHECK: Consfuzz MyPy (Unit Tests) =====" + MYPYPATH=rvzr/ python3 -m mypy --strict tests/consfuzz/unit_*.py \ + --no-warn-unused-ignores --untyped-calls-exclude=elftools + cd - >/dev/null || exit + fi +} + +function consfuzz_style_check() { + local enable_strict=$1 + + echo "" + echo "===== Consfuzz style check =====" + cd $SCRIPT_DIR/.. || exit + python3 -m flake8 --max-line-length 100 --ignore E402,W503 consfuzz --count --show-source --statistics + cd - >/dev/null || exit + + if [ "$enable_strict" = true ]; then + echo "" + cd $SCRIPT_DIR/.. || exit + echo "===== STRICT CHECK: Consfuzz PyLint =====" + python3 -m pylint --rcfile=.pylintrc consfuzz/*.py + cd - >/dev/null || exit + fi +} + +function consfuzz_unit_test() { + echo "" + echo "===== Consfuzz unit tests =====" + cd $SCRIPT_DIR/.. 
|| exit + python3 -m unittest tests.consfuzz.unit_config -v + cd - >/dev/null || exit +} + # ================================================================================================== # Runners # ================================================================================================== @@ -229,6 +276,15 @@ function run_one_stage() { acceptance_tests) acceptance_tests ;; + consfuzz_type_check) + consfuzz_type_check $STRICT + ;; + consfuzz_style_check) + consfuzz_style_check $STRICT + ;; + consfuzz_unit_test) + consfuzz_unit_test + ;; *) echo "Unknown stage: $stage" exit 1 @@ -266,6 +322,10 @@ function main() { km_tests arch_unit_tests acceptance_tests + + consfuzz_type_check $STRICT + consfuzz_style_check $STRICT + consfuzz_unit_test } main $@