diff --git a/.gitignore b/.gitignore
index 4c0b550..887c9e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,7 +8,7 @@ bin
 # Ignore auto-generated benchmarking files and dependencies
 benchmarking/jars/
 benchmarking/datasets/
-benchmarking/results/
+benchmarking/outputs/
 benchmarking/sources/
 benchmarking/compiled_classes/
diff --git a/benchmarking/benchmarking.md b/benchmarking/benchmarking.md
index f796886..af05aa3 100644
--- a/benchmarking/benchmarking.md
+++ b/benchmarking/benchmarking.md
@@ -1,6 +1,6 @@
 # Benchmarking
 
-# Stage Zero
+# Stage Zero: Initialization
 
 The first stage of benchmarking involves downloading all the necessary files
 required, i.e. Jar files and the NJR-1 Dataset.
@@ -11,7 +11,7 @@
 In order for benchmarking to run, the jar files for all the following
 dependencies must be located in the `JARS_DIR` directory, in their respective
 sub-folder (i.e. `JARS_DIR/errorprone`, `JARS_DIR/nullaway`, and
-`JARS_DIR/annotator`). If not present, they will be automaticlaly downloaded
+`JARS_DIR/annotator`). If not present, they will be automatically downloaded
 from the Maven Repository.
 ## Note: Different versions of the following dependencies may not be compatible.
 The newest versions of each project that are confirmed to work together are:
@@ -66,3 +66,13 @@ before NullAway can process them.
 - [Checker-Qual](https://mvnrepository.com/artifact/org.checkerframework/checker-qual/) -
   checker-qual contains annotations (type qualifiers) that a programmer writes
   to specify Java code for type-checking by the Checker Framework.
+
+# Stage One: Annotation
+
+Stage One is the most expensive stage in terms of time and computation. For
+each program in the NJR-1 dataset, it runs NullAwayAnnotator to prepare the
+program for refactoring and to obtain an accurate count of the NullAway errors
+in the original code, refactors the program using VGR, and then re-runs the
+annotator to obtain an updated error count. This cycle
+(annotate -> refactor -> annotate) is completed for each program in NJR-1
+sequentially.
diff --git a/benchmarking/run_benchmark.py b/benchmarking/run_benchmark.py
index fd20a3b..731e6db 100644
--- a/benchmarking/run_benchmark.py
+++ b/benchmarking/run_benchmark.py
@@ -1,8 +1,20 @@
 # pyright: basic
+import argparse
+import csv
 from datetime import datetime
 import os
+import re
 import shutil
+import subprocess
 import sys
+from typing import TypedDict
+
+
+class BenchmarkingResult(TypedDict):
+    benchmark: str
+    initial_error_count: int | str  # "Error" for failed runs; error count otherwise
+    refactored_error_count: int | str  # "Error" for failed runs; error count otherwise
+
 
 BENCHMARKING_DIR = "./benchmarking"  # Base directory for benchmarking inputs / outputs
 DATASETS_DIR = (
@@ -14,7 +26,9 @@
 )
 DATASETS_REFACTORED_SAVE_DIR = f"{DATASETS_DIR}/old-runs/refactored"
 # Directory for datasets that will be modified
-OUTPUT_DIR = f"{BENCHMARKING_DIR}/results"  # Directory for storing outputs
+OUTPUT_DIR = f"{BENCHMARKING_DIR}/outputs"  # Directory for storing outputs
+OUTPUT_LOGS_DIR = f"{OUTPUT_DIR}/logs"  # Directory for storing logs
+RESULTS_DIR = f"{OUTPUT_DIR}/results"  # Directory for storing result CSVs
 SRC_DIR = f"{BENCHMARKING_DIR}/sources"  # Directory for storing text files listing source files for each project
 COMPILED_CLASSES_DIR = (
     f"{BENCHMARKING_DIR}/compiled_classes"  # Directory for storing compiled_classes
@@ -27,8 +41,8 @@
 ANNOTATOR_JAR_DIR = f"{JARS_DIR}/annotator"
 PROCESSOR_JARS = [
     {
-        "PATH": f"{ERRORPRONE_JAR_DIR}/error_prone_core-2.38.0-with-dependencies.jar",
-        "DOWNLOAD_URL": "https://repo1.maven.org/maven2/com/google/errorprone/error_prone_core/2.38.0/error_prone_core-2.38.0.jar",
+        "PATH": f"{ERRORPRONE_JAR_DIR}/error_prone_core-2.35.1-with-dependencies.jar",
+        "DOWNLOAD_URL": "https://repo1.maven.org/maven2/com/google/errorprone/error_prone_core/2.35.1/error_prone_core-2.35.1.jar",
     },
     {
         "PATH": f"{ERRORPRONE_JAR_DIR}/dataflow-errorprone-3.49.3-eisop1.jar",
@@ -78,13 +92,19 @@
     "-J--add-opens=jdk.compiler/com.sun.tools.javac.comp=ALL-UNNAMED",
 ]
 
+# Arguments
+DEBUG = False  # Debug Mode
+MODULE = "All"  # Refactoring Module to use
+
+benchmark_start_time_string = f"{datetime.now():%Y-%m-%d_%H:%M:%S}"
+
 
 # The initialization stage for benchmarking
 # Creates the necessary directories, saves old refactored datasets, confirms the existence of the necessary jar files, and downloads NJR-1 dataset if it has not been already.
 def stage_zero():
     print("Beginning Stage Zero: Initialization...")
 
-    save_dir = f"{DATASETS_REFACTORED_SAVE_DIR}/{datetime.now():%Y-%m-%d_%H:%M:%S}"
+    save_dir = f"{DATASETS_REFACTORED_SAVE_DIR}/{benchmark_start_time_string}"
     print(f"Saving existing refactored datasets to {save_dir}")
     if os.path.exists(DATASETS_REFACTORED_DIR):
         try:
@@ -93,11 +113,13 @@ def stage_zero():
             print(
                 f"Fatal Error: Could not save existing refactored datasets to {save_dir}. Move operation failed. Exiting program."
             )
-        sys.exit(1)
+            sys.exit(1)
 
     print("Initializing benchmarking folders and datasets")
     os.makedirs(SRC_DIR, exist_ok=True)
     os.makedirs(OUTPUT_DIR, exist_ok=True)
+    os.makedirs(OUTPUT_LOGS_DIR, exist_ok=True)
+    os.makedirs(RESULTS_DIR, exist_ok=True)
     os.makedirs(DATASETS_DIR, exist_ok=True)
     os.makedirs(DATASETS_CACHE_DIR, exist_ok=True)
     os.makedirs(COMPILED_CLASSES_DIR, exist_ok=True)
@@ -128,7 +150,7 @@ def stage_zero():
         sys.exit(1)
 
     print("Creating copy of NJR-1 datasets cache to refactor...")
-    res = os.system(f"cp -av {DATASETS_CACHE_DIR} {DATASETS_REFACTORED_DIR}")
+    res = os.system(f"cp -a {DATASETS_CACHE_DIR} {DATASETS_REFACTORED_DIR}")
    if res != 0:
         print(f"Copy dataset cache failed with exit code {res}. Exiting Program")
         sys.exit(1)
@@ -151,11 +173,323 @@ def stage_zero():
     print("Benchmarking Stage Zero Completed\n")
 
 
+def stage_one():
+    """
+    Runs the full benchmarking routine (Annotate -> Count Errors -> Refactor -> Annotate -> Count Errors) for every dataset in the NJR-1 dataset collection and then summarizes the results.
+    """
+    datasets_list = os.listdir(DATASETS_REFACTORED_DIR)
+
+    # List of data structures representing the results of a benchmark
+    results: list[BenchmarkingResult] = []
+
+    for dataset in datasets_list:
+        print(f"Benchmarking {dataset}...")
+        os.makedirs(f"{OUTPUT_DIR}/{dataset}", exist_ok=True)
+
+        ## Step 1: Annotate dataset
+        stage_one_annotate(dataset)
+
+        ## Step 2: Count initial errors
+        old_err_count = stage_one_count_errors(dataset)
+        if old_err_count is None:
+            print(f"Skipping {dataset} due to javac/NullAway crash.")
+            results.append(
+                {
+                    "benchmark": dataset,
+                    "initial_error_count": "Error",
+                    "refactored_error_count": "",
+                }
+            )
+            continue
+
+        ## Step 3: Refactor dataset
+        stage_one_refactor(dataset)
+
+        ## Step 4: Count errors after refactoring
+        new_err_count = stage_one_count_errors(dataset)
+        if new_err_count is None:
+            print(f"Skipping {dataset} due to javac/NullAway crash after refactoring.")
+            results.append(
+                {
+                    "benchmark": dataset,
+                    "initial_error_count": old_err_count,
+                    "refactored_error_count": "Error",
+                }
+            )
+            continue
+
+        print(
+            f"Successfully benchmarked {dataset}. Errors: {old_err_count} --> {new_err_count}\n"
+        )
+        results.append(
+            {
+                "benchmark": dataset,
+                "initial_error_count": old_err_count,
+                "refactored_error_count": new_err_count,
+            }
+        )
+    print("Finished benchmarking datasets.")
+    print("Saving results to CSV...")
+
+    stage_one_save_results(results)
+    return
+
+
+# Utility Functions
+def stage_one_annotate(dataset: str):
+    """
+    Runs NullAwayAnnotator on the passed dataset in order to prepare it for
+    refactoring
+    """
+
+    print(f"Annotating {dataset}...")
+
+    # Create config files
+    os.makedirs(ANNOTATOR_OUT_DIR, exist_ok=True)
+    with open(f"{ANNOTATOR_CONFIG}", "w+") as config_file:
+        _ = config_file.write(f"{NULLAWAY_CONFIG}/\t{SCANNER_CONFIG}\n")
+
+    # Clear annotator output folder (required for annotator to run)
+    shutil.rmtree(ANNOTATOR_OUT_DIR + "/0", ignore_errors=True)
+
+    build_cmd = " ".join(get_build_cmd(dataset))
+    cwd = os.getcwd()
+
+    annotate_cmd: list[str] = [
+        "java",
+        "-jar",
+        ANNOTATOR_JAR,
+        # Absolute path of an Empty Directory where all outputs of AnnotatorScanner and NullAway are serialized.
+        "-d",
+        ANNOTATOR_OUT_DIR,
+        # Command to run Nullaway on target; Should be executable from anywhere
+        "-bc",
+        f'"cd {cwd} && {build_cmd}"',
+        # Path to a TSV file containing value of config paths
+        "-cp",
+        ANNOTATOR_CONFIG,
+        # Fully qualified name of the @Initializer annotation.
+        "-i",
+        "com.uber.nullaway.annotations.Initializer",
+        # Checker name to be used for the analysis.
+        "-cn",
+        "NULLAWAY",
+        # Max depth to traverse as part of the analysis search
+        "--depth",
+        "10",
+    ]
+    res = subprocess.run(annotate_cmd, text=True, capture_output=True)
+    if res.returncode != 0:
+        print(
+            f"Annotation failed with exit code {res.returncode} for dataset {dataset}"
+        )
+        return
+
+    output_log_path = f"{OUTPUT_DIR}/{dataset}/annotator.txt"
+    with open(output_log_path, "w+") as f:
+        f.write(f"CMD:\n\t{' '.join(annotate_cmd)}\n")
+        f.write(f"STDOUT:\n\t{res.stdout}\n")
+        f.write(f"STDERR:\n\t{res.stderr}\n")
+
+    if DEBUG:
+        print(
+            f"Command used to annotate dataset {dataset}: \n\t{' '.join(annotate_cmd)}\n"
+        )
+
+    return
+
+
+def stage_one_refactor(dataset: str):
+    """
+    Runs VGR on the passed dataset
+    """
+
+    print(f"Refactoring {dataset}...")
+
+    output_file = f"{OUTPUT_DIR}/{dataset}/refactoring.txt"
+    dataset_path = f"{DATASETS_REFACTORED_DIR}/{dataset}"
+
+    refactor_cmd: list[str] = ["./gradlew", "run", f"--args='{dataset_path} {MODULE}'"]
+
+    with open(output_file, "w+") as f:
+        res = subprocess.run(
+            " ".join(refactor_cmd), stdout=f, stderr=subprocess.STDOUT, shell=True, check=False
+        )
+
+    if res.returncode != 0:
+        print(
+            f"Running VGRTool failed with exit code {res.returncode} for dataset {dataset}. See {output_file} for more details."
+        )
+
+    if DEBUG:
+        print(
+            f"Refactor Command for dataset {dataset}: {' '.join(refactor_cmd)} (output in {output_file})"
+        )
+    return
+
+
+def stage_one_count_errors(dataset: str):
+    """Builds the passed dataset and counts NullAway errors during the build process."""
+    build_cmd = " ".join(get_build_cmd(dataset))
+    log_file = f"{OUTPUT_LOGS_DIR}/{dataset}-error_count_log-{datetime.now():%Y-%m-%d_%H:%M:%S}.txt"
+    output_file = (
+        f"{OUTPUT_LOGS_DIR}/{dataset}-error_count-{benchmark_start_time_string}.txt"
+    )
+
+    # Build the dataset and redirect all outputs to a log file
+    with open(log_file, "w+") as f:
+        res = subprocess.run(
+            build_cmd,
+            stdout=f,
+            stderr=subprocess.STDOUT,
+            check=False,
+            text=True,
+            shell=True,
+        )
+        f.write(build_cmd)
+
+    # Handle javac / NullAway crash
+    # if res.returncode != 0:
+    #     print(
+    #         f"Building dataset {dataset} failed with exit code {res.returncode}. Skipping dataset..."
+    #     )
+    #     return None  # Return None type so programs which are erroring do not look like real results
+
+    # Read the log file and count occurrences of NullAway errors
+    with open(log_file, "r") as f:
+        error_count = len(re.findall(r"error: \[NullAway\]", f.read()))
+
+    with open(output_file, "a") as f:
+        f.write(f"Error Count: {error_count}\n")
+
+    if DEBUG:
+        print(f"Number of errors found for dataset {dataset}: {error_count}")
+    return error_count
+
+
+def get_build_cmd(dataset: str):
+    """
+    Constructs the full 'javac' build command used to compile the passed dataset.
+    """
+    lib_dir = f"{DATASETS_REFACTORED_DIR}/{dataset}/lib"
+    src_file = get_source_files(dataset)
+    plugin_options = get_plugin_options(dataset)
+
+    build_cmd: list[str] = ["javac"]
+    build_cmd += ERROR_PRONE_EXPORTS
+    build_cmd += [
+        "-d",
+        f"{COMPILED_CLASSES_DIR}",
+        "-cp",
+        f"{lib_dir}:{ANNOTATOR_JAR}",
+        "-XDcompilePolicy=simple",
+        "--should-stop=ifError=FLOW",
+        "-processorpath",
+        f"{PROCESSOR_JAR_PATHS}",
+        f"'{plugin_options}'",
+        "-Xmaxerrs",
+        "0",
+        "-Xmaxwarns",
+        "0",
+        f"@{src_file}",
+    ]
+    return build_cmd
+
+
+def get_source_files(dataset: str):
+    find_srcs_command = [
+        "find",
+        f"{DATASETS_REFACTORED_DIR}/{dataset}/src",
+        "-name",
+        "*.java",
+    ]
+    src_file = f"{SRC_DIR}/{dataset}.txt"
+    with open(src_file, "w+") as f:
+        _ = subprocess.run(find_srcs_command, stdout=f)
+    return src_file
+
+
+def get_plugin_options(dataset: str):
+    """
+    Generates the -Xplugin:ErrorProne option string, including a dynamically generated list of packages to annotate.
+    """
+    dataset_path = f"{DATASETS_REFACTORED_DIR}/{dataset}"
+    find_pkgs_command = (
+        f"find {dataset_path}"
+        + " -name '*.java' -exec awk 'FNR==1 && /^package/ {print $2}' {} + | sed 's/;//' | sort -u | tr '\n\r' ',' | sed 's/,,/,/g' | sed 's/,$//'"
+    )
+
+    pkgs = subprocess.run(
+        find_pkgs_command, shell=True, capture_output=True
+    ).stdout.decode("utf-8")
+
+    # Build the annotated-packages argument from the discovered package list
+    annotated_pkgs = pkgs.strip()
+    annotated_pkgs_arg = f"-XepOpt:NullAway:AnnotatedPackages={annotated_pkgs}"
+
+    return f"-Xplugin:ErrorProne \
+    -XepDisableAllChecks \
+    -Xep:AnnotatorScanner:ERROR \
+    -XepOpt:AnnotatorScanner:ConfigPath={SCANNER_CONFIG} \
+    -Xep:NullAway:ERROR \
+    -XepOpt:NullAway:SerializeFixMetadata=true \
+    -XepOpt:NullAway:FixSerializationConfigPath={NULLAWAY_CONFIG} \
+    {annotated_pkgs_arg}"
+
+
+def stage_one_save_results(results: list[BenchmarkingResult]):
+    """
+    Saves benchmark results to a CSV file.
+    """
+
+    csv_path = f"{RESULTS_DIR}/results-{benchmark_start_time_string}.csv"
+
+    column_names = [
+        "benchmark",
+        "initial_error_count",
+        "refactored_error_count",
+    ]
+
+    # Write CSV
+    with open(csv_path, "w+", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=column_names)
+        writer.writeheader()
+
+        for result in results:
+            row = {
+                "benchmark": result["benchmark"],
+                "initial_error_count": result["initial_error_count"],
+                "refactored_error_count": result["refactored_error_count"],
+            }
+            writer.writerow(row)
+
+    print(f"Saved results to {csv_path}")
+
+
 def run():
     """
     Runs the full benchmarking routine for every dataset in the NJR-1 dataset collection and then summarizes the results.
     """
     stage_zero()
+    stage_one()
+
+
+def main():
+    """Main entry point of the script."""
+    global DEBUG, MODULE
+    argparser = argparse.ArgumentParser(description="Runs benchmark.")
+    argparser.add_argument(
+        "--debug", action="store_true", help="Enable debugging statements."
+    )
+    argparser.add_argument(
+        "module", nargs="?", default="All", help="The refactoring module to use."
+    )
+    args = argparser.parse_args()
+    DEBUG = args.debug
+    MODULE = args.module
+
+    run()
 
 
-run()
+if __name__ == "__main__":
+    main()
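
A minimal example invocation of the revised script, assuming it is launched from the repository root (so that ./benchmarking, ./gradlew, and the jar directories resolve) and that "All" is the desired refactoring module, as in the argparse default above; the python3 executable name is an assumption about the local environment:

    python3 benchmarking/run_benchmark.py --debug All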