migration: ubench fixes

csegarragonz · csegarragonz · commit a6d859b48ea4 · 2024-05-21T15:18:45.000Z
diff --git a/tasks/migration/README.md b/tasks/migration/README.md
@@ -6,8 +6,8 @@ applications to benefit from dynamic changes in the compute environment.
 First, provision the cluster:
 
 ```bash
-(faasm-exp-pase) inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME}
-(faasm-exp-base) inv cluster.credentials --name ${CLUSTER_NAME}
+inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME}
+inv cluster.credentials --name ${CLUSTER_NAME}
 ```
 
 Second, deploy the cluster
diff --git a/tasks/migration/oracle.py b/tasks/migration/oracle.py
@@ -24,6 +24,7 @@
     get_faasm_benchmark,
     get_lammps_migration_params,
 )
+from tasks.util.plot import save_plot
 from time import sleep
 
 
@@ -54,14 +55,14 @@ def calculate_cross_vm_links(part):
 
 
 @task()
-def run(ctx, workload="network", nprocs=None):
+def run(ctx, workload="very-network", nprocs=None):
     """
     Experiment to measure the benefits of migration in isolation
     """
     # Work out the number of processes to run with
-    num_procs = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+    num_procs = [2, 3, 4, 5, 6, 7, 8] # , 9, 10, 11, 12, 13, 14, 15, 16]
     num_cpus_per_vm = 8
-    num_vms = 16
+    num_vms = 8 # 16
     if nprocs is not None:
         num_procs = [int(nprocs)]
 
@@ -160,20 +161,17 @@ def do_write_csv_line(csv_name, part, xvm_links, actual_time):
 
 
 @task
-def plot(ctx):
+def plot(ctx, workload="very-network"):
     plots_dir = join(PLOTS_ROOT, "migration")
     makedirs(plots_dir, exist_ok=True)
-    out_file = join(
-        plots_dir, "migration_oracle_{}.pdf".format(LAMMPS_SIM_WORKLOAD)
-    )
 
     results_dir = join(PROJ_ROOT, "results", "migration")
     result_dict = {}
 
     for csv in glob(
         join(
             results_dir,
-            "migration_oracle_{}_*.csv".format(LAMMPS_SIM_WORKLOAD),
+            "migration_oracle_{}_*.csv".format(workload),
         )
     ):
         num_procs = csv.split("_")[-1].split(".")[0]
@@ -193,13 +191,12 @@ def plot(ctx):
                     float(line.split(",")[-1])
                 )
 
-    print(result_dict)
     num_plots = len(result_dict)
     num_cols = 4
     num_rows = ceil(num_plots / num_cols)
     fig, axes = subplots(nrows=num_rows, ncols=num_cols)
     fig.suptitle(
-        "Correlation between execution time (Y) and x-VM links (X)\n(wload: compute)"
+        "Correlation between execution time (Y) and x-VM links (X)\n(wload: {})".format(workload)
     )
 
     def do_plot(ax, results, num_procs):
@@ -213,6 +210,4 @@ def do_plot(ax, results, num_procs):
             axes[int(i / 4)][int(i % 4)], result_dict[num_procs], num_procs
         )
 
-    fig.tight_layout()
-    savefig(out_file, format="pdf")  # , bbox_inches="tight")
-    print("Plot saved to: {}".format(out_file))
+    save_plot(fig, join(PLOTS_ROOT, "migration"), "migration_oracle_{}".format(workload))
diff --git a/tasks/migration/plot.py b/tasks/migration/plot.py
@@ -9,7 +9,7 @@
 from tasks.util.plot import save_plot
 
 
-ALL_WORKLOADS = ["compute", "network"]
+ALL_WORKLOADS = ["all-to-all", "compute", "network", "og-network", "very-network"]
 
 
 def _read_results():
@@ -43,8 +43,11 @@ def plot(ctx):
     """
     migration_results = _read_results()
 
+    do_plot("all-to-all", migration_results)
     do_plot("compute", migration_results)
     do_plot("network", migration_results)
+    do_plot("very-network", migration_results)
+    do_plot("og-network", migration_results)
 
 
 def do_plot(workload, migration_results):
diff --git a/tasks/migration/run.py b/tasks/migration/run.py
@@ -4,7 +4,11 @@
 from os import makedirs
 from os.path import basename, join
 from tasks.migration.util import generate_host_list
-from tasks.util.env import RESULTS_DIR
+from tasks.util.env import (
+    MPI_MIGRATE_FAASM_FUNC,
+    MPI_MIGRATE_FAASM_USER,
+    RESULTS_DIR,
+)
 from tasks.util.faasm import (
     get_faasm_exec_time_from_json,
     post_async_msg_and_get_result_json,
@@ -48,23 +52,24 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8):
     """
     num_vms = len(get_faasm_worker_ips())
     assert num_vms == 2, "Expected 2 VMs got: {}!".format(num_vms)
-    # data_file = basename(get_faasm_benchmark(LAMMPS_SIM_WORKLOAD)["data"][0])
-    # TODO: is this a good idea? FIXME FIXME DELETE ME
-    data_file = basename(get_faasm_benchmark("compute")["data"][0])
 
     if check_in is None:
         check_array = [0, 2, 4, 6, 8, 10]
     else:
         check_array = [int(check_in)]
 
     for workload in w:
-        if workload not in LAMMPS_SIM_WORKLOAD_CONFIGS:
+        if workload != "all-to-all" and workload not in LAMMPS_SIM_WORKLOAD_CONFIGS:
             print(
                 "Unrecognised workload config ({}) must be one in: {}".format(
                     workload, LAMMPS_SIM_WORKLOAD.keys()
                 )
             )
-        workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload]
+            raise RuntimeError("Unrecognised workload: {}".format(workload))
+
+        if workload != "all-to-all":
+            workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload]
+            data_file = basename(get_faasm_benchmark(workload_config["data_file"])["data"][0])
 
         csv_name = "migration_{}.csv".format(workload)
         _init_csv_file(csv_name)
@@ -75,37 +80,41 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8):
 
                 # Print progress
                 print(
-                    "Running migration micro-benchmark (wload:"
+                    "Running migration micro-benchmark (wload: "
                     + "{} - check-at: {} - repeat: {}/{})".format(
                         workload, check, run_num + 1, repeats
                     )
                 )
 
-                """
-                TODO: do we want to keep the all-to-all baseline?
                 if workload == "all-to-all":
                     num_loops = 100000
                     user = MPI_MIGRATE_FAASM_USER
                     func = MPI_MIGRATE_FAASM_FUNC
                     cmdline = "{} {}".format(
                         check if check != 0 else 5, num_loops
                     )
-                """
+                    input_data = None
+                else:
+                    user = LAMMPS_FAASM_USER
+                    func = LAMMPS_FAASM_MIGRATION_NET_FUNC
+                    cmdline = "-in faasm://lammps-data/{}".format(data_file)
+                    input_data = get_lammps_migration_params(
+                        check_every=check if check != 0 else 5,
+                        num_loops=5,
+                        num_net_loops=workload_config["num_net_loops"],
+                        chunk_size=workload_config["chunk_size"],
+                    )
 
-                # Run LAMMPS
-                cmdline = "-in faasm://lammps-data/{}".format(data_file)
                 msg = {
-                    "user": LAMMPS_FAASM_USER,
-                    "function": LAMMPS_FAASM_MIGRATION_NET_FUNC,
+                    "user": user,
+                    "function": func,
                     "cmdline": cmdline,
                     "mpi_world_size": int(num_cores_per_vm),
-                    "input_data": get_lammps_migration_params(
-                        num_loops=5,
-                        num_net_loops=workload_config["num_net_loops"],
-                        chunk_size=workload_config["chunk_size"],
-                    ),
                 }
 
+                if input_data is not None:
+                    msg["input_data"] = input_data
+
                 if check == 0:
                     # Setting a check fraction of 0 means we don't
                     # under-schedule. We use it as a baseline
diff --git a/tasks/util/lammps.py b/tasks/util/lammps.py
@@ -49,13 +49,25 @@
 
 LAMMPS_SIM_WORKLOAD_CONFIGS = {
     "compute": {
+        "data_file": "compute",
         "num_net_loops": 0,
         "chunk_size": 0,
     },
     "network": {
+        "data_file": "compute",
         "num_net_loops": LAMMPS_SIM_NUM_NET_LOOPS,
         "chunk_size": LAMMPS_SIM_CHUNK_SIZE,
     },
+    "very-network": {
+        "data_file": "compute",
+        "num_net_loops": 1e5,
+        "chunk_size": 10,
+    },
+    "og-network": {
+        "data_file": "network",
+        "num_net_loops": 0,
+        "chunk_size": 0,
+    },
 }
 
 # Different supported LAMMPS benchmarks