Skip to content

Commit

Permalink
omp: elastic scaling experiment (#41)
Browse files Browse the repository at this point in the history
* elastic: microbenchmark running

* makespan(elastic): working on compose

* makespan(elastic): more fixes in plots

* util: plot fixes

* plots: add legend on top

* plots: more ground work

* docker: bump examples commit after merge

* nits: self-review
  • Loading branch information
csegarragonz authored May 14, 2024
1 parent 4b35b0f commit 2b1ea3a
Show file tree
Hide file tree
Showing 38 changed files with 2,079 additions and 360 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.5.0
0.6.0
2 changes: 1 addition & 1 deletion bin/workon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ export PS1="(faasm-exp-faabric) $PS1"
# Experiment-specific variables
export FAASM_INI_FILE=${PROJ_ROOT}/faasm.ini
export FAASM_WASM_VM=wamr
export FAASM_VERSION=0.26.0
export FAASM_VERSION=0.27.0

popd >> /dev/null

10 changes: 9 additions & 1 deletion docker/faabric-experiments.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ RUN rm -rf /code \
# Checkout to a specific commit, to make sure we do not forget to update it
# when changes occur upstream, and we do not accidentally cache old WASM
# versions
&& git checkout 269557d7244c67d27ec4c98cc72fb04d7af762c8 \
&& git checkout 428a11c80263b82ea8a83157205c4ef0eceab979 \
&& git submodule update --init -f cpp \
&& git submodule update --init -f python \
&& git submodule update --init -f examples/Kernels \
&& git submodule update --init -f examples/Kernels-elastic \
&& git submodule update --init -f examples/lammps \
&& git submodule update --init -f examples/lammps-migration \
&& git submodule update --init -f examples/lammps-migration-net \
Expand All @@ -22,6 +23,13 @@ RUN rm -rf /code \
&& source ./venv/bin/activate \
&& inv kernels --native \
&& inv kernels \
# FIXME: for some reason, build only works if we create these directories
# manually. Annoyingly, the problem can not be reproduced inside the
# container image
&& mkdir -p /code/faasm-examples/examples/Kernels-elastic/build/native \
&& inv kernels --elastic --native --clean \
&& mkdir -p /code/faasm-examples/examples/Kernels-elastic/build/wasm \
&& inv kernels --elastic --clean \
&& inv lammps --native \
&& inv lammps \
&& inv lammps --migration --native \
Expand Down
2 changes: 2 additions & 0 deletions tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging

from tasks.elastic import ns as elastic_ns
from tasks.kernels_mpi import ns as kernels_mpi_ns
from tasks.kernels_omp import ns as kernels_omp_ns
from tasks.lammps import ns as lammps_ns
Expand All @@ -23,6 +24,7 @@
format_code,
)

ns.add_collection(elastic_ns, name="elastic")
ns.add_collection(kernels_mpi_ns, name="kernels-mpi")
ns.add_collection(kernels_omp_ns, name="kernels-omp")
ns.add_collection(lammps_ns, name="lammps")
Expand Down
59 changes: 59 additions & 0 deletions tasks/elastic/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Elastic Scaling Micro-Benchmark

In this experiment we measure the benefit of elastically scaling up OpenMP
applications to exploit idle resources. We run a pipelined algorithm on a
matrix with a varying number of threads and, at 50% of execution, scale up
to the maximum number of available threads. This experiment represents a
best-case scenario for the gains achievable through elastic scale-up.

## Granny

First, provision the AKS cluster by running:

```bash
inv cluster.provision --vm Standard_D8_v5 --nodes 2 cluster.credentials
```

Second, deploy the Granny cluster:

```bash
faasmctl deploy.k8s --workers=1
```

Third, upload the WASM file:

```bash
inv elastic.wasm.upload
```

and run the experiment with:

```bash
# Without elastic scaling
inv elastic.run

# With elastic scaling
inv elastic.run --elastic
```

## Plot

You may now plot the results using:

```bash
inv elastic.plot
```

## Clean-Up

Finally, delete the Granny cluster:

```bash
faasmctl delete
```

and the AKS cluster:

```bash
inv cluster.delete
```
8 changes: 8 additions & 0 deletions tasks/elastic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Invoke task collection for the elastic-scaling experiment."""

from invoke import Collection

from . import native, plot, run, wasm

ns = Collection(native, plot, run, wasm)
61 changes: 61 additions & 0 deletions tasks/elastic/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from os.path import join
from tasks.util.env import PROJ_ROOT

# NOTE(review): the constants below look copied verbatim from the LAMMPS
# experiment utilities — confirm the elastic tasks actually use them
LAMMPS_DIR = join(PROJ_ROOT, "third-party", "lammps")

LAMMPS_IMAGE_NAME = "experiment-lammps"
LAMMPS_DOCKERFILE = join(PROJ_ROOT, "docker", "lammps.dockerfile")

# Paths inside the native experiment container image
DOCKER_PROJ_ROOT = "/code/experiment-mpi"
DOCKER_LAMMPS_DIR = join(DOCKER_PROJ_ROOT, "third-party", "lammps")
DOCKER_NATIVE_INSTALL_DIR = join(DOCKER_PROJ_ROOT, "build", "native-install")
DOCKER_LAMMPS_BINARY = join(DOCKER_NATIVE_INSTALL_DIR, "bin", "lmp")

# Faasm user/function pair for the LAMMPS WASM, and the prefix under which
# its input data files are uploaded — presumably; verify against the upload
# tasks
LAMMPS_FAASM_USER = "lammps"
LAMMPS_FAASM_FUNC = "main"
LAMMPS_FAASM_DATA_PREFIX = "/lammps-data"

# Define the different benchmarks we run in LAMMPS

# Map benchmark name -> input data files and output file prefix
BENCHMARKS = {
    "lj": {"data": ["bench/in.lj"], "out_file": "compute"},
    "compute": {"data": ["bench/in.lj"], "out_file": "compute"},
    "compute-xl": {"data": ["bench/in.lj-xl"], "out_file": "compute"},
    "compute-xxl": {"data": ["bench/in.lj-xxl"], "out_file": "compute"},
    "controller": {
        "data": ["examples/controller/in.controller.wall"],
        "out_file": "network",
    },
    "network": {
        "data": ["examples/controller/in.controller.wall"],
        "out_file": "network",
    },
    "eam": {"data": ["bench/in.eam", "bench/Cu_u3.eam"], "out_file": "eam"},
    "chute": {
        "data": ["bench/in.chute", "bench/data.chute"],
        "out_file": "chute",
    },
    "rhodo": {
        "data": ["bench/in.rhodo", "bench/data.rhodo"],
        "out_file": "rhodo",
    },
    "chain": {
        "data": ["bench/in.chain", "bench/data.chain"],
        "out_file": "chain",
    },
    "short": {
        "data": ["examples/controller/in.controller.wall"],
        "out_file": "short",
    },
}


def get_faasm_benchmark(bench):
    """
    Return the data files and output file for a supported LAMMPS benchmark.

    :param bench: name of the benchmark (a key in BENCHMARKS)
    :returns: the corresponding BENCHMARKS entry (dict with "data"/"out_file")
    :raises RuntimeError: if the benchmark name is not recognized
    """
    if bench not in BENCHMARKS:
        print("Unrecognized benchmark: {}".format(bench))
        # Join the keys explicitly: formatting dict_keys(...) directly prints
        # the unhelpful "dict_keys([...])" repr
        print(
            "The supported LAMMPS benchmarks are: {}".format(
                ", ".join(sorted(BENCHMARKS))
            )
        )
        raise RuntimeError("Unrecognized LAMMPS benchmark: {}".format(bench))

    return BENCHMARKS[bench]
25 changes: 25 additions & 0 deletions tasks/elastic/native.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from invoke import task
from tasks.util.env import FAABRIC_EXP_IMAGE_NAME
from tasks.util.openmpi import deploy_native_mpi, delete_native_mpi


@task
def deploy(ctx, backend="k8s", num_vms=1):
    """
    Deploy the native OpenMP k8s cluster
    """
    # Guard clause: k8s is the only supported backend
    if backend != "k8s":
        raise RuntimeError("Backend not supported: {}!".format(backend))

    deploy_native_mpi("openmp", FAABRIC_EXP_IMAGE_NAME, num_vms)


@task
def delete(ctx, backend="k8s", num_vms=1):
    """
    Delete the native OpenMP k8s cluster
    """
    if backend == "k8s":
        delete_native_mpi("openmp", FAABRIC_EXP_IMAGE_NAME, num_vms)
    else:
        raise RuntimeError("Backend not supported: {}!".format(backend))
72 changes: 72 additions & 0 deletions tasks/elastic/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from glob import glob
from invoke import task
from matplotlib.pyplot import subplots
from os import makedirs
from os.path import basename, join
from pandas import read_csv
from tasks.util.elastic import ELASTIC_PLOTS_DIR, ELASTIC_RESULTS_DIR
from tasks.util.env import SYSTEM_NAME
from tasks.util.plot import SINGLE_COL_FIGSIZE, save_plot


def _read_results():
    """
    Load the OpenMP experiment results from CSV files.

    Reads every openmp_*.csv file in the results directory and aggregates the
    execution times per thread count.

    :returns: a dict of the shape
              {baseline: {num_threads: {"mean": float, "sem": float}}}
    """
    result_dict = {}

    for csv in glob(join(ELASTIC_RESULTS_DIR, "openmp_*.csv")):
        results = read_csv(csv)

        # Parse the baseline from the file name (openmp_<baseline>_granny.csv)
        # rather than from the full path, which may itself contain underscores
        baseline = basename(csv).split("_")[1]

        # Aggregate once, outside the loop, instead of re-running the groupby
        # aggregation for every thread count
        grouped = results.groupby("NumThreads", as_index=False)
        means = grouped.mean()
        sems = grouped.sem()

        if baseline not in result_dict:
            result_dict[baseline] = {}

        exec_means = means["ExecTimeSecs"].to_list()
        exec_sems = sems["ExecTimeSecs"].to_list()
        for index, nt in enumerate(means["NumThreads"].to_list()):
            result_dict[baseline][nt] = {
                "mean": exec_means[index],
                "sem": exec_sems[index],
            }

    return result_dict


@task(default=True)
def plot(ctx):
    """
    Plot the speed-up achieved by elastically scaling OpenMP applications

    Plots, for each thread count, the ratio of non-elastic to elastic
    execution time (i.e. values above 1 mean elastic scaling is faster).
    """
    results = _read_results()
    makedirs(ELASTIC_PLOTS_DIR, exist_ok=True)
    fig, ax = subplots(figsize=SINGLE_COL_FIGSIZE)

    # Both baselines must have been run for the same set of thread counts.
    # Raise (rather than assert, which is stripped under -O) on mismatch
    if len(results["elastic"]) != len(results["no-elastic"]):
        raise RuntimeError(
            "Results mismatch! (elastic: {} - no-elastic: {})".format(
                len(results["elastic"]), len(results["no-elastic"])
            )
        )

    xs = list(results["elastic"].keys())
    ys = [
        float(results["no-elastic"][x]["mean"] / results["elastic"][x]["mean"])
        for x in xs
    ]

    ax.bar(
        xs,
        ys,
        edgecolor="black",
    )

    # Labels
    ax.set_xticks(xs)

    # Horizontal line at a speed-up of 1 (i.e. no elastic-scaling benefit)
    xlim_left = 0.5
    xlim_right = len(xs) + 0.5
    ax.hlines(1, xlim_left, xlim_right, linestyle="dashed", colors="red")

    ax.set_xlim(left=xlim_left, right=xlim_right)
    ax.set_ylim(bottom=0)
    ax.set_xlabel("Number of OpenMP threads")
    ax.set_ylabel("Speed-Up \n [No-Elastic / Elastic]")

    save_plot(fig, ELASTIC_PLOTS_DIR, "elastic_speedup")
110 changes: 110 additions & 0 deletions tasks/elastic/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from faasmctl.util.planner import get_available_hosts
from faasmctl.util.planner import reset as reset_planner, set_planner_policy
from invoke import task
from os import makedirs
from os.path import join
from tasks.util.faasm import (
get_faasm_exec_time_from_json,
post_async_msg_and_get_result_json,
)
from tasks.util.elastic import (
ELASTIC_KERNEL,
ELASTIC_RESULTS_DIR,
OPENMP_ELASTIC_FUNCTION,
OPENMP_ELASTIC_USER,
get_elastic_input_data,
)
from tasks.util.kernels import get_openmp_kernel_cmdline

# Number of VMs the experiment expects (used by the disabled sanity check in
# the `wasm` task)
EXPECTED_NUM_VMS = 1
# Thread counts to sweep when --num-threads is not given
TOTAL_NUM_THREADS = [1, 2, 3, 4, 5, 6, 7, 8]


def _init_csv_file(csv_name):
    """Create the results directory and CSV file, writing the header row."""
    makedirs(ELASTIC_RESULTS_DIR, exist_ok=True)

    csv_path = join(ELASTIC_RESULTS_DIR, csv_name)
    with open(csv_path, "w") as csv_file:
        csv_file.write("NumThreads,Run,ExecTimeSecs\n")


def _write_csv_line(csv_name, num_threads, run, exec_time):
    """Append one measurement row to the results CSV."""
    csv_path = join(ELASTIC_RESULTS_DIR, csv_name)
    with open(csv_path, "a") as csv_file:
        csv_file.write("{},{},{}\n".format(num_threads, run, exec_time))


def has_execution_failed(results_json):
    """
    Check a list of message results for any sign of failure.

    A result counts as failed if it reports a non-zero return value, or if
    its output data contains a known error marker.
    """
    error_markers = ["ERROR", "Call failed"]

    for entry in results_json:
        if entry.get("returnValue", 0) != 0:
            return True

        if "output_data" in entry:
            output = entry["output_data"]
            if any(marker in output for marker in error_markers):
                return True

    return False


@task(default=True)
def wasm(ctx, num_threads=None, elastic=False, repeats=1):
    """
    Run the OpenMP elastic-scaling experiment on a Granny cluster

    Executes the elastic OpenMP kernel with a varying number of threads,
    optionally allowing the planner to elastically scale the application up,
    and records each run's execution time in a CSV file.
    """
    set_planner_policy("bin-pack")

    avail_hosts = get_available_hosts().hosts
    num_vms = len(avail_hosts)

    # NOTE(review): this used to be a hard assertion, left disabled as a dead
    # string literal. Keep it non-fatal, but warn instead of staying silent
    if num_vms != EXPECTED_NUM_VMS:
        print(
            "WARNING: expected {} VMs got: {}!".format(
                EXPECTED_NUM_VMS, num_vms
            )
        )

    if num_threads is not None:
        num_threads = [num_threads]
    else:
        num_threads = TOTAL_NUM_THREADS

    reset_planner(num_vms)

    csv_name = "openmp_{}_granny.csv".format(
        "elastic" if elastic else "no-elastic"
    )
    _init_csv_file(csv_name)

    # Invoke passes CLI arguments as strings, so convert once up-front rather
    # than on every loop iteration
    repeats = int(repeats)

    for nthread in num_threads:
        for r in range(repeats):
            print(
                "Running OpenMP elastic experiment with {} threads "
                "(elastic: {} - repeat: {}/{})".format(
                    nthread, elastic, r + 1, repeats
                )
            )
            user = OPENMP_ELASTIC_USER
            func = OPENMP_ELASTIC_FUNCTION
            cmdline = get_openmp_kernel_cmdline(ELASTIC_KERNEL, nthread)
            msg = {
                "user": user,
                "function": func,
                "cmdline": cmdline,
                "input_data": get_elastic_input_data(num_loops=2),
                "isOmp": True,
                "ompNumThreads": nthread,
            }
            req = {
                "user": user,
                "function": func,
                "singleHostHint": True,
                "elasticScaleHint": elastic,
            }

            # Note that when executing with just two iterations, the first one
            # will always be pre-loaded by the planner (so not elastically
            # scaled) thus naturally fitting the goal of our plot
            result_json = post_async_msg_and_get_result_json(msg, req_dict=req)
            actual_time = get_faasm_exec_time_from_json(
                result_json, check=True
            )
            _write_csv_line(csv_name, nthread, r, actual_time)
Loading

0 comments on commit 2b1ea3a

Please sign in to comment.