From 48ccec5ac850be447f88d9e2b81b2cdd32055ad7 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sun, 8 Mar 2020 14:53:39 -0700
Subject: [PATCH] updated simple spark cluster configuration

Update the docker configuration of the simple spark cluster to bring it
more in line with the lessons I learned setting up the Spark-QFS cluster.
This cluster depends on the stack being deployed with all nodes mounting a
shared file system, such as GlusterFS or NFS.

The main change over the previous version of the simple Spark cluster is
that the Spark node docker image is now built by this project rather than
pulled from a third party image. Also improved the deployment of the
Jupyter server.
---
 simple-spark-swarm/README.md                  |  8 +-
 simple-spark-swarm/build-images.sh            | 29 ++++--
 .../configured-spark-node/Dockerfile          | 11 ---
 .../spark-conf/spark-defaults.conf            | 15 ---
 simple-spark-swarm/deploy-spark-swarm.yml     | 78 +++++++---------
 .../simple-spark-cluster-jupyter/Dockerfile   | 26 ++++++
 .../start-jupyter.sh                          |  3 +
 .../simple-spark-cluster-node/Dockerfile      | 92 +++++++++++++++++++
 .../spark-conf/spark-defaults.conf            | 15 +++
 .../spark-conf/spark-env.sh                   |  9 +-
 .../start-spark-master.sh                     |  7 ++
 .../start-spark-node.sh                       |  7 ++
 .../spark-jupyter-notebook/Dockerfile         | 12 ---
 .../spark-jupyter-notebook/start-jupyter.sh   |  3 -
 14 files changed, 215 insertions(+), 100 deletions(-)
 delete mode 100644 simple-spark-swarm/configured-spark-node/Dockerfile
 delete mode 100644 simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
 create mode 100644 simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
 create mode 100644 simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/Dockerfile
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
 rename simple-spark-swarm/{configured-spark-node => simple-spark-cluster-node}/spark-conf/spark-env.sh (70%)
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh
 delete mode 100644 simple-spark-swarm/spark-jupyter-notebook/Dockerfile
 delete mode 100644 simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh

diff --git a/simple-spark-swarm/README.md b/simple-spark-swarm/README.md
index 10c2a8d..60e6191 100644
--- a/simple-spark-swarm/README.md
+++ b/simple-spark-swarm/README.md
@@ -8,13 +8,15 @@ First, edit the following items as needed for your swarm:
-1. `configured-sparknode -> spark-conf -> spark-env.sh`: adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
-2. `configured-sparknode -> spark-conf -> spark-env.sh`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and should be a whole divisor into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
+1. `simple-spark-cluster-node -> spark-conf -> spark-env.sh`: adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
+2. `simple-spark-cluster-node -> spark-conf -> spark-defaults.conf`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and `spark.executor.cores` should divide evenly into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
 3. `build-images.sh`: Adjust the IP address for your local Docker registry that all nodes in your cluster can access. You can use a domain name if all nodes in your swarm can resolve it. This is needed as it allows all nodes in the swarm to pull the locally built Docker images.
-4. `spark-deploy.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long term 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
+4. `deploy-spark-swarm.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node caps the total CPU time the service may use, effectively giving it a fraction of the node's capacity. You might consider doing this if your swarm hosts other services or does not handle long-term 100% CPU load well (e.g., overheats). The `spark-worker` service deploys in `global` mode, so one worker container runs on each node of your swarm.
 
-This set up depends on have a GlusterFS volume mounted at `/mnt/gfs` on all nodes and the following directories exist on it:
-* `/mnt/gfs/jupyter-notbooks` - used to persist the Jupyter notebooks.
+This setup depends on having a GlusterFS volume mounted at `/mnt/gfs` on all nodes, with the following directories on it:
+* `/mnt/gfs/jupyter-notebooks` - used to persist the Jupyter notebooks.
 * `/mnt/gfs/data` - This is where data to analyze with spark gets placed.
 
+You could replace the GlusterFS mount with another network file system, such as NFS. Each node also needs a `/mnt/data/spark` directory, which `deploy-spark-swarm.yml` bind mounts into the containers at `/app/spark` for Spark's working files (scratch space, worker directories, and logs).
+
 Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
 ```
 ./build-images.sh
@@ -22,6 +24,4 @@ docker stack deploy -c deploy-spark-swarm.yml spark
 ```
 
 Point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
-
-## Acknowledgements
-The docker configuration leverages the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
+
diff --git a/simple-spark-swarm/build-images.sh b/simple-spark-swarm/build-images.sh
index 40c25b6..b0ee646 100755
--- a/simple-spark-swarm/build-images.sh
+++ b/simple-spark-swarm/build-images.sh
@@ -2,14 +2,31 @@
 
 set -e
 
+DOCKER_BUILD_ARGS=
+
+while getopts b: option
+do
+case "${option}"
+in
+b) DOCKER_BUILD_ARGS=${OPTARG};;
+esac
+done
+
+if [ -z "$DOCKER_BUILD_ARGS" ]
+then
+  echo "Building with default docker options"
+else
+  echo "Building with docker arguments = '$DOCKER_BUILD_ARGS'"
+fi
+
 # build images
-docker build -t configured-spark-node:latest ./configured-spark-node
-docker build -t spark-jupyter-notebook:latest ./spark-jupyter-notebook
+docker build -t simple-spark-cluster-node:latest $DOCKER_BUILD_ARGS ./simple-spark-cluster-node
+docker build -t simple-spark-cluster-jupyter:latest $DOCKER_BUILD_ARGS ./simple-spark-cluster-jupyter
 
 # tag image with local repository
-docker tag configured-spark-node:latest master:5000/configured-spark-node:latest
-docker tag spark-jupyter-notebook:latest master:5000/spark-jupyter-notebook:latest
+docker tag simple-spark-cluster-node:latest master:5000/simple-spark-cluster-node:latest
+docker tag simple-spark-cluster-jupyter:latest master:5000/simple-spark-cluster-jupyter:latest
 
 # push the images to local repository
-docker push master:5000/configured-spark-node:latest
-docker push master:5000/spark-jupyter-notebook:latest
+docker push master:5000/simple-spark-cluster-node:latest
+docker push master:5000/simple-spark-cluster-jupyter:latest
diff --git a/simple-spark-swarm/configured-spark-node/Dockerfile b/simple-spark-swarm/configured-spark-node/Dockerfile
deleted file mode 100644
index cde3e40..0000000
--- a/simple-spark-swarm/configured-spark-node/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM gettyimages/spark
-
-# add python libraries useful in PySpark
-RUN python3 -mpip install matplotlib \
-    && pip3 install pandas
-
-# copy desired configuration to the spark conf
-COPY ./spark-conf/* $SPARK_HOME/conf/
-
-# same default command as the FROM image
-CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
diff --git a/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf b/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
deleted file mode 100644
index cd21d22..0000000
--- a/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# performance optimizations
-spark.serializer              org.apache.spark.serializer.KryoSerializer
-spark.default.parallelism     100
-
-# worker node / executor set up
-# expecting a worker with 10 cores and 52g of memory
-spark.executor.memory         24g
-spark.executor.cores          5
-
-# driver configurations
-spark.driver.memory           4g
-spark.driver.cores            2
-
-# operational configurations
-spark.logConf                 true
diff --git a/simple-spark-swarm/deploy-spark-swarm.yml b/simple-spark-swarm/deploy-spark-swarm.yml
index 066485e..0a2c2a9 100644
--- a/simple-spark-swarm/deploy-spark-swarm.yml
+++ b/simple-spark-swarm/deploy-spark-swarm.yml
@@ -1,99 +1,83 @@
 version: '3.4'
 services:
   spark-master:
-    image: master:5000/configured-spark-node:latest
-    command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
+    image: master:5000/simple-spark-cluster-node:latest
+    command: ["/bin/bash", "/start-spark-master.sh"]
     hostname: spark-master
-    environment:
-      MASTER: spark://spark-master:7077
-      SPARK_PUBLIC_DNS: 10.1.1.1
-    expose:
-      - 7001
-      - 7002
-      - 7003
-      - 7004
-      - 7005
-      - 7006
-      - 7077
-      - 6066
     networks:
-      - spark-network
+      - cluster_network
     ports:
-      - 6066:6066
       - 7077:7077
       - 8080:8080
+      - 18080:18080
     volumes:
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
       resources:
         limits:
           cpus: "2.0"
-          memory: 8G
-
+          memory: 6g
   spark-worker:
-    image: master:5000/configured-spark-node:latest
-    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
+    image: master:5000/simple-spark-cluster-node:latest
     hostname: spark-worker
-    environment:
-      SPARK_PUBLIC_DNS: 10.1.1.1
-    links:
+    depends_on:
       - spark-master
-    expose:
-      - 7012
-      - 7013
-      - 7014
-      - 7015
-      - 7016
-      - 8881
     networks:
-      - spark-network
+      - cluster_network
     ports:
       - 8081:8081
     volumes:
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
-      mode: replicated
-      replicas: 4
+      mode: global
       resources:
         limits:
-          cpus: "6.0"
-          memory: 52g
-
+          memory: 56g
   spark-jupyter:
-    image: master:5000/spark-jupyter-notebook:latest
+    image: master:5000/simple-spark-cluster-jupyter:latest
     hostname: spark-jupyter
-    environment:
-      PARK_PUBLIC_DNS: 10.1.1.1
     depends_on:
       - spark-master
      - spark-worker
-    links:
-      - spark-master
     networks:
-      - spark-network
-    expose:
-      - 7777
-      - 4040
+      - cluster_network
     ports:
       - 7777:7777
       - 4040:4040
+      - 4041:4041
     volumes:
       - type: bind
         source: /mnt/gfs/jupyter-notebooks
-        target: /home/jupyter/notebooks
+        target: /home/spark/jupyter/notebooks
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
       resources:
         limits:
           cpus: "2.0"
-          memory: 10G
+          memory: 6g
 
 networks:
-  spark-network:
+  cluster_network:
+    attachable: true
+    ipam:
+      driver: default
+      config:
+        - subnet: 10.20.30.0/24
diff --git a/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile b/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
new file mode 100644
index 0000000..b052d89
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
@@ -0,0 +1,26 @@
+FROM simple-spark-cluster-node:latest
+
+#
+# Expected volumes:
+#   /home/spark/jupyter/notebooks - where the Jupyter notebooks will be persisted
+#   /app/spark - Spark's data directory
+#
+
+USER root
+RUN apt-get update && apt-get install -y g++ && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN pip3 install notebook==5.7.8 \
+        jupyter_nbextensions_configurator \
+        jupyter_contrib_nbextensions
+COPY start-jupyter.sh /
+
+USER spark
+RUN jupyter contrib nbextension install --user
+RUN jupyter nbextensions_configurator enable --user
+RUN jupyter nbextension enable toc2/main
+RUN jupyter nbextension enable codefolding/main
+RUN jupyter nbextension enable execute_time/ExecuteTime
+
+RUN mkdir -p /home/spark/jupyter/runtime \
+    && mkdir -p /home/spark/jupyter/notebooks
+
+CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh b/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
new file mode 100644
index 0000000..2cc87c3
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+SHELL=/bin/bash XDG_RUNTIME_DIR=/home/spark/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token=''" $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 --master spark://spark-master:7077
diff --git a/simple-spark-swarm/simple-spark-cluster-node/Dockerfile b/simple-spark-swarm/simple-spark-cluster-node/Dockerfile
new file mode 100644
index 0000000..157f22e
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/Dockerfile
@@ -0,0 +1,92 @@
+FROM debian:stretch
+MAINTAINER Michael Kamprath "https://github.com/michaelkamprath"
+#
+# Base image for an Apache Spark standalone cluster.
+#
+# Inspired by https://hub.docker.com/r/gettyimages/spark/dockerfile
+#
+#
+# Expected volumes:
+#   /app/spark - this is the spark working directory
+#       - each node should bind mount a directory here for Spark's working files; the deploy file uses /mnt/data/spark on the host
+#
+# Expected service names:
+#   spark-master - the service where the spark master runs
+#
+
+ARG SPARK_VERSION=2.4.5
+ARG HADOOP_MINOR_VERSION=2.7
+ARG HADOOP_VERSION=2.7.2
+ARG SCALA_VERSION=2.11.12
+
+RUN apt-get update \
+    && apt-get install -y locales \
+    && dpkg-reconfigure -f noninteractive locales \
+    && locale-gen C.UTF-8 \
+    && /usr/sbin/update-locale LANG=C.UTF-8 \
+    && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \
+    && locale-gen \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+RUN apt-get update \
+    && apt-get install -y curl unzip procps \
+        python3 python3-setuptools \
+    && ln -s /usr/bin/python3 /usr/bin/python \
+    && easy_install3 pip py4j \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PYTHONIOENCODING UTF-8
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+
+# JAVA & SCALA
+RUN apt-get update \
+    && apt-get install -y openjdk-8-jre \
+    && apt-get remove scala-library scala \
+    && curl -o scala-${SCALA_VERSION}.deb https://www.scala-lang.org/files/archive/scala-${SCALA_VERSION}.deb \
+    && dpkg -i scala-${SCALA_VERSION}.deb \
+    && apt-get clean \
+    && rm scala-${SCALA_VERSION}.deb \
+    && rm -rf /var/lib/apt/lists/*
+
+# create the user software will run from
+RUN useradd -m -s /bin/bash spark
+
+
+# SPARK
+ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_MINOR_VERSION}
+ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
+#ENV SPARK_DIST_CLASSPATH=""
+ENV HADOOP_CONF_DIR=${SPARK_HOME}/conf/
+ENV PATH $PATH:${SPARK_HOME}/bin
+ARG SPARK_DOWNLOAD_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz"
+RUN echo "Downloading Spark from : ${SPARK_DOWNLOAD_URL}\n" \
+    && curl -L --retry 3 \
+        $SPARK_DOWNLOAD_URL \
+    | gunzip \
+    | tar x -C /usr/ \
+    && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
+    && chown -R root:root $SPARK_HOME \
+    && ln -s $SPARK_HOME /usr/local/spark
+RUN mkdir -p /app/spark \
+    && chown spark -R /app/spark
+
+
+# add python libraries useful in PySpark
+RUN python3 -mpip install matplotlib \
+    && pip3 install pandas seaborn
+
+# copy Spark configurations
+COPY ./spark-conf/* $SPARK_HOME/conf/
+
+# set up command
+COPY start-spark-node.sh /
+COPY start-spark-master.sh /
+USER spark
+WORKDIR /home/spark
+CMD ["/bin/bash", "/start-spark-node.sh"]
diff --git a/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
new file mode 100644
index 0000000..5e37377
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
@@ -0,0 +1,15 @@
+# performance optimizations
+spark.serializer              org.apache.spark.serializer.KryoSerializer
+spark.default.parallelism     100
+
+# worker node / executor set up
+# expecting a worker with 12 cores and 56g of memory
+spark.executor.memory         26g
+spark.executor.cores          6
+
+# driver configurations
+spark.driver.memory           6g
+spark.driver.cores 2 + +# operational configurations +spark.logConf true diff --git a/simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh similarity index 70% rename from simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh rename to simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh index 79dee53..2709df8 100644 --- a/simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh +++ b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh @@ -1,8 +1,8 @@ # the total amount of memory a worker (node) can use -SPARK_WORKER_MEMORY=52g +SPARK_WORKER_MEMORY=56g # the total amount of cores a worker (node) can use -SPARK_WORKER_CORES=10 +SPARK_WORKER_CORES=12 # the number of worker processes per node SPARK_WORKER_INSTANCES=1 @@ -16,3 +16,8 @@ PYSPARK_PYTHON=python3 # hash seed so all node hash numbers consistently PYTHONHASHSEED=8675309 + +# the location of spark working files +SPARK_LOCAL_DIRS=/app/spark/tmp +SPARK_WORKER_DIR=/app/spark/work +SPARK_LOG_DIR=/app/spark/logs diff --git a/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh b/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh new file mode 100644 index 0000000..d70da86 --- /dev/null +++ b/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# start Spark master +$SPARK_HOME/sbin/start-master.sh + +# now do nothing and do not exit +while true; do sleep 3600; done diff --git a/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh b/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh new file mode 100644 index 0000000..a09b39c --- /dev/null +++ b/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# start the spark worker +$SPARK_HOME/sbin/start-slave.sh spark://spark-master:7077 + +# now do nothing and do not exit +while true; do sleep 3600; done diff --git a/simple-spark-swarm/spark-jupyter-notebook/Dockerfile b/simple-spark-swarm/spark-jupyter-notebook/Dockerfile deleted file mode 100644 index e81d519..0000000 --- a/simple-spark-swarm/spark-jupyter-notebook/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM configured-spark-node:latest - -RUN apt-get install -y g++ -RUN pip3 install jupyter -RUN mkdir -p /home/jupyter/runtime - -COPY start-jupyter.sh / - -#EXPOSE 7777 -#EXPOSE 4040 - -CMD ["/bin/bash", "/start-jupyter.sh"] diff --git a/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh b/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh deleted file mode 100644 index 5595578..0000000 --- a/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -XDG_RUNTIME_DIR=/home/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/jupyter/notebooks --ip=* --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
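# ---------------------------------------------------------------------------
# Usage sketch: with this change applied, the cluster can be built and deployed
# roughly as follows. This assumes the local registry is reachable at
# master:5000 as configured in build-images.sh; "--no-cache" is only an
# illustration of the new -b option, which forwards extra arguments to
# docker build.

# build, tag, and push the node and jupyter images to the local registry
./build-images.sh -b "--no-cache"

# deploy the stack onto the swarm using the updated compose file
docker stack deploy -c deploy-spark-swarm.yml spark

# tear the stack down when finished
docker stack rm spark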