From 44a7438d15b2b148721908ff2dbb9793cb36a1c3 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sun, 8 Sep 2019 17:44:24 -0700
Subject: [PATCH] first version of spark cluster on docker swarm

---
 README.md                                     |  4 +-
 spark-on-docker-swarm/README.md               | 28 +++++++
 spark-on-docker-swarm/build-images.sh         | 15 ++++
 .../configured-spark-node/Dockerfile          |  7 ++
 .../spark-conf/spark-defaults.conf            | 15 ++++
 .../spark-conf/spark-env.sh                   | 18 ++++
 spark-on-docker-swarm/deploy-spark-swarm.yml  | 83 +++++++++++++++++++
 .../spark-jupyter-notebook/Dockerfile         | 13 +++
 .../spark-jupyter-notebook/start-jupyter.sh   |  3 +
 9 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 spark-on-docker-swarm/README.md
 create mode 100755 spark-on-docker-swarm/build-images.sh
 create mode 100644 spark-on-docker-swarm/configured-spark-node/Dockerfile
 create mode 100644 spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf
 create mode 100644 spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh
 create mode 100644 spark-on-docker-swarm/deploy-spark-swarm.yml
 create mode 100644 spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
 create mode 100644 spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh

diff --git a/README.md b/README.md
index 5ccd08d..4d0db27 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
-# personal-compute-cluster
+# Personal Compute Cluster
 Software and tools for setting up and operating a personal compute cluster, with focus on big data.
+
+More information at [DIY Big Data](https://diybigdata.net/personal-compute-cluster-2019-edition/).
diff --git a/spark-on-docker-swarm/README.md b/spark-on-docker-swarm/README.md
new file mode 100644
index 0000000..6ae7a6d
--- /dev/null
+++ b/spark-on-docker-swarm/README.md
@@ -0,0 +1,28 @@
+# Deploy a Standalone Spark Cluster on Docker Swarm
+
+This project brings up a simple Apache Spark standalone cluster in a Docker swarm. It also launches a Jupyter PySpark notebook that is connected to the Spark cluster.
+
+## Usage
+First, edit the following items as needed for your swarm:
+
+1. `configured-spark-node -> spark-conf -> spark-env.sh`: Adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
+2. `configured-spark-node -> spark-conf -> spark-defaults.conf`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and `spark.executor.cores` should divide evenly into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
+3. `build-images.sh`: Adjust the IP address for your local Docker registry. You can use a domain name if all nodes in your swarm can resolve it. The registry is needed so that every node in the swarm can pull the locally built Docker images.
+4. `deploy-spark-swarm.yml`: Adjust all image names to use the local Docker registry address you set in the prior step. Also adjust the resource limits for each of the services. Setting a `cpus` limit that is smaller than the number of cores on your node caps the service at that share of the node's total CPU capacity; consider doing this if your swarm hosts other services or does not handle sustained 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
+
+Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and run:
+```
+./build-images.sh
+docker stack deploy -c deploy-spark-swarm.yml spark
+```
+
+Then point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
+
+## TODO
+This cluster is a work in progress. Currently, the following items are missing:
+* Persistence for Jupyter notebooks. Once you bring down the cluster, all notebooks you created are deleted.
+* A distributed file system, such as HDFS or QFS. Currently the only way to ingest data into the cluster is through network transfers set up in a Jupyter notebook, such as with `curl`.
+* A robust set of Python libraries. The image currently lacks common packages such as [`matplotlib`](https://matplotlib.org) and [`pandas`](https://pandas.pydata.org).
+
+## Acknowledgements
+The Docker configuration builds on the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
\ No newline at end of file
diff --git a/spark-on-docker-swarm/build-images.sh b/spark-on-docker-swarm/build-images.sh
new file mode 100755
index 0000000..3634e22
--- /dev/null
+++ b/spark-on-docker-swarm/build-images.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -e
+
+# build the images
+docker build -t configured-spark-node:latest ./configured-spark-node
+docker build -t spark-jupyter-notebook:latest ./spark-jupyter-notebook
+
+# tag the images with the local registry address
+docker tag configured-spark-node:latest 10.1.1.1:5000/configured-spark-node:latest
+docker tag spark-jupyter-notebook:latest 10.1.1.1:5000/spark-jupyter-notebook:latest
+
+# push the images to the local registry
+docker push 10.1.1.1:5000/configured-spark-node:latest
+docker push 10.1.1.1:5000/spark-jupyter-notebook:latest
diff --git a/spark-on-docker-swarm/configured-spark-node/Dockerfile b/spark-on-docker-swarm/configured-spark-node/Dockerfile
new file mode 100644
index 0000000..e734d86
--- /dev/null
+++ b/spark-on-docker-swarm/configured-spark-node/Dockerfile
@@ -0,0 +1,7 @@
+FROM gettyimages/spark
+
+# copy the desired configuration into the Spark conf directory
+COPY ./spark-conf/* $SPARK_HOME/conf/
+
+# same default command as the FROM image
+CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
diff --git a/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf
new file mode 100644
index 0000000..cd21d22
--- /dev/null
+++ b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf
@@ -0,0 +1,15 @@
+# performance optimizations
+spark.serializer           org.apache.spark.serializer.KryoSerializer
+spark.default.parallelism  100
+
+# worker node / executor setup
+# expecting a worker with 10 cores and 52g of memory
+spark.executor.memory      24g
+spark.executor.cores       5
+
+# driver configurations
+spark.driver.memory        4g
+spark.driver.cores         2
+
+# operational configurations
+spark.logConf              true
diff --git a/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh
new file mode 100644
index 0000000..766da90
--- /dev/null
+++ b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh
@@ -0,0 +1,18 @@
+# the total amount of memory a worker (node) can use
+SPARK_WORKER_MEMORY=52g
+
+# the total number of cores a worker (node) can use
+SPARK_WORKER_CORES=10
+
+# the number of worker processes per node
+SPARK_WORKER_INSTANCES=1
+
+# the ports the worker will advertise
+SPARK_WORKER_PORT=8881
+SPARK_WORKER_WEBUI_PORT=8081
+
+# which Python the Spark cluster should use for PySpark
+PYSPARK_PYTHON=python3
+
+# fixed hash seed so all nodes hash values consistently
+PYTHONHASHSEED=8675309
diff --git a/spark-on-docker-swarm/deploy-spark-swarm.yml b/spark-on-docker-swarm/deploy-spark-swarm.yml
new file mode 100644
index 0000000..994327c
--- /dev/null
+++ b/spark-on-docker-swarm/deploy-spark-swarm.yml
@@ -0,0 +1,83 @@
+version: '3'
+services:
+  spark-master:
+    image: 10.1.1.1:5000/configured-spark-node:latest
+    command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
+    hostname: spark-master
+    environment:
+      MASTER: spark://spark-master:7077
+      SPARK_PUBLIC_DNS: 10.1.1.1
+    expose:
+      - 7001
+      - 7002
+      - 7003
+      - 7004
+      - 7005
+      - 7006
+      - 7077
+      - 6066
+    networks:
+      - spark-network
+    ports:
+      - 6066:6066
+      - 7077:7077
+      - 8080:8080
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 8G
+
+  spark-worker:
+    image: 10.1.1.1:5000/configured-spark-node:latest
+    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
+    hostname: spark-worker-1
+    environment:
+      SPARK_PUBLIC_DNS: 10.1.1.1
+    links:
+      - spark-master
+    expose:
+      - 7012
+      - 7013
+      - 7014
+      - 7015
+      - 7016
+      - 8881
+    networks:
+      - spark-network
+    ports:
+      - 8081:8081
+    deploy:
+      mode: replicated
+      replicas: 4
+      resources:
+        limits:
+          cpus: "8.0"
+          memory: 52g
+
+  spark-jupyter:
+    image: 10.1.1.1:5000/spark-jupyter-notebook:latest
+    hostname: spark-jupyter
+    environment:
+      SPARK_PUBLIC_DNS: 10.1.1.1
+    depends_on:
+      - spark-master
+      - spark-worker
+    links:
+      - spark-master
+    networks:
+      - spark-network
+    expose:
+      - 7777
+      - 4040
+    ports:
+      - 7777:7777
+      - 4040:4040
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 10G
+
+networks:
+  spark-network:
diff --git a/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile b/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
new file mode 100644
index 0000000..90c4c16
--- /dev/null
+++ b/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
@@ -0,0 +1,13 @@
+FROM configured-spark-node:latest
+
+RUN apt-get update && apt-get install -y g++
+RUN pip3 install jupyter
+RUN mkdir -p /home/jupyter/notebooks
+RUN mkdir -p /home/jupyter/runtime
+
+COPY start-jupyter.sh /
+
+#EXPOSE 7777
+#EXPOSE 4040
+
+CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh b/spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh
new file mode 100644
index 0000000..5595578
--- /dev/null
+++ b/spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+XDG_RUNTIME_DIR=/home/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/jupyter/notebooks --ip=* --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
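
As a quick end-to-end check after `docker stack deploy`, a small PySpark job can be run from a notebook cell at `http://swarm-public-ip:7777/`. The following is a minimal sketch, assuming the `spark` session is pre-created by the `pyspark` driver launched in `start-jupyter.sh`; if it is not, the builder call shown below creates one against the master defined in `deploy-spark-swarm.yml`:

```python
# Hypothetical sanity check for the deployed cluster -- run in a Jupyter notebook cell.
from pyspark.sql import SparkSession

# Reuse the session created by the pyspark driver, or build one against the standalone master.
spark = SparkSession.builder.master("spark://spark-master:7077").getOrCreate()

# Distribute a trivial computation across the workers and pull the result back to the driver.
squares = spark.sparkContext.parallelize(range(1000), 10).map(lambda x: x * x)
print(squares.count())  # expect 1000 if executors on the workers picked up the tasks
```

On the swarm manager, `docker stack services spark` shows whether all three services reached their desired replica counts, and the master web UI at `http://swarm-public-ip:8080/` lists the workers that registered with the master.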