From 44a7438d15b2b148721908ff2dbb9793cb36a1c3 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sun, 8 Sep 2019 17:44:24 -0700
Subject: [PATCH] first version of spark cluster on docker swarm

---
 README.md                                     |  4 +-
 spark-on-docker-swarm/README.md               | 28 +++++++
 spark-on-docker-swarm/build-images.sh         | 15 ++++
 .../configured-spark-node/Dockerfile          |  7 ++
 .../spark-conf/spark-defaults.conf            | 15 ++++
 .../spark-conf/spark-env.sh                   | 18 ++++
 spark-on-docker-swarm/deploy-spark-swarm.yml  | 83 +++++++++++++++++++
 .../spark-jupyter-notebook/Dockerfile         | 13 +++
 .../spark-jupyter-notebook/start-jupyter.sh   |  3 +
 9 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 spark-on-docker-swarm/README.md
 create mode 100755 spark-on-docker-swarm/build-images.sh
 create mode 100644 spark-on-docker-swarm/configured-spark-node/Dockerfile
 create mode 100644 spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf
 create mode 100644 spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh
 create mode 100644 spark-on-docker-swarm/deploy-spark-swarm.yml
 create mode 100644 spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
 create mode 100644 spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh

diff --git a/README.md b/README.md
index 5ccd08d..4d0db27 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
-# personal-compute-cluster
+# Personal Compute Cluster
 Software and tools for setting up and operating a personal compute cluster, with focus on big data.
+
+More information at [DIY Big Data](https://diybigdata.net/personal-compute-cluster-2019-edition/).
diff --git a/spark-on-docker-swarm/README.md b/spark-on-docker-swarm/README.md
new file mode 100644
index 0000000..6ae7a6d
--- /dev/null
+++ b/spark-on-docker-swarm/README.md
@@ -0,0 +1,28 @@
+# Deploy a Standalone Spark Cluster on Docker Swarm
+
+This project brings up a simple Apache Spark standalone cluster in a Docker swarm. It also launches a Jupyter PySpark notebook that is connected to the Spark cluster.
+
+## Usage
+First, edit the following items as needed for your swarm:
+
+1. `configured-spark-node -> spark-conf -> spark-env.sh`: Adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
+2. `configured-spark-node -> spark-conf -> spark-defaults.conf`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and `spark.executor.cores` should divide evenly into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
+3. `build-images.sh`: Adjust the IP address for your local Docker registry. You can use a domain name if all nodes in your swarm can resolve it. The registry is needed so that every node in the swarm can pull the locally built Docker images.
+4. `deploy-spark-swarm.yml`: Adjust all image names to use the local Docker registry address you set in the prior step. Also adjust the resource limits for each of the services. Setting a `cpus` limit that is smaller than the number of cores on your node caps the service at that share of the node's total CPU capacity; consider doing this if your swarm hosts other services or does not handle sustained 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
+
+Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and run:
+```
+./build-images.sh
+docker stack deploy -c deploy-spark-swarm.yml spark
+```
+
+Then point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
+
+## TODO
+This cluster is a work in progress. Currently, the following items are missing:
+* Persistence for Jupyter notebooks. Once you bring down the cluster, all notebooks you created are deleted.
+* A distributed file system, such as HDFS or QFS. Currently the only way to ingest data into the cluster is through network transfers set up in a Jupyter notebook, such as with `curl`.
+* A robust set of Python libraries. The image currently lacks common packages such as [`matplotlib`](https://matplotlib.org) and [`pandas`](https://pandas.pydata.org).
+
+## Acknowledgements
+The Docker configuration builds on the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
\ No newline at end of file
diff --git a/spark-on-docker-swarm/build-images.sh b/spark-on-docker-swarm/build-images.sh
new file mode 100755
index 0000000..3634e22
--- /dev/null
+++ b/spark-on-docker-swarm/build-images.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -e
+
+# build the images
+docker build -t configured-spark-node:latest ./configured-spark-node
+docker build -t spark-jupyter-notebook:latest ./spark-jupyter-notebook
+
+# tag the images with the local registry address
+docker tag configured-spark-node:latest 10.1.1.1:5000/configured-spark-node:latest
+docker tag spark-jupyter-notebook:latest 10.1.1.1:5000/spark-jupyter-notebook:latest
+
+# push the images to the local registry
+docker push 10.1.1.1:5000/configured-spark-node:latest
+docker push 10.1.1.1:5000/spark-jupyter-notebook:latest
diff --git a/spark-on-docker-swarm/configured-spark-node/Dockerfile b/spark-on-docker-swarm/configured-spark-node/Dockerfile
new file mode 100644
index 0000000..e734d86
--- /dev/null
+++ b/spark-on-docker-swarm/configured-spark-node/Dockerfile
@@ -0,0 +1,7 @@
+FROM gettyimages/spark
+
+# copy the desired configuration into the Spark conf directory
+COPY ./spark-conf/* $SPARK_HOME/conf/
+
+# same default command as the FROM image
+CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
diff --git a/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf
new file mode 100644
index 0000000..cd21d22
--- /dev/null
+++ b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-defaults.conf
@@ -0,0 +1,15 @@
+# performance optimizations
+spark.serializer           org.apache.spark.serializer.KryoSerializer
+spark.default.parallelism  100
+
+# worker node / executor setup
+# expecting a worker with 10 cores and 52g of memory
+spark.executor.memory      24g
+spark.executor.cores       5
+
+# driver configurations
+spark.driver.memory        4g
+spark.driver.cores         2
+
+# operational configurations
+spark.logConf              true
diff --git a/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh
new file mode 100644
index 0000000..766da90
--- /dev/null
+++ b/spark-on-docker-swarm/configured-spark-node/spark-conf/spark-env.sh
@@ -0,0 +1,18 @@
+# the total amount of memory a worker (node) can use
+SPARK_WORKER_MEMORY=52g
+
+# the total number of cores a worker (node) can use
+SPARK_WORKER_CORES=10
+
+# the number of worker processes per node
+SPARK_WORKER_INSTANCES=1
+
+# the ports the worker will advertise
+SPARK_WORKER_PORT=8881
+SPARK_WORKER_WEBUI_PORT=8081
+
+# which Python the Spark cluster should use for PySpark
+PYSPARK_PYTHON=python3
+
+# fixed hash seed so all nodes hash values consistently
+PYTHONHASHSEED=8675309
diff --git a/spark-on-docker-swarm/deploy-spark-swarm.yml b/spark-on-docker-swarm/deploy-spark-swarm.yml
new file mode 100644
index 0000000..994327c
--- /dev/null
+++ b/spark-on-docker-swarm/deploy-spark-swarm.yml
@@ -0,0 +1,83 @@
+version: '3'
+services:
+  spark-master:
+    image: 10.1.1.1:5000/configured-spark-node:latest
+    command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
+    hostname: spark-master
+    environment:
+      MASTER: spark://spark-master:7077
+      SPARK_PUBLIC_DNS: 10.1.1.1
+    expose:
+      - 7001
+      - 7002
+      - 7003
+      - 7004
+      - 7005
+      - 7006
+      - 7077
+      - 6066
+    networks:
+      - spark-network
+    ports:
+      - 6066:6066
+      - 7077:7077
+      - 8080:8080
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 8G
+
+  spark-worker:
+    image: 10.1.1.1:5000/configured-spark-node:latest
+    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
+    hostname: spark-worker-1
+    environment:
+      SPARK_PUBLIC_DNS: 10.1.1.1
+    links:
+      - spark-master
+    expose:
+      - 7012
+      - 7013
+      - 7014
+      - 7015
+      - 7016
+      - 8881
+    networks:
+      - spark-network
+    ports:
+      - 8081:8081
+    deploy:
+      mode: replicated
+      replicas: 4
+      resources:
+        limits:
+          cpus: "8.0"
+          memory: 52g
+
+  spark-jupyter:
+    image: 10.1.1.1:5000/spark-jupyter-notebook:latest
+    hostname: spark-jupyter
+    environment:
+      SPARK_PUBLIC_DNS: 10.1.1.1
+    depends_on:
+      - spark-master
+      - spark-worker
+    links:
+      - spark-master
+    networks:
+      - spark-network
+    expose:
+      - 7777
+      - 4040
+    ports:
+      - 7777:7777
+      - 4040:4040
+    deploy:
+      resources:
+        limits:
+          cpus: "2.0"
+          memory: 10G
+
+networks:
+  spark-network:
diff --git a/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile b/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
new file mode 100644
index 0000000..90c4c16
--- /dev/null
+++ b/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
@@ -0,0 +1,13 @@
+FROM configured-spark-node:latest
+
+RUN apt-get update && apt-get install -y g++
+RUN pip3 install jupyter
+RUN mkdir -p /home/jupyter/notebooks
+RUN mkdir -p /home/jupyter/runtime
+
+COPY start-jupyter.sh /
+
+#EXPOSE 7777
+#EXPOSE 4040
+
+CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh b/spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh
new file mode 100644
index 0000000..5595578
--- /dev/null
+++ b/spark-on-docker-swarm/spark-jupyter-notebook/start-jupyter.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+XDG_RUNTIME_DIR=/home/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/jupyter/notebooks --ip=* --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
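
As a quick end-to-end check after `docker stack deploy`, a small PySpark job can be run from a notebook cell at `http://swarm-public-ip:7777/`. The following is a minimal sketch, assuming the `spark` session is pre-created by the `pyspark` driver launched in `start-jupyter.sh`; if it is not, the builder call shown below creates one against the master defined in `deploy-spark-swarm.yml`:

```python
# Hypothetical sanity check for the deployed cluster -- run in a Jupyter notebook cell.
from pyspark.sql import SparkSession

# Reuse the session created by the pyspark driver, or build one against the standalone master.
spark = SparkSession.builder.master("spark://spark-master:7077").getOrCreate()

# Distribute a trivial computation across the workers and pull the result back to the driver.
squares = spark.sparkContext.parallelize(range(1000), 10).map(lambda x: x * x)
print(squares.count())  # expect 1000 if executors on the workers picked up the tasks
```

On the swarm manager, `docker stack services spark` shows whether all three services reached their desired replica counts, and the master web UI at `http://swarm-public-ip:8080/` lists the workers that registered with the master.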