From 48ccec5ac850be447f88d9e2b81b2cdd32055ad7 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sun, 8 Mar 2020 14:53:39 -0700
Subject: [PATCH] updated simple spark cluster configuration

Update the docker configuration of the simple spark cluster to bring it
more in line with the lessons I learned setting up the Spark-QFS cluster.
This cluster depends on the stack being deployed with all nodes mounting a
shared file system, such as GlusterFS or NFS.

The main change over the previous version of the simple Spark cluster is
that the Spark node docker image is now built by this project rather than
pulled from a third party image. Also improved the deployment of the
Jupyter server.
---
 simple-spark-swarm/README.md                  |  8 +-
 simple-spark-swarm/build-images.sh            | 29 ++++--
 .../configured-spark-node/Dockerfile          | 11 ---
 .../spark-conf/spark-defaults.conf            | 15 ---
 simple-spark-swarm/deploy-spark-swarm.yml     | 78 +++++++---------
 .../simple-spark-cluster-jupyter/Dockerfile   | 26 ++++++
 .../start-jupyter.sh                          |  3 +
 .../simple-spark-cluster-node/Dockerfile      | 92 +++++++++++++++++++
 .../spark-conf/spark-defaults.conf            | 15 +++
 .../spark-conf/spark-env.sh                   |  9 +-
 .../start-spark-master.sh                     |  7 ++
 .../start-spark-node.sh                       |  7 ++
 .../spark-jupyter-notebook/Dockerfile         | 12 ---
 .../spark-jupyter-notebook/start-jupyter.sh   |  3 -
 14 files changed, 215 insertions(+), 100 deletions(-)
 delete mode 100644 simple-spark-swarm/configured-spark-node/Dockerfile
 delete mode 100644 simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
 create mode 100644 simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
 create mode 100644 simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/Dockerfile
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
 rename simple-spark-swarm/{configured-spark-node => simple-spark-cluster-node}/spark-conf/spark-env.sh (70%)
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh
 create mode 100644 simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh
 delete mode 100644 simple-spark-swarm/spark-jupyter-notebook/Dockerfile
 delete mode 100644 simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh

diff --git a/simple-spark-swarm/README.md b/simple-spark-swarm/README.md
index 10c2a8d..60e6191 100644
--- a/simple-spark-swarm/README.md
+++ b/simple-spark-swarm/README.md
@@ -8,13 +8,15 @@ First, edit the following items as needed for your swarm:
-1. `configured-sparknode -> spark-conf -> spark-env.sh`: adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
-2. `configured-sparknode -> spark-conf -> spark-env.sh`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and should be a whole divisor into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
+1. `simple-spark-cluster-node -> spark-conf -> spark-env.sh`: adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
+2. `simple-spark-cluster-node -> spark-conf -> spark-defaults.conf`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and `spark.executor.cores` should divide evenly into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
 3. `build-images.sh`: Adjust the IP address for your local Docker registry that all nodes in your cluster can access. You can use a domain name if all nodes in your swarm can resolve it. This is needed as it allows all nodes in the swarm to pull the locally built Docker images.
-4. `spark-deploy.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long term 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
+4. `deploy-spark-swarm.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node caps the total CPU time the service may use, effectively giving it a fraction of the node's capacity. You might consider doing this if your swarm hosts other services or does not handle long-term 100% CPU load well (e.g., overheats). The `spark-worker` service deploys in `global` mode, so one worker container runs on each node of your swarm.
 
-This set up depends on have a GlusterFS volume mounted at `/mnt/gfs` on all nodes and the following directories exist on it:
-* `/mnt/gfs/jupyter-notbooks` - used to persist the Jupyter notebooks.
+This setup depends on having a GlusterFS volume mounted at `/mnt/gfs` on all nodes, with the following directories on it:
+* `/mnt/gfs/jupyter-notebooks` - used to persist the Jupyter notebooks.
 * `/mnt/gfs/data` - This is where data to analyze with spark gets placed.
 
+You could replace the GlusterFS mount with another network file system, such as NFS. Each node also needs a `/mnt/data/spark` directory, which `deploy-spark-swarm.yml` bind mounts into the containers at `/app/spark` for Spark's working files (scratch space, worker directories, and logs).
+
 Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
 ```
 ./build-images.sh
@@ -22,6 +24,4 @@ docker stack deploy -c deploy-spark-swarm.yml spark
 ```
 
 Point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
-
-## Acknowledgements
-The docker configuration leverages the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
+
diff --git a/simple-spark-swarm/build-images.sh b/simple-spark-swarm/build-images.sh
index 40c25b6..b0ee646 100755
--- a/simple-spark-swarm/build-images.sh
+++ b/simple-spark-swarm/build-images.sh
@@ -2,14 +2,31 @@
 
 set -e
 
+DOCKER_BUILD_ARGS=
+
+while getopts b: option
+do
+case "${option}"
+in
+b) DOCKER_BUILD_ARGS=${OPTARG};;
+esac
+done
+
+if [ -z "$DOCKER_BUILD_ARGS" ]
+then
+  echo "Building with default docker options"
+else
+  echo "Building with docker arguments = '$DOCKER_BUILD_ARGS'"
+fi
+
 # build images
-docker build -t configured-spark-node:latest ./configured-spark-node
-docker build -t spark-jupyter-notebook:latest ./spark-jupyter-notebook
+docker build -t simple-spark-cluster-node:latest $DOCKER_BUILD_ARGS ./simple-spark-cluster-node
+docker build -t simple-spark-cluster-jupyter:latest $DOCKER_BUILD_ARGS ./simple-spark-cluster-jupyter
 
 # tag image with local repository
-docker tag configured-spark-node:latest master:5000/configured-spark-node:latest
-docker tag spark-jupyter-notebook:latest master:5000/spark-jupyter-notebook:latest
+docker tag simple-spark-cluster-node:latest master:5000/simple-spark-cluster-node:latest
+docker tag simple-spark-cluster-jupyter:latest master:5000/simple-spark-cluster-jupyter:latest
 
 # push the images to local repository
-docker push master:5000/configured-spark-node:latest
-docker push master:5000/spark-jupyter-notebook:latest
+docker push master:5000/simple-spark-cluster-node:latest
+docker push master:5000/simple-spark-cluster-jupyter:latest
diff --git a/simple-spark-swarm/configured-spark-node/Dockerfile b/simple-spark-swarm/configured-spark-node/Dockerfile
deleted file mode 100644
index cde3e40..0000000
--- a/simple-spark-swarm/configured-spark-node/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM gettyimages/spark
-
-# add python libraries useful in PySpark
-RUN python3 -mpip install matplotlib \
-    && pip3 install pandas
-
-# copy desired configuration to the spark conf
-COPY ./spark-conf/* $SPARK_HOME/conf/
-
-# same default command as the FROM image
-CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
diff --git a/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf b/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
deleted file mode 100644
index cd21d22..0000000
--- a/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# performance optimizations
-spark.serializer              org.apache.spark.serializer.KryoSerializer
-spark.default.parallelism     100
-
-# worker node / executor set up
-# expecting a worker with 10 cores and 52g of memory
-spark.executor.memory         24g
-spark.executor.cores          5
-
-# driver configurations
-spark.driver.memory           4g
-spark.driver.cores            2
-
-# operational configurations
-spark.logConf                 true
diff --git a/simple-spark-swarm/deploy-spark-swarm.yml b/simple-spark-swarm/deploy-spark-swarm.yml
index 066485e..0a2c2a9 100644
--- a/simple-spark-swarm/deploy-spark-swarm.yml
+++ b/simple-spark-swarm/deploy-spark-swarm.yml
@@ -1,99 +1,83 @@
 version: '3.4'
 services:
   spark-master:
-    image: master:5000/configured-spark-node:latest
-    command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
+    image: master:5000/simple-spark-cluster-node:latest
+    command: ["/bin/bash", "/start-spark-master.sh"]
     hostname: spark-master
-    environment:
-      MASTER: spark://spark-master:7077
-      SPARK_PUBLIC_DNS: 10.1.1.1
-    expose:
-      - 7001
-      - 7002
-      - 7003
-      - 7004
-      - 7005
-      - 7006
-      - 7077
-      - 6066
     networks:
-      - spark-network
+      - cluster_network
     ports:
-      - 6066:6066
       - 7077:7077
       - 8080:8080
+      - 18080:18080
     volumes:
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
       resources:
         limits:
           cpus: "2.0"
-          memory: 8G
-
+          memory: 6g
   spark-worker:
-    image: master:5000/configured-spark-node:latest
-    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
+    image: master:5000/simple-spark-cluster-node:latest
     hostname: spark-worker
-    environment:
-      SPARK_PUBLIC_DNS: 10.1.1.1
-    links:
+    depends_on:
       - spark-master
-    expose:
-      - 7012
-      - 7013
-      - 7014
-      - 7015
-      - 7016
-      - 8881
     networks:
-      - spark-network
+      - cluster_network
     ports:
       - 8081:8081
     volumes:
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
-      mode: replicated
-      replicas: 4
+      mode: global
       resources:
         limits:
-          cpus: "6.0"
-          memory: 52g
-
+          memory: 56g
   spark-jupyter:
-    image: master:5000/spark-jupyter-notebook:latest
+    image: master:5000/simple-spark-cluster-jupyter:latest
     hostname: spark-jupyter
-    environment:
-      PARK_PUBLIC_DNS: 10.1.1.1
     depends_on:
       - spark-master
      - spark-worker
-    links:
-      - spark-master
     networks:
-      - spark-network
-    expose:
-      - 7777
-      - 4040
+      - cluster_network
     ports:
       - 7777:7777
       - 4040:4040
+      - 4041:4041
     volumes:
       - type: bind
         source: /mnt/gfs/jupyter-notebooks
-        target: /home/jupyter/notebooks
+        target: /home/spark/jupyter/notebooks
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
       resources:
         limits:
           cpus: "2.0"
-          memory: 10G
+          memory: 6g
 
 networks:
-  spark-network:
+  cluster_network:
+    attachable: true
+    ipam:
+      driver: default
+      config:
+        - subnet: 10.20.30.0/24
diff --git a/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile b/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
new file mode 100644
index 0000000..b052d89
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
@@ -0,0 +1,26 @@
+FROM simple-spark-cluster-node:latest
+
+#
+# Expected volumes:
+#   /home/spark/jupyter/notebooks - where the Jupyter notebooks will be persisted
+#   /app/spark - Spark's data directory
+#
+
+USER root
+RUN apt-get update && apt-get install -y g++ && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN pip3 install notebook==5.7.8 \
+        jupyter_nbextensions_configurator \
+        jupyter_contrib_nbextensions
+COPY start-jupyter.sh /
+
+USER spark
+RUN jupyter contrib nbextension install --user
+RUN jupyter nbextensions_configurator enable --user
+RUN jupyter nbextension enable toc2/main
+RUN jupyter nbextension enable codefolding/main
+RUN jupyter nbextension enable execute_time/ExecuteTime
+
+RUN mkdir -p /home/spark/jupyter/runtime \
+    && mkdir -p /home/spark/jupyter/notebooks
+
+CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh b/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
new file mode 100644
index 0000000..2cc87c3
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+SHELL=/bin/bash XDG_RUNTIME_DIR=/home/spark/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token=''" $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 --master spark://spark-master:7077
diff --git a/simple-spark-swarm/simple-spark-cluster-node/Dockerfile b/simple-spark-swarm/simple-spark-cluster-node/Dockerfile
new file mode 100644
index 0000000..157f22e
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/Dockerfile
@@ -0,0 +1,92 @@
+FROM debian:stretch
+MAINTAINER Michael Kamprath "https://github.com/michaelkamprath"
+#
+# Base image for an Apache Spark standalone cluster.
+#
+# Inspired by https://hub.docker.com/r/gettyimages/spark/dockerfile
+#
+#
+# Expected volumes:
+#   /app/spark - this is the spark working directory
+#       - each node should bind mount a directory here for Spark's working files; the deploy file uses /mnt/data/spark on the host
+#
+# Expected service names:
+#   spark-master - the service where the spark master runs
+#
+
+ARG SPARK_VERSION=2.4.5
+ARG HADOOP_MINOR_VERSION=2.7
+ARG HADOOP_VERSION=2.7.2
+ARG SCALA_VERSION=2.11.12
+
+RUN apt-get update \
+    && apt-get install -y locales \
+    && dpkg-reconfigure -f noninteractive locales \
+    && locale-gen C.UTF-8 \
+    && /usr/sbin/update-locale LANG=C.UTF-8 \
+    && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \
+    && locale-gen \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+RUN apt-get update \
+    && apt-get install -y curl unzip procps \
+        python3 python3-setuptools \
+    && ln -s /usr/bin/python3 /usr/bin/python \
+    && easy_install3 pip py4j \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PYTHONIOENCODING UTF-8
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+
+# JAVA & SCALA
+RUN apt-get update \
+    && apt-get install -y openjdk-8-jre \
+    && apt-get remove scala-library scala \
+    && curl -o scala-${SCALA_VERSION}.deb https://www.scala-lang.org/files/archive/scala-${SCALA_VERSION}.deb \
+    && dpkg -i scala-${SCALA_VERSION}.deb \
+    && apt-get clean \
+    && rm scala-${SCALA_VERSION}.deb \
+    && rm -rf /var/lib/apt/lists/*
+
+# create the user software will run from
+RUN useradd -m -s /bin/bash spark
+
+
+# SPARK
+ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_MINOR_VERSION}
+ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
+#ENV SPARK_DIST_CLASSPATH=""
+ENV HADOOP_CONF_DIR=${SPARK_HOME}/conf/
+ENV PATH $PATH:${SPARK_HOME}/bin
+ARG SPARK_DOWNLOAD_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz"
+RUN echo "Downloading Spark from : ${SPARK_DOWNLOAD_URL}\n" \
+    && curl -L --retry 3 \
+        $SPARK_DOWNLOAD_URL \
+    | gunzip \
+    | tar x -C /usr/ \
+    && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
+    && chown -R root:root $SPARK_HOME \
+    && ln -s $SPARK_HOME /usr/local/spark
+RUN mkdir -p /app/spark \
+    && chown spark -R /app/spark
+
+
+# add python libraries useful in PySpark
+RUN python3 -mpip install matplotlib \
+    && pip3 install pandas seaborn
+
+# copy Spark configurations
+COPY ./spark-conf/* $SPARK_HOME/conf/
+
+# set up command
+COPY start-spark-node.sh /
+COPY start-spark-master.sh /
+USER spark
+WORKDIR /home/spark
+CMD ["/bin/bash", "/start-spark-node.sh"]
diff --git a/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
new file mode 100644
index 0000000..5e37377
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
@@ -0,0 +1,15 @@
+# performance optimizations
+spark.serializer              org.apache.spark.serializer.KryoSerializer
+spark.default.parallelism     100
+
+# worker node / executor set up
+# expecting a worker with 12 cores and 56g of memory
+spark.executor.memory         26g
+spark.executor.cores          6
+
+# driver configurations
+spark.driver.memory           6g
+spark.driver.cores 2 + +# operational configurations +spark.logConf true diff --git a/simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh similarity index 70% rename from simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh rename to simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh index 79dee53..2709df8 100644 --- a/simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh +++ b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh @@ -1,8 +1,8 @@ # the total amount of memory a worker (node) can use -SPARK_WORKER_MEMORY=52g +SPARK_WORKER_MEMORY=56g # the total amount of cores a worker (node) can use -SPARK_WORKER_CORES=10 +SPARK_WORKER_CORES=12 # the number of worker processes per node SPARK_WORKER_INSTANCES=1 @@ -16,3 +16,8 @@ PYSPARK_PYTHON=python3 # hash seed so all node hash numbers consistently PYTHONHASHSEED=8675309 + +# the location of spark working files +SPARK_LOCAL_DIRS=/app/spark/tmp +SPARK_WORKER_DIR=/app/spark/work +SPARK_LOG_DIR=/app/spark/logs diff --git a/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh b/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh new file mode 100644 index 0000000..d70da86 --- /dev/null +++ b/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# start Spark master +$SPARK_HOME/sbin/start-master.sh + +# now do nothing and do not exit +while true; do sleep 3600; done diff --git a/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh b/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh new file mode 100644 index 0000000..a09b39c --- /dev/null +++ b/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# start the spark worker +$SPARK_HOME/sbin/start-slave.sh spark://spark-master:7077 + +# now do nothing and do not exit +while true; do sleep 3600; done diff --git a/simple-spark-swarm/spark-jupyter-notebook/Dockerfile b/simple-spark-swarm/spark-jupyter-notebook/Dockerfile deleted file mode 100644 index e81d519..0000000 --- a/simple-spark-swarm/spark-jupyter-notebook/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM configured-spark-node:latest - -RUN apt-get install -y g++ -RUN pip3 install jupyter -RUN mkdir -p /home/jupyter/runtime - -COPY start-jupyter.sh / - -#EXPOSE 7777 -#EXPOSE 4040 - -CMD ["/bin/bash", "/start-jupyter.sh"] diff --git a/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh b/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh deleted file mode 100644 index 5595578..0000000 --- a/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -XDG_RUNTIME_DIR=/home/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/jupyter/notebooks --ip=* --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
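# ---------------------------------------------------------------------------
# Usage sketch: with this change applied, the cluster can be built and deployed
# roughly as follows. This assumes the local registry is reachable at
# master:5000 as configured in build-images.sh; "--no-cache" is only an
# illustration of the new -b option, which forwards extra arguments to
# docker build.

# build, tag, and push the node and jupyter images to the local registry
./build-images.sh -b "--no-cache"

# deploy the stack onto the swarm using the updated compose file
docker stack deploy -c deploy-spark-swarm.yml spark

# tear the stack down when finished
docker stack rm spark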