diff --git a/simple-spark-swarm/README.md b/simple-spark-swarm/README.md
index 10c2a8d..60e6191 100644
--- a/simple-spark-swarm/README.md
+++ b/simple-spark-swarm/README.md
@@ -8,13 +8,15 @@ First, edit the following items as needed for your swarm:
 1. `configured-sparknode -> spark-conf -> spark-env.sh`: adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
 2. `configured-sparknode -> spark-conf -> spark-env.sh`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and should be a whole divisor into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
 3. `build-images.sh`: Adjust the IP address for your local Docker registry that all nodes in your cluster can access. You can use a domain name if all nodes in your swarm can resolve it. This is needed as it allows all nodes in the swarm to pull the locally built Docker images.
-4. `spark-deploy.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long term 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
+4. `deploy-spark-swarm.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long-term 100% CPU load well (e.g., overheats). Note that the `spark-worker` service now deploys in `global` mode, which runs one worker task on every node in the swarm; switch it back to `replicated` mode with a `replicas` count if you only want workers on some nodes.
 
 This setup depends on having a GlusterFS volume mounted at `/mnt/gfs` on all nodes, with the following directories present on it:
 
 * `/mnt/gfs/jupyter-notebooks` - used to persist the Jupyter notebooks.
 * `/mnt/gfs/data` - This is where data to analyze with Spark gets placed.
 
+You could replace the GlusterFS mount with some other network mount, such as NFS.
+
 Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
 ```
 ./build-images.sh
@@ -22,6 +24,4 @@ docker stack deploy -c deploy-spark-swarm.yml spark
 ```
 
 Point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
-
-## Acknowledgements
-The docker configuration leverages the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
+
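As a worked example of the sizing guidance in step 2, here is a quick sketch in bash using the 12-core / 56g worker values this patch sets in `spark-env.sh` and `spark-defaults.conf` (the variable names below are just for illustration):

```bash
#!/bin/bash
# Executor sizing arithmetic from step 2 of the README, using the values this
# patch ships: a worker node with 12 cores and 56g of memory.
SPARK_WORKER_CORES=12
SPARK_WORKER_MEMORY_GB=56
EXECUTOR_CORES=6   # must divide SPARK_WORKER_CORES evenly

EXECUTORS_PER_WORKER=$(( SPARK_WORKER_CORES / EXECUTOR_CORES ))              # 2
MEMORY_PER_EXECUTOR_GB=$(( SPARK_WORKER_MEMORY_GB / EXECUTORS_PER_WORKER ))  # 28
# After reserving roughly 7-8% of that budget for overhead, spark.executor.memory
# comes out at 26g, which is what spark-defaults.conf uses.
echo "executors per worker: ${EXECUTORS_PER_WORKER}, memory budget per executor: ${MEMORY_PER_EXECUTOR_GB}g"
```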
diff --git a/simple-spark-swarm/build-images.sh b/simple-spark-swarm/build-images.sh
index 40c25b6..b0ee646 100755
--- a/simple-spark-swarm/build-images.sh
+++ b/simple-spark-swarm/build-images.sh
@@ -2,14 +2,31 @@
 
 set -e
 
+DOCKER_BUILD_ARGS=
+
+while getopts b: option
+do
+case "${option}"
+in
+b) DOCKER_BUILD_ARGS=${OPTARG};;
+esac
+done
+
+if [ -z "$DOCKER_BUILD_ARGS" ]
+then
+    echo "Building with default docker options"
+else
+    echo "Building with docker arguments = '$DOCKER_BUILD_ARGS'"
+fi
+
 # build images
-docker build -t configured-spark-node:latest ./configured-spark-node
-docker build -t spark-jupyter-notebook:latest ./spark-jupyter-notebook
+docker build -t simple-spark-cluster-node:latest $DOCKER_BUILD_ARGS ./simple-spark-cluster-node
+docker build -t simple-spark-cluster-jupyter:latest $DOCKER_BUILD_ARGS ./simple-spark-cluster-jupyter
 
 # tag image with local repository
-docker tag configured-spark-node:latest master:5000/configured-spark-node:latest
-docker tag spark-jupyter-notebook:latest master:5000/spark-jupyter-notebook:latest
+docker tag simple-spark-cluster-node:latest master:5000/simple-spark-cluster-node:latest
+docker tag simple-spark-cluster-jupyter:latest master:5000/simple-spark-cluster-jupyter:latest
 
 # push the images to local repository
-docker push master:5000/configured-spark-node:latest
-docker push master:5000/spark-jupyter-notebook:latest
+docker push master:5000/simple-spark-cluster-node:latest
+docker push master:5000/simple-spark-cluster-jupyter:latest
diff --git a/simple-spark-swarm/configured-spark-node/Dockerfile b/simple-spark-swarm/configured-spark-node/Dockerfile
deleted file mode 100644
index cde3e40..0000000
--- a/simple-spark-swarm/configured-spark-node/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM gettyimages/spark
-
-# add python libraries useful in PySpark
-RUN python3 -mpip install matplotlib \
-    && pip3 install pandas
-
-# copy desired configuration to the spark conf
-COPY ./spark-conf/* $SPARK_HOME/conf/
-
-# same default command as the FROM image
-CMD ["bin/spark-class", "org.apache.spark.deploy.master.Master"]
diff --git a/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf b/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
deleted file mode 100644
index cd21d22..0000000
--- a/simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# performance optimizations
-spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.default.parallelism 100
-
-# worker node / executor set up
-# expecting a worker with 10 cores and 52g of memory
-spark.executor.memory 24g
-spark.executor.cores 5
-
-# driver configurations
-spark.driver.memory 4g
-spark.driver.cores 2
-
-# operational configurations
-spark.logConf true
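The new `-b` option forwards extra arguments to `docker build`. For example, a hypothetical invocation that rebuilds both images without Docker's layer cache and pushes them to the `master:5000` registry (`--no-cache` is a standard `docker build` flag, used here only for illustration):

```bash
./build-images.sh -b "--no-cache"
```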
diff --git a/simple-spark-swarm/deploy-spark-swarm.yml b/simple-spark-swarm/deploy-spark-swarm.yml
index 066485e..0a2c2a9 100644
--- a/simple-spark-swarm/deploy-spark-swarm.yml
+++ b/simple-spark-swarm/deploy-spark-swarm.yml
@@ -1,99 +1,83 @@
 version: '3.4'
 services:
   spark-master:
-    image: master:5000/configured-spark-node:latest
-    command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
+    image: master:5000/simple-spark-cluster-node:latest
+    command: ["/bin/bash", "/start-spark-master.sh"]
     hostname: spark-master
-    environment:
-      MASTER: spark://spark-master:7077
-      SPARK_PUBLIC_DNS: 10.1.1.1
-    expose:
-      - 7001
-      - 7002
-      - 7003
-      - 7004
-      - 7005
-      - 7006
-      - 7077
-      - 6066
     networks:
-      - spark-network
+      - cluster_network
     ports:
       - 6066:6066
       - 7077:7077
       - 8080:8080
+      - 18080:18080
     volumes:
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
       resources:
         limits:
           cpus: "2.0"
-          memory: 8G
-
+          memory: 6g
   spark-worker:
-    image: master:5000/configured-spark-node:latest
-    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
+    image: master:5000/simple-spark-cluster-node:latest
     hostname: spark-worker
-    environment:
-      SPARK_PUBLIC_DNS: 10.1.1.1
-    links:
+    depends_on:
       - spark-master
-    expose:
-      - 7012
-      - 7013
-      - 7014
-      - 7015
-      - 7016
-      - 8881
     networks:
-      - spark-network
+      - cluster_network
     ports:
       - 8081:8081
     volumes:
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
-      mode: replicated
-      replicas: 4
+      mode: global
      resources:
        limits:
-          cpus: "6.0"
-          memory: 52g
-
+          memory: 56g
  spark-jupyter:
-    image: master:5000/spark-jupyter-notebook:latest
+    image: master:5000/simple-spark-cluster-jupyter:latest
     hostname: spark-jupyter
-    environment:
-      PARK_PUBLIC_DNS: 10.1.1.1
     depends_on:
       - spark-master
       - spark-worker
-    links:
-      - spark-master
     networks:
-      - spark-network
-    expose:
-      - 7777
-      - 4040
+      - cluster_network
     ports:
       - 7777:7777
       - 4040:4040
+      - 4041:4041
     volumes:
       - type: bind
         source: /mnt/gfs/jupyter-notebooks
-        target: /home/jupyter/notebooks
+        target: /home/spark/jupyter/notebooks
       - type: bind
         source: /mnt/gfs/data
         target: /data
+      - type: bind
+        source: /mnt/data/spark
+        target: /app/spark
     deploy:
       resources:
         limits:
           cpus: "2.0"
-          memory: 10G
+          memory: 6g
 
 networks:
-  spark-network:
+  cluster_network:
+    attachable: true
+    ipam:
+      driver: default
+      config:
+        - subnet: 10.20.30.0/24
diff --git a/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile b/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
new file mode 100644
index 0000000..b052d89
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
@@ -0,0 +1,26 @@
+FROM simple-spark-cluster-node:latest
+
+#
+# Expected volumes:
+#   /home/spark/jupyter/notebooks - where the Jupyter notebooks will be persisted
+#   /app/spark - Spark's data directory
+#
+
+USER root
+RUN apt-get update && apt-get install -y g++
+RUN pip3 install notebook==5.7.8 \
+        jupyter_nbextensions_configurator \
+        jupyter_contrib_nbextensions
+COPY start-jupyter.sh /
+
+USER spark
+RUN jupyter contrib nbextension install --user
+RUN jupyter nbextensions_configurator enable --user
+RUN jupyter nbextension enable toc2/main
+RUN jupyter nbextension enable codefolding/main
+RUN jupyter nbextension enable execute_time/ExecuteTime
+
+RUN mkdir -p /home/spark/jupyter/runtime \
+    && mkdir -p /home/spark/jupyter/notebooks
+
+CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh b/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
new file mode 100644
index 0000000..2cc87c3
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+SHELL=/bin/bash XDG_RUNTIME_DIR=/home/spark/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token=''" $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 --master spark://spark-master:7077
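Once the stack is deployed, a quick way to confirm the Jupyter service came up is to hit the published ports from any machine that can reach the swarm. A sketch; replace `swarm-public-ip` with your swarm's address, as in the README (the driver UI on 4040 only answers once the PySpark driver inside the notebook container has started):

```bash
# print the HTTP status from the notebook server (7777) and the PySpark driver UI (4040)
curl -s -o /dev/null -w "7777 -> %{http_code}\n" http://swarm-public-ip:7777/
curl -s -o /dev/null -w "4040 -> %{http_code}\n" http://swarm-public-ip:4040/
```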
diff --git a/simple-spark-swarm/simple-spark-cluster-node/Dockerfile b/simple-spark-swarm/simple-spark-cluster-node/Dockerfile
new file mode 100644
index 0000000..157f22e
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/Dockerfile
@@ -0,0 +1,92 @@
+FROM debian:stretch
+MAINTAINER Michael Kamprath "https://github.com/michaelkamprath"
+#
+# Base image for an Apache Spark standalone cluster.
+#
+# Inspired by https://hub.docker.com/r/gettyimages/spark/dockerfile
+#
+#
+# Expected volumes:
+#   /app/spark - this is the spark working directory
+#       - All nodes should mount the same data directory. This can be a GlusterFS or NFS mount.
+#
+# Expected service names:
+#   spark-master - the service where the spark master runs
+#
+
+ARG SPARK_VERSION=2.4.5
+ARG HADOOP_MINOR_VERSION=2.7
+ARG HADOOP_VERSION=2.7.2
+ARG SCALA_VERSION=2.11.12
+
+RUN apt-get update \
+    && apt-get install -y locales \
+    && dpkg-reconfigure -f noninteractive locales \
+    && locale-gen C.UTF-8 \
+    && /usr/sbin/update-locale LANG=C.UTF-8 \
+    && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \
+    && locale-gen \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+RUN apt-get update \
+    && apt-get install -y curl unzip procps \
+        python3 python3-setuptools \
+    && ln -s /usr/bin/python3 /usr/bin/python \
+    && easy_install3 pip py4j \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV PYTHONIOENCODING UTF-8
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+
+# JAVA & SCALA
+RUN apt-get update \
+    && apt-get install -y openjdk-8-jre \
+    && apt-get remove scala-library scala \
+    && curl -o scala-${SCALA_VERSION}.deb https://www.scala-lang.org/files/archive/scala-${SCALA_VERSION}.deb \
+    && dpkg -i scala-${SCALA_VERSION}.deb \
+    && apt-get clean \
+    && rm scala-${SCALA_VERSION}.deb \
+    && rm -rf /var/lib/apt/lists/*
+
+# create the user software will run from
+RUN useradd -m -s /bin/bash spark
+
+
+# SPARK
+ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_MINOR_VERSION}
+ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
+#ENV SPARK_DIST_CLASSPATH=""
+ENV HADOOP_CONF_DIR=${SPARK_HOME}/conf/
+ENV PATH $PATH:${SPARK_HOME}/bin
+ARG SPARK_DOWNLOAD_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz"
+RUN echo "Downloading Spark from : ${SPARK_DOWNLOAD_URL}\n" \
+    && curl -L --retry 3 \
+        $SPARK_DOWNLOAD_URL \
+    | gunzip \
+    | tar x -C /usr/ \
+    && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
+    && chown -R root:root $SPARK_HOME \
+    && ln -s $SPARK_HOME /usr/local/spark
+RUN mkdir -p /app/spark \
+    && chown spark -R /app/spark
+
+
+# add python libraries useful in PySpark
+RUN python3 -mpip install matplotlib \
+    && pip3 install pandas seaborn
+
+# copy Spark configurations
+COPY ./spark-conf/* $SPARK_HOME/conf/
+
+# set up command
+COPY start-spark-node.sh /
+COPY start-spark-master.sh /
+USER spark
+WORKDIR /home/spark
+CMD ["/bin/bash", "/start-spark-node.sh"]
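After `build-images.sh` runs, you can sanity-check the node image locally before the swarm pulls it. A sketch; it relies on `spark-submit` being on the image's `PATH` via `$SPARK_HOME/bin`, as set in the Dockerfile above:

```bash
# should report Spark 2.4.5 and the Scala version it was built against
docker run --rm simple-spark-cluster-node:latest spark-submit --version
```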
diff --git a/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
new file mode 100644
index 0000000..5e37377
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
@@ -0,0 +1,15 @@
+# performance optimizations
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.default.parallelism 100
+
+# worker node / executor set up
+# expecting a worker with 12 cores and 56g of memory
+spark.executor.memory 26g
+spark.executor.cores 6
+
+# driver configurations
+spark.driver.memory 6g
+spark.driver.cores 2
+
+# operational configurations
+spark.logConf true
diff --git a/simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh
similarity index 70%
rename from simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh
rename to simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh
index 79dee53..2709df8 100644
--- a/simple-spark-swarm/configured-spark-node/spark-conf/spark-env.sh
+++ b/simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-env.sh
@@ -1,8 +1,8 @@
 # the total amount of memory a worker (node) can use
-SPARK_WORKER_MEMORY=52g
+SPARK_WORKER_MEMORY=56g
 
 # the total amount of cores a worker (node) can use
-SPARK_WORKER_CORES=10
+SPARK_WORKER_CORES=12
 
 # the number of worker processes per node
 SPARK_WORKER_INSTANCES=1
@@ -16,3 +16,8 @@ PYSPARK_PYTHON=python3
 
 # hash seed so all nodes hash numbers consistently
 PYTHONHASHSEED=8675309
+
+# the location of spark working files
+SPARK_LOCAL_DIRS=/app/spark/tmp
+SPARK_WORKER_DIR=/app/spark/work
+SPARK_LOG_DIR=/app/spark/logs
diff --git a/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh b/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh
new file mode 100644
index 0000000..d70da86
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# start Spark master
+$SPARK_HOME/sbin/start-master.sh
+
+# now do nothing and do not exit
+while true; do sleep 3600; done
diff --git a/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh b/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh
new file mode 100644
index 0000000..a09b39c
--- /dev/null
+++ b/simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# start the spark worker
+$SPARK_HOME/sbin/start-slave.sh spark://spark-master:7077
+
+# now do nothing and do not exit
+while true; do sleep 3600; done
diff --git a/simple-spark-swarm/spark-jupyter-notebook/Dockerfile b/simple-spark-swarm/spark-jupyter-notebook/Dockerfile
deleted file mode 100644
index e81d519..0000000
--- a/simple-spark-swarm/spark-jupyter-notebook/Dockerfile
+++ /dev/null
@@ -1,12 +0,0 @@
-FROM configured-spark-node:latest
-
-RUN apt-get install -y g++
-RUN pip3 install jupyter
-RUN mkdir -p /home/jupyter/runtime
-
-COPY start-jupyter.sh /
-
-#EXPOSE 7777
-#EXPOSE 4040
-
-CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh b/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh
deleted file mode 100644
index 5595578..0000000
--- a/simple-spark-swarm/spark-jupyter-notebook/start-jupyter.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-XDG_RUNTIME_DIR=/home/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/jupyter/notebooks --ip=* --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
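Because `spark-env.sh` now points `SPARK_LOG_DIR` at `/app/spark/logs`, and the stack bind-mounts `/app/spark` from `/mnt/data/spark` on each node, the master and worker daemon logs can be read directly on any host (a sketch):

```bash
# daemon logs written by start-master.sh / start-slave.sh end up under SPARK_LOG_DIR,
# which is /mnt/data/spark/logs on the host
tail -n 50 /mnt/data/spark/logs/*.out
```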