Updated simple Spark cluster configuration
Update the Docker configuration of the simple Spark cluster to bring it more in line with the lessons learned setting up the Spark-QFS cluster. This cluster depends on the stack being deployed with all nodes mounting a shared file system, such as GlusterFS or NFS. The main change over the previous version of the simple Spark cluster is that the Spark node Docker image is now built within this repository rather than pulled from a third party. The deployment of the Jupyter server is also improved.
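Before deploying, every swarm node needs the bind-mount source directories used in the compose file below. A quick sanity check to run on each node; the paths come from this commit, the check itself is illustrative:

# verify the shared and local mounts exist on every swarm node
mount | grep /mnt/gfs    # shared GlusterFS or NFS mount backing /data and the notebooks
ls -d /mnt/data/spark    # directory that backs Spark's /app/spark working directory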
1 parent bbfd876 · commit 48ccec5
Showing 14 changed files with 215 additions and 100 deletions.
15 changes: 0 additions & 15 deletions
simple-spark-swarm/configured-spark-node/spark-conf/spark-defaults.conf
This file was deleted.
@@ -1,99 +1,83 @@
version: '3.4'
services:
  spark-master:
    image: master:5000/configured-spark-node:latest
    command: bin/spark-class org.apache.spark.deploy.master.Master -h spark-master
    image: master:5000/simple-spark-cluster-node:latest
    command: ["/bin/bash", "/start-spark-master.sh"]
    hostname: spark-master
    environment:
      MASTER: spark://spark-master:7077
      SPARK_PUBLIC_DNS: 10.1.1.1
    expose:
      - 7001
      - 7002
      - 7003
      - 7004
      - 7005
      - 7006
      - 7077
      - 6066
    networks:
      - spark-network
      - cluster_network
    ports:
      - 6066:6066
      - 7077:7077
      - 8080:8080
      - 18080:18080
    volumes:
      - type: bind
        source: /mnt/gfs/data
        target: /data
      - type: bind
        source: /mnt/data/spark
        target: /app/spark
    deploy:
      resources:
        limits:
          cpus: "2.0"
          memory: 8G

          memory: 6g
  spark-worker:
    image: master:5000/configured-spark-node:latest
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
    image: master:5000/simple-spark-cluster-node:latest
    hostname: spark-worker
    environment:
      SPARK_PUBLIC_DNS: 10.1.1.1
    links:
    depends_on:
      - qfs-master
      - spark-master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 7016
      - 8881
    networks:
      - spark-network
      - cluster_network
    ports:
      - 8081:8081
    volumes:
      - type: bind
        source: /mnt/gfs/data
        target: /data
      - type: bind
        source: /mnt/data/spark
        target: /app/spark
    deploy:
      mode: replicated
      replicas: 4
      mode: global
      resources:
        limits:
          cpus: "6.0"
          memory: 52g

          memory: 56g
  spark-jupyter:
    image: master:5000/spark-jupyter-notebook:latest
    image: master:5000/simple-spark-cluster-jupyter:latest
    hostname: spark-jupyter
    environment:
      SPARK_PUBLIC_DNS: 10.1.1.1
    depends_on:
      - spark-master
      - spark-worker
    links:
      - spark-master
    networks:
      - spark-network
    expose:
      - 7777
      - 4040
      - cluster_network
    ports:
      - 7777:7777
      - 4040:4040
      - 4041:4041
    volumes:
      - type: bind
        source: /mnt/gfs/jupyter-notebooks
        target: /home/jupyter/notebooks
        target: /home/spark/jupyter/notebooks
      - type: bind
        source: /mnt/gfs/data
        target: /data
      - type: bind
        source: /mnt/data/spark
        target: /app/spark
    deploy:
      resources:
        limits:
          cpus: "2.0"
          memory: 10G
          memory: 6g

networks:
  spark-network:
  cluster_network:
    attachable: true
    ipam:
      driver: default
      config:
        - subnet: 10.20.30.0/24
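Because cluster_network is declared attachable, a one-off container can join the overlay for debugging. A minimal sketch, assuming the stack was deployed with something like docker stack deploy -c <compose file> spark; the stack name, and therefore the spark_ prefix on the network name, is an assumption:

# resolve the master service from a throwaway container on the same overlay
docker run --rm --network spark_cluster_network \
    master:5000/simple-spark-cluster-node:latest \
    getent hosts spark-master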
26 changes: 26 additions & 0 deletions
simple-spark-swarm/simple-spark-cluster-jupyter/Dockerfile
@@ -0,0 +1,26 @@
FROM simple-spark-cluster-node:latest

#
# Expected volumes:
#   /home/spark/jupyter/notebooks - where the Jupyter notebooks will be persisted
#   /app/spark - Spark's data directory
#

USER root
# the base image clears the apt lists, so update before installing
RUN apt-get update && apt-get install -y g++
RUN pip3 install notebook==5.7.8 \
        jupyter_nbextensions_configurator \
        jupyter_contrib_nbextensions
COPY start-jupyter.sh /

USER spark
RUN jupyter contrib nbextension install --user
RUN jupyter nbextensions_configurator enable --user
RUN jupyter nbextension enable toc2/main
RUN jupyter nbextension enable codefolding/main
RUN jupyter nbextension enable execute_time/ExecuteTime

RUN mkdir -p /home/spark/jupyter/runtime \
    && mkdir -p /home/spark/jupyter/notebooks

CMD ["/bin/bash", "/start-jupyter.sh"]
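Note the FROM line: this image builds on simple-spark-cluster-node:latest, so that image must be built (or pullable) first. A hedged build-and-push sketch; the registry tags mirror the compose file above, but the exact build workflow is an assumption:

cd simple-spark-swarm
# build the node image first; the Jupyter image derives FROM it
docker build -t simple-spark-cluster-node:latest simple-spark-cluster-node
docker build -t master:5000/simple-spark-cluster-jupyter:latest simple-spark-cluster-jupyter
# tag and push both so every swarm node can pull from the master:5000 registry
docker tag simple-spark-cluster-node:latest master:5000/simple-spark-cluster-node:latest
docker push master:5000/simple-spark-cluster-node:latest
docker push master:5000/simple-spark-cluster-jupyter:latest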
3 changes: 3 additions & 0 deletions
simple-spark-swarm/simple-spark-cluster-jupyter/start-jupyter.sh
@@ -0,0 +1,3 @@
#!/bin/bash

SHELL=/bin/bash XDG_RUNTIME_DIR=/home/spark/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token=''" $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 --master spark://spark-master:7077
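The trick here is that pyspark is the real entry point: PYSPARK_DRIVER_PYTHON swaps the driver's Python for Jupyter, so the notebook process is the Spark driver and gets a SparkContext for free. Once the stack is up, a quick reachability check; the 10.1.1.1 address comes from SPARK_PUBLIC_DNS in the compose file, and the check itself is illustrative:

# the notebook server answers on the published port 7777 with no token or password
curl -sI http://10.1.1.1:7777/ | head -n 1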
92 changes: 92 additions & 0 deletions
simple-spark-swarm/simple-spark-cluster-node/Dockerfile
@@ -0,0 +1,92 @@
FROM debian:stretch
MAINTAINER Michael Kamprath "https://github.com/michaelkamprath"
#
# Base image for an Apache Spark standalone cluster.
#
# Inspired by https://hub.docker.com/r/gettyimages/spark/dockerfile
#
#
# Expected volumes:
#   /app/spark - this is the Spark working directory
#   <data dir> - all nodes should mount the same data directory. This can be a GlusterFS or NFS mount.
#
# Expected service names:
#   spark-master - the service where the Spark master runs
#

ARG SPARK_VERSION=2.4.5
ARG HADOOP_MINOR_VERSION=2.7
ARG HADOOP_VERSION=2.7.2
ARG SCALA_VERSION=2.11.12

RUN apt-get update \
    && apt-get install -y locales \
    && dpkg-reconfigure -f noninteractive locales \
    && locale-gen C.UTF-8 \
    && /usr/sbin/update-locale LANG=C.UTF-8 \
    && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \
    && locale-gen \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

RUN apt-get update \
    && apt-get install -y curl unzip procps \
        python3 python3-setuptools \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && easy_install3 pip py4j \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# JAVA & SCALA
RUN apt-get update \
    && apt-get install -y openjdk-8-jre \
    && apt-get remove scala-library scala \
    && curl -o scala-${SCALA_VERSION}.deb https://www.scala-lang.org/files/archive/scala-${SCALA_VERSION}.deb \
    && dpkg -i scala-${SCALA_VERSION}.deb \
    && apt-get clean \
    && rm scala-${SCALA_VERSION}.deb \
    && rm -rf /var/lib/apt/lists/*

# create the user the software will run as
RUN useradd -m -s /bin/bash spark

# SPARK
ENV SPARK_PACKAGE spark-${SPARK_VERSION}-bin-hadoop${HADOOP_MINOR_VERSION}
ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
#ENV SPARK_DIST_CLASSPATH=""
ENV HADOOP_CONF_DIR=${SPARK_HOME}/conf/
ENV PATH $PATH:${SPARK_HOME}/bin
ARG SPARK_DOWNLOAD_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz"
RUN echo "Downloading Spark from : ${SPARK_DOWNLOAD_URL}\n" \
    && curl -L --retry 3 \
        $SPARK_DOWNLOAD_URL \
    | gunzip \
    | tar x -C /usr/ \
    && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
    && chown -R root:root $SPARK_HOME \
    && ln -s $SPARK_HOME /usr/local/spark
RUN mkdir -p /app/spark \
    && chown spark -R /app/spark

# add python libraries useful in PySpark
RUN python3 -mpip install matplotlib \
    && pip3 install pandas seaborn

# copy Spark configurations
COPY ./spark-conf/* $SPARK_HOME/conf/

# set up command
COPY start-spark-node.sh /
COPY start-spark-master.sh /
USER spark
WORKDIR /home/spark
CMD ["/bin/bash", "/start-spark-node.sh"]
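A hedged smoke test for the freshly built image; the local tag is an assumption:

docker build -t simple-spark-cluster-node:latest simple-spark-swarm/simple-spark-cluster-node
# PATH includes $SPARK_HOME/bin, so the Spark tools resolve directly
docker run --rm simple-spark-cluster-node:latest spark-submit --version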
15 changes: 15 additions & 0 deletions
simple-spark-swarm/simple-spark-cluster-node/spark-conf/spark-defaults.conf
@@ -0,0 +1,15 @@
# performance optimizations
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.default.parallelism 100

# worker node / executor set up
# expecting a worker with 10 cores and 56g of memory
spark.executor.memory 26g
spark.executor.cores 6

# driver configurations
spark.driver.memory 6g
spark.driver.cores 2

# operational configurations
spark.logConf true
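These are cluster-wide defaults; individual jobs can still override them at submit time. A sketch, where my_job.py is a hypothetical application and the values are illustrative:

$SPARK_HOME/bin/spark-submit \
    --master spark://spark-master:7077 \
    --conf spark.executor.memory=13g \
    --conf spark.executor.cores=3 \
    my_job.py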
7 changes: 7 additions & 0 deletions
simple-spark-swarm/simple-spark-cluster-node/start-spark-master.sh
@@ -0,0 +1,7 @@
#!/bin/bash

# start Spark master
$SPARK_HOME/sbin/start-master.sh

# now do nothing and do not exit
while true; do sleep 3600; done
7 changes: 7 additions & 0 deletions
simple-spark-swarm/simple-spark-cluster-node/start-spark-node.sh
@@ -0,0 +1,7 @@
#!/bin/bash

# start the spark worker
$SPARK_HOME/sbin/start-slave.sh spark://spark-master:7077

# now do nothing and do not exit
while true; do sleep 3600; done
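With mode: global in the compose file, one worker starts per swarm node and registers itself with spark-master. A hedged check that registration worked, using the standalone master's JSON status endpoint; the 10.1.1.1 address comes from the compose file above:

# count live workers via the master web UI's JSON endpoint (published on port 8080)
curl -s http://10.1.1.1:8080/json | \
    python3 -c "import json,sys; print(len(json.load(sys.stdin)['workers']), 'workers registered')"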