From ebbb89428c78b073d3a7768d9ac01a4e0de7bbfc Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Thu, 12 Sep 2019 09:01:07 -0700
Subject: [PATCH] added persistence for jupyter notebooks

---
 spark-on-docker-swarm/README.md                         | 7 +++----
 spark-on-docker-swarm/deploy-spark-swarm.yml            | 7 ++++++-
 spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile | 1 -
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/spark-on-docker-swarm/README.md b/spark-on-docker-swarm/README.md
index 21b8ccc..5321d09 100644
--- a/spark-on-docker-swarm/README.md
+++ b/spark-on-docker-swarm/README.md
@@ -10,18 +10,17 @@ First, edit the following items as needed for your swarm:
 3. `build-images.sh`: Adjust the IP address for your local Docker registry. You can use a domain name if all nodes in your swarm can resolve it. This is needed as it allows all nodes in the swarm to pull the locally built Docker images.
 4. `spark-deploy.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long term 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
 
-Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
+This setup depends on a GlusterFS volume being mounted at `/mnt/gfs` on all nodes, with a directory `/mnt/gfs/jupyter-notbooks` existing on it. Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
 ```
 ./build-images.sh
 docker stack deploy -c deploy-spark-swarm.yml spark
 ```
-Then point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
+Point your development computer's browser at `http://swarm-public-ip:7777/` to load the Jupyter notebook.
 
 ## TODO
 This cluster is a work in progress. Currently, the following items are missing:
 
-* Persistence for Jupyter notebooks. Once you bring down the cluster, all notebooks you made are deleted.
 * A distributed file system, such as HDFS or QFS. Currently there is no way to ingest data into the cluster except through network transfers, such as through `curl`, set up in a Jupyter notebook.
 
 ## Acknowledgements
-The docker configuration leverages the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
\ No newline at end of file
+The docker configuration leverages the [`gettyimages/spark`](https://hub.docker.com/r/gettyimages/spark/) Docker image as a starting point.
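As a rough sketch of the GlusterFS prerequisite described in the README change above, the notebook directory could be prepared once from any node, assuming `/mnt/gfs` is already a GlusterFS mount on every swarm node (the permission change shown is only one quick way to let the notebook process write; tighten it to your own needs):

```
# Assumes /mnt/gfs is an existing GlusterFS mount replicated to all swarm nodes.
mkdir -p /mnt/gfs/jupyter-notbooks

# Sanity check: the filesystem type should report as fuse.glusterfs, not a local disk.
df -hT /mnt/gfs

# The Jupyter process in the container may not run as root; loosening permissions
# is one quick way to allow writes (adjust for your environment's security needs).
chmod 777 /mnt/gfs/jupyter-notbooks
```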
diff --git a/spark-on-docker-swarm/deploy-spark-swarm.yml b/spark-on-docker-swarm/deploy-spark-swarm.yml
index 425a403..643f4b9 100644
--- a/spark-on-docker-swarm/deploy-spark-swarm.yml
+++ b/spark-on-docker-swarm/deploy-spark-swarm.yml
@@ -1,4 +1,4 @@
-version: '3'
+version: '3.4'
 services:
   spark-master:
     image: master:5000/configured-spark-node:latest
@@ -73,6 +73,10 @@ services:
     ports:
       - 7777:7777
       - 4040:4040
+    volumes:
+      - type: bind
+        source: /mnt/gfs/jupyter-notbooks
+        target: /home/jupyter/notebooks
     deploy:
       resources:
         limits:
@@ -81,3 +85,4 @@ services:
 
 networks:
   spark-network:
+
diff --git a/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile b/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
index 90c4c16..e81d519 100644
--- a/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
+++ b/spark-on-docker-swarm/spark-jupyter-notebook/Dockerfile
@@ -2,7 +2,6 @@ FROM configured-spark-node:latest
 
 RUN apt-get install -y g++
 RUN pip3 install jupyter
-RUN mkdir -p /home/jupyter/notebooks
 RUN mkdir -p /home/jupyter/runtime
 
 COPY start-jupyter.sh /
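One way to check that the bind mount added above behaves as intended after deploying the stack is sketched below; the `jupyter` name filter is an assumption about how the service is named, so verify it with `docker service ls` first:

```
# On the node running the Jupyter service, find its container and inspect the
# bind-mounted notebook directory inside it.
CONTAINER=$(docker ps --filter name=jupyter --format '{{.ID}}' | head -n 1)
docker exec "$CONTAINER" ls -l /home/jupyter/notebooks

# Notebooks saved through the web UI should also show up on the GlusterFS mount:
ls -l /mnt/gfs/jupyter-notbooks

# And they should survive a full teardown and redeploy of the stack:
docker stack rm spark
docker stack deploy -c deploy-spark-swarm.yml spark
```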