From 92345df7ec29d4183da4683db0f170e225b5b0d6 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sun, 5 Jan 2020 15:27:27 -0800
Subject: [PATCH] added full support for the Spark history server

---
 spark-qfs-swarm/README.md                                  | 6 ++++--
 spark-qfs-swarm/deploy-spark-qfs-swarm.yml                 | 1 +
 spark-qfs-swarm/spark-master/start-spark-master.sh         | 3 +++
 spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf | 6 ++++--
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/spark-qfs-swarm/README.md b/spark-qfs-swarm/README.md
index 885135e..de0d491 100644
--- a/spark-qfs-swarm/README.md
+++ b/spark-qfs-swarm/README.md
@@ -11,7 +11,7 @@ First, edit the following items as needed for your swarm:
 
 This set up depends on have a GlusterFS volume mounted at `/mnt/gfs` and a normal file system (such as XFS) at `/mnt/data` on all nodes and the following directories exist on it:
 
-* `/mnt/gfs/jupyter-notbooks` - used to persist the Jupyter notebooks. 
+* `/mnt/gfs/jupyter-notbooks` - used to persist the Jupyter notebooks.
 * `/mnt/gfs/data` - a location to transitionally store data that is accessible from the Jupyter server
 * `/mnt/data/qfs/logs` - where QFS will store it's logs
 * `/mnt/data/qfs/chunk` - Where the chunk servers of QFS will store the data
@@ -24,9 +24,10 @@ Before the first time you run this cluster, you will need to initialize the QFS
 ```
 docker run -it -u spark --mount type=bind,source=/mnt/data/qfs,target=/data/qfs master:5000/qfs-master:latest /bin/bash
 ```
-Then at the shell prompt in this container, run:
+Then at the shell prompt in this container, run the following to initialize QFS and create the directory for the Spark history server:
 ```
 $QFS_HOME/bin/metaserver -c $QFS_HOME/conf/Metaserver.prp
+qfs -mkdir /history/spark-event
 exit
 ```
 
@@ -49,5 +50,6 @@ Note that you must attach to the network on which the Docker spark cluster servi
 * `cptoqfs` - Copies files from the local file system (in the Docker container) to the QFS instance.
 * `cpfromqfs` - Copies files from the QFS instance to the local file system (in the Docker container)
 * `qfsshell` - A useful shell-style interface to the QFS instance
+* `qfsfsck` - Performs `fsck` on the QFS file system
 
 You might consider adding a volume mount to the `docker run` command so that the Docker container can access data from you local file system.
diff --git a/spark-qfs-swarm/deploy-spark-qfs-swarm.yml b/spark-qfs-swarm/deploy-spark-qfs-swarm.yml
index 1c92e82..4afa1d4 100644
--- a/spark-qfs-swarm/deploy-spark-qfs-swarm.yml
+++ b/spark-qfs-swarm/deploy-spark-qfs-swarm.yml
@@ -32,6 +32,7 @@ services:
       - 6066:6066
       - 7077:7077
       - 8080:8080
+      - 18080:18080
     volumes:
       - type: bind
         source: /mnt/data/spark
diff --git a/spark-qfs-swarm/spark-master/start-spark-master.sh b/spark-qfs-swarm/spark-master/start-spark-master.sh
index d70da86..28ca2d5 100644
--- a/spark-qfs-swarm/spark-master/start-spark-master.sh
+++ b/spark-qfs-swarm/spark-master/start-spark-master.sh
@@ -3,5 +3,8 @@
 # start Spark master
 $SPARK_HOME/sbin/start-master.sh
 
+# start the Spark history server
+$SPARK_HOME/sbin/start-history-server.sh
+
 # now do nothing and do not exit
 while true; do sleep 3600; done
diff --git a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf
index 8b77f62..6e0e0ab 100644
--- a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf
+++ b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf
@@ -21,5 +21,7 @@ spark.sql.hive.metastore.sharedPrefixes com.quantcast.qfs
 # Set up retention of Spark events to enable the history server.
 # The configured directory needs to be created prior to launching
 # Spark master.
-spark.eventLog.enabled true
-spark.eventLog.dir qfs:///history/spark/
+spark.eventLog.enabled           true
+spark.eventLog.dir               qfs:///history/spark-event/
+spark.history.fs.logDirectory    qfs:///history/spark-event/
+spark.history.fs.cleaner.maxAge  30d
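
One caveat on the spark-defaults.conf change: Spark's history server only prunes old event logs when its cleaner is switched on, and `spark.history.fs.cleaner.enabled` defaults to `false`, so `spark.history.fs.cleaner.maxAge 30d` has no effect on its own. If 30-day retention is the intent, the config would also need `spark.history.fs.cleaner.enabled true` (not part of this patch).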
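
For a quick smoke test after redeploying the stack: the history server listens on its default port, 18080, which the compose change now publishes, and completed applications should leave event logs under the new QFS directory. A sketch, where `swarm-node` stands in for any host in the swarm and the `qfs` command runs inside the qfs-master container exactly as in the README's initialization step:

    # the history server UI should respond on the newly published port;
    # "swarm-node" is a placeholder for one of your swarm hosts
    curl -s http://swarm-node:18080/ | grep -i history

    # run inside the qfs-master container; after an application completes,
    # its event log should appear under the directory created above
    qfs -ls /history/spark-event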