From 069946fda1bcbd9aa7e97f4018aa8c45e33debb7 Mon Sep 17 00:00:00 2001 From: Michael Kamprath Date: Mon, 22 Jun 2020 19:36:50 -0700 Subject: [PATCH] updated to spark 3.0.0 --- spark-qfs-swarm/jupyter-server/start-jupyter.sh | 2 +- spark-qfs-swarm/qfs-master/qfs-conf/Metaserver.prp | 4 ++++ spark-qfs-swarm/worker-node/Dockerfile | 4 ++-- spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf | 5 ++++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/spark-qfs-swarm/jupyter-server/start-jupyter.sh b/spark-qfs-swarm/jupyter-server/start-jupyter.sh index 1625047..ffc9ea1 100644 --- a/spark-qfs-swarm/jupyter-server/start-jupyter.sh +++ b/spark-qfs-swarm/jupyter-server/start-jupyter.sh @@ -4,4 +4,4 @@ SHELL=/bin/bash \ XDG_RUNTIME_DIR=/home/spark/jupyter/runtime \ PYSPARK_DRIVER_PYTHON=jupyter \ PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token=''" \ - $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.8.0-spark2.4-s_2.11 --master spark://spark-master:7077 + $SPARK_HOME/bin/pyspark --packages graphframes:graphframes:0.8.0-spark3.0-s_2.12 --master spark://spark-master:7077 diff --git a/spark-qfs-swarm/qfs-master/qfs-conf/Metaserver.prp b/spark-qfs-swarm/qfs-master/qfs-conf/Metaserver.prp index 3d602dc..5ee4d4d 100644 --- a/spark-qfs-swarm/qfs-master/qfs-conf/Metaserver.prp +++ b/spark-qfs-swarm/qfs-master/qfs-conf/Metaserver.prp @@ -9,3 +9,7 @@ chunkServer.msgLogWriter.logLevel = NOTICE metaServer.rootDirMode = 0777 metaServer.rootDirGroup = 1000 metaServer.rootDirUser = 1000 + +metaServer.rebalancingEnabled = 1 +metaServer.maxRebalanceSpaceUtilThreshold = 0.50 +metaServer.minRebalanceSpaceUtilThreshold = 0.45 diff --git a/spark-qfs-swarm/worker-node/Dockerfile b/spark-qfs-swarm/worker-node/Dockerfile index 95d39a4..04f74ed 100644 --- a/spark-qfs-swarm/worker-node/Dockerfile +++ b/spark-qfs-swarm/worker-node/Dockerfile @@ -16,10 +16,10 @@ MAINTAINER Michael Kamprath "https://github.com/michaelkamprath" # ARG QFS_VERSION=2.2.0 -ARG SPARK_VERSION=2.4.6 +ARG SPARK_VERSION=3.0.0 ARG HADOOP_MINOR_VERSION=2.7 ARG HADOOP_VERSION=2.7.2 -ARG SCALA_VERSION=2.11.12 +ARG SCALA_VERSION=2.12.11 RUN apt-get update \ && apt-get install -y locales \ diff --git a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf index 8516515..b83e19b 100644 --- a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf +++ b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf @@ -1,6 +1,7 @@ # performance optimizations spark.serializer org.apache.spark.serializer.KryoSerializer -spark.default.parallelism 100 +spark.default.parallelism 200 +spark.sql.shuffle.partitions 400 # worker node / executor set up # expecting a worker with 12 cores and 56g of memory @@ -13,6 +14,8 @@ spark.driver.cores 2 # operational configurations spark.logConf true +spark.worker.cleanup.enabled true +spark.ui.reverseProxy true # This setting is to tell the class loaders in Spark that they # only need to load the QFS access libraries once