From 882c37a147cb1a94e08a184dee32a7b7bfefa542 Mon Sep 17 00:00:00 2001 From: Michael Kamprath Date: Tue, 26 May 2020 01:20:00 -0700 Subject: [PATCH] minor tweaks --- spark-qfs-swarm/qfs-master/Dockerfile | 2 +- spark-qfs-swarm/worker-node/Dockerfile | 8 ++--- .../spark-conf/spark-defaults.conf | 36 +++++++++---------- .../worker-node/spark-conf/spark-env.sh | 4 +-- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/spark-qfs-swarm/qfs-master/Dockerfile b/spark-qfs-swarm/qfs-master/Dockerfile index 5144e6b..47abc49 100644 --- a/spark-qfs-swarm/qfs-master/Dockerfile +++ b/spark-qfs-swarm/qfs-master/Dockerfile @@ -10,7 +10,7 @@ FROM worker-node:latest # need python 2 for webserver USER root RUN apt-get update \ - && apt-get install -y python2.7 less wget vim openssh-client \ + && apt-get install -y python2.7 wget vim openssh-client \ && ln -s /usr/bin/python2.7 /usr/bin/python2 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/spark-qfs-swarm/worker-node/Dockerfile b/spark-qfs-swarm/worker-node/Dockerfile index 201a55d..8b947b2 100644 --- a/spark-qfs-swarm/worker-node/Dockerfile +++ b/spark-qfs-swarm/worker-node/Dockerfile @@ -1,7 +1,7 @@ FROM debian:stretch MAINTAINER Michael Kamprath "https://github.com/michaelkamprath" # -# Base image for Apace Spak standalone cluster with QFS +# Base image for Apache Spark standalone cluster with QFS # # Inspired by https://hub.docker.com/r/gettyimages/spark/dockerfile # @@ -30,13 +30,13 @@ RUN apt-get update \ && locale-gen \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* - + ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 RUN apt-get update \ - && apt-get install -y curl unzip procps \ + && apt-get install -y less curl unzip procps \ python3 python3-setuptools \ libboost-regex-dev \ && ln -s /usr/bin/python3 /usr/bin/python \ @@ -95,7 +95,7 @@ RUN echo "Downloading Spark from : ${SPARK_DOWNLOAD_URL}\n" \ && ln -s $SPARK_HOME /usr/local/spark RUN mkdir -p /data/spark \ && chown spark 
-R /data/spark - + # add python libraries useful in PySpark RUN python3 -mpip install matplotlib \ diff --git a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf index 6cef406..8516515 100644 --- a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf +++ b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf @@ -1,38 +1,38 @@ # performance optimizations -spark.serializer org.apache.spark.serializer.KryoSerializer -spark.default.parallelism 100 +spark.serializer org.apache.spark.serializer.KryoSerializer +spark.default.parallelism 100 # worker node / executor set up -# expecting a worker with 10 cores and 56g of memory -spark.executor.memory 26g -spark.executor.cores 6 +# expecting a worker with 12 cores and 56g of memory +spark.executor.memory 25g +spark.executor.cores 6 # driver configurations -spark.driver.memory 6g -spark.driver.cores 2 +spark.driver.memory 8g +spark.driver.cores 2 # operational configurations -spark.logConf true +spark.logConf true # This setting is to tell the class loaders in Spark that they -# only need to load the QFS access libraries once +# only need to load the QFS access libraries once spark.sql.hive.metastore.sharedPrefixes com.quantcast.qfs # Set up retention of Spark events to enable the history server. # The configured directory needs to be created prior to launching # Spark master. 
-spark.eventLog.enabled true -spark.eventLog.dir qfs:///history/spark-event/ -spark.history.fs.logDirectory qfs:///history/spark-event/ -spark.history.fs.cleaner.maxAge 30d +spark.eventLog.enabled true +spark.eventLog.dir qfs:///history/spark-event/ +spark.history.fs.logDirectory qfs:///history/spark-event/ +spark.history.fs.cleaner.maxAge 30d # Configure QFS here rather than in core-site.xml -spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem -spark.hadoop.fs.defaultFS qfs://qfs-master:20000 -spark.hadoop.fs.qfs.metaServerHost qfs-master -spark.hadoop.fs.qfs.metaServerPort 20000 +spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem +spark.hadoop.fs.defaultFS qfs://qfs-master:20000 +spark.hadoop.fs.qfs.metaServerHost qfs-master +spark.hadoop.fs.qfs.metaServerPort 20000 # this spark.hadoop.fs.qfs.createParams configure causes files written by Sark to # QFS to be 2x replicated rather than using Reed-Solomon encoding. If you have at # least 9 chunkservers, remove this configuration to instead use Reed-Solomon encoding. -spark.hadoop.fs.qfs.createParams 2 +spark.hadoop.fs.qfs.createParams 2 diff --git a/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh b/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh index c61c333..32132f2 100644 --- a/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh +++ b/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh @@ -1,5 +1,5 @@ # the total amount of memory a worker (node) can use -SPARK_WORKER_MEMORY=56g +SPARK_WORKER_MEMORY=55g # the total amount of cores a worker (node) can use SPARK_WORKER_CORES=12 @@ -12,7 +12,7 @@ SPARK_WORKER_PORT=8881 SPARK_WORKER_WEBUI_PORT=8081 # which python the spark cluster should use for pyspark -PYSPARK_PYTHON=python3 +PYSPARK_PYTHON=python3 # hash seed so all node hash numbers consistently PYTHONHASHSEED=8675309