updated qfs configuration
Updated the QFS configuration by consolidating its settings into spark-defaults.conf
and made 2x replication the default for files created on QFS. Also made the build-images.sh
script able to take arguments that are passed through to the docker image builds, and made the
worker-node Dockerfile configurable through ARG parameters. The intent here is to be able
to build the images with different versions of key software.
michaelkamprath committed Feb 9, 2020
1 parent a956d82 commit 9e10e00
Showing 5 changed files with 60 additions and 58 deletions.
25 changes: 21 additions & 4 deletions spark-qfs-swarm/build-images.sh
@@ -2,11 +2,28 @@

set -e

DOCKER_BUILD_ARGS=

# -b <args> : additional arguments passed verbatim to each docker build command
while getopts b: option
do
case "${option}"
in
b) DOCKER_BUILD_ARGS=${OPTARG};;
esac
done

if [ -z "$DOCKER_BUILD_ARGS" ]
then
echo "Building with default docker options"
else
echo "Building with docker arguments = '$DOCKER_BUILD_ARGS'"
fi

# build images
docker build -t worker-node:latest ./worker-node
docker build -t qfs-master:latest ./qfs-master
docker build -t spark-master:latest ./spark-master
docker build -t jupyter-server:latest ./jupyter-server
docker build -t worker-node:latest $DOCKER_BUILD_ARGS ./worker-node
docker build -t qfs-master:latest $DOCKER_BUILD_ARGS ./qfs-master
docker build -t spark-master:latest $DOCKER_BUILD_ARGS ./spark-master
docker build -t jupyter-server:latest $DOCKER_BUILD_ARGS ./jupyter-server

# tag image with local repository
docker tag worker-node:latest master:5000/worker-node:latest
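
For example, the new -b option forwards its argument string, unquoted, to every docker build invocation, so several --build-arg flags can be passed at once. A hypothetical invocation (the version numbers shown must actually exist upstream):

    ./build-images.sh -b "--build-arg SPARK_VERSION=2.4.5 --build-arg HADOOP_VERSION=2.7.7"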
3 changes: 2 additions & 1 deletion spark-qfs-swarm/qfs-master/Dockerfile
@@ -26,6 +26,7 @@ RUN echo 'export PATH=$PATH:$QFS_HOME/bin/:$QFS_HOME/bin/tools/' >> ~/.bash_alia
&& echo 'alias cptoqfs="cptoqfs -s qfs-master -p 20000"' >> ~/.bash_aliases \
&& echo 'alias cpfromqfs="cpfromqfs -s qfs-master -p 20000"' >> ~/.bash_aliases \
&& echo 'alias qfsshell="qfsshell -s qfs-master -p 20000"' >> ~/.bash_aliases \
&& echo 'alias qfsfsck="qfsfsck -s qfs-master -p 20000"' >> ~/.bash_aliases
&& echo 'alias qfsfsck="qfsfsck -s qfs-master -p 20000"' >> ~/.bash_aliases \
&& echo 'alias qfsfileenum="qfsfileenum -s qfs-master -p 20000"' >> ~/.bash_aliases

CMD ["/bin/bash", "/start-qfs-master.sh"]
39 changes: 22 additions & 17 deletions spark-qfs-swarm/worker-node/Dockerfile
@@ -15,11 +15,11 @@ MAINTAINER Michael Kamprath "https://github.com/michaelkamprath"
# spark-master - the service where the spark master runs
#

ENV QFS_VERSION 2.1.3
ENV SPARK_VERSION 2.4.4
ENV HADOOP_MINOR_VERSION 2.7
ENV HADOOP_VERSION 2.7.2
ENV SCALA_VERSION 2.11.12
ARG QFS_VERSION=2.1.3
ARG SPARK_VERSION=2.4.4
ARG HADOOP_MINOR_VERSION=2.7
ARG HADOOP_VERSION=2.7.2
ARG SCALA_VERSION=2.11.12

RUN apt-get update \
&& apt-get install -y locales \
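
Because these are now ARG rather than ENV instructions, each can be overridden at build time without editing the Dockerfile. One side effect worth noting: ARG values exist only during the build, so they no longer appear in the running container's environment (the later ENV lines that interpolate them still do). A minimal sketch, with a hypothetical version:

    docker build --build-arg SPARK_VERSION=2.4.5 -t worker-node:latest ./worker-node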
@@ -61,14 +61,17 @@ RUN apt-get update \
RUN useradd -m -s /bin/bash spark

# QFS
ENV QFS_PACKAGE qfs-debian-9-${QFS_VERSION}-x86_64
#ENV QFS_PACKAGE qfs-debian-9-${QFS_VERSION}-x86_64
ENV QFS_PACKAGE qfs-debian-9-de3b9d66-x86_64
ENV QFS_HOME /usr/qfs-${QFS_VERSION}
ENV QFS_LOGS_DIR /data/qfs/logs
ENV LD_LIBRARY_PATH ${QFS_HOME}/lib
RUN curl -sL --retry 3 \
"https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-${QFS_VERSION}-x86_64.tgz" \
| gunzip \
| tar x -C /usr/ \
ARG QFS_DOWNLOAD_URL="https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-${QFS_VERSION}-x86_64.tgz"
RUN echo "Downloading QFS from : ${QFS_DOWNLOAD_URL}\n" \
&& curl -L --retry 3 \
$QFS_DOWNLOAD_URL \
| gunzip \
| tar x -C /usr/ \
&& mv /usr/$QFS_PACKAGE $QFS_HOME \
&& chown -R root:root $QFS_HOME \
&& ln -s $QFS_HOME /usr/local/qfs
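
Making the download URL an ARG means the QFS tarball can come from a mirror or a custom build. Note that QFS_PACKAGE is hardcoded above to qfs-debian-9-de3b9d66-x86_64, so whatever the URL serves must unpack to that directory name or the subsequent mv will fail. A hypothetical override:

    docker build --build-arg QFS_DOWNLOAD_URL=https://example.com/qfs/qfs-debian-9-de3b9d66-x86_64.tgz -t worker-node:latest ./worker-node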
@@ -82,13 +85,15 @@ ENV SPARK_HOME /usr/spark-${SPARK_VERSION}
ENV SPARK_DIST_CLASSPATH="$QFS_HOME/lib/hadoop-$HADOOP_VERSION-qfs-$QFS_VERSION.jar:$QFS_HOME/lib/qfs-access-$QFS_VERSION"
ENV HADOOP_CONF_DIR=${SPARK_HOME}/conf/
ENV PATH $PATH:${SPARK_HOME}/bin
RUN curl -sL --retry 3 \
"https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \
| gunzip \
| tar x -C /usr/ \
&& mv /usr/$SPARK_PACKAGE $SPARK_HOME \
&& chown -R root:root $SPARK_HOME \
&& ln -s $SPARK_HOME /usr/local/spark
ARG SPARK_DOWNLOAD_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz"
RUN echo "Downloading Spark from : ${SPARK_DOWNLOAD_URL}\n" \
&& curl -L --retry 3 \
$SPARK_DOWNLOAD_URL \
| gunzip \
| tar x -C /usr/ \
&& mv /usr/$SPARK_PACKAGE $SPARK_HOME \
&& chown -R root:root $SPARK_HOME \
&& ln -s $SPARK_HOME /usr/local/spark
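
The Spark download is parameterized the same way, and the same caveat applies: a substitute URL must unpack to the directory named by SPARK_PACKAGE. For example, against a hypothetical mirror:

    docker build --build-arg SPARK_DOWNLOAD_URL=https://mirror.example.com/path/to/spark-package.tgz -t worker-node:latest ./worker-node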
RUN mkdir -p /data/spark \
&& chown spark -R /data/spark

32 changes: 0 additions & 32 deletions spark-qfs-swarm/worker-node/spark-conf/core-site.xml

This file was deleted.

19 changes: 15 additions & 4 deletions spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf
@@ -5,14 +5,14 @@ spark.default.parallelism 100
# worker node / executor set up
# expecting a worker with 10 cores and 56g of memory
spark.executor.memory 26g
spark.executor.cores 6

# driver configurations
spark.driver.memory 6g
spark.driver.cores 2

# operational configurations
spark.logConf true

# This setting is to tell the class loaders in Spark that they
# only need to load the QFS access libraries once
@@ -25,3 +25,14 @@ spark.eventLog.enabled true
spark.eventLog.dir qfs:///history/spark-event/
spark.history.fs.logDirectory qfs:///history/spark-event/
spark.history.fs.cleaner.maxAge 30d

# Configure QFS here rather than in core-site.xml
spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem
spark.hadoop.fs.defaultFS qfs://qfs-master:20000
spark.hadoop.fs.qfs.metaServerHost qfs-master
spark.hadoop.fs.qfs.metaServerPort 20000

# this spark.hadoop.fs.qfs.createParams configuration causes files written by Spark to
# QFS to be 2x replicated rather than using Reed-Solomon encoding. If you have at
# least 9 chunkservers, remove this configuration to instead use Reed-Solomon encoding.
spark.hadoop.fs.qfs.createParams 2
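
Because Spark strips the spark.hadoop. prefix and hands the rest to the underlying Hadoop configuration, any of these settings can also be overridden per job rather than globally. Assuming QFS accepts a plain replication count here, as the value 2 above suggests, a single job could request 3x replication like so (my_job.py is a placeholder):

    spark-submit --conf spark.hadoop.fs.qfs.createParams=3 my_job.py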
