From 4f20478df0f1ba3bc29d322a5258e379092afc21 Mon Sep 17 00:00:00 2001
From: Michael Kamprath
Date: Sat, 23 Nov 2019 01:03:59 -0800
Subject: [PATCH] use spark user to run processes

---
 spark-qfs-swarm/README.md                  | 21 ++++++++++++++-----
 spark-qfs-swarm/deploy-spark-qfs-swarm.yml | 12 +++++------
 spark-qfs-swarm/jupyter-server/Dockerfile  | 13 ++++++++++--
 .../jupyter-server/start-jupyter.sh        |  2 +-
 spark-qfs-swarm/qfs-master/Dockerfile      | 16 +++++++-------
 spark-qfs-swarm/spark-master/Dockerfile    |  4 +++-
 spark-qfs-swarm/worker-node/Dockerfile     | 18 +++++++++++-----
 .../worker-node/qfs-conf/Chunkserver.prp   |  2 +-
 .../worker-node/spark-conf/spark-env.sh    |  4 +++-
 9 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/spark-qfs-swarm/README.md b/spark-qfs-swarm/README.md
index dc8d041..885135e 100644
--- a/spark-qfs-swarm/README.md
+++ b/spark-qfs-swarm/README.md
@@ -7,19 +7,30 @@ First, edit the following items as needed for your swarm:
 1. `worker-node -> spark-conf -> spark-env.sh`: adjust the environment variables as appropriate for your cluster's nodes, most notably `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES`. Leave 1-2 cores and at least 10% of RAM for other processes.
 2. `worker-node -> spark-conf -> spark-env.sh`: Adjust the memory and core settings for the executors and driver. Each executor should have about 5 cores (if possible), and should be a whole divisor into `SPARK_WORKER_CORES`. Spark will launch as many executors as `SPARK_WORKER_CORES` divided by `spark.executor.cores`. Reserve about 7-8% of `SPARK_WORKER_MEMORY` for overhead when setting `spark.executor.memory`.
 3. `build-images.sh`: Adjust the IP address for your local Docker registry that all nodes in your cluster can access. You can use a domain name if all nodes in your swarm can resolve it. This is needed as it allows all nodes in the swarm to pull the locally built Docker images.
-4. `deploy-spark-qfs-swarm.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long term 100% CPU load well (e.g., overheats). Also adjust the `replicas` count for the `spark-worker` service to be equal to the number of nodes in your swarm (or less).
+4. `deploy-spark-qfs-swarm.yml`: Adjust all image names for the updated local Docker registry address you used in the prior step. Also, adjust the resource limits for each of the services. Setting a `cpus` limit here that is smaller than the number of cores on your node has the effect of giving your process a fraction of each core's capacity. You might consider doing this if your swarm hosts other services or does not handle long-term 100% CPU load well (e.g., overheats).
 
-This set up depends on have a GlusterFS volume mounted at `/mnt/gfs` on all nodes and the following directories exist on it:
+This setup depends on having a GlusterFS volume mounted at `/mnt/gfs` and a normal file system (such as XFS) mounted at `/mnt/data` on all nodes, with the following directories existing on them:
 * `/mnt/gfs/jupyter-notbooks` - used to persist the Jupyter notebooks.
+* `/mnt/gfs/data` - a location for temporarily staging data that is accessible from the Jupyter server
 * `/mnt/data/qfs/logs` - where QFS will store it's logs
 * `/mnt/data/qfs/chunk` - Where the chunk servers of QFS will store the data
-* `/mnt/data/qfs/checkpoint` - Where the QFS metaserver will store the fulesystem check points
+* `/mnt/data/qfs/checkpoint` - Where the QFS metaserver will store the filesystem checkpoints. This only needs to exist on the master node.
 * `/mnt/data/spark` - The local working directory for spark
 
 You can adjust these as you see fit, but be sure to update the mounts specified in `deploy-spark-qfs-swarm.yml`.
 
-Then, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
+Before the first time you run this cluster, you will need to initialize the QFS file system. Do so by launching a qfs-master container on the master node:
+```
+docker run -it -u spark --mount type=bind,source=/mnt/data/qfs,target=/data/qfs master:5000/qfs-master:latest /bin/bash
+```
+Then, at the shell prompt in this container, run:
+```
+$QFS_HOME/bin/metaserver -c $QFS_HOME/conf/Metaserver.prp
+exit
+```
+
+Finally, to start up the Spark cluster in your Docker swarm, `cd` into this project's directory and:
 ```
 ./build-images.sh
 docker stack deploy -c deploy-spark-qfs-swarm.yml spark
@@ -30,7 +41,7 @@ Point your development computer's browser at `http://swarm-public-ip:7777/` to l
 ### Working with QFS
 To launch a Docker container to give you command line access to QFS, use the following command:
 ```
-docker run -it --network="spark_cluster_network" master:5000/qfs-master:latest /bin/bash
+docker run -it --network="spark_cluster_network" -u spark master:5000/qfs-master:latest /bin/bash
 ```
 Note that you must attach to the network on which the Docker spark cluster services are using.
 From this command prompt, the following commands are pre-configured to connect to the QFS instance:
diff --git a/spark-qfs-swarm/deploy-spark-qfs-swarm.yml b/spark-qfs-swarm/deploy-spark-qfs-swarm.yml
index d57996d..8458025 100644
--- a/spark-qfs-swarm/deploy-spark-qfs-swarm.yml
+++ b/spark-qfs-swarm/deploy-spark-qfs-swarm.yml
@@ -26,7 +26,6 @@ services:
     hostname: spark-master
     environment:
       - SPARK_PUBLIC_DNS=10.1.1.1
-      - SPARK_LOG_DIR=/data/spark/logs
     networks:
       - cluster_network
     ports:
@@ -47,7 +46,6 @@ services:
     hostname: jupyter-server
     environment:
       - SPARK_PUBLIC_DNS=10.1.1.1
-      - SPARK_LOG_DIR=/data/spark/logs
     depends_on:
       - spark-master
       - qfs-master
@@ -60,10 +58,13 @@ services:
     volumes:
       - type: bind
         source: /mnt/gfs/jupyter-notebooks
-        target: /home/jupyter/notebooks
+        target: /home/spark/jupyter/notebooks
+      - type: bind
+        source: /mnt/data/spark
+        target: /data/spark
       - type: bind
         source: /mnt/gfs/data
-        target: /data
+        target: /gfs/data
     deploy:
       resources:
         limits:
@@ -74,7 +75,6 @@ services:
     hostname: worker
     environment:
       - SPARK_PUBLIC_DNS=10.1.1.1
-      - SPARK_LOG_DIR=/data/spark/logs
     depends_on:
       - qfs-master
       - spark-master
@@ -93,7 +93,7 @@ services:
       mode: global
       resources:
         limits:
-          cpus: "6.0"
+          cpus: "8.0"
           memory: 56g
 networks:
   cluster_network:
diff --git a/spark-qfs-swarm/jupyter-server/Dockerfile b/spark-qfs-swarm/jupyter-server/Dockerfile
index 71fe9f1..8842feb 100644
--- a/spark-qfs-swarm/jupyter-server/Dockerfile
+++ b/spark-qfs-swarm/jupyter-server/Dockerfile
@@ -1,9 +1,18 @@
 FROM qfs-master:latest
+#
+# Expected volumes:
+# /home/spark/jupyter/notebooks - where the Jupyter notebooks will be persisted
+# /data/spark - Spark's data directory
+#
+
+USER root
 
 RUN apt-get install -y g++
 RUN pip3 install jupyter
 
-RUN mkdir -p /home/jupyter/runtime
-
 COPY start-jupyter.sh /
+USER spark
+RUN mkdir -p /home/spark/jupyter/runtime \
+    && mkdir -p /home/spark/jupyter/notebooks
+
 CMD ["/bin/bash", "/start-jupyter.sh"]
diff --git a/spark-qfs-swarm/jupyter-server/start-jupyter.sh b/spark-qfs-swarm/jupyter-server/start-jupyter.sh
index aed9cd3..7060e6b 100644
--- a/spark-qfs-swarm/jupyter-server/start-jupyter.sh
+++ b/spark-qfs-swarm/jupyter-server/start-jupyter.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-SHELL=/bin/bash XDG_RUNTIME_DIR=/home/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/jupyter/notebooks --ip=* --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
+SHELL=/bin/bash XDG_RUNTIME_DIR=/home/spark/jupyter/runtime PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=* --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password=''" $SPARK_HOME/bin/pyspark --master spark://spark-master:7077
diff --git a/spark-qfs-swarm/qfs-master/Dockerfile b/spark-qfs-swarm/qfs-master/Dockerfile
index e037ea0..6ed6d44 100644
--- a/spark-qfs-swarm/qfs-master/Dockerfile
+++ b/spark-qfs-swarm/qfs-master/Dockerfile
@@ -8,7 +8,7 @@ FROM worker-node:latest
 
 #
 # need python 2 for webserver
-
+USER root
 RUN apt-get update \
     && apt-get install -y python2.7 less wget vim openssh-client \
     && ln -s /usr/bin/python2.7 /usr/bin/python2 \
@@ -17,13 +17,15 @@ RUN apt-get update \
 
 # set configuration
 COPY ./qfs-conf/* $QFS_HOME/conf/
+COPY start-qfs-master.sh /
+USER spark
 
 # create some useful bash aliases for when at bash shell prompt of this image
-RUN echo 'export PATH=$PATH:$QFS_HOME/bin/:$QFS_HOME/bin/tools/' >> ~/.bashrc \
-    && echo 'alias qfs="qfs -fs qfs://qfs-master:20000"' >> ~/.bashrc \
-    && echo 'alias cptoqfs="cptoqfs -s qfs-master -p 20000"' >> ~/.bashrc \
-    && echo 'alias cpfromqfs="cpfromqfs -s qfs-master -p 20000"' >> ~/.bashrc \
-    && echo 'alias qfsshell="qfsshell -s qfs-master -p 20000"' >> ~/.bashrc
+RUN echo 'export PATH=$PATH:$QFS_HOME/bin/:$QFS_HOME/bin/tools/' >> ~/.bash_aliases \
+    && echo 'alias qfs="qfs -fs qfs://qfs-master:20000"' >> ~/.bash_aliases \
+    && echo 'alias cptoqfs="cptoqfs -s qfs-master -p 20000"' >> ~/.bash_aliases \
+    && echo 'alias cpfromqfs="cpfromqfs -s qfs-master -p 20000"' >> ~/.bash_aliases \
+    && echo 'alias qfsshell="qfsshell -s qfs-master -p 20000"' >> ~/.bash_aliases \
+    && echo 'alias qfsfsck="qfsfsck -s qfs-master -p 20000"' >> ~/.bash_aliases
 
-COPY start-qfs-master.sh /
 CMD ["/bin/bash", "/start-qfs-master.sh"]
diff --git a/spark-qfs-swarm/spark-master/Dockerfile b/spark-qfs-swarm/spark-master/Dockerfile
index d22e2b1..9d64e80 100644
--- a/spark-qfs-swarm/spark-master/Dockerfile
+++ b/spark-qfs-swarm/spark-master/Dockerfile
@@ -4,6 +4,8 @@ FROM worker-node:latest
 # Expected volumes:
 # /data/spark - this is the spark working directory
 #
-
+USER root
 COPY start-spark-master.sh /
+
+USER spark
 CMD ["/bin/bash", "/start-spark-master.sh"]
diff --git a/spark-qfs-swarm/worker-node/Dockerfile b/spark-qfs-swarm/worker-node/Dockerfile
index 13a8add..bcac2db 100644
--- a/spark-qfs-swarm/worker-node/Dockerfile
+++ b/spark-qfs-swarm/worker-node/Dockerfile
@@ -38,6 +38,10 @@ RUN apt-get update \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# add python libraries useful in PySpark
+RUN python3 -mpip install matplotlib \
+    && pip3 install pandas
+
 ENV PYTHONIOENCODING UTF-8
 ENV PIP_DISABLE_PIP_VERSION_CHECK 1
 
@@ -47,6 +51,9 @@ RUN apt-get update \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# create the user that the software will run as
+RUN useradd -m -s /bin/bash spark
+
 # QFS
 ENV QFS_VERSION 2.1.3
 ENV HADOOP_VERSION 2.7.2
@@ -62,6 +69,8 @@ RUN curl -sL --retry 3 \
     && chown -R root:root $QFS_HOME
 COPY ./qfs-conf/* $QFS_HOME/conf/
 ENV PATH $PATH:${QFS_HOME}/bin:${QFS_HOME}/bin/tools
+RUN mkdir -p /data/qfs/ \
+    && chown spark -R /data/qfs
 
 # SPARK
 ENV SPARK_VERSION 2.4.4
@@ -77,12 +86,11 @@ RUN curl -sL --retry 3 \
     && mv /usr/$SPARK_PACKAGE $SPARK_HOME \
     && chown -R root:root $SPARK_HOME
 COPY ./spark-conf/* $SPARK_HOME/conf/
-
-# add python libraries useful in PySpark
-RUN python3 -mpip install matplotlib \
-    && pip3 install pandas
+RUN mkdir -p /data/spark \
+    && chown spark -R /data/spark
 
 # set up command
-WORKDIR /root
 COPY start-worker-node.sh /
+USER spark
+WORKDIR /home/spark
 CMD ["/bin/bash", "/start-worker-node.sh"]
diff --git a/spark-qfs-swarm/worker-node/qfs-conf/Chunkserver.prp b/spark-qfs-swarm/worker-node/qfs-conf/Chunkserver.prp
index 7020889..74eb824 100644
--- a/spark-qfs-swarm/worker-node/qfs-conf/Chunkserver.prp
+++ b/spark-qfs-swarm/worker-node/qfs-conf/Chunkserver.prp
@@ -7,4 +7,4 @@ chunkServer.stdout = /dev/null
 chunkServer.stderr = /dev/null
 chunkServer.ioBufferPool.partitionBufferCount = 65536
 chunkServer.msgLogWriter.logLevel = INFO
-chunkServer.diskQueue.threadCount = 4
+chunkServer.diskQueue.threadCount = 12
diff --git a/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh b/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh
index e8c56dd..d95b1db 100644
--- a/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh
+++ b/spark-qfs-swarm/worker-node/spark-conf/spark-env.sh
@@ -18,4 +18,6 @@ PYSPARK_PYTHON=python3
 PYTHONHASHSEED=8675309
 
 # the location of spark working files
-SPARK_LOCAL_DIRS=/data/spark
+SPARK_LOCAL_DIRS=/data/spark/tmp
+SPARK_WORKER_DIR=/data/spark/work
+SPARK_LOG_DIR=/data/spark/logs
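
Since the whole point of this patch is that the Spark, QFS, and Jupyter processes now run as the unprivileged `spark` user created in the worker-node image, a quick sanity check after deploying the stack is worth doing. The sketch below is illustrative only: the container name is a placeholder that must be replaced with whatever name Swarm generated on that node (visible via `docker ps`), and it assumes `id` and `whoami` are available in the image, as they are on a typical Debian/Ubuntu base.
```
# run on any node that hosts a spark-worker task
docker ps --filter name=spark                      # find the generated container name
docker exec -it <spark-worker-container> id        # expect the uid/gid of the 'spark' user, not root
docker exec -it <spark-worker-container> whoami    # should print: spark
```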
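Relatedly, the `~/.bash_aliases` entries added to the qfs-master image are there so the QFS metaserver address does not have to be retyped at an interactive prompt. A minimal usage sketch, assuming the aliases are sourced by the `spark` user's interactive bash shell (as with the stock Debian/Ubuntu skeleton `.bashrc`); the paths shown are examples only:
```
# start an interactive shell on the cluster network as the spark user (command from the README)
docker run -it --network="spark_cluster_network" -u spark master:5000/qfs-master:latest /bin/bash

# inside the container, the aliases pre-wire the metaserver address (qfs-master:20000):
qfs -ls /          # list the root of the QFS filesystem
qfsshell           # interactive QFS shell
qfsfsck            # consistency check via the newly added qfsfsck alias
# cptoqfs and cpfromqfs are likewise pre-wired for copying files to and from QFS
```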