diff --git a/spark-qfs-swarm/build-images.sh b/spark-qfs-swarm/build-images.sh index 30de7e7..055a7b9 100755 --- a/spark-qfs-swarm/build-images.sh +++ b/spark-qfs-swarm/build-images.sh @@ -12,7 +12,7 @@ b) DOCKER_BUILD_ARGS=${OPTARG};; esac done -if [ -z "$DOCKER_ARGS" ] +if [ -z "$DOCKER_BUILD_ARGS" ] then echo "Building with default docker options" else diff --git a/spark-qfs-swarm/jupyter-server/Dockerfile b/spark-qfs-swarm/jupyter-server/Dockerfile index 56caba2..3683b57 100644 --- a/spark-qfs-swarm/jupyter-server/Dockerfile +++ b/spark-qfs-swarm/jupyter-server/Dockerfile @@ -7,6 +7,7 @@ FROM qfs-master:latest # ENV GRAPHFRAMES_VERSION 0.8.2-spark3.2-s_2.12 +ENV SPARK_NLP_VERSION spark-nlp-spark32_2.12:3.4.4 USER root RUN apt-get install -y g++ diff --git a/spark-qfs-swarm/jupyter-server/start-jupyter.sh b/spark-qfs-swarm/jupyter-server/start-jupyter.sh index efe505a..7712aa5 100644 --- a/spark-qfs-swarm/jupyter-server/start-jupyter.sh +++ b/spark-qfs-swarm/jupyter-server/start-jupyter.sh @@ -5,6 +5,6 @@ SHELL=/bin/bash \ PYSPARK_DRIVER_PYTHON=jupyter \ PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token='' --NotebookApp.iopub_data_rate_limit=1.0e10" \ $SPARK_HOME/bin/pyspark \ - --packages graphframes:graphframes:$GRAPHFRAMES_VERSION \ + --packages graphframes:graphframes:$GRAPHFRAMES_VERSION,com.johnsnowlabs.nlp:$SPARK_NLP_VERSION \ --repositories https://repos.spark-packages.org/ \ --master spark://spark-master:7077 diff --git a/spark-qfs-swarm/worker-node/Dockerfile b/spark-qfs-swarm/worker-node/Dockerfile index 092d7ff..596c012 100644 --- a/spark-qfs-swarm/worker-node/Dockerfile +++ b/spark-qfs-swarm/worker-node/Dockerfile @@ -19,7 +19,7 @@ ARG QFS_VERSION=2.2.4 ARG SPARK_VERSION=3.2.1 ARG HADOOP_MINOR_VERSION=2.7 ARG HADOOP_VERSION=2.7.2 -ARG SCALA_VERSION=2.12.13 +ARG SCALA_VERSION=2.12.15 RUN apt-get update \ && apt-get install -y locales \ 
@@ -65,7 +65,7 @@ ENV QFS_LOGS_DIR /data/qfs/logs ENV LD_LIBRARY_PATH ${QFS_HOME}/lib ARG QFS_DOWNLOAD_URL="https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-${QFS_VERSION}-x86_64.tgz" RUN echo "Downloading QFS from : ${QFS_DOWNLOAD_URL}\n" \ - && curl -L --retry 3 \ + && curl -L --retry 3 \ $QFS_DOWNLOAD_URL \ | gunzip \ | tar x -C /usr/ \ @@ -97,7 +97,7 @@ RUN mkdir -p /data/spark \ # add python libraries useful in PySpark RUN python3 -mpip install matplotlib \ - && pip3 install pandas seaborn pyarrow + && pip3 install pandas seaborn pyarrow spark-nlp # copy QFS and Spark configurations COPY ./qfs-conf/* $QFS_HOME/conf/ diff --git a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf index eb89b5b..98c9d7b 100644 --- a/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf +++ b/spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf @@ -33,7 +33,7 @@ spark.history.fs.logDirectory qfs:///history/spark-event/ spark.history.fs.cleaner.maxAge 30d # Configure QFS here rather than in core-site.xml -spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem +spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem2 spark.hadoop.fs.defaultFS qfs://qfs-master:20000 spark.hadoop.fs.qfs.metaServerHost qfs-master spark.hadoop.fs.qfs.metaServerPort 20000