Skip to content

Commit

Permalink
added support for Spark NLP
Browse files (browse the repository at this point in the history)
  • Loading branch information
michaelkamprath committed May 14, 2022
1 parent 227938a commit 4065475
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 6 deletions.
2 changes: 1 addition & 1 deletion spark-qfs-swarm/build-images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ b) DOCKER_BUILD_ARGS=${OPTARG};;
esac
done

- if [ -z "$DOCKER_ARGS" ]
+ if [ -z "$DOCKER_BUILD_ARGS" ]
then
echo "Building with default docker options"
else
Expand Down
1 change: 1 addition & 0 deletions spark-qfs-swarm/jupyter-server/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ FROM qfs-master:latest
#

ENV GRAPHFRAMES_VERSION 0.8.2-spark3.2-s_2.12
+ ENV SPARK_NLP_VERSION spark-nlp-spark32_2.12:3.4.4

USER root
RUN apt-get install -y g++
Expand Down
2 changes: 1 addition & 1 deletion spark-qfs-swarm/jupyter-server/start-jupyter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ SHELL=/bin/bash \
PYSPARK_DRIVER_PYTHON=jupyter \
PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777 --notebook-dir=/home/spark/jupyter/notebooks --ip=0.0.0.0 --NotebookApp.password='' --NotebookApp.token='' --NotebookApp.iopub_data_rate_limit=1.0e10" \
$SPARK_HOME/bin/pyspark \
- --packages graphframes:graphframes:$GRAPHFRAMES_VERSION \
+ --packages graphframes:graphframes:$GRAPHFRAMES_VERSION,com.johnsnowlabs.nlp:$SPARK_NLP_VERSION \
--repositories https://repos.spark-packages.org/ \
--master spark://spark-master:7077
6 changes: 3 additions & 3 deletions spark-qfs-swarm/worker-node/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ ARG QFS_VERSION=2.2.4
ARG SPARK_VERSION=3.2.1
ARG HADOOP_MINOR_VERSION=2.7
ARG HADOOP_VERSION=2.7.2
- ARG SCALA_VERSION=2.12.13
+ ARG SCALA_VERSION=2.12.15

RUN apt-get update \
&& apt-get install -y locales \
Expand Down Expand Up @@ -65,7 +65,7 @@ ENV QFS_LOGS_DIR /data/qfs/logs
ENV LD_LIBRARY_PATH ${QFS_HOME}/lib
ARG QFS_DOWNLOAD_URL="https://s3.amazonaws.com/quantcast-qfs/qfs-debian-9-${QFS_VERSION}-x86_64.tgz"
RUN echo "Downloading QFS from : ${QFS_DOWNLOAD_URL}\n" \
- && curl -L --retry 3 \
+ && curl -L --retry 3 -k \
$QFS_DOWNLOAD_URL \
| gunzip \
| tar x -C /usr/ \
Expand Down Expand Up @@ -97,7 +97,7 @@ RUN mkdir -p /data/spark \

# add python libraries useful in PySpark
RUN python3 -mpip install matplotlib \
- && pip3 install pandas seaborn pyarrow
+ && pip3 install pandas seaborn pyarrow spark-nlp

# copy QFS and Spark configurations
COPY ./qfs-conf/* $QFS_HOME/conf/
Expand Down
2 changes: 1 addition & 1 deletion spark-qfs-swarm/worker-node/spark-conf/spark-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spark.history.fs.logDirectory qfs:///history/spark-event/
spark.history.fs.cleaner.maxAge 30d

# Configure QFS here rather than in core-site.xml
- spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem
+ spark.hadoop.fs.qfs.impl com.quantcast.qfs.hadoop.QuantcastFileSystem2
spark.hadoop.fs.defaultFS qfs://qfs-master:20000
spark.hadoop.fs.qfs.metaServerHost qfs-master
spark.hadoop.fs.qfs.metaServerPort 20000
Expand Down

0 comments on commit 4065475

Please sign in to comment.