FROM bitnami/spark:3.5.1
# Switch to root to install packages
# https://github.com/bitnami/containers/tree/main/bitnami/spark#installing-additional-jars
USER root
# Create a non-root user
# User 1001 is not defined in /etc/passwd in the bitnami/spark image, causing various issues.
# References:
# https://github.com/bitnami/containers/issues/52698
# https://github.com/bitnami/containers/pull/52661
RUN groupadd -r spark && useradd -r -g spark spark_user
RUN apt-get update && apt-get install -y \
# gcc is required so that psutil can be built from source when JupyterLab is installed \
gcc \
curl \
git \
wget \
vim \
npm \
nodejs \
graphviz \
graphviz-dev \
libgdal-dev \
build-essential \
python3-dev \
sudo \
&& rm -rf /var/lib/apt/lists/*
ENV HADOOP_AWS_VER=3.3.4
# NOTE: keep the Delta Spark JAR version in sync with the delta-spark Python package version pinned in the Pipfile
ENV DELTA_SPARK_VER=3.2.0
ENV SCALA_VER=2.12
ENV POSTGRES_JDBC_VER=42.2.23
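# NOTE (assumption): the version variables above are presumably consumed by build.gradle
# below when resolving the hadoop-aws, Delta Spark and PostgreSQL JDBC artifacts.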
# Run the Gradle build to download the required JARs into /gradle/gradle_jars
COPY build.gradle settings.gradle gradlew /gradle/
COPY gradle /gradle/gradle
ENV GRADLE_JARS_DIR=gradle_jars
RUN /gradle/gradlew -p /gradle build
RUN cp -r /gradle/${GRADLE_JARS_DIR}/* /opt/bitnami/spark/jars/
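# JARs placed in /opt/bitnami/spark/jars end up on the Spark classpath
# (see the Bitnami link above on installing additional JARs).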
# make an empty yarn conf dir to prevent spark from complaining
RUN mkdir -p /opt/yarn/conf && chown -R spark_user:spark /opt/yarn
ENV YARN_CONF_DIR=/opt/yarn/conf
# install pipenv
RUN pip3 install pipenv
# install python dependencies
COPY Pipfile* ./
RUN pipenv sync --system
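# `pipenv sync --system` installs the exact versions pinned in Pipfile.lock into the
# system Python rather than a virtualenv, so the packages are available to every user.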
RUN chown -R spark_user:spark /opt/bitnami
# Set up JupyterLab directories
ENV JUPYTER_CONFIG_DIR=/.jupyter
ENV JUPYTER_RUNTIME_DIR=/.jupyter/runtime
ENV JUPYTER_DATA_DIR=/.jupyter/data
RUN mkdir -p ${JUPYTER_CONFIG_DIR} ${JUPYTER_RUNTIME_DIR} ${JUPYTER_DATA_DIR}
RUN chown -R spark_user:spark /.jupyter
# Set up JupyterHub directories
ENV JUPYTERHUB_CONFIG_DIR=/srv/jupyterhub
RUN mkdir -p ${JUPYTERHUB_CONFIG_DIR}
ENV JUPYTER_AI_CONFIG_FILE=jupyter_jupyter_ai_config.json
COPY ./config/${JUPYTER_AI_CONFIG_FILE} ${JUPYTERHUB_CONFIG_DIR}/${JUPYTER_AI_CONFIG_FILE}
COPY ./src/notebook_utils/startup.py ${JUPYTERHUB_CONFIG_DIR}/startup.py
COPY ./config/jupyterhub_config.py ${JUPYTERHUB_CONFIG_DIR}/jupyterhub_config.py
COPY ./scripts/spawn_notebook.sh ${JUPYTERHUB_CONFIG_DIR}/spawn_notebook.sh
RUN chmod +x ${JUPYTERHUB_CONFIG_DIR}/spawn_notebook.sh
RUN chown -R spark_user:spark ${JUPYTERHUB_CONFIG_DIR}
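# NOTE (assumption): spawn_notebook.sh is presumably referenced by jupyterhub_config.py
# as the command used to launch the single-user notebook servers.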
# JupyterHub user home directory
ENV JUPYTERHUB_USER_HOME=/jupyterhub/users_home
RUN mkdir -p $JUPYTERHUB_USER_HOME
RUN chown -R spark_user:spark /jupyterhub
RUN npm install -g configurable-http-proxy
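# configurable-http-proxy is the Node.js proxy JupyterHub uses to route traffic
# between the hub and the single-user notebook servers.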
COPY ./src/ /src
ENV PYTHONPATH="${PYTHONPATH}:/src"
# Copy the startup script to the default profile location to automatically load pre-built functions in Jupyter Notebook
COPY ./src/notebook_utils/startup.py /.ipython/profile_default/startup/
RUN chown -R spark_user:spark /.ipython
COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*.sh
# Copy the configuration files
ENV CONFIG_DIR=/opt/config
COPY ./config/ ${CONFIG_DIR}
# Chown these specific directories rather than all of /opt, since /opt/bitnami was already chowned above
RUN chown -R spark_user:spark /src /opt/scripts /opt/config
# Shared directory between the Spark master, worker, and driver containers
ENV CDM_SHARED_DIR=/cdm_shared_workspace
RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}
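# NOTE (assumption): world-writable permissions so that containers running under
# different UIDs can all write to the shared workspace.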
RUN chown -R spark_user:spark $CDM_SHARED_DIR
# TODO: manage group shared directories through a config file or DB as the number of groups increases.
ENV KBASE_GROUP_SHARED_DIR=$CDM_SHARED_DIR/kbase_group_shared
RUN mkdir -p ${KBASE_GROUP_SHARED_DIR} && chmod -R 777 ${KBASE_GROUP_SHARED_DIR}
RUN chown -R spark_user:spark $KBASE_GROUP_SHARED_DIR
# Set a directory for hosting Hive metastore files - defined in config/hive-site-template.xml
ENV HIVE_METASTORE_DIR=$CDM_SHARED_DIR/hive_metastore
RUN mkdir -p ${HIVE_METASTORE_DIR}
RUN chown -R spark_user:spark $HIVE_METASTORE_DIR
# Set a directory for hosting the JupyterHub database and cookie secret
ENV JUPYTERHUB_SECRETS_DIR=/jupyterhub_secrets
RUN mkdir -p ${JUPYTERHUB_SECRETS_DIR}
RUN chown -R spark_user:spark ${JUPYTERHUB_SECRETS_DIR}
# Allow spark_user to use sudo without a password
# TODO: use `sudospawner` in JupyterHub to avoid this (https://jupyterhub.readthedocs.io/en/stable/howto/configuration/config-sudo.html)
RUN echo "spark_user ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# Switching back to the non-root user is disabled for now: the container hits
# permission errors when accessing the Docker API as a non-root user.
#USER spark_user
ENTRYPOINT ["/opt/scripts/entrypoint.sh"]
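# Example local build (the image tag is illustrative, not defined by this repo):
#   docker build -t cdm-spark-jupyterhub .
# Runtime behaviour is driven by /opt/scripts/entrypoint.sh, which is not shown here.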