diff --git a/nvflare/lighter/constants.py b/nvflare/lighter/constants.py index b462de0e02..99d765f770 100644 --- a/nvflare/lighter/constants.py +++ b/nvflare/lighter/constants.py @@ -90,7 +90,9 @@ class TemplateSectionKey: START_CLIENT_SH = "start_cln_sh" DOCKER_BUILD_SH = "docker_build_sh" DOCKER_SERVER_SH = "docker_svr_sh" + DOCKER_LAUNCHER_SERVER_SH = "docker_launcher_svr_sh" DOCKER_CLIENT_SH = "docker_cln_sh" + DOCKER_LAUNCHER_CLIENT_SH = "docker_launcher_cln_sh" DOCKER_ADMIN_SH = "docker_adm_sh" GUNICORN_CONF_PY = "gunicorn_conf_py" START_OVERSEER_SH = "start_ovsr_sh" @@ -111,6 +113,7 @@ class TemplateSectionKey: FED_ADMIN = "fed_admin" COMPOSE_YAML = "compose_yaml" DOCKERFILE = "dockerfile" + LAUNCHER_DOCKERFILE = "launcher_dockerfile" HELM_CHART_CHART = "helm_chart_chart" HELM_CHART_VALUES = "helm_chart_values" HELM_CHART_SERVICE_OVERSEER = "helm_chart_service_overseer" @@ -124,7 +127,8 @@ class ProvFileName: SUB_START_SH = "sub_start.sh" PRIVILEGE_YML = "privilege.yml" DOCKER_BUILD_SH = "docker_build.sh" - DOCKER_SH = "start_docker.sh" + DOCKER_SH = "docker.sh" + DOCKER_LAUNCHER_SH = "docker_launcher.sh" GUNICORN_CONF_PY = "gunicorn.conf.py" FED_SERVER_JSON = "fed_server.json" FED_CLIENT_JSON = "fed_client.json" @@ -142,6 +146,7 @@ class ProvFileName: ENV = ".env" COMPOSE_BUILD_DIR = "nvflare_compose" DOCKERFILE = "Dockerfile" + LAUNCHER_DOCKERFILE = "Dockerfile.launcher" REQUIREMENTS_TXT = "requirements.txt" SERVER_CONTEXT_TENSEAL = "server_context.tenseal" CLIENT_CONTEXT_TENSEAL = "client_context.tenseal" diff --git a/nvflare/lighter/impl/docker.py b/nvflare/lighter/impl/docker.py index 9a90a3866e..8773b64dc7 100644 --- a/nvflare/lighter/impl/docker.py +++ b/nvflare/lighter/impl/docker.py @@ -13,24 +13,18 @@ # limitations under the License. 
import copy -import json import os import shutil import yaml -from nvflare.app_opt.job_launcher.docker_launcher import ClientDockerJobLauncher, ServerDockerJobLauncher -from nvflare.lighter import utils -from nvflare.lighter.constants import CtxKey, PropKey, ProvFileName, TemplateSectionKey +from nvflare.lighter.constants import CtxKey, ProvFileName, TemplateSectionKey from nvflare.lighter.spec import Builder, Project, ProvisionContext class DockerBuilder(Builder): - def __init__( - self, docker_image="nvflare-docker:0.0.1", base_image="python:3.8", requirements_file="requirements.txt" - ): + def __init__(self, base_image="python:3.8", requirements_file="requirements.txt"): """Build docker compose file.""" - self.docker_image = docker_image self.base_image = base_image self.requirements_file = requirements_file self.services = {} @@ -63,31 +57,7 @@ def _build_server(self, server, ctx: ProvisionContext): info_dict["container_name"] = server.name self.services[server.name] = info_dict - # local folder creation - dest_dir = ctx.get_local_dir(server) - with open(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), "rt") as f: - resources = json.load(f) - resources["components"].append( - { - "id": "docker_launcher", - "path": ServerDockerJobLauncher().__module__ + "." 
+ "ServerDockerJobLauncher", - "args": {}, - } - ) - utils.write(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), json.dumps(resources, indent=4), "t") - - communication_port = server.get_prop(CtxKey.DOCKER_COMM_PORT) - if communication_port: - replacement_dict = {"comm_host_name": "server-parent", "communication_port": communication_port} - ctx.build_from_template( - dest_dir, - TemplateSectionKey.COMM_CONFIG, - ProvFileName.COMM_CONFIG, - replacement=replacement_dict, - exe=True, - ) - - def _build_client(self, client, ctx: ProvisionContext): + def _build_client(self, client): info_dict = copy.deepcopy(self.services["__flclient__"]) info_dict["volumes"] = [f"./{client.name}:" + "${WORKSPACE}"] info_dict["build"] = "nvflare_compose" @@ -101,30 +71,6 @@ def _build_client(self, client, ctx: ProvisionContext): info_dict["container_name"] = client.name self.services[client.name] = info_dict - # local folder creation - dest_dir = ctx.get_local_dir(client) - with open(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), "rt") as f: - resources = json.load(f) - resources["components"].append( - { - "id": "docker_launcher", - "path": ClientDockerJobLauncher().__module__ + "." 
+ "ClientDockerJobLauncher", - "args": {}, - } - ) - utils.write(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), json.dumps(resources, indent=4), "t") - - communication_port = client.get_prop(PropKey.DOCKER_COMM_PORT) - if communication_port: - replacement_dict = {"comm_host_name": client.name + "-parent", "communication_port": communication_port} - ctx.build_from_template( - dest_dir, - TemplateSectionKey.COMM_CONFIG, - ProvFileName.COMM_CONFIG, - replacement=replacement_dict, - exe=True, - ) - def build(self, project: Project, ctx: ProvisionContext): compose = ctx.yaml_load_template_section(TemplateSectionKey.COMPOSE_YAML) self.services = compose.get("services") @@ -137,7 +83,7 @@ def build(self, project: Project, ctx: ProvisionContext): self._build_server(server, ctx) for client in project.get_clients(): - self._build_client(client, ctx) + self._build_client(client) self.services.pop("__overseer__", None) self.services.pop("__flserver__", None) @@ -155,14 +101,6 @@ def build(self, project: Project, ctx: ProvisionContext): with open(os.path.join(compose_build_dir, ProvFileName.DOCKERFILE), "wt") as f: f.write(f"FROM {self.base_image}\n") f.write(ctx.get_template_section(TemplateSectionKey.DOCKERFILE)) - replacement_dict = {"image": self.docker_image} - ctx.build_from_template( - compose_build_dir, - TemplateSectionKey.DOCKER_BUILD_SH, - ProvFileName.DOCKER_BUILD_SH, - replacement=replacement_dict, - exe=True, - ) try: shutil.copyfile(self.requirements_file, os.path.join(compose_build_dir, ProvFileName.REQUIREMENTS_TXT)) except Exception: diff --git a/nvflare/lighter/impl/docker_launcher.py b/nvflare/lighter/impl/docker_launcher.py new file mode 100644 index 0000000000..c8d812a821 --- /dev/null +++ b/nvflare/lighter/impl/docker_launcher.py @@ -0,0 +1,210 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import shutil + +import yaml + +from nvflare.app_opt.job_launcher.docker_launcher import ClientDockerJobLauncher, ServerDockerJobLauncher +from nvflare.lighter import utils +from nvflare.lighter.constants import CtxKey, PropKey, ProvFileName, TemplateSectionKey +from nvflare.lighter.spec import Builder, Project, ProvisionContext + + +class DockerLauncherBuilder(Builder): + """DockerLauncherBuilder is used for generating the docker build command and service startup command for using the + DockerJobLauncher as the job launcher (both server and client). 
+ + """ + + def __init__( + self, docker_image="nvflare-docker:0.0.1", base_image="python:3.8", requirements_file="requirements.txt" + ): + """Build docker compose file.""" + self.docker_image = docker_image + self.base_image = base_image + self.requirements_file = requirements_file + self.services = {} + self.compose_file_path = None + + def _build_overseer(self, overseer): + protocol = overseer.props.get("protocol", "http") + default_port = "443" if protocol == "https" else "80" + port = overseer.props.get("port", default_port) + info_dict = copy.deepcopy(self.services["__overseer__"]) + info_dict["volumes"] = [f"./{overseer.name}:" + "${WORKSPACE}"] + info_dict["ports"] = [f"{port}:{port}"] + info_dict["build"] = "nvflare_compose" + info_dict["container_name"] = overseer.name + self.services[overseer.name] = info_dict + + def _build_server(self, server, ctx: ProvisionContext): + fed_learn_port = ctx.get(CtxKey.FED_LEARN_PORT) + admin_port = ctx.get(CtxKey.ADMIN_PORT) + + info_dict = copy.deepcopy(self.services["__flserver__"]) + info_dict["volumes"][0] = f"./{server.name}:" + "${WORKSPACE}" + info_dict["ports"] = [f"{fed_learn_port}:{fed_learn_port}", f"{admin_port}:{admin_port}"] + info_dict["build"] = "nvflare_compose" + for i in range(len(info_dict["command"])): + if info_dict["command"][i] == "flserver": + info_dict["command"][i] = server.name + if info_dict["command"][i] == "org=__org_name__": + info_dict["command"][i] = f"org={server.org}" + info_dict["container_name"] = server.name + self.services[server.name] = info_dict + + # local folder creation + dest_dir = ctx.get_local_dir(server) + with open(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), "rt") as f: + resources = json.load(f) + resources["components"].append( + { + "id": "docker_launcher", + "path": ServerDockerJobLauncher().__module__ + "." 
+ "ServerDockerJobLauncher", + "args": {}, + } + ) + utils.write(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), json.dumps(resources, indent=4), "t") + + communication_port = server.get_prop(CtxKey.DOCKER_COMM_PORT) + if communication_port: + replacement_dict = {"comm_host_name": "server-parent", "communication_port": communication_port} + ctx.build_from_template( + dest_dir, + TemplateSectionKey.COMM_CONFIG, + ProvFileName.COMM_CONFIG, + replacement=replacement_dict, + exe=True, + ) + + dest_dir = ctx.get_kit_dir(server) + replacement_dict = { + "admin_port": admin_port, + "fed_learn_port": fed_learn_port, + "comm_host_name": "server-parent", + "communication_port": communication_port, + "docker_image": self.docker_image, + } + ctx.build_from_template( + dest_dir, + TemplateSectionKey.DOCKER_LAUNCHER_SERVER_SH, + ProvFileName.DOCKER_LAUNCHER_SH, + replacement=replacement_dict, + exe=True, + ) + + def _build_client(self, client, ctx: ProvisionContext): + fed_learn_port = ctx.get(CtxKey.FED_LEARN_PORT) + admin_port = ctx.get(CtxKey.ADMIN_PORT) + + info_dict = copy.deepcopy(self.services["__flclient__"]) + info_dict["volumes"] = [f"./{client.name}:" + "${WORKSPACE}"] + info_dict["build"] = "nvflare_compose" + for i in range(len(info_dict["command"])): + if info_dict["command"][i] == "flclient": + info_dict["command"][i] = client.name + if info_dict["command"][i] == "uid=__flclient__": + info_dict["command"][i] = f"uid={client.name}" + if info_dict["command"][i] == "org=__org_name__": + info_dict["command"][i] = f"org={client.org}" + info_dict["container_name"] = client.name + self.services[client.name] = info_dict + + # local folder creation + dest_dir = ctx.get_local_dir(client) + with open(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), "rt") as f: + resources = json.load(f) + resources["components"].append( + { + "id": "docker_launcher", + "path": ClientDockerJobLauncher().__module__ + "." 
+ "ClientDockerJobLauncher", + "args": {}, + } + ) + utils.write(os.path.join(dest_dir, ProvFileName.RESOURCES_JSON_DEFAULT), json.dumps(resources, indent=4), "t") + + communication_port = client.get_prop(PropKey.DOCKER_COMM_PORT) + if communication_port: + replacement_dict = {"comm_host_name": client.name + "-parent", "communication_port": communication_port} + ctx.build_from_template( + dest_dir, + TemplateSectionKey.COMM_CONFIG, + ProvFileName.COMM_CONFIG, + replacement=replacement_dict, + exe=True, + ) + + dest_dir = ctx.get_kit_dir(client) + replacement_dict = { + "admin_port": admin_port, + "fed_learn_port": fed_learn_port, + "comm_host_name": "server-parent", + "communication_port": communication_port, + "docker_image": self.docker_image, + } + ctx.build_from_template( + dest_dir, + TemplateSectionKey.DOCKER_LAUNCHER_CLIENT_SH, + ProvFileName.DOCKER_LAUNCHER_SH, + replacement=replacement_dict, + exe=True, + ) + + def build(self, project: Project, ctx: ProvisionContext): + compose = ctx.yaml_load_template_section(TemplateSectionKey.COMPOSE_YAML) + self.services = compose.get("services") + self.compose_file_path = os.path.join(ctx.get_wip_dir(), ProvFileName.COMPOSE_YAML) + overseer = project.get_overseer() + if overseer: + self._build_overseer(overseer) + server = project.get_server() + if server: + self._build_server(server, ctx) + + for client in project.get_clients(): + self._build_client(client, ctx) + + self.services.pop("__overseer__", None) + self.services.pop("__flserver__", None) + self.services.pop("__flclient__", None) + compose["services"] = self.services + with open(self.compose_file_path, "wt") as f: + yaml.dump(compose, f) + env_file_path = os.path.join(ctx.get_wip_dir(), ProvFileName.ENV) + with open(env_file_path, "wt") as f: + f.write("WORKSPACE=/workspace\n") + f.write("PYTHON_EXECUTABLE=/usr/local/bin/python3\n") + f.write("IMAGE_NAME=nvflare-service\n") + compose_build_dir = os.path.join(ctx.get_wip_dir(), ProvFileName.COMPOSE_BUILD_DIR) 
+ os.makedirs(compose_build_dir, exist_ok=True) + with open(os.path.join(compose_build_dir, ProvFileName.LAUNCHER_DOCKERFILE), "wt") as f: + f.write(f"FROM {self.base_image}\n") + f.write(ctx.get_template_section(TemplateSectionKey.LAUNCHER_DOCKERFILE)) + replacement_dict = {"image": self.docker_image} + ctx.build_from_template( + compose_build_dir, + TemplateSectionKey.DOCKER_BUILD_SH, + ProvFileName.DOCKER_BUILD_SH, + replacement=replacement_dict, + exe=True, + ) + try: + shutil.copyfile(self.requirements_file, os.path.join(compose_build_dir, ProvFileName.REQUIREMENTS_TXT)) + except Exception: + f = open(os.path.join(compose_build_dir, ProvFileName.REQUIREMENTS_TXT), "wt") + f.close() diff --git a/nvflare/lighter/templates/docker_launcher_template.yml b/nvflare/lighter/templates/docker_launcher_template.yml new file mode 100644 index 0000000000..cc68d89d8b --- /dev/null +++ b/nvflare/lighter/templates/docker_launcher_template.yml @@ -0,0 +1,59 @@ +docker_launcher_cln_sh: | + #!/usr/bin/env bash + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + # docker run script for FL client + DOCKER_IMAGE={~~docker_image~~} + echo "Starting docker with $DOCKER_IMAGE" + + NETWORK_NAME="nvflare-network" + if docker network ls --filter name=$NETWORK_NAME --format "{{.Name}}" | grep -wq $NETWORK_NAME; then + echo "Network '${NETWORK_NAME}' exists." 
+ else + docker network create $NETWORK_NAME + fi + + docker run --name {~~comm_host_name~~} --network nvflare-network \ + -v $DIR/..:/workspace \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -p {~~communication_port~~}:{~~communication_port~~} \ + -it --rm $DOCKER_IMAGE /bin/bash -c "export NVFL_DOCKER_WORKSPACE=$DIR/..;startup/sub_start.sh \ + {~~client_name~~} server-parent:{~~fed_learn_port~~}:{~~admin_port~~}" + +docker_launcher_svr_sh: | + #!/usr/bin/env bash + DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + # docker run script for FL server + DOCKER_IMAGE={~~docker_image~~} + echo "Starting docker with $DOCKER_IMAGE" + + NETWORK_NAME="nvflare-network" + if docker network ls --filter name=$NETWORK_NAME --format "{{.Name}}" | grep -wq $NETWORK_NAME; then + echo "Network '${NETWORK_NAME}' exists." + else + docker network create $NETWORK_NAME + fi + + docker run --name server-parent --network nvflare-network \ + -v $DIR/..:/workspace \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -p {~~admin_port~~}:{~~admin_port~~} -p {~~fed_learn_port~~}:{~~fed_learn_port~~} \ + -p {~~communication_port~~}:{~~communication_port~~} \ + -it --rm $DOCKER_IMAGE /bin/bash -c "export NVFL_DOCKER_WORKSPACE=$DIR/..;startup/sub_start.sh" + +launcher_dockerfile: | + RUN mkdir /opt/NVFlare + WORKDIR /opt/NVFlare + RUN pip install -U pip + RUN pip install nvflare + COPY requirements.txt requirements.txt + RUN pip install -r requirements.txt + + RUN apt-get update + RUN apt install docker.io -y + WORKDIR /workspace + +docker_build_sh: | + #!/usr/bin/env bash + docker image rm {~~image~~} + docker build -t {~~image~~} -f Dockerfile.launcher . 
+ docker push {~~image~~} diff --git a/nvflare/lighter/templates/master_template.yml b/nvflare/lighter/templates/master_template.yml index 145d0c0632..e083b13e13 100644 --- a/nvflare/lighter/templates/master_template.yml +++ b/nvflare/lighter/templates/master_template.yml @@ -596,44 +596,60 @@ sub_start_sh: | docker_cln_sh: | #!/usr/bin/env bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - # docker run script for FL server + # docker run script for FL client + # local data directory + : ${MY_DATA_DIR:="/home/flclient/data"} + # The syntax above is to set MY_DATA_DIR to /home/flclient/data if this + # environment variable is not set previously. + # Therefore, users can set their own MY_DATA_DIR with + # export MY_DATA_DIR=$SOME_DIRECTORY + # before running docker.sh + + # for all gpus use line below + #GPU2USE='--gpus=all' + # for 2 gpus use line below + #GPU2USE='--gpus=2' + # for specific gpus as gpu#0 and gpu#2 use line below + #GPU2USE='--gpus="device=0,2"' + # to use host network, use line below + NETARG="--net=host" + # FL clients do not need to open ports, so the following line is not needed. + #NETARG="-p 443:443 -p 8003:8003" DOCKER_IMAGE={~~docker_image~~} echo "Starting docker with $DOCKER_IMAGE" - - NETWORK_NAME="nvflare-network" - if docker network ls --filter name=$NETWORK_NAME --format "{{.Name}}" | grep -wq $NETWORK_NAME; then - echo "Network '${NETWORK_NAME}' exists." 
+ mode="${1:--r}" + if [ $mode = "-d" ] + then + docker run -d --rm --name={~~client_name~~} $GPU2USE -u $(id -u):$(id -g) \ + -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/workspace/ \ + -v $MY_DATA_DIR:/data/:ro -w /workspace/ --ipc=host $NETARG $DOCKER_IMAGE \ + /bin/bash -c "python -u -m nvflare.private.fed.app.client.client_train -m /workspace -s fed_client.json --set uid={~~client_name~~} secure_train=true config_folder=config org={~~org_name~~}" else - docker network create $NETWORK_NAME + docker run --rm -it --name={~~client_name~~} $GPU2USE -u $(id -u):$(id -g) \ + -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/workspace/ \ + -v $MY_DATA_DIR:/data/:ro -w /workspace/ --ipc=host $NETARG $DOCKER_IMAGE /bin/bash fi - - docker run --name {~~comm_host_name~~} --network nvflare-network \ - -v $DIR/..:/workspace \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -p {~~communication_port~~}:{~~communication_port~~} \ - -it --rm $DOCKER_IMAGE /bin/bash -c "export NVFL_DOCKER_WORKSPACE=$DIR/..;startup/sub_start.sh \ - {~~client_name~~} server-parent:{~~fed_learn_port~~}:{~~admin_port~~}" docker_svr_sh: | #!/usr/bin/env bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" # docker run script for FL server + # to use host network, use line below + NETARG="--net=host" + # or to expose specific ports, use line below + #NETARG="-p {~~admin_port~~}:{~~admin_port~~} -p {~~fed_learn_port~~}:{~~fed_learn_port~~}" DOCKER_IMAGE={~~docker_image~~} echo "Starting docker with $DOCKER_IMAGE" - - NETWORK_NAME="nvflare-network" - if docker network ls --filter name=$NETWORK_NAME --format "{{.Name}}" | grep -wq $NETWORK_NAME; then - echo "Network '${NETWORK_NAME}' exists." 
+ svr_name="${SVR_NAME:-flserver}" + mode="${1:-r}" + if [ $mode = "-d" ] + then + docker run -d --rm --name=$svr_name -v $DIR/..:/workspace/ -w /workspace \ + --ipc=host $NETARG $DOCKER_IMAGE /bin/bash -c \ + "python -u -m nvflare.private.fed.app.server.server_train -m /workspace -s fed_server.json --set secure_train=true config_folder=config org={~~org_name~~}" else - docker network create $NETWORK_NAME + docker run --rm -it --name=$svr_name -v $DIR/..:/workspace/ -w /workspace/ --ipc=host $NETARG $DOCKER_IMAGE /bin/bash fi - - docker run --name server-parent --network nvflare-network \ - -v $DIR/..:/workspace \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -p {~~admin_port~~}:{~~admin_port~~} -p {~~fed_learn_port~~}:{~~fed_learn_port~~} \ - -p {~~communication_port~~}:{~~communication_port~~} \ - -it --rm $DOCKER_IMAGE /bin/bash -c "export NVFL_DOCKER_WORKSPACE=$DIR/..;startup/sub_start.sh" docker_adm_sh: | #!/usr/bin/env bash @@ -703,22 +719,10 @@ compose_yaml: | nvflare_svc_persist: dockerfile: | - RUN mkdir /opt/NVFlare - WORKDIR /opt/NVFlare RUN pip install -U pip RUN pip install nvflare COPY requirements.txt requirements.txt RUN pip install -r requirements.txt - - RUN apt-get update - RUN apt install docker.io -y - WORKDIR /workspace - -docker_build_sh: | - #!/usr/bin/env bash - docker image rm {~~image~~} - docker build -t {~~image~~} -f Dockerfile . - docker push {~~image~~} helm_chart_chart: | apiVersion: v2