From 97b8bdc6f2e1be226e224a6c9cf0ed459a8fc0e6 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 7 Aug 2020 10:16:48 +0000 Subject: [PATCH 01/32] add dygraph parallel run interface --- python/paddle/distributed/__init__.py | 7 + python/paddle/distributed/launch.py | 8 +- python/paddle/distributed/run.py | 216 ++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/run.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index d0c32e26092f6e..3e0051535449c6 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -11,3 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .run import init_parallel_env, run + +__all__ = [ + "init_parallel_env", + "run", +] diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index ecd1cf0ca7bef6..80d97e325ba223 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -190,7 +190,7 @@ def get_gpus(selected_gpus): return selected_gpus -def launch(args): +def get_cluster_and_pod(args): # parse arguments, used for cloud-single-machine and local selected_gpus = get_gpus(args.selected_gpus) trainers_num = cloud_utils.get_trainers_num() @@ -209,6 +209,12 @@ def launch(args): cluster, pod = get_cluster_from_args(args, selected_gpus) logger.info("get cluster from args:{}".format(cluster)) + return cluster, pod + + +def launch(args): + cluster, pod = get_cluster_and_pod(args) + procs = start_local_trainers( cluster, pod, diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py new file mode 100644 index 00000000000000..748513799a14d9 --- /dev/null +++ b/python/paddle/distributed/run.py @@ -0,0 +1,216 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import copy +import multiprocessing +import os +import signal +import six +import sys + +import paddle.fluid as fluid +from paddle.distributed.launch import get_cluster_and_pod + + +def _py_version_check(): + if not sys.version_info >= (3, 4): + raise RuntimeError( + "Use `paddle.distributed.run` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " + "`paddle.distributed.launch` instead.") + + +class ParallelEnvArgs(object): + def __init__(self): + self.cluster_node_ips = None + self.node_ip = None + self.use_paddlecloud = None + self.started_port = None + self.selected_gpus = None + + +def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): + """ + + Args: + backend(str, optional): The backend to communication between multiple devices. + Now only support `nccl`. Default value is `nccl`. + """ + # 1. 
input check + if not isinstance(trainer_id, six.integer_types): + raise TypeError( + "input `trainer_id` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(trainer_num, six.integer_types): + raise TypeError( + "input `trainer_num` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(backend, six.string_types): + raise TypeError( + "input `backend` type error, expected type is str, but received type is %s." + % type(trainer_id)) + + if trainer_id > 0: + raise ValueError( + "input `trainer_id` should be greater than 0, but received %d." % + trainer_id) + if trainer_num > 0: + raise ValueError( + "input `trainer_num` should be greater than 0, but received %d." % + trainer_num) + if trainer_id < trainer_num: + raise ValueError( + "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." + % (trainer_id, trainer_num)) + if six.ensure_str(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # 2. check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") + args.node_ip = kwargs.get('node_ip', "127.0.0.1") + args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") + args.started_port = kwargs.get('started_port', None) + args.selected_gpus = kwargs.get('selected_gpus', None) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # copy env & remove useless env vars + current_env = copy.copy(os.environ.copy()) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + # prepare env var + + assert trainer_num == cluster.trainers_nranks( + ), "trainer number parse error." 
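+    # pick out the trainer entry whose id matches this process and export its
+    # parallel settings (selected gpus, trainer id, endpoints) as the
+    # environment variables listed above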
+ for trainer in pod.trainers: + if trainer.id == trainer_id: + proc_env = { + "FLAGS_selected_gpus": + "%s" % ",".join([str(g) for g in selected_gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.id, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": + ",".join(cluster.trainers_endpoints()) + } + current_env.update(proc_env) + break + + +def _func_wrapper(func, i, args, error_queue): + try: + func(i, *args) + except KeyboardInterrupt: + pass + except Exception: + import traceback + error_queue.put(traceback.format_exc()) + sys.exit(1) + + +class MultiprocessContext(object): + def __init__(self, processes, error_queues): + _py_version_check() + self.error_queues = error_queues + self.processes = processes + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } + + def join(self, timeout=None): + if len(self.sentinels) == 0: + return True + + ready = multiprocessing.connection.wait( + self.sentinels.keys(), timeout=timeout) + + error_index = None + for sentinel in ready: + index = self.sentinels.pop(sentinel) + process = self.processes[index] + process.join() + if process.exitcode != 0: + error_index = index + break + + if error_index is None: + return len(self.sentinels) == 0 + + for process in self.processes: + if process.is_alive(): + process.terminate() + process.join() + + if self.error_queues[error_index].empty(): + exitcode = self.processes[error_index].exitcode + if exitcode < 0: + name = signal.Signals(-exitcode).name + raise Exception("Process %d terminated with signal %s." % + (error_index, name)) + else: + raise Exception("Process %d terminated with exit code %s." & ( + error_index, exitcode)) + + original_trace = self.error_queues[error_index].get() + msg = "\n\n-- Procces %d terminated with the following error:\n" % error_index + msg += original_trace + raise Exception(msg) + + +def launch_processes(func, + args=(), + nprocs=1, + join=True, + daemon=False, + start_method='spawn'): + mp = multiprocessing.get_context(start_method) + error_queues = [] + processes = [] + for i in range(nprocs): + error_queue = mp.SimpleQueue() + process = mp.Process( + target=_func_wrapper, + args=(func, i, args, error_queue), + daemon=daemon) + process.start() + error_queues.append(error_queue) + processes.append(process) + + context = MultiprocessContext(processes, error_queues) + if not join: + return context + + # loop until all process end + while not context.join(): + pass + + +def run(func, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): + return launch_processes(func, args, nprocs, join, daemon, start_method) From 00b56d5600cd0b9815652f9cdbaa1431cc2f3265 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 7 Aug 2020 13:02:12 +0000 Subject: [PATCH 02/32] polish implement & unified env property name --- python/paddle/distributed/__init__.py | 1 + python/paddle/distributed/launch.py | 2 +- python/paddle/distributed/run.py | 22 ++++++------- python/paddle/fluid/dygraph/parallel.py | 43 ++++++++++++++++--------- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 3e0051535449c6..a558358df0f41b 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . 
import run from .run import init_parallel_env, run __all__ = [ diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 80d97e325ba223..62d46adffcd1af 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -48,7 +48,7 @@ import paddle.fluid as fluid from paddle.distributed.utils import * -import paddle.distributed.cloud_utils as cloud_utils +from paddle.distributed import cloud_utils def _print_arguments(args): diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 748513799a14d9..414e24d64782fb 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -64,15 +64,15 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): "input `backend` type error, expected type is str, but received type is %s." % type(trainer_id)) - if trainer_id > 0: + if trainer_id < 0: raise ValueError( "input `trainer_id` should be greater than 0, but received %d." % trainer_id) - if trainer_num > 0: + if trainer_num < 0: raise ValueError( "input `trainer_num` should be greater than 0, but received %d." % trainer_num) - if trainer_id < trainer_num: + if trainer_id >= trainer_num: raise ValueError( "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." % (trainer_id, trainer_num)) @@ -94,32 +94,32 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): args.node_ip = kwargs.get('node_ip', "127.0.0.1") args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") args.started_port = kwargs.get('started_port', None) - args.selected_gpus = kwargs.get('selected_gpus', None) + args.selected_gpus = ",".join( + [str(g) for g in [x for x in range(0, trainer_num)]]) # reuse code of launch.py cluster, pod = get_cluster_and_pod(args) # copy env & remove useless env vars - current_env = copy.copy(os.environ.copy()) - current_env.pop("http_proxy", None) - current_env.pop("https_proxy", None) + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) # prepare env var assert trainer_num == cluster.trainers_nranks( ), "trainer number parse error." for trainer in pod.trainers: - if trainer.id == trainer_id: + if trainer.rank == trainer_id: proc_env = { "FLAGS_selected_gpus": - "%s" % ",".join([str(g) for g in selected_gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.id, + "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } - current_env.update(proc_env) + os.environ.update(proc_env) break diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 804076f608e714..a6dfc76a833d8b 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -118,47 +118,52 @@ def __init__(self): self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") + self.__aliases__ = { + 'local_rank': 'trainer_id', + 'nranks': 'trainer_num', + 'dev_id': 'devices' + } @property - def nranks(self): + def trainer_id(self): """ - The number of trainers, generally refers to the number of GPU cards used in training. + The current trainer number. - Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. 
The default value is 1. + Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + # execute this command in terminal: export PADDLE_TRAINER_ID=0 import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The nranks is %d" % env.nranks) - # The nranks is 4 + print("The trainer id is %d" % env.trainer_id) + # The trainer id is 0 """ - return self._nranks + return self._local_rank @property - def local_rank(self): + def trainer_num(self): """ - The current trainer number. + The number of trainers, generally refers to the number of GPU cards used in training. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. + Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINER_ID=0 + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The local rank is %d" % env.local_rank) - # The local rank is 0 + print("The trainer num is %d" % env.trainer_num) + # The trainer num is 4 """ - return self._local_rank + return self._nranks @property - def dev_id(self): + def devices(self): """ The ID of selected GPU card for parallel training. @@ -171,7 +176,7 @@ def dev_id(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The device id are %d" % env.dev_id) + print("The device id are %d" % env.devices) # The device id are 1 """ return self._dev_id @@ -215,6 +220,12 @@ def trainer_endpoints(self): """ return self._trainer_endpoints + def __getattr__(self, name): + if name == "__aliases__": + raise AttributeError("Attribue `__aliases__` can not be accessed.") + name = self.__aliases__.get(name, name) + return object.__getattribute__(self, name) + # NOTE: [ Compatible ] Originally this class name is `Env`. 
The semantics of the old class names # are inaccurate and may confuse users, so replace it with `ParallelEnv`, but to be compatible From 17f7fe947fe0c217395cac98287621597f1add00 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 10 Aug 2020 05:17:40 +0000 Subject: [PATCH 03/32] add print config arg --- python/paddle/distributed/run.py | 45 +++++++++++++++++------------- python/paddle/distributed/utils.py | 6 ++++ 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 414e24d64782fb..b552f51eed31c3 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -22,7 +22,7 @@ import sys import paddle.fluid as fluid -from paddle.distributed.launch import get_cluster_and_pod +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments def _py_version_check(): @@ -40,6 +40,7 @@ def __init__(self): self.node_ip = None self.use_paddlecloud = None self.started_port = None + self.print_config = True self.selected_gpus = None @@ -94,33 +95,39 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): args.node_ip = kwargs.get('node_ip', "127.0.0.1") args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") args.started_port = kwargs.get('started_port', None) + args.print_config = kwargs.get('print_config', True) args.selected_gpus = ",".join( [str(g) for g in [x for x in range(0, trainer_num)]]) # reuse code of launch.py cluster, pod = get_cluster_and_pod(args) - # copy env & remove useless env vars + # remove useless env vars os.environ.pop("http_proxy", None) os.environ.pop("https_proxy", None) - # prepare env var - - assert trainer_num == cluster.trainers_nranks( - ), "trainer number parse error." - for trainer in pod.trainers: - if trainer.rank == trainer_id: - proc_env = { - "FLAGS_selected_gpus": - "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": - ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - break + # update env vars + if trainer_num != cluster.trainers_nranks(): + raise RuntimeError( + "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." 
+ % (trainer_num, cluster.trainers_nranks())) + trainer = pod.get_trainer(trainer_id) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer id is %d" % + trainer_id) + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + os.environ.update(proc_env) + + # print config + if args.print_config and trainer_id == 0: + _print_arguments(args) def _func_wrapper(func, i, args, error_queue): diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 7c8fa257f778e7..87d0f1546f38d0 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -213,6 +213,12 @@ def get_visible_gpus(self): r = r[:-1] return r + def get_trainer(self, trainer_id): + for trainer in self.trainers: + if trainer.rank == trainer_id: + return trainer + return None + def get_logger(log_level, name="root"): logger = logging.getLogger(name) From 07c86aa64d616ac475b9807f0fbca2aa31791a9b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 11 Aug 2020 04:02:27 +0000 Subject: [PATCH 04/32] refactor init_parallel_env function --- python/paddle/distributed/__init__.py | 7 +- python/paddle/distributed/launch.py | 2 +- python/paddle/distributed/run.py | 100 ------------ python/paddle/fluid/dygraph/parallel.py | 153 ++++++++++++++++-- .../paddle/fluid/dygraph/parallel_helper.py | 5 + python/paddle/fluid/framework.py | 8 + 6 files changed, 160 insertions(+), 115 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index a558358df0f41b..9f1e68c4a24c74 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -13,9 +13,6 @@ # limitations under the License. from . import run -from .run import init_parallel_env, run +from .run import run -__all__ = [ - "init_parallel_env", - "run", -] +__all__ = ["run", ] diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 62d46adffcd1af..43fb91bf7b83a0 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -45,7 +45,7 @@ import copy from argparse import ArgumentParser, REMAINDER import paddle -import paddle.fluid as fluid +from paddle import fluid from paddle.distributed.utils import * from paddle.distributed import cloud_utils diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index b552f51eed31c3..297077db9a505f 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -14,15 +14,11 @@ from __future__ import print_function -import copy import multiprocessing -import os import signal -import six import sys import paddle.fluid as fluid -from paddle.distributed.launch import get_cluster_and_pod, _print_arguments def _py_version_check(): @@ -34,102 +30,6 @@ def _py_version_check(): "`paddle.distributed.launch` instead.") -class ParallelEnvArgs(object): - def __init__(self): - self.cluster_node_ips = None - self.node_ip = None - self.use_paddlecloud = None - self.started_port = None - self.print_config = True - self.selected_gpus = None - - -def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): - """ - - Args: - backend(str, optional): The backend to communication between multiple devices. 
- Now only support `nccl`. Default value is `nccl`. - """ - # 1. input check - if not isinstance(trainer_id, six.integer_types): - raise TypeError( - "input `trainer_id` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(trainer_num, six.integer_types): - raise TypeError( - "input `trainer_num` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(backend, six.string_types): - raise TypeError( - "input `backend` type error, expected type is str, but received type is %s." - % type(trainer_id)) - - if trainer_id < 0: - raise ValueError( - "input `trainer_id` should be greater than 0, but received %d." % - trainer_id) - if trainer_num < 0: - raise ValueError( - "input `trainer_num` should be greater than 0, but received %d." % - trainer_num) - if trainer_id >= trainer_num: - raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." - % (trainer_id, trainer_num)) - if six.ensure_str(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." % - backend) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") - args.node_ip = kwargs.get('node_ip', "127.0.0.1") - args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") - args.started_port = kwargs.get('started_port', None) - args.print_config = kwargs.get('print_config', True) - args.selected_gpus = ",".join( - [str(g) for g in [x for x in range(0, trainer_num)]]) - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - if trainer_num != cluster.trainers_nranks(): - raise RuntimeError( - "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." - % (trainer_num, cluster.trainers_nranks())) - trainer = pod.get_trainer(trainer_id) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer id is %d" % - trainer_id) - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - - # print config - if args.print_config and trainer_id == 0: - _print_arguments(args) - - def _func_wrapper(func, i, args, error_queue): try: func(i, *args) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index a6dfc76a833d8b..c6907c14e26d13 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import six import numpy as np @@ -20,8 +21,11 @@ from . import parallel_helper from .. import framework from . 
import to_variable, no_grad +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments -__all__ = ["prepare_context", "ParallelEnv", "DataParallel"] +__all__ = [ + "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" +] ParallelStrategy = core.ParallelStrategy @@ -43,13 +47,145 @@ def prepare_context(strategy=None): place = framework._current_expected_place() assert place is not None, \ "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard." - if isinstance(place, core.CUDAPlace): + if not parallel_helper._is_parallel_ctx_initialized(): + if isinstance(place, core.CUDAPlace): + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + else: + # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation + assert ("Only support CUDAPlace for now.") + parallel_helper._init_parallel_ctx() + return strategy + + +class ParallelEnvArgs(object): + def __init__(self): + self.cluster_node_ips = None + self.node_ip = None + self.use_paddlecloud = None + self.started_port = None + self.print_config = True + self.selected_gpus = None + self.backend = None + + +def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): + """ + Initialize parallel environments. + + Args: + backend(str, optional): The backend to communication between multiple devices. + Now only support `nccl`. Default value is `nccl`. + + Returns: + ParallelStrategy + + Examples: + + """ + # 1. input check + if not isinstance(trainer_id, six.integer_types): + raise TypeError( + "input `trainer_id` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(trainer_num, six.integer_types): + raise TypeError( + "input `trainer_num` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(backend, six.string_types): + raise TypeError( + "input `backend` type error, expected type is str, but received type is %s." + % type(trainer_id)) + + if trainer_id < 0: + raise ValueError( + "input `trainer_id` should be greater than 0, but received %d." % + trainer_id) + if trainer_num < 0: + raise ValueError( + "input `trainer_num` should be greater than 0, but received %d." % + trainer_num) + if trainer_id >= trainer_num: + raise ValueError( + "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." + % (trainer_id, trainer_num)) + if six.ensure_str(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # 2. 
check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") + args.node_ip = kwargs.get('node_ip', "127.0.0.1") + args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") + args.started_port = kwargs.get('started_port', None) + args.print_config = kwargs.get('print_config', True) + args.selected_gpus = ",".join( + [str(g) for g in [x for x in range(0, trainer_num)]]) + args.backend = backend + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # remove useless env vars + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + # update env vars + if trainer_num != cluster.trainers_nranks(): + raise RuntimeError( + "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." + % (trainer_num, cluster.trainers_nranks())) + trainer = pod.get_trainer(trainer_id) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer id is %d" % + trainer_id) + # why trainer.gpus? here only one device? + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + os.environ.update(proc_env) + + # print config + if args.print_config and trainer_id == 0: + _print_arguments(args) + + # 3. init ParallelStrategy + strategy = ParallelStrategy() + if six.ensure_str(backend) == 'nccl': + strategy.nranks = ParallelEnv().nranks + strategy.local_rank = ParallelEnv().local_rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + # NOTE: [ why config global place here? ] + # the dygraph mode will be set to default mode, + # users will not call `dygraph.guard` or `enable_dygraph` + # directly, if they want to switch detault place, + # they need to call a function to change default place, + # here just set correctly place to users + place = core.CUDAPlace(ParallelEnv().dev_id) + framework._switch_current_place(place) + + # init nccl context parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) - else: - # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation - assert ("Only support CUDAPlace for now.") - parallel_helper._init_parallel_ctx() + parallel_helper._init_parallel_ctx() + return strategy @@ -121,7 +257,6 @@ def __init__(self): self.__aliases__ = { 'local_rank': 'trainer_id', 'nranks': 'trainer_num', - 'dev_id': 'devices' } @property @@ -163,7 +298,7 @@ def trainer_num(self): return self._nranks @property - def devices(self): + def dev_id(self): """ The ID of selected GPU card for parallel training. 
@@ -176,7 +311,7 @@ def devices(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The device id are %d" % env.devices) + print("The device id are %d" % env.dev_id) # The device id are 1 """ return self._dev_id diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py index f378211de2b8a1..ff1675f0ae8a40 100644 --- a/python/paddle/fluid/dygraph/parallel_helper.py +++ b/python/paddle/fluid/dygraph/parallel_helper.py @@ -23,6 +23,11 @@ def _is_data_parallel_mode(): os.getenv("PADDLE_TRAINERS_NUM", "1")) > 1 +def _is_parallel_ctx_initialized(): + global __parallel_ctx__clz__ + return __parallel_ctx__clz__ is not None + + def _set_parallel_ctx(nccl_parallel_context): global __parallel_ctx__clz__ assert __parallel_ctx__clz__ is None, \ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a7faf4041cfe49..3b1ddcea37e0cc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5400,6 +5400,14 @@ def _dygraph_place_guard(place): _dygraph_current_expected_place_ = tmp_place +def _switch_current_place(place): + global _dygraph_tracer_ + global _dygraph_current_expected_place_ + if _dygraph_tracer_ is not None: + _dygraph_tracer_._expected_place = place + _dygraph_current_expected_place_ = place + + def load_op_library(lib_filename): """ :api_attr: Static Graph From 4c955a13a24b5cfd1f430e4a61f16fbcf8dc513a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Aug 2020 13:58:57 +0000 Subject: [PATCH 05/32] Compatible with multiprocessing and launch modes --- python/paddle/fluid/dygraph/parallel.py | 165 ++++++++++++------------ 1 file changed, 85 insertions(+), 80 deletions(-) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index c6907c14e26d13..cba3ecf10f7d8f 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -15,6 +15,7 @@ import os import six import numpy as np +import warnings from collections import OrderedDict from .. import core from . import layers @@ -66,7 +67,6 @@ def __init__(self): self.started_port = None self.print_config = True self.selected_gpus = None - self.backend = None def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): @@ -83,85 +83,90 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): Examples: """ - # 1. input check - if not isinstance(trainer_id, six.integer_types): - raise TypeError( - "input `trainer_id` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(trainer_num, six.integer_types): - raise TypeError( - "input `trainer_num` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(backend, six.string_types): - raise TypeError( - "input `backend` type error, expected type is str, but received type is %s." - % type(trainer_id)) - - if trainer_id < 0: - raise ValueError( - "input `trainer_id` should be greater than 0, but received %d." % - trainer_id) - if trainer_num < 0: - raise ValueError( - "input `trainer_num` should be greater than 0, but received %d." % - trainer_num) - if trainer_id >= trainer_num: - raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." 
- % (trainer_id, trainer_num)) - if six.ensure_str(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." % - backend) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") - args.node_ip = kwargs.get('node_ip', "127.0.0.1") - args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") - args.started_port = kwargs.get('started_port', None) - args.print_config = kwargs.get('print_config', True) - args.selected_gpus = ",".join( - [str(g) for g in [x for x in range(0, trainer_num)]]) - args.backend = backend - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - if trainer_num != cluster.trainers_nranks(): - raise RuntimeError( - "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." - % (trainer_num, cluster.trainers_nranks())) - trainer = pod.get_trainer(trainer_id) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer id is %d" % - trainer_id) - # why trainer.gpus? here only one device? - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - - # print config - if args.print_config and trainer_id == 0: - _print_arguments(args) + + # NOTE(chenweihang): if trainer_id or trainer_num is default value, + # users should config parallel environment by module `paddle.distributed.launch`, + # so here we skip the environment variables config phase + if trainer_id != -1 or trainer_num != -1: + # 1. input check + if not isinstance(trainer_id, six.integer_types): + raise TypeError( + "input `trainer_id` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(trainer_num, six.integer_types): + raise TypeError( + "input `trainer_num` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(backend, six.string_types): + raise TypeError( + "input `backend` type error, expected type is str, but received type is %s." + % type(trainer_id)) + + if trainer_id < 0: + raise ValueError( + "input `trainer_id` should be greater than 0, but received %d." + % trainer_id) + if trainer_num < 0: + raise ValueError( + "input `trainer_num` should be greater than 0, but received %d." + % trainer_num) + if trainer_id >= trainer_num: + raise ValueError( + "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." + % (trainer_id, trainer_num)) + if six.ensure_str(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." + % backend) + + # 2. 
check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") + args.node_ip = kwargs.get('node_ip', "127.0.0.1") + args.use_paddlecloud = kwargs.get('use_paddlecloud', False) + args.started_port = kwargs.get('started_port', None) + args.print_config = kwargs.get('print_config', True) + args.selected_gpus = ",".join( + [str(g) for g in [x for x in range(0, trainer_num)]]) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # remove useless env vars + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + # update env vars + if trainer_num != cluster.trainers_nranks(): + raise RuntimeError( + "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." + % (trainer_num, cluster.trainers_nranks())) + trainer = pod.get_trainer(trainer_id) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer id is %d" % + trainer_id) + # why trainer.gpus? here only one device? + proc_env = { + "FLAGS_selected_gpus": + "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + os.environ.update(proc_env) + + # print config + if args.print_config and trainer_id == 0: + _print_arguments(args) # 3. init ParallelStrategy strategy = ParallelStrategy() From 523e007a78847f86297bbabf04dd812246f45f72 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 14 Aug 2020 08:36:33 +0000 Subject: [PATCH 06/32] set default trainer start port --- python/paddle/distributed/run.py | 36 ++++++++++++---- python/paddle/fluid/dygraph/parallel.py | 55 +++++++++++++++++-------- 2 files changed, 66 insertions(+), 25 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 297077db9a505f..85b495cef37fb8 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -15,19 +15,32 @@ from __future__ import print_function import multiprocessing +import os import signal import sys +import warnings import paddle.fluid as fluid +from paddle.distributed.utils import find_free_ports -def _py_version_check(): +def _support_set_start_method(): if not sys.version_info >= (3, 4): - raise RuntimeError( - "Use `paddle.distributed.run` to start parallel training " - "requires python version greater than 3.4, if your python " - "is lower than this version, please use " - "`paddle.distributed.launch` instead.") + warnings.warn( + "`paddle.distributed.run` only supports setting the process" + " start when python version greater than 3.4, if your python" + " is lower than this version, only can start processes by" + " default method of current platform.") + + +def _set_default_master_env(): + # set default master trainer ip addr + os.environ['PADDLE_MASTER_IPADDR'] = '127.0.0.1' + # set default master trainer port + port_set = find_free_ports(1) + if port_set is None: + raise RuntimeError("no free port can be used to parallel training now.") + os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) def _func_wrapper(func, i, args, error_queue): @@ -43,7 +56,7 @@ def _func_wrapper(func, 
i, args, error_queue): class MultiprocessContext(object): def __init__(self, processes, error_queues): - _py_version_check() + _support_set_start_method() self.error_queues = error_queues self.processes = processes self.sentinels = { @@ -97,6 +110,15 @@ def launch_processes(func, join=True, daemon=False, start_method='spawn'): + # NOTE(chenweihang): [ why need set default master info before run? ] + # when using `paddle.distributed.run` start parallel training, + # users need use `init_parallel_env` to config some cluster info + # inner subprocess, if each process find free port for itself, + # the started port may be different, it will cause endpoints is + # different in different subprocesses + _set_default_master_env() + + # start processes mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index cba3ecf10f7d8f..73c8d7c618b476 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -91,29 +91,27 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): # 1. input check if not isinstance(trainer_id, six.integer_types): raise TypeError( - "input `trainer_id` type error, expected type is integer, but received type is %s." - % type(trainer_id)) + "input `trainer_id` type error, expected type is integer, " + "but received type is %s." % type(trainer_id)) if not isinstance(trainer_num, six.integer_types): raise TypeError( - "input `trainer_num` type error, expected type is integer, but received type is %s." - % type(trainer_id)) + "input `trainer_num` type error, expected type is integer, " + "but received type is %s." % type(trainer_id)) if not isinstance(backend, six.string_types): - raise TypeError( - "input `backend` type error, expected type is str, but received type is %s." - % type(trainer_id)) + raise TypeError("input `backend` type error, expected type is str, " + "but received type is %s." % type(trainer_id)) if trainer_id < 0: - raise ValueError( - "input `trainer_id` should be greater than 0, but received %d." - % trainer_id) + raise ValueError("input `trainer_id` should be greater than 0, " + "but received %d." % trainer_id) if trainer_num < 0: - raise ValueError( - "input `trainer_num` should be greater than 0, but received %d." - % trainer_num) + raise ValueError("input `trainer_num` should be greater than 0, " + "but received %d." % trainer_num) if trainer_id >= trainer_num: raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." - % (trainer_id, trainer_num)) + "input `trainer_id` should be less than or equal to `trainer_num`, " + "but `trainer_id` is %d, `trainer_num` is %d." % + (trainer_id, trainer_num)) if six.ensure_str(backend) != 'nccl': raise ValueError( "backend `%s` is not supported, now only supports `nccl` backend." 
@@ -128,10 +126,31 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): # get args from kwargs args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") - args.node_ip = kwargs.get('node_ip', "127.0.0.1") - args.use_paddlecloud = kwargs.get('use_paddlecloud', False) + args.cluster_node_ips = kwargs.get('cluster_node_ips', None) + args.node_ip = kwargs.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot `cluster_node_ips`.") + default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) + default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + + # NOTE(chenweihang): Here should set started_port before + # `get_cluster_and_pod` and keep each process's started_port + # is same, see [ why need set default master info before run? ] args.started_port = kwargs.get('started_port', None) + if args.started_port is None: + default_port = os.environ.get("PADDLE_MASTER_PORT", None) + if default_port is None: + raise RuntimeError( + "please input start port of parallel training by `started_port=**`." + ) + args.started_port = default_port + + args.use_paddlecloud = kwargs.get('use_paddlecloud', False) args.print_config = kwargs.get('print_config', True) args.selected_gpus = ",".join( [str(g) for g in [x for x in range(0, trainer_num)]]) From 8101b035ebf8000de868fda45a083d252a648da8 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 15 Aug 2020 14:54:34 +0000 Subject: [PATCH 07/32] support run in python 2 --- python/paddle/distributed/run.py | 67 ++++++++++++++++++++++--- python/paddle/fluid/dygraph/parallel.py | 4 +- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 85b495cef37fb8..0153f96fc42385 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -31,6 +31,14 @@ def _support_set_start_method(): " start when python version greater than 3.4, if your python" " is lower than this version, only can start processes by" " default method of current platform.") + return False + return True + + +def _support_connection_wait(): + if not sys.version_info >= (3, 3): + return False + return True def _set_default_master_env(): @@ -56,15 +64,25 @@ def _func_wrapper(func, i, args, error_queue): class MultiprocessContext(object): def __init__(self, processes, error_queues): - _support_set_start_method() self.error_queues = error_queues self.processes = processes - self.sentinels = { - process.sentinel: index - for index, process in enumerate(processes) - } + # NOTE(chenweihang): multiprocessing.connection.wait is a new feature + # supported from python3.3, which can provide more fine-grained support + # for multi-subprocess exit monitoring. 
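+        # when connection.wait is unavailable (python < 3.3), join() falls
+        # back to polling each subprocess with a short join timeout, see
+        # _join_without_conn_wait below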
+ self.use_connection_wait = _support_connection_wait() + if self.use_connection_wait: + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } def join(self, timeout=None): + if self.use_connection_wait: + return self._join_with_conn_wait(timeout) + else: + return self._join_without_conn_wait(timeout) + + def _join_with_conn_wait(self, timeout=None): if len(self.sentinels) == 0: return True @@ -83,6 +101,39 @@ def join(self, timeout=None): if error_index is None: return len(self.sentinels) == 0 + self._join_and_throw_exception(error_index) + + # NOTE(chenweihng): This method is not as efficient as connection.wait. + # Beccause if process has already stopped, p.join() will return immediately. + # If process hasn't stopped, it will block until the process end, and + # as the same time, the other process may have been end, which makes it + # impossible for us to accurately capture the first failed process. + # Here we avoid process block by setting timeout, but if the other process + # exit before the timeout end, we will also encounter the previois problem, + # but maybe there is no better way here. When we fully migrated to python3, + # this problem disappeared + def _join_without_conn_wait(self, timeout=None): + finished_processes = [] + error_index = None + for index, proccess in enumerate(self.processes): + # try to join selected process + proccess.join(timeout=1) + # This will be None if the process has not yet terminated + if process.exitcode is not None: + # exit with exception + if process.exitcode == 0: + finished_processes.append(index) + else: + error_index = index + failed_processes.append(index) + break + + if error_index is None: + return len(finished_processes) == 0 + + self._join_and_throw_exception(error_index) + + def _join_and_throw_exception(self, error_index): for process in self.processes: if process.is_alive(): process.terminate() @@ -119,7 +170,11 @@ def launch_processes(func, _set_default_master_env() # start processes - mp = multiprocessing.get_context(start_method) + if _support_set_start_method(): + mp = multiprocessing.get_context(start_method) + else: + mp = multiprocessing + error_queues = [] processes = [] for i in range(nprocs): diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 73c8d7c618b476..6779a6c8edee3a 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -148,10 +148,10 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): raise RuntimeError( "please input start port of parallel training by `started_port=**`." 
) - args.started_port = default_port + args.started_port = int(default_port) args.use_paddlecloud = kwargs.get('use_paddlecloud', False) - args.print_config = kwargs.get('print_config', True) + args.print_config = kwargs.get('print_config', False) args.selected_gpus = ",".join( [str(g) for g in [x for x in range(0, trainer_num)]]) From d3b9a065e8b7f464400fc5c3ab906b0e610269ce Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 17 Aug 2020 06:37:14 +0000 Subject: [PATCH 08/32] polish python2 support code --- python/paddle/distributed/launch.py | 5 ++-- python/paddle/distributed/run.py | 32 ++++++++++++++++--------- python/paddle/fluid/dygraph/parallel.py | 16 +++++++------ 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 43fb91bf7b83a0..8a093abb3766ed 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -44,9 +44,8 @@ import six import copy from argparse import ArgumentParser, REMAINDER -import paddle -from paddle import fluid +from paddle.fluid import core from paddle.distributed.utils import * from paddle.distributed import cloud_utils @@ -167,7 +166,7 @@ def get_cluster_from_args(args, selected_gpus): def get_gpus(selected_gpus): if selected_gpus is None: - gpus_num = fluid.core.get_cuda_device_count() + gpus_num = core.get_cuda_device_count() selected_gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 0153f96fc42385..b64968cf4741ec 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -12,17 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function +from __future__ import print_function, division import multiprocessing import os import signal +import six import sys import warnings import paddle.fluid as fluid from paddle.distributed.utils import find_free_ports +# SimpleQueue is different in py2 and py3 +if six.PY2: + import multiprocessing.queues as queues_py2 + def _support_set_start_method(): if not sys.version_info >= (3, 4): @@ -115,9 +120,10 @@ def _join_with_conn_wait(self, timeout=None): def _join_without_conn_wait(self, timeout=None): finished_processes = [] error_index = None - for index, proccess in enumerate(self.processes): + timeout = timeout // len(self.processes) if timeout else 1 + for index, process in enumerate(self.processes): # try to join selected process - proccess.join(timeout=1) + process.join(timeout=timeout) # This will be None if the process has not yet terminated if process.exitcode is not None: # exit with exception @@ -125,11 +131,11 @@ def _join_without_conn_wait(self, timeout=None): finished_processes.append(index) else: error_index = index - failed_processes.append(index) + finished_processes.append(index) break if error_index is None: - return len(finished_processes) == 0 + return len(finished_processes) == len(self.processes) self._join_and_throw_exception(error_index) @@ -146,11 +152,13 @@ def _join_and_throw_exception(self, error_index): raise Exception("Process %d terminated with signal %s." % (error_index, name)) else: - raise Exception("Process %d terminated with exit code %s." & ( + raise Exception("Process %d terminated with exit code %d." 
& ( error_index, exitcode)) original_trace = self.error_queues[error_index].get() - msg = "\n\n-- Procces %d terminated with the following error:\n" % error_index + msg = "\n\n----------------------------------------------\n" \ + "Procces %d terminated with the following error:\n" \ + "----------------------------------------------\n\n" % error_index msg += original_trace raise Exception(msg) @@ -178,11 +186,13 @@ def launch_processes(func, error_queues = [] processes = [] for i in range(nprocs): - error_queue = mp.SimpleQueue() + if six.PY2: + error_queue = queues_py2.SimpleQueue() + else: + error_queue = mp.SimpleQueue() process = mp.Process( - target=_func_wrapper, - args=(func, i, args, error_queue), - daemon=daemon) + target=_func_wrapper, args=(func, i, args, error_queue)) + process.daemon = daemon process.start() error_queues.append(error_queue) processes.append(process) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 6779a6c8edee3a..382e9ae6b809d2 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -17,11 +17,13 @@ import numpy as np import warnings from collections import OrderedDict -from .. import core -from . import layers -from . import parallel_helper -from .. import framework -from . import to_variable, no_grad + +from paddle import compat as cpt +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.dygraph import layers +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph import to_variable, no_grad from paddle.distributed.launch import get_cluster_and_pod, _print_arguments __all__ = [ @@ -112,7 +114,7 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): "input `trainer_id` should be less than or equal to `trainer_num`, " "but `trainer_id` is %d, `trainer_num` is %d." % (trainer_id, trainer_num)) - if six.ensure_str(backend) != 'nccl': + if cpt.to_text(backend) != 'nccl': raise ValueError( "backend `%s` is not supported, now only supports `nccl` backend." % backend) @@ -189,7 +191,7 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): # 3. 
init ParallelStrategy strategy = ParallelStrategy() - if six.ensure_str(backend) == 'nccl': + if cpt.to_text(backend) == 'nccl': strategy.nranks = ParallelEnv().nranks strategy.local_rank = ParallelEnv().local_rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints From 48c46ff43dfdfadcf2d9f453071baf4f7fbe56b5 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 17 Aug 2020 07:23:34 +0000 Subject: [PATCH 09/32] remove python2 support --- python/paddle/distributed/run.py | 103 ++++++++----------------------- 1 file changed, 27 insertions(+), 76 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index b64968cf4741ec..561404453fb8a6 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -17,33 +17,20 @@ import multiprocessing import os import signal -import six import sys import warnings import paddle.fluid as fluid from paddle.distributed.utils import find_free_ports -# SimpleQueue is different in py2 and py3 -if six.PY2: - import multiprocessing.queues as queues_py2 - -def _support_set_start_method(): +def _py_supported_check(): if not sys.version_info >= (3, 4): - warnings.warn( - "`paddle.distributed.run` only supports setting the process" - " start when python version greater than 3.4, if your python" - " is lower than this version, only can start processes by" - " default method of current platform.") - return False - return True - - -def _support_connection_wait(): - if not sys.version_info >= (3, 3): - return False - return True + raise RuntimeError( + "Use `paddle.distributed.run` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " + "`paddle.distributed.launch` instead.") def _set_default_master_env(): @@ -69,25 +56,15 @@ def _func_wrapper(func, i, args, error_queue): class MultiprocessContext(object): def __init__(self, processes, error_queues): + _py_supported_check() self.error_queues = error_queues self.processes = processes - # NOTE(chenweihang): multiprocessing.connection.wait is a new feature - # supported from python3.3, which can provide more fine-grained support - # for multi-subprocess exit monitoring. - self.use_connection_wait = _support_connection_wait() - if self.use_connection_wait: - self.sentinels = { - process.sentinel: index - for index, process in enumerate(processes) - } + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } def join(self, timeout=None): - if self.use_connection_wait: - return self._join_with_conn_wait(timeout) - else: - return self._join_without_conn_wait(timeout) - - def _join_with_conn_wait(self, timeout=None): if len(self.sentinels) == 0: return True @@ -106,45 +83,14 @@ def _join_with_conn_wait(self, timeout=None): if error_index is None: return len(self.sentinels) == 0 - self._join_and_throw_exception(error_index) - - # NOTE(chenweihng): This method is not as efficient as connection.wait. - # Beccause if process has already stopped, p.join() will return immediately. - # If process hasn't stopped, it will block until the process end, and - # as the same time, the other process may have been end, which makes it - # impossible for us to accurately capture the first failed process. - # Here we avoid process block by setting timeout, but if the other process - # exit before the timeout end, we will also encounter the previois problem, - # but maybe there is no better way here. 
When we fully migrated to python3, - # this problem disappeared - def _join_without_conn_wait(self, timeout=None): - finished_processes = [] - error_index = None - timeout = timeout // len(self.processes) if timeout else 1 - for index, process in enumerate(self.processes): - # try to join selected process - process.join(timeout=timeout) - # This will be None if the process has not yet terminated - if process.exitcode is not None: - # exit with exception - if process.exitcode == 0: - finished_processes.append(index) - else: - error_index = index - finished_processes.append(index) - break - - if error_index is None: - return len(finished_processes) == len(self.processes) - - self._join_and_throw_exception(error_index) - - def _join_and_throw_exception(self, error_index): for process in self.processes: if process.is_alive(): process.terminate() process.join() + self._throw_exception(error_index) + + def _throw_exception(self, error_index): if self.error_queues[error_index].empty(): exitcode = self.processes[error_index].exitcode if exitcode < 0: @@ -163,12 +109,23 @@ def _join_and_throw_exception(self, error_index): raise Exception(msg) +# NOTE(chenweihang): [ why default start method is spawn? ] +# The CUDA runtime does not support the fork start method, +# either the spawn or forkserver start method are required +# to use CUDA in subprocesses. def launch_processes(func, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): + # NOTE(chenweihang): [ why only supports python3.4+? ] + # Python has only supported setting the child process startup method + # since 3.4. The previous version can only use the default startup + # method, while the default startup method of Unix is fork, which + # cannot support CUDA runtime multi-process + _py_supported_check() + # NOTE(chenweihang): [ why need set default master info before run? 
] # when using `paddle.distributed.run` start parallel training, # users need use `init_parallel_env` to config some cluster info @@ -178,18 +135,12 @@ def launch_processes(func, _set_default_master_env() # start processes - if _support_set_start_method(): - mp = multiprocessing.get_context(start_method) - else: - mp = multiprocessing + mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] for i in range(nprocs): - if six.PY2: - error_queue = queues_py2.SimpleQueue() - else: - error_queue = mp.SimpleQueue() + error_queue = mp.SimpleQueue() process = mp.Process( target=_func_wrapper, args=(func, i, args, error_queue)) process.daemon = daemon From b06d40050b95a880977a2185a517c6fd33dbf362 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 19 Aug 2020 05:59:40 +0000 Subject: [PATCH 10/32] refine launch import --- python/paddle/distributed/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 8a093abb3766ed..e2ab321f9aebdd 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -45,7 +45,6 @@ import copy from argparse import ArgumentParser, REMAINDER -from paddle.fluid import core from paddle.distributed.utils import * from paddle.distributed import cloud_utils @@ -166,6 +165,7 @@ def get_cluster_from_args(args, selected_gpus): def get_gpus(selected_gpus): if selected_gpus is None: + from paddle.fluid import core gpus_num = core.get_cuda_device_count() selected_gpus = [str(x) for x in range(0, gpus_num)] else: From 2c7b3fd58a57edf09b0b79e477bb507c279f57b7 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 19 Aug 2020 12:55:00 +0000 Subject: [PATCH 11/32] polish dome design details --- python/paddle/distributed/__init__.py | 7 ++++--- .../paddle/distributed/{run.py => spawn.py} | 19 +++++++++++-------- python/paddle/fluid/dygraph/parallel.py | 5 ++++- python/paddle/framework/__init__.py | 6 +++--- 4 files changed, 22 insertions(+), 15 deletions(-) rename python/paddle/distributed/{run.py => spawn.py} (91%) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 9f1e68c4a24c74..ec181cdd0f2803 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import run -from .run import run +from . import spawn +from .spawn import spwan +from .spawn import start_processes -__all__ = ["run", ] +__all__ = ["spawn", "start_processes"] diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/spawn.py similarity index 91% rename from python/paddle/distributed/run.py rename to python/paddle/distributed/spawn.py index 561404453fb8a6..ef2e81e50769a4 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/spawn.py @@ -113,12 +113,12 @@ def _throw_exception(self, error_index): # The CUDA runtime does not support the fork start method, # either the spawn or forkserver start method are required # to use CUDA in subprocesses. -def launch_processes(func, - args=(), - nprocs=1, - join=True, - daemon=False, - start_method='spawn'): +def start_processes(func, + args=(), + nprocs=1, + join=True, + daemon=False, + start_method='spawn'): # NOTE(chenweihang): [ why only supports python3.4+? ] # Python has only supported setting the child process startup method # since 3.4. 
The previous version can only use the default startup @@ -157,5 +157,8 @@ def launch_processes(func, pass -def run(func, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): - return launch_processes(func, args, nprocs, join, daemon, start_method) +# NOTE(chenweihang): this method only supports start processes +# by `spwan` method, if users want to start processes by other +# method, they can use start_processes +def spawn(func, args=(), nprocs=1, join=True, daemon=False): + return launch_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 29deace1618cd6..ce3ce7944de33c 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -33,6 +33,7 @@ ParallelStrategy = core.ParallelStrategy +@deprecated(since="2.0.0", update_to="paddle.init_parallel_env") def prepare_context(strategy=None): ''' :api_attr: imperative @@ -71,13 +72,15 @@ def __init__(self): self.selected_gpus = None -def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): +def init_parallel_env(rank=-1, backend='nccl', **kwargs): """ Initialize parallel environments. Args: + rank(int, optional): Rank of current process. Default vaule is -1. backend(str, optional): The backend to communication between multiple devices. Now only support `nccl`. Default value is `nccl`. + **options(dict, optional): Other initial parallel execution environment configuration. Returns: ParallelStrategy diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f01dc01973a603..aead17e2da152e 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,9 +50,9 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS -from ..fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS -from ..fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS -from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS +from . import prepare_context +from . import ParallelEnv +from . 
import DataParallel from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS From d26f495dd53996ee67200df9cac229860fee5c5d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 20 Aug 2020 12:40:59 +0000 Subject: [PATCH 12/32] refactor api implemention & path --- python/paddle/__init__.py | 3 - python/paddle/distributed/__init__.py | 16 +- python/paddle/distributed/parallel.py | 222 ++++++++++++++++++++++++ python/paddle/distributed/spawn.py | 15 +- python/paddle/fluid/dygraph/parallel.py | 181 ++----------------- python/paddle/framework/__init__.py | 3 - 6 files changed, 257 insertions(+), 183 deletions(-) create mode 100644 python/paddle/distributed/parallel.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 518e2c0c4d90da..73213054fe1646 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -216,9 +216,6 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS -from .framework import prepare_context #DEFINE_ALIAS -from .framework import ParallelEnv #DEFINE_ALIAS -from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS from .framework import PiecewiseDecay #DEFINE_ALIAS diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index ec181cdd0f2803..c40902010bb2a2 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,8 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +# start multiprocess apis +__all__ = ["spawn", "start_processes"] + +# dygraph parallel apis +__all__ += [ + "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" +] + from . import spawn -from .spawn import spwan +from .spawn import spawn from .spawn import start_processes -__all__ = ["spawn", "start_processes"] +from . import parallel +from .parallel import init_parallel_env +from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS +from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS +from paddle.fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py new file mode 100644 index 00000000000000..ee1eac01a22014 --- /dev/null +++ b/python/paddle/distributed/parallel.py @@ -0,0 +1,222 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except jin compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
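+#
+# NOTE: a minimal usage sketch of how this module is meant to be driven at
+# this point in the series (the network, shapes and nprocs value below are
+# illustrative only and are not part of this file):
+#
+#     import paddle
+#     import paddle.nn as nn
+#     import paddle.distributed as dist
+#
+#     def train(rank):
+#         paddle.disable_static()
+#         # fills PADDLE_TRAINER_ID / PADDLE_TRAINER_ENDPOINTS etc. and
+#         # returns a core.ParallelStrategy for DataParallel
+#         strategy = dist.init_parallel_env(rank)
+#         layer = dist.DataParallel(nn.Linear(10, 1), strategy)
+#         out = layer(paddle.randn([4, 10], 'float32'))
+#
+#     if __name__ == '__main__':
+#         dist.spawn(train, args=(), nprocs=2)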
+ +import os +import six + +from paddle import compat as cpt +from paddle.distributed.launch import _parse_args, get_cluster_and_pod, _print_arguments + +# deprecated module import +from paddle.fluid import core +from paddle.fluid.framework import _switch_current_place +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph.parallel import ParallelEnv + +__all__ = ["init_parallel_env"] + +ParallelStrategy = core.ParallelStrategy + + +# NOTE(chenweihang): The existence of this class leads to +# the maintenance of two arguments. When the launch.py arguments +# is updated, the arguments here also need to be updated, +# but I have not thought of a better way here +class ParallelEnvArgs(object): + def __init__(self): + # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. + self.cluster_node_ips = None + + # The current node ip. + self.node_ip = None + + # wheter to use paddlecloud platform to run your multi-process job. + # If false, no need to set this argument. + self.use_paddlecloud = None + + # The trainer's started port on a single node + self.started_port = None + + # Print the config or not + self.print_config = True + + # It's for gpu training and the training process will run + # on the selected_gpus, each process is bound to a single GPU. + # And if it's not set, this module will use all the gpu cards + # for training. + self.selected_gpus = None + + +def _update_env_vars(rank, options): + # 1. input check + if not isinstance(rank, six.integer_types): + raise TypeError("input `rank` type error, expected type is integer, " + "but received type is %s." % type(rank)) + if rank < 0: + raise ValueError("input `rank` should be greater than 0, " + "but received %d." % rank) + + # 2. check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + # set default `node_ip` and `cluster_node_ips` + args.cluster_node_ips = options.get('cluster_node_ips', None) + args.node_ip = options.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot only give `cluster_node_ips`.") + default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) + default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + + # NOTE(chenweihang): Here should set `started_port` before + # `get_cluster_and_pod` and keep each process's started_port + # is same, see [ why need set default master info before run? ] + args.started_port = options.get('started_port', None) + if args.started_port is None: + default_port = os.environ.get("PADDLE_MASTER_PORT", None) + if default_port is None: + raise RuntimeError( + "please input start port of parallel training by `started_port=**`," + "e.g. 
started_port=6170") + args.started_port = int(default_port) + + args.use_paddlecloud = options.get('use_paddlecloud', False) + args.print_config = options.get('print_config', True) + + # set default `selected_gpus` + # TODO(chenweihang): if users gived number of `selected_gpus` + # is not equal to the spawn's nprocs, it will cause error, + # and because we remove the `proc num` argument of + # `init_parallel_env`, when above error occured, we do not + # have a good way to check, so users are not recommended to + # use this parameter, it is best to delete + args.selected_gpus = options.get('selected_gpus', None) + if args.selected_gpus is None: + args.selected_gpus = os.environ.get("PADDLE_CUDA_VISIBLE_DEVICES", None) + if args.selected_gpus is None: + raise ValueError( + "please input selected gpus of parallel training by `selected_gpus=**`," + "e.g. selected_gpus='0,1,2,3'.", ) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # remove useless env vars + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + # update env vars + trainer = pod.get_trainer(rank) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer rank is %d" % rank) + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + # no copy, each process will hold env vars itself + os.environ.update(proc_env) + + # print config + if args.print_config and rank == 0: + _print_arguments(args) + + +def _check_env_vars(): + def _check_var_exists(var_name): + var = os.environ.get(var_name, None) + if var is None: + raise ValueError("paddle.distributed initialize error," + "Environment variable %s is needed, but not set.", + var_name) + + _check_var_exists("FLAGS_selected_gpus") + _check_var_exists("PADDLE_TRAINER_ID") + _check_var_exists("PADDLE_CURRENT_ENDPOINT") + _check_var_exists("PADDLE_TRAINERS_NUM") + _check_var_exists("PADDLE_TRAINER_ENDPOINTS") + + +def init_parallel_env(rank=-1, backend='nccl', **options): + """ + Initialize parallel environments. + + Args: + rank(int, optional): Rank of current process. Default vaule is -1. + backend(str, optional): The backend to communication between multiple devices. + Now only support `nccl`. Default value is `nccl`. + **options(dict, optional): Other initial parallel execution environment configuration. + + Returns: + ParallelStrategy + + Examples: + + """ + + # 1. input check + if not isinstance(backend, six.string_types): + raise TypeError("input `backend` type error, expected type is str, " + "but received type is %s." % type(backend)) + if cpt.to_text(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # update or check env + # NOTE(chenweihang): if rank is default value, users should config + # parallel environment by module `paddle.distributed.launch`, + # so here we only check the environment variables + if rank != -1: + _update_env_vars(rank, options) + else: + _check_env_vars() + + # 3. 
init ParallelStrategy + strategy = ParallelStrategy() + if cpt.to_text(backend) == 'nccl': + strategy.nranks = ParallelEnv().nranks + strategy.local_rank = ParallelEnv().local_rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + # NOTE(chenweihang): [ why config global place here? ] + # the dygraph mode will be set to default mode, + # users will not call `dygraph.guard` or `enable_dygraph` + # directly, if they want to switch detault place, + # they need to call a function to change default place, + # here just set correctly place to users + place = core.CUDAPlace(ParallelEnv().dev_id) + _switch_current_place(place) + + # init nccl context + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + parallel_helper._init_parallel_ctx() + + return strategy diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index ef2e81e50769a4..1c6b2682970f17 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -20,7 +20,6 @@ import sys import warnings -import paddle.fluid as fluid from paddle.distributed.utils import find_free_ports @@ -33,7 +32,7 @@ def _py_supported_check(): "`paddle.distributed.launch` instead.") -def _set_default_master_env(): +def _set_default_assist_env(nprocs): # set default master trainer ip addr os.environ['PADDLE_MASTER_IPADDR'] = '127.0.0.1' # set default master trainer port @@ -41,6 +40,14 @@ def _set_default_master_env(): if port_set is None: raise RuntimeError("no free port can be used to parallel training now.") os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) + # set default selected_gpus + # e.g. if the nprocs is 4, the selected_gpus="0,1,2,3" + # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? 
] + # because the FLAGS_selected_gpus may be used in other place, + # if we set FLAGS_selected_gpus are `0,1,2,3`, it may cause error + # when using `ParallelEnv` + os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( + [str(x) for x in range(0, nprocs)]) def _func_wrapper(func, i, args, error_queue): @@ -132,7 +139,7 @@ def start_processes(func, # inner subprocess, if each process find free port for itself, # the started port may be different, it will cause endpoints is # different in different subprocesses - _set_default_master_env() + _set_default_assist_env(nprocs) # start processes mp = multiprocessing.get_context(start_method) @@ -161,4 +168,4 @@ def start_processes(func, # by `spwan` method, if users want to start processes by other # method, they can use start_processes def spawn(func, args=(), nprocs=1, join=True, daemon=False): - return launch_processes(func, args, nprocs, join, daemon, 'spawn') + return start_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index d6849bfab0e006..ee2aba6d47308e 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -24,16 +24,14 @@ from paddle.fluid.dygraph import layers from paddle.fluid.dygraph import parallel_helper from paddle.fluid.dygraph import to_variable, no_grad -from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.utils import deprecated -__all__ = [ - "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" -] +__all__ = ["prepare_context", "ParallelEnv", "DataParallel"] ParallelStrategy = core.ParallelStrategy -@deprecated(since="2.0.0", update_to="paddle.init_parallel_env") +@deprecated(since="2.0.0", update_to="paddle.distributed.init_parallel_env") def prepare_context(strategy=None): ''' :api_attr: imperative @@ -62,162 +60,6 @@ def prepare_context(strategy=None): return strategy -class ParallelEnvArgs(object): - def __init__(self): - self.cluster_node_ips = None - self.node_ip = None - self.use_paddlecloud = None - self.started_port = None - self.print_config = True - self.selected_gpus = None - - -def init_parallel_env(rank=-1, backend='nccl', **kwargs): - """ - Initialize parallel environments. - - Args: - rank(int, optional): Rank of current process. Default vaule is -1. - backend(str, optional): The backend to communication between multiple devices. - Now only support `nccl`. Default value is `nccl`. - **options(dict, optional): Other initial parallel execution environment configuration. - - Returns: - ParallelStrategy - - Examples: - - """ - - # NOTE(chenweihang): if trainer_id or trainer_num is default value, - # users should config parallel environment by module `paddle.distributed.launch`, - # so here we skip the environment variables config phase - if trainer_id != -1 or trainer_num != -1: - # 1. input check - if not isinstance(trainer_id, six.integer_types): - raise TypeError( - "input `trainer_id` type error, expected type is integer, " - "but received type is %s." % type(trainer_id)) - if not isinstance(trainer_num, six.integer_types): - raise TypeError( - "input `trainer_num` type error, expected type is integer, " - "but received type is %s." % type(trainer_id)) - if not isinstance(backend, six.string_types): - raise TypeError("input `backend` type error, expected type is str, " - "but received type is %s." 
% type(trainer_id)) - - if trainer_id < 0: - raise ValueError("input `trainer_id` should be greater than 0, " - "but received %d." % trainer_id) - if trainer_num < 0: - raise ValueError("input `trainer_num` should be greater than 0, " - "but received %d." % trainer_num) - if trainer_id >= trainer_num: - raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, " - "but `trainer_id` is %d, `trainer_num` is %d." % - (trainer_id, trainer_num)) - if cpt.to_text(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." - % backend) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', None) - args.node_ip = kwargs.get('node_ip', None) - if args.cluster_node_ips is not None and args.node_ip is None: - raise ValueError("please input current node ip, " - "cannot `cluster_node_ips`.") - default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) - default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip - if args.node_ip is None: - args.node_ip = default_node_ip - if args.cluster_node_ips is None: - args.cluster_node_ips = default_node_ip - - # NOTE(chenweihang): Here should set started_port before - # `get_cluster_and_pod` and keep each process's started_port - # is same, see [ why need set default master info before run? ] - args.started_port = kwargs.get('started_port', None) - if args.started_port is None: - default_port = os.environ.get("PADDLE_MASTER_PORT", None) - if default_port is None: - raise RuntimeError( - "please input start port of parallel training by `started_port=**`." - ) - args.started_port = int(default_port) - - args.use_paddlecloud = kwargs.get('use_paddlecloud', False) - args.print_config = kwargs.get('print_config', False) - args.selected_gpus = ",".join( - [str(g) for g in [x for x in range(0, trainer_num)]]) - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - if trainer_num != cluster.trainers_nranks(): - raise RuntimeError( - "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." - % (trainer_num, cluster.trainers_nranks())) - trainer = pod.get_trainer(trainer_id) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer id is %d" % - trainer_id) - # why trainer.gpus? here only one device? - proc_env = { - "FLAGS_selected_gpus": - "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - - # print config - if args.print_config and trainer_id == 0: - _print_arguments(args) - - # 3. 
init ParallelStrategy - strategy = ParallelStrategy() - if cpt.to_text(backend) == 'nccl': - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.nranks < 2: - return - # NOTE: [ why config global place here? ] - # the dygraph mode will be set to default mode, - # users will not call `dygraph.guard` or `enable_dygraph` - # directly, if they want to switch detault place, - # they need to call a function to change default place, - # here just set correctly place to users - place = core.CUDAPlace(ParallelEnv().dev_id) - framework._switch_current_place(place) - - # init nccl context - parallel_helper._set_parallel_ctx( - core.NCCLParallelContext(strategy, place)) - parallel_helper._init_parallel_ctx() - - return strategy - - class ParallelEnv(object): """ **Notes**: @@ -283,13 +125,10 @@ def __init__(self): self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self.__aliases__ = { - 'local_rank': 'trainer_id', - 'nranks': 'trainer_num', - } + self.__aliases__ = {'local_rank': 'rank', } @property - def trainer_id(self): + def rank(self): """ The current trainer number. @@ -302,13 +141,13 @@ def trainer_id(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The trainer id is %d" % env.trainer_id) - # The trainer id is 0 + print("The rank is %d" % env.rank) + # The rank is 0 """ return self._local_rank @property - def trainer_num(self): + def nranks(self): """ The number of trainers, generally refers to the number of GPU cards used in training. @@ -321,8 +160,8 @@ def trainer_num(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The trainer num is %d" % env.trainer_num) - # The trainer num is 4 + print("The nranks is %d" % env.nranks) + # The nranks is 4 """ return self._nranks diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index aead17e2da152e..4b348ea729ef33 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,9 +50,6 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS -from . import prepare_context -from . import ParallelEnv -from . 
import DataParallel from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS From bf985ccba4236b1d41b74c9051486298f86cf9df Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 20 Aug 2020 13:20:45 +0000 Subject: [PATCH 13/32] use new method _set_expected_place --- python/paddle/distributed/parallel.py | 6 +++--- python/paddle/fluid/dygraph/parallel.py | 1 - python/paddle/fluid/framework.py | 8 -------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index ee1eac01a22014..53ff671abe5dfd 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -16,11 +16,11 @@ import six from paddle import compat as cpt -from paddle.distributed.launch import _parse_args, get_cluster_and_pod, _print_arguments +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments # deprecated module import from paddle.fluid import core -from paddle.fluid.framework import _switch_current_place +from paddle.fluid.framework import _set_expected_place from paddle.fluid.dygraph import parallel_helper from paddle.fluid.dygraph.parallel import ParallelEnv @@ -212,7 +212,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): # they need to call a function to change default place, # here just set correctly place to users place = core.CUDAPlace(ParallelEnv().dev_id) - _switch_current_place(place) + _set_expected_place(place) # init nccl context parallel_helper._set_parallel_ctx( diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ee2aba6d47308e..350954f1adf7e5 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -18,7 +18,6 @@ import warnings from collections import OrderedDict -from paddle import compat as cpt from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph import layers diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4e861fb0e77789..e844c74c106e37 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5446,14 +5446,6 @@ def _dygraph_place_guard(place): _global_expected_place_ = tmp_place -def _switch_current_place(place): - global _dygraph_tracer_ - global _dygraph_current_expected_place_ - if _dygraph_tracer_ is not None: - _dygraph_tracer_._expected_place = place - _dygraph_current_expected_place_ = place - - def load_op_library(lib_filename): """ :api_attr: Static Graph From 7939384044e28bb25f4b8cd3409c48c8f32b8302 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Aug 2020 11:57:01 +0000 Subject: [PATCH 14/32] add spawn unittest framework & mnist test --- python/paddle/distributed/__init__.py | 6 +- python/paddle/distributed/parallel.py | 26 ++- .../{spawn.py => start_processes.py} | 24 ++- .../tests/unittests/spawn_runner_base.py | 92 +++++++++++ .../fluid/tests/unittests/test_dist_base.py | 151 ++++++++++++------ .../test_imperative_data_parallel.py | 2 +- .../unittests/test_parallel_dygraph_mnist.py | 15 +- 7 files changed, 250 insertions(+), 66 deletions(-) rename python/paddle/distributed/{spawn.py => start_processes.py} (86%) create mode 100644 python/paddle/fluid/tests/unittests/spawn_runner_base.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index c40902010bb2a2..999cbf279d9f9d 100644 --- 
a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -20,9 +20,9 @@ "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" ] -from . import spawn -from .spawn import spawn -from .spawn import start_processes +from . import start_processes +from .start_processes import spawn +from .start_processes import start_processes from . import parallel from .parallel import init_parallel_env diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 53ff671abe5dfd..712a4782f6b214 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except jin compliance with the License. @@ -83,7 +83,7 @@ def _update_env_vars(rank, options): raise ValueError("please input current node ip, " "cannot only give `cluster_node_ips`.") default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) - default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip + default_node_ip = "127.0.0.1" if default_node_ip is None else default_node_ip if args.node_ip is None: args.node_ip = default_node_ip if args.cluster_node_ips is None: @@ -97,8 +97,14 @@ def _update_env_vars(rank, options): default_port = os.environ.get("PADDLE_MASTER_PORT", None) if default_port is None: raise RuntimeError( - "please input start port of parallel training by `started_port=**`," - "e.g. started_port=6170") + "Data parallel training start failed. If you start data parallel " + "training by `paddle.distributed.launch` module, Please ensure " + "that one of the following rules is met:\n" + " 1. Do not set `paddle.distributed.init_parallel_env` argument " + "`rank` or set it to be -1;\n" + " 2. Set `paddle.distributed.init_parallel_env` start port for " + "parallel training by `started_port=**`, e.g. started_port=6170." + ) args.started_port = int(default_port) args.use_paddlecloud = options.get('use_paddlecloud', False) @@ -116,8 +122,14 @@ def _update_env_vars(rank, options): args.selected_gpus = os.environ.get("PADDLE_CUDA_VISIBLE_DEVICES", None) if args.selected_gpus is None: raise ValueError( - "please input selected gpus of parallel training by `selected_gpus=**`," - "e.g. selected_gpus='0,1,2,3'.", ) + "Data parallel training start failed. If you start data parallel " + "training by `paddle.distributed.launch` module, Please ensure " + "that one of the following rules is met:\n" + " 1. Do not set `paddle.distributed.init_parallel_env` argument " + "`rank` or set it to be -1;\n" + " 2. Set `paddle.distributed.init_parallel_env` selected gpus of " + "parallel training by `selected_gpus=**`, e.g. selected_gpus='0,1,2,3'." + ) # reuse code of launch.py cluster, pod = get_cluster_and_pod(args) @@ -187,7 +199,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): "backend `%s` is not supported, now only supports `nccl` backend." % backend) - # update or check env + # 2. 
update or check env # NOTE(chenweihang): if rank is default value, users should config # parallel environment by module `paddle.distributed.launch`, # so here we only check the environment variables diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/start_processes.py similarity index 86% rename from python/paddle/distributed/spawn.py rename to python/paddle/distributed/start_processes.py index 1c6b2682970f17..e9b0ff1648e9a0 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/start_processes.py @@ -50,9 +50,10 @@ def _set_default_assist_env(nprocs): [str(x) for x in range(0, nprocs)]) -def _func_wrapper(func, i, args, error_queue): +def _func_wrapper(func, i, args, error_queue, return_queue): try: - func(i, *args) + result = func(i, *args) + return_queue.put(result) except KeyboardInterrupt: pass except Exception: @@ -62,9 +63,15 @@ def _func_wrapper(func, i, args, error_queue): class MultiprocessContext(object): - def __init__(self, processes, error_queues): + def __init__(self, processes, error_queues, return_queues): _py_supported_check() self.error_queues = error_queues + # NOTE(chenweihang): The `start_processes` method is mainly used + # to wrap the outermost execution function of the program for + # parallel execution. Generally, the return value is not concerned, + # but if the user needs to obtain the return value, users can get + # the return result of each process from context.return_queues + self.return_queues = return_queues self.processes = processes self.sentinels = { process.sentinel: index @@ -145,17 +152,21 @@ def start_processes(func, mp = multiprocessing.get_context(start_method) error_queues = [] + return_queues = [] processes = [] for i in range(nprocs): error_queue = mp.SimpleQueue() + return_queue = mp.SimpleQueue() process = mp.Process( - target=_func_wrapper, args=(func, i, args, error_queue)) + target=_func_wrapper, + args=(func, i, args, error_queue, return_queue)) process.daemon = daemon process.start() error_queues.append(error_queue) + return_queues.append(return_queue) processes.append(process) - context = MultiprocessContext(processes, error_queues) + context = MultiprocessContext(processes, error_queues, return_queues) if not join: return context @@ -163,6 +174,9 @@ def start_processes(func, while not context.join(): pass + # finaly return context + return context + # NOTE(chenweihang): this method only supports start processes # by `spwan` method, if users want to start processes by other diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py new file mode 100644 index 00000000000000..4e188c3fbed187 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
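+#
+# NOTE: the helper below relies on `paddle.distributed.spawn` forwarding each
+# worker's return value through `context.return_queues`. A minimal sketch of
+# that pattern (the `worker` function and nprocs value are illustrative only):
+#
+#     import paddle.distributed as dist
+#
+#     def worker(rank):
+#         return rank * rank  # any picklable result
+#
+#     if __name__ == '__main__':
+#         context = dist.spawn(worker, args=(), nprocs=2, join=True)
+#         print([q.get() for q in context.return_queues])  # [0, 1]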
+
+from __future__ import print_function, division
+
+import numpy as np
+import unittest
+
+import paddle
+
+# used by model.run_trainer in test_dist_base
+from test_dist_base import RUN_STEP
+
+
+# NOTE: compatible with TestParallelDyGraphRunnerBase args
+class SpawnAssistTestArgs(object):
+    update_method = "local"
+    trainer_id = 0
+    current_endpoint = None
+    endpoints = None
+    with_spawn = True
+
+
+def run_dygraph_model(rank, model, args):
+    args.with_spawn = True
+    args.trainer_id = rank
+    return model.run_trainer(args)
+
+
+class TestDistSpawnRunner(unittest.TestCase):
+    def setUp(self):
+        # NOTE(chenweihang): keep consistent with
+        # TestDistBase.check_with_place
+        self.nprocs = 2
+
+    def _run(self, model, args):
+        args.update_method = "local"
+        return run_dygraph_model(-1, model, args)
+
+    def _run_parallel(self, model, args):
+        args.update_method = "nccl2"
+        context = paddle.distributed.spawn(
+            func=run_dygraph_model,
+            args=(
+                model,
+                args, ),
+            nprocs=self.nprocs,
+            join=True)
+        result_list = []
+        for res_queue in context.return_queues:
+            result_list.append(res_queue.get())
+        return result_list
+
+    def check_dist_result_with_spawn(self, test_class, delta=1e-3):
+        # 0. prepare model and args
+        model = test_class()
+        args = SpawnAssistTestArgs()
+
+        # 1. calc single card loss
+        losses = self._run(model, args)
+
+        # 2. calc multi card loss (nccl mode)
+        dist_losses_list = self._run_parallel(model, args)
+
+        # 3. compare losses
+        for step_id in range(RUN_STEP):
+            loss = losses[step_id]
+            dist_loss_sum = None
+            for dist_losses in dist_losses_list:
+                if dist_loss_sum is None:
+                    dist_loss_sum = np.array(dist_losses[step_id])
+                else:
+                    dist_loss_sum += np.array(dist_losses[step_id])
+            dist_loss = dist_loss_sum / self.nprocs
+            self.assertAlmostEqual(
+                loss,
+                dist_loss,
+                delta=delta,
+                msg="The results of single-card execution and multi-card execution are inconsistent. "
+                "single-card loss is:\n{}\nmulti-card average loss is:\n{}\n".
+                format(loss, dist_loss))
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index ba292f2d87c376..deb9014e0863b7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
+from __future__ import print_function, division
 
 import time
 import unittest
@@ -25,6 +25,8 @@
 import pickle
 import numpy as np
 import time
+
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 import paddle.fluid.dygraph as dygraph
@@ -382,12 +384,47 @@ def run_one_loop(self, model, opt, data):
         raise NotImplementedError(
             "train_one_loop should be implemented by the child classes.")
 
-    def run_trainer(self, args):
-
-        seed = 90
-        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        place = fluid.CUDAPlace(device_id)
+    def _parse_launch_args(self, args):
+        cluster_node_ips = None
+        node_ip = None
+        started_port = None
+        # [ Adapt `runtime_main` arguments ]
+        # Why can't we keep the arguments here consistent with launch.py?
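+        # For example (values are illustrative): endpoints
+        # "127.0.0.1:6170,127.0.0.1:6171" with current endpoint
+        # "127.0.0.1:6170" parse to cluster_node_ips="127.0.0.1",
+        # node_ip="127.0.0.1", started_port=6170, selected_gpus="0,1".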
+ ips_dict = dict() + trainer_endpoints = args.endpoints.split( + ",") if args.endpoints else None + if trainer_endpoints is not None: + for endpoint in trainer_endpoints: + ip_port = endpoint.split(":") + ip_str = ip_port[0] + port = int(ip_port[1]) + cur_port = ips_dict.get(ip_str, 0) + if cur_port != 0: + if port < cur_port: + ips_dict[ip_str] = port + else: + ips_dict[ip_str] = port + cur_ip_port = args.current_endpoint.split( + ":") if args.current_endpoint else None + if cur_ip_port is None: + raise RuntimeError("the current endpoint is not set.") + endpoint_num = len(trainer_endpoints) + node_num = len(ips_dict.keys()) + # TODO(chenweihang): Don't consider this situation for now + if endpoint_num % node_num != 0: + raise RuntimeError( + "not check when the number of cards used by each machine is different." + ) + node_gpu_num = endpoint_num // node_num + + cluster_node_ips = ",".join(ips_dict.keys()) + node_ip = cur_ip_port[0] + started_port = ips_dict[node_ip] + selected_gpus = ",".join([str(x) for x in range(0, node_gpu_num)]) + + return cluster_node_ips, node_ip, started_port, selected_gpus + def run_trainer(self, args): def _get_data(batch): if args.update_method != "local": new_batch = [] @@ -398,51 +435,68 @@ def _get_data(batch): else: return batch - with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - np.random.seed(seed) - import random - random.seed = seed - model, train_reader, opt = self.get_model() - nranks = len(args.endpoints.split(",")) if args.endpoints else 1 + # 1. enable dygraph + fluid.enable_dygraph() - if args.update_method == "nccl2": - strategy = dygraph.parallel.ParallelStrategy() - strategy.nranks = nranks - strategy.local_rank = args.trainer_id - strategy.trainer_endpoints = args.endpoints.split(",") - strategy.current_endpoint = args.current_endpoint + # 2. init seed + seed = 90 + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + import random + random.seed = seed + + # 3. init parallel env + if args.update_method == "nccl2": + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if args.with_spawn is True: + strategy = paddle.distributed.init_parallel_env( + rank=args.trainer_id) + else: + cluster_node_ips, node_ip, started_port, selected_gpus = self._parse_launch_args( + args) + strategy = paddle.distributed.init_parallel_env( + rank=args.trainer_id, + backend='nccl', + cluster_node_ips=cluster_node_ips, + node_ip=node_ip, + started_port=started_port, + selected_gpus=selected_gpus, + print_config=False) + + # 4. 
train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + model = dygraph.parallel.DataParallel(model, strategy) + print_to_err(type(self).__name__, "model built in dygraph") + + out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") + for step_id, data in enumerate(train_reader()): + data = _get_data(data) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + if step_id % 10 == 0: print_to_err( type(self).__name__, - "begin to prepare context in dygraph with nccl2") - dygraph.parallel.prepare_context(strategy) - model = dygraph.parallel.DataParallel(model, strategy) - print_to_err(type(self).__name__, "model built in dygraph") - out_losses = [] - print_to_err(type(self).__name__, "begin to run dygraph training") - for step_id, data in enumerate(train_reader()): - data = _get_data(data) - if step_id == RUN_STEP: - break - loss = self.run_one_loop(model, opt, data) - if step_id % 10 == 0: - print_to_err( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss.numpy())) - out_losses.append(loss.numpy()) - - # FIXME(Yancey1989): scale the loss inplace - if args.update_method == "nccl2": - loss = model.scale_loss(loss) - - loss.backward() - if args.update_method == "nccl2": - model.apply_collective_grads() - - opt.minimize(loss) - model.clear_gradients() + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + # FIXME(Yancey1989): scale the loss inplace + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.minimize(loss) + model.clear_gradients() print_to_out(out_losses) + return out_losses def runtime_main(test_class): @@ -486,6 +540,8 @@ def runtime_main(test_class): type=bool, default=False) parser.add_argument('--sync_batch_norm', action='store_true') + parser.add_argument( + '--with_spawn', type=bool, required=False, default=False) args = parser.parse_args() @@ -806,7 +862,6 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ - "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py index d3f488d92ac455..428f97c0af8182 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py @@ -43,7 +43,7 @@ def forward(self, inputs): class TestDataParallelStateDict(unittest.TestCase): def test_data_parallel_state_dict(self): with fluid.dygraph.guard(): - strategy = paddle.prepare_context() + strategy = paddle.distributed.prepare_context() mlp = MLP() parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index 5677157fde8d71..bac196b1ab52b6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -13,11 +13,16 @@ # limitations under the License. 
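+# NOTE: the spawn-based test added below reuses TestDistSpawnRunner: it runs
+# the model on a single card and again under paddle.distributed.spawn with
+# nprocs=2, then requires the single-card loss to match the mean of the
+# per-rank losses at every step (e.g. per-rank losses 2.30 and 2.34 give a
+# mean of 2.32, which must agree within `delta`); the numbers here are
+# illustrative only.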
from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_mnist import TestMnist -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,11 @@ def test_mnist(self): log_name=flag_name) +class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): + def test_mnist_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) + + if __name__ == "__main__": unittest.main() From 04580d8657f6ed926a18fcc80b8174eb8eeb85e1 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Aug 2020 14:16:26 +0000 Subject: [PATCH 15/32] add more unittests & doc --- python/paddle/distributed/parallel.py | 58 +++++++- python/paddle/distributed/start_processes.py | 139 ++++++++++++++++++ .../test_parallel_dygraph_se_resnext.py | 16 +- .../test_parallel_dygraph_sparse_embedding.py | 12 +- .../test_parallel_dygraph_transformer.py | 12 +- 5 files changed, 230 insertions(+), 7 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 712a4782f6b214..3d795084ae6837 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -180,14 +180,66 @@ def init_parallel_env(rank=-1, backend='nccl', **options): Args: rank(int, optional): Rank of current process. Default vaule is -1. backend(str, optional): The backend to communication between multiple devices. - Now only support `nccl`. Default value is `nccl`. - **options(dict, optional): Other initial parallel execution environment configuration. + Now only support ``nccl`` . Default value is ``nccl`` . + **options(dict, optional): Other initial parallel execution environment configuration options. + The following options are currently supported: + - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1" + - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1" + - started_port: The trainer's started port on a single node, such as 6170. Default: None + - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None + - print_config: Print current parallel training config. Default: True. + - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. Returns: ParallelStrategy Examples: - + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + strategy = dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = dist.DataParallel(layer, strategy) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. 
run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + dist.spawn(train, args=(), nprocs=2) """ # 1. input check diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/start_processes.py index e9b0ff1648e9a0..dbadbeece1d984 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/start_processes.py @@ -133,6 +133,77 @@ def start_processes(func, join=True, daemon=False, start_method='spawn'): + """ + Start multiple rocesses for parallel training. + + Args: + func (function): The targert function is called by started process. + This function need to be able to pickled, so it must be defined + at the top level of a module. + This function should be called as ``func(i, *args)`` , ``i`` is + the process index and ``args`` contains other arguments as tuple. + args (tuple): Arguments passed to ``func`` . + nprocs (int): Number of processed to start. + join (bool): Perform a blocking join on all started processes. + Default: True. + daemon (bool): The started processes' daemon flag. Default: False. + start_method (string): the way to start a process. The start method + can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA + runtime does not support the ``fork`` start method, when use + CUDA in subprocesses, we should start process by ``spawn`` or + ``forkserver`` method. + + Returns: + ``MultiprocessContext`` object, it hold the started processes. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + strategy = dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = dist.DataParallel(layer, strategy) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + dist.start_processes(train, args=(), nprocs=2) + """ # NOTE(chenweihang): [ why only supports python3.4+? ] # Python has only supported setting the child process startup method # since 3.4. The previous version can only use the default startup @@ -182,4 +253,72 @@ def start_processes(func, # by `spwan` method, if users want to start processes by other # method, they can use start_processes def spawn(func, args=(), nprocs=1, join=True, daemon=False): + """ + Start multiple rocesses with ``spawn`` method for parallel training. + + This is specialized method of method ``paddle.distributed.start_processes`` . + + Args: + func (function): The targert function is called by spawned process. 
+ This function need to be able to pickled, so it must be defined + at the top level of a module. + This function should be called as ``func(i, *args)``, ``i`` is + the process index and ``args`` contains other arguments as tuple. + args (tuple): Arguments passed to ``func``. + nprocs (int): Number of processed to spawn. + join (bool): Perform a blocking join on all spawned processes. + Default: True. + daemon (bool): The spawned processes' daemon flag. Default: False. + + Returns: + ``MultiprocessContext`` object, it hold the spawned processes. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + strategy = dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = dist.DataParallel(layer, strategy) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + dist.spawn(train, args=(), nprocs=2) + """ return start_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py index 8c5cdf8321a4bd..cf89dc484c4880 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py @@ -13,11 +13,16 @@ # limitations under the License. 
from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_se_resnext import TestSeResNeXt -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,12 @@ def test_se_resnext(self): log_name=flag_name) +class TestParallelDygraphSeResNeXtSpawn(TestDistSpawnRunner): + def test_se_resnext_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSeResNeXt, delta=0.01) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py index 40b5833053d29b..7f051f1005c7b7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ def test_sparse_embedding(self): log_name=flag_name) +class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner): + def test_sparse_embedding_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSparseEmbedding, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index 385c4d892a650b..c8d47eab2c5191 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_transformer import TestTransformer flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ def test_transformer(self): log_name=flag_name) +class TestParallelDygraphTransformerSpawn(TestDistSpawnRunner): + def test_transformer_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestTransformer, delta=1e-5) + + if __name__ == "__main__": unittest.main() From 131afd4877199c516224de0a5a7344d390312633 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 03:52:47 +0000 Subject: [PATCH 16/32] fix unittest failed --- .../tests/unittests/spawn_runner_base.py | 2 +- .../unittests/test_directory_migration.py | 5 +- .../fluid/tests/unittests/test_dist_base.py | 152 ++++++++---------- 3 files changed, 74 insertions(+), 85 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index 4e188c3fbed187..a06c97498f7a73 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py 
+++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -35,7 +35,7 @@ class SpawnAssistTestArgs(object): def run_dygraph_model(rank, model, args): args.with_spawn = True args.trainer_id = rank - return model.run_trainer(args) + return model.run_trainer_with_spawn(args) class TestDistSpawnRunner(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index bc858828058079..fee2756f516451 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -39,8 +39,9 @@ def test_new_directory(self): 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', - 'paddle.BackwardStrategy', 'paddle.ParallelEnv', - 'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit', + 'paddle.BackwardStrategy', 'paddle.distributed.ParallelEnv', + 'paddle.distributed.prepare_context', + 'paddle.distributed.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index deb9014e0863b7..e3c09ee3b7592e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function, division +from __future__ import print_function import time import unittest @@ -23,6 +23,7 @@ import six import argparse import pickle +import random import numpy as np import time @@ -384,108 +385,97 @@ def run_one_loop(self, model, opt, data): raise NotImplementedError( "train_one_loop should be implemented by the child classes.") - def _parse_launch_args(self, args): - cluster_node_ips = None - node_ip = None - started_port = None - # [ Adapt `runtime_main` arguments ] - # Why can't we keep the arguments here consistent with launch.py? - ips_dict = dict() - trainer_endpoints = args.endpoints.split( - ",") if args.endpoints else None - if trainer_endpoints is not None: - for endpoint in trainer_endpoints: - ip_port = endpoint.split(":") - ip_str = ip_port[0] - port = int(ip_port[1]) - cur_port = ips_dict.get(ip_str, 0) - if cur_port != 0: - if port < cur_port: - ips_dict[ip_str] = port - else: - ips_dict[ip_str] = port - cur_ip_port = args.current_endpoint.split( - ":") if args.current_endpoint else None - if cur_ip_port is None: - raise RuntimeError("the current endpoint is not set.") - endpoint_num = len(trainer_endpoints) - node_num = len(ips_dict.keys()) - # TODO(chenweihang): Don't consider this situation for now - if endpoint_num % node_num != 0: - raise RuntimeError( - "not check when the number of cards used by each machine is different." 
- ) - node_gpu_num = endpoint_num // node_num - - cluster_node_ips = ",".join(ips_dict.keys()) - node_ip = cur_ip_port[0] - started_port = ips_dict[node_ip] - selected_gpus = ",".join([str(x) for x in range(0, node_gpu_num)]) - - return cluster_node_ips, node_ip, started_port, selected_gpus + def _get_data(self, batch, args): + if args.update_method != "local": + new_batch = [] + for offset, item in enumerate(batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return batch def run_trainer(self, args): - def _get_data(batch): - if args.update_method != "local": - new_batch = [] - for offset, item in enumerate(batch): - if offset % 2 == args.trainer_id: - new_batch.append(item) - return new_batch - else: - return batch + seed = 90 + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + import random + random.seed = seed + model, train_reader, opt = self.get_model() + nranks = len(args.endpoints.split(",")) if args.endpoints else 1 + + if args.update_method == "nccl2": + strategy = dygraph.parallel.ParallelStrategy() + strategy.nranks = nranks + strategy.local_rank = args.trainer_id + strategy.trainer_endpoints = args.endpoints.split(",") + strategy.current_endpoint = args.current_endpoint + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + dygraph.parallel.prepare_context(strategy) + model = dygraph.parallel.DataParallel(model, strategy) + print_to_err(type(self).__name__, "model built in dygraph") + out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + if step_id % 10 == 0: + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + # FIXME(Yancey1989): scale the loss inplace + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.minimize(loss) + model.clear_gradients() + print_to_out(out_losses) + + def run_trainer_with_spawn(self, args): # 1. enable dygraph - fluid.enable_dygraph() + paddle.disable_static() # 2. init seed seed = 90 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed np.random.seed(seed) - import random random.seed = seed # 3. init parallel env if args.update_method == "nccl2": - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - if args.with_spawn is True: - strategy = paddle.distributed.init_parallel_env( - rank=args.trainer_id) - else: - cluster_node_ips, node_ip, started_port, selected_gpus = self._parse_launch_args( - args) - strategy = paddle.distributed.init_parallel_env( - rank=args.trainer_id, - backend='nccl', - cluster_node_ips=cluster_node_ips, - node_ip=node_ip, - started_port=started_port, - selected_gpus=selected_gpus, - print_config=False) + strategy = paddle.distributed.init_parallel_env( + rank=args.trainer_id) # 4. 
train model model, train_reader, opt = self.get_model() if args.update_method == "nccl2": - model = dygraph.parallel.DataParallel(model, strategy) - print_to_err(type(self).__name__, "model built in dygraph") + model = paddle.distributed.DataParallel(model, strategy) out_losses = [] - print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): - data = _get_data(data) + data = self._get_data(data, args) if step_id == RUN_STEP: break loss = self.run_one_loop(model, opt, data) - if step_id % 10 == 0: - print_to_err( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss.numpy())) out_losses.append(loss.numpy()) - # FIXME(Yancey1989): scale the loss inplace if args.update_method == "nccl2": loss = model.scale_loss(loss) @@ -495,7 +485,6 @@ def _get_data(batch): opt.minimize(loss) model.clear_gradients() - print_to_out(out_losses) return out_losses @@ -540,8 +529,6 @@ def runtime_main(test_class): type=bool, default=False) parser.add_argument('--sync_batch_norm', action='store_true') - parser.add_argument( - '--with_spawn', type=bool, required=False, default=False) args = parser.parse_args() @@ -862,6 +849,7 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ + "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, From e170f105b2fb7c9151af94f37020b461f8848ba0 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 04:11:17 +0000 Subject: [PATCH 17/32] polish english doc --- python/paddle/distributed/parallel.py | 21 ++++++++++++++------ python/paddle/distributed/start_processes.py | 8 ++++---- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 3d795084ae6837..b703e6904c6a15 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -175,19 +175,28 @@ def _check_var_exists(var_name): def init_parallel_env(rank=-1, backend='nccl', **options): """ - Initialize parallel environments. + Initialize parallel training environments in dynamic mode. Args: - rank(int, optional): Rank of current process. Default vaule is -1. + rank(int, optional): Rank of current process. Default vaule is -1. + When it is the default value -1, you should use ``paddle.disstributed.launch`` + module to start training, the environment variables for parallel training + are configured by ``paddle.disstributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. The following options are currently supported: - - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1" - - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1" - - started_port: The trainer's started port on a single node, such as 6170. Default: None - - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None + + - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". + + - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1". 
+ + - started_port: The trainer's started port on a single node, such as 6170. Default: None. + + - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None. + - print_config: Print current parallel training config. Default: True. + - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. Returns: diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/start_processes.py index dbadbeece1d984..32780f37ab9c90 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/start_processes.py @@ -134,7 +134,7 @@ def start_processes(func, daemon=False, start_method='spawn'): """ - Start multiple rocesses for parallel training. + Start multiple processes for parallel training. Args: func (function): The targert function is called by started process. @@ -151,7 +151,7 @@ def start_processes(func, can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA runtime does not support the ``fork`` start method, when use CUDA in subprocesses, we should start process by ``spawn`` or - ``forkserver`` method. + ``forkserver`` method. Default: 'spawn'. Returns: ``MultiprocessContext`` object, it hold the started processes. @@ -254,9 +254,9 @@ def train(rank): # method, they can use start_processes def spawn(func, args=(), nprocs=1, join=True, daemon=False): """ - Start multiple rocesses with ``spawn`` method for parallel training. + Start multiple processes with ``spawn`` method for parallel training. - This is specialized method of method ``paddle.distributed.start_processes`` . + This is a specialized method of ``paddle.distributed.start_processes`` . Args: func (function): The targert function is called by spawned process. From 0ef215d5135eb8a1e2cfdabb1b0fdc62f7659533 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 04:36:46 +0000 Subject: [PATCH 18/32] self review and polish details --- python/paddle/distributed/parallel.py | 8 ++-- python/paddle/distributed/start_processes.py | 24 +++++----- python/paddle/fluid/dygraph/parallel.py | 45 ++++++++++--------- .../tests/unittests/spawn_runner_base.py | 2 - 4 files changed, 41 insertions(+), 38 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index b703e6904c6a15..fc500244115f36 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -142,7 +142,8 @@ def _update_env_vars(rank, options): trainer = pod.get_trainer(rank) if trainer is None: raise RuntimeError( - "The expected trainer is not exists, its trainer rank is %d" % rank) + "The expected trainer is not exists, its trainer rank is %d." % + rank) proc_env = { "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), "PADDLE_TRAINER_ID": "%d" % trainer.rank, @@ -163,7 +164,7 @@ def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: raise ValueError("paddle.distributed initialize error," - "Environment variable %s is needed, but not set.", + "environment variable %s is needed, but not set.", var_name) _check_var_exists("FLAGS_selected_gpus") @@ -184,7 +185,8 @@ def init_parallel_env(rank=-1, backend='nccl', **options): are configured by ``paddle.disstributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . - **options(dict, optional): Other initial parallel execution environment configuration options. 
+ **options(dict, optional): Other initial parallel execution environment configuration options. + The following options are currently supported: - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/start_processes.py index 32780f37ab9c90..a5ec59ebfdffda 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/start_processes.py @@ -26,9 +26,9 @@ def _py_supported_check(): if not sys.version_info >= (3, 4): raise RuntimeError( - "Use `paddle.distributed.run` to start parallel training " - "requires python version greater than 3.4, if your python " - "is lower than this version, please use " + "Use `paddle.distributed.spawn` or `paddle.distributed.start_processes` " + "to start parallel training requires python version greater than 3.4, " + "if your python is lower than this version, please use " "`paddle.distributed.launch` instead.") @@ -40,11 +40,11 @@ def _set_default_assist_env(nprocs): if port_set is None: raise RuntimeError("no free port can be used to parallel training now.") os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) - # set default selected_gpus - # e.g. if the nprocs is 4, the selected_gpus="0,1,2,3" + # set default selected gpus + # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3" # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ] # because the FLAGS_selected_gpus may be used in other place, - # if we set FLAGS_selected_gpus are `0,1,2,3`, it may cause error + # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( [str(x) for x in range(0, nprocs)]) @@ -204,18 +204,18 @@ def train(rank): if __name__ == '__main__': dist.start_processes(train, args=(), nprocs=2) """ - # NOTE(chenweihang): [ why only supports python3.4+? ] - # Python has only supported setting the child process startup method + # NOTE(chenweihang): [ why only supports python3.4+ ? ] + # Python supported setting the child process startup method # since 3.4. The previous version can only use the default startup # method, while the default startup method of Unix is fork, which # cannot support CUDA runtime multi-process _py_supported_check() # NOTE(chenweihang): [ why need set default master info before run? 
] - # when using `paddle.distributed.run` start parallel training, - # users need use `init_parallel_env` to config some cluster info - # inner subprocess, if each process find free port for itself, - # the started port may be different, it will cause endpoints is + # when using `paddle.distributed.spawn/start_processes` start + # parallel training, users need use `init_parallel_env` to config + # cluster info inner subprocess, if each process find free port for + # itself, the started port may be different, it will cause endpoints is # different in different subprocesses _set_default_assist_env(nprocs) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 350954f1adf7e5..dc04847f57609e 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -124,43 +124,46 @@ def __init__(self): self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self.__aliases__ = {'local_rank': 'rank', } + self.__aliases__ = { + 'local_rank': 'rank', + 'nranks': 'world_size', + } @property def rank(self): """ - The current trainer number. + Rank of current trainer. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0. Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINER_ID=0 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The rank is %d" % env.rank) # The rank is 0 """ return self._local_rank @property - def nranks(self): + def world_size(self): """ - The number of trainers, generally refers to the number of GPU cards used in training. + The number of trainers (number of processes participating in current job). - Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1. + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The nranks is %d" % env.nranks) - # The nranks is 4 + env = dist.ParallelEnv() + print("The world_size is %d" % env.world_size) + # The world_size is 4 """ return self._nranks @@ -169,15 +172,15 @@ def dev_id(self): """ The ID of selected GPU card for parallel training. - Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0. + Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0. Examples: .. code-block:: python # execute this command in terminal: export FLAGS_selected_gpus=1 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The device id are %d" % env.dev_id) # The device id are 1 """ @@ -188,15 +191,15 @@ def current_endpoint(self): """ The endpoint of current trainer, it is in the form of (node IP + port). - Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "". 
Examples: .. code-block:: python # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The current endpoint are %s" % env.current_endpoint) # The current endpoint are 127.0.0.1:6170 """ @@ -208,15 +211,15 @@ def trainer_endpoints(self): The endpoints of all trainer nodes in the task, which are used to broadcast the NCCL ID when NCCL2 is initialized. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The trainer endpoints are %s" % env.trainer_endpoints) # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] """ diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index a06c97498f7a73..9c097c349c3da0 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -29,11 +29,9 @@ class SpawnAssistTestArgs(object): trainer_id = 0 current_endpoint = None endpoints = None - with_spawn = True def run_dygraph_model(rank, model, args): - args.with_spawn = True args.trainer_id = rank return model.run_trainer_with_spawn(args) From b27cfee96b4ad8599ed6cc96f6be98981159061a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 15:38:23 +0000 Subject: [PATCH 19/32] refactor code by reviewer's comments --- python/paddle/__init__.py | 1 + python/paddle/distributed/__init__.py | 11 +- python/paddle/distributed/parallel.py | 23 ++-- .../{start_processes.py => spawn.py} | 8 +- python/paddle/distributed/utils.py | 23 ++-- python/paddle/fluid/dygraph/parallel.py | 115 ++++++++++++------ .../unittests/test_directory_migration.py | 5 +- .../fluid/tests/unittests/test_dist_base.py | 5 +- .../tests/unittests/test_init_parallel_env.py | 78 ++++++++++++ python/paddle/framework/__init__.py | 1 + 10 files changed, 189 insertions(+), 81 deletions(-) rename python/paddle/distributed/{start_processes.py => spawn.py} (98%) create mode 100644 python/paddle/fluid/tests/unittests/test_init_parallel_env.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index dd10317c215f16..4d8a6a7fdd07fb 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -231,6 +231,7 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS +from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS from .framework import PiecewiseDecay #DEFINE_ALIAS diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 999cbf279d9f9d..8210e0a02dc101 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -16,16 +16,13 @@ __all__ = ["spawn", "start_processes"] # dygraph parallel apis -__all__ += [ - "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" -] +__all__ += ["prepare_context", "init_parallel_env", "ParallelEnv"] -from . 
import start_processes -from .start_processes import spawn -from .start_processes import start_processes +from . import spawn +from .spawn import spawn +from .spawn import start_processes from . import parallel from .parallel import init_parallel_env from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS -from paddle.fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index fc500244115f36..fa5901e783bbef 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -17,6 +17,7 @@ from paddle import compat as cpt from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.distributed.utils import _update_trainer_env # deprecated module import from paddle.fluid import core @@ -96,7 +97,7 @@ def _update_env_vars(rank, options): if args.started_port is None: default_port = os.environ.get("PADDLE_MASTER_PORT", None) if default_port is None: - raise RuntimeError( + raise ValueError( "Data parallel training start failed. If you start data parallel " "training by `paddle.distributed.launch` module, Please ensure " "that one of the following rules is met:\n" @@ -144,15 +145,8 @@ def _update_env_vars(rank, options): raise RuntimeError( "The expected trainer is not exists, its trainer rank is %d." % rank) - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } # no copy, each process will hold env vars itself - os.environ.update(proc_env) + _update_trainer_env(os.environ, cluster, trainer) # print config if args.print_config and rank == 0: @@ -164,7 +158,7 @@ def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: raise ValueError("paddle.distributed initialize error," - "environment variable %s is needed, but not set.", + "environment variable %s is needed, but not set." % var_name) _check_var_exists("FLAGS_selected_gpus") @@ -186,7 +180,6 @@ def init_parallel_env(rank=-1, backend='nccl', **options): backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. - The following options are currently supported: - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". @@ -202,7 +195,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. Returns: - ParallelStrategy + None Examples: .. code-block:: python @@ -226,11 +219,11 @@ def train(rank): paddle.disable_static() # 2. initialize parallel environment - strategy = dist.init_parallel_env(rank) + dist.init_parallel_env(rank) # 3. 
create data parallel layer & optimizer layer = LinearNet() - dp_layer = dist.DataParallel(layer, strategy) + dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() sgd = opt.SGD( @@ -293,5 +286,3 @@ def train(rank): parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) parallel_helper._init_parallel_ctx() - - return strategy diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/spawn.py similarity index 98% rename from python/paddle/distributed/start_processes.py rename to python/paddle/distributed/spawn.py index a5ec59ebfdffda..370dbb35bdd9ec 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/spawn.py @@ -178,11 +178,11 @@ def train(rank): paddle.disable_static() # 2. initialize parallel environment - strategy = dist.init_parallel_env(rank) + dist.init_parallel_env(rank) # 3. create data parallel layer & optimizer layer = LinearNet() - dp_layer = dist.DataParallel(layer, strategy) + dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() sgd = opt.SGD( @@ -295,11 +295,11 @@ def train(rank): paddle.disable_static() # 2. initialize parallel environment - strategy = dist.init_parallel_env(rank) + dist.init_parallel_env(rank) # 3. create data parallel layer & optimizer layer = LinearNet() - dp_layer = dist.DataParallel(layer, strategy) + dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() sgd = opt.SGD( diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 87d0f1546f38d0..a98c619ec0d25c 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -333,6 +333,19 @@ def __free_port(): return None +def _update_trainer_env(current_env, cluster, trainer): + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + current_env.update(proc_env) + + return proc_env + + class TrainerProc(object): def __init__(self): self.proc = None @@ -358,15 +371,7 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), - "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - - current_env.update(proc_env) + proc_env = _update_trainer_env(current_env) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index dc04847f57609e..c274f356ea24a0 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -243,61 +243,98 @@ class DataParallel(layers.Layer): Run the dygraph module with data parallelism. Currently, DataParallel class only supports to run the dynamic graph - with multi-process. The usage is: - `python -m paddle.distributed.launch --selected_gpus=0,1 dynamic_graph_test.py`. - And the content of `dynamic_graph_test.py` is the code of examples. + with multi-process. + + Now supports two ways to start training: + + 1. start by ``paddle.distributed.spawn`` method, for example: + + ``python demo.py`` (spawn need to be called in ``__main__`` method) + + 2. 
start by ``paddle.distributed.launch`` module, for example: + + ``python -m paddle.distributed.launch --selected_gpus=0,1 demo.py`` . + + And the content of `demo.py` is the code of examples. Args: layers(Layer): The module that should be executed by data parallel. - strategy(ParallelStrategy): The strategy of data parallelism, contains - environment configuration related to parallel execution. - + strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, + contains environment configuration related to parallel execution. Default: None. + Returns: Layer: The data paralleled module. Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - - place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place): - - # prepare the data parallel context - strategy = fluid.dygraph.prepare_context() - - linear = fluid.dygraph.Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, parameter_list=linear.parameters()) - - # make the module become the data parallelism module - linear = fluid.dygraph.DataParallel(linear, strategy) - - x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = fluid.dygraph.to_variable(x_data) - - hidden = linear(data) - avg_loss = fluid.layers.mean(hidden) - - # scale the loss according to the number of trainers. - avg_loss = linear.scale_loss(avg_loss) - - avg_loss.backward() - - # collect the gradients of trainers. - linear.apply_collective_grads() + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist - adam.minimize(avg_loss) - linear.clear_gradients() + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, args=(), nprocs=2) + # 2. start by ``paddle.distributed.launch`` + # train(-1) """ - def __init__(self, layers, strategy): + def __init__(self, layers, strategy=None): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers - self._strategy = strategy + + # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. + # It just stores some environment variables, which can be constructed by + # ParallelEnv. Here it is set as an optional argument. + # This parameter is not removed because of compatibility with 1.x writing. 
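        # Concretely, ``ParallelEnv`` reads ``nranks`` from PADDLE_TRAINERS_NUM,
        # ``local_rank`` from PADDLE_TRAINER_ID, and the endpoint fields from
        # PADDLE_TRAINER_ENDPOINTS / PADDLE_CURRENT_ENDPOINT, so the fallback
        # below only works after those variables have been set (for example by
        # ``init_parallel_env`` or ``paddle.distributed.launch``).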
+ if strategy is not None: + self._strategy = strategy + else: + self._strategy = ParallelStrategy() + self._strategy.nranks = ParallelEnv().nranks + self._strategy.local_rank = ParallelEnv().local_rank + self._strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + self._strategy.current_endpoint = ParallelEnv().current_endpoint def forward(self, *inputs, **kwargs): return self._layers(*inputs, **kwargs) diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index fee2756f516451..19416e132920ae 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -40,9 +40,8 @@ def test_new_directory(self): 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', 'paddle.BackwardStrategy', 'paddle.distributed.ParallelEnv', - 'paddle.distributed.prepare_context', - 'paddle.distributed.DataParallel', 'paddle.jit', - 'paddle.jit.TracedLayer', 'paddle.jit.to_static', + 'paddle.distributed.prepare_context', 'paddle.DataParallel', + 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index e3c09ee3b7592e..d3ebaf90cb5161 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -460,13 +460,12 @@ def run_trainer_with_spawn(self, args): # 3. init parallel env if args.update_method == "nccl2": - strategy = paddle.distributed.init_parallel_env( - rank=args.trainer_id) + paddle.distributed.init_parallel_env(rank=args.trainer_id) # 4. train model model, train_reader, opt = self.get_model() if args.update_method == "nccl2": - model = paddle.distributed.DataParallel(model, strategy) + model = paddle.DataParallel(model) out_losses = [] for step_id, data in enumerate(train_reader()): diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py new file mode 100644 index 00000000000000..2eb390738a89b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np +import unittest + +import paddle +import paddle.distributed as dist + +# NOTE(chenweihang): Coverage CI is currently not able to count python3 +# unittest, so the unittests here covers some cases that will only be +# executed in the python3 sub-process. 
+# If the coverage CI can check python3 and sub-process, +# we can remove all unittests here + + +class TestInitParallelEnv(unittest.TestCase): + def test_beckend_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(backend=1) + + def test_backend_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(backend="mpi") + + def test_rank_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(rank="1") + + def test_rank_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(rank=-2) + + def test_only_cluster_node_ips_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env( + rank=0, cluster_node_ips="127.0.0.1,127.0.0.2") + + def test_no_started_port_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(rank=0) + + def test_no_selected_gpus_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(rank=0, started_port=6170) + + def test_check_env_failed(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + with self.assertRaises(ValueError): + dist.init_parallel_env() + + def test_update_env(self): + dist.init_parallel_env(rank=0, started_port=6170, selected_gpus="0") + self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) + self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) + self.assertIsNotNone(os.environ.get('PADDLE_TRAINERS_NUM', None)) + self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ENDPOINTS', None)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 4b348ea729ef33..c2b212c10b677e 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,6 +50,7 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS +from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS From f50f3432bbf0664578ccac1d103ecdf4ecbaf737 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 26 Aug 2020 02:12:04 +0000 Subject: [PATCH 20/32] fix unittest failed --- python/paddle/distributed/utils.py | 2 +- python/paddle/fluid/tests/unittests/test_init_parallel_env.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index a98c619ec0d25c..1e5c7810b530b6 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -371,7 +371,7 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = _update_trainer_env(current_env) + proc_env = _update_trainer_env(current_env, cluster, t) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py index 2eb390738a89b0..ace97ee2e9cf73 100644 --- a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py @@ -67,7 +67,8 @@ def test_check_env_failed(self): 
dist.init_parallel_env() def test_update_env(self): - dist.init_parallel_env(rank=0, started_port=6170, selected_gpus="0") + device = os.getenv("CUDA_VISIBLE_DEVICES") + dist.init_parallel_env(rank=0, started_port=6170, selected_gpus=device) self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) self.assertIsNotNone(os.environ.get('PADDLE_TRAINERS_NUM', None)) From 11221a859bb459fcbdea23e2c19fd075bceaf32c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 26 Aug 2020 02:52:11 +0000 Subject: [PATCH 21/32] fix parallel_env unittest --- python/paddle/fluid/tests/unittests/test_init_parallel_env.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py index ace97ee2e9cf73..16a55bdd18247d 100644 --- a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py @@ -67,7 +67,9 @@ def test_check_env_failed(self): dist.init_parallel_env() def test_update_env(self): - device = os.getenv("CUDA_VISIBLE_DEVICES") + device = os.getenv("CUDA_VISIBLE_DEVICES", None) + if device is None: + device = '0' dist.init_parallel_env(rank=0, started_port=6170, selected_gpus=device) self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) From 0980c230d6909c3d0f0b5e7f415e7aac54e44b72 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 26 Aug 2020 08:43:46 +0000 Subject: [PATCH 22/32] fix several typos --- python/paddle/distributed/parallel.py | 20 ++++++++++---------- python/paddle/distributed/spawn.py | 8 ++++---- python/paddle/fluid/dygraph/parallel.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index fa5901e783bbef..5dee640c1cb591 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -42,7 +42,7 @@ def __init__(self): # The current node ip. self.node_ip = None - # wheter to use paddlecloud platform to run your multi-process job. + # whether to use paddlecloud platform to run your multi-process job. # If false, no need to set this argument. self.use_paddlecloud = None @@ -112,7 +112,7 @@ def _update_env_vars(rank, options): args.print_config = options.get('print_config', True) # set default `selected_gpus` - # TODO(chenweihang): if users gived number of `selected_gpus` + # TODO(chenweihang): if users given number of `selected_gpus` # is not equal to the spawn's nprocs, it will cause error, # and because we remove the `proc num` argument of # `init_parallel_env`, when above error occured, we do not @@ -173,10 +173,10 @@ def init_parallel_env(rank=-1, backend='nccl', **options): Initialize parallel training environments in dynamic mode. Args: - rank(int, optional): Rank of current process. Default vaule is -1. - When it is the default value -1, you should use ``paddle.disstributed.launch`` + rank(int, optional): Rank of current process. Default value is -1. + When it is the default value -1, you should use ``paddle.distributed.launch`` module to start training, the environment variables for parallel training - are configured by ``paddle.disstributed.launch`` module. + are configured by ``paddle.distributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. 
Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. @@ -192,7 +192,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): - print_config: Print current parallel training config. Default: True. - - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. + - use_paddlecloud: Whether to use paddlecloud platform to run your multi-process job. Default: False. Returns: None @@ -267,16 +267,16 @@ def train(rank): # 3. init ParallelStrategy strategy = ParallelStrategy() if cpt.to_text(backend) == 'nccl': - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank + strategy.world_size = ParallelEnv().world_size + strategy.rank = ParallelEnv().rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.nranks < 2: + if strategy.world_size < 2: return # NOTE(chenweihang): [ why config global place here? ] # the dygraph mode will be set to default mode, # users will not call `dygraph.guard` or `enable_dygraph` - # directly, if they want to switch detault place, + # directly, if they want to switch default place, # they need to call a function to change default place, # here just set correctly place to users place = core.CUDAPlace(ParallelEnv().dev_id) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 370dbb35bdd9ec..7071f4fcbbc1ec 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -117,7 +117,7 @@ def _throw_exception(self, error_index): original_trace = self.error_queues[error_index].get() msg = "\n\n----------------------------------------------\n" \ - "Procces %d terminated with the following error:\n" \ + "Process %d terminated with the following error:\n" \ "----------------------------------------------\n\n" % error_index msg += original_trace raise Exception(msg) @@ -137,7 +137,7 @@ def start_processes(func, Start multiple processes for parallel training. Args: - func (function): The targert function is called by started process. + func (function): The target function is called by started process. This function need to be able to pickled, so it must be defined at the top level of a module. This function should be called as ``func(i, *args)`` , ``i`` is @@ -245,7 +245,7 @@ def train(rank): while not context.join(): pass - # finaly return context + # finally return context return context @@ -259,7 +259,7 @@ def spawn(func, args=(), nprocs=1, join=True, daemon=False): This is a specialized method of ``paddle.distributed.start_processes`` . Args: - func (function): The targert function is called by spawned process. + func (function): The target function is called by spawned process. This function need to be able to pickled, so it must be defined at the top level of a module. This function should be called as ``func(i, *args)``, ``i`` is diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index c274f356ea24a0..bfe42467651b8e 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -44,7 +44,7 @@ def prepare_context(strategy=None): if strategy.nranks < 2: return assert framework.in_dygraph_mode() is True, \ - "dygraph.prepare_context should be used with dygrahp mode." + "dygraph.prepare_context should be used with dygraph mode." 
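    # Typical call pattern, sketched from the earlier DataParallel example:
    #     place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
    #     with fluid.dygraph.guard(place):
    #         strategy = fluid.dygraph.prepare_context()
    # i.e. the dygraph guard establishes the expected place queried below.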
place = framework._current_expected_place() assert place is not None, \ "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard." @@ -227,7 +227,7 @@ def trainer_endpoints(self): def __getattr__(self, name): if name == "__aliases__": - raise AttributeError("Attribue `__aliases__` can not be accessed.") + raise AttributeError("Attribute `__aliases__` can not be accessed.") name = self.__aliases__.get(name, name) return object.__getattribute__(self, name) From af505180d78ec413566b69b31e2a50449bde7e4c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 01:56:36 +0000 Subject: [PATCH 23/32] fix error introduced when fixing typos --- python/paddle/distributed/parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 5dee640c1cb591..f263d7ce2559c3 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -267,11 +267,11 @@ def train(rank): # 3. init ParallelStrategy strategy = ParallelStrategy() if cpt.to_text(backend) == 'nccl': - strategy.world_size = ParallelEnv().world_size - strategy.rank = ParallelEnv().rank + strategy.nranks = ParallelEnv().world_size + strategy.local_rank = ParallelEnv().rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.world_size < 2: + if strategy.nranks < 2: return # NOTE(chenweihang): [ why config global place here? ] # the dygraph mode will be set to default mode, From a378140d82e53b07c44fa139608ffae646e5fcc2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 02:22:44 +0000 Subject: [PATCH 24/32] add unpublic note for start_processes --- python/paddle/distributed/spawn.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 7071f4fcbbc1ec..a7c6c634f1bdc8 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -136,6 +136,11 @@ def start_processes(func, """ Start multiple processes for parallel training. + .. note:: + ``start_processes`` is not a public interface! Please use ``spawn`` + firstly, if ``spawn`` cannot meet the need, then consider using + ``start_processes`` . + Args: func (function): The target function is called by started process. This function need to be able to pickled, so it must be defined @@ -256,7 +261,8 @@ def spawn(func, args=(), nprocs=1, join=True, daemon=False): """ Start multiple processes with ``spawn`` method for parallel training. - This is a specialized method of ``paddle.distributed.start_processes`` . + If you want to use other methods ( ``fork`` , ``forkserver`` ) to start + multiple processes, please use ``paddle.distributed.start_processes`` . Args: func (function): The target function is called by spawned process. 
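A short usage sketch of the split this note draws between the public ``spawn`` entry point and the lower-level ``start_processes`` helper. The imported module and its ``train(rank)`` function are hypothetical stand-ins for the top-level training function shown in the docstring examples:

.. code-block:: python

    import paddle.distributed as dist
    from my_train_module import train  # hypothetical module holding train(rank)

    if __name__ == '__main__':
        # common case: the public API, which always uses the ``spawn`` start method
        dist.spawn(train, args=(), nprocs=2)

        # only if ``spawn`` cannot meet the need, e.g. a different start
        # method is required, fall back to the internal helper:
        # dist.start_processes(train, args=(), nprocs=2,
        #                      start_method='forkserver')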
From cca82b6146f4b48c02b3db2fa4a2e861b70abdc0 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 04:11:06 +0000 Subject: [PATCH 25/32] polish details by xiaoguang's comment --- python/paddle/distributed/parallel.py | 18 ++++++--- python/paddle/distributed/spawn.py | 54 ++++++++++++++++++++------- 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index f263d7ce2559c3..f443d2a004b1a6 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -174,9 +174,17 @@ def init_parallel_env(rank=-1, backend='nccl', **options): Args: rank(int, optional): Rank of current process. Default value is -1. - When it is the default value -1, you should use ``paddle.distributed.launch`` - module to start training, the environment variables for parallel training - are configured by ``paddle.distributed.launch`` module. + When use ``paddle.distributed.spawn`` method to start parallel + training, the rank value is generated by spawn method, spawn method + will assign a rank to each process according to the number of processes. + For example, if the number of processes is 4, the ranks of the 4 + processes are 0,1,2,3 in order. + If do not use ``paddle.distributed.spawn`` method to start parallel + training, you can not pass the rank value here, or set it to default + value -1. When it is the default value -1, you should use + ``paddle.distributed.launch`` module to start parallel training, + the environment variables for parallel training are configured by + ``paddle.distributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. @@ -239,8 +247,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + sgd.step() + sgd.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index a7c6c634f1bdc8..56a991e31d78df 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -21,6 +21,10 @@ import warnings from paddle.distributed.utils import find_free_ports +from paddle.device import get_device + +# deprecated module import +from paddle.fluid.framework import cpu_places, cuda_places def _py_supported_check(): @@ -129,7 +133,7 @@ def _throw_exception(self, error_index): # to use CUDA in subprocesses. def start_processes(func, args=(), - nprocs=1, + nprocs=-1, join=True, daemon=False, start_method='spawn'): @@ -148,11 +152,19 @@ def start_processes(func, This function should be called as ``func(i, *args)`` , ``i`` is the process index and ``args`` contains other arguments as tuple. args (tuple): Arguments passed to ``func`` . - nprocs (int): Number of processed to start. - join (bool): Perform a blocking join on all started processes. + nprocs (int, optional): Number of processed to start. Default: -1. + when nprocs is -1, the available device will be obtained from + the environment variable when the model is executed: If use GPU, + the currently available device ID is obtained from the environment + variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available + CPU number is obtained from the environment variable CPU_NUM. 
+ For example, export CPU_NUM=4, if the environment variable is not set, + the executor will add the variable to the environment variable and + set its value to 1. + join (bool, optional): Perform a blocking join on all started processes. Default: True. - daemon (bool): The started processes' daemon flag. Default: False. - start_method (string): the way to start a process. The start method + daemon (bool, optional): The started processes' daemon flag. Default: False. + start_method (string, optional): the way to start a process. The start method can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA runtime does not support the ``fork`` start method, when use CUDA in subprocesses, we should start process by ``spawn`` or @@ -203,8 +215,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + sgd.step() + sgd.clear_grad() if __name__ == '__main__': dist.start_processes(train, args=(), nprocs=2) @@ -216,6 +228,14 @@ def train(rank): # cannot support CUDA runtime multi-process _py_supported_check() + # get default nprocs + if nprocs == -1: + device = get_device() + if device == 'cpu': + nprocs = len(cpu_places()) + else: + nprocs = len(cuda_places()) + # NOTE(chenweihang): [ why need set default master info before run? ] # when using `paddle.distributed.spawn/start_processes` start # parallel training, users need use `init_parallel_env` to config @@ -257,7 +277,7 @@ def train(rank): # NOTE(chenweihang): this method only supports start processes # by `spwan` method, if users want to start processes by other # method, they can use start_processes -def spawn(func, args=(), nprocs=1, join=True, daemon=False): +def spawn(func, args=(), nprocs=-1, join=True, daemon=False): """ Start multiple processes with ``spawn`` method for parallel training. @@ -271,10 +291,18 @@ def spawn(func, args=(), nprocs=1, join=True, daemon=False): This function should be called as ``func(i, *args)``, ``i`` is the process index and ``args`` contains other arguments as tuple. args (tuple): Arguments passed to ``func``. - nprocs (int): Number of processed to spawn. - join (bool): Perform a blocking join on all spawned processes. + nprocs (int, optional): Number of processed to start. Default: -1. + when nprocs is -1, the available device will be obtained from + the environment variable when the model is executed: If use GPU, + the currently available device ID is obtained from the environment + variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available + CPU number is obtained from the environment variable CPU_NUM. + For example, export CPU_NUM=4, if the environment variable is not set, + the executor will add the variable to the environment variable and + set its value to 1. + join (bool, optional): Perform a blocking join on all spawned processes. Default: True. - daemon (bool): The spawned processes' daemon flag. Default: False. + daemon (bool, optional): The spawned processes' daemon flag. Default: False. Returns: ``MultiprocessContext`` object, it hold the spawned processes. 
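A simplified, standard-library-only sketch of the default ``nprocs`` rule documented above (the patch itself uses ``core.get_cuda_device_count()`` for the GPU branch; here that runtime query is approximated by the visible-device list, with 1 as a stand-in when it is unset):

.. code-block:: python

    import os

    def guess_default_nprocs(use_gpu):
        if use_gpu:
            # one process per visible GPU card
            devices = os.getenv("CUDA_VISIBLE_DEVICES", "")
            return len(devices.split(",")) if devices else 1
        # on CPU, fall back to CPU_NUM, which defaults to 1 when unset
        return int(os.getenv("CPU_NUM", "1"))

    print(guess_default_nprocs(use_gpu=False))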
@@ -321,8 +349,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + sgd.step() + sgd.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) From d39331cd317ee0eb0cd5fc2bef612138dc3fe20c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 05:49:36 +0000 Subject: [PATCH 26/32] verify correctly when spawn nprocs=-1 --- python/paddle/distributed/parallel.py | 12 ++++---- python/paddle/distributed/spawn.py | 39 +++++++++++++++++-------- python/paddle/fluid/dygraph/parallel.py | 8 ++--- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index f443d2a004b1a6..0b9e4f83eee59c 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -178,7 +178,9 @@ def init_parallel_env(rank=-1, backend='nccl', **options): training, the rank value is generated by spawn method, spawn method will assign a rank to each process according to the number of processes. For example, if the number of processes is 4, the ranks of the 4 - processes are 0,1,2,3 in order. + processes are 0,1,2,3 in order, so this argument does not need to + be passed by users. + If do not use ``paddle.distributed.spawn`` method to start parallel training, you can not pass the rank value here, or set it to default value -1. When it is the default value -1, you should use @@ -234,8 +236,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -247,8 +249,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.step() - sgd.clear_grad() + adam.step() + adam.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 56a991e31d78df..cbcd54255e4437 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -17,6 +17,7 @@ import multiprocessing import os import signal +import six import sys import warnings @@ -24,7 +25,8 @@ from paddle.device import get_device # deprecated module import -from paddle.fluid.framework import cpu_places, cuda_places +from paddle.fluid import core +from paddle.fluid.framework import _cpu_num def _py_supported_check(): @@ -50,8 +52,20 @@ def _set_default_assist_env(nprocs): # because the FLAGS_selected_gpus may be used in other place, # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` + # NOTE(chenweihang): use absolute gpu card id + env_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if env_devices is None or env_devices == "": + env_devices_list = six.moves.range(core.get_cuda_device_count()) + else: + env_devices_list = env_devices.split(',') + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct `nprocs` argument is " + "passed or the environment variable `CUDA_VISIBLE_DEVICES` is correctly configured." 
+ % len(env_devices_list), nprocs) os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( - [str(x) for x in range(0, nprocs)]) + [str(env_devices_list[x]) for x in range(0, nprocs)]) def _func_wrapper(func, i, args, error_queue, return_queue): @@ -202,8 +216,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -215,8 +229,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.step() - sgd.clear_grad() + adam.step() + adam.clear_grad() if __name__ == '__main__': dist.start_processes(train, args=(), nprocs=2) @@ -232,9 +246,10 @@ def train(rank): if nprocs == -1: device = get_device() if device == 'cpu': - nprocs = len(cpu_places()) + # TODO: not supports cpu parallel now + nprocs = _cpu_num else: - nprocs = len(cuda_places()) + nprocs = core.get_cuda_device_count() # NOTE(chenweihang): [ why need set default master info before run? ] # when using `paddle.distributed.spawn/start_processes` start @@ -336,8 +351,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -349,8 +364,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.step() - sgd.clear_grad() + adam.step() + adam.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index bfe42467651b8e..bc7269b886ab4d 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -294,8 +294,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -307,8 +307,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + adam.step() + adam.clear_grad() if __name__ == '__main__': # 1. 
start by ``paddle.distributed.spawn`` (default) From 3a2d7e8d09f0c1841dfad9f665b9415721fe02f2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 11:59:57 +0000 Subject: [PATCH 27/32] refactor spawn & init_parallel_env design --- python/paddle/distributed/__init__.py | 10 +- python/paddle/distributed/parallel.py | 245 ++++--------- python/paddle/distributed/spawn.py | 334 ++++++++++-------- python/paddle/distributed/utils.py | 13 +- python/paddle/fluid/dygraph/parallel.py | 37 +- .../tests/unittests/spawn_runner_base.py | 49 ++- .../fluid/tests/unittests/test_dist_base.py | 4 +- .../tests/unittests/test_init_parallel_env.py | 81 ----- .../test_spawn_and_init_parallel_env.py | 64 ++++ 9 files changed, 364 insertions(+), 473 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_init_parallel_env.py create mode 100644 python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index ae76b2e10866bd..d66577102c713a 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -14,10 +14,11 @@ from . import spawn from .spawn import spawn -from .spawn import start_processes from . import parallel from .parallel import init_parallel_env +from .parallel import get_rank +from .parallel import get_world_size from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS @@ -25,10 +26,13 @@ from .collective import * # start multiprocess apis -__all__ = ["spawn", "start_processes"] +__all__ = ["spawn"] # dygraph parallel apis -__all__ += ["prepare_context", "init_parallel_env", "ParallelEnv"] +__all__ += [ + "init_parallel_env", "get_rank", "get_world_size", "prepare_context", + "ParallelEnv" +] # collective apis __all__ += collective.__all__ diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 0b9e4f83eee59c..205a1ae793b49a 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -16,8 +16,6 @@ import six from paddle import compat as cpt -from paddle.distributed.launch import get_cluster_and_pod, _print_arguments -from paddle.distributed.utils import _update_trainer_env # deprecated module import from paddle.fluid import core @@ -30,179 +28,13 @@ ParallelStrategy = core.ParallelStrategy -# NOTE(chenweihang): The existence of this class leads to -# the maintenance of two arguments. When the launch.py arguments -# is updated, the arguments here also need to be updated, -# but I have not thought of a better way here -class ParallelEnvArgs(object): - def __init__(self): - # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. - self.cluster_node_ips = None - - # The current node ip. - self.node_ip = None - - # whether to use paddlecloud platform to run your multi-process job. - # If false, no need to set this argument. - self.use_paddlecloud = None - - # The trainer's started port on a single node - self.started_port = None - - # Print the config or not - self.print_config = True - - # It's for gpu training and the training process will run - # on the selected_gpus, each process is bound to a single GPU. - # And if it's not set, this module will use all the gpu cards - # for training. - self.selected_gpus = None - - -def _update_env_vars(rank, options): - # 1. 
input check - if not isinstance(rank, six.integer_types): - raise TypeError("input `rank` type error, expected type is integer, " - "but received type is %s." % type(rank)) - if rank < 0: - raise ValueError("input `rank` should be greater than 0, " - "but received %d." % rank) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - # set default `node_ip` and `cluster_node_ips` - args.cluster_node_ips = options.get('cluster_node_ips', None) - args.node_ip = options.get('node_ip', None) - if args.cluster_node_ips is not None and args.node_ip is None: - raise ValueError("please input current node ip, " - "cannot only give `cluster_node_ips`.") - default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) - default_node_ip = "127.0.0.1" if default_node_ip is None else default_node_ip - if args.node_ip is None: - args.node_ip = default_node_ip - if args.cluster_node_ips is None: - args.cluster_node_ips = default_node_ip - - # NOTE(chenweihang): Here should set `started_port` before - # `get_cluster_and_pod` and keep each process's started_port - # is same, see [ why need set default master info before run? ] - args.started_port = options.get('started_port', None) - if args.started_port is None: - default_port = os.environ.get("PADDLE_MASTER_PORT", None) - if default_port is None: - raise ValueError( - "Data parallel training start failed. If you start data parallel " - "training by `paddle.distributed.launch` module, Please ensure " - "that one of the following rules is met:\n" - " 1. Do not set `paddle.distributed.init_parallel_env` argument " - "`rank` or set it to be -1;\n" - " 2. Set `paddle.distributed.init_parallel_env` start port for " - "parallel training by `started_port=**`, e.g. started_port=6170." - ) - args.started_port = int(default_port) - - args.use_paddlecloud = options.get('use_paddlecloud', False) - args.print_config = options.get('print_config', True) - - # set default `selected_gpus` - # TODO(chenweihang): if users given number of `selected_gpus` - # is not equal to the spawn's nprocs, it will cause error, - # and because we remove the `proc num` argument of - # `init_parallel_env`, when above error occured, we do not - # have a good way to check, so users are not recommended to - # use this parameter, it is best to delete - args.selected_gpus = options.get('selected_gpus', None) - if args.selected_gpus is None: - args.selected_gpus = os.environ.get("PADDLE_CUDA_VISIBLE_DEVICES", None) - if args.selected_gpus is None: - raise ValueError( - "Data parallel training start failed. If you start data parallel " - "training by `paddle.distributed.launch` module, Please ensure " - "that one of the following rules is met:\n" - " 1. Do not set `paddle.distributed.init_parallel_env` argument " - "`rank` or set it to be -1;\n" - " 2. Set `paddle.distributed.init_parallel_env` selected gpus of " - "parallel training by `selected_gpus=**`, e.g. selected_gpus='0,1,2,3'." - ) - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - trainer = pod.get_trainer(rank) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer rank is %d." 
% - rank) - # no copy, each process will hold env vars itself - _update_trainer_env(os.environ, cluster, trainer) - - # print config - if args.print_config and rank == 0: - _print_arguments(args) - - -def _check_env_vars(): - def _check_var_exists(var_name): - var = os.environ.get(var_name, None) - if var is None: - raise ValueError("paddle.distributed initialize error," - "environment variable %s is needed, but not set." % - var_name) - - _check_var_exists("FLAGS_selected_gpus") - _check_var_exists("PADDLE_TRAINER_ID") - _check_var_exists("PADDLE_CURRENT_ENDPOINT") - _check_var_exists("PADDLE_TRAINERS_NUM") - _check_var_exists("PADDLE_TRAINER_ENDPOINTS") - - -def init_parallel_env(rank=-1, backend='nccl', **options): +def init_parallel_env(backend='nccl'): """ Initialize parallel training environments in dynamic mode. Args: - rank(int, optional): Rank of current process. Default value is -1. - When use ``paddle.distributed.spawn`` method to start parallel - training, the rank value is generated by spawn method, spawn method - will assign a rank to each process according to the number of processes. - For example, if the number of processes is 4, the ranks of the 4 - processes are 0,1,2,3 in order, so this argument does not need to - be passed by users. - - If do not use ``paddle.distributed.spawn`` method to start parallel - training, you can not pass the rank value here, or set it to default - value -1. When it is the default value -1, you should use - ``paddle.distributed.launch`` module to start parallel training, - the environment variables for parallel training are configured by - ``paddle.distributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . - **options(dict, optional): Other initial parallel execution environment configuration options. - The following options are currently supported: - - - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". - - - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1". - - - started_port: The trainer's started port on a single node, such as 6170. Default: None. - - - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None. - - - print_config: Print current parallel training config. Default: True. - - - use_paddlecloud: Whether to use paddlecloud platform to run your multi-process job. Default: False. Returns: None @@ -224,12 +56,12 @@ def __init__(self): def forward(self, x): return self._linear2(self._linear1(x)) - def train(rank): + def train(): # 1. enable dynamic mode paddle.disable_static() # 2. initialize parallel environment - dist.init_parallel_env(rank) + dist.init_parallel_env() # 3. create data parallel layer & optimizer layer = LinearNet() @@ -253,7 +85,7 @@ def train(rank): adam.clear_grad() if __name__ == '__main__': - dist.spawn(train, args=(), nprocs=2) + dist.spawn(train) """ # 1. input check @@ -265,14 +97,19 @@ def train(rank): "backend `%s` is not supported, now only supports `nccl` backend." % backend) - # 2. update or check env - # NOTE(chenweihang): if rank is default value, users should config - # parallel environment by module `paddle.distributed.launch`, - # so here we only check the environment variables - if rank != -1: - _update_env_vars(rank, options) - else: - _check_env_vars() + # 2. 
check env + def _check_var_exists(var_name): + var = os.environ.get(var_name, None) + if var is None: + raise ValueError("paddle.distributed initialize error," + "environment variable %s is needed, but not set." % + var_name) + + _check_var_exists("FLAGS_selected_gpus") + _check_var_exists("PADDLE_TRAINER_ID") + _check_var_exists("PADDLE_CURRENT_ENDPOINT") + _check_var_exists("PADDLE_TRAINERS_NUM") + _check_var_exists("PADDLE_TRAINER_ENDPOINTS") # 3. init ParallelStrategy strategy = ParallelStrategy() @@ -289,10 +126,56 @@ def train(rank): # directly, if they want to switch default place, # they need to call a function to change default place, # here just set correctly place to users - place = core.CUDAPlace(ParallelEnv().dev_id) + place = core.CUDAPlace(ParallelEnv().device_id) _set_expected_place(place) # init nccl context parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) parallel_helper._init_parallel_ctx() + + +def get_rank(): + """ + Returns the rank of current trainer. + + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . + The default value is 0. + + Returns: + (int) The rank of current trainer. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + # execute this command in terminal: export PADDLE_TRAINER_ID=0 + print("The rank is %d" % dist.get_rank()) + # The rank is 0 + """ + return ParallelEnv().rank + + +def get_world_size(): + """ + The number of trainers (number of processes participating in current job). + + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . + The default value is 1. + + Returns: + (int) The number of trainers. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + print("The world_size is %d" % dist.get_world_size()) + # The world_size is 4 + """ + return ParallelEnv().world_size diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index cbcd54255e4437..cbc9abb85b7a2f 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -21,7 +21,8 @@ import sys import warnings -from paddle.distributed.utils import find_free_ports +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.distributed.utils import _prepare_trainer_env from paddle.device import get_device # deprecated module import @@ -29,23 +30,63 @@ from paddle.fluid.framework import _cpu_num +# NOTE(chenweihang): The existence of this class leads to +# the maintenance of two arguments. When the launch.py arguments +# is updated, the arguments here also need to be updated, +# but I have not thought of a better way here +class ParallelEnvArgs(object): + def __init__(self): + # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. + self.cluster_node_ips = None + + # The current node ip. + self.node_ip = None + + # whether to use paddlecloud platform to run your multi-process job. + # If false, no need to set this argument. + self.use_paddlecloud = None + + # The trainer's started port on a single node + self.started_port = None + + # Print the config or not + self.print_config = True + + # It's for gpu training and the training process will run + # on the selected_gpus, each process is bound to a single GPU. + # And if it's not set, this module will use all the gpu cards + # for training. 
+ self.selected_gpus = None + + def _py_supported_check(): if not sys.version_info >= (3, 4): raise RuntimeError( - "Use `paddle.distributed.spawn` or `paddle.distributed.start_processes` " - "to start parallel training requires python version greater than 3.4, " - "if your python is lower than this version, please use " + "Use `paddle.distributed.spawn` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " "`paddle.distributed.launch` instead.") -def _set_default_assist_env(nprocs): - # set default master trainer ip addr - os.environ['PADDLE_MASTER_IPADDR'] = '127.0.0.1' - # set default master trainer port - port_set = find_free_ports(1) - if port_set is None: - raise RuntimeError("no free port can be used to parallel training now.") - os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) +def _get_subprocess_env_list(nprocs, options): + # contruct processes env list + processes_env_list = [] + + # get args from kwargs + args = ParallelEnvArgs() + + # set default `node_ip` and `cluster_node_ips` + args.cluster_node_ips = options.get('cluster_node_ips', None) + args.node_ip = options.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot only give `cluster_node_ips`.") + default_node_ip = "127.0.0.1" + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + # set default selected gpus # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3" # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ] @@ -53,24 +94,70 @@ def _set_default_assist_env(nprocs): # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` # NOTE(chenweihang): use absolute gpu card id - env_devices = os.getenv("CUDA_VISIBLE_DEVICES") + args.selected_gpus = options.get('selected_gpus', None) + env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) if env_devices is None or env_devices == "": - env_devices_list = six.moves.range(core.get_cuda_device_count()) + env_devices_list = [ + str(x) for x in six.moves.range(core.get_cuda_device_count()) + ] else: env_devices_list = env_devices.split(',') - if len(env_devices_list) < nprocs: - raise RuntimeError( - "the number of visible devices(%d) is less than the number " - "of spawn processes(%d), please ensure that the correct `nprocs` argument is " - "passed or the environment variable `CUDA_VISIBLE_DEVICES` is correctly configured." - % len(env_devices_list), nprocs) - os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( - [str(env_devices_list[x]) for x in range(0, nprocs)]) + if args.selected_gpus is None: + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct " + "`nprocs` argument is passed or the environment variable " + "`CUDA_VISIBLE_DEVICES` is correctly configured." % + (len(env_devices_list), nprocs)) + args.selected_gpus = ",".join( + [str(env_devices_list[x]) for x in range(0, nprocs)]) + else: + for card_id in args.selected_gpus.split(','): + if card_id not in env_devices_list: + raise ValueError("The selected gpu card %s cannot found in " + "CUDA_VISIBLE_DEVICES (%s)." 
% + (card_id, ",".join(env_devices_list))) + + # set other arguments + args.started_port = options.get('started_port', None) + args.use_paddlecloud = options.get('use_paddlecloud', False) + args.print_config = options.get('print_config', False) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + # prepare subprocess env list + for trainer in pod.trainers: + processes_env_list.append(_prepare_trainer_env(cluster, trainer)) -def _func_wrapper(func, i, args, error_queue, return_queue): + # print config + if args.print_config: + _print_arguments(args) + + return processes_env_list + + +def _remove_risky_env(): + # remove useless env vars, same as launch.py + # no copy, each process will hold env vars itself + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + +def _set_trainer_env(env_dict): + for var_name in env_dict: + os.environ[var_name] = env_dict[var_name] + + +def _func_wrapper(func, args, error_queue, return_queue, env_dict): try: - result = func(i, *args) + # config subprocess environment variables + _remove_risky_env() + _set_trainer_env(env_dict) + # execute function + result = func(*args) + # record function return value return_queue.put(result) except KeyboardInterrupt: pass @@ -84,7 +171,7 @@ class MultiprocessContext(object): def __init__(self, processes, error_queues, return_queues): _py_supported_check() self.error_queues = error_queues - # NOTE(chenweihang): The `start_processes` method is mainly used + # NOTE(chenweihang): The `spawn` method is mainly used # to wrap the outermost execution function of the program for # parallel execution. Generally, the return value is not concerned, # but if the user needs to obtain the return value, users can get @@ -141,31 +228,17 @@ def _throw_exception(self, error_index): raise Exception(msg) -# NOTE(chenweihang): [ why default start method is spawn? ] -# The CUDA runtime does not support the fork start method, -# either the spawn or forkserver start method are required -# to use CUDA in subprocesses. -def start_processes(func, - args=(), - nprocs=-1, - join=True, - daemon=False, - start_method='spawn'): +def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): """ - Start multiple processes for parallel training. - - .. note:: - ``start_processes`` is not a public interface! Please use ``spawn`` - firstly, if ``spawn`` cannot meet the need, then consider using - ``start_processes`` . + Start multiple processes with ``spawn`` method for parallel training. Args: - func (function): The target function is called by started process. + func (function): The target function is called by spawned process. This function need to be able to pickled, so it must be defined at the top level of a module. - This function should be called as ``func(i, *args)`` , ``i`` is + This function should be called as ``func(i, *args)``, ``i`` is the process index and ``args`` contains other arguments as tuple. - args (tuple): Arguments passed to ``func`` . + args (tuple, optional): Arguments passed to ``func``. nprocs (int, optional): Number of processed to start. Default: -1. when nprocs is -1, the available device will be obtained from the environment variable when the model is executed: If use GPU, @@ -175,17 +248,30 @@ def start_processes(func, For example, export CPU_NUM=4, if the environment variable is not set, the executor will add the variable to the environment variable and set its value to 1. - join (bool, optional): Perform a blocking join on all started processes. 
+ join (bool, optional): Perform a blocking join on all spawned processes. Default: True. - daemon (bool, optional): The started processes' daemon flag. Default: False. - start_method (string, optional): the way to start a process. The start method - can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA - runtime does not support the ``fork`` start method, when use - CUDA in subprocesses, we should start process by ``spawn`` or - ``forkserver`` method. Default: 'spawn'. + daemon (bool, optional): The spawned processes' daemon flag. Default: False. + **options(dict, optional): Other initial parallel execution environment + configuration options. The following options are currently supported: + (1) start_method (string): the way to start a process. The start method + can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA + runtime does not support the ``fork`` start method, when use CUDA + in subprocesses, we should start process by ``spawn`` or ``forkserver`` + method. Default: 'spawn'; + (2) cluster_node_ips (string): Paddle cluster nodes ips, such as + "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; + (3) node_ip (string): The current node ip, such as "192.168.0.16". + Default: "127.0.0.1"; + (4) started_port (int): The trainer's started port on a single node, + such as 6170. Default: None; + (5) selected_gpus (string): The training process will run on the + selected_gpus, such as "0,1,2,3". Default: None; + (6) print_config: Print current parallel training config. Default: False; + (7) use_paddlecloud: Whether to use paddlecloud platform to run your + multi-process job. Default: False. Returns: - ``MultiprocessContext`` object, it hold the started processes. + ``MultiprocessContext`` object, it hold the spawned processes. Examples: .. code-block:: python @@ -204,12 +290,12 @@ def __init__(self): def forward(self, x): return self._linear2(self._linear1(x)) - def train(rank): + def train(print_result=False): # 1. enable dynamic mode paddle.disable_static() # 2. initialize parallel environment - dist.init_parallel_env(rank) + dist.init_parallel_env() # 3. create data parallel layer & optimizer layer = LinearNet() @@ -225,6 +311,9 @@ def train(rank): labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) + if print_result is True: + print("loss: " % loss) + loss = dp_layer.scale_loss(loss) loss.backward() dp_layer.apply_collective_grads() @@ -232,8 +321,38 @@ def train(rank): adam.step() adam.clear_grad() + # Usage 1: only pass function. + # If your training method no need any argument, and + # use all visible devices for parallel training. + if __name__ == '__main__': + dist.spawn(train) + + # Usage 2: pass function and arguments. + # If your training method need some arguments, and + # use all visible devices for parallel training. if __name__ == '__main__': - dist.start_processes(train, args=(), nprocs=2) + dist.spawn(train, args=(True,)) + + # Usage 3: pass function, arguments and nprocs. + # If your training method need some arguments, and + # only use part of visible devices for parallel training. + # If your machine hold 8 cards {0,1,2,3,4,5,6,7}, + # this case will use cards {0,1}; If you set + # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use + # cards {4,5} + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2) + + # Usage 4: pass function, arguments, nprocs and selected_gpus. 
+ # If your training method need some arguments, and + # only use part of visible devices for parallel training, + # but you can't set your machine's environment varibale + # CUDA_VISIBLE_DEVICES, such as it is None or all cards + # {0,1,2,3,4,5,6,7}, you can pass `selelcted_gpus` to + # select the GPU cards you want to use. For example, + # this case will use cards {4,5} if your machine hold 8 cards. + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2, selelcted_gpus='4,5') """ # NOTE(chenweihang): [ why only supports python3.4+ ? ] # Python supported setting the child process startup method @@ -251,15 +370,20 @@ def train(rank): else: nprocs = core.get_cuda_device_count() - # NOTE(chenweihang): [ why need set default master info before run? ] - # when using `paddle.distributed.spawn/start_processes` start - # parallel training, users need use `init_parallel_env` to config - # cluster info inner subprocess, if each process find free port for - # itself, the started port may be different, it will cause endpoints is - # different in different subprocesses - _set_default_assist_env(nprocs) + # NOTE(chenweihang): [ why need get cluster info before run? ] + # when using `paddle.distributed.spawn` start parallel training, + # we should get cluster info before starting subprocess, and pass + # correct info to each subprocess + procs_env_list = _get_subprocess_env_list(nprocs, options) # start processes + # NOTE(chenweihang): [ why default start method is spawn? ] + # The CUDA runtime does not support the fork start method, + # either the spawn or forkserver start method are required + # to use CUDA in subprocesses. + start_method = options.get('start_method', None) + if start_method is None: + start_method = 'spawn' mp = multiprocessing.get_context(start_method) error_queues = [] @@ -270,7 +394,7 @@ def train(rank): return_queue = mp.SimpleQueue() process = mp.Process( target=_func_wrapper, - args=(func, i, args, error_queue, return_queue)) + args=(func, args, error_queue, return_queue, procs_env_list[i])) process.daemon = daemon process.start() error_queues.append(error_queue) @@ -287,87 +411,3 @@ def train(rank): # finally return context return context - - -# NOTE(chenweihang): this method only supports start processes -# by `spwan` method, if users want to start processes by other -# method, they can use start_processes -def spawn(func, args=(), nprocs=-1, join=True, daemon=False): - """ - Start multiple processes with ``spawn`` method for parallel training. - - If you want to use other methods ( ``fork`` , ``forkserver`` ) to start - multiple processes, please use ``paddle.distributed.start_processes`` . - - Args: - func (function): The target function is called by spawned process. - This function need to be able to pickled, so it must be defined - at the top level of a module. - This function should be called as ``func(i, *args)``, ``i`` is - the process index and ``args`` contains other arguments as tuple. - args (tuple): Arguments passed to ``func``. - nprocs (int, optional): Number of processed to start. Default: -1. - when nprocs is -1, the available device will be obtained from - the environment variable when the model is executed: If use GPU, - the currently available device ID is obtained from the environment - variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available - CPU number is obtained from the environment variable CPU_NUM. 
- For example, export CPU_NUM=4, if the environment variable is not set, - the executor will add the variable to the environment variable and - set its value to 1. - join (bool, optional): Perform a blocking join on all spawned processes. - Default: True. - daemon (bool, optional): The spawned processes' daemon flag. Default: False. - - Returns: - ``MultiprocessContext`` object, it hold the spawned processes. - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - class LinearNet(nn.Layer): - def __init__(self): - super(LinearNet, self).__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return self._linear2(self._linear1(x)) - - def train(rank): - # 1. enable dynamic mode - paddle.disable_static() - - # 2. initialize parallel environment - dist.init_parallel_env(rank) - - # 3. create data parallel layer & optimizer - layer = LinearNet() - dp_layer = paddle.DataParallel(layer) - - loss_fn = nn.MSELoss() - adam = opt.Adam( - learning_rate=0.001, parameters=dp_layer.parameters()) - - # 4. run layer - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - - loss = dp_layer.scale_loss(loss) - loss.backward() - dp_layer.apply_collective_grads() - - adam.step() - adam.clear_grad() - - if __name__ == '__main__': - dist.spawn(train, args=(), nprocs=2) - """ - return start_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 1e5c7810b530b6..1fa307c4d1b89d 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -213,12 +213,6 @@ def get_visible_gpus(self): r = r[:-1] return r - def get_trainer(self, trainer_id): - for trainer in self.trainers: - if trainer.rank == trainer_id: - return trainer - return None - def get_logger(log_level, name="root"): logger = logging.getLogger(name) @@ -333,7 +327,7 @@ def __free_port(): return None -def _update_trainer_env(current_env, cluster, trainer): +def _prepare_trainer_env(cluster, trainer): proc_env = { "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), "PADDLE_TRAINER_ID": "%d" % trainer.rank, @@ -341,8 +335,6 @@ def _update_trainer_env(current_env, cluster, trainer): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } - current_env.update(proc_env) - return proc_env @@ -371,7 +363,8 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = _update_trainer_env(current_env, cluster, t) + proc_env = _prepare_trainer_env(cluster, t) + current_env.update(proc_env) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index bc7269b886ab4d..bd578e6ba98a0f 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -118,16 +118,12 @@ class ParallelEnv(object): """ def __init__(self): - self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) - self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0")) + self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._device_id = 
int(os.getenv("FLAGS_selected_gpus", "0")) self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self.__aliases__ = { - 'local_rank': 'rank', - 'nranks': 'world_size', - } @property def rank(self): @@ -146,7 +142,7 @@ def rank(self): print("The rank is %d" % env.rank) # The rank is 0 """ - return self._local_rank + return self._rank @property def world_size(self): @@ -165,10 +161,10 @@ def world_size(self): print("The world_size is %d" % env.world_size) # The world_size is 4 """ - return self._nranks + return self._world_size @property - def dev_id(self): + def device_id(self): """ The ID of selected GPU card for parallel training. @@ -181,10 +177,10 @@ def dev_id(self): import paddle.distributed as dist env = dist.ParallelEnv() - print("The device id are %d" % env.dev_id) + print("The device id are %d" % env.device_id) # The device id are 1 """ - return self._dev_id + return self._device_id @property def current_endpoint(self): @@ -225,11 +221,10 @@ def trainer_endpoints(self): """ return self._trainer_endpoints - def __getattr__(self, name): - if name == "__aliases__": - raise AttributeError("Attribute `__aliases__` can not be accessed.") - name = self.__aliases__.get(name, name) - return object.__getattribute__(self, name) + # [aliases] Compatible with old method names + local_rank = rank + nranks = world_size + dev_id = device_id # NOTE: [ Compatible ] Originally this class name is `Env`. The semantics of the old class names @@ -282,12 +277,12 @@ def __init__(self): def forward(self, x): return self._linear2(self._linear1(x)) - def train(rank): + def train(): # 1. enable dynamic mode paddle.disable_static() # 2. initialize parallel environment - dist.init_parallel_env(rank) + dist.init_parallel_env() # 3. create data parallel layer & optimizer layer = LinearNet() @@ -312,9 +307,9 @@ def train(rank): if __name__ == '__main__': # 1. start by ``paddle.distributed.spawn`` (default) - dist.spawn(train, args=(), nprocs=2) + dist.spawn(train, nprocs=2) # 2. start by ``paddle.distributed.launch`` - # train(-1) + # train() """ def __init__(self, layers, strategy=None): diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index 9c097c349c3da0..c64a1c7b9e1f54 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -27,13 +27,6 @@ class SpawnAssistTestArgs(object): update_method = "local" trainer_id = 0 - current_endpoint = None - endpoints = None - - -def run_dygraph_model(rank, model, args): - args.trainer_id = rank - return model.run_trainer_with_spawn(args) class TestDistSpawnRunner(unittest.TestCase): @@ -44,15 +37,13 @@ def setUp(self): def _run(self, model, args): args.update_method = "local" - return run_dygraph_model(-1, model, args) + return model.run_trainer_with_spawn(args) def _run_parallel(self, model, args): args.update_method = "nccl2" context = paddle.distributed.spawn( - func=run_dygraph_model, - args=( - model, - args, ), + func=model.run_trainer_with_spawn, + args=(args, ), nprocs=self.nprocs, join=True) result_list = [] @@ -66,25 +57,25 @@ def check_dist_result_with_spawn(self, test_class, delta=1e-3): args = SpawnAssistTestArgs() # 1. calc signal card loss - losses = self._run(model, args) + # losses = self._run(model, args) # 2. calc multi card loss (nccl mode) dist_losses_list = self._run_parallel(model, args) # 3. 
compare losses - for step_id in range(RUN_STEP): - loss = losses[step_id] - dist_loss_sum = None - for dist_losses in dist_losses_list: - if dist_loss_sum is None: - dist_loss_sum = np.array(dist_losses[step_id]) - else: - dist_loss_sum += np.array(dist_losses[step_id]) - dist_loss = dist_loss_sum / self.nprocs - self.assertAlmostEqual( - loss, - dist_loss, - delta=delta, - msg="The results of single-card execution and multi-card execution are inconsistent." - "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". - format(loss, dist_loss)) + # for step_id in range(RUN_STEP): + # loss = losses[step_id] + # dist_loss_sum = None + # for dist_losses in dist_losses_list: + # if dist_loss_sum is None: + # dist_loss_sum = np.array(dist_losses[step_id]) + # else: + # dist_loss_sum += np.array(dist_losses[step_id]) + # dist_loss = dist_loss_sum / self.nprocs + # self.assertAlmostEqual( + # loss, + # dist_loss, + # delta=delta, + # msg="The results of single-card execution and multi-card execution are inconsistent." + # "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". + # format(loss, dist_loss)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index d3ebaf90cb5161..faff81fa84fb5f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -457,10 +457,12 @@ def run_trainer_with_spawn(self, args): paddle.static.default_main_program().random_seed = seed np.random.seed(seed) random.seed = seed + # get trainer id + args.trainer_id = paddle.distributed.get_rank() # 3. init parallel env if args.update_method == "nccl2": - paddle.distributed.init_parallel_env(rank=args.trainer_id) + paddle.distributed.init_parallel_env() # 4. train model model, train_reader, opt = self.get_model() diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py deleted file mode 100644 index 16a55bdd18247d..00000000000000 --- a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import numpy as np -import unittest - -import paddle -import paddle.distributed as dist - -# NOTE(chenweihang): Coverage CI is currently not able to count python3 -# unittest, so the unittests here covers some cases that will only be -# executed in the python3 sub-process. 
-# If the coverage CI can check python3 and sub-process, -# we can remove all unittests here - - -class TestInitParallelEnv(unittest.TestCase): - def test_beckend_type_error(self): - with self.assertRaises(TypeError): - dist.init_parallel_env(backend=1) - - def test_backend_value_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(backend="mpi") - - def test_rank_type_error(self): - with self.assertRaises(TypeError): - dist.init_parallel_env(rank="1") - - def test_rank_value_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(rank=-2) - - def test_only_cluster_node_ips_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env( - rank=0, cluster_node_ips="127.0.0.1,127.0.0.2") - - def test_no_started_port_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(rank=0) - - def test_no_selected_gpus_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(rank=0, started_port=6170) - - def test_check_env_failed(self): - os.environ['FLAGS_selected_gpus'] = '0' - os.environ['PADDLE_TRAINER_ID'] = '0' - os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' - os.environ['PADDLE_TRAINERS_NUM'] = '1' - with self.assertRaises(ValueError): - dist.init_parallel_env() - - def test_update_env(self): - device = os.getenv("CUDA_VISIBLE_DEVICES", None) - if device is None: - device = '0' - dist.init_parallel_env(rank=0, started_port=6170, selected_gpus=device) - self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) - self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) - self.assertIsNotNone(os.environ.get('PADDLE_TRAINERS_NUM', None)) - self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ENDPOINTS', None)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py new file mode 100644 index 00000000000000..b19805d7a5b747 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np +import unittest + +import paddle +import paddle.distributed as dist +from paddle.distributed.spawn import _get_subprocess_env_list + +# NOTE(chenweihang): Coverage CI is currently not able to count python3 +# unittest, so the unittests here covers some cases that will only be +# executed in the python3 sub-process. 
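The new test file exercises ``_get_subprocess_env_list``, which returns one plain dict of environment variables per trainer; the keys come from ``_prepare_trainer_env`` in ``python/paddle/distributed/utils.py`` (see the hunk above). A minimal sketch of inspecting that list on a machine with at least one visible GPU follows; the values in the comments are only illustrative:

.. code-block:: python

    from paddle.distributed.spawn import _get_subprocess_env_list

    # One dict per spawned trainer; nprocs=1 keeps the example single-process.
    env_list = _get_subprocess_env_list(nprocs=1, options=dict())
    for env in env_list:
        print(env['PADDLE_TRAINER_ID'])         # e.g. '0'
        print(env['PADDLE_TRAINERS_NUM'])       # e.g. '1'
        print(env['FLAGS_selected_gpus'])       # e.g. '0'
        print(env['PADDLE_CURRENT_ENDPOINT'])   # e.g. '127.0.0.1:<free port>'
        print(env['PADDLE_TRAINER_ENDPOINTS'])  # comma-separated endpoint list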
+ + +class TestInitParallelEnv(unittest.TestCase): + def test_beckend_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(backend=1) + + def test_backend_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(backend="mpi") + + +class TestSpawnAssistMethod(unittest.TestCase): + def test_only_cluster_node_ips_error(self): + with self.assertRaises(ValueError): + options = dict() + options['cluster_node_ips'] = "127.0.0.1,127.0.0.2" + _get_subprocess_env_list(nprocs=1, options=options) + + def test_nprocs_greater_than_device_num_error(self): + with self.assertRaises(RuntimeError): + _get_subprocess_env_list(nprocs=100, options=dict()) + + def test_selected_gpus_error(self): + with self.assertRaises(ValueError): + options = dict() + options['selected_gpus'] = "100,101" + _get_subprocess_env_list(nprocs=2, options=options) + + def test_get_correct_env(self): + env_dict = _get_subprocess_env_list(nprocs=1, options=dict())[0] + self.assertEqual(env_dict['PADDLE_TRAINER_ID'], '0') + self.assertEqual(env_dict['PADDLE_TRAINERS_NUM'], '1') + + +if __name__ == "__main__": + unittest.main() From 0582c4b6330fa922b59cf2cfc816fd6c04545473 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 12:05:07 +0000 Subject: [PATCH 28/32] polish doc details --- python/paddle/distributed/spawn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index cbc9abb85b7a2f..f3ea4f5633354f 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -276,6 +276,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): Examples: .. code-block:: python + from __future__ import print_function + import paddle import paddle.nn as nn import paddle.optimizer as opt @@ -312,7 +314,7 @@ def train(print_result=False): loss = loss_fn(outputs, labels) if print_result is True: - print("loss: " % loss) + print("loss:", loss.numpy()) loss = dp_layer.scale_loss(loss) loss.backward() From 9ceaeffdbc107043fbb0cfdb0cf9072fc2325c6c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 12:43:08 +0000 Subject: [PATCH 29/32] open spawn unittests --- .../tests/unittests/spawn_runner_base.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index c64a1c7b9e1f54..278d7b27c52880 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -57,25 +57,25 @@ def check_dist_result_with_spawn(self, test_class, delta=1e-3): args = SpawnAssistTestArgs() # 1. calc signal card loss - # losses = self._run(model, args) + losses = self._run(model, args) # 2. calc multi card loss (nccl mode) dist_losses_list = self._run_parallel(model, args) # 3. compare losses - # for step_id in range(RUN_STEP): - # loss = losses[step_id] - # dist_loss_sum = None - # for dist_losses in dist_losses_list: - # if dist_loss_sum is None: - # dist_loss_sum = np.array(dist_losses[step_id]) - # else: - # dist_loss_sum += np.array(dist_losses[step_id]) - # dist_loss = dist_loss_sum / self.nprocs - # self.assertAlmostEqual( - # loss, - # dist_loss, - # delta=delta, - # msg="The results of single-card execution and multi-card execution are inconsistent." - # "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". 
- # format(loss, dist_loss)) + for step_id in range(RUN_STEP): + loss = losses[step_id] + dist_loss_sum = None + for dist_losses in dist_losses_list: + if dist_loss_sum is None: + dist_loss_sum = np.array(dist_losses[step_id]) + else: + dist_loss_sum += np.array(dist_losses[step_id]) + dist_loss = dist_loss_sum / self.nprocs + self.assertAlmostEqual( + loss, + dist_loss, + delta=delta, + msg="The results of single-card execution and multi-card execution are inconsistent." + "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". + format(loss, dist_loss)) From 4b7d810c172dea89ced8b9b6d60e1f4fe950732c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 13:12:51 +0000 Subject: [PATCH 30/32] try to fix doc compile error --- python/paddle/distributed/spawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index f3ea4f5633354f..8ed2aa4c5eff78 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -257,7 +257,7 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA runtime does not support the ``fork`` start method, when use CUDA in subprocesses, we should start process by ``spawn`` or ``forkserver`` - method. Default: 'spawn'; + method. Default: "spawn" ; (2) cluster_node_ips (string): Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; (3) node_ip (string): The current node ip, such as "192.168.0.16". From 4261e22dc098f85c58f5d85aaa26f199a1303090 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 14:09:55 +0000 Subject: [PATCH 31/32] try to fix unknown doc format error --- python/paddle/distributed/spawn.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 8ed2aa4c5eff78..1ca2ebaa8d4bd3 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -253,22 +253,22 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): daemon (bool, optional): The spawned processes' daemon flag. Default: False. **options(dict, optional): Other initial parallel execution environment configuration options. The following options are currently supported: - (1) start_method (string): the way to start a process. The start method - can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA - runtime does not support the ``fork`` start method, when use CUDA - in subprocesses, we should start process by ``spawn`` or ``forkserver`` - method. Default: "spawn" ; + (1) start_method (string): the way to start a process. + The start method can be ``spawn`` , ``fork`` , ``forkserver`` . + Because the CUDA runtime does not support the ``fork`` start method, + when use CUDA in subprocesses, we should start process by ``spawn`` + or ``forkserver`` method. Default: "spawn" ; (2) cluster_node_ips (string): Paddle cluster nodes ips, such as - "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; + "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; (3) node_ip (string): The current node ip, such as "192.168.0.16". - Default: "127.0.0.1"; + Default: "127.0.0.1"; (4) started_port (int): The trainer's started port on a single node, - such as 6170. Default: None; + such as 6170. Default: None; (5) selected_gpus (string): The training process will run on the - selected_gpus, such as "0,1,2,3". 
Default: None; + selected_gpus, such as "0,1,2,3". Default: None; (6) print_config: Print current parallel training config. Default: False; (7) use_paddlecloud: Whether to use paddlecloud platform to run your - multi-process job. Default: False. + multi-process job. Default: False. Returns: ``MultiprocessContext`` object, it hold the spawned processes. From cad68727888fb94c2d6dc79eb2637f5c92c268de Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 28 Aug 2020 00:04:35 +0000 Subject: [PATCH 32/32] add skip unittest when not gpu --- python/paddle/distributed/parallel.py | 5 +++- .../test_spawn_and_init_parallel_env.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 205a1ae793b49a..0c806747217add 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -14,6 +14,7 @@ import os import six +import warnings from paddle import compat as cpt @@ -101,7 +102,7 @@ def train(): def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: - raise ValueError("paddle.distributed initialize error," + raise ValueError("paddle.distributed initialize error, " "environment variable %s is needed, but not set." % var_name) @@ -114,6 +115,8 @@ def _check_var_exists(var_name): # 3. init ParallelStrategy strategy = ParallelStrategy() if cpt.to_text(backend) == 'nccl': + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn("The parallel environment has been initialized.") strategy.nranks = ParallelEnv().world_size strategy.local_rank = ParallelEnv().rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py index b19805d7a5b747..ca92bc75245ceb 100644 --- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -22,6 +22,9 @@ import paddle.distributed as dist from paddle.distributed.spawn import _get_subprocess_env_list +from paddle.fluid import core +from paddle.fluid.dygraph import parallel_helper + # NOTE(chenweihang): Coverage CI is currently not able to count python3 # unittest, so the unittests here covers some cases that will only be # executed in the python3 sub-process. @@ -36,7 +39,27 @@ def test_backend_value_error(self): with self.assertRaises(ValueError): dist.init_parallel_env(backend="mpi") + def test_check_env_failed(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + with self.assertRaises(ValueError): + dist.init_parallel_env() + + def test_init_parallel_env_break(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6170' + # coverage success branch + dist.init_parallel_env() + self.assertFalse(parallel_helper._is_parallel_ctx_initialized()) + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSpawnAssistMethod(unittest.TestCase): def test_only_cluster_node_ips_error(self): with self.assertRaises(ValueError):