From 97b8bdc6f2e1be226e224a6c9cf0ed459a8fc0e6 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 7 Aug 2020 10:16:48 +0000 Subject: [PATCH 01/32] add dygraph parallel run interface --- python/paddle/distributed/__init__.py | 7 + python/paddle/distributed/launch.py | 8 +- python/paddle/distributed/run.py | 216 ++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/run.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index d0c32e26092f6e..3e0051535449c6 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -11,3 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .run import init_parallel_env, run + +__all__ = [ + "init_parallel_env", + "run", +] diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index ecd1cf0ca7bef6..80d97e325ba223 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -190,7 +190,7 @@ def get_gpus(selected_gpus): return selected_gpus -def launch(args): +def get_cluster_and_pod(args): # parse arguments, used for cloud-single-machine and local selected_gpus = get_gpus(args.selected_gpus) trainers_num = cloud_utils.get_trainers_num() @@ -209,6 +209,12 @@ def launch(args): cluster, pod = get_cluster_from_args(args, selected_gpus) logger.info("get cluster from args:{}".format(cluster)) + return cluster, pod + + +def launch(args): + cluster, pod = get_cluster_and_pod(args) + procs = start_local_trainers( cluster, pod, diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py new file mode 100644 index 00000000000000..748513799a14d9 --- /dev/null +++ b/python/paddle/distributed/run.py @@ -0,0 +1,216 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import copy +import multiprocessing +import os +import signal +import six +import sys + +import paddle.fluid as fluid +from paddle.distributed.launch import get_cluster_and_pod + + +def _py_version_check(): + if not sys.version_info >= (3, 4): + raise RuntimeError( + "Use `paddle.distributed.run` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " + "`paddle.distributed.launch` instead.") + + +class ParallelEnvArgs(object): + def __init__(self): + self.cluster_node_ips = None + self.node_ip = None + self.use_paddlecloud = None + self.started_port = None + self.selected_gpus = None + + +def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): + """ + + Args: + backend(str, optional): The backend to communication between multiple devices. + Now only support `nccl`. Default value is `nccl`. + """ + # 1. 
input check + if not isinstance(trainer_id, six.integer_types): + raise TypeError( + "input `trainer_id` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(trainer_num, six.integer_types): + raise TypeError( + "input `trainer_num` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(backend, six.string_types): + raise TypeError( + "input `backend` type error, expected type is str, but received type is %s." + % type(trainer_id)) + + if trainer_id > 0: + raise ValueError( + "input `trainer_id` should be greater than 0, but received %d." % + trainer_id) + if trainer_num > 0: + raise ValueError( + "input `trainer_num` should be greater than 0, but received %d." % + trainer_num) + if trainer_id < trainer_num: + raise ValueError( + "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." + % (trainer_id, trainer_num)) + if six.ensure_str(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # 2. check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") + args.node_ip = kwargs.get('node_ip', "127.0.0.1") + args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") + args.started_port = kwargs.get('started_port', None) + args.selected_gpus = kwargs.get('selected_gpus', None) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # copy env & remove useless env vars + current_env = copy.copy(os.environ.copy()) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + # prepare env var + + assert trainer_num == cluster.trainers_nranks( + ), "trainer number parse error." 
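+    # pick out the trainer entry whose id matches this process and export its
+    # parallel settings (selected gpus, trainer id, endpoints) as the
+    # environment variables listed above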
+ for trainer in pod.trainers: + if trainer.id == trainer_id: + proc_env = { + "FLAGS_selected_gpus": + "%s" % ",".join([str(g) for g in selected_gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.id, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": + ",".join(cluster.trainers_endpoints()) + } + current_env.update(proc_env) + break + + +def _func_wrapper(func, i, args, error_queue): + try: + func(i, *args) + except KeyboardInterrupt: + pass + except Exception: + import traceback + error_queue.put(traceback.format_exc()) + sys.exit(1) + + +class MultiprocessContext(object): + def __init__(self, processes, error_queues): + _py_version_check() + self.error_queues = error_queues + self.processes = processes + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } + + def join(self, timeout=None): + if len(self.sentinels) == 0: + return True + + ready = multiprocessing.connection.wait( + self.sentinels.keys(), timeout=timeout) + + error_index = None + for sentinel in ready: + index = self.sentinels.pop(sentinel) + process = self.processes[index] + process.join() + if process.exitcode != 0: + error_index = index + break + + if error_index is None: + return len(self.sentinels) == 0 + + for process in self.processes: + if process.is_alive(): + process.terminate() + process.join() + + if self.error_queues[error_index].empty(): + exitcode = self.processes[error_index].exitcode + if exitcode < 0: + name = signal.Signals(-exitcode).name + raise Exception("Process %d terminated with signal %s." % + (error_index, name)) + else: + raise Exception("Process %d terminated with exit code %s." & ( + error_index, exitcode)) + + original_trace = self.error_queues[error_index].get() + msg = "\n\n-- Procces %d terminated with the following error:\n" % error_index + msg += original_trace + raise Exception(msg) + + +def launch_processes(func, + args=(), + nprocs=1, + join=True, + daemon=False, + start_method='spawn'): + mp = multiprocessing.get_context(start_method) + error_queues = [] + processes = [] + for i in range(nprocs): + error_queue = mp.SimpleQueue() + process = mp.Process( + target=_func_wrapper, + args=(func, i, args, error_queue), + daemon=daemon) + process.start() + error_queues.append(error_queue) + processes.append(process) + + context = MultiprocessContext(processes, error_queues) + if not join: + return context + + # loop until all process end + while not context.join(): + pass + + +def run(func, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): + return launch_processes(func, args, nprocs, join, daemon, start_method) From 00b56d5600cd0b9815652f9cdbaa1431cc2f3265 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 7 Aug 2020 13:02:12 +0000 Subject: [PATCH 02/32] polish implement & unified env property name --- python/paddle/distributed/__init__.py | 1 + python/paddle/distributed/launch.py | 2 +- python/paddle/distributed/run.py | 22 ++++++------- python/paddle/fluid/dygraph/parallel.py | 43 ++++++++++++++++--------- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 3e0051535449c6..a558358df0f41b 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . 
import run from .run import init_parallel_env, run __all__ = [ diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 80d97e325ba223..62d46adffcd1af 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -48,7 +48,7 @@ import paddle.fluid as fluid from paddle.distributed.utils import * -import paddle.distributed.cloud_utils as cloud_utils +from paddle.distributed import cloud_utils def _print_arguments(args): diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 748513799a14d9..414e24d64782fb 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -64,15 +64,15 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): "input `backend` type error, expected type is str, but received type is %s." % type(trainer_id)) - if trainer_id > 0: + if trainer_id < 0: raise ValueError( "input `trainer_id` should be greater than 0, but received %d." % trainer_id) - if trainer_num > 0: + if trainer_num < 0: raise ValueError( "input `trainer_num` should be greater than 0, but received %d." % trainer_num) - if trainer_id < trainer_num: + if trainer_id >= trainer_num: raise ValueError( "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." % (trainer_id, trainer_num)) @@ -94,32 +94,32 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): args.node_ip = kwargs.get('node_ip', "127.0.0.1") args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") args.started_port = kwargs.get('started_port', None) - args.selected_gpus = kwargs.get('selected_gpus', None) + args.selected_gpus = ",".join( + [str(g) for g in [x for x in range(0, trainer_num)]]) # reuse code of launch.py cluster, pod = get_cluster_and_pod(args) # copy env & remove useless env vars - current_env = copy.copy(os.environ.copy()) - current_env.pop("http_proxy", None) - current_env.pop("https_proxy", None) + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) # prepare env var assert trainer_num == cluster.trainers_nranks( ), "trainer number parse error." for trainer in pod.trainers: - if trainer.id == trainer_id: + if trainer.rank == trainer_id: proc_env = { "FLAGS_selected_gpus": - "%s" % ",".join([str(g) for g in selected_gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.id, + "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } - current_env.update(proc_env) + os.environ.update(proc_env) break diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 804076f608e714..a6dfc76a833d8b 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -118,47 +118,52 @@ def __init__(self): self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") + self.__aliases__ = { + 'local_rank': 'trainer_id', + 'nranks': 'trainer_num', + 'dev_id': 'devices' + } @property - def nranks(self): + def trainer_id(self): """ - The number of trainers, generally refers to the number of GPU cards used in training. + The current trainer number. - Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. 
The default value is 1. + Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + # execute this command in terminal: export PADDLE_TRAINER_ID=0 import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The nranks is %d" % env.nranks) - # The nranks is 4 + print("The trainer id is %d" % env.trainer_id) + # The trainer id is 0 """ - return self._nranks + return self._local_rank @property - def local_rank(self): + def trainer_num(self): """ - The current trainer number. + The number of trainers, generally refers to the number of GPU cards used in training. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. + Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1. Examples: .. code-block:: python - # execute this command in terminal: export PADDLE_TRAINER_ID=0 + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The local rank is %d" % env.local_rank) - # The local rank is 0 + print("The trainer num is %d" % env.trainer_num) + # The trainer num is 4 """ - return self._local_rank + return self._nranks @property - def dev_id(self): + def devices(self): """ The ID of selected GPU card for parallel training. @@ -171,7 +176,7 @@ def dev_id(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The device id are %d" % env.dev_id) + print("The device id are %d" % env.devices) # The device id are 1 """ return self._dev_id @@ -215,6 +220,12 @@ def trainer_endpoints(self): """ return self._trainer_endpoints + def __getattr__(self, name): + if name == "__aliases__": + raise AttributeError("Attribue `__aliases__` can not be accessed.") + name = self.__aliases__.get(name, name) + return object.__getattribute__(self, name) + # NOTE: [ Compatible ] Originally this class name is `Env`. 
The semantics of the old class names # are inaccurate and may confuse users, so replace it with `ParallelEnv`, but to be compatible From 17f7fe947fe0c217395cac98287621597f1add00 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 10 Aug 2020 05:17:40 +0000 Subject: [PATCH 03/32] add print config arg --- python/paddle/distributed/run.py | 45 +++++++++++++++++------------- python/paddle/distributed/utils.py | 6 ++++ 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 414e24d64782fb..b552f51eed31c3 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -22,7 +22,7 @@ import sys import paddle.fluid as fluid -from paddle.distributed.launch import get_cluster_and_pod +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments def _py_version_check(): @@ -40,6 +40,7 @@ def __init__(self): self.node_ip = None self.use_paddlecloud = None self.started_port = None + self.print_config = True self.selected_gpus = None @@ -94,33 +95,39 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): args.node_ip = kwargs.get('node_ip', "127.0.0.1") args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") args.started_port = kwargs.get('started_port', None) + args.print_config = kwargs.get('print_config', True) args.selected_gpus = ",".join( [str(g) for g in [x for x in range(0, trainer_num)]]) # reuse code of launch.py cluster, pod = get_cluster_and_pod(args) - # copy env & remove useless env vars + # remove useless env vars os.environ.pop("http_proxy", None) os.environ.pop("https_proxy", None) - # prepare env var - - assert trainer_num == cluster.trainers_nranks( - ), "trainer number parse error." - for trainer in pod.trainers: - if trainer.rank == trainer_id: - proc_env = { - "FLAGS_selected_gpus": - "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": - ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - break + # update env vars + if trainer_num != cluster.trainers_nranks(): + raise RuntimeError( + "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." 
+ % (trainer_num, cluster.trainers_nranks())) + trainer = pod.get_trainer(trainer_id) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer id is %d" % + trainer_id) + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + os.environ.update(proc_env) + + # print config + if args.print_config and trainer_id == 0: + _print_arguments(args) def _func_wrapper(func, i, args, error_queue): diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 7c8fa257f778e7..87d0f1546f38d0 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -213,6 +213,12 @@ def get_visible_gpus(self): r = r[:-1] return r + def get_trainer(self, trainer_id): + for trainer in self.trainers: + if trainer.rank == trainer_id: + return trainer + return None + def get_logger(log_level, name="root"): logger = logging.getLogger(name) From 07c86aa64d616ac475b9807f0fbca2aa31791a9b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 11 Aug 2020 04:02:27 +0000 Subject: [PATCH 04/32] refactor init_parallel_env function --- python/paddle/distributed/__init__.py | 7 +- python/paddle/distributed/launch.py | 2 +- python/paddle/distributed/run.py | 100 ------------ python/paddle/fluid/dygraph/parallel.py | 153 ++++++++++++++++-- .../paddle/fluid/dygraph/parallel_helper.py | 5 + python/paddle/fluid/framework.py | 8 + 6 files changed, 160 insertions(+), 115 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index a558358df0f41b..9f1e68c4a24c74 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -13,9 +13,6 @@ # limitations under the License. from . import run -from .run import init_parallel_env, run +from .run import run -__all__ = [ - "init_parallel_env", - "run", -] +__all__ = ["run", ] diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 62d46adffcd1af..43fb91bf7b83a0 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -45,7 +45,7 @@ import copy from argparse import ArgumentParser, REMAINDER import paddle -import paddle.fluid as fluid +from paddle import fluid from paddle.distributed.utils import * from paddle.distributed import cloud_utils diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index b552f51eed31c3..297077db9a505f 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -14,15 +14,11 @@ from __future__ import print_function -import copy import multiprocessing -import os import signal -import six import sys import paddle.fluid as fluid -from paddle.distributed.launch import get_cluster_and_pod, _print_arguments def _py_version_check(): @@ -34,102 +30,6 @@ def _py_version_check(): "`paddle.distributed.launch` instead.") -class ParallelEnvArgs(object): - def __init__(self): - self.cluster_node_ips = None - self.node_ip = None - self.use_paddlecloud = None - self.started_port = None - self.print_config = True - self.selected_gpus = None - - -def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): - """ - - Args: - backend(str, optional): The backend to communication between multiple devices. 
- Now only support `nccl`. Default value is `nccl`. - """ - # 1. input check - if not isinstance(trainer_id, six.integer_types): - raise TypeError( - "input `trainer_id` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(trainer_num, six.integer_types): - raise TypeError( - "input `trainer_num` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(backend, six.string_types): - raise TypeError( - "input `backend` type error, expected type is str, but received type is %s." - % type(trainer_id)) - - if trainer_id < 0: - raise ValueError( - "input `trainer_id` should be greater than 0, but received %d." % - trainer_id) - if trainer_num < 0: - raise ValueError( - "input `trainer_num` should be greater than 0, but received %d." % - trainer_num) - if trainer_id >= trainer_num: - raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." - % (trainer_id, trainer_num)) - if six.ensure_str(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." % - backend) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") - args.node_ip = kwargs.get('node_ip', "127.0.0.1") - args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") - args.started_port = kwargs.get('started_port', None) - args.print_config = kwargs.get('print_config', True) - args.selected_gpus = ",".join( - [str(g) for g in [x for x in range(0, trainer_num)]]) - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - if trainer_num != cluster.trainers_nranks(): - raise RuntimeError( - "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." - % (trainer_num, cluster.trainers_nranks())) - trainer = pod.get_trainer(trainer_id) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer id is %d" % - trainer_id) - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - - # print config - if args.print_config and trainer_id == 0: - _print_arguments(args) - - def _func_wrapper(func, i, args, error_queue): try: func(i, *args) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index a6dfc76a833d8b..c6907c14e26d13 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import six import numpy as np @@ -20,8 +21,11 @@ from . import parallel_helper from .. import framework from . 
import to_variable, no_grad +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments -__all__ = ["prepare_context", "ParallelEnv", "DataParallel"] +__all__ = [ + "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" +] ParallelStrategy = core.ParallelStrategy @@ -43,13 +47,145 @@ def prepare_context(strategy=None): place = framework._current_expected_place() assert place is not None, \ "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard." - if isinstance(place, core.CUDAPlace): + if not parallel_helper._is_parallel_ctx_initialized(): + if isinstance(place, core.CUDAPlace): + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + else: + # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation + assert ("Only support CUDAPlace for now.") + parallel_helper._init_parallel_ctx() + return strategy + + +class ParallelEnvArgs(object): + def __init__(self): + self.cluster_node_ips = None + self.node_ip = None + self.use_paddlecloud = None + self.started_port = None + self.print_config = True + self.selected_gpus = None + self.backend = None + + +def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): + """ + Initialize parallel environments. + + Args: + backend(str, optional): The backend to communication between multiple devices. + Now only support `nccl`. Default value is `nccl`. + + Returns: + ParallelStrategy + + Examples: + + """ + # 1. input check + if not isinstance(trainer_id, six.integer_types): + raise TypeError( + "input `trainer_id` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(trainer_num, six.integer_types): + raise TypeError( + "input `trainer_num` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(backend, six.string_types): + raise TypeError( + "input `backend` type error, expected type is str, but received type is %s." + % type(trainer_id)) + + if trainer_id < 0: + raise ValueError( + "input `trainer_id` should be greater than 0, but received %d." % + trainer_id) + if trainer_num < 0: + raise ValueError( + "input `trainer_num` should be greater than 0, but received %d." % + trainer_num) + if trainer_id >= trainer_num: + raise ValueError( + "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." + % (trainer_id, trainer_num)) + if six.ensure_str(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # 2. 
check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") + args.node_ip = kwargs.get('node_ip', "127.0.0.1") + args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") + args.started_port = kwargs.get('started_port', None) + args.print_config = kwargs.get('print_config', True) + args.selected_gpus = ",".join( + [str(g) for g in [x for x in range(0, trainer_num)]]) + args.backend = backend + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # remove useless env vars + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + # update env vars + if trainer_num != cluster.trainers_nranks(): + raise RuntimeError( + "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." + % (trainer_num, cluster.trainers_nranks())) + trainer = pod.get_trainer(trainer_id) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer id is %d" % + trainer_id) + # why trainer.gpus? here only one device? + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + os.environ.update(proc_env) + + # print config + if args.print_config and trainer_id == 0: + _print_arguments(args) + + # 3. init ParallelStrategy + strategy = ParallelStrategy() + if six.ensure_str(backend) == 'nccl': + strategy.nranks = ParallelEnv().nranks + strategy.local_rank = ParallelEnv().local_rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + # NOTE: [ why config global place here? ] + # the dygraph mode will be set to default mode, + # users will not call `dygraph.guard` or `enable_dygraph` + # directly, if they want to switch detault place, + # they need to call a function to change default place, + # here just set correctly place to users + place = core.CUDAPlace(ParallelEnv().dev_id) + framework._switch_current_place(place) + + # init nccl context parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) - else: - # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation - assert ("Only support CUDAPlace for now.") - parallel_helper._init_parallel_ctx() + parallel_helper._init_parallel_ctx() + return strategy @@ -121,7 +257,6 @@ def __init__(self): self.__aliases__ = { 'local_rank': 'trainer_id', 'nranks': 'trainer_num', - 'dev_id': 'devices' } @property @@ -163,7 +298,7 @@ def trainer_num(self): return self._nranks @property - def devices(self): + def dev_id(self): """ The ID of selected GPU card for parallel training. 
@@ -176,7 +311,7 @@ def devices(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The device id are %d" % env.devices) + print("The device id are %d" % env.dev_id) # The device id are 1 """ return self._dev_id diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py index f378211de2b8a1..ff1675f0ae8a40 100644 --- a/python/paddle/fluid/dygraph/parallel_helper.py +++ b/python/paddle/fluid/dygraph/parallel_helper.py @@ -23,6 +23,11 @@ def _is_data_parallel_mode(): os.getenv("PADDLE_TRAINERS_NUM", "1")) > 1 +def _is_parallel_ctx_initialized(): + global __parallel_ctx__clz__ + return __parallel_ctx__clz__ is not None + + def _set_parallel_ctx(nccl_parallel_context): global __parallel_ctx__clz__ assert __parallel_ctx__clz__ is None, \ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a7faf4041cfe49..3b1ddcea37e0cc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5400,6 +5400,14 @@ def _dygraph_place_guard(place): _dygraph_current_expected_place_ = tmp_place +def _switch_current_place(place): + global _dygraph_tracer_ + global _dygraph_current_expected_place_ + if _dygraph_tracer_ is not None: + _dygraph_tracer_._expected_place = place + _dygraph_current_expected_place_ = place + + def load_op_library(lib_filename): """ :api_attr: Static Graph From 4c955a13a24b5cfd1f430e4a61f16fbcf8dc513a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Aug 2020 13:58:57 +0000 Subject: [PATCH 05/32] Compatible with multiprocessing and launch modes --- python/paddle/fluid/dygraph/parallel.py | 165 ++++++++++++------------ 1 file changed, 85 insertions(+), 80 deletions(-) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index c6907c14e26d13..cba3ecf10f7d8f 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -15,6 +15,7 @@ import os import six import numpy as np +import warnings from collections import OrderedDict from .. import core from . import layers @@ -66,7 +67,6 @@ def __init__(self): self.started_port = None self.print_config = True self.selected_gpus = None - self.backend = None def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): @@ -83,85 +83,90 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): Examples: """ - # 1. input check - if not isinstance(trainer_id, six.integer_types): - raise TypeError( - "input `trainer_id` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(trainer_num, six.integer_types): - raise TypeError( - "input `trainer_num` type error, expected type is integer, but received type is %s." - % type(trainer_id)) - if not isinstance(backend, six.string_types): - raise TypeError( - "input `backend` type error, expected type is str, but received type is %s." - % type(trainer_id)) - - if trainer_id < 0: - raise ValueError( - "input `trainer_id` should be greater than 0, but received %d." % - trainer_id) - if trainer_num < 0: - raise ValueError( - "input `trainer_num` should be greater than 0, but received %d." % - trainer_num) - if trainer_id >= trainer_num: - raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." 
- % (trainer_id, trainer_num)) - if six.ensure_str(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." % - backend) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") - args.node_ip = kwargs.get('node_ip', "127.0.0.1") - args.use_paddlecloud = kwargs.get('use_paddlecloud', "False") - args.started_port = kwargs.get('started_port', None) - args.print_config = kwargs.get('print_config', True) - args.selected_gpus = ",".join( - [str(g) for g in [x for x in range(0, trainer_num)]]) - args.backend = backend - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - if trainer_num != cluster.trainers_nranks(): - raise RuntimeError( - "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." - % (trainer_num, cluster.trainers_nranks())) - trainer = pod.get_trainer(trainer_id) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer id is %d" % - trainer_id) - # why trainer.gpus? here only one device? - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - - # print config - if args.print_config and trainer_id == 0: - _print_arguments(args) + + # NOTE(chenweihang): if trainer_id or trainer_num is default value, + # users should config parallel environment by module `paddle.distributed.launch`, + # so here we skip the environment variables config phase + if trainer_id != -1 or trainer_num != -1: + # 1. input check + if not isinstance(trainer_id, six.integer_types): + raise TypeError( + "input `trainer_id` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(trainer_num, six.integer_types): + raise TypeError( + "input `trainer_num` type error, expected type is integer, but received type is %s." + % type(trainer_id)) + if not isinstance(backend, six.string_types): + raise TypeError( + "input `backend` type error, expected type is str, but received type is %s." + % type(trainer_id)) + + if trainer_id < 0: + raise ValueError( + "input `trainer_id` should be greater than 0, but received %d." + % trainer_id) + if trainer_num < 0: + raise ValueError( + "input `trainer_num` should be greater than 0, but received %d." + % trainer_num) + if trainer_id >= trainer_num: + raise ValueError( + "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." + % (trainer_id, trainer_num)) + if six.ensure_str(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." + % backend) + + # 2. 
check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") + args.node_ip = kwargs.get('node_ip', "127.0.0.1") + args.use_paddlecloud = kwargs.get('use_paddlecloud', False) + args.started_port = kwargs.get('started_port', None) + args.print_config = kwargs.get('print_config', True) + args.selected_gpus = ",".join( + [str(g) for g in [x for x in range(0, trainer_num)]]) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # remove useless env vars + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + # update env vars + if trainer_num != cluster.trainers_nranks(): + raise RuntimeError( + "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." + % (trainer_num, cluster.trainers_nranks())) + trainer = pod.get_trainer(trainer_id) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer id is %d" % + trainer_id) + # why trainer.gpus? here only one device? + proc_env = { + "FLAGS_selected_gpus": + "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + os.environ.update(proc_env) + + # print config + if args.print_config and trainer_id == 0: + _print_arguments(args) # 3. init ParallelStrategy strategy = ParallelStrategy() From 523e007a78847f86297bbabf04dd812246f45f72 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 14 Aug 2020 08:36:33 +0000 Subject: [PATCH 06/32] set default trainer start port --- python/paddle/distributed/run.py | 36 ++++++++++++---- python/paddle/fluid/dygraph/parallel.py | 55 +++++++++++++++++-------- 2 files changed, 66 insertions(+), 25 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 297077db9a505f..85b495cef37fb8 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -15,19 +15,32 @@ from __future__ import print_function import multiprocessing +import os import signal import sys +import warnings import paddle.fluid as fluid +from paddle.distributed.utils import find_free_ports -def _py_version_check(): +def _support_set_start_method(): if not sys.version_info >= (3, 4): - raise RuntimeError( - "Use `paddle.distributed.run` to start parallel training " - "requires python version greater than 3.4, if your python " - "is lower than this version, please use " - "`paddle.distributed.launch` instead.") + warnings.warn( + "`paddle.distributed.run` only supports setting the process" + " start when python version greater than 3.4, if your python" + " is lower than this version, only can start processes by" + " default method of current platform.") + + +def _set_default_master_env(): + # set default master trainer ip addr + os.environ['PADDLE_MASTER_IPADDR'] = '127.0.0.1' + # set default master trainer port + port_set = find_free_ports(1) + if port_set is None: + raise RuntimeError("no free port can be used to parallel training now.") + os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) def _func_wrapper(func, i, args, error_queue): @@ -43,7 +56,7 @@ def _func_wrapper(func, 
i, args, error_queue): class MultiprocessContext(object): def __init__(self, processes, error_queues): - _py_version_check() + _support_set_start_method() self.error_queues = error_queues self.processes = processes self.sentinels = { @@ -97,6 +110,15 @@ def launch_processes(func, join=True, daemon=False, start_method='spawn'): + # NOTE(chenweihang): [ why need set default master info before run? ] + # when using `paddle.distributed.run` start parallel training, + # users need use `init_parallel_env` to config some cluster info + # inner subprocess, if each process find free port for itself, + # the started port may be different, it will cause endpoints is + # different in different subprocesses + _set_default_master_env() + + # start processes mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index cba3ecf10f7d8f..73c8d7c618b476 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -91,29 +91,27 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): # 1. input check if not isinstance(trainer_id, six.integer_types): raise TypeError( - "input `trainer_id` type error, expected type is integer, but received type is %s." - % type(trainer_id)) + "input `trainer_id` type error, expected type is integer, " + "but received type is %s." % type(trainer_id)) if not isinstance(trainer_num, six.integer_types): raise TypeError( - "input `trainer_num` type error, expected type is integer, but received type is %s." - % type(trainer_id)) + "input `trainer_num` type error, expected type is integer, " + "but received type is %s." % type(trainer_id)) if not isinstance(backend, six.string_types): - raise TypeError( - "input `backend` type error, expected type is str, but received type is %s." - % type(trainer_id)) + raise TypeError("input `backend` type error, expected type is str, " + "but received type is %s." % type(trainer_id)) if trainer_id < 0: - raise ValueError( - "input `trainer_id` should be greater than 0, but received %d." - % trainer_id) + raise ValueError("input `trainer_id` should be greater than 0, " + "but received %d." % trainer_id) if trainer_num < 0: - raise ValueError( - "input `trainer_num` should be greater than 0, but received %d." - % trainer_num) + raise ValueError("input `trainer_num` should be greater than 0, " + "but received %d." % trainer_num) if trainer_id >= trainer_num: raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, but `trainer_id` is %d, `trainer_num` is %d." - % (trainer_id, trainer_num)) + "input `trainer_id` should be less than or equal to `trainer_num`, " + "but `trainer_id` is %d, `trainer_num` is %d." % + (trainer_id, trainer_num)) if six.ensure_str(backend) != 'nccl': raise ValueError( "backend `%s` is not supported, now only supports `nccl` backend." 
@@ -128,10 +126,31 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): # get args from kwargs args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', "127.0.0.1") - args.node_ip = kwargs.get('node_ip', "127.0.0.1") - args.use_paddlecloud = kwargs.get('use_paddlecloud', False) + args.cluster_node_ips = kwargs.get('cluster_node_ips', None) + args.node_ip = kwargs.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot `cluster_node_ips`.") + default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) + default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + + # NOTE(chenweihang): Here should set started_port before + # `get_cluster_and_pod` and keep each process's started_port + # is same, see [ why need set default master info before run? ] args.started_port = kwargs.get('started_port', None) + if args.started_port is None: + default_port = os.environ.get("PADDLE_MASTER_PORT", None) + if default_port is None: + raise RuntimeError( + "please input start port of parallel training by `started_port=**`." + ) + args.started_port = default_port + + args.use_paddlecloud = kwargs.get('use_paddlecloud', False) args.print_config = kwargs.get('print_config', True) args.selected_gpus = ",".join( [str(g) for g in [x for x in range(0, trainer_num)]]) From 8101b035ebf8000de868fda45a083d252a648da8 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 15 Aug 2020 14:54:34 +0000 Subject: [PATCH 07/32] support run in python 2 --- python/paddle/distributed/run.py | 67 ++++++++++++++++++++++--- python/paddle/fluid/dygraph/parallel.py | 4 +- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 85b495cef37fb8..0153f96fc42385 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -31,6 +31,14 @@ def _support_set_start_method(): " start when python version greater than 3.4, if your python" " is lower than this version, only can start processes by" " default method of current platform.") + return False + return True + + +def _support_connection_wait(): + if not sys.version_info >= (3, 3): + return False + return True def _set_default_master_env(): @@ -56,15 +64,25 @@ def _func_wrapper(func, i, args, error_queue): class MultiprocessContext(object): def __init__(self, processes, error_queues): - _support_set_start_method() self.error_queues = error_queues self.processes = processes - self.sentinels = { - process.sentinel: index - for index, process in enumerate(processes) - } + # NOTE(chenweihang): multiprocessing.connection.wait is a new feature + # supported from python3.3, which can provide more fine-grained support + # for multi-subprocess exit monitoring. 
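+        # when connection.wait is unavailable (python < 3.3), join() falls
+        # back to polling each subprocess with a short join timeout, see
+        # _join_without_conn_wait below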
+ self.use_connection_wait = _support_connection_wait() + if self.use_connection_wait: + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } def join(self, timeout=None): + if self.use_connection_wait: + return self._join_with_conn_wait(timeout) + else: + return self._join_without_conn_wait(timeout) + + def _join_with_conn_wait(self, timeout=None): if len(self.sentinels) == 0: return True @@ -83,6 +101,39 @@ def join(self, timeout=None): if error_index is None: return len(self.sentinels) == 0 + self._join_and_throw_exception(error_index) + + # NOTE(chenweihng): This method is not as efficient as connection.wait. + # Beccause if process has already stopped, p.join() will return immediately. + # If process hasn't stopped, it will block until the process end, and + # as the same time, the other process may have been end, which makes it + # impossible for us to accurately capture the first failed process. + # Here we avoid process block by setting timeout, but if the other process + # exit before the timeout end, we will also encounter the previois problem, + # but maybe there is no better way here. When we fully migrated to python3, + # this problem disappeared + def _join_without_conn_wait(self, timeout=None): + finished_processes = [] + error_index = None + for index, proccess in enumerate(self.processes): + # try to join selected process + proccess.join(timeout=1) + # This will be None if the process has not yet terminated + if process.exitcode is not None: + # exit with exception + if process.exitcode == 0: + finished_processes.append(index) + else: + error_index = index + failed_processes.append(index) + break + + if error_index is None: + return len(finished_processes) == 0 + + self._join_and_throw_exception(error_index) + + def _join_and_throw_exception(self, error_index): for process in self.processes: if process.is_alive(): process.terminate() @@ -119,7 +170,11 @@ def launch_processes(func, _set_default_master_env() # start processes - mp = multiprocessing.get_context(start_method) + if _support_set_start_method(): + mp = multiprocessing.get_context(start_method) + else: + mp = multiprocessing + error_queues = [] processes = [] for i in range(nprocs): diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 73c8d7c618b476..6779a6c8edee3a 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -148,10 +148,10 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): raise RuntimeError( "please input start port of parallel training by `started_port=**`." 
) - args.started_port = default_port + args.started_port = int(default_port) args.use_paddlecloud = kwargs.get('use_paddlecloud', False) - args.print_config = kwargs.get('print_config', True) + args.print_config = kwargs.get('print_config', False) args.selected_gpus = ",".join( [str(g) for g in [x for x in range(0, trainer_num)]]) From d3b9a065e8b7f464400fc5c3ab906b0e610269ce Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 17 Aug 2020 06:37:14 +0000 Subject: [PATCH 08/32] polish python2 support code --- python/paddle/distributed/launch.py | 5 ++-- python/paddle/distributed/run.py | 32 ++++++++++++++++--------- python/paddle/fluid/dygraph/parallel.py | 16 +++++++------ 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 43fb91bf7b83a0..8a093abb3766ed 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -44,9 +44,8 @@ import six import copy from argparse import ArgumentParser, REMAINDER -import paddle -from paddle import fluid +from paddle.fluid import core from paddle.distributed.utils import * from paddle.distributed import cloud_utils @@ -167,7 +166,7 @@ def get_cluster_from_args(args, selected_gpus): def get_gpus(selected_gpus): if selected_gpus is None: - gpus_num = fluid.core.get_cuda_device_count() + gpus_num = core.get_cuda_device_count() selected_gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index 0153f96fc42385..b64968cf4741ec 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -12,17 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function +from __future__ import print_function, division import multiprocessing import os import signal +import six import sys import warnings import paddle.fluid as fluid from paddle.distributed.utils import find_free_ports +# SimpleQueue is different in py2 and py3 +if six.PY2: + import multiprocessing.queues as queues_py2 + def _support_set_start_method(): if not sys.version_info >= (3, 4): @@ -115,9 +120,10 @@ def _join_with_conn_wait(self, timeout=None): def _join_without_conn_wait(self, timeout=None): finished_processes = [] error_index = None - for index, proccess in enumerate(self.processes): + timeout = timeout // len(self.processes) if timeout else 1 + for index, process in enumerate(self.processes): # try to join selected process - proccess.join(timeout=1) + process.join(timeout=timeout) # This will be None if the process has not yet terminated if process.exitcode is not None: # exit with exception @@ -125,11 +131,11 @@ def _join_without_conn_wait(self, timeout=None): finished_processes.append(index) else: error_index = index - failed_processes.append(index) + finished_processes.append(index) break if error_index is None: - return len(finished_processes) == 0 + return len(finished_processes) == len(self.processes) self._join_and_throw_exception(error_index) @@ -146,11 +152,13 @@ def _join_and_throw_exception(self, error_index): raise Exception("Process %d terminated with signal %s." % (error_index, name)) else: - raise Exception("Process %d terminated with exit code %s." & ( + raise Exception("Process %d terminated with exit code %d." 
& ( error_index, exitcode)) original_trace = self.error_queues[error_index].get() - msg = "\n\n-- Procces %d terminated with the following error:\n" % error_index + msg = "\n\n----------------------------------------------\n" \ + "Procces %d terminated with the following error:\n" \ + "----------------------------------------------\n\n" % error_index msg += original_trace raise Exception(msg) @@ -178,11 +186,13 @@ def launch_processes(func, error_queues = [] processes = [] for i in range(nprocs): - error_queue = mp.SimpleQueue() + if six.PY2: + error_queue = queues_py2.SimpleQueue() + else: + error_queue = mp.SimpleQueue() process = mp.Process( - target=_func_wrapper, - args=(func, i, args, error_queue), - daemon=daemon) + target=_func_wrapper, args=(func, i, args, error_queue)) + process.daemon = daemon process.start() error_queues.append(error_queue) processes.append(process) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 6779a6c8edee3a..382e9ae6b809d2 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -17,11 +17,13 @@ import numpy as np import warnings from collections import OrderedDict -from .. import core -from . import layers -from . import parallel_helper -from .. import framework -from . import to_variable, no_grad + +from paddle import compat as cpt +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.dygraph import layers +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph import to_variable, no_grad from paddle.distributed.launch import get_cluster_and_pod, _print_arguments __all__ = [ @@ -112,7 +114,7 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): "input `trainer_id` should be less than or equal to `trainer_num`, " "but `trainer_id` is %d, `trainer_num` is %d." % (trainer_id, trainer_num)) - if six.ensure_str(backend) != 'nccl': + if cpt.to_text(backend) != 'nccl': raise ValueError( "backend `%s` is not supported, now only supports `nccl` backend." % backend) @@ -189,7 +191,7 @@ def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): # 3. 
init ParallelStrategy strategy = ParallelStrategy() - if six.ensure_str(backend) == 'nccl': + if cpt.to_text(backend) == 'nccl': strategy.nranks = ParallelEnv().nranks strategy.local_rank = ParallelEnv().local_rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints From 48c46ff43dfdfadcf2d9f453071baf4f7fbe56b5 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 17 Aug 2020 07:23:34 +0000 Subject: [PATCH 09/32] remove python2 support --- python/paddle/distributed/run.py | 103 ++++++++----------------------- 1 file changed, 27 insertions(+), 76 deletions(-) diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/run.py index b64968cf4741ec..561404453fb8a6 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/run.py @@ -17,33 +17,20 @@ import multiprocessing import os import signal -import six import sys import warnings import paddle.fluid as fluid from paddle.distributed.utils import find_free_ports -# SimpleQueue is different in py2 and py3 -if six.PY2: - import multiprocessing.queues as queues_py2 - -def _support_set_start_method(): +def _py_supported_check(): if not sys.version_info >= (3, 4): - warnings.warn( - "`paddle.distributed.run` only supports setting the process" - " start when python version greater than 3.4, if your python" - " is lower than this version, only can start processes by" - " default method of current platform.") - return False - return True - - -def _support_connection_wait(): - if not sys.version_info >= (3, 3): - return False - return True + raise RuntimeError( + "Use `paddle.distributed.run` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " + "`paddle.distributed.launch` instead.") def _set_default_master_env(): @@ -69,25 +56,15 @@ def _func_wrapper(func, i, args, error_queue): class MultiprocessContext(object): def __init__(self, processes, error_queues): + _py_supported_check() self.error_queues = error_queues self.processes = processes - # NOTE(chenweihang): multiprocessing.connection.wait is a new feature - # supported from python3.3, which can provide more fine-grained support - # for multi-subprocess exit monitoring. - self.use_connection_wait = _support_connection_wait() - if self.use_connection_wait: - self.sentinels = { - process.sentinel: index - for index, process in enumerate(processes) - } + self.sentinels = { + process.sentinel: index + for index, process in enumerate(processes) + } def join(self, timeout=None): - if self.use_connection_wait: - return self._join_with_conn_wait(timeout) - else: - return self._join_without_conn_wait(timeout) - - def _join_with_conn_wait(self, timeout=None): if len(self.sentinels) == 0: return True @@ -106,45 +83,14 @@ def _join_with_conn_wait(self, timeout=None): if error_index is None: return len(self.sentinels) == 0 - self._join_and_throw_exception(error_index) - - # NOTE(chenweihng): This method is not as efficient as connection.wait. - # Beccause if process has already stopped, p.join() will return immediately. - # If process hasn't stopped, it will block until the process end, and - # as the same time, the other process may have been end, which makes it - # impossible for us to accurately capture the first failed process. - # Here we avoid process block by setting timeout, but if the other process - # exit before the timeout end, we will also encounter the previois problem, - # but maybe there is no better way here. 
When we fully migrated to python3, - # this problem disappeared - def _join_without_conn_wait(self, timeout=None): - finished_processes = [] - error_index = None - timeout = timeout // len(self.processes) if timeout else 1 - for index, process in enumerate(self.processes): - # try to join selected process - process.join(timeout=timeout) - # This will be None if the process has not yet terminated - if process.exitcode is not None: - # exit with exception - if process.exitcode == 0: - finished_processes.append(index) - else: - error_index = index - finished_processes.append(index) - break - - if error_index is None: - return len(finished_processes) == len(self.processes) - - self._join_and_throw_exception(error_index) - - def _join_and_throw_exception(self, error_index): for process in self.processes: if process.is_alive(): process.terminate() process.join() + self._throw_exception(error_index) + + def _throw_exception(self, error_index): if self.error_queues[error_index].empty(): exitcode = self.processes[error_index].exitcode if exitcode < 0: @@ -163,12 +109,23 @@ def _join_and_throw_exception(self, error_index): raise Exception(msg) +# NOTE(chenweihang): [ why default start method is spawn? ] +# The CUDA runtime does not support the fork start method, +# either the spawn or forkserver start method are required +# to use CUDA in subprocesses. def launch_processes(func, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): + # NOTE(chenweihang): [ why only supports python3.4+? ] + # Python has only supported setting the child process startup method + # since 3.4. The previous version can only use the default startup + # method, while the default startup method of Unix is fork, which + # cannot support CUDA runtime multi-process + _py_supported_check() + # NOTE(chenweihang): [ why need set default master info before run? 
] # when using `paddle.distributed.run` start parallel training, # users need use `init_parallel_env` to config some cluster info @@ -178,18 +135,12 @@ def launch_processes(func, _set_default_master_env() # start processes - if _support_set_start_method(): - mp = multiprocessing.get_context(start_method) - else: - mp = multiprocessing + mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] for i in range(nprocs): - if six.PY2: - error_queue = queues_py2.SimpleQueue() - else: - error_queue = mp.SimpleQueue() + error_queue = mp.SimpleQueue() process = mp.Process( target=_func_wrapper, args=(func, i, args, error_queue)) process.daemon = daemon From b06d40050b95a880977a2185a517c6fd33dbf362 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 19 Aug 2020 05:59:40 +0000 Subject: [PATCH 10/32] refine launch import --- python/paddle/distributed/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 8a093abb3766ed..e2ab321f9aebdd 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -45,7 +45,6 @@ import copy from argparse import ArgumentParser, REMAINDER -from paddle.fluid import core from paddle.distributed.utils import * from paddle.distributed import cloud_utils @@ -166,6 +165,7 @@ def get_cluster_from_args(args, selected_gpus): def get_gpus(selected_gpus): if selected_gpus is None: + from paddle.fluid import core gpus_num = core.get_cuda_device_count() selected_gpus = [str(x) for x in range(0, gpus_num)] else: From 2c7b3fd58a57edf09b0b79e477bb507c279f57b7 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 19 Aug 2020 12:55:00 +0000 Subject: [PATCH 11/32] polish dome design details --- python/paddle/distributed/__init__.py | 7 ++++--- .../paddle/distributed/{run.py => spawn.py} | 19 +++++++++++-------- python/paddle/fluid/dygraph/parallel.py | 5 ++++- python/paddle/framework/__init__.py | 6 +++--- 4 files changed, 22 insertions(+), 15 deletions(-) rename python/paddle/distributed/{run.py => spawn.py} (91%) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 9f1e68c4a24c74..ec181cdd0f2803 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import run -from .run import run +from . import spawn +from .spawn import spwan +from .spawn import start_processes -__all__ = ["run", ] +__all__ = ["spawn", "start_processes"] diff --git a/python/paddle/distributed/run.py b/python/paddle/distributed/spawn.py similarity index 91% rename from python/paddle/distributed/run.py rename to python/paddle/distributed/spawn.py index 561404453fb8a6..ef2e81e50769a4 100644 --- a/python/paddle/distributed/run.py +++ b/python/paddle/distributed/spawn.py @@ -113,12 +113,12 @@ def _throw_exception(self, error_index): # The CUDA runtime does not support the fork start method, # either the spawn or forkserver start method are required # to use CUDA in subprocesses. -def launch_processes(func, - args=(), - nprocs=1, - join=True, - daemon=False, - start_method='spawn'): +def start_processes(func, + args=(), + nprocs=1, + join=True, + daemon=False, + start_method='spawn'): # NOTE(chenweihang): [ why only supports python3.4+? ] # Python has only supported setting the child process startup method # since 3.4. 
The previous version can only use the default startup @@ -157,5 +157,8 @@ def launch_processes(func, pass -def run(func, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): - return launch_processes(func, args, nprocs, join, daemon, start_method) +# NOTE(chenweihang): this method only supports start processes +# by `spwan` method, if users want to start processes by other +# method, they can use start_processes +def spawn(func, args=(), nprocs=1, join=True, daemon=False): + return launch_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 29deace1618cd6..ce3ce7944de33c 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -33,6 +33,7 @@ ParallelStrategy = core.ParallelStrategy +@deprecated(since="2.0.0", update_to="paddle.init_parallel_env") def prepare_context(strategy=None): ''' :api_attr: imperative @@ -71,13 +72,15 @@ def __init__(self): self.selected_gpus = None -def init_parallel_env(trainer_id=-1, trainer_num=-1, backend='nccl', **kwargs): +def init_parallel_env(rank=-1, backend='nccl', **kwargs): """ Initialize parallel environments. Args: + rank(int, optional): Rank of current process. Default vaule is -1. backend(str, optional): The backend to communication between multiple devices. Now only support `nccl`. Default value is `nccl`. + **options(dict, optional): Other initial parallel execution environment configuration. Returns: ParallelStrategy diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f01dc01973a603..aead17e2da152e 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,9 +50,9 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS -from ..fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS -from ..fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS -from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS +from . import prepare_context +from . import ParallelEnv +from . 
import DataParallel from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS From d26f495dd53996ee67200df9cac229860fee5c5d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 20 Aug 2020 12:40:59 +0000 Subject: [PATCH 12/32] refactor api implemention & path --- python/paddle/__init__.py | 3 - python/paddle/distributed/__init__.py | 16 +- python/paddle/distributed/parallel.py | 222 ++++++++++++++++++++++++ python/paddle/distributed/spawn.py | 15 +- python/paddle/fluid/dygraph/parallel.py | 181 ++----------------- python/paddle/framework/__init__.py | 3 - 6 files changed, 257 insertions(+), 183 deletions(-) create mode 100644 python/paddle/distributed/parallel.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 518e2c0c4d90da..73213054fe1646 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -216,9 +216,6 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS -from .framework import prepare_context #DEFINE_ALIAS -from .framework import ParallelEnv #DEFINE_ALIAS -from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS from .framework import PiecewiseDecay #DEFINE_ALIAS diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index ec181cdd0f2803..c40902010bb2a2 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,8 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +# start multiprocess apis +__all__ = ["spawn", "start_processes"] + +# dygraph parallel apis +__all__ += [ + "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" +] + from . import spawn -from .spawn import spwan +from .spawn import spawn from .spawn import start_processes -__all__ = ["spawn", "start_processes"] +from . import parallel +from .parallel import init_parallel_env +from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS +from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS +from paddle.fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py new file mode 100644 index 00000000000000..ee1eac01a22014 --- /dev/null +++ b/python/paddle/distributed/parallel.py @@ -0,0 +1,222 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except jin compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
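+#
+# NOTE: a minimal usage sketch of how this module is meant to be driven at
+# this point in the series (the network, shapes and nprocs value below are
+# illustrative only and are not part of this file):
+#
+#     import paddle
+#     import paddle.nn as nn
+#     import paddle.distributed as dist
+#
+#     def train(rank):
+#         paddle.disable_static()
+#         # fills PADDLE_TRAINER_ID / PADDLE_TRAINER_ENDPOINTS etc. and
+#         # returns a core.ParallelStrategy for DataParallel
+#         strategy = dist.init_parallel_env(rank)
+#         layer = dist.DataParallel(nn.Linear(10, 1), strategy)
+#         out = layer(paddle.randn([4, 10], 'float32'))
+#
+#     if __name__ == '__main__':
+#         dist.spawn(train, args=(), nprocs=2)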
+ +import os +import six + +from paddle import compat as cpt +from paddle.distributed.launch import _parse_args, get_cluster_and_pod, _print_arguments + +# deprecated module import +from paddle.fluid import core +from paddle.fluid.framework import _switch_current_place +from paddle.fluid.dygraph import parallel_helper +from paddle.fluid.dygraph.parallel import ParallelEnv + +__all__ = ["init_parallel_env"] + +ParallelStrategy = core.ParallelStrategy + + +# NOTE(chenweihang): The existence of this class leads to +# the maintenance of two arguments. When the launch.py arguments +# is updated, the arguments here also need to be updated, +# but I have not thought of a better way here +class ParallelEnvArgs(object): + def __init__(self): + # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. + self.cluster_node_ips = None + + # The current node ip. + self.node_ip = None + + # wheter to use paddlecloud platform to run your multi-process job. + # If false, no need to set this argument. + self.use_paddlecloud = None + + # The trainer's started port on a single node + self.started_port = None + + # Print the config or not + self.print_config = True + + # It's for gpu training and the training process will run + # on the selected_gpus, each process is bound to a single GPU. + # And if it's not set, this module will use all the gpu cards + # for training. + self.selected_gpus = None + + +def _update_env_vars(rank, options): + # 1. input check + if not isinstance(rank, six.integer_types): + raise TypeError("input `rank` type error, expected type is integer, " + "but received type is %s." % type(rank)) + if rank < 0: + raise ValueError("input `rank` should be greater than 0, " + "but received %d." % rank) + + # 2. check and prepare environment variables + # The necessary environment variables include: + # - PADDLE_TRAINER_ID + # - PADDLE_TRAINERS_NUM + # - PADDLE_CURRENT_ENDPOINT + # - PADDLE_TRAINER_ENDPOINTS + + # get args from kwargs + args = ParallelEnvArgs() + # set default `node_ip` and `cluster_node_ips` + args.cluster_node_ips = options.get('cluster_node_ips', None) + args.node_ip = options.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot only give `cluster_node_ips`.") + default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) + default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + + # NOTE(chenweihang): Here should set `started_port` before + # `get_cluster_and_pod` and keep each process's started_port + # is same, see [ why need set default master info before run? ] + args.started_port = options.get('started_port', None) + if args.started_port is None: + default_port = os.environ.get("PADDLE_MASTER_PORT", None) + if default_port is None: + raise RuntimeError( + "please input start port of parallel training by `started_port=**`," + "e.g. 
started_port=6170") + args.started_port = int(default_port) + + args.use_paddlecloud = options.get('use_paddlecloud', False) + args.print_config = options.get('print_config', True) + + # set default `selected_gpus` + # TODO(chenweihang): if users gived number of `selected_gpus` + # is not equal to the spawn's nprocs, it will cause error, + # and because we remove the `proc num` argument of + # `init_parallel_env`, when above error occured, we do not + # have a good way to check, so users are not recommended to + # use this parameter, it is best to delete + args.selected_gpus = options.get('selected_gpus', None) + if args.selected_gpus is None: + args.selected_gpus = os.environ.get("PADDLE_CUDA_VISIBLE_DEVICES", None) + if args.selected_gpus is None: + raise ValueError( + "please input selected gpus of parallel training by `selected_gpus=**`," + "e.g. selected_gpus='0,1,2,3'.", ) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + + # remove useless env vars + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + # update env vars + trainer = pod.get_trainer(rank) + if trainer is None: + raise RuntimeError( + "The expected trainer is not exists, its trainer rank is %d" % rank) + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + # no copy, each process will hold env vars itself + os.environ.update(proc_env) + + # print config + if args.print_config and rank == 0: + _print_arguments(args) + + +def _check_env_vars(): + def _check_var_exists(var_name): + var = os.environ.get(var_name, None) + if var is None: + raise ValueError("paddle.distributed initialize error," + "Environment variable %s is needed, but not set.", + var_name) + + _check_var_exists("FLAGS_selected_gpus") + _check_var_exists("PADDLE_TRAINER_ID") + _check_var_exists("PADDLE_CURRENT_ENDPOINT") + _check_var_exists("PADDLE_TRAINERS_NUM") + _check_var_exists("PADDLE_TRAINER_ENDPOINTS") + + +def init_parallel_env(rank=-1, backend='nccl', **options): + """ + Initialize parallel environments. + + Args: + rank(int, optional): Rank of current process. Default vaule is -1. + backend(str, optional): The backend to communication between multiple devices. + Now only support `nccl`. Default value is `nccl`. + **options(dict, optional): Other initial parallel execution environment configuration. + + Returns: + ParallelStrategy + + Examples: + + """ + + # 1. input check + if not isinstance(backend, six.string_types): + raise TypeError("input `backend` type error, expected type is str, " + "but received type is %s." % type(backend)) + if cpt.to_text(backend) != 'nccl': + raise ValueError( + "backend `%s` is not supported, now only supports `nccl` backend." % + backend) + + # update or check env + # NOTE(chenweihang): if rank is default value, users should config + # parallel environment by module `paddle.distributed.launch`, + # so here we only check the environment variables + if rank != -1: + _update_env_vars(rank, options) + else: + _check_env_vars() + + # 3. 
init ParallelStrategy + strategy = ParallelStrategy() + if cpt.to_text(backend) == 'nccl': + strategy.nranks = ParallelEnv().nranks + strategy.local_rank = ParallelEnv().local_rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + # NOTE(chenweihang): [ why config global place here? ] + # the dygraph mode will be set to default mode, + # users will not call `dygraph.guard` or `enable_dygraph` + # directly, if they want to switch detault place, + # they need to call a function to change default place, + # here just set correctly place to users + place = core.CUDAPlace(ParallelEnv().dev_id) + _switch_current_place(place) + + # init nccl context + parallel_helper._set_parallel_ctx( + core.NCCLParallelContext(strategy, place)) + parallel_helper._init_parallel_ctx() + + return strategy diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index ef2e81e50769a4..1c6b2682970f17 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -20,7 +20,6 @@ import sys import warnings -import paddle.fluid as fluid from paddle.distributed.utils import find_free_ports @@ -33,7 +32,7 @@ def _py_supported_check(): "`paddle.distributed.launch` instead.") -def _set_default_master_env(): +def _set_default_assist_env(nprocs): # set default master trainer ip addr os.environ['PADDLE_MASTER_IPADDR'] = '127.0.0.1' # set default master trainer port @@ -41,6 +40,14 @@ def _set_default_master_env(): if port_set is None: raise RuntimeError("no free port can be used to parallel training now.") os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) + # set default selected_gpus + # e.g. if the nprocs is 4, the selected_gpus="0,1,2,3" + # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? 
] + # because the FLAGS_selected_gpus may be used in other place, + # if we set FLAGS_selected_gpus are `0,1,2,3`, it may cause error + # when using `ParallelEnv` + os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( + [str(x) for x in range(0, nprocs)]) def _func_wrapper(func, i, args, error_queue): @@ -132,7 +139,7 @@ def start_processes(func, # inner subprocess, if each process find free port for itself, # the started port may be different, it will cause endpoints is # different in different subprocesses - _set_default_master_env() + _set_default_assist_env(nprocs) # start processes mp = multiprocessing.get_context(start_method) @@ -161,4 +168,4 @@ def start_processes(func, # by `spwan` method, if users want to start processes by other # method, they can use start_processes def spawn(func, args=(), nprocs=1, join=True, daemon=False): - return launch_processes(func, args, nprocs, join, daemon, 'spawn') + return start_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index d6849bfab0e006..ee2aba6d47308e 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -24,16 +24,14 @@ from paddle.fluid.dygraph import layers from paddle.fluid.dygraph import parallel_helper from paddle.fluid.dygraph import to_variable, no_grad -from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.utils import deprecated -__all__ = [ - "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" -] +__all__ = ["prepare_context", "ParallelEnv", "DataParallel"] ParallelStrategy = core.ParallelStrategy -@deprecated(since="2.0.0", update_to="paddle.init_parallel_env") +@deprecated(since="2.0.0", update_to="paddle.distributed.init_parallel_env") def prepare_context(strategy=None): ''' :api_attr: imperative @@ -62,162 +60,6 @@ def prepare_context(strategy=None): return strategy -class ParallelEnvArgs(object): - def __init__(self): - self.cluster_node_ips = None - self.node_ip = None - self.use_paddlecloud = None - self.started_port = None - self.print_config = True - self.selected_gpus = None - - -def init_parallel_env(rank=-1, backend='nccl', **kwargs): - """ - Initialize parallel environments. - - Args: - rank(int, optional): Rank of current process. Default vaule is -1. - backend(str, optional): The backend to communication between multiple devices. - Now only support `nccl`. Default value is `nccl`. - **options(dict, optional): Other initial parallel execution environment configuration. - - Returns: - ParallelStrategy - - Examples: - - """ - - # NOTE(chenweihang): if trainer_id or trainer_num is default value, - # users should config parallel environment by module `paddle.distributed.launch`, - # so here we skip the environment variables config phase - if trainer_id != -1 or trainer_num != -1: - # 1. input check - if not isinstance(trainer_id, six.integer_types): - raise TypeError( - "input `trainer_id` type error, expected type is integer, " - "but received type is %s." % type(trainer_id)) - if not isinstance(trainer_num, six.integer_types): - raise TypeError( - "input `trainer_num` type error, expected type is integer, " - "but received type is %s." % type(trainer_id)) - if not isinstance(backend, six.string_types): - raise TypeError("input `backend` type error, expected type is str, " - "but received type is %s." 
% type(trainer_id)) - - if trainer_id < 0: - raise ValueError("input `trainer_id` should be greater than 0, " - "but received %d." % trainer_id) - if trainer_num < 0: - raise ValueError("input `trainer_num` should be greater than 0, " - "but received %d." % trainer_num) - if trainer_id >= trainer_num: - raise ValueError( - "input `trainer_id` should be less than or equal to `trainer_num`, " - "but `trainer_id` is %d, `trainer_num` is %d." % - (trainer_id, trainer_num)) - if cpt.to_text(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." - % backend) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - args.cluster_node_ips = kwargs.get('cluster_node_ips', None) - args.node_ip = kwargs.get('node_ip', None) - if args.cluster_node_ips is not None and args.node_ip is None: - raise ValueError("please input current node ip, " - "cannot `cluster_node_ips`.") - default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) - default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip - if args.node_ip is None: - args.node_ip = default_node_ip - if args.cluster_node_ips is None: - args.cluster_node_ips = default_node_ip - - # NOTE(chenweihang): Here should set started_port before - # `get_cluster_and_pod` and keep each process's started_port - # is same, see [ why need set default master info before run? ] - args.started_port = kwargs.get('started_port', None) - if args.started_port is None: - default_port = os.environ.get("PADDLE_MASTER_PORT", None) - if default_port is None: - raise RuntimeError( - "please input start port of parallel training by `started_port=**`." - ) - args.started_port = int(default_port) - - args.use_paddlecloud = kwargs.get('use_paddlecloud', False) - args.print_config = kwargs.get('print_config', False) - args.selected_gpus = ",".join( - [str(g) for g in [x for x in range(0, trainer_num)]]) - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - if trainer_num != cluster.trainers_nranks(): - raise RuntimeError( - "The number of trainers does not meet expectations, expected number is %d, but actual number is %d." - % (trainer_num, cluster.trainers_nranks())) - trainer = pod.get_trainer(trainer_id) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer id is %d" % - trainer_id) - # why trainer.gpus? here only one device? - proc_env = { - "FLAGS_selected_gpus": - "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - os.environ.update(proc_env) - - # print config - if args.print_config and trainer_id == 0: - _print_arguments(args) - - # 3. 
init ParallelStrategy - strategy = ParallelStrategy() - if cpt.to_text(backend) == 'nccl': - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.nranks < 2: - return - # NOTE: [ why config global place here? ] - # the dygraph mode will be set to default mode, - # users will not call `dygraph.guard` or `enable_dygraph` - # directly, if they want to switch detault place, - # they need to call a function to change default place, - # here just set correctly place to users - place = core.CUDAPlace(ParallelEnv().dev_id) - framework._switch_current_place(place) - - # init nccl context - parallel_helper._set_parallel_ctx( - core.NCCLParallelContext(strategy, place)) - parallel_helper._init_parallel_ctx() - - return strategy - - class ParallelEnv(object): """ **Notes**: @@ -283,13 +125,10 @@ def __init__(self): self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self.__aliases__ = { - 'local_rank': 'trainer_id', - 'nranks': 'trainer_num', - } + self.__aliases__ = {'local_rank': 'rank', } @property - def trainer_id(self): + def rank(self): """ The current trainer number. @@ -302,13 +141,13 @@ def trainer_id(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The trainer id is %d" % env.trainer_id) - # The trainer id is 0 + print("The rank is %d" % env.rank) + # The rank is 0 """ return self._local_rank @property - def trainer_num(self): + def nranks(self): """ The number of trainers, generally refers to the number of GPU cards used in training. @@ -321,8 +160,8 @@ def trainer_num(self): import paddle.fluid as fluid env = fluid.dygraph.ParallelEnv() - print("The trainer num is %d" % env.trainer_num) - # The trainer num is 4 + print("The nranks is %d" % env.nranks) + # The nranks is 4 """ return self._nranks diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index aead17e2da152e..4b348ea729ef33 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,9 +50,6 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS -from . import prepare_context -from . import ParallelEnv -from . 
import DataParallel from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS From bf985ccba4236b1d41b74c9051486298f86cf9df Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 20 Aug 2020 13:20:45 +0000 Subject: [PATCH 13/32] use new method _set_expected_place --- python/paddle/distributed/parallel.py | 6 +++--- python/paddle/fluid/dygraph/parallel.py | 1 - python/paddle/fluid/framework.py | 8 -------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index ee1eac01a22014..53ff671abe5dfd 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -16,11 +16,11 @@ import six from paddle import compat as cpt -from paddle.distributed.launch import _parse_args, get_cluster_and_pod, _print_arguments +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments # deprecated module import from paddle.fluid import core -from paddle.fluid.framework import _switch_current_place +from paddle.fluid.framework import _set_expected_place from paddle.fluid.dygraph import parallel_helper from paddle.fluid.dygraph.parallel import ParallelEnv @@ -212,7 +212,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): # they need to call a function to change default place, # here just set correctly place to users place = core.CUDAPlace(ParallelEnv().dev_id) - _switch_current_place(place) + _set_expected_place(place) # init nccl context parallel_helper._set_parallel_ctx( diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ee2aba6d47308e..350954f1adf7e5 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -18,7 +18,6 @@ import warnings from collections import OrderedDict -from paddle import compat as cpt from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph import layers diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4e861fb0e77789..e844c74c106e37 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5446,14 +5446,6 @@ def _dygraph_place_guard(place): _global_expected_place_ = tmp_place -def _switch_current_place(place): - global _dygraph_tracer_ - global _dygraph_current_expected_place_ - if _dygraph_tracer_ is not None: - _dygraph_tracer_._expected_place = place - _dygraph_current_expected_place_ = place - - def load_op_library(lib_filename): """ :api_attr: Static Graph From 7939384044e28bb25f4b8cd3409c48c8f32b8302 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Aug 2020 11:57:01 +0000 Subject: [PATCH 14/32] add spawn unittest framework & mnist test --- python/paddle/distributed/__init__.py | 6 +- python/paddle/distributed/parallel.py | 26 ++- .../{spawn.py => start_processes.py} | 24 ++- .../tests/unittests/spawn_runner_base.py | 92 +++++++++++ .../fluid/tests/unittests/test_dist_base.py | 151 ++++++++++++------ .../test_imperative_data_parallel.py | 2 +- .../unittests/test_parallel_dygraph_mnist.py | 15 +- 7 files changed, 250 insertions(+), 66 deletions(-) rename python/paddle/distributed/{spawn.py => start_processes.py} (86%) create mode 100644 python/paddle/fluid/tests/unittests/spawn_runner_base.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index c40902010bb2a2..999cbf279d9f9d 100644 --- 
a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -20,9 +20,9 @@ "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" ] -from . import spawn -from .spawn import spawn -from .spawn import start_processes +from . import start_processes +from .start_processes import spawn +from .start_processes import start_processes from . import parallel from .parallel import init_parallel_env diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 53ff671abe5dfd..712a4782f6b214 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except jin compliance with the License. @@ -83,7 +83,7 @@ def _update_env_vars(rank, options): raise ValueError("please input current node ip, " "cannot only give `cluster_node_ips`.") default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) - default_node_ip = "127.0.0.1" if default_node_ip else default_node_ip + default_node_ip = "127.0.0.1" if default_node_ip is None else default_node_ip if args.node_ip is None: args.node_ip = default_node_ip if args.cluster_node_ips is None: @@ -97,8 +97,14 @@ def _update_env_vars(rank, options): default_port = os.environ.get("PADDLE_MASTER_PORT", None) if default_port is None: raise RuntimeError( - "please input start port of parallel training by `started_port=**`," - "e.g. started_port=6170") + "Data parallel training start failed. If you start data parallel " + "training by `paddle.distributed.launch` module, Please ensure " + "that one of the following rules is met:\n" + " 1. Do not set `paddle.distributed.init_parallel_env` argument " + "`rank` or set it to be -1;\n" + " 2. Set `paddle.distributed.init_parallel_env` start port for " + "parallel training by `started_port=**`, e.g. started_port=6170." + ) args.started_port = int(default_port) args.use_paddlecloud = options.get('use_paddlecloud', False) @@ -116,8 +122,14 @@ def _update_env_vars(rank, options): args.selected_gpus = os.environ.get("PADDLE_CUDA_VISIBLE_DEVICES", None) if args.selected_gpus is None: raise ValueError( - "please input selected gpus of parallel training by `selected_gpus=**`," - "e.g. selected_gpus='0,1,2,3'.", ) + "Data parallel training start failed. If you start data parallel " + "training by `paddle.distributed.launch` module, Please ensure " + "that one of the following rules is met:\n" + " 1. Do not set `paddle.distributed.init_parallel_env` argument " + "`rank` or set it to be -1;\n" + " 2. Set `paddle.distributed.init_parallel_env` selected gpus of " + "parallel training by `selected_gpus=**`, e.g. selected_gpus='0,1,2,3'." + ) # reuse code of launch.py cluster, pod = get_cluster_and_pod(args) @@ -187,7 +199,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): "backend `%s` is not supported, now only supports `nccl` backend." % backend) - # update or check env + # 2. 
update or check env # NOTE(chenweihang): if rank is default value, users should config # parallel environment by module `paddle.distributed.launch`, # so here we only check the environment variables diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/start_processes.py similarity index 86% rename from python/paddle/distributed/spawn.py rename to python/paddle/distributed/start_processes.py index 1c6b2682970f17..e9b0ff1648e9a0 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/start_processes.py @@ -50,9 +50,10 @@ def _set_default_assist_env(nprocs): [str(x) for x in range(0, nprocs)]) -def _func_wrapper(func, i, args, error_queue): +def _func_wrapper(func, i, args, error_queue, return_queue): try: - func(i, *args) + result = func(i, *args) + return_queue.put(result) except KeyboardInterrupt: pass except Exception: @@ -62,9 +63,15 @@ def _func_wrapper(func, i, args, error_queue): class MultiprocessContext(object): - def __init__(self, processes, error_queues): + def __init__(self, processes, error_queues, return_queues): _py_supported_check() self.error_queues = error_queues + # NOTE(chenweihang): The `start_processes` method is mainly used + # to wrap the outermost execution function of the program for + # parallel execution. Generally, the return value is not concerned, + # but if the user needs to obtain the return value, users can get + # the return result of each process from context.return_queues + self.return_queues = return_queues self.processes = processes self.sentinels = { process.sentinel: index @@ -145,17 +152,21 @@ def start_processes(func, mp = multiprocessing.get_context(start_method) error_queues = [] + return_queues = [] processes = [] for i in range(nprocs): error_queue = mp.SimpleQueue() + return_queue = mp.SimpleQueue() process = mp.Process( - target=_func_wrapper, args=(func, i, args, error_queue)) + target=_func_wrapper, + args=(func, i, args, error_queue, return_queue)) process.daemon = daemon process.start() error_queues.append(error_queue) + return_queues.append(return_queue) processes.append(process) - context = MultiprocessContext(processes, error_queues) + context = MultiprocessContext(processes, error_queues, return_queues) if not join: return context @@ -163,6 +174,9 @@ def start_processes(func, while not context.join(): pass + # finaly return context + return context + # NOTE(chenweihang): this method only supports start processes # by `spwan` method, if users want to start processes by other diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py new file mode 100644 index 00000000000000..4e188c3fbed187 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
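+#
+# NOTE: the helper below relies on `paddle.distributed.spawn` forwarding each
+# worker's return value through `context.return_queues`. A minimal sketch of
+# that pattern (the `worker` function and nprocs value are illustrative only):
+#
+#     import paddle.distributed as dist
+#
+#     def worker(rank):
+#         return rank * rank  # any picklable result
+#
+#     if __name__ == '__main__':
+#         context = dist.spawn(worker, args=(), nprocs=2, join=True)
+#         print([q.get() for q in context.return_queues])  # [0, 1]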
+
+from __future__ import print_function, division
+
+import numpy as np
+import unittest
+
+import paddle
+
+# used by model.run_trainer in test_dist_base
+from test_dist_base import RUN_STEP
+
+
+# NOTE: compatible with TestParallelDyGraphRunnerBase args
+class SpawnAssistTestArgs(object):
+    update_method = "local"
+    trainer_id = 0
+    current_endpoint = None
+    endpoints = None
+    with_spawn = True
+
+
+def run_dygraph_model(rank, model, args):
+    args.with_spawn = True
+    args.trainer_id = rank
+    return model.run_trainer(args)
+
+
+class TestDistSpawnRunner(unittest.TestCase):
+    def setUp(self):
+        # NOTE(chenweihang): keep consistent with
+        # TestDistBase.check_with_place
+        self.nprocs = 2
+
+    def _run(self, model, args):
+        args.update_method = "local"
+        return run_dygraph_model(-1, model, args)
+
+    def _run_parallel(self, model, args):
+        args.update_method = "nccl2"
+        context = paddle.distributed.spawn(
+            func=run_dygraph_model,
+            args=(
+                model,
+                args, ),
+            nprocs=self.nprocs,
+            join=True)
+        result_list = []
+        for res_queue in context.return_queues:
+            result_list.append(res_queue.get())
+        return result_list
+
+    def check_dist_result_with_spawn(self, test_class, delta=1e-3):
+        # 0. prepare model and args
+        model = test_class()
+        args = SpawnAssistTestArgs()
+
+        # 1. calc single card loss
+        losses = self._run(model, args)
+
+        # 2. calc multi card loss (nccl mode)
+        dist_losses_list = self._run_parallel(model, args)
+
+        # 3. compare losses
+        for step_id in range(RUN_STEP):
+            loss = losses[step_id]
+            dist_loss_sum = None
+            for dist_losses in dist_losses_list:
+                if dist_loss_sum is None:
+                    dist_loss_sum = np.array(dist_losses[step_id])
+                else:
+                    dist_loss_sum += np.array(dist_losses[step_id])
+            dist_loss = dist_loss_sum / self.nprocs
+            self.assertAlmostEqual(
+                loss,
+                dist_loss,
+                delta=delta,
+                msg="The results of single-card execution and multi-card execution are inconsistent. "
+                "single-card loss is:\n{}\nmulti-card average loss is:\n{}\n".
+                format(loss, dist_loss))
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index ba292f2d87c376..deb9014e0863b7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
+from __future__ import print_function, division
 
 import time
 import unittest
@@ -25,6 +25,8 @@
 import pickle
 import numpy as np
 import time
+
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 import paddle.fluid.dygraph as dygraph
@@ -382,12 +384,47 @@ def run_one_loop(self, model, opt, data):
         raise NotImplementedError(
             "train_one_loop should be implemented by the child classes.")
 
-    def run_trainer(self, args):
-
-        seed = 90
-        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-        place = fluid.CUDAPlace(device_id)
+    def _parse_launch_args(self, args):
+        cluster_node_ips = None
+        node_ip = None
+        started_port = None
+        # [ Adapt `runtime_main` arguments ]
+        # Why can't we keep the arguments here consistent with launch.py?
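+        # For example (values are illustrative): endpoints
+        # "127.0.0.1:6170,127.0.0.1:6171" with current endpoint
+        # "127.0.0.1:6170" parse to cluster_node_ips="127.0.0.1",
+        # node_ip="127.0.0.1", started_port=6170, selected_gpus="0,1".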
+ ips_dict = dict() + trainer_endpoints = args.endpoints.split( + ",") if args.endpoints else None + if trainer_endpoints is not None: + for endpoint in trainer_endpoints: + ip_port = endpoint.split(":") + ip_str = ip_port[0] + port = int(ip_port[1]) + cur_port = ips_dict.get(ip_str, 0) + if cur_port != 0: + if port < cur_port: + ips_dict[ip_str] = port + else: + ips_dict[ip_str] = port + cur_ip_port = args.current_endpoint.split( + ":") if args.current_endpoint else None + if cur_ip_port is None: + raise RuntimeError("the current endpoint is not set.") + endpoint_num = len(trainer_endpoints) + node_num = len(ips_dict.keys()) + # TODO(chenweihang): Don't consider this situation for now + if endpoint_num % node_num != 0: + raise RuntimeError( + "not check when the number of cards used by each machine is different." + ) + node_gpu_num = endpoint_num // node_num + + cluster_node_ips = ",".join(ips_dict.keys()) + node_ip = cur_ip_port[0] + started_port = ips_dict[node_ip] + selected_gpus = ",".join([str(x) for x in range(0, node_gpu_num)]) + + return cluster_node_ips, node_ip, started_port, selected_gpus + def run_trainer(self, args): def _get_data(batch): if args.update_method != "local": new_batch = [] @@ -398,51 +435,68 @@ def _get_data(batch): else: return batch - with fluid.dygraph.guard(place): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - np.random.seed(seed) - import random - random.seed = seed - model, train_reader, opt = self.get_model() - nranks = len(args.endpoints.split(",")) if args.endpoints else 1 + # 1. enable dygraph + fluid.enable_dygraph() - if args.update_method == "nccl2": - strategy = dygraph.parallel.ParallelStrategy() - strategy.nranks = nranks - strategy.local_rank = args.trainer_id - strategy.trainer_endpoints = args.endpoints.split(",") - strategy.current_endpoint = args.current_endpoint + # 2. init seed + seed = 90 + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + import random + random.seed = seed + + # 3. init parallel env + if args.update_method == "nccl2": + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if args.with_spawn is True: + strategy = paddle.distributed.init_parallel_env( + rank=args.trainer_id) + else: + cluster_node_ips, node_ip, started_port, selected_gpus = self._parse_launch_args( + args) + strategy = paddle.distributed.init_parallel_env( + rank=args.trainer_id, + backend='nccl', + cluster_node_ips=cluster_node_ips, + node_ip=node_ip, + started_port=started_port, + selected_gpus=selected_gpus, + print_config=False) + + # 4. 
train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + model = dygraph.parallel.DataParallel(model, strategy) + print_to_err(type(self).__name__, "model built in dygraph") + + out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") + for step_id, data in enumerate(train_reader()): + data = _get_data(data) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + if step_id % 10 == 0: print_to_err( type(self).__name__, - "begin to prepare context in dygraph with nccl2") - dygraph.parallel.prepare_context(strategy) - model = dygraph.parallel.DataParallel(model, strategy) - print_to_err(type(self).__name__, "model built in dygraph") - out_losses = [] - print_to_err(type(self).__name__, "begin to run dygraph training") - for step_id, data in enumerate(train_reader()): - data = _get_data(data) - if step_id == RUN_STEP: - break - loss = self.run_one_loop(model, opt, data) - if step_id % 10 == 0: - print_to_err( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss.numpy())) - out_losses.append(loss.numpy()) - - # FIXME(Yancey1989): scale the loss inplace - if args.update_method == "nccl2": - loss = model.scale_loss(loss) - - loss.backward() - if args.update_method == "nccl2": - model.apply_collective_grads() - - opt.minimize(loss) - model.clear_gradients() + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + # FIXME(Yancey1989): scale the loss inplace + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.minimize(loss) + model.clear_gradients() print_to_out(out_losses) + return out_losses def runtime_main(test_class): @@ -486,6 +540,8 @@ def runtime_main(test_class): type=bool, default=False) parser.add_argument('--sync_batch_norm', action='store_true') + parser.add_argument( + '--with_spawn', type=bool, required=False, default=False) args = parser.parse_args() @@ -806,7 +862,6 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ - "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py index d3f488d92ac455..428f97c0af8182 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py @@ -43,7 +43,7 @@ def forward(self, inputs): class TestDataParallelStateDict(unittest.TestCase): def test_data_parallel_state_dict(self): with fluid.dygraph.guard(): - strategy = paddle.prepare_context() + strategy = paddle.distributed.prepare_context() mlp = MLP() parallel_mlp = dygraph.parallel.DataParallel(mlp, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index 5677157fde8d71..bac196b1ab52b6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -13,11 +13,16 @@ # limitations under the License. 
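+# NOTE: the spawn-based test added below reuses TestDistSpawnRunner: it runs
+# the model on a single card and again under paddle.distributed.spawn with
+# nprocs=2, then requires the single-card loss to match the mean of the
+# per-rank losses at every step (e.g. per-rank losses 2.30 and 2.34 give a
+# mean of 2.32, which must agree within `delta`); the numbers here are
+# illustrative only.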
from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_mnist import TestMnist -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,11 @@ def test_mnist(self): log_name=flag_name) +class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): + def test_mnist_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) + + if __name__ == "__main__": unittest.main() From 04580d8657f6ed926a18fcc80b8174eb8eeb85e1 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Aug 2020 14:16:26 +0000 Subject: [PATCH 15/32] add more unittests & doc --- python/paddle/distributed/parallel.py | 58 +++++++- python/paddle/distributed/start_processes.py | 139 ++++++++++++++++++ .../test_parallel_dygraph_se_resnext.py | 16 +- .../test_parallel_dygraph_sparse_embedding.py | 12 +- .../test_parallel_dygraph_transformer.py | 12 +- 5 files changed, 230 insertions(+), 7 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 712a4782f6b214..3d795084ae6837 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -180,14 +180,66 @@ def init_parallel_env(rank=-1, backend='nccl', **options): Args: rank(int, optional): Rank of current process. Default vaule is -1. backend(str, optional): The backend to communication between multiple devices. - Now only support `nccl`. Default value is `nccl`. - **options(dict, optional): Other initial parallel execution environment configuration. + Now only support ``nccl`` . Default value is ``nccl`` . + **options(dict, optional): Other initial parallel execution environment configuration options. + The following options are currently supported: + - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1" + - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1" + - started_port: The trainer's started port on a single node, such as 6170. Default: None + - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None + - print_config: Print current parallel training config. Default: True. + - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. Returns: ParallelStrategy Examples: - + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + strategy = dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = dist.DataParallel(layer, strategy) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. 
run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + dist.spawn(train, args=(), nprocs=2) """ # 1. input check diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/start_processes.py index e9b0ff1648e9a0..dbadbeece1d984 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/start_processes.py @@ -133,6 +133,77 @@ def start_processes(func, join=True, daemon=False, start_method='spawn'): + """ + Start multiple rocesses for parallel training. + + Args: + func (function): The targert function is called by started process. + This function need to be able to pickled, so it must be defined + at the top level of a module. + This function should be called as ``func(i, *args)`` , ``i`` is + the process index and ``args`` contains other arguments as tuple. + args (tuple): Arguments passed to ``func`` . + nprocs (int): Number of processed to start. + join (bool): Perform a blocking join on all started processes. + Default: True. + daemon (bool): The started processes' daemon flag. Default: False. + start_method (string): the way to start a process. The start method + can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA + runtime does not support the ``fork`` start method, when use + CUDA in subprocesses, we should start process by ``spawn`` or + ``forkserver`` method. + + Returns: + ``MultiprocessContext`` object, it hold the started processes. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + strategy = dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = dist.DataParallel(layer, strategy) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + dist.start_processes(train, args=(), nprocs=2) + """ # NOTE(chenweihang): [ why only supports python3.4+? ] # Python has only supported setting the child process startup method # since 3.4. The previous version can only use the default startup @@ -182,4 +253,72 @@ def start_processes(func, # by `spwan` method, if users want to start processes by other # method, they can use start_processes def spawn(func, args=(), nprocs=1, join=True, daemon=False): + """ + Start multiple rocesses with ``spawn`` method for parallel training. + + This is specialized method of method ``paddle.distributed.start_processes`` . + + Args: + func (function): The targert function is called by spawned process. 
+ This function need to be able to pickled, so it must be defined + at the top level of a module. + This function should be called as ``func(i, *args)``, ``i`` is + the process index and ``args`` contains other arguments as tuple. + args (tuple): Arguments passed to ``func``. + nprocs (int): Number of processed to spawn. + join (bool): Perform a blocking join on all spawned processes. + Default: True. + daemon (bool): The spawned processes' daemon flag. Default: False. + + Returns: + ``MultiprocessContext`` object, it hold the spawned processes. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + strategy = dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = dist.DataParallel(layer, strategy) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + dist.spawn(train, args=(), nprocs=2) + """ return start_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py index 8c5cdf8321a4bd..cf89dc484c4880 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py @@ -13,11 +13,16 @@ # limitations under the License. 
from __future__ import print_function + +import os +import sys import unittest -from test_dist_base import TestDistBase + import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_se_resnext import TestSeResNeXt -import os flag_name = os.path.splitext(__file__)[0] @@ -36,5 +41,12 @@ def test_se_resnext(self): log_name=flag_name) +class TestParallelDygraphSeResNeXtSpawn(TestDistSpawnRunner): + def test_se_resnext_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSeResNeXt, delta=0.01) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py index 40b5833053d29b..7f051f1005c7b7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ def test_sparse_embedding(self): log_name=flag_name) +class TestParallelDygraphSparseEmdeddingSpawn(TestDistSpawnRunner): + def test_sparse_embedding_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSparseEmbedding, delta=1e-5) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index 385c4d892a650b..c8d47eab2c5191 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -15,10 +15,13 @@ from __future__ import print_function import os +import sys import unittest -import paddle.fluid as fluid +import paddle.fluid as fluid from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_transformer import TestTransformer flag_name = os.path.splitext(__file__)[0] @@ -38,5 +41,12 @@ def test_transformer(self): log_name=flag_name) +class TestParallelDygraphTransformerSpawn(TestDistSpawnRunner): + def test_transformer_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestTransformer, delta=1e-5) + + if __name__ == "__main__": unittest.main() From 131afd4877199c516224de0a5a7344d390312633 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 03:52:47 +0000 Subject: [PATCH 16/32] fix unittest failed --- .../tests/unittests/spawn_runner_base.py | 2 +- .../unittests/test_directory_migration.py | 5 +- .../fluid/tests/unittests/test_dist_base.py | 152 ++++++++---------- 3 files changed, 74 insertions(+), 85 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index 4e188c3fbed187..a06c97498f7a73 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py 
+++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -35,7 +35,7 @@ class SpawnAssistTestArgs(object): def run_dygraph_model(rank, model, args): args.with_spawn = True args.trainer_id = rank - return model.run_trainer(args) + return model.run_trainer_with_spawn(args) class TestDistSpawnRunner(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index bc858828058079..fee2756f516451 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -39,8 +39,9 @@ def test_new_directory(self): 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', - 'paddle.BackwardStrategy', 'paddle.ParallelEnv', - 'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit', + 'paddle.BackwardStrategy', 'paddle.distributed.ParallelEnv', + 'paddle.distributed.prepare_context', + 'paddle.distributed.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index deb9014e0863b7..e3c09ee3b7592e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function, division +from __future__ import print_function import time import unittest @@ -23,6 +23,7 @@ import six import argparse import pickle +import random import numpy as np import time @@ -384,108 +385,97 @@ def run_one_loop(self, model, opt, data): raise NotImplementedError( "train_one_loop should be implemented by the child classes.") - def _parse_launch_args(self, args): - cluster_node_ips = None - node_ip = None - started_port = None - # [ Adapt `runtime_main` arguments ] - # Why can't we keep the arguments here consistent with launch.py? - ips_dict = dict() - trainer_endpoints = args.endpoints.split( - ",") if args.endpoints else None - if trainer_endpoints is not None: - for endpoint in trainer_endpoints: - ip_port = endpoint.split(":") - ip_str = ip_port[0] - port = int(ip_port[1]) - cur_port = ips_dict.get(ip_str, 0) - if cur_port != 0: - if port < cur_port: - ips_dict[ip_str] = port - else: - ips_dict[ip_str] = port - cur_ip_port = args.current_endpoint.split( - ":") if args.current_endpoint else None - if cur_ip_port is None: - raise RuntimeError("the current endpoint is not set.") - endpoint_num = len(trainer_endpoints) - node_num = len(ips_dict.keys()) - # TODO(chenweihang): Don't consider this situation for now - if endpoint_num % node_num != 0: - raise RuntimeError( - "not check when the number of cards used by each machine is different." 
- ) - node_gpu_num = endpoint_num // node_num - - cluster_node_ips = ",".join(ips_dict.keys()) - node_ip = cur_ip_port[0] - started_port = ips_dict[node_ip] - selected_gpus = ",".join([str(x) for x in range(0, node_gpu_num)]) - - return cluster_node_ips, node_ip, started_port, selected_gpus + def _get_data(self, batch, args): + if args.update_method != "local": + new_batch = [] + for offset, item in enumerate(batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return batch def run_trainer(self, args): - def _get_data(batch): - if args.update_method != "local": - new_batch = [] - for offset, item in enumerate(batch): - if offset % 2 == args.trainer_id: - new_batch.append(item) - return new_batch - else: - return batch + seed = 90 + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + import random + random.seed = seed + model, train_reader, opt = self.get_model() + nranks = len(args.endpoints.split(",")) if args.endpoints else 1 + + if args.update_method == "nccl2": + strategy = dygraph.parallel.ParallelStrategy() + strategy.nranks = nranks + strategy.local_rank = args.trainer_id + strategy.trainer_endpoints = args.endpoints.split(",") + strategy.current_endpoint = args.current_endpoint + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + dygraph.parallel.prepare_context(strategy) + model = dygraph.parallel.DataParallel(model, strategy) + print_to_err(type(self).__name__, "model built in dygraph") + out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + if step_id % 10 == 0: + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + # FIXME(Yancey1989): scale the loss inplace + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.minimize(loss) + model.clear_gradients() + print_to_out(out_losses) + + def run_trainer_with_spawn(self, args): # 1. enable dygraph - fluid.enable_dygraph() + paddle.disable_static() # 2. init seed seed = 90 - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed np.random.seed(seed) - import random random.seed = seed # 3. init parallel env if args.update_method == "nccl2": - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - if args.with_spawn is True: - strategy = paddle.distributed.init_parallel_env( - rank=args.trainer_id) - else: - cluster_node_ips, node_ip, started_port, selected_gpus = self._parse_launch_args( - args) - strategy = paddle.distributed.init_parallel_env( - rank=args.trainer_id, - backend='nccl', - cluster_node_ips=cluster_node_ips, - node_ip=node_ip, - started_port=started_port, - selected_gpus=selected_gpus, - print_config=False) + strategy = paddle.distributed.init_parallel_env( + rank=args.trainer_id) # 4. 
train model model, train_reader, opt = self.get_model() if args.update_method == "nccl2": - model = dygraph.parallel.DataParallel(model, strategy) - print_to_err(type(self).__name__, "model built in dygraph") + model = paddle.distributed.DataParallel(model, strategy) out_losses = [] - print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): - data = _get_data(data) + data = self._get_data(data, args) if step_id == RUN_STEP: break loss = self.run_one_loop(model, opt, data) - if step_id % 10 == 0: - print_to_err( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss.numpy())) out_losses.append(loss.numpy()) - # FIXME(Yancey1989): scale the loss inplace if args.update_method == "nccl2": loss = model.scale_loss(loss) @@ -495,7 +485,6 @@ def _get_data(batch): opt.minimize(loss) model.clear_gradients() - print_to_out(out_losses) return out_losses @@ -540,8 +529,6 @@ def runtime_main(test_class): type=bool, default=False) parser.add_argument('--sync_batch_norm', action='store_true') - parser.add_argument( - '--with_spawn', type=bool, required=False, default=False) args = parser.parse_args() @@ -862,6 +849,7 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ + "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, From e170f105b2fb7c9151af94f37020b461f8848ba0 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 04:11:17 +0000 Subject: [PATCH 17/32] polish english doc --- python/paddle/distributed/parallel.py | 21 ++++++++++++++------ python/paddle/distributed/start_processes.py | 8 ++++---- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 3d795084ae6837..b703e6904c6a15 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -175,19 +175,28 @@ def _check_var_exists(var_name): def init_parallel_env(rank=-1, backend='nccl', **options): """ - Initialize parallel environments. + Initialize parallel training environments in dynamic mode. Args: - rank(int, optional): Rank of current process. Default vaule is -1. + rank(int, optional): Rank of current process. Default vaule is -1. + When it is the default value -1, you should use ``paddle.disstributed.launch`` + module to start training, the environment variables for parallel training + are configured by ``paddle.disstributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. The following options are currently supported: - - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1" - - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1" - - started_port: The trainer's started port on a single node, such as 6170. Default: None - - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None + + - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". + + - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1". 
+ + - started_port: The trainer's started port on a single node, such as 6170. Default: None. + + - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None. + - print_config: Print current parallel training config. Default: True. + - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. Returns: diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/start_processes.py index dbadbeece1d984..32780f37ab9c90 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/start_processes.py @@ -134,7 +134,7 @@ def start_processes(func, daemon=False, start_method='spawn'): """ - Start multiple rocesses for parallel training. + Start multiple processes for parallel training. Args: func (function): The targert function is called by started process. @@ -151,7 +151,7 @@ def start_processes(func, can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA runtime does not support the ``fork`` start method, when use CUDA in subprocesses, we should start process by ``spawn`` or - ``forkserver`` method. + ``forkserver`` method. Default: 'spawn'. Returns: ``MultiprocessContext`` object, it hold the started processes. @@ -254,9 +254,9 @@ def train(rank): # method, they can use start_processes def spawn(func, args=(), nprocs=1, join=True, daemon=False): """ - Start multiple rocesses with ``spawn`` method for parallel training. + Start multiple processes with ``spawn`` method for parallel training. - This is specialized method of method ``paddle.distributed.start_processes`` . + This is a specialized method of ``paddle.distributed.start_processes`` . Args: func (function): The targert function is called by spawned process. From 0ef215d5135eb8a1e2cfdabb1b0fdc62f7659533 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 04:36:46 +0000 Subject: [PATCH 18/32] self review and polish details --- python/paddle/distributed/parallel.py | 8 ++-- python/paddle/distributed/start_processes.py | 24 +++++----- python/paddle/fluid/dygraph/parallel.py | 45 ++++++++++--------- .../tests/unittests/spawn_runner_base.py | 2 - 4 files changed, 41 insertions(+), 38 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index b703e6904c6a15..fc500244115f36 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -142,7 +142,8 @@ def _update_env_vars(rank, options): trainer = pod.get_trainer(rank) if trainer is None: raise RuntimeError( - "The expected trainer is not exists, its trainer rank is %d" % rank) + "The expected trainer is not exists, its trainer rank is %d." % + rank) proc_env = { "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), "PADDLE_TRAINER_ID": "%d" % trainer.rank, @@ -163,7 +164,7 @@ def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: raise ValueError("paddle.distributed initialize error," - "Environment variable %s is needed, but not set.", + "environment variable %s is needed, but not set.", var_name) _check_var_exists("FLAGS_selected_gpus") @@ -184,7 +185,8 @@ def init_parallel_env(rank=-1, backend='nccl', **options): are configured by ``paddle.disstributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . - **options(dict, optional): Other initial parallel execution environment configuration options. 
+ **options(dict, optional): Other initial parallel execution environment configuration options. + The following options are currently supported: - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/start_processes.py index 32780f37ab9c90..a5ec59ebfdffda 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/start_processes.py @@ -26,9 +26,9 @@ def _py_supported_check(): if not sys.version_info >= (3, 4): raise RuntimeError( - "Use `paddle.distributed.run` to start parallel training " - "requires python version greater than 3.4, if your python " - "is lower than this version, please use " + "Use `paddle.distributed.spawn` or `paddle.distributed.start_processes` " + "to start parallel training requires python version greater than 3.4, " + "if your python is lower than this version, please use " "`paddle.distributed.launch` instead.") @@ -40,11 +40,11 @@ def _set_default_assist_env(nprocs): if port_set is None: raise RuntimeError("no free port can be used to parallel training now.") os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) - # set default selected_gpus - # e.g. if the nprocs is 4, the selected_gpus="0,1,2,3" + # set default selected gpus + # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3" # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ] # because the FLAGS_selected_gpus may be used in other place, - # if we set FLAGS_selected_gpus are `0,1,2,3`, it may cause error + # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( [str(x) for x in range(0, nprocs)]) @@ -204,18 +204,18 @@ def train(rank): if __name__ == '__main__': dist.start_processes(train, args=(), nprocs=2) """ - # NOTE(chenweihang): [ why only supports python3.4+? ] - # Python has only supported setting the child process startup method + # NOTE(chenweihang): [ why only supports python3.4+ ? ] + # Python supported setting the child process startup method # since 3.4. The previous version can only use the default startup # method, while the default startup method of Unix is fork, which # cannot support CUDA runtime multi-process _py_supported_check() # NOTE(chenweihang): [ why need set default master info before run? 
] - # when using `paddle.distributed.run` start parallel training, - # users need use `init_parallel_env` to config some cluster info - # inner subprocess, if each process find free port for itself, - # the started port may be different, it will cause endpoints is + # when using `paddle.distributed.spawn/start_processes` start + # parallel training, users need use `init_parallel_env` to config + # cluster info inner subprocess, if each process find free port for + # itself, the started port may be different, it will cause endpoints is # different in different subprocesses _set_default_assist_env(nprocs) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 350954f1adf7e5..dc04847f57609e 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -124,43 +124,46 @@ def __init__(self): self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self.__aliases__ = {'local_rank': 'rank', } + self.__aliases__ = { + 'local_rank': 'rank', + 'nranks': 'world_size', + } @property def rank(self): """ - The current trainer number. + Rank of current trainer. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ID. The default value is 0. + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . The default value is 0. Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINER_ID=0 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The rank is %d" % env.rank) # The rank is 0 """ return self._local_rank @property - def nranks(self): + def world_size(self): """ - The number of trainers, generally refers to the number of GPU cards used in training. + The number of trainers (number of processes participating in current job). - Its value is equal to the value of the environment variable PADDLE_TRAINERS_NUM. The default value is 1. + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() - print("The nranks is %d" % env.nranks) - # The nranks is 4 + env = dist.ParallelEnv() + print("The world_size is %d" % env.world_size) + # The world_size is 4 """ return self._nranks @@ -169,15 +172,15 @@ def dev_id(self): """ The ID of selected GPU card for parallel training. - Its value is equal to the value of the environment variable FLAGS_selected_gpus. The default value is 0. + Its value is equal to the value of the environment variable ``FLAGS_selected_gpus`` . The default value is 0. Examples: .. code-block:: python # execute this command in terminal: export FLAGS_selected_gpus=1 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The device id are %d" % env.dev_id) # The device id are 1 """ @@ -188,15 +191,15 @@ def current_endpoint(self): """ The endpoint of current trainer, it is in the form of (node IP + port). - Its value is equal to the value of the environment variable PADDLE_CURRENT_ENDPOINT. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_CURRENT_ENDPOINT`` . The default value is "". 
Examples: .. code-block:: python # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The current endpoint are %s" % env.current_endpoint) # The current endpoint are 127.0.0.1:6170 """ @@ -208,15 +211,15 @@ def trainer_endpoints(self): The endpoints of all trainer nodes in the task, which are used to broadcast the NCCL ID when NCCL2 is initialized. - Its value is equal to the value of the environment variable PADDLE_TRAINER_ENDPOINTS. The default value is "". + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". Examples: .. code-block:: python # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 - import paddle.fluid as fluid + import paddle.distributed as dist - env = fluid.dygraph.ParallelEnv() + env = dist.ParallelEnv() print("The trainer endpoints are %s" % env.trainer_endpoints) # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] """ diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index a06c97498f7a73..9c097c349c3da0 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -29,11 +29,9 @@ class SpawnAssistTestArgs(object): trainer_id = 0 current_endpoint = None endpoints = None - with_spawn = True def run_dygraph_model(rank, model, args): - args.with_spawn = True args.trainer_id = rank return model.run_trainer_with_spawn(args) From b27cfee96b4ad8599ed6cc96f6be98981159061a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Aug 2020 15:38:23 +0000 Subject: [PATCH 19/32] refactor code by reviewer's comments --- python/paddle/__init__.py | 1 + python/paddle/distributed/__init__.py | 11 +- python/paddle/distributed/parallel.py | 23 ++-- .../{start_processes.py => spawn.py} | 8 +- python/paddle/distributed/utils.py | 23 ++-- python/paddle/fluid/dygraph/parallel.py | 115 ++++++++++++------ .../unittests/test_directory_migration.py | 5 +- .../fluid/tests/unittests/test_dist_base.py | 5 +- .../tests/unittests/test_init_parallel_env.py | 78 ++++++++++++ python/paddle/framework/__init__.py | 1 + 10 files changed, 189 insertions(+), 81 deletions(-) rename python/paddle/distributed/{start_processes.py => spawn.py} (98%) create mode 100644 python/paddle/fluid/tests/unittests/test_init_parallel_env.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index dd10317c215f16..4d8a6a7fdd07fb 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -231,6 +231,7 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS +from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS from .framework import PiecewiseDecay #DEFINE_ALIAS diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 999cbf279d9f9d..8210e0a02dc101 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -16,16 +16,13 @@ __all__ = ["spawn", "start_processes"] # dygraph parallel apis -__all__ += [ - "prepare_context", "init_parallel_env", "ParallelEnv", "DataParallel" -] +__all__ += ["prepare_context", "init_parallel_env", "ParallelEnv"] -from . 
import start_processes -from .start_processes import spawn -from .start_processes import start_processes +from . import spawn +from .spawn import spawn +from .spawn import start_processes from . import parallel from .parallel import init_parallel_env from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS -from paddle.fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index fc500244115f36..fa5901e783bbef 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -17,6 +17,7 @@ from paddle import compat as cpt from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.distributed.utils import _update_trainer_env # deprecated module import from paddle.fluid import core @@ -96,7 +97,7 @@ def _update_env_vars(rank, options): if args.started_port is None: default_port = os.environ.get("PADDLE_MASTER_PORT", None) if default_port is None: - raise RuntimeError( + raise ValueError( "Data parallel training start failed. If you start data parallel " "training by `paddle.distributed.launch` module, Please ensure " "that one of the following rules is met:\n" @@ -144,15 +145,8 @@ def _update_env_vars(rank, options): raise RuntimeError( "The expected trainer is not exists, its trainer rank is %d." % rank) - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), - "PADDLE_TRAINER_ID": "%d" % trainer.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } # no copy, each process will hold env vars itself - os.environ.update(proc_env) + _update_trainer_env(os.environ, cluster, trainer) # print config if args.print_config and rank == 0: @@ -164,7 +158,7 @@ def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: raise ValueError("paddle.distributed initialize error," - "environment variable %s is needed, but not set.", + "environment variable %s is needed, but not set." % var_name) _check_var_exists("FLAGS_selected_gpus") @@ -186,7 +180,6 @@ def init_parallel_env(rank=-1, backend='nccl', **options): backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. - The following options are currently supported: - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". @@ -202,7 +195,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. Returns: - ParallelStrategy + None Examples: .. code-block:: python @@ -226,11 +219,11 @@ def train(rank): paddle.disable_static() # 2. initialize parallel environment - strategy = dist.init_parallel_env(rank) + dist.init_parallel_env(rank) # 3. 
create data parallel layer & optimizer layer = LinearNet() - dp_layer = dist.DataParallel(layer, strategy) + dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() sgd = opt.SGD( @@ -293,5 +286,3 @@ def train(rank): parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) parallel_helper._init_parallel_ctx() - - return strategy diff --git a/python/paddle/distributed/start_processes.py b/python/paddle/distributed/spawn.py similarity index 98% rename from python/paddle/distributed/start_processes.py rename to python/paddle/distributed/spawn.py index a5ec59ebfdffda..370dbb35bdd9ec 100644 --- a/python/paddle/distributed/start_processes.py +++ b/python/paddle/distributed/spawn.py @@ -178,11 +178,11 @@ def train(rank): paddle.disable_static() # 2. initialize parallel environment - strategy = dist.init_parallel_env(rank) + dist.init_parallel_env(rank) # 3. create data parallel layer & optimizer layer = LinearNet() - dp_layer = dist.DataParallel(layer, strategy) + dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() sgd = opt.SGD( @@ -295,11 +295,11 @@ def train(rank): paddle.disable_static() # 2. initialize parallel environment - strategy = dist.init_parallel_env(rank) + dist.init_parallel_env(rank) # 3. create data parallel layer & optimizer layer = LinearNet() - dp_layer = dist.DataParallel(layer, strategy) + dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() sgd = opt.SGD( diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 87d0f1546f38d0..a98c619ec0d25c 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -333,6 +333,19 @@ def __free_port(): return None +def _update_trainer_env(current_env, cluster, trainer): + proc_env = { + "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } + current_env.update(proc_env) + + return proc_env + + class TrainerProc(object): def __init__(self): self.proc = None @@ -358,15 +371,7 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), - "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, - "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) - } - - current_env.update(proc_env) + proc_env = _update_trainer_env(current_env) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index dc04847f57609e..c274f356ea24a0 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -243,61 +243,98 @@ class DataParallel(layers.Layer): Run the dygraph module with data parallelism. Currently, DataParallel class only supports to run the dynamic graph - with multi-process. The usage is: - `python -m paddle.distributed.launch --selected_gpus=0,1 dynamic_graph_test.py`. - And the content of `dynamic_graph_test.py` is the code of examples. + with multi-process. + + Now supports two ways to start training: + + 1. start by ``paddle.distributed.spawn`` method, for example: + + ``python demo.py`` (spawn need to be called in ``__main__`` method) + + 2. 
start by ``paddle.distributed.launch`` module, for example: + + ``python -m paddle.distributed.launch --selected_gpus=0,1 demo.py`` . + + And the content of `demo.py` is the code of examples. Args: layers(Layer): The module that should be executed by data parallel. - strategy(ParallelStrategy): The strategy of data parallelism, contains - environment configuration related to parallel execution. - + strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, + contains environment configuration related to parallel execution. Default: None. + Returns: Layer: The data paralleled module. Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - - place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place): - - # prepare the data parallel context - strategy = fluid.dygraph.prepare_context() - - linear = fluid.dygraph.Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, parameter_list=linear.parameters()) - - # make the module become the data parallelism module - linear = fluid.dygraph.DataParallel(linear, strategy) - - x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = fluid.dygraph.to_variable(x_data) - - hidden = linear(data) - avg_loss = fluid.layers.mean(hidden) - - # scale the loss according to the number of trainers. - avg_loss = linear.scale_loss(avg_loss) - - avg_loss.backward() - - # collect the gradients of trainers. - linear.apply_collective_grads() + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist - adam.minimize(avg_loss) - linear.clear_gradients() + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(rank): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env(rank) + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + sgd = opt.SGD( + learning_rate=0.001, parameter_list=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + sgd.minimize(loss) + dp_layer.clear_gradients() + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, args=(), nprocs=2) + # 2. start by ``paddle.distributed.launch`` + # train(-1) """ - def __init__(self, layers, strategy): + def __init__(self, layers, strategy=None): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers - self._strategy = strategy + + # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. + # It just stores some environment variables, which can be constructed by + # ParallelEnv. Here it is set as an optional argument. + # This parameter is not removed because of compatibility with 1.x writing. 
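        # Concretely, ``ParallelEnv`` reads ``nranks`` from PADDLE_TRAINERS_NUM,
        # ``local_rank`` from PADDLE_TRAINER_ID, and the endpoint fields from
        # PADDLE_TRAINER_ENDPOINTS / PADDLE_CURRENT_ENDPOINT, so the fallback
        # below only works after those variables have been set (for example by
        # ``init_parallel_env`` or ``paddle.distributed.launch``).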
+ if strategy is not None: + self._strategy = strategy + else: + self._strategy = ParallelStrategy() + self._strategy.nranks = ParallelEnv().nranks + self._strategy.local_rank = ParallelEnv().local_rank + self._strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + self._strategy.current_endpoint = ParallelEnv().current_endpoint def forward(self, *inputs, **kwargs): return self._layers(*inputs, **kwargs) diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index fee2756f516451..19416e132920ae 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -40,9 +40,8 @@ def test_new_directory(self): 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', 'paddle.BackwardStrategy', 'paddle.distributed.ParallelEnv', - 'paddle.distributed.prepare_context', - 'paddle.distributed.DataParallel', 'paddle.jit', - 'paddle.jit.TracedLayer', 'paddle.jit.to_static', + 'paddle.distributed.prepare_context', 'paddle.DataParallel', + 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index e3c09ee3b7592e..d3ebaf90cb5161 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -460,13 +460,12 @@ def run_trainer_with_spawn(self, args): # 3. init parallel env if args.update_method == "nccl2": - strategy = paddle.distributed.init_parallel_env( - rank=args.trainer_id) + paddle.distributed.init_parallel_env(rank=args.trainer_id) # 4. train model model, train_reader, opt = self.get_model() if args.update_method == "nccl2": - model = paddle.distributed.DataParallel(model, strategy) + model = paddle.DataParallel(model) out_losses = [] for step_id, data in enumerate(train_reader()): diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py new file mode 100644 index 00000000000000..2eb390738a89b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np +import unittest + +import paddle +import paddle.distributed as dist + +# NOTE(chenweihang): Coverage CI is currently not able to count python3 +# unittest, so the unittests here covers some cases that will only be +# executed in the python3 sub-process. 
+# If the coverage CI can check python3 and sub-process, +# we can remove all unittests here + + +class TestInitParallelEnv(unittest.TestCase): + def test_beckend_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(backend=1) + + def test_backend_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(backend="mpi") + + def test_rank_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(rank="1") + + def test_rank_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(rank=-2) + + def test_only_cluster_node_ips_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env( + rank=0, cluster_node_ips="127.0.0.1,127.0.0.2") + + def test_no_started_port_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(rank=0) + + def test_no_selected_gpus_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(rank=0, started_port=6170) + + def test_check_env_failed(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + with self.assertRaises(ValueError): + dist.init_parallel_env() + + def test_update_env(self): + dist.init_parallel_env(rank=0, started_port=6170, selected_gpus="0") + self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) + self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) + self.assertIsNotNone(os.environ.get('PADDLE_TRAINERS_NUM', None)) + self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ENDPOINTS', None)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 4b348ea729ef33..c2b212c10b677e 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -50,6 +50,7 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS +from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PiecewiseDecay #DEFINE_ALIAS From f50f3432bbf0664578ccac1d103ecdf4ecbaf737 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 26 Aug 2020 02:12:04 +0000 Subject: [PATCH 20/32] fix unittest failed --- python/paddle/distributed/utils.py | 2 +- python/paddle/fluid/tests/unittests/test_init_parallel_env.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index a98c619ec0d25c..1e5c7810b530b6 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -371,7 +371,7 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = _update_trainer_env(current_env) + proc_env = _update_trainer_env(current_env, cluster, t) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py index 2eb390738a89b0..ace97ee2e9cf73 100644 --- a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py @@ -67,7 +67,8 @@ def test_check_env_failed(self): 
dist.init_parallel_env() def test_update_env(self): - dist.init_parallel_env(rank=0, started_port=6170, selected_gpus="0") + device = os.getenv("CUDA_VISIBLE_DEVICES") + dist.init_parallel_env(rank=0, started_port=6170, selected_gpus=device) self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) self.assertIsNotNone(os.environ.get('PADDLE_TRAINERS_NUM', None)) From 11221a859bb459fcbdea23e2c19fd075bceaf32c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 26 Aug 2020 02:52:11 +0000 Subject: [PATCH 21/32] fix parallel_env unittest --- python/paddle/fluid/tests/unittests/test_init_parallel_env.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py index ace97ee2e9cf73..16a55bdd18247d 100644 --- a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py @@ -67,7 +67,9 @@ def test_check_env_failed(self): dist.init_parallel_env() def test_update_env(self): - device = os.getenv("CUDA_VISIBLE_DEVICES") + device = os.getenv("CUDA_VISIBLE_DEVICES", None) + if device is None: + device = '0' dist.init_parallel_env(rank=0, started_port=6170, selected_gpus=device) self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) From 0980c230d6909c3d0f0b5e7f415e7aac54e44b72 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 26 Aug 2020 08:43:46 +0000 Subject: [PATCH 22/32] fix several typos --- python/paddle/distributed/parallel.py | 20 ++++++++++---------- python/paddle/distributed/spawn.py | 8 ++++---- python/paddle/fluid/dygraph/parallel.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index fa5901e783bbef..5dee640c1cb591 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -42,7 +42,7 @@ def __init__(self): # The current node ip. self.node_ip = None - # wheter to use paddlecloud platform to run your multi-process job. + # whether to use paddlecloud platform to run your multi-process job. # If false, no need to set this argument. self.use_paddlecloud = None @@ -112,7 +112,7 @@ def _update_env_vars(rank, options): args.print_config = options.get('print_config', True) # set default `selected_gpus` - # TODO(chenweihang): if users gived number of `selected_gpus` + # TODO(chenweihang): if users given number of `selected_gpus` # is not equal to the spawn's nprocs, it will cause error, # and because we remove the `proc num` argument of # `init_parallel_env`, when above error occured, we do not @@ -173,10 +173,10 @@ def init_parallel_env(rank=-1, backend='nccl', **options): Initialize parallel training environments in dynamic mode. Args: - rank(int, optional): Rank of current process. Default vaule is -1. - When it is the default value -1, you should use ``paddle.disstributed.launch`` + rank(int, optional): Rank of current process. Default value is -1. + When it is the default value -1, you should use ``paddle.distributed.launch`` module to start training, the environment variables for parallel training - are configured by ``paddle.disstributed.launch`` module. + are configured by ``paddle.distributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. 
Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. @@ -192,7 +192,7 @@ def init_parallel_env(rank=-1, backend='nccl', **options): - print_config: Print current parallel training config. Default: True. - - use_paddlecloud: Wheter to use paddlecloud platform to run your multi-process job. Default: False. + - use_paddlecloud: Whether to use paddlecloud platform to run your multi-process job. Default: False. Returns: None @@ -267,16 +267,16 @@ def train(rank): # 3. init ParallelStrategy strategy = ParallelStrategy() if cpt.to_text(backend) == 'nccl': - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank + strategy.world_size = ParallelEnv().world_size + strategy.rank = ParallelEnv().rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.nranks < 2: + if strategy.world_size < 2: return # NOTE(chenweihang): [ why config global place here? ] # the dygraph mode will be set to default mode, # users will not call `dygraph.guard` or `enable_dygraph` - # directly, if they want to switch detault place, + # directly, if they want to switch default place, # they need to call a function to change default place, # here just set correctly place to users place = core.CUDAPlace(ParallelEnv().dev_id) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 370dbb35bdd9ec..7071f4fcbbc1ec 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -117,7 +117,7 @@ def _throw_exception(self, error_index): original_trace = self.error_queues[error_index].get() msg = "\n\n----------------------------------------------\n" \ - "Procces %d terminated with the following error:\n" \ + "Process %d terminated with the following error:\n" \ "----------------------------------------------\n\n" % error_index msg += original_trace raise Exception(msg) @@ -137,7 +137,7 @@ def start_processes(func, Start multiple processes for parallel training. Args: - func (function): The targert function is called by started process. + func (function): The target function is called by started process. This function need to be able to pickled, so it must be defined at the top level of a module. This function should be called as ``func(i, *args)`` , ``i`` is @@ -245,7 +245,7 @@ def train(rank): while not context.join(): pass - # finaly return context + # finally return context return context @@ -259,7 +259,7 @@ def spawn(func, args=(), nprocs=1, join=True, daemon=False): This is a specialized method of ``paddle.distributed.start_processes`` . Args: - func (function): The targert function is called by spawned process. + func (function): The target function is called by spawned process. This function need to be able to pickled, so it must be defined at the top level of a module. This function should be called as ``func(i, *args)``, ``i`` is diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index c274f356ea24a0..bfe42467651b8e 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -44,7 +44,7 @@ def prepare_context(strategy=None): if strategy.nranks < 2: return assert framework.in_dygraph_mode() is True, \ - "dygraph.prepare_context should be used with dygrahp mode." + "dygraph.prepare_context should be used with dygraph mode." 
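    # Typical call pattern, sketched from the earlier DataParallel example:
    #     place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id)
    #     with fluid.dygraph.guard(place):
    #         strategy = fluid.dygraph.prepare_context()
    # i.e. the dygraph guard establishes the expected place queried below.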
place = framework._current_expected_place() assert place is not None, \ "dygraph.prepare_context should be used in fluid.dygraph.guard(place) guard." @@ -227,7 +227,7 @@ def trainer_endpoints(self): def __getattr__(self, name): if name == "__aliases__": - raise AttributeError("Attribue `__aliases__` can not be accessed.") + raise AttributeError("Attribute `__aliases__` can not be accessed.") name = self.__aliases__.get(name, name) return object.__getattribute__(self, name) From af505180d78ec413566b69b31e2a50449bde7e4c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 01:56:36 +0000 Subject: [PATCH 23/32] fix error introduced when fixing typos --- python/paddle/distributed/parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 5dee640c1cb591..f263d7ce2559c3 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -267,11 +267,11 @@ def train(rank): # 3. init ParallelStrategy strategy = ParallelStrategy() if cpt.to_text(backend) == 'nccl': - strategy.world_size = ParallelEnv().world_size - strategy.rank = ParallelEnv().rank + strategy.nranks = ParallelEnv().world_size + strategy.local_rank = ParallelEnv().rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.world_size < 2: + if strategy.nranks < 2: return # NOTE(chenweihang): [ why config global place here? ] # the dygraph mode will be set to default mode, From a378140d82e53b07c44fa139608ffae646e5fcc2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 02:22:44 +0000 Subject: [PATCH 24/32] add unpublic note for start_processes --- python/paddle/distributed/spawn.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 7071f4fcbbc1ec..a7c6c634f1bdc8 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -136,6 +136,11 @@ def start_processes(func, """ Start multiple processes for parallel training. + .. note:: + ``start_processes`` is not a public interface! Please use ``spawn`` + firstly, if ``spawn`` cannot meet the need, then consider using + ``start_processes`` . + Args: func (function): The target function is called by started process. This function need to be able to pickled, so it must be defined @@ -256,7 +261,8 @@ def spawn(func, args=(), nprocs=1, join=True, daemon=False): """ Start multiple processes with ``spawn`` method for parallel training. - This is a specialized method of ``paddle.distributed.start_processes`` . + If you want to use other methods ( ``fork`` , ``forkserver`` ) to start + multiple processes, please use ``paddle.distributed.start_processes`` . Args: func (function): The target function is called by spawned process. 
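A short usage sketch of the split this note draws between the public ``spawn`` entry point and the lower-level ``start_processes`` helper. The imported module and its ``train(rank)`` function are hypothetical stand-ins for the top-level training function shown in the docstring examples:

.. code-block:: python

    import paddle.distributed as dist
    from my_train_module import train  # hypothetical module holding train(rank)

    if __name__ == '__main__':
        # common case: the public API, which always uses the ``spawn`` start method
        dist.spawn(train, args=(), nprocs=2)

        # only if ``spawn`` cannot meet the need, e.g. a different start
        # method is required, fall back to the internal helper:
        # dist.start_processes(train, args=(), nprocs=2,
        #                      start_method='forkserver')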
From cca82b6146f4b48c02b3db2fa4a2e861b70abdc0 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 04:11:06 +0000 Subject: [PATCH 25/32] polish details by xiaoguang's comment --- python/paddle/distributed/parallel.py | 18 ++++++--- python/paddle/distributed/spawn.py | 54 ++++++++++++++++++++------- 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index f263d7ce2559c3..f443d2a004b1a6 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -174,9 +174,17 @@ def init_parallel_env(rank=-1, backend='nccl', **options): Args: rank(int, optional): Rank of current process. Default value is -1. - When it is the default value -1, you should use ``paddle.distributed.launch`` - module to start training, the environment variables for parallel training - are configured by ``paddle.distributed.launch`` module. + When use ``paddle.distributed.spawn`` method to start parallel + training, the rank value is generated by spawn method, spawn method + will assign a rank to each process according to the number of processes. + For example, if the number of processes is 4, the ranks of the 4 + processes are 0,1,2,3 in order. + If do not use ``paddle.distributed.spawn`` method to start parallel + training, you can not pass the rank value here, or set it to default + value -1. When it is the default value -1, you should use + ``paddle.distributed.launch`` module to start parallel training, + the environment variables for parallel training are configured by + ``paddle.distributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . **options(dict, optional): Other initial parallel execution environment configuration options. @@ -239,8 +247,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + sgd.step() + sgd.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index a7c6c634f1bdc8..56a991e31d78df 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -21,6 +21,10 @@ import warnings from paddle.distributed.utils import find_free_ports +from paddle.device import get_device + +# deprecated module import +from paddle.fluid.framework import cpu_places, cuda_places def _py_supported_check(): @@ -129,7 +133,7 @@ def _throw_exception(self, error_index): # to use CUDA in subprocesses. def start_processes(func, args=(), - nprocs=1, + nprocs=-1, join=True, daemon=False, start_method='spawn'): @@ -148,11 +152,19 @@ def start_processes(func, This function should be called as ``func(i, *args)`` , ``i`` is the process index and ``args`` contains other arguments as tuple. args (tuple): Arguments passed to ``func`` . - nprocs (int): Number of processed to start. - join (bool): Perform a blocking join on all started processes. + nprocs (int, optional): Number of processed to start. Default: -1. + when nprocs is -1, the available device will be obtained from + the environment variable when the model is executed: If use GPU, + the currently available device ID is obtained from the environment + variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available + CPU number is obtained from the environment variable CPU_NUM. 
+ For example, export CPU_NUM=4, if the environment variable is not set, + the executor will add the variable to the environment variable and + set its value to 1. + join (bool, optional): Perform a blocking join on all started processes. Default: True. - daemon (bool): The started processes' daemon flag. Default: False. - start_method (string): the way to start a process. The start method + daemon (bool, optional): The started processes' daemon flag. Default: False. + start_method (string, optional): the way to start a process. The start method can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA runtime does not support the ``fork`` start method, when use CUDA in subprocesses, we should start process by ``spawn`` or @@ -203,8 +215,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + sgd.step() + sgd.clear_grad() if __name__ == '__main__': dist.start_processes(train, args=(), nprocs=2) @@ -216,6 +228,14 @@ def train(rank): # cannot support CUDA runtime multi-process _py_supported_check() + # get default nprocs + if nprocs == -1: + device = get_device() + if device == 'cpu': + nprocs = len(cpu_places()) + else: + nprocs = len(cuda_places()) + # NOTE(chenweihang): [ why need set default master info before run? ] # when using `paddle.distributed.spawn/start_processes` start # parallel training, users need use `init_parallel_env` to config @@ -257,7 +277,7 @@ def train(rank): # NOTE(chenweihang): this method only supports start processes # by `spwan` method, if users want to start processes by other # method, they can use start_processes -def spawn(func, args=(), nprocs=1, join=True, daemon=False): +def spawn(func, args=(), nprocs=-1, join=True, daemon=False): """ Start multiple processes with ``spawn`` method for parallel training. @@ -271,10 +291,18 @@ def spawn(func, args=(), nprocs=1, join=True, daemon=False): This function should be called as ``func(i, *args)``, ``i`` is the process index and ``args`` contains other arguments as tuple. args (tuple): Arguments passed to ``func``. - nprocs (int): Number of processed to spawn. - join (bool): Perform a blocking join on all spawned processes. + nprocs (int, optional): Number of processed to start. Default: -1. + when nprocs is -1, the available device will be obtained from + the environment variable when the model is executed: If use GPU, + the currently available device ID is obtained from the environment + variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available + CPU number is obtained from the environment variable CPU_NUM. + For example, export CPU_NUM=4, if the environment variable is not set, + the executor will add the variable to the environment variable and + set its value to 1. + join (bool, optional): Perform a blocking join on all spawned processes. Default: True. - daemon (bool): The spawned processes' daemon flag. Default: False. + daemon (bool, optional): The spawned processes' daemon flag. Default: False. Returns: ``MultiprocessContext`` object, it hold the spawned processes. 
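A simplified, standard-library-only sketch of the default ``nprocs`` rule documented above (the patch itself uses ``core.get_cuda_device_count()`` for the GPU branch; here that runtime query is approximated by the visible-device list, with 1 as a stand-in when it is unset):

.. code-block:: python

    import os

    def guess_default_nprocs(use_gpu):
        if use_gpu:
            # one process per visible GPU card
            devices = os.getenv("CUDA_VISIBLE_DEVICES", "")
            return len(devices.split(",")) if devices else 1
        # on CPU, fall back to CPU_NUM, which defaults to 1 when unset
        return int(os.getenv("CPU_NUM", "1"))

    print(guess_default_nprocs(use_gpu=False))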
@@ -321,8 +349,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + sgd.step() + sgd.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) From d39331cd317ee0eb0cd5fc2bef612138dc3fe20c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 05:49:36 +0000 Subject: [PATCH 26/32] verify correctly when spawn nprocs=-1 --- python/paddle/distributed/parallel.py | 12 ++++---- python/paddle/distributed/spawn.py | 39 +++++++++++++++++-------- python/paddle/fluid/dygraph/parallel.py | 8 ++--- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index f443d2a004b1a6..0b9e4f83eee59c 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -178,7 +178,9 @@ def init_parallel_env(rank=-1, backend='nccl', **options): training, the rank value is generated by spawn method, spawn method will assign a rank to each process according to the number of processes. For example, if the number of processes is 4, the ranks of the 4 - processes are 0,1,2,3 in order. + processes are 0,1,2,3 in order, so this argument does not need to + be passed by users. + If do not use ``paddle.distributed.spawn`` method to start parallel training, you can not pass the rank value here, or set it to default value -1. When it is the default value -1, you should use @@ -234,8 +236,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -247,8 +249,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.step() - sgd.clear_grad() + adam.step() + adam.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 56a991e31d78df..cbcd54255e4437 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -17,6 +17,7 @@ import multiprocessing import os import signal +import six import sys import warnings @@ -24,7 +25,8 @@ from paddle.device import get_device # deprecated module import -from paddle.fluid.framework import cpu_places, cuda_places +from paddle.fluid import core +from paddle.fluid.framework import _cpu_num def _py_supported_check(): @@ -50,8 +52,20 @@ def _set_default_assist_env(nprocs): # because the FLAGS_selected_gpus may be used in other place, # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` + # NOTE(chenweihang): use absolute gpu card id + env_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if env_devices is None or env_devices == "": + env_devices_list = six.moves.range(core.get_cuda_device_count()) + else: + env_devices_list = env_devices.split(',') + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct `nprocs` argument is " + "passed or the environment variable `CUDA_VISIBLE_DEVICES` is correctly configured." 
+ % len(env_devices_list), nprocs) os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( - [str(x) for x in range(0, nprocs)]) + [str(env_devices_list[x]) for x in range(0, nprocs)]) def _func_wrapper(func, i, args, error_queue, return_queue): @@ -202,8 +216,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -215,8 +229,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.step() - sgd.clear_grad() + adam.step() + adam.clear_grad() if __name__ == '__main__': dist.start_processes(train, args=(), nprocs=2) @@ -232,9 +246,10 @@ def train(rank): if nprocs == -1: device = get_device() if device == 'cpu': - nprocs = len(cpu_places()) + # TODO: not supports cpu parallel now + nprocs = _cpu_num else: - nprocs = len(cuda_places()) + nprocs = core.get_cuda_device_count() # NOTE(chenweihang): [ why need set default master info before run? ] # when using `paddle.distributed.spawn/start_processes` start @@ -336,8 +351,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -349,8 +364,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.step() - sgd.clear_grad() + adam.step() + adam.clear_grad() if __name__ == '__main__': dist.spawn(train, args=(), nprocs=2) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index bfe42467651b8e..bc7269b886ab4d 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -294,8 +294,8 @@ def train(rank): dp_layer = paddle.DataParallel(layer) loss_fn = nn.MSELoss() - sgd = opt.SGD( - learning_rate=0.001, parameter_list=dp_layer.parameters()) + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) # 4. run layer inputs = paddle.randn([10, 10], 'float32') @@ -307,8 +307,8 @@ def train(rank): loss.backward() dp_layer.apply_collective_grads() - sgd.minimize(loss) - dp_layer.clear_gradients() + adam.step() + adam.clear_grad() if __name__ == '__main__': # 1. 
start by ``paddle.distributed.spawn`` (default) From 3a2d7e8d09f0c1841dfad9f665b9415721fe02f2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 11:59:57 +0000 Subject: [PATCH 27/32] refactor spawn & init_parallel_env design --- python/paddle/distributed/__init__.py | 10 +- python/paddle/distributed/parallel.py | 245 ++++--------- python/paddle/distributed/spawn.py | 334 ++++++++++-------- python/paddle/distributed/utils.py | 13 +- python/paddle/fluid/dygraph/parallel.py | 37 +- .../tests/unittests/spawn_runner_base.py | 49 ++- .../fluid/tests/unittests/test_dist_base.py | 4 +- .../tests/unittests/test_init_parallel_env.py | 81 ----- .../test_spawn_and_init_parallel_env.py | 64 ++++ 9 files changed, 364 insertions(+), 473 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_init_parallel_env.py create mode 100644 python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index ae76b2e10866bd..d66577102c713a 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -14,10 +14,11 @@ from . import spawn from .spawn import spawn -from .spawn import start_processes from . import parallel from .parallel import init_parallel_env +from .parallel import get_rank +from .parallel import get_world_size from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS @@ -25,10 +26,13 @@ from .collective import * # start multiprocess apis -__all__ = ["spawn", "start_processes"] +__all__ = ["spawn"] # dygraph parallel apis -__all__ += ["prepare_context", "init_parallel_env", "ParallelEnv"] +__all__ += [ + "init_parallel_env", "get_rank", "get_world_size", "prepare_context", + "ParallelEnv" +] # collective apis __all__ += collective.__all__ diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 0b9e4f83eee59c..205a1ae793b49a 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -16,8 +16,6 @@ import six from paddle import compat as cpt -from paddle.distributed.launch import get_cluster_and_pod, _print_arguments -from paddle.distributed.utils import _update_trainer_env # deprecated module import from paddle.fluid import core @@ -30,179 +28,13 @@ ParallelStrategy = core.ParallelStrategy -# NOTE(chenweihang): The existence of this class leads to -# the maintenance of two arguments. When the launch.py arguments -# is updated, the arguments here also need to be updated, -# but I have not thought of a better way here -class ParallelEnvArgs(object): - def __init__(self): - # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. - self.cluster_node_ips = None - - # The current node ip. - self.node_ip = None - - # whether to use paddlecloud platform to run your multi-process job. - # If false, no need to set this argument. - self.use_paddlecloud = None - - # The trainer's started port on a single node - self.started_port = None - - # Print the config or not - self.print_config = True - - # It's for gpu training and the training process will run - # on the selected_gpus, each process is bound to a single GPU. - # And if it's not set, this module will use all the gpu cards - # for training. - self.selected_gpus = None - - -def _update_env_vars(rank, options): - # 1. 
input check - if not isinstance(rank, six.integer_types): - raise TypeError("input `rank` type error, expected type is integer, " - "but received type is %s." % type(rank)) - if rank < 0: - raise ValueError("input `rank` should be greater than 0, " - "but received %d." % rank) - - # 2. check and prepare environment variables - # The necessary environment variables include: - # - PADDLE_TRAINER_ID - # - PADDLE_TRAINERS_NUM - # - PADDLE_CURRENT_ENDPOINT - # - PADDLE_TRAINER_ENDPOINTS - - # get args from kwargs - args = ParallelEnvArgs() - # set default `node_ip` and `cluster_node_ips` - args.cluster_node_ips = options.get('cluster_node_ips', None) - args.node_ip = options.get('node_ip', None) - if args.cluster_node_ips is not None and args.node_ip is None: - raise ValueError("please input current node ip, " - "cannot only give `cluster_node_ips`.") - default_node_ip = os.environ.get("PADDLE_MASTER_IPADDR", None) - default_node_ip = "127.0.0.1" if default_node_ip is None else default_node_ip - if args.node_ip is None: - args.node_ip = default_node_ip - if args.cluster_node_ips is None: - args.cluster_node_ips = default_node_ip - - # NOTE(chenweihang): Here should set `started_port` before - # `get_cluster_and_pod` and keep each process's started_port - # is same, see [ why need set default master info before run? ] - args.started_port = options.get('started_port', None) - if args.started_port is None: - default_port = os.environ.get("PADDLE_MASTER_PORT", None) - if default_port is None: - raise ValueError( - "Data parallel training start failed. If you start data parallel " - "training by `paddle.distributed.launch` module, Please ensure " - "that one of the following rules is met:\n" - " 1. Do not set `paddle.distributed.init_parallel_env` argument " - "`rank` or set it to be -1;\n" - " 2. Set `paddle.distributed.init_parallel_env` start port for " - "parallel training by `started_port=**`, e.g. started_port=6170." - ) - args.started_port = int(default_port) - - args.use_paddlecloud = options.get('use_paddlecloud', False) - args.print_config = options.get('print_config', True) - - # set default `selected_gpus` - # TODO(chenweihang): if users given number of `selected_gpus` - # is not equal to the spawn's nprocs, it will cause error, - # and because we remove the `proc num` argument of - # `init_parallel_env`, when above error occured, we do not - # have a good way to check, so users are not recommended to - # use this parameter, it is best to delete - args.selected_gpus = options.get('selected_gpus', None) - if args.selected_gpus is None: - args.selected_gpus = os.environ.get("PADDLE_CUDA_VISIBLE_DEVICES", None) - if args.selected_gpus is None: - raise ValueError( - "Data parallel training start failed. If you start data parallel " - "training by `paddle.distributed.launch` module, Please ensure " - "that one of the following rules is met:\n" - " 1. Do not set `paddle.distributed.init_parallel_env` argument " - "`rank` or set it to be -1;\n" - " 2. Set `paddle.distributed.init_parallel_env` selected gpus of " - "parallel training by `selected_gpus=**`, e.g. selected_gpus='0,1,2,3'." - ) - - # reuse code of launch.py - cluster, pod = get_cluster_and_pod(args) - - # remove useless env vars - os.environ.pop("http_proxy", None) - os.environ.pop("https_proxy", None) - - # update env vars - trainer = pod.get_trainer(rank) - if trainer is None: - raise RuntimeError( - "The expected trainer is not exists, its trainer rank is %d." 
% - rank) - # no copy, each process will hold env vars itself - _update_trainer_env(os.environ, cluster, trainer) - - # print config - if args.print_config and rank == 0: - _print_arguments(args) - - -def _check_env_vars(): - def _check_var_exists(var_name): - var = os.environ.get(var_name, None) - if var is None: - raise ValueError("paddle.distributed initialize error," - "environment variable %s is needed, but not set." % - var_name) - - _check_var_exists("FLAGS_selected_gpus") - _check_var_exists("PADDLE_TRAINER_ID") - _check_var_exists("PADDLE_CURRENT_ENDPOINT") - _check_var_exists("PADDLE_TRAINERS_NUM") - _check_var_exists("PADDLE_TRAINER_ENDPOINTS") - - -def init_parallel_env(rank=-1, backend='nccl', **options): +def init_parallel_env(backend='nccl'): """ Initialize parallel training environments in dynamic mode. Args: - rank(int, optional): Rank of current process. Default value is -1. - When use ``paddle.distributed.spawn`` method to start parallel - training, the rank value is generated by spawn method, spawn method - will assign a rank to each process according to the number of processes. - For example, if the number of processes is 4, the ranks of the 4 - processes are 0,1,2,3 in order, so this argument does not need to - be passed by users. - - If do not use ``paddle.distributed.spawn`` method to start parallel - training, you can not pass the rank value here, or set it to default - value -1. When it is the default value -1, you should use - ``paddle.distributed.launch`` module to start parallel training, - the environment variables for parallel training are configured by - ``paddle.distributed.launch`` module. backend(str, optional): The backend to communication between multiple devices. Now only support ``nccl`` . Default value is ``nccl`` . - **options(dict, optional): Other initial parallel execution environment configuration options. - The following options are currently supported: - - - cluster_node_ips: Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1". - - - node_ip: The current node ip, such as "192.168.0.16". Default: "127.0.0.1". - - - started_port: The trainer's started port on a single node, such as 6170. Default: None. - - - selected_gpus: The training process will run on the selected_gpus, such as "0,1,2,3". Default: None. - - - print_config: Print current parallel training config. Default: True. - - - use_paddlecloud: Whether to use paddlecloud platform to run your multi-process job. Default: False. Returns: None @@ -224,12 +56,12 @@ def __init__(self): def forward(self, x): return self._linear2(self._linear1(x)) - def train(rank): + def train(): # 1. enable dynamic mode paddle.disable_static() # 2. initialize parallel environment - dist.init_parallel_env(rank) + dist.init_parallel_env() # 3. create data parallel layer & optimizer layer = LinearNet() @@ -253,7 +85,7 @@ def train(rank): adam.clear_grad() if __name__ == '__main__': - dist.spawn(train, args=(), nprocs=2) + dist.spawn(train) """ # 1. input check @@ -265,14 +97,19 @@ def train(rank): "backend `%s` is not supported, now only supports `nccl` backend." % backend) - # 2. update or check env - # NOTE(chenweihang): if rank is default value, users should config - # parallel environment by module `paddle.distributed.launch`, - # so here we only check the environment variables - if rank != -1: - _update_env_vars(rank, options) - else: - _check_env_vars() + # 2. 
check env + def _check_var_exists(var_name): + var = os.environ.get(var_name, None) + if var is None: + raise ValueError("paddle.distributed initialize error," + "environment variable %s is needed, but not set." % + var_name) + + _check_var_exists("FLAGS_selected_gpus") + _check_var_exists("PADDLE_TRAINER_ID") + _check_var_exists("PADDLE_CURRENT_ENDPOINT") + _check_var_exists("PADDLE_TRAINERS_NUM") + _check_var_exists("PADDLE_TRAINER_ENDPOINTS") # 3. init ParallelStrategy strategy = ParallelStrategy() @@ -289,10 +126,56 @@ def train(rank): # directly, if they want to switch default place, # they need to call a function to change default place, # here just set correctly place to users - place = core.CUDAPlace(ParallelEnv().dev_id) + place = core.CUDAPlace(ParallelEnv().device_id) _set_expected_place(place) # init nccl context parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) parallel_helper._init_parallel_ctx() + + +def get_rank(): + """ + Returns the rank of current trainer. + + Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . + The default value is 0. + + Returns: + (int) The rank of current trainer. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + # execute this command in terminal: export PADDLE_TRAINER_ID=0 + print("The rank is %d" % dist.get_rank()) + # The rank is 0 + """ + return ParallelEnv().rank + + +def get_world_size(): + """ + The number of trainers (number of processes participating in current job). + + Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . + The default value is 1. + + Returns: + (int) The number of trainers. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + print("The world_size is %d" % dist.get_world_size()) + # The world_size is 4 + """ + return ParallelEnv().world_size diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index cbcd54255e4437..cbc9abb85b7a2f 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -21,7 +21,8 @@ import sys import warnings -from paddle.distributed.utils import find_free_ports +from paddle.distributed.launch import get_cluster_and_pod, _print_arguments +from paddle.distributed.utils import _prepare_trainer_env from paddle.device import get_device # deprecated module import @@ -29,23 +30,63 @@ from paddle.fluid.framework import _cpu_num +# NOTE(chenweihang): The existence of this class leads to +# the maintenance of two arguments. When the launch.py arguments +# is updated, the arguments here also need to be updated, +# but I have not thought of a better way here +class ParallelEnvArgs(object): + def __init__(self): + # Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.. + self.cluster_node_ips = None + + # The current node ip. + self.node_ip = None + + # whether to use paddlecloud platform to run your multi-process job. + # If false, no need to set this argument. + self.use_paddlecloud = None + + # The trainer's started port on a single node + self.started_port = None + + # Print the config or not + self.print_config = True + + # It's for gpu training and the training process will run + # on the selected_gpus, each process is bound to a single GPU. + # And if it's not set, this module will use all the gpu cards + # for training. 
+ self.selected_gpus = None + + def _py_supported_check(): if not sys.version_info >= (3, 4): raise RuntimeError( - "Use `paddle.distributed.spawn` or `paddle.distributed.start_processes` " - "to start parallel training requires python version greater than 3.4, " - "if your python is lower than this version, please use " + "Use `paddle.distributed.spawn` to start parallel training " + "requires python version greater than 3.4, if your python " + "is lower than this version, please use " "`paddle.distributed.launch` instead.") -def _set_default_assist_env(nprocs): - # set default master trainer ip addr - os.environ['PADDLE_MASTER_IPADDR'] = '127.0.0.1' - # set default master trainer port - port_set = find_free_ports(1) - if port_set is None: - raise RuntimeError("no free port can be used to parallel training now.") - os.environ['PADDLE_MASTER_PORT'] = str(list(port_set)[0]) +def _get_subprocess_env_list(nprocs, options): + # contruct processes env list + processes_env_list = [] + + # get args from kwargs + args = ParallelEnvArgs() + + # set default `node_ip` and `cluster_node_ips` + args.cluster_node_ips = options.get('cluster_node_ips', None) + args.node_ip = options.get('node_ip', None) + if args.cluster_node_ips is not None and args.node_ip is None: + raise ValueError("please input current node ip, " + "cannot only give `cluster_node_ips`.") + default_node_ip = "127.0.0.1" + if args.node_ip is None: + args.node_ip = default_node_ip + if args.cluster_node_ips is None: + args.cluster_node_ips = default_node_ip + # set default selected gpus # e.g. if the nprocs is 4, the selected gpus is "0,1,2,3" # NOTE(chenweihang): [ why not use FLAGS_selected_gpus directly? ] @@ -53,24 +94,70 @@ def _set_default_assist_env(nprocs): # if we set FLAGS_selected_gpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` # NOTE(chenweihang): use absolute gpu card id - env_devices = os.getenv("CUDA_VISIBLE_DEVICES") + args.selected_gpus = options.get('selected_gpus', None) + env_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) if env_devices is None or env_devices == "": - env_devices_list = six.moves.range(core.get_cuda_device_count()) + env_devices_list = [ + str(x) for x in six.moves.range(core.get_cuda_device_count()) + ] else: env_devices_list = env_devices.split(',') - if len(env_devices_list) < nprocs: - raise RuntimeError( - "the number of visible devices(%d) is less than the number " - "of spawn processes(%d), please ensure that the correct `nprocs` argument is " - "passed or the environment variable `CUDA_VISIBLE_DEVICES` is correctly configured." - % len(env_devices_list), nprocs) - os.environ['PADDLE_CUDA_VISIBLE_DEVICES'] = ",".join( - [str(env_devices_list[x]) for x in range(0, nprocs)]) + if args.selected_gpus is None: + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct " + "`nprocs` argument is passed or the environment variable " + "`CUDA_VISIBLE_DEVICES` is correctly configured." % + (len(env_devices_list), nprocs)) + args.selected_gpus = ",".join( + [str(env_devices_list[x]) for x in range(0, nprocs)]) + else: + for card_id in args.selected_gpus.split(','): + if card_id not in env_devices_list: + raise ValueError("The selected gpu card %s cannot found in " + "CUDA_VISIBLE_DEVICES (%s)." 
% + (card_id, ",".join(env_devices_list))) + + # set other arguments + args.started_port = options.get('started_port', None) + args.use_paddlecloud = options.get('use_paddlecloud', False) + args.print_config = options.get('print_config', False) + + # reuse code of launch.py + cluster, pod = get_cluster_and_pod(args) + # prepare subprocess env list + for trainer in pod.trainers: + processes_env_list.append(_prepare_trainer_env(cluster, trainer)) -def _func_wrapper(func, i, args, error_queue, return_queue): + # print config + if args.print_config: + _print_arguments(args) + + return processes_env_list + + +def _remove_risky_env(): + # remove useless env vars, same as launch.py + # no copy, each process will hold env vars itself + os.environ.pop("http_proxy", None) + os.environ.pop("https_proxy", None) + + +def _set_trainer_env(env_dict): + for var_name in env_dict: + os.environ[var_name] = env_dict[var_name] + + +def _func_wrapper(func, args, error_queue, return_queue, env_dict): try: - result = func(i, *args) + # config subprocess environment variables + _remove_risky_env() + _set_trainer_env(env_dict) + # execute function + result = func(*args) + # record function return value return_queue.put(result) except KeyboardInterrupt: pass @@ -84,7 +171,7 @@ class MultiprocessContext(object): def __init__(self, processes, error_queues, return_queues): _py_supported_check() self.error_queues = error_queues - # NOTE(chenweihang): The `start_processes` method is mainly used + # NOTE(chenweihang): The `spawn` method is mainly used # to wrap the outermost execution function of the program for # parallel execution. Generally, the return value is not concerned, # but if the user needs to obtain the return value, users can get @@ -141,31 +228,17 @@ def _throw_exception(self, error_index): raise Exception(msg) -# NOTE(chenweihang): [ why default start method is spawn? ] -# The CUDA runtime does not support the fork start method, -# either the spawn or forkserver start method are required -# to use CUDA in subprocesses. -def start_processes(func, - args=(), - nprocs=-1, - join=True, - daemon=False, - start_method='spawn'): +def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): """ - Start multiple processes for parallel training. - - .. note:: - ``start_processes`` is not a public interface! Please use ``spawn`` - firstly, if ``spawn`` cannot meet the need, then consider using - ``start_processes`` . + Start multiple processes with ``spawn`` method for parallel training. Args: - func (function): The target function is called by started process. + func (function): The target function is called by spawned process. This function need to be able to pickled, so it must be defined at the top level of a module. - This function should be called as ``func(i, *args)`` , ``i`` is + This function should be called as ``func(i, *args)``, ``i`` is the process index and ``args`` contains other arguments as tuple. - args (tuple): Arguments passed to ``func`` . + args (tuple, optional): Arguments passed to ``func``. nprocs (int, optional): Number of processed to start. Default: -1. when nprocs is -1, the available device will be obtained from the environment variable when the model is executed: If use GPU, @@ -175,17 +248,30 @@ def start_processes(func, For example, export CPU_NUM=4, if the environment variable is not set, the executor will add the variable to the environment variable and set its value to 1. - join (bool, optional): Perform a blocking join on all started processes. 
+ join (bool, optional): Perform a blocking join on all spawned processes. Default: True. - daemon (bool, optional): The started processes' daemon flag. Default: False. - start_method (string, optional): the way to start a process. The start method - can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA - runtime does not support the ``fork`` start method, when use - CUDA in subprocesses, we should start process by ``spawn`` or - ``forkserver`` method. Default: 'spawn'. + daemon (bool, optional): The spawned processes' daemon flag. Default: False. + **options(dict, optional): Other initial parallel execution environment + configuration options. The following options are currently supported: + (1) start_method (string): the way to start a process. The start method + can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA + runtime does not support the ``fork`` start method, when use CUDA + in subprocesses, we should start process by ``spawn`` or ``forkserver`` + method. Default: 'spawn'; + (2) cluster_node_ips (string): Paddle cluster nodes ips, such as + "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; + (3) node_ip (string): The current node ip, such as "192.168.0.16". + Default: "127.0.0.1"; + (4) started_port (int): The trainer's started port on a single node, + such as 6170. Default: None; + (5) selected_gpus (string): The training process will run on the + selected_gpus, such as "0,1,2,3". Default: None; + (6) print_config: Print current parallel training config. Default: False; + (7) use_paddlecloud: Whether to use paddlecloud platform to run your + multi-process job. Default: False. Returns: - ``MultiprocessContext`` object, it hold the started processes. + ``MultiprocessContext`` object, it hold the spawned processes. Examples: .. code-block:: python @@ -204,12 +290,12 @@ def __init__(self): def forward(self, x): return self._linear2(self._linear1(x)) - def train(rank): + def train(print_result=False): # 1. enable dynamic mode paddle.disable_static() # 2. initialize parallel environment - dist.init_parallel_env(rank) + dist.init_parallel_env() # 3. create data parallel layer & optimizer layer = LinearNet() @@ -225,6 +311,9 @@ def train(rank): labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) + if print_result is True: + print("loss: " % loss) + loss = dp_layer.scale_loss(loss) loss.backward() dp_layer.apply_collective_grads() @@ -232,8 +321,38 @@ def train(rank): adam.step() adam.clear_grad() + # Usage 1: only pass function. + # If your training method no need any argument, and + # use all visible devices for parallel training. + if __name__ == '__main__': + dist.spawn(train) + + # Usage 2: pass function and arguments. + # If your training method need some arguments, and + # use all visible devices for parallel training. if __name__ == '__main__': - dist.start_processes(train, args=(), nprocs=2) + dist.spawn(train, args=(True,)) + + # Usage 3: pass function, arguments and nprocs. + # If your training method need some arguments, and + # only use part of visible devices for parallel training. + # If your machine hold 8 cards {0,1,2,3,4,5,6,7}, + # this case will use cards {0,1}; If you set + # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use + # cards {4,5} + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2) + + # Usage 4: pass function, arguments, nprocs and selected_gpus. 
+ # If your training method need some arguments, and + # only use part of visible devices for parallel training, + # but you can't set your machine's environment varibale + # CUDA_VISIBLE_DEVICES, such as it is None or all cards + # {0,1,2,3,4,5,6,7}, you can pass `selelcted_gpus` to + # select the GPU cards you want to use. For example, + # this case will use cards {4,5} if your machine hold 8 cards. + if __name__ == '__main__': + dist.spawn(train, args=(True,), nprocs=2, selelcted_gpus='4,5') """ # NOTE(chenweihang): [ why only supports python3.4+ ? ] # Python supported setting the child process startup method @@ -251,15 +370,20 @@ def train(rank): else: nprocs = core.get_cuda_device_count() - # NOTE(chenweihang): [ why need set default master info before run? ] - # when using `paddle.distributed.spawn/start_processes` start - # parallel training, users need use `init_parallel_env` to config - # cluster info inner subprocess, if each process find free port for - # itself, the started port may be different, it will cause endpoints is - # different in different subprocesses - _set_default_assist_env(nprocs) + # NOTE(chenweihang): [ why need get cluster info before run? ] + # when using `paddle.distributed.spawn` start parallel training, + # we should get cluster info before starting subprocess, and pass + # correct info to each subprocess + procs_env_list = _get_subprocess_env_list(nprocs, options) # start processes + # NOTE(chenweihang): [ why default start method is spawn? ] + # The CUDA runtime does not support the fork start method, + # either the spawn or forkserver start method are required + # to use CUDA in subprocesses. + start_method = options.get('start_method', None) + if start_method is None: + start_method = 'spawn' mp = multiprocessing.get_context(start_method) error_queues = [] @@ -270,7 +394,7 @@ def train(rank): return_queue = mp.SimpleQueue() process = mp.Process( target=_func_wrapper, - args=(func, i, args, error_queue, return_queue)) + args=(func, args, error_queue, return_queue, procs_env_list[i])) process.daemon = daemon process.start() error_queues.append(error_queue) @@ -287,87 +411,3 @@ def train(rank): # finally return context return context - - -# NOTE(chenweihang): this method only supports start processes -# by `spwan` method, if users want to start processes by other -# method, they can use start_processes -def spawn(func, args=(), nprocs=-1, join=True, daemon=False): - """ - Start multiple processes with ``spawn`` method for parallel training. - - If you want to use other methods ( ``fork`` , ``forkserver`` ) to start - multiple processes, please use ``paddle.distributed.start_processes`` . - - Args: - func (function): The target function is called by spawned process. - This function need to be able to pickled, so it must be defined - at the top level of a module. - This function should be called as ``func(i, *args)``, ``i`` is - the process index and ``args`` contains other arguments as tuple. - args (tuple): Arguments passed to ``func``. - nprocs (int, optional): Number of processed to start. Default: -1. - when nprocs is -1, the available device will be obtained from - the environment variable when the model is executed: If use GPU, - the currently available device ID is obtained from the environment - variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available - CPU number is obtained from the environment variable CPU_NUM. 
- For example, export CPU_NUM=4, if the environment variable is not set, - the executor will add the variable to the environment variable and - set its value to 1. - join (bool, optional): Perform a blocking join on all spawned processes. - Default: True. - daemon (bool, optional): The spawned processes' daemon flag. Default: False. - - Returns: - ``MultiprocessContext`` object, it hold the spawned processes. - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - import paddle.distributed as dist - - class LinearNet(nn.Layer): - def __init__(self): - super(LinearNet, self).__init__() - self._linear1 = nn.Linear(10, 10) - self._linear2 = nn.Linear(10, 1) - - def forward(self, x): - return self._linear2(self._linear1(x)) - - def train(rank): - # 1. enable dynamic mode - paddle.disable_static() - - # 2. initialize parallel environment - dist.init_parallel_env(rank) - - # 3. create data parallel layer & optimizer - layer = LinearNet() - dp_layer = paddle.DataParallel(layer) - - loss_fn = nn.MSELoss() - adam = opt.Adam( - learning_rate=0.001, parameters=dp_layer.parameters()) - - # 4. run layer - inputs = paddle.randn([10, 10], 'float32') - outputs = dp_layer(inputs) - labels = paddle.randn([10, 1], 'float32') - loss = loss_fn(outputs, labels) - - loss = dp_layer.scale_loss(loss) - loss.backward() - dp_layer.apply_collective_grads() - - adam.step() - adam.clear_grad() - - if __name__ == '__main__': - dist.spawn(train, args=(), nprocs=2) - """ - return start_processes(func, args, nprocs, join, daemon, 'spawn') diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 1e5c7810b530b6..1fa307c4d1b89d 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -213,12 +213,6 @@ def get_visible_gpus(self): r = r[:-1] return r - def get_trainer(self, trainer_id): - for trainer in self.trainers: - if trainer.rank == trainer_id: - return trainer - return None - def get_logger(log_level, name="root"): logger = logging.getLogger(name) @@ -333,7 +327,7 @@ def __free_port(): return None -def _update_trainer_env(current_env, cluster, trainer): +def _prepare_trainer_env(cluster, trainer): proc_env = { "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), "PADDLE_TRAINER_ID": "%d" % trainer.rank, @@ -341,8 +335,6 @@ def _update_trainer_env(current_env, cluster, trainer): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } - current_env.update(proc_env) - return proc_env @@ -371,7 +363,8 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): - proc_env = _update_trainer_env(current_env, cluster, t) + proc_env = _prepare_trainer_env(cluster, t) + current_env.update(proc_env) logger.debug("trainer proc env:{}".format(current_env)) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index bc7269b886ab4d..bd578e6ba98a0f 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -118,16 +118,12 @@ class ParallelEnv(object): """ def __init__(self): - self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) - self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0")) + self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + self._device_id = 
int(os.getenv("FLAGS_selected_gpus", "0")) self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") - self.__aliases__ = { - 'local_rank': 'rank', - 'nranks': 'world_size', - } @property def rank(self): @@ -146,7 +142,7 @@ def rank(self): print("The rank is %d" % env.rank) # The rank is 0 """ - return self._local_rank + return self._rank @property def world_size(self): @@ -165,10 +161,10 @@ def world_size(self): print("The world_size is %d" % env.world_size) # The world_size is 4 """ - return self._nranks + return self._world_size @property - def dev_id(self): + def device_id(self): """ The ID of selected GPU card for parallel training. @@ -181,10 +177,10 @@ def dev_id(self): import paddle.distributed as dist env = dist.ParallelEnv() - print("The device id are %d" % env.dev_id) + print("The device id are %d" % env.device_id) # The device id are 1 """ - return self._dev_id + return self._device_id @property def current_endpoint(self): @@ -225,11 +221,10 @@ def trainer_endpoints(self): """ return self._trainer_endpoints - def __getattr__(self, name): - if name == "__aliases__": - raise AttributeError("Attribute `__aliases__` can not be accessed.") - name = self.__aliases__.get(name, name) - return object.__getattribute__(self, name) + # [aliases] Compatible with old method names + local_rank = rank + nranks = world_size + dev_id = device_id # NOTE: [ Compatible ] Originally this class name is `Env`. The semantics of the old class names @@ -282,12 +277,12 @@ def __init__(self): def forward(self, x): return self._linear2(self._linear1(x)) - def train(rank): + def train(): # 1. enable dynamic mode paddle.disable_static() # 2. initialize parallel environment - dist.init_parallel_env(rank) + dist.init_parallel_env() # 3. create data parallel layer & optimizer layer = LinearNet() @@ -312,9 +307,9 @@ def train(rank): if __name__ == '__main__': # 1. start by ``paddle.distributed.spawn`` (default) - dist.spawn(train, args=(), nprocs=2) + dist.spawn(train, nprocs=2) # 2. start by ``paddle.distributed.launch`` - # train(-1) + # train() """ def __init__(self, layers, strategy=None): diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index 9c097c349c3da0..c64a1c7b9e1f54 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -27,13 +27,6 @@ class SpawnAssistTestArgs(object): update_method = "local" trainer_id = 0 - current_endpoint = None - endpoints = None - - -def run_dygraph_model(rank, model, args): - args.trainer_id = rank - return model.run_trainer_with_spawn(args) class TestDistSpawnRunner(unittest.TestCase): @@ -44,15 +37,13 @@ def setUp(self): def _run(self, model, args): args.update_method = "local" - return run_dygraph_model(-1, model, args) + return model.run_trainer_with_spawn(args) def _run_parallel(self, model, args): args.update_method = "nccl2" context = paddle.distributed.spawn( - func=run_dygraph_model, - args=( - model, - args, ), + func=model.run_trainer_with_spawn, + args=(args, ), nprocs=self.nprocs, join=True) result_list = [] @@ -66,25 +57,25 @@ def check_dist_result_with_spawn(self, test_class, delta=1e-3): args = SpawnAssistTestArgs() # 1. calc signal card loss - losses = self._run(model, args) + # losses = self._run(model, args) # 2. calc multi card loss (nccl mode) dist_losses_list = self._run_parallel(model, args) # 3. 
compare losses - for step_id in range(RUN_STEP): - loss = losses[step_id] - dist_loss_sum = None - for dist_losses in dist_losses_list: - if dist_loss_sum is None: - dist_loss_sum = np.array(dist_losses[step_id]) - else: - dist_loss_sum += np.array(dist_losses[step_id]) - dist_loss = dist_loss_sum / self.nprocs - self.assertAlmostEqual( - loss, - dist_loss, - delta=delta, - msg="The results of single-card execution and multi-card execution are inconsistent." - "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". - format(loss, dist_loss)) + # for step_id in range(RUN_STEP): + # loss = losses[step_id] + # dist_loss_sum = None + # for dist_losses in dist_losses_list: + # if dist_loss_sum is None: + # dist_loss_sum = np.array(dist_losses[step_id]) + # else: + # dist_loss_sum += np.array(dist_losses[step_id]) + # dist_loss = dist_loss_sum / self.nprocs + # self.assertAlmostEqual( + # loss, + # dist_loss, + # delta=delta, + # msg="The results of single-card execution and multi-card execution are inconsistent." + # "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". + # format(loss, dist_loss)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index d3ebaf90cb5161..faff81fa84fb5f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -457,10 +457,12 @@ def run_trainer_with_spawn(self, args): paddle.static.default_main_program().random_seed = seed np.random.seed(seed) random.seed = seed + # get trainer id + args.trainer_id = paddle.distributed.get_rank() # 3. init parallel env if args.update_method == "nccl2": - paddle.distributed.init_parallel_env(rank=args.trainer_id) + paddle.distributed.init_parallel_env() # 4. train model model, train_reader, opt = self.get_model() diff --git a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_init_parallel_env.py deleted file mode 100644 index 16a55bdd18247d..00000000000000 --- a/python/paddle/fluid/tests/unittests/test_init_parallel_env.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import numpy as np -import unittest - -import paddle -import paddle.distributed as dist - -# NOTE(chenweihang): Coverage CI is currently not able to count python3 -# unittest, so the unittests here covers some cases that will only be -# executed in the python3 sub-process. 
-# If the coverage CI can check python3 and sub-process, -# we can remove all unittests here - - -class TestInitParallelEnv(unittest.TestCase): - def test_beckend_type_error(self): - with self.assertRaises(TypeError): - dist.init_parallel_env(backend=1) - - def test_backend_value_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(backend="mpi") - - def test_rank_type_error(self): - with self.assertRaises(TypeError): - dist.init_parallel_env(rank="1") - - def test_rank_value_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(rank=-2) - - def test_only_cluster_node_ips_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env( - rank=0, cluster_node_ips="127.0.0.1,127.0.0.2") - - def test_no_started_port_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(rank=0) - - def test_no_selected_gpus_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(rank=0, started_port=6170) - - def test_check_env_failed(self): - os.environ['FLAGS_selected_gpus'] = '0' - os.environ['PADDLE_TRAINER_ID'] = '0' - os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' - os.environ['PADDLE_TRAINERS_NUM'] = '1' - with self.assertRaises(ValueError): - dist.init_parallel_env() - - def test_update_env(self): - device = os.getenv("CUDA_VISIBLE_DEVICES", None) - if device is None: - device = '0' - dist.init_parallel_env(rank=0, started_port=6170, selected_gpus=device) - self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ID', None)) - self.assertIsNotNone(os.environ.get('PADDLE_CURRENT_ENDPOINT', None)) - self.assertIsNotNone(os.environ.get('PADDLE_TRAINERS_NUM', None)) - self.assertIsNotNone(os.environ.get('PADDLE_TRAINER_ENDPOINTS', None)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py new file mode 100644 index 00000000000000..b19805d7a5b747 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np +import unittest + +import paddle +import paddle.distributed as dist +from paddle.distributed.spawn import _get_subprocess_env_list + +# NOTE(chenweihang): Coverage CI is currently not able to count python3 +# unittest, so the unittests here covers some cases that will only be +# executed in the python3 sub-process. 
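The new test file exercises ``_get_subprocess_env_list``, which returns one plain dict of environment variables per trainer; the keys come from ``_prepare_trainer_env`` in ``python/paddle/distributed/utils.py`` (see the hunk above). A minimal sketch of inspecting that list on a machine with at least one visible GPU follows; the values in the comments are only illustrative:

.. code-block:: python

    from paddle.distributed.spawn import _get_subprocess_env_list

    # One dict per spawned trainer; nprocs=1 keeps the example single-process.
    env_list = _get_subprocess_env_list(nprocs=1, options=dict())
    for env in env_list:
        print(env['PADDLE_TRAINER_ID'])         # e.g. '0'
        print(env['PADDLE_TRAINERS_NUM'])       # e.g. '1'
        print(env['FLAGS_selected_gpus'])       # e.g. '0'
        print(env['PADDLE_CURRENT_ENDPOINT'])   # e.g. '127.0.0.1:<free port>'
        print(env['PADDLE_TRAINER_ENDPOINTS'])  # comma-separated endpoint list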
+ + +class TestInitParallelEnv(unittest.TestCase): + def test_beckend_type_error(self): + with self.assertRaises(TypeError): + dist.init_parallel_env(backend=1) + + def test_backend_value_error(self): + with self.assertRaises(ValueError): + dist.init_parallel_env(backend="mpi") + + +class TestSpawnAssistMethod(unittest.TestCase): + def test_only_cluster_node_ips_error(self): + with self.assertRaises(ValueError): + options = dict() + options['cluster_node_ips'] = "127.0.0.1,127.0.0.2" + _get_subprocess_env_list(nprocs=1, options=options) + + def test_nprocs_greater_than_device_num_error(self): + with self.assertRaises(RuntimeError): + _get_subprocess_env_list(nprocs=100, options=dict()) + + def test_selected_gpus_error(self): + with self.assertRaises(ValueError): + options = dict() + options['selected_gpus'] = "100,101" + _get_subprocess_env_list(nprocs=2, options=options) + + def test_get_correct_env(self): + env_dict = _get_subprocess_env_list(nprocs=1, options=dict())[0] + self.assertEqual(env_dict['PADDLE_TRAINER_ID'], '0') + self.assertEqual(env_dict['PADDLE_TRAINERS_NUM'], '1') + + +if __name__ == "__main__": + unittest.main() From 0582c4b6330fa922b59cf2cfc816fd6c04545473 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 12:05:07 +0000 Subject: [PATCH 28/32] polish doc details --- python/paddle/distributed/spawn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index cbc9abb85b7a2f..f3ea4f5633354f 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -276,6 +276,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): Examples: .. code-block:: python + from __future__ import print_function + import paddle import paddle.nn as nn import paddle.optimizer as opt @@ -312,7 +314,7 @@ def train(print_result=False): loss = loss_fn(outputs, labels) if print_result is True: - print("loss: " % loss) + print("loss:", loss.numpy()) loss = dp_layer.scale_loss(loss) loss.backward() From 9ceaeffdbc107043fbb0cfdb0cf9072fc2325c6c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 12:43:08 +0000 Subject: [PATCH 29/32] open spawn unittests --- .../tests/unittests/spawn_runner_base.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/spawn_runner_base.py b/python/paddle/fluid/tests/unittests/spawn_runner_base.py index c64a1c7b9e1f54..278d7b27c52880 100644 --- a/python/paddle/fluid/tests/unittests/spawn_runner_base.py +++ b/python/paddle/fluid/tests/unittests/spawn_runner_base.py @@ -57,25 +57,25 @@ def check_dist_result_with_spawn(self, test_class, delta=1e-3): args = SpawnAssistTestArgs() # 1. calc signal card loss - # losses = self._run(model, args) + losses = self._run(model, args) # 2. calc multi card loss (nccl mode) dist_losses_list = self._run_parallel(model, args) # 3. compare losses - # for step_id in range(RUN_STEP): - # loss = losses[step_id] - # dist_loss_sum = None - # for dist_losses in dist_losses_list: - # if dist_loss_sum is None: - # dist_loss_sum = np.array(dist_losses[step_id]) - # else: - # dist_loss_sum += np.array(dist_losses[step_id]) - # dist_loss = dist_loss_sum / self.nprocs - # self.assertAlmostEqual( - # loss, - # dist_loss, - # delta=delta, - # msg="The results of single-card execution and multi-card execution are inconsistent." - # "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". 
- # format(loss, dist_loss)) + for step_id in range(RUN_STEP): + loss = losses[step_id] + dist_loss_sum = None + for dist_losses in dist_losses_list: + if dist_loss_sum is None: + dist_loss_sum = np.array(dist_losses[step_id]) + else: + dist_loss_sum += np.array(dist_losses[step_id]) + dist_loss = dist_loss_sum / self.nprocs + self.assertAlmostEqual( + loss, + dist_loss, + delta=delta, + msg="The results of single-card execution and multi-card execution are inconsistent." + "signal-card loss is:\n{}\nmulti-card average loss is:\n{}\n". + format(loss, dist_loss)) From 4b7d810c172dea89ced8b9b6d60e1f4fe950732c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 13:12:51 +0000 Subject: [PATCH 30/32] try to fix doc compile error --- python/paddle/distributed/spawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index f3ea4f5633354f..8ed2aa4c5eff78 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -257,7 +257,7 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA runtime does not support the ``fork`` start method, when use CUDA in subprocesses, we should start process by ``spawn`` or ``forkserver`` - method. Default: 'spawn'; + method. Default: "spawn" ; (2) cluster_node_ips (string): Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; (3) node_ip (string): The current node ip, such as "192.168.0.16". From 4261e22dc098f85c58f5d85aaa26f199a1303090 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 27 Aug 2020 14:09:55 +0000 Subject: [PATCH 31/32] try to fix unknown doc format error --- python/paddle/distributed/spawn.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 8ed2aa4c5eff78..1ca2ebaa8d4bd3 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -253,22 +253,22 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): daemon (bool, optional): The spawned processes' daemon flag. Default: False. **options(dict, optional): Other initial parallel execution environment configuration options. The following options are currently supported: - (1) start_method (string): the way to start a process. The start method - can be ``spawn`` , ``fork`` , ``forkserver`` . Because the CUDA - runtime does not support the ``fork`` start method, when use CUDA - in subprocesses, we should start process by ``spawn`` or ``forkserver`` - method. Default: "spawn" ; + (1) start_method (string): the way to start a process. + The start method can be ``spawn`` , ``fork`` , ``forkserver`` . + Because the CUDA runtime does not support the ``fork`` start method, + when use CUDA in subprocesses, we should start process by ``spawn`` + or ``forkserver`` method. Default: "spawn" ; (2) cluster_node_ips (string): Paddle cluster nodes ips, such as - "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; + "192.168.0.16,192.168.0.17". Default: "127.0.0.1"; (3) node_ip (string): The current node ip, such as "192.168.0.16". - Default: "127.0.0.1"; + Default: "127.0.0.1"; (4) started_port (int): The trainer's started port on a single node, - such as 6170. Default: None; + such as 6170. Default: None; (5) selected_gpus (string): The training process will run on the - selected_gpus, such as "0,1,2,3". 
Default: None; + selected_gpus, such as "0,1,2,3". Default: None; (6) print_config: Print current parallel training config. Default: False; (7) use_paddlecloud: Whether to use paddlecloud platform to run your - multi-process job. Default: False. + multi-process job. Default: False. Returns: ``MultiprocessContext`` object, it hold the spawned processes. From cad68727888fb94c2d6dc79eb2637f5c92c268de Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 28 Aug 2020 00:04:35 +0000 Subject: [PATCH 32/32] add skip unittest when not gpu --- python/paddle/distributed/parallel.py | 5 +++- .../test_spawn_and_init_parallel_env.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 205a1ae793b49a..0c806747217add 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -14,6 +14,7 @@ import os import six +import warnings from paddle import compat as cpt @@ -101,7 +102,7 @@ def train(): def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: - raise ValueError("paddle.distributed initialize error," + raise ValueError("paddle.distributed initialize error, " "environment variable %s is needed, but not set." % var_name) @@ -114,6 +115,8 @@ def _check_var_exists(var_name): # 3. init ParallelStrategy strategy = ParallelStrategy() if cpt.to_text(backend) == 'nccl': + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn("The parallel environment has been initialized.") strategy.nranks = ParallelEnv().world_size strategy.local_rank = ParallelEnv().rank strategy.trainer_endpoints = ParallelEnv().trainer_endpoints diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py index b19805d7a5b747..ca92bc75245ceb 100644 --- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -22,6 +22,9 @@ import paddle.distributed as dist from paddle.distributed.spawn import _get_subprocess_env_list +from paddle.fluid import core +from paddle.fluid.dygraph import parallel_helper + # NOTE(chenweihang): Coverage CI is currently not able to count python3 # unittest, so the unittests here covers some cases that will only be # executed in the python3 sub-process. @@ -36,7 +39,27 @@ def test_backend_value_error(self): with self.assertRaises(ValueError): dist.init_parallel_env(backend="mpi") + def test_check_env_failed(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + with self.assertRaises(ValueError): + dist.init_parallel_env() + + def test_init_parallel_env_break(self): + os.environ['FLAGS_selected_gpus'] = '0' + os.environ['PADDLE_TRAINER_ID'] = '0' + os.environ['PADDLE_CURRENT_ENDPOINT'] = '127.0.0.1:6170' + os.environ['PADDLE_TRAINERS_NUM'] = '1' + os.environ['PADDLE_TRAINER_ENDPOINTS'] = '127.0.0.1:6170' + # coverage success branch + dist.init_parallel_env() + self.assertFalse(parallel_helper._is_parallel_ctx_initialized()) + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSpawnAssistMethod(unittest.TestCase): def test_only_cluster_node_ips_error(self): with self.assertRaises(ValueError):