Add paddle.distributed dir and docs #2482
Changes from 12 commits
@@ -0,0 +1,13 @@
==================
paddle.distributed
==================

.. toctree::
    :maxdepth: 1

    distributed/get_rank.rst
    distributed/get_world_size.rst
    distributed/init_parallel_env.rst
    distributed/ParallelEnv.rst
    distributed/prepare_context.rst
    distributed/spawn.rst
@@ -0,0 +1,5 @@
.. _api_distributed_ParallelEnv:

ParallelEnv
-------------------------------
:doc_source: paddle.fluid.dygraph.parallel.ParallelEnv
@@ -0,0 +1,10 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

.. _api_distributed_get_rank:

get_rank
--------

.. autofunction:: paddle.distributed.get_rank
    :noindex:
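The English page pulls its body from the docstring via ``autofunction``. For quick reference, a minimal usage sketch, mirroring the example in the Chinese page further below (the rank comes from the ``PADDLE_TRAINER_ID`` environment variable and defaults to 0):

.. code-block:: python

    import paddle.distributed as dist

    # Run `export PADDLE_TRAINER_ID=0` in the terminal first (or rely on the
    # default of 0 when the variable is unset).
    print("The rank is %d" % dist.get_rank())
    # The rank is 0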
@@ -0,0 +1,10 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

.. _api_distributed_get_world_size:

get_world_size
--------------

.. autofunction:: paddle.distributed.get_world_size
    :noindex:
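Likewise generated from the docstring; a minimal usage sketch, mirroring the Chinese page further below (the world size comes from ``PADDLE_TRAINERS_NUM`` and defaults to 1):

.. code-block:: python

    import paddle.distributed as dist

    # Run `export PADDLE_TRAINERS_NUM=4` in the terminal first (or rely on the
    # default of 1 when the variable is unset).
    print("The world_size is %d" % dist.get_world_size())
    # The world_size is 4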
@@ -0,0 +1,10 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

.. _api_distributed_init_parallel_env:

init_parallel_env
-----------------

.. autofunction:: paddle.distributed.init_parallel_env
    :noindex:
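The rendered page again comes from the docstring; for context, here is a condensed version of the training example from the Chinese page further below (the explicit ``scale_loss`` / ``apply_collective_grads`` calls reflect this snapshot of the dygraph API):

.. code-block:: python

    import paddle
    import paddle.nn as nn
    import paddle.optimizer as opt
    import paddle.distributed as dist

    def train():
        paddle.disable_static()      # 1. enable dynamic (imperative) mode
        dist.init_parallel_env()     # 2. initialize the NCCL-based parallel env

        # 3. wrap the model for data-parallel training
        layer = nn.Linear(10, 1)
        dp_layer = paddle.DataParallel(layer)
        adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

        # 4. one forward/backward/update step
        inputs = paddle.randn([10, 10], 'float32')
        labels = paddle.randn([10, 1], 'float32')
        loss = nn.MSELoss()(dp_layer(inputs), labels)

        loss = dp_layer.scale_loss(loss)      # scale loss by trainer count
        loss.backward()
        dp_layer.apply_collective_grads()     # all-reduce gradients

        adam.step()
        adam.clear_grad()

    if __name__ == '__main__':
        dist.spawn(train)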
@@ -0,0 +1,5 @@
.. _api_distributed_prepare_context:

prepare_context
-------------------------------
:doc_source: paddle.fluid.dygraph.parallel.prepare_context
@@ -0,0 +1,10 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

.. _api_distributed_spawn:

spawn
-----

.. autofunction:: paddle.distributed.spawn
    :noindex:
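As above, the full parameter list is rendered from the docstring. A minimal launch sketch (``train`` here is only a placeholder; the Chinese page further below shows complete variants with ``args``, ``nprocs`` and ``selected_gpus``):

.. code-block:: python

    import paddle
    import paddle.distributed as dist

    def train():
        # placeholder training function: enable dygraph mode, set up the
        # parallel env, then build the model and run the training loop here
        paddle.disable_static()
        dist.init_parallel_env()

    if __name__ == '__main__':
        # start one training process per visible device
        dist.spawn(train)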
@@ -0,0 +1,7 @@
.. _api_paddle_DataParallel:

DataParallel
-------------------------------
:doc_source: paddle.fluid.dygraph.parallel.DataParallel
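``DataParallel`` is only aliased here via ``:doc_source:``; roughly how it is used in the examples elsewhere in this PR:

.. code-block:: python

    import paddle
    import paddle.nn as nn
    import paddle.distributed as dist

    def train():
        paddle.disable_static()
        dist.init_parallel_env()

        # wrap the layer; the wrapper's parameters() feed the optimizer
        dp_layer = paddle.DataParallel(nn.Linear(10, 1))

        # in this snapshot of the API, loss scaling and gradient synchronization
        # are explicit inside the training loop:
        #   loss = dp_layer.scale_loss(loss)
        #   loss.backward()
        #   dp_layer.apply_collective_grads()

    if __name__ == '__main__':
        dist.spawn(train)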
@@ -0,0 +1,5 @@
.. _cn_api_distributed_ParallelEnv:

ParallelEnv
-------------------------------
:doc_source: paddle.fluid.dygraph.parallel.ParallelEnv
@@ -0,0 +1,25 @@
.. _cn_api_distributed_get_rank:

get_rank
----------

.. py:function:: paddle.distributed.get_rank()

Returns the rank of the current process.

The rank of the current process equals the value of the environment variable ``PADDLE_TRAINER_ID``; the default value is 0.

Returns
:::::::::
(int) The rank of the current process.

Code example
:::::::::
.. code-block:: python

    import paddle
    import paddle.distributed as dist

    # execute this command in terminal: export PADDLE_TRAINER_ID=0
    print("The rank is %d" % dist.get_rank())
    # The rank is 0
@@ -0,0 +1,23 @@
.. _cn_api_distributed_get_world_size:

get_world_size
----------------

Returns the number of processes participating in the current task.

The number of processes equals the value of the environment variable ``PADDLE_TRAINERS_NUM``; the default value is 1.

Returns
:::::::::
(int) The number of processes participating in the task.

Code example
:::::::::
.. code-block:: python

    import paddle
    import paddle.distributed as dist

    # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
    print("The world_size is %d" % dist.get_world_size())
    # The world_size is 4
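Taken together with ``get_rank``, a common pattern (purely illustrative, not part of this PR) is to derive a per-process data shard from the rank and the world size:

.. code-block:: python

    import paddle.distributed as dist

    rank = dist.get_rank()               # from PADDLE_TRAINER_ID, default 0
    world_size = dist.get_world_size()   # from PADDLE_TRAINERS_NUM, default 1

    # Hypothetical sharding: process `rank` takes every `world_size`-th sample.
    num_samples = 100
    my_indices = list(range(rank, num_samples, world_size))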
@@ -0,0 +1,62 @@
.. _cn_api_distributed_init_parallel_env:

init_parallel_env
-----------------
Collaborator: Same as above.
Author: done, thx
Initializes the parallel training environment in dynamic graph mode.

.. note::
Collaborator: The corresponding Note also needs to be added to the English doc.
Author: done, thx
    Currently only initialization of the GPU training environment is supported, using NCCL for communication.

Returns
:::::::::
None

Code example
:::::::::
.. code-block:: python

    import paddle
    import paddle.nn as nn
    import paddle.optimizer as opt
    import paddle.distributed as dist

    class LinearNet(nn.Layer):
        def __init__(self):
            super(LinearNet, self).__init__()
            self._linear1 = nn.Linear(10, 10)
            self._linear2 = nn.Linear(10, 1)

        def forward(self, x):
            return self._linear2(self._linear1(x))

    def train():
        # 1. enable dynamic mode
        paddle.disable_static()

        # 2. initialize parallel environment
        dist.init_parallel_env()

        # 3. create data parallel layer & optimizer
        layer = LinearNet()
        dp_layer = paddle.DataParallel(layer)

        loss_fn = nn.MSELoss()
        adam = opt.Adam(
            learning_rate=0.001, parameters=dp_layer.parameters())

        # 4. run layer
        inputs = paddle.randn([10, 10], 'float32')
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')
        loss = loss_fn(outputs, labels)

        loss = dp_layer.scale_loss(loss)
        loss.backward()
        dp_layer.apply_collective_grads()

        adam.step()
        adam.clear_grad()

    if __name__ == '__main__':
        dist.spawn(train)
@@ -0,0 +1,5 @@
.. _cn_api_distributed_prepare_context:

prepare_context
-------------------------------
:doc_source: paddle.fluid.dygraph.parallel.prepare_context
@@ -0,0 +1,103 @@
.. _cn_api_distributed_spawn:

spawn
-----
Collaborator: Same as above.
Author: done, thx
Starts a multi-process task with the ``spawn`` method.

Parameters
:::::::::
    - func (function) - The target function to be called by the processes started by ``spawn``. It needs to be picklable (serializable), so it must be defined as a top-level function of a module and cannot be an inner function or a class method.
    - args (tuple, optional) - Arguments passed to the target function ``func``.
    - nprocs (int, optional) - Number of processes to start. The default value is -1. When ``nprocs`` is -1, all currently available devices are obtained from environment variables at run time: for GPU tasks, the available device IDs are read from ``CUDA_VISIBLE_DEVICES``; for CPU tasks, the number of available CPU devices is read from ``CPU_NUM``, which can be configured, e.g., with ``export CPU_NUM=4`` and defaults to 1 if the variable is not set.
    - join (bool, optional) - Whether to perform a blocking ``join`` on all started processes, waiting for them to finish. The default is True.
    - daemon (bool, optional) - The ``daemon`` attribute of the started processes. The default is False.
    - **options (dict, optional) - Other options for initializing the parallel execution environment. The currently supported options are: (1) start_method (string) - the method used to start child processes, one of ``spawn``, ``fork`` or ``forkserver``; because the CUDA runtime does not support ``fork``, ``spawn`` or ``forkserver`` must be used when CUDA is used in child processes; the default is ``spawn``; (2) cluster_node_ips (string) - IPs of the cluster nodes (machines), e.g. "192.168.0.16,192.168.0.17"; the default is "127.0.0.1"; (3) node_ip (string) - IP of the current node (machine), e.g. "192.168.0.16"; the default is "127.0.0.1"; (4) started_port (int) - the starting port for the training processes on a node (machine), e.g. 6170; the default is None; (5) selected_gpus (string) - the GPU IDs used for training, e.g. "0,1,2,3"; the default is None; (6) print_config (bool) - whether to print the current parallel training configuration; the default is False; (7) use_paddlecloud (bool) - whether to launch the multi-process task with PaddleCloud; the default is False.
Returns
:::::::::
A ``MultiprocessContext`` object holding the started processes.

Code example
:::::::::
.. code-block:: python

    from __future__ import print_function

    import paddle
    import paddle.nn as nn
    import paddle.optimizer as opt
    import paddle.distributed as dist

    class LinearNet(nn.Layer):
        def __init__(self):
            super(LinearNet, self).__init__()
            self._linear1 = nn.Linear(10, 10)
            self._linear2 = nn.Linear(10, 1)

        def forward(self, x):
            return self._linear2(self._linear1(x))

    def train(print_result=False):
        # 1. enable dynamic mode
        paddle.disable_static()

        # 2. initialize parallel environment
        dist.init_parallel_env()

        # 3. create data parallel layer & optimizer
        layer = LinearNet()
        dp_layer = paddle.DataParallel(layer)

        loss_fn = nn.MSELoss()
        adam = opt.Adam(
            learning_rate=0.001, parameters=dp_layer.parameters())

        # 4. run layer
        inputs = paddle.randn([10, 10], 'float32')
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')
        loss = loss_fn(outputs, labels)

        if print_result is True:
            print("loss:", loss.numpy())

        loss = dp_layer.scale_loss(loss)
        loss.backward()
        dp_layer.apply_collective_grads()

        adam.step()
        adam.clear_grad()

    # Usage 1: only pass the function.
    # Use this if your training method needs no arguments and
    # uses all visible devices for parallel training.
    if __name__ == '__main__':
        dist.spawn(train)

    # Usage 2: pass the function and its arguments.
    # Use this if your training method needs some arguments and
    # uses all visible devices for parallel training.
    if __name__ == '__main__':
        dist.spawn(train, args=(True,))

    # Usage 3: pass the function, its arguments and nprocs.
    # Use this if your training method needs some arguments and
    # only uses part of the visible devices for parallel training.
    # If your machine holds 8 cards {0,1,2,3,4,5,6,7},
    # this case will use cards {0,1}; if you set
    # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
    # cards {4,5}.
    if __name__ == '__main__':
        dist.spawn(train, args=(True,), nprocs=2)

    # Usage 4: pass the function, its arguments, nprocs and selected_gpus.
    # Use this if your training method needs some arguments,
    # only uses part of the visible devices for parallel training,
    # and you can't set your machine's environment variable
    # CUDA_VISIBLE_DEVICES (e.g. it is unset or lists all cards
    # {0,1,2,3,4,5,6,7}); pass `selected_gpus` to select the GPU
    # cards you want to use. For example, this case will use
    # cards {4,5} if your machine holds 8 cards.
    if __name__ == '__main__':
        dist.spawn(train, args=(True,), nprocs=2, selected_gpus='4,5')
Collaborator: The function declaration is missing:
.. py:function:: paddle.distributed.get_world_size()
Author: done, thx