Updated documentation, fixed typos, etc.

veritas9872 · veritas9872 · commit ff7c84305209 · 2020-02-14T15:33:59.000+09:00
diff --git a/README.md b/README.md
@@ -25,6 +25,8 @@ __*Please star/fork my repository if you find this tutorial helpful!*__
 To run this project please install 
 [NVIDIA-Docker](https://github.com/NVIDIA/nvidia-docker) first.
 
+Unfortunately for Windows users, NVIDIA-Docker is only available for Linux as of the time of writing. 
+
 NVIDIA-Docker has many dependencies, such as the NVIDIA driver and Docker.
 
 These are all necessary for this project.
@@ -33,7 +35,7 @@ I am using Docker because I have found that local installation often fails.
 
 This is likely due to complicated dependency issues.
 
-Also, catastrophic errors are easier to handle in a Docker container than on a local computer.
+Also, catastrophic errors are easier to handle in a Docker container than on a local machine.
 
 Please view basic [Docker](https://docs.docker.com) concepts for this project.
 
@@ -42,14 +44,14 @@ Please view basic [Docker](https://docs.docker.com) concepts for this project.
 ### Environment
 
 The Docker container generated by the Dockerfile will create a 
-Ubuntu 18.04 LTS Container with CUDA10.0, CUDNN version 7.6.0.64-1,
-NCCL version2.4.7-1, OPENMPI version 4.0.2.
+Ubuntu 18.04 LTS image with CUDA 10.0, cuDNN 7.6.0.64-1,
+NCCL 2.4.7-1, and OpenMPI 4.0.2.
 
 Python version is 3.6.7, Pytorch is 1.4.0, and Torchvision is 0.5.0.
 
 The settings were modified from the currently available official horovod image.
 
-The current official image has an issue with pillow 7 incompatibility with Torchvision 0.4.2.
+The current official Horovod Docker image suffers from an incompatibility between Pillow 7 and Torchvision 0.4.2.
 
 ### Task
 
diff --git a/docker_files/Dockerfile b/docker_files/Dockerfile
@@ -4,13 +4,16 @@ FROM nvidia/cuda:10.0-devel-ubuntu18.04
 ENV CUDNN_VERSION=7.6.0.64-1+cuda10.0
 ENV NCCL_VERSION=2.4.7-1+cuda10.0
 ENV OPENMPI_VERSION=4.0.2
+ENV TORCH_WHEEL=https://download.pytorch.org/whl/cu100/torch-1.4.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
+ENV TORCHVISION_WHEEL=https://download.pytorch.org/whl/cu100/torchvision-0.5.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
 
-# Python 3.6 is supported by Ubuntu Bionic out of the box
+# Python 3.6 is supported by Ubuntu Bionic out of the box.
 ENV PYTHON_VERSION=3.6
 
-# Set default shell to /bin/bash
+# Set default shell to /bin/bash.
 SHELL ["/bin/bash", "-cu"]
 
+# Get dependencies on Ubuntu.
 RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
         build-essential \
         cmake \
@@ -32,17 +35,18 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-
         libibverbs1 \
         ibverbs-providers
 
+# Create symbolic link.
 RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
 
+# Get pip.
 RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
 
 # Install Pytorch, torchvivison and other required libraries. Tensorboard depends on the future library.
-RUN pip install typing numpy future tensorboard==2.0.0 \
-    https://download.pytorch.org/whl/cu100/torch-1.4.0%2Bcu100-cp36-cp36m-linux_x86_64.whl \
-    https://download.pytorch.org/whl/cu100/torchvision-0.5.0%2Bcu100-cp36-cp36m-linux_x86_64.whl
+RUN pip install typing numpy future tensorboard==2.0.0 ${TORCH_WHEEL} ${TORCHVISION_WHEEL}
 
+# Get OpenMPI.
 RUN mkdir /tmp/openmpi && \
     cd /tmp/openmpi && \
     wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-${OPENMPI_VERSION}.tar.gz && \
@@ -54,17 +58,17 @@ RUN mkdir /tmp/openmpi && \
     ldconfig && \
     rm -rf /tmp/openmpi
 
-# Install Horovod, temporarily using CUDA stubs
+# Install Horovod, temporarily using CUDA stubs.
 RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
     HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITH_PYTORCH=1 \
          pip install --no-cache-dir horovod && \
     ldconfig
 
-# Install OpenSSH for MPI to communicate between containers
+# Install OpenSSH for MPI to communicate between containers.
 RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
     mkdir -p /var/run/sshd
 
-# Allow OpenSSH to talk to containers without asking for confirmation
+# Allow OpenSSH to talk to containers without asking for confirmation.
 RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
     echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
     mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
diff --git a/docker_files/README.md b/docker_files/README.md
@@ -2,11 +2,11 @@
 
 The Dockerfile here is used to create an environment where Horovod can be used with Pytorch.
 
-Software versions have been fixed for my convenience. 
+Software versions for Python, Pytorch, etc. have been pinned for my convenience.
 
 However, they can be changed manually.
 
-The current installation uses pip instead of Anaconda.
+Also, the current installation uses pip instead of Anaconda.
 
 This is keeping with the original Dockerfile in Horovod.
 
@@ -15,7 +15,7 @@ for the original.
 
 ### Dependencies and installation
 
-The section before `pip install ...` is boilerplate for dependencies on Ubuntu.
+The section before `pip install [...]` is boilerplate for dependencies on Ubuntu.
 
 The current project only uses Pytorch, Torchvision, Tensorboard, Numpy, and typing.
 
@@ -25,7 +25,7 @@ Pytorch and Torchvision are installed with their wheel directories in PyPI for f
 
 See [here](https://download.pytorch.org/whl/cu100/torch_stable.html) for Pytorch wheels for CUDA10.0.
 
-Other project requirements should be installed here.
+Other project requirements should be installed by including them in the `pip install [...]` line.
 
 ### Horovod installation
 
@@ -36,7 +36,7 @@ HOROVOD_GPU_ALLREDUCE=NCCL,
 HOROVOD_GPU_BROADCAST=NCCL, and 
 HOROVOD_WITH_PYTORCH=1.
 
-The first two indicate that GPU operations should use NCCL (pronounced "nickel").
+The first two indicate that GPU operations should use [NCCL](https://github.com/NVIDIA/nccl) (pronounced "nickel").
 
 This setting is crucial for performance.
 
diff --git a/scripts/README.md b/scripts/README.md
@@ -12,6 +12,8 @@ Each script and its contents are explained here.
 
 Build the Docker image with the docker_build_script.
 
+Please read the README file accompanying the Dockerfile for an explanation of its contents.
+
 The -t indicates the tag/name. The format is (repository):(tag).
 
 The full path to the Dockerfile must be specified.
@@ -98,15 +100,15 @@ This is specified by the `num_workers` variable.
 
 However, each Horovod process will launch these workers independently.
 
-This may cause an excessive number of workers to be launched. 
+The true number of pre-processing workers will therefore be `np x num_workers`.
 
 **2. Host**
 
 The `-H` flag specifies the host type. 
 
 The number of GPUs to be used is specified on the right.
 
-N must be the same or lesser than the number of GPUs.
+N must be less than or equal to the number of available GPUs.
 
 For a local machine where N GPUs are to be used, use "localhost:N".
 
@@ -116,13 +118,13 @@ For servers, a different scheme is used.
 
 For server with index I with N GPUs, use "serverI:N".
 
-For large servers, use a hostfile.
+For large servers, use a hostfile with the server information.
 
 **3. Autotune**
 
-Use the `--autotune` flag to autotune parameters for best performance.
+Use the `--autotune` flag to automatically tune Horovod's parameters for the best speed.
 
-Autotuning uses Bayesian optimization for finding the best parameters.
+Autotuning uses Bayesian optimization to find the parameters that give the best speed.
 
 This will cause early runs to be slower, but later runs will be faster.
 
diff --git a/scripts/docker_build_script.sh b/scripts/docker_build_script.sh
@@ -1,2 +1,3 @@
 set -ex
-docker build -t horovod:py-3.6-torch-1.4.0 $HOME/PycharmProjects/Horovod-Pytorch-Tutorial/docker_files/
+docker build -t horovod:py-3.6-torch-1.4.0 \
+              $HOME/PycharmProjects/Horovod-Pytorch-Tutorial/docker_files
diff --git a/scripts/docker_run_script.sh b/scripts/docker_run_script.sh
@@ -1,4 +1,4 @@
 set -ex
 docker run -v $HOME/PycharmProjects/Horovod-Pytorch-Tutorial:/opt/project \
-            -it -w /opt/project --runtime nvidia --gpus all --name horovod_torch --rm \
+            -it -w /opt/project --runtime=nvidia --gpus=all --name=horovod_torch --rm \
             horovod:py-3.6-torch-1.4.0
diff --git a/scripts/horovod_run_script.sh b/scripts/horovod_run_script.sh
@@ -1,2 +1,3 @@
 set -ex
-horovodrun -np 2 -H localhost:2 python train/train_model.py
+N=2
+horovodrun -np $N -H localhost:$N python train/train_model.py
diff --git a/train/README.md b/train/README.md
@@ -1,9 +1,8 @@
 # Training with multiple processes.
 
----
 Concepts used in Horovod and MPI are outlined [here](https://horovod.readthedocs.io/en/latest/concepts_include.html).
 
----
+### How does it work?
 
 When `horovodrun` is used, multiple Python processes are launched simultaneously.
 
@@ -13,24 +12,34 @@ Each process is given an identifier, the *"local rank"*.
 
 Each process can access its identifier via the hvd.local_rank() function.
 
-This is similar to how CUDA threads operate.
+This is similar to how CUDA threads operate within a CUDA kernel.
 
 Imagine launching the `python ...` command simultaneously, in parallel, 
 but each process knowing its identifying number.
 
-Note that the DataLoaders launch new workers in each process.
+Each process runs independently but synchronizes with the others via the Ring-AllReduce.
+
+### How many processes? 
+
+As mentioned previously, note that the DataLoaders launch new workers in each process.
 
 This means that the number of pre-processing processes 
 is multiplied by the number of Horovod processes.
 
 This may cause memory issues or performance drops.
 
-The mini-batch size is not affected by the number of Horovod processes because of the DistributedSampler.
+However, **mini-batch size is not affected by the number of Horovod processes.**
+ 
+DistributedSampler handles this very well.
+
+### How to write logs and save checkpoints
 
 The Horovod documentation recommends that only model checkpoints and logs from 
-"hvd.local_rank() == 0" should be saved.
+`hvd.local_rank() == 0` should be saved.
+
+The Ring-AllReduce ensures that the different versions will not diverge very much.
 
-The Ring-AllReduce will ensure that the values will converge eventually.
+### How to manage devices
 
 Within each `horovodrun` process, the device assigned to that process is set as the default device.
 
diff --git a/train/train_model.py b/train/train_model.py
@@ -15,9 +15,8 @@
 def main():
     model: nn.Module = resnet34(num_classes=10).cuda()
 
-    # print('Number of threads: ', torch.get_num_threads(), torch.get_num_interop_threads())
-
-    batch_size = 1024
+    # Set variables here. These are just for demonstration so no need for argparse.
+    batch_size = 1024  # This is the true min-batch size, thanks to DistributedSampler.
     num_workers_per_process = 2  # Workers launched by each process started by horovodrun command.
     lr = 0.1
     momentum = 0.9
@@ -42,9 +41,7 @@ def main():
 
     loss_func = nn.CrossEntropyLoss()
 
-    # print('Thread number: ', torch.get_num_threads(), torch.get_num_interop_threads())
-
-    # Writing separate log files for each process. Verified that models are different.
+    # Writing separate log files for each process for comparison. Verified that models are different.
     writer = SummaryWriter(log_dir=f'./logs/{hvd.local_rank()}', comment='Summary writer for run.')
 
     # Optimizer must be distributed for the Ring-AllReduce.
@@ -68,7 +65,7 @@ def warm_up(epoch: int):  # Learning rate warm-up.
 
     for epoch in range(num_epochs):
         print(epoch)
-        torch.autograd.set_grad_enabled = True
+        torch.autograd.set_grad_enabled = True  # Training mode.
         train_sampler.set_epoch(epoch)  # Set epoch to sampler for proper shuffling of training set.
         for inputs, targets in train_loader:
             inputs: Tensor = inputs.cuda(non_blocking=True)
@@ -81,15 +78,15 @@ def warm_up(epoch: int):  # Learning rate warm-up.
             loss.backward()
             optimizer.step()
 
-        torch.autograd.set_grad_enabled = False
+        torch.autograd.set_grad_enabled = False  # Evaluation mode.
         for step, (inputs, targets) in enumerate(test_loader):
             inputs: Tensor = inputs.cuda(non_blocking=True)
             targets: Tensor = targets.cuda(non_blocking=True)
             outputs = model(inputs)
             loss = loss_func(outputs, targets)
             writer.add_scalar(tag='val_loss', scalar_value=loss.item(), global_step=step)
 
-        scheduler.step()
+        scheduler.step()  # Scheduler works fine on DistributedOptimizer.
 
 
 if __name__ == '__main__':
@@ -105,5 +102,5 @@ def warm_up(epoch: int):  # Learning rate warm-up.
     print(f'Local Rank: {hvd.local_rank()}')
 
     # Set default to each GPU device. Local rank is different for each process launched by horovodrun.
-    with torch.cuda.device(f'cuda:{hvd.local_rank()}'):
-        main()
+    with torch.cuda.device(f'cuda:{hvd.local_rank()}'):  # Not sure if this is absolutely necessary.
+        main()  # Run main function for each process on different devices, specified by "local_rank".