Cambridge-ICCS
diff --git a/‎.github/workflows/test_suite_ubuntu.yml
+1-1 b/‎.github/workflows/test_suite_ubuntu.yml
+1-1
diff --git a/‎examples/3_MultiGPU/multigpu_infer_fortran.f90
+3-3 b/‎examples/3_MultiGPU/multigpu_infer_fortran.f90
+3-3
diff --git a/‎examples/7_MPI/CMakeLists.txt
+66 b/‎examples/7_MPI/CMakeLists.txt
+66
diff --git a/‎examples/7_MPI/README.md
+124 b/‎examples/7_MPI/README.md
+124
diff --git a/‎examples/7_MPI/mpi_infer_fortran.f90
+124 b/‎examples/7_MPI/mpi_infer_fortran.f90
+124
@@ -66,7 +66,7 @@ jobs:
       - name: Install an MPI distribution
         run: |
           sudo apt update
-          sudo apt install mpich
+          sudo apt install openmpi-bin openmpi-common libopenmpi-dev
 
       - name: Install pFUnit
         run: |
 
@@ -1,7 +1,7 @@
 program inference
 
    ! Import precision info from iso
-   use, intrinsic :: iso_fortran_env, only : sp => real32
+   use, intrinsic :: iso_fortran_env, only : sp => real32, stdout => output_unit
 
    ! Import our library for interfacing with PyTorch
    use ftorch, only : torch_model, torch_tensor, &
@@ -61,7 +61,7 @@ program inference
 
       ! Initialise data and print the values used
       in_data = [(device_index + i, i = 0, 4)]
-      write (6, 100) device_index, in_data(:)
+      write(unit=stdout, fmt=100) device_index, in_data(:)
       100 format("input on device ", i1,": [", 4(f5.1,","), f5.1,"]")
 
       ! Create Torch input tensor from the above array and assign it to the first (and only)
@@ -83,7 +83,7 @@ program inference
       call torch_model_forward(model, in_tensors, out_tensors)
 
       ! Print the values computed on the current device.
-      write (6, 200) device_index, out_data(:)
+      write(unit=stdout, fmt=200) device_index, out_data(:)
       200 format("output on device ", i1,": [", 4(f5.1,","), f5.1,"]")
 
       ! Check output tensor matches expected value
 
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.15...3.31)
+# policy CMP0076 - target_sources source files are relative to file where
+# target_sources is run
+cmake_policy(SET CMP0076 NEW)
+
+# Build configuration for the MPI example: compiles mpi_infer_fortran and,
+# when CMAKE_BUILD_TESTS is set, registers the integration tests with CTest.
+project(MPIExample LANGUAGES Fortran)
+
+# Build in Debug mode if not specified. Multi-config generators ignore
+# CMAKE_BUILD_TYPE, so only default it for single-config generators.
+get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE
+      Debug
+      CACHE STRING "" FORCE)
+endif()
+
+# Both dependencies are mandatory: fail at configure time with a clear message
+# rather than later with an unresolved imported target.
+find_package(FTorch REQUIRED)
+find_package(MPI REQUIRED COMPONENTS Fortran)
+message(STATUS "Building with Fortran PyTorch coupling")
+
+# Fortran example
+add_executable(mpi_infer_fortran mpi_infer_fortran.f90)
+target_link_libraries(mpi_infer_fortran PRIVATE FTorch::ftorch
+                                                MPI::MPI_Fortran)
+
+# Integration testing
+if(CMAKE_BUILD_TESTS)
+  include(CTest)
+
+  # 1. Check the PyTorch model runs and its outputs meet expectations
+  add_test(NAME simplenet COMMAND ${Python_EXECUTABLE}
+                                  ${PROJECT_SOURCE_DIR}/simplenet.py)
+
+  # 2. Check the model is saved to file in the expected location with the
+  #   pt2ts.py script
+  add_test(
+    NAME pt2ts
+    COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/pt2ts.py --filepath
+            ${PROJECT_BINARY_DIR}
+    WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+
+  # 3. Check the model can be loaded from file and run with MPI in Python and
+  #   that its outputs meet expectations. MPIEXEC_PREFLAGS/MPIEXEC_POSTFLAGS
+  #   are forwarded so that launcher options (e.g. oversubscription) can be
+  #   supplied at configure time.
+  add_test(
+    NAME mpi_infer_python
+    COMMAND
+      ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS}
+      ${Python_EXECUTABLE} ${MPIEXEC_POSTFLAGS}
+      ${PROJECT_SOURCE_DIR}/mpi_infer_python.py --filepath ${PROJECT_BINARY_DIR}
+    WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+  # DEPENDS orders this test after pt2ts, which writes the model file it reads
+  set_tests_properties(
+    mpi_infer_python
+    PROPERTIES PASS_REGULAR_EXPRESSION "MPI Python example ran successfully"
+               DEPENDS pt2ts)
+
+  # 4. Check the model can be loaded from file and run with MPI in Fortran and
+  #   that its outputs meet expectations. The executable is referenced through
+  #   its target so the path is correct for any generator or output directory.
+  #   Command line argument: model file
+  add_test(
+    NAME mpi_infer_fortran
+    COMMAND
+      ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS}
+      $<TARGET_FILE:mpi_infer_fortran> ${MPIEXEC_POSTFLAGS}
+      ${PROJECT_BINARY_DIR}/saved_simplenet_model_cpu.pt
+    WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+  # DEPENDS orders this test after pt2ts, which writes the model file it reads
+  set_tests_properties(
+    mpi_infer_fortran
+    PROPERTIES PASS_REGULAR_EXPRESSION "MPI Fortran example ran successfully"
+               DEPENDS pt2ts)
+endif()
@@ -0,0 +1,124 @@
+# Example 7 - MPI
+
+This example revisits the SimpleNet example and demonstrates how to run it using
+MPI parallelism.
+
+
+## Description
+
+The Python file `simplenet.py` is copied from the earlier example. Recall that
+it defines a very simple PyTorch network that takes an input of length 5 and
+applies a single `Linear` layer to multiply it by 2.
+
+The same `pt2ts.py` tool is used to save the simple network to TorchScript.
+
+A series of files `mpi_infer_<LANG>` then bind from other languages to run the
+TorchScript model in inference mode.
+
+## Dependencies
+
+Running this example requires:
+
+- CMake
+- An MPI installation.
+- FTorch (installed as described in main package)
+- Python 3
+
+## Running
+
+To run this example install FTorch as described in the main documentation. Then
+from this directory create a virtual environment and install the necessary
+Python modules:
+```
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+You can check the network is set up correctly by running `simplenet.py`:
+```
+python3 simplenet.py
+```
+As before, this defines the network and runs it with an input tensor
+[0.0, 1.0, 2.0, 3.0, 4.0] to produce the result:
+```
+tensor([[0, 2, 4, 6, 8]])
+```
+
+To save the `SimpleNet` model to TorchScript run the modified version of the
+`pt2ts.py` tool:
+```
+python3 pt2ts.py
+```
+which will generate `saved_simplenet_model_cpu.pt` - the TorchScript instance
+of the network.
+
+You can check that everything is working by running the `mpi_infer_python.py`
+script. It's set up with MPI such that each MPI rank runs the model with its
+own input. You should substitute `<NP>` with the number of MPI ranks you
+wish to run with:
+```
+mpiexec -np <NP> python3 mpi_infer_python.py
+```
+This reads the model in from the TorchScript file and runs it with a different
+input tensor on each MPI rank: [0.0, 1.0, 2.0, 3.0, 4.0], plus the rank
+index in each entry. Running with `NP=2`, the result should be (some
+permutation of):
+```
+rank 0: result:
+tensor([[0., 2., 4., 6., 8.]])
+rank 1: result:
+tensor([[ 2.,  4.,  6.,  8., 10.]])
+```
+
+At this point we no longer require Python, so can deactivate the virtual
+environment:
+```
+deactivate
+```
+
+To call the saved `SimpleNet` model from Fortran we need to compile the
+`mpi_infer_fortran.f90` file. This can be done using the included
+`CMakeLists.txt` as follows, noting that we need to use an MPI-enabled Fortran
+compiler:
+```
+mkdir build
+cd build
+cmake .. -DCMAKE_PREFIX_PATH=<path/to/your/installation/of/library/> -DCMAKE_BUILD_TYPE=Release
+cmake --build .
+```
+
+(Note that the Fortran compiler can be chosen explicitly with the
+`-DCMAKE_Fortran_COMPILER` flag, and should match the compiler that was used to
+locally build FTorch.)
+
+To run the compiled code calling the saved `SimpleNet` TorchScript from Fortran,
+run the executable with an argument of the saved model file. Again, substitute
+`<NP>` with the desired number of MPI ranks:
+```
+mpiexec -np <NP> ./mpi_infer_fortran ../saved_simplenet_model_cpu.pt
+```
+
+This runs the model with the same inputs as described above and should produce (some
+permutation of) the output:
+```
+input on rank 0: [  0.0,  1.0,  2.0,  3.0,  4.0]
+input on rank 1: [  1.0,  2.0,  3.0,  4.0,  5.0]
+output on rank 0: [  0.0,  2.0,  4.0,  6.0,  8.0]
+output on rank 1: [  2.0,  4.0,  6.0,  8.0, 10.0]
+```
+
+Alternatively, we can use `make`, instead of CMake, copying the Makefile over from the
+first example:
+```
+cp ../1_SimpleNet/Makefile .
+```
+See the instructions in that example directory for further details.
+
+## Exercise
+
+You might wish to explore using different MPI ranks to call different GPU
+devices via the GPU `device_index` argument passed to constructors for FTorch
+tensors and models. See the
+[Multi-GPU](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/3_MultiGPU)
+example for more details on how to do this.
@@ -0,0 +1,124 @@
+program inference
+
+   ! Runs the saved TorchScript model once per MPI rank, each rank using its own
+   ! input, then gathers every rank's output onto rank 0 and checks it there.
+   !
+   ! Usage: mpiexec -np <NP> ./mpi_infer_fortran <path/to/saved_model.pt>
+   ! Stops with code 999 if the communicator has a single rank, if no model file
+   ! is given, or if any gathered output does not match its expected value.
+
+   ! Import precision info from iso
+   use, intrinsic :: iso_fortran_env, only : sp => real32, stdout => output_unit
+
+   ! Import our library for interfacing with PyTorch
+   use ftorch, only : torch_model, torch_tensor, torch_kCPU, torch_delete, &
+                      torch_tensor_from_array, torch_model_load, torch_model_forward
+
+   ! Import our tools module for testing utils
+   use ftorch_test_utils, only : assert_allclose
+
+   ! Import MPI. mpi_real is the MPI datatype matching Fortran default reals
+   ! (mpi_float describes the C float type).
+   ! NOTE(review): assumes default real is 32-bit, i.e. no -r8 style flags.
+   use mpi, only : mpi_comm_rank, mpi_comm_size, mpi_comm_world, mpi_finalize, mpi_real, &
+                   mpi_gather, mpi_init
+
+   implicit none
+
+   ! Set working precision for reals
+   integer, parameter :: wp = sp
+
+   integer :: num_args, ix
+   ! NOTE(review): arguments longer than 128 characters are truncated
+   character(len=128), dimension(:), allocatable :: args
+
+   ! Set up Fortran data structures
+   real(wp), dimension(5), target :: in_data
+   real(wp), dimension(5), target :: out_data
+   real(wp), dimension(5), target :: expected
+   integer, parameter :: tensor_layout(1) = [1]
+
+   ! Set up Torch data structures
+   ! The net, a vector of input tensors (in this case we only have one), and the output tensor
+   type(torch_model) :: model
+   type(torch_tensor), dimension(1) :: in_tensors
+   type(torch_tensor), dimension(1) :: out_tensors
+   ! True once the model and tensors exist, so clean_up knows whether to delete them
+   logical :: torch_ready = .false.
+
+   ! Flag for testing
+   logical :: test_pass
+
+   ! MPI configuration
+   integer :: rank, num_ranks, ierr, i
+
+   ! Variables for testing
+   real(wp), allocatable, dimension(:,:) :: recvbuf
+   real(wp), dimension(5) :: result_chk
+   integer :: rank_chk
+
+   ! Output formats: a bare 5-element array, and a labelled array for a given rank.
+   ! i0 is used for the rank so that ranks of any width print correctly.
+   100 format('[',4(f5.1,','),f5.1,']')
+   110 format(a,' on rank ',i0,': [',4(f5.1,','),f5.1,']')
+
+   call mpi_init(ierr)
+   call mpi_comm_rank(mpi_comm_world, rank, ierr)
+   call mpi_comm_size(mpi_comm_world, num_ranks, ierr)
+
+   ! Check MPI was configured correctly
+   if (num_ranks == 1) then
+      write(*,*) "MPI communicator size is 1, indicating that it is not configured correctly"
+      write(*,*) "(assuming you specified more than one rank)"
+      call clean_up()
+      stop 999
+   end if
+
+   ! Get TorchScript model file as a command line argument
+   num_args = command_argument_count()
+   if (num_args < 1) then
+      ! Without this check args(1) below would be an out-of-bounds access
+      if (rank == 0) then
+         write(*,*) "Usage: mpi_infer_fortran <path/to/saved_model.pt>"
+      end if
+      call clean_up()
+      stop 999
+   end if
+   allocate(args(num_args))
+   do ix = 1, num_args
+      call get_command_argument(ix,args(ix))
+   end do
+
+   ! Initialise data and print the values used on each MPI rank.
+   ! A single write per line reduces interleaving of output between ranks.
+   in_data = [(rank + i, i = 0, 4)]
+   write(unit=stdout, fmt=110) "input", rank, in_data(:)
+
+   ! Create Torch input/output tensors from the above arrays
+   call torch_tensor_from_array(in_tensors(1), in_data, tensor_layout, torch_kCPU)
+   call torch_tensor_from_array(out_tensors(1), out_data, tensor_layout, torch_kCPU)
+
+   ! Load ML model
+   call torch_model_load(model, args(1), torch_kCPU)
+   torch_ready = .true.
+
+   ! Run inference on each MPI rank
+   call torch_model_forward(model, in_tensors, out_tensors)
+
+   ! Print the values computed on each MPI rank
+   write(unit=stdout, fmt=110) "output", rank, out_data(:)
+
+   ! Gather the outputs onto rank 0; column r+1 of recvbuf holds the output of rank r
+   allocate(recvbuf(5,num_ranks))
+   call mpi_gather(out_data, 5, mpi_real, recvbuf, 5, mpi_real, 0, mpi_comm_world, ierr)
+
+   ! Check that the correct values were attained
+   if (rank == 0) then
+
+      ! Check output tensor matches expected value
+      do rank_chk = 0, num_ranks-1
+         ! Rank r used inputs r, r+1, ..., r+4 and the expected outputs are double those
+         expected = [(2 * (rank_chk + i), i = 0, 4)]
+         result_chk(:) = recvbuf(:,rank_chk+1)
+         test_pass = assert_allclose(result_chk, expected, test_name="MPI")
+         if (.not. test_pass) then
+            write(unit=stdout, fmt="('rank ',i0,' result: ')") rank_chk
+            write(unit=stdout, fmt=100) result_chk(:)
+            write(unit=stdout, fmt="('does not match expected value')")
+            write(unit=stdout, fmt=100) expected(:)
+            call clean_up()
+            stop 999
+         end if
+      end do
+
+      write (*,*) "MPI Fortran example ran successfully"
+   end if
+
+   call clean_up()
+
+  contains
+
+    ! Release whatever has been set up so far and finalise MPI.
+    ! Safe to call from the early-exit paths: Torch objects are only deleted once
+    ! they have been created and arrays are only deallocated if allocated.
+    subroutine clean_up()
+      if (torch_ready) then
+        call torch_delete(model)
+        call torch_delete(in_tensors)
+        call torch_delete(out_tensors)
+      end if
+      if (allocated(recvbuf)) deallocate(recvbuf)
+      if (allocated(args)) deallocate(args)
+      call mpi_finalize(ierr)
+    end subroutine clean_up
+
+end program inference