CExA-project · PaulGannay · Sep 22, 2025 · Dec 10, 2025 · Dec 12, 2025 · Dec 15, 2025
diff --git a/courses/02_intermediate/main.tex b/courses/02_intermediate/main.tex
@@ -69,6 +69,8 @@
         row{1}={bg=lightmain},
     }
 }
+\colorlet{thread1}{lightalert}
+\colorlet{thread6}{lightexample}
 
 \graphicspath{{../../images/}}
 
@@ -648,12 +650,265 @@ \section{Subviews}
 
 \section{Atomics}
 
+\begin{frame}[fragile]{Race condition}
+  Porting a code creating a histogram:
+  \begin{columns}
+    \begin{column}{0.5\linewidth}
+      \begin{minted}{C++}
+        double histo[5] = {0};
+
+        for (int i=0; i < N; i++) {
+          histo[i%5] += i;
+        }
+      \end{minted}
+    \end{column}
+    \pause
+    \begin{column}{0.5\linewidth}
+      \begin{minted}{C++}
+        Kokkos::View<double*> histo(5);
+
+        Kokkos::parallel_for(
+          Kokkos::RangePolicy(0,N),
+          KOKKOS_LAMBDA(int i) {
+            histo(i%5) += i;
+          });
+      \end{minted}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Race condition}
+    \begin{columns}
+        \begin{column}{0.5\linewidth}
+            Even simple instructions like increment are decomposed into several smaller assembly instructions:
+            \begin{minted}{C++}
+              histo(i%5) += i;
+            \end{minted}
+        \end{column}
+        \begin{column}{0.5\linewidth}
+          \SetTblrInner{rowsep=0pt}
+          \begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread1]Q[thread1]Q[white]Q[thread6]Q[thread6]Q[thread6]Q[white]}}
+            \textbf{Thread 1} & \textbf{Thread 6} &  & \textbf{res} \\
+            & & & 0 \\
+            read value & & ← & 0 \\
+            add 1 & &  & 0 \\
+            write value & & → & 1 \\
+            & & & 1 \\
+            & read value & ← & 1 \\
+            & add 6 &  & 1 \\
+            & write value & → & 7 \\
+            & & & 7 \\
+          \end{tblr}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+% Trainees can play with the following program to check that it really presents a race condition:
+%#include <iostream>
+%#include <Kokkos_Core.hpp>
+%
+%int main(int argc, char *argv[]) {
+%  Kokkos::initialize(argc, argv); 
+%  {
+%    const int N = 10000;
+%    Kokkos::View<double*> v("v", N);
+%    Kokkos::deep_copy(v, 4);
+%
+%    Kokkos::View<double> res("res");
+%
+%    Kokkos::parallel_for(Kokkos::RangePolicy(0, N),
+%        KOKKOS_LAMBDA(int i) {
+%            //Kokkos::atomic_add(&res(), v(i));
+%            res() = res() + v(i);
+%        });
+%
+%    double res_;
+%
+%    deep_copy(res_, res);
+%
+%    std::cout << "res_:" << res_ << std::endl;
+%    std::cout << "4*N:" << 4*N << std::endl;
+%  }
+%  Kokkos::finalize();
+%}
+
+\begin{frame}[fragile]{Race condition}
+    \begin{columns}
+        \begin{column}{0.5\linewidth}
+            Threads execute independently. There is no guarantee on the order in which their instructions run:
+            \begin{minted}{C++}
+              histo(i%5) += i;
+            \end{minted}
+
+            When several threads update the same data concurrently, this causes \highlight{race conditions}.
+        \end{column}
+        \begin{column}{0.5\linewidth}
+          \SetTblrInner{rowsep=0pt}
+          \begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread6]Q[thread1]Q[thread1]Q[white]Q[thread6]Q[thread6]Q[white]}}
+            \textbf{Thread 1} & \textbf{Thread 6} &  & \textbf{res} \\
+             & & & 0 \\
+             read value & & ← & 0 \\
+             & read value & ← & 0 \\
+             add 1 & & & 0 \\
+             write value & & → & 1 \\
+             & & & 1 \\
+             & add 6 & & 1 \\
+             & write value & → & 6 \\
+             & & & 6 \\
+          \end{tblr}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Atomic operation}
+  Replacing the addition with its atomic counterpart solves the problem:
+  \begin{columns}
+    \begin{column}{0.40\linewidth}
+      \begin{minted}{C++}
+        Kokkos::parallel_for(
+          Kokkos::RangePolicy(0,N),
+          KOKKOS_LAMBDA(int i) {
+            histo(i%5) += i;
+          });
+      \end{minted}
+    \end{column}
+    \begin{column}{0.56\linewidth}
+      \begin{minted}{C++}
+        Kokkos::parallel_for(
+          Kokkos::RangePolicy(0,N),
+          KOKKOS_LAMBDA(int i) {
+            Kokkos::atomic_add(&histo(i%5), i);
+          });
+      \end{minted}
+    \end{column}
+  \end{columns}
+  \structure{Note:} \texttt{atomic\_add} takes a pointer and not a
+  reference as its first argument, as the instruction needs access to
+  the actual memory address of the modified variable.
+\end{frame}
+
+\begin{frame}[fragile]{Atomic operation}
+    \begin{columns}
+        \begin{column}{0.55\linewidth}
+          \texttt{atomic\_add} executes the read, add and write in a single atomic step,
+          guaranteeing the absence of race conditions during the operation:
+            \begin{minted}{C++}
+              Kokkos::atomic_add(&histo(i%5), i);
+            \end{minted}
+        \end{column}
+        \begin{column}{0.5\linewidth}
+          \noindent Either:
+
+          \vspace{0.5em}
+          \SetTblrInner{rowsep=0pt}
+          \begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread6]}}
+            \textbf{Thread 1} & \textbf{Thread 6} &  & \textbf{res} \\
+            & & & 0 \\
+            atomic add & & ←→ & 1 \\
+            & atomic add & ←→ & 7 \\
+            &  &  & 7 \\
+          \end{tblr}
+          \pause
+          \vspace{0.5em}
+
+          \noindent Or: 
+
+          \vspace{0.5em}
+          \SetTblrInner{rowsep=0pt}
+          \begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread6]Q[thread1]}}
+            \textbf{Thread 1} & \textbf{Thread 6} &  & \textbf{res} \\
+            & & & 0 \\
+            & atomic add & ←→ & 6 \\
+            atomic add & & ←→ & 7 \\
+            &  &  & 7 \\
+          \end{tblr}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Operations}
+  \begin{columns}
+    \begin{column}{0.35\linewidth}
+      Other common operations are available with the format \texttt{Kokkos::atomic\_[op]}:
+    \end{column}
+    \begin{column}{0.75\linewidth}
+      \SetTblrInner{rowsep=0pt}
+      \begin{tblr}[theme=kokkostable]{lc}
+        Operation & Replaces \\
+        \texttt{Kokkos::atomic\_add(\&x, y)}    & \texttt{x += y} \\
+        \texttt{Kokkos::atomic\_and(\&x, y)}    & \texttt{x \&= y} \\
+        \texttt{Kokkos::atomic\_dec(\&x)}       & \texttt{x--} \\
+        \texttt{Kokkos::atomic\_inc(\&x)}       & \texttt{x++} \\
+        \texttt{Kokkos::atomic\_lshift(\&x, y)} & \texttt{x = x << y} \\
+        \texttt{Kokkos::atomic\_max(\&x, y)}    & \texttt{x = std::max(x, y)} \\
+        \texttt{Kokkos::atomic\_min(\&x, y)}    & \texttt{x = std::min(x, y)} \\
+        \texttt{Kokkos::atomic\_mod(\&x, y)}    & \texttt{x \%= y} \\
+        \texttt{Kokkos::atomic\_nand(\&x, y)}   & \texttt{x = \textasciitilde(x \& y)} \\
+        \texttt{Kokkos::atomic\_or(\&x, y)}     & \texttt{x |= y} \\
+        \texttt{Kokkos::atomic\_rshift(\&x, y)} & \texttt{x = x >> y} \\
+        \texttt{Kokkos::atomic\_sub(\&x, y)}    & \texttt{x -= y} \\
+        \texttt{Kokkos::atomic\_store(\&x, y)}  & \texttt{x = y} \\
+        \texttt{Kokkos::atomic\_xor(\&x, y)}    & \texttt{x \^{}= y} \\
+      \end{tblr}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Atomic Memory Trait}
+  \begin{columns}
+    \begin{column}{0.40\linewidth}
+      \begin{itemize}
+        \item If you need to access a View exclusively through atomic operations, you
+          can also create an alias with the \texttt{Atomic} memory trait
+        \item It guarantees that every operation performed through the alias is done atomically
+      \end{itemize}
+    \end{column}
+    \begin{column}{0.71\linewidth}
+      \begin{minted}{C++}
+        Kokkos::View<double*> histo(5);
+
+        Kokkos::View<double*,
+                     Kokkos::MemoryTraits<Kokkos::Atomic>>
+          histo_atomic = histo;
+
+        Kokkos::parallel_for(
+          Kokkos::RangePolicy(0,N),
+          KOKKOS_LAMBDA(int i) {
+            histo_atomic(i%5) += i;
+          });
+      \end{minted}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}[fragile]{Performance}
+  \begin{columns}
+    \begin{column}{0.45\linewidth}
+    Atomics can have a huge impact on performance: 
+    \begin{itemize}
+      \item The instruction itself is slower than the one it replaces
+      \item They may generate extra synchronisation points
+      \item They bypass and invalidate cache lines
+    \end{itemize}
+    \end{column}
+    \begin{column}{0.55\linewidth}
+      \begin{block}{Remarks}
+        \begin{itemize}
+          \item Atomics should be used with care and only when strictly
+            necessary
+          \item Algorithms can sometimes be changed when porting from CPU to GPU
+            in order to remove the need for atomics (colouring, replacing an
+            in-place algorithm with an out-of-place algorithm, etc.)
+        \end{itemize}
+      \end{block}
+    \end{column}
+  \end{columns}
+\end{frame}
+
 % _____________________________________________________________________________
 
 \section{Layouts}
 
 % _____________________________________________________________________________
 
-\section{Scatter Views}
-
 \end{document}