Skip to content
259 changes: 257 additions & 2 deletions courses/02_intermediate/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@
row{1}={bg=lightmain},
}
}
\colorlet{thread1}{lightalert}
\colorlet{thread6}{lightexample}

\graphicspath{{../../images/}}

Expand Down Expand Up @@ -648,12 +650,265 @@ \section{Subviews}

\section{Atomics}

\begin{frame}[fragile]{Race condition}
Porting code that builds a histogram:
\begin{columns}
\begin{column}{0.5\linewidth}
\begin{minted}{C++}
double histo[5] = {0};

for (int i=0; i < N; i++) {
histo[i%5] += i;
}
\end{minted}
\end{column}
\pause
\begin{column}{0.5\linewidth}
\begin{minted}{C++}
Kokkos::View<double*> histo(5);

Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
histo(i%5) += i;
});
\end{minted}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Race condition}
\begin{columns}
\begin{column}{0.5\linewidth}
Even simple instructions like increment are decomposed into several smaller assembly instructions:
\begin{minted}{C++}
histo(i%5) += i;
\end{minted}
\end{column}
\begin{column}{0.5\linewidth}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread1]Q[thread1]Q[white]Q[thread6]Q[thread6]Q[thread6]Q[white]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
read value & & ← & 0 \\
add 1 & & & 0 \\
write value & & → & 1 \\
& & & 1 \\
& read value & ← & 1 \\
& add 6 & & 1 \\
& write value & → & 7 \\
& & & 7 \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

% Trainees can play with the following program to check that it really presents a race condition:
%#include <iostream>
%#include <Kokkos_Core.hpp>
%
%int main(int argc, char *argv[]) {
% Kokkos::initialize(argc, argv);
% {
% const int N = 10000;
% Kokkos::View<double*> v("v", N);
% Kokkos::deep_copy(v, 4);
%
% Kokkos::View<double> res("res");
%
% Kokkos::parallel_for(Kokkos::RangePolicy(0, N),
% KOKKOS_LAMBDA(int i) {
% //Kokkos::atomic_add(&res(), v(i));
% res() = res() + v(i);
% });
%
% double res_;
%
% deep_copy(res_, res);
%
% std::cout << "res_:" << res_ << std::endl;
% std::cout << "4*N:" << 4*N << std::endl;
% }
% Kokkos::finalize();
%}

\begin{frame}[fragile]{Race condition}
\begin{columns}
\begin{column}{0.5\linewidth}
Threads execute independently. There is no guarantee on the order in which their instructions interleave:
\begin{minted}{C++}
histo(i%5) += i;
\end{minted}

When several threads access the same data concurrently, this causes \highlight{race conditions}.
\end{column}
\begin{column}{0.5\linewidth}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread6]Q[thread1]Q[thread1]Q[white]Q[thread6]Q[thread6]Q[white]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
read value & & ← & 0 \\
& read value & ← & 0 \\
add 1 & & & 0 \\
write value & & → & 1 \\
& & & 1 \\
& add 6 & & 1 \\
& write value & → & 6 \\
& & & 6 \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Atomic operation}
Replacing the addition with its atomic counterpart solves the problem:
\begin{columns}
\begin{column}{0.40\linewidth}
\begin{minted}{C++}
Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
histo(i%5) += i;
});
\end{minted}
\end{column}
\begin{column}{0.56\linewidth}
\begin{minted}{C++}
Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
Kokkos::atomic_add(&histo(i%5), i);
});
\end{minted}
\end{column}
\end{columns}
\structure{Note:} \texttt{atomic\_add} takes a pointer and not a
reference as first argument, as the instruction needs access to
the actual memory address of the modified variable.
\end{frame}

\begin{frame}[fragile]{Atomic operation}
\begin{columns}
\begin{column}{0.55\linewidth}
\texttt{atomic\_add} executes the read, add and write in a single atomic step,
guaranteeing the absence of race conditions during the operation:
\begin{minted}{C++}
Kokkos::atomic_add(&histo(i%5), i);
\end{minted}
\end{column}
\begin{column}{0.5\linewidth}
\noindent Either:

\vspace{0.5em}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread6]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
atomic add & & ←→ & 1 \\
& atomic add & ←→ & 7 \\
& & & 7 \\
\end{tblr}
\pause
\vspace{0.5em}

\noindent Or:

\vspace{0.5em}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread6]Q[thread1]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
& atomic add & ←→ & 6 \\
atomic add & & ←→ & 7 \\
& & & 7 \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Operations}
\begin{columns}
\begin{column}{0.35\linewidth}
Other common operations are available with the format \texttt{Kokkos::atomic\_[op]}:
\end{column}
\begin{column}{0.75\linewidth}
\SetTblrInner{rowsep=0pt}
\begin{tblr}[theme=kokkostable]{lc}
Operation & Replaces \\
\texttt{Kokkos::atomic\_add(\&x, y)} & \texttt{x += y} \\
\texttt{Kokkos::atomic\_and(\&x, y)} & \texttt{x \&= y} \\
\texttt{Kokkos::atomic\_dec(\&x)} & \texttt{x--} \\
\texttt{Kokkos::atomic\_inc(\&x)} & \texttt{x++} \\
\texttt{Kokkos::atomic\_lshift(\&x, y)} & \texttt{x = x << y} \\
\texttt{Kokkos::atomic\_max(\&x, y)} & \texttt{x = std::max(x, y)} \\
\texttt{Kokkos::atomic\_min(\&x, y)} & \texttt{x = std::min(x, y)} \\
\texttt{Kokkos::atomic\_mod(\&x, y)} & \texttt{x \%= y} \\
\texttt{Kokkos::atomic\_nand(\&x, y)} & \texttt{x = \textasciitilde{}(x \& y)} \\
\texttt{Kokkos::atomic\_or(\&x, y)} & \texttt{x |= y} \\
\texttt{Kokkos::atomic\_rshift(\&x, y)} & \texttt{x = x >> y} \\
\texttt{Kokkos::atomic\_sub(\&x, y)} & \texttt{x -= y} \\
\texttt{Kokkos::atomic\_store(\&x, y)} & \texttt{x = y} \\
\texttt{Kokkos::atomic\_xor(\&x, y)} & \texttt{x \^{}= y} \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Atomic Memory Trait}
\begin{columns}
\begin{column}{0.40\linewidth}
\begin{itemize}
\item If you need to access a View exclusively through atomic operations, you
can also create an alias with the \texttt{Atomic} memory trait
\item It guarantees that every operation done through the alias is performed atomically
\end{itemize}
\end{column}
\begin{column}{0.71\linewidth}
\begin{minted}{C++}
Kokkos::View<double*> histo(5);

Kokkos::View<double*,
Kokkos::MemoryTraits<Kokkos::Atomic>>
histo_atomic = histo;

Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
histo_atomic(i%5) += i;
});
\end{minted}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Performance}
\begin{columns}
\begin{column}{0.45\linewidth}
Atomics can have a huge impact on performance:
\begin{itemize}
\item The instruction itself is slower than the one it replaces
\item They may generate extra synchronisation points
\item They bypass and invalidate cache lines
\end{itemize}
\end{column}
\begin{column}{0.55\linewidth}
\begin{block}{Remarks}
\begin{itemize}
\item Atomics should be used with care and only when strictly
necessary
\item Algorithms can sometimes be changed when porting from CPU to GPU
in order to remove the need for atomics (colouring, replacing an
in-place algorithm with an out-of-place one, etc.)
\end{itemize}
\end{block}
\end{column}
\end{columns}
\end{frame}

% _____________________________________________________________________________

\section{Layouts}

% _____________________________________________________________________________

\section{Scatter Views}

\end{document}