Skip to content
261 changes: 259 additions & 2 deletions courses/02_intermediate/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@
row{1}={bg=lightmain},
}
}
\colorlet{thread1}{gray!25}
\colorlet{thread6}{example!25}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would select two tones of gray instead

Suggested change
\colorlet{thread1}{gray!25}
\colorlet{thread6}{example!25}
\colorlet{thread1}{gray!20}
\colorlet{thread6}{gray!40}

Or plainly use colors:

Suggested change
\colorlet{thread1}{gray!25}
\colorlet{thread6}{example!25}
\colorlet{thread1}{lightalert}
\colorlet{thread6}{lightexample}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially tried with the different levels of gray but found it hard to read, especially on slide 30.

The light red + light blue looks nice in colour but is harder to differentiate in B&W.
I'll do the change, we'll revert it if you think readability in B&W is important.


\graphicspath{{../../images/}}

Expand Down Expand Up @@ -648,12 +650,267 @@ \section{Subviews}

\section{Atomics}

\begin{frame}[fragile]{Race condition}
Porting code that builds a histogram:
\begin{columns}
\begin{column}{0.5\linewidth}
\begin{minted}{C++}
double histo[5] = {0};

for (int i=0; i < N; i++) {
histo[i%5] += i;
}
\end{minted}
\end{column}
\pause
\begin{column}{0.5\linewidth}
\begin{minted}{C++}
Kokkos::View<double*> histo(5);
Kokkos::deep_copy(histo, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need to manually initialize to 0, the view does it by default.

(Same remark for the other slides.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure this is guaranteed and not a side effect of memory allocation?
I find the doc not very clear on this subject, all allocating constructor have this text:

The initialization is executed on the default instance of the execution space corresponding to memory_space and fences it.

but it doesn't explain what kind of initialisation takes place for default types.

Copy link
Contributor Author

@PaulGannay PaulGannay Dec 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked Adrien (he worked on View initialisation), and he confirmed that you are right, I will delete the extra deep_copy.


Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
histo(i%5) += i;
});
\end{minted}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Race condition}
\begin{columns}
\begin{column}{0.5\linewidth}
Even simple instructions like increment are decomposed into several smaller assembly instructions:
\begin{minted}{C++}
histo(i%5) += i;
\end{minted}
\end{column}
\begin{column}{0.5\linewidth}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread1]Q[thread1]Q[white]Q[thread6]Q[thread6]Q[thread6]Q[white]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
read value & & ← & 0 \\
add 1 & & & 0 \\
write value & & → & 1 \\
& & & 1 \\
& read value & ← & 1 \\
& add 6 & & 1 \\
& write value & → & 7 \\
& & & 7 \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

% Trainees can play with the following program to check that it really exhibits a race condition:
%#include <iostream>
%#include <Kokkos_Core.hpp>
%
%int main(int argc, char *argv[]) {
% Kokkos::initialize(argc, argv);
% {
% const int N = 10000;
% Kokkos::View<double*> v("v", N);
% Kokkos::deep_copy(v, 4);
%
% Kokkos::View<double> res("res");
%
% Kokkos::parallel_for(Kokkos::RangePolicy(0, N),
% KOKKOS_LAMBDA(int i) {
% //Kokkos::atomic_add(&res(), v(i));
% res() = res() + v(i);
% });
%
% double res_;
%
% deep_copy(res_, res);
%
% std::cout << "res_:" << res_ << std::endl;
% std::cout << "4*N:" << 4*N << std::endl;
% }
% Kokkos::finalize();
%}

\begin{frame}[fragile]{Race condition}
\begin{columns}
\begin{column}{0.5\linewidth}
Execution between threads is independent. There is no guarantee over the order of instructions:
\begin{minted}{C++}
histo(i%5) += i;
\end{minted}

When several threads access the same data concurrently, this leads to \structure{race conditions}.
\end{column}
\begin{column}{0.5\linewidth}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread6]Q[thread1]Q[thread1]Q[white]Q[thread6]Q[thread6]Q[white]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
read value & & ← & 0 \\
& read value & ← & 0 \\
add 1 & & & 0 \\
write value & & → & 1 \\
& & & 1 \\
& add 6 & & 1 \\
& write value & → & 6 \\
& & & 6 \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Atomic operation}
Replacing the addition with its atomic counterpart solves the problem:
\begin{columns}
\begin{column}{0.40\linewidth}
\begin{minted}{C++}
Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
histo(i%5) += i;
});
\end{minted}
\end{column}
\begin{column}{0.56\linewidth}
\begin{minted}{C++}
Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
Kokkos::atomic_add(&histo(i%5), i);
});
\end{minted}
\end{column}
\end{columns}
Note that the \texttt{atomic\_add} instruction takes a pointer, not a
reference, as its first argument, since the instruction needs access to
the actual memory address of the modified variable.
\end{frame}

\begin{frame}[fragile]{Atomic operation}
\begin{columns}
\begin{column}{0.55\linewidth}
\texttt{atomic\_add} executes the read, add and write steps as a single atomic operation,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why "write," "read," and "add" in typewriter style and upper case first letter?

You can go by:

"atomic_add executes the write, read, and add steps in a single atomic step"

guaranteeing the absence of race conditions during the operation:
\begin{minted}{C++}
Kokkos::atomic_add(&histo(i%5), i);
\end{minted}
\end{column}
\begin{column}{0.5\linewidth}
\noindent Either:

\vspace{0.5em}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread1]Q[thread6]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
atomic add & & ←→ & 1 \\
& atomic add & ←→ & 7 \\
& & & 7 \\
\end{tblr}
\pause
\vspace{0.5em}

\noindent Or:

\vspace{0.5em}
\SetTblrInner{rowsep=0pt}
\begin{tblr}{colspec={cccc},rowspec={Q[lightmain]Q[white]Q[thread6]Q[thread1]}}
\textbf{Thread 1} & \textbf{Thread 6} & & \textbf{res} \\
& & & 0 \\
& atomic add & ←→ & 6 \\
atomic add & & ←→ & 7 \\
& & & 7 \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Operations}
\begin{columns}
\begin{column}{0.35\linewidth}
Other common operations are available with the format \texttt{Kokkos::atomic\_[op]}:
\end{column}
\begin{column}{0.75\linewidth}
\SetTblrInner{rowsep=0pt}
\begin{tblr}[theme=kokkostable]{lc}
Operation & Replaces \\
\texttt{Kokkos::atomic\_add(\&x, y)} & \texttt{x += y} \\
\texttt{Kokkos::atomic\_and(\&x, y)} & \texttt{x \&= y} \\
\texttt{Kokkos::atomic\_dec(\&x)} & \texttt{x--} \\
\texttt{Kokkos::atomic\_inc(\&x)} & \texttt{x++} \\
\texttt{Kokkos::atomic\_lshift(\&x, y)} & \texttt{x = x << y} \\
\texttt{Kokkos::atomic\_max(\&x, y)} & \texttt{x = std::max(x, y)} \\
\texttt{Kokkos::atomic\_min(\&x, y)} & \texttt{x = std::min(x, y)} \\
\texttt{Kokkos::atomic\_mod(\&x, y)} & \texttt{x \%= y} \\
\texttt{Kokkos::atomic\_nand(\&x, y)} & \texttt{x = \textasciitilde(x \& y)} \\
\texttt{Kokkos::atomic\_or(\&x, y)} & \texttt{x |= y} \\
\texttt{Kokkos::atomic\_rshift(\&x, y)} & \texttt{x = x >> y} \\
\texttt{Kokkos::atomic\_sub(\&x, y)} & \texttt{x -= y} \\
\texttt{Kokkos::atomic\_store(\&x, y)} & \texttt{x = y} \\
\texttt{Kokkos::atomic\_xor(\&x, y)} & \texttt{x \^{}= y} \\
\end{tblr}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Atomic Memory Trait}
\begin{columns}
\begin{column}{0.50\linewidth}
\begin{itemize}
\item If you need to access a View exclusively through atomic operations, you
can also create an alias for this View with the \texttt{Atomic} \texttt{MemoryTraits}
\item It guarantees that every operation done through the alias is performed atomically
\end{itemize}
\end{column}
\begin{column}{0.50\linewidth}
\begin{minted}{C++}
Kokkos::View<double*> histo(5);
Kokkos::deep_copy(histo, 0);

View<double*, MemoryTraits<Atomic>>
histo_atomic = histo;

Kokkos::parallel_for(
Kokkos::RangePolicy(0,N),
KOKKOS_LAMBDA(int i) {
histo_atomic(i%5) += i;
});
\end{minted}
\end{column}
\end{columns}
\end{frame}

\begin{frame}[fragile]{Performance}
\begin{columns}
\begin{column}{0.45\linewidth}
Atomics can have a huge impact on performance:
\begin{itemize}
\item The instruction itself is slower than the one it replaces
\item They may generate extra synchronisation points
\item They bypass and invalidate cache lines
\end{itemize}
\end{column}
\begin{column}{0.55\linewidth}
\begin{block}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
\begin{block}
\begin{block}{Remarks}

{\vspace*{-3ex}} % hide the title box of the block
\begin{itemize}
\item Atomics should be used with care and only when strictly
necessary
\item Algorithms can sometimes be changed when porting from CPU to GPU
in order to remove the need for atomics (colouring, replacing an
in-place algorithm with an out-of-place algorithm,~…)
\end{itemize}
\end{block}
\end{column}
\end{columns}
\end{frame}

% _____________________________________________________________________________

\section{Layouts}

% _____________________________________________________________________________

\section{Scatter Views}

\end{document}