diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 65af9582..044e35c5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,6 +21,7 @@ add_subdirectory(Example16)
 add_subdirectory(Example17)
 add_subdirectory(Example18)
 add_subdirectory(Example19)
+add_subdirectory(Example20)
 add_subdirectory(TestEm3)
 add_subdirectory(TestEm3MT)
 add_subdirectory(TestEm3Compact)
diff --git a/examples/Example20/CMakeLists.txt b/examples/Example20/CMakeLists.txt
new file mode 100644
index 00000000..8db86260
--- /dev/null
+++ b/examples/Example20/CMakeLists.txt
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: 2022 CERN
+# SPDX-License-Identifier: Apache-2.0
+
+set(TargetName example20)
+
+if(NOT TARGET G4HepEm::g4HepEm)
+  message(STATUS "Disabling ${TargetName} (needs G4HepEm)")
+  return()
+endif()
+
+if(Geant4_FOUND)
+  if(NOT Geant4_gdml_FOUND)
+    message(STATUS "Disabling ${TargetName} (needs Geant4 with GDML support)")
+    return()
+  endif()
+else()
+  message(STATUS "Disabling ${TargetName} (needs Geant4)")
+  return()
+endif()
+
+add_executable(${TargetName}
+  main.cpp
+  main.cu
+  electrons.cu
+  gammas.cu)
+target_link_libraries(${TargetName}
+  PRIVATE
+    AdePT
+    CopCore::CopCore
+    VecGeom::vecgeom
+    VecGeom::vecgeomcuda_static
+    VecGeom::vgdml
+    ${Geant4_LIBRARIES}
+    G4HepEm::g4HepEmData
+    G4HepEm::g4HepEmInit
+    G4HepEm::g4HepEmRun
+    CUDA::cudart)
+
+set_target_properties(${TargetName} PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+# NVTX
+target_link_libraries(${TargetName} PRIVATE NVTX)
+
+# Tests
+add_test(NAME ${TargetName}
+  COMMAND $<TARGET_FILE:${TargetName}> -gdml_file ${CMAKE_BINARY_DIR}/cms2018.gdml)
+
diff --git a/examples/Example20/README.md b/examples/Example20/README.md
new file mode 100644
index 00000000..891d39b0
--- /dev/null
+++ b/examples/Example20/README.md
@@ -0,0 +1,61 @@
+<!--
+SPDX-FileCopyrightText: 2022 CERN
+SPDX-License-Identifier: CC-BY-4.0
+-->
+
+## Example 20
+
+Example based on Example 19, but smaller kernels are fed with queues instead of the `SOAData` structure.
+
+ * arbitrary geometry via gdml file (tested with cms2018.gdml from VecGeom persistency/gdml/gdmls folder) and optionally a magnetic field with constant Bz,
+ * geometry read as Geant4 geometry, reading in regions and cuts, to initialize G4HepEm data
+ * geometry read then into VecGeom, and synchronized to GPU
+ * G4HepEm material-cuts couple indices mapped to VecGeom logical volume id's
+ * physics processes for e-/e+ (including MSC) and gammas using G4HepEm.
+ * scoring per placed volume, no sensitive detector feature
+ * configurable particle gun via command line arguments
+   * E.g., use `-gunpos -220,0,0 -gundir 1,0,0` for `testEm3.gdml`. For `cms2018.gdml`, the default gun is correct.
+
+Electrons, positrons, and gammas are stored in separate containers in device memory.
+Free positions in the storage are handed out with monotonic slot numbers, slots are not reused.
+Active tracks are passed via three queues per particle type (see `struct ParticleQueues`).
+Results are reproducible using one RANLUX++ state per track.
+
+Additionally, the kernels score energy deposit and the charged track length per volume.
+
+### Kernels
+
+This example uses one stream per particle type to launch kernels asynchronously.
+They are synchronized via a fourth stream using CUDA events.
+
+#### `TransportElectrons<bool IsElectron>`
+
+1. Obtain safety unless the track is currently on a boundary.
+2. Determine physics step limit, including conversion to geometric step length according to MSC.
+3. Query geometry (or optionally magnetic field) to get geometry step length.
+4. Convert geometry to true step length according to MSC, apply net direction change and displacement.
+5. Apply continuous effects; kill track if stopped.
+6. If the particle reaches a boundary, perform relocation.
+7. If not, and if there is a discrete process, hand over to interaction kernel.
+
+#### `TransportGammas`
+
+1. Determine the physics step limit.
+2. Query VecGeom to get geometry step length (no magnetic field for neutral particles!).
+3. If the particle reaches a boundary, perform relocation.
+4. If not, and if there is a discrete process, hand over to interaction kernel.
+
+#### Interaction kernels
+In electrons.cu and gammas.cu, there is dedicated kernels for all discrete processes. These were
+extracted from the main transport kernels in example18.
+1. Sample the final state.
+2. Update the primary and produce secondaries.
+
+#### `FinishIteration`
+
+Clear the queues and return the tracks in flight.
+This kernel runs after all secondary particles were produced.
+
+#### `InitPrimaries` and `InitParticleQueues`
+
+Used to initialize multiple primary particles with separate seeds.
diff --git a/examples/Example20/electrons.cu b/examples/Example20/electrons.cu
new file mode 100644
index 00000000..6dd1a44b
--- /dev/null
+++ b/examples/Example20/electrons.cu
@@ -0,0 +1,444 @@
+// SPDX-FileCopyrightText: 2022 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#include "example.cuh"
+
+#include <AdePT/BVHNavigator.h>
+#include <fieldPropagatorConstBz.h>
+
+#include <CopCore/PhysicalConstants.h>
+
+#define NOFLUCTUATION
+
+#include <G4HepEmElectronManager.hh>
+#include <G4HepEmElectronTrack.hh>
+#include <G4HepEmElectronInteractionBrem.hh>
+#include <G4HepEmElectronInteractionIoni.hh>
+#include <G4HepEmElectronInteractionUMSC.hh>
+#include <G4HepEmPositronInteractionAnnihilation.hh>
+// Pull in implementation.
+#include <G4HepEmRunUtils.icc>
+#include <G4HepEmInteractionUtils.icc>
+#include <G4HepEmElectronManager.icc>
+#include <G4HepEmElectronInteractionBrem.icc>
+#include <G4HepEmElectronInteractionIoni.icc>
+#include <G4HepEmElectronInteractionUMSC.icc>
+#include <G4HepEmPositronInteractionAnnihilation.icc>
+
+// Compute the physics and geometry step limit, transport the electrons while
+// applying the continuous effects and maybe a discrete process that could
+// generate secondaries.
+template <bool IsElectron>
+static __device__ __forceinline__ void TransportElectrons(Track *electrons, const adept::MParray *active,
+                                                          Secondaries &secondaries, adept::MParray *activeQueue,
+                                                          Interactions interactions, GlobalScoring *globalScoring,
+                                                          ScoringPerVolume *scoringPerVolume)
+{
+#ifdef VECGEOM_FLOAT_PRECISION
+  const Precision kPush = 10 * vecgeom::kTolerance;
+#else
+  const Precision kPush = 0.;
+#endif
+  constexpr int Charge  = IsElectron ? -1 : 1;
+  constexpr double Mass = copcore::units::kElectronMassC2;
+  fieldPropagatorConstBz fieldPropagatorBz(BzFieldValue);
+
+  int activeSize = active->size();
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) {
+    const int globalSlot = (*active)[i];
+    Track &currentTrack  = electrons[globalSlot];
+    auto energy          = currentTrack.energy;
+    auto pos             = currentTrack.pos;
+    auto dir             = currentTrack.dir;
+    auto navState        = currentTrack.navState;
+    const auto volume    = navState.Top();
+    const int volumeID   = volume->id();
+    // the MCC vector is indexed by the logical volume id
+    const int lvolID     = volume->GetLogicalVolume()->id();
+    const int theMCIndex = MCIndex[lvolID];
+
+    auto survive = [&](bool push = true) {
+      currentTrack.energy   = energy;
+      currentTrack.pos      = pos;
+      currentTrack.dir      = dir;
+      currentTrack.navState = navState;
+      if (push) activeQueue->push_back(globalSlot);
+    };
+
+    // Init a track with the needed data to call into G4HepEm.
+    G4HepEmElectronTrack elTrack;
+    G4HepEmTrack *theTrack = elTrack.GetTrack();
+    theTrack->SetEKin(energy);
+    theTrack->SetMCIndex(theMCIndex);
+    theTrack->SetOnBoundary(navState.IsOnBoundary());
+    theTrack->SetCharge(Charge);
+    G4HepEmMSCTrackData *mscData = elTrack.GetMSCTrackData();
+    mscData->fIsFirstStep        = currentTrack.initialRange < 0;
+    mscData->fInitialRange       = currentTrack.initialRange;
+    mscData->fDynamicRangeFactor = currentTrack.dynamicRangeFactor;
+    mscData->fTlimitMin          = currentTrack.tlimitMin;
+
+    // Prepare a branched RNG state while threads are synchronized. Even if not
+    // used, this provides a fresh round of random numbers and reduces thread
+    // divergence because the RNG state doesn't need to be advanced later.
+    RanluxppDouble newRNG(currentTrack.rngState.BranchNoAdvance());
+
+    // Compute safety, needed for MSC step limit.
+    double safety = 0;
+    if (!navState.IsOnBoundary()) {
+      safety = BVHNavigator::ComputeSafety(pos, navState);
+    }
+    theTrack->SetSafety(safety);
+
+    G4HepEmRandomEngine rnge(&currentTrack.rngState);
+
+    // Sample the `number-of-interaction-left` and put it into the track.
+    for (int ip = 0; ip < 3; ++ip) {
+      double numIALeft = currentTrack.numIALeft[ip];
+      if (numIALeft <= 0) {
+        numIALeft = -std::log(currentTrack.Uniform());
+      }
+      theTrack->SetNumIALeft(numIALeft, ip);
+    }
+
+    // Call G4HepEm to compute the physics step limit.
+    // G4HepEmElectronManager::HowFar(&g4HepEmData, &g4HepEmPars, &elTrack, &rnge);
+    G4HepEmElectronManager::HowFarToDiscreteInteraction(&g4HepEmData, &g4HepEmPars, &elTrack);
+
+    bool restrictedPhysicalStepLength = false;
+    if (BzFieldValue != 0) {
+      const double momentumMag = sqrt(energy * (energy + 2.0 * Mass));
+      // Distance along the track direction to reach the maximum allowed error
+      const double safeLength = fieldPropagatorBz.ComputeSafeLength(momentumMag, Charge, dir);
+
+      constexpr int MaxSafeLength = 10;
+      double limit                = MaxSafeLength * safeLength;
+      limit                       = safety > limit ? safety : limit;
+
+      double physicalStepLength = elTrack.GetPStepLength();
+      if (physicalStepLength > limit) {
+        physicalStepLength           = limit;
+        restrictedPhysicalStepLength = true;
+        elTrack.SetPStepLength(physicalStepLength);
+        // Note: We are limiting the true step length, which is converted to
+        // a shorter geometry step length in HowFarToMSC. In that sense, the
+        // limit is an over-approximation, but that is fine for our purpose.
+      }
+    }
+
+    G4HepEmElectronManager::HowFarToMSC(&g4HepEmData, &g4HepEmPars, &elTrack, &rnge);
+
+    // Remember MSC values for the next step(s).
+    currentTrack.initialRange       = mscData->fInitialRange;
+    currentTrack.dynamicRangeFactor = mscData->fDynamicRangeFactor;
+    currentTrack.tlimitMin          = mscData->fTlimitMin;
+
+    // Get result into variables.
+    double geometricalStepLengthFromPhysics = theTrack->GetGStepLength();
+    // The phyiscal step length is the amount that the particle experiences
+    // which might be longer than the geometrical step length due to MSC. As
+    // long as we call PerformContinuous in the same kernel we don't need to
+    // care, but we need to make this available when splitting the operations.
+    // double physicalStepLength = elTrack.GetPStepLength();
+    int winnerProcessIndex = theTrack->GetWinnerProcessIndex();
+    // Leave the range and MFP inside the G4HepEmTrack. If we split kernels, we
+    // also need to carry them over!
+
+    // Check if there's a volume boundary in between.
+    bool propagated = true;
+    double geometryStepLength;
+    vecgeom::NavStateIndex nextState;
+    if (BzFieldValue != 0) {
+      geometryStepLength = fieldPropagatorBz.ComputeStepAndNextVolume<BVHNavigator>(
+          energy, Mass, Charge, geometricalStepLengthFromPhysics, pos, dir, navState, nextState, propagated, safety);
+    } else {
+      geometryStepLength = BVHNavigator::ComputeStepAndNextVolume(pos, dir, geometricalStepLengthFromPhysics, navState,
+                                                                  nextState, kPush);
+      pos += geometryStepLength * dir;
+    }
+
+    // Set boundary state in navState so the next step and secondaries get the
+    // correct information (navState = nextState only if relocated
+    // in case of a boundary; see below)
+    navState.SetBoundaryState(nextState.IsOnBoundary());
+
+    // Propagate information from geometrical step to MSC.
+    theTrack->SetDirection(dir.x(), dir.y(), dir.z());
+    theTrack->SetGStepLength(geometryStepLength);
+    theTrack->SetOnBoundary(nextState.IsOnBoundary());
+
+    // Apply continuous effects.
+    bool stopped = G4HepEmElectronManager::PerformContinuous(&g4HepEmData, &g4HepEmPars, &elTrack, &rnge);
+
+    // Collect the direction change and displacement by MSC.
+    const double *direction = theTrack->GetDirection();
+    dir.Set(direction[0], direction[1], direction[2]);
+    if (!nextState.IsOnBoundary()) {
+      const double *mscDisplacement = mscData->GetDisplacement();
+      vecgeom::Vector3D<Precision> displacement(mscDisplacement[0], mscDisplacement[1], mscDisplacement[2]);
+      const double dLength2            = displacement.Length2();
+      constexpr double kGeomMinLength  = 5 * copcore::units::nm;          // 0.05 [nm]
+      constexpr double kGeomMinLength2 = kGeomMinLength * kGeomMinLength; // (0.05 [nm])^2
+      if (dLength2 > kGeomMinLength2) {
+        const double dispR = std::sqrt(dLength2);
+        // Estimate safety by subtracting the geometrical step length.
+        safety -= geometryStepLength;
+        constexpr double sFact = 0.99;
+        double reducedSafety   = sFact * safety;
+
+        // Apply displacement, depending on how close we are to a boundary.
+        // 1a. Far away from geometry boundary:
+        if (reducedSafety > 0.0 && dispR <= reducedSafety) {
+          pos += displacement;
+        } else {
+          // Recompute safety.
+          safety        = BVHNavigator::ComputeSafety(pos, navState);
+          reducedSafety = sFact * safety;
+
+          // 1b. Far away from geometry boundary:
+          if (reducedSafety > 0.0 && dispR <= reducedSafety) {
+            pos += displacement;
+            // 2. Push to boundary:
+          } else if (reducedSafety > kGeomMinLength) {
+            pos += displacement * (reducedSafety / dispR);
+          }
+          // 3. Very small safety: do nothing.
+        }
+      }
+    }
+
+    // Collect the charged step length (might be changed by MSC).
+    atomicAdd(&globalScoring->chargedSteps, 1);
+    atomicAdd(&scoringPerVolume->chargedTrackLength[volumeID], elTrack.GetPStepLength());
+
+    // Collect the changes in energy and deposit.
+    energy               = theTrack->GetEKin();
+    double energyDeposit = theTrack->GetEnergyDeposit();
+    atomicAdd(&globalScoring->energyDeposit, energyDeposit);
+    atomicAdd(&scoringPerVolume->energyDeposit[volumeID], energyDeposit);
+
+    // Save the `number-of-interaction-left` in our track.
+    for (int ip = 0; ip < 3; ++ip) {
+      double numIALeft           = theTrack->GetNumIALeft(ip);
+      currentTrack.numIALeft[ip] = numIALeft;
+    }
+
+    if (stopped) {
+      if (!IsElectron) {
+        // Annihilate the stopped positron into two gammas heading to opposite
+        // directions (isotropic).
+        Track &gamma1 = secondaries.gammas.NextTrack();
+        Track &gamma2 = secondaries.gammas.NextTrack();
+        atomicAdd(&globalScoring->numGammas, 2);
+
+        const double cost = 2 * currentTrack.Uniform() - 1;
+        const double sint = sqrt(1 - cost * cost);
+        const double phi  = k2Pi * currentTrack.Uniform();
+        double sinPhi, cosPhi;
+        sincos(phi, &sinPhi, &cosPhi);
+
+        gamma1.InitAsSecondary(pos, navState);
+        newRNG.Advance();
+        gamma1.rngState = newRNG;
+        gamma1.energy   = copcore::units::kElectronMassC2;
+        gamma1.dir.Set(sint * cosPhi, sint * sinPhi, cost);
+
+        gamma2.InitAsSecondary(pos, navState);
+        // Reuse the RNG state of the dying track.
+        gamma2.rngState = currentTrack.rngState;
+        gamma2.energy   = copcore::units::kElectronMassC2;
+        gamma2.dir      = -gamma1.dir;
+      }
+      // Particles are killed by not enqueuing them into the new activeQueue.
+      continue;
+    }
+
+    if (nextState.IsOnBoundary()) {
+      // For now, just count that we hit something.
+      atomicAdd(&globalScoring->hits, 1);
+
+      // Kill the particle if it left the world.
+      if (nextState.Top() != nullptr) {
+        BVHNavigator::RelocateToNextVolume(pos, dir, nextState);
+
+        // Move to the next boundary.
+        navState = nextState;
+        survive();
+      }
+      continue;
+    } else if (!propagated || restrictedPhysicalStepLength) {
+      // Did not yet reach the interaction point due to error in the magnetic
+      // field propagation. Try again next time.
+      survive();
+      continue;
+    } else if (winnerProcessIndex < 0) {
+      // No discrete process, move on.
+      survive();
+      continue;
+    }
+
+    // Reset number of interaction left for the winner discrete process.
+    // (Will be resampled in the next iteration.)
+    currentTrack.numIALeft[winnerProcessIndex] = -1.0;
+
+    // Check if a delta interaction happens instead of the real discrete process.
+    if (G4HepEmElectronManager::CheckDelta(&g4HepEmData, theTrack, currentTrack.Uniform())) {
+      // A delta interaction happened, move on.
+      survive();
+      continue;
+    }
+
+    interactions.queues[winnerProcessIndex]->push_back(globalSlot);
+
+    survive(false);
+  }
+}
+
+// Instantiate kernels for electrons and positrons.
+__global__ void TransportElectrons(Track *electrons, const adept::MParray *active, Secondaries secondaries,
+                                   adept::MParray *activeQueue, Interactions interactions, GlobalScoring *globalScoring,
+                                   ScoringPerVolume *scoringPerVolume)
+{
+  TransportElectrons</*IsElectron*/ true>(electrons, active, secondaries, activeQueue, interactions, globalScoring,
+                                          scoringPerVolume);
+}
+__global__ void TransportPositrons(Track *positrons, const adept::MParray *active, Secondaries secondaries,
+                                   adept::MParray *activeQueue, Interactions interactions, GlobalScoring *globalScoring,
+                                   ScoringPerVolume *scoringPerVolume)
+{
+  TransportElectrons</*IsElectron*/ false>(positrons, active, secondaries, activeQueue, interactions, globalScoring,
+                                           scoringPerVolume);
+}
+
+template <bool IsElectron, int ProcessIndex>
+__device__ void ElectronInteraction(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                    adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                    ScoringPerVolume *scoringPerVolume)
+{
+  int activeSize = active->size();
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) {
+    const int globalSlot = (*active)[i];
+    Track &currentTrack = particles[globalSlot];
+    auto energy         = currentTrack.energy;
+    const auto pos      = currentTrack.pos;
+    auto dir            = currentTrack.dir;
+    const auto navState = currentTrack.navState;
+    const auto volume   = navState.Top();
+    // the MCC vector is indexed by the logical volume id
+    const int lvolID     = volume->GetLogicalVolume()->id();
+    const int theMCIndex = MCIndex[lvolID];
+
+    auto survive = [&] {
+      currentTrack.dir    = dir;
+      currentTrack.energy = energy;
+      activeQueue->push_back(globalSlot);
+    };
+
+    const double theElCut = g4HepEmData.fTheMatCutData->fMatCutData[theMCIndex].fSecElProdCutE;
+
+    RanluxppDouble newRNG{currentTrack.rngState.Branch()};
+    G4HepEmRandomEngine rnge{&currentTrack.rngState};
+
+    if constexpr (ProcessIndex == 0) {
+      // Invoke ionization (for e-/e+):
+      double deltaEkin = (IsElectron) ? G4HepEmElectronInteractionIoni::SampleETransferMoller(theElCut, energy, &rnge)
+                                      : G4HepEmElectronInteractionIoni::SampleETransferBhabha(theElCut, energy, &rnge);
+
+      double dirPrimary[] = {dir.x(), dir.y(), dir.z()};
+      double dirSecondary[3];
+      G4HepEmElectronInteractionIoni::SampleDirections(energy, deltaEkin, dirSecondary, dirPrimary, &rnge);
+
+      Track &secondary = secondaries.electrons.NextTrack();
+      atomicAdd(&globalScoring->numElectrons, 1);
+
+      secondary.InitAsSecondary(pos, navState);
+      secondary.rngState = newRNG;
+      secondary.energy   = deltaEkin;
+      secondary.dir.Set(dirSecondary[0], dirSecondary[1], dirSecondary[2]);
+
+      energy -= deltaEkin;
+      dir.Set(dirPrimary[0], dirPrimary[1], dirPrimary[2]);
+      survive();
+    } else if constexpr (ProcessIndex == 1) {
+      // Invoke model for Bremsstrahlung: either SB- or Rel-Brem.
+      double logEnergy = std::log(energy);
+      double deltaEkin = energy < g4HepEmPars.fElectronBremModelLim
+                             ? G4HepEmElectronInteractionBrem::SampleETransferSB(&g4HepEmData, energy, logEnergy,
+                                                                                 theMCIndex, &rnge, IsElectron)
+                             : G4HepEmElectronInteractionBrem::SampleETransferRB(&g4HepEmData, energy, logEnergy,
+                                                                                 theMCIndex, &rnge, IsElectron);
+
+      double dirPrimary[] = {dir.x(), dir.y(), dir.z()};
+      double dirSecondary[3];
+      G4HepEmElectronInteractionBrem::SampleDirections(energy, deltaEkin, dirSecondary, dirPrimary, &rnge);
+
+      Track &gamma = secondaries.gammas.NextTrack();
+      atomicAdd(&globalScoring->numGammas, 1);
+
+      gamma.InitAsSecondary(pos, navState);
+      gamma.rngState = newRNG;
+      gamma.energy   = deltaEkin;
+      gamma.dir.Set(dirSecondary[0], dirSecondary[1], dirSecondary[2]);
+
+      energy -= deltaEkin;
+      dir.Set(dirPrimary[0], dirPrimary[1], dirPrimary[2]);
+      survive();
+    } else if constexpr (ProcessIndex == 2) {
+      // Invoke annihilation (in-flight) for e+
+      double dirPrimary[] = {dir.x(), dir.y(), dir.z()};
+      double theGamma1Ekin, theGamma2Ekin;
+      double theGamma1Dir[3], theGamma2Dir[3];
+      G4HepEmPositronInteractionAnnihilation::SampleEnergyAndDirectionsInFlight(
+          energy, dirPrimary, &theGamma1Ekin, theGamma1Dir, &theGamma2Ekin, theGamma2Dir, &rnge);
+
+      Track &gamma1 = secondaries.gammas.NextTrack();
+      Track &gamma2 = secondaries.gammas.NextTrack();
+      atomicAdd(&globalScoring->numGammas, 2);
+
+      gamma1.InitAsSecondary(pos, navState);
+      gamma1.rngState = newRNG;
+      gamma1.energy   = theGamma1Ekin;
+      gamma1.dir.Set(theGamma1Dir[0], theGamma1Dir[1], theGamma1Dir[2]);
+
+      gamma2.InitAsSecondary(pos, navState);
+      // Reuse the RNG state of the dying track.
+      gamma2.rngState = currentTrack.rngState;
+      gamma2.energy   = theGamma2Ekin;
+      gamma2.dir.Set(theGamma2Dir[0], theGamma2Dir[1], theGamma2Dir[2]);
+
+      // The current track is killed by not enqueuing into the next activeQueue.
+    }
+  }
+}
+
+__global__ void IonizationEl(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                             adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                             ScoringPerVolume *scoringPerVolume)
+{
+  ElectronInteraction<true, 0>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
+__global__ void BremsstrahlungEl(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                 adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                 ScoringPerVolume *scoringPerVolume)
+{
+  ElectronInteraction<true, 1>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
+
+__global__ void IonizationPos(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                              adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                              ScoringPerVolume *scoringPerVolume)
+{
+  ElectronInteraction<false, 0>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
+__global__ void BremsstrahlungPos(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                  adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                  ScoringPerVolume *scoringPerVolume)
+{
+  ElectronInteraction<false, 1>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
+__global__ void AnnihilationPos(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                ScoringPerVolume *scoringPerVolume)
+{
+  ElectronInteraction<false, 2>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
diff --git a/examples/Example20/example.cuh b/examples/Example20/example.cuh
new file mode 100644
index 00000000..0f274325
--- /dev/null
+++ b/examples/Example20/example.cuh
@@ -0,0 +1,174 @@
+// SPDX-FileCopyrightText: 2022 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef EXAMPLE_CUH
+#define EXAMPLE_CUH
+
+#include "example.h"
+
+#include <AdePT/MParray.h>
+#include <CopCore/SystemOfUnits.h>
+#include <CopCore/Ranluxpp.h>
+
+#include <G4HepEmData.hh>
+#include <G4HepEmParameters.hh>
+#include <G4HepEmRandomEngine.hh>
+
+#include <VecGeom/base/Vector3D.h>
+#include <VecGeom/navigation/NavStateIndex.h>
+
+constexpr int ThreadsPerBlock = 256;
+
+// A data structure to represent a particle track. The particle type is implicit
+// by the queue and not stored in memory.
+struct Track {
+  using Precision = vecgeom::Precision;
+  RanluxppDouble rngState;
+  double energy;
+  double numIALeft[3];
+  double initialRange;
+  double dynamicRangeFactor;
+  double tlimitMin;
+
+  vecgeom::Vector3D<Precision> pos;
+  vecgeom::Vector3D<Precision> dir;
+  vecgeom::NavStateIndex navState;
+
+  __host__ __device__ double Uniform() { return rngState.Rndm(); }
+
+  __host__ __device__ void InitAsSecondary(const vecgeom::Vector3D<Precision> &parentPos,
+                                           const vecgeom::NavStateIndex &parentNavState)
+  {
+    // The caller is responsible to branch a new RNG state and to set the energy.
+    this->numIALeft[0] = -1.0;
+    this->numIALeft[1] = -1.0;
+    this->numIALeft[2] = -1.0;
+
+    this->initialRange       = -1.0;
+    this->dynamicRangeFactor = -1.0;
+    this->tlimitMin          = -1.0;
+
+    // A secondary inherits the position of its parent; the caller is responsible
+    // to update the directions.
+    this->pos      = parentPos;
+    this->navState = parentNavState;
+  }
+};
+
+#ifdef __CUDA_ARCH__
+// Define inline implementations of the RNG methods for the device.
+// (nvcc ignores the __device__ attribute in definitions, so this is only to
+// communicate the intent.)
+inline __device__ double G4HepEmRandomEngine::flat()
+{
+  return ((RanluxppDouble *)fObject)->Rndm();
+}
+
+inline __device__ void G4HepEmRandomEngine::flatArray(const int size, double *vect)
+{
+  for (int i = 0; i < size; i++) {
+    vect[i] = ((RanluxppDouble *)fObject)->Rndm();
+  }
+}
+#endif
+
+// A data structure to manage slots in the track storage.
+class SlotManager {
+  adept::Atomic_t<int> fNextSlot;
+  const int fMaxSlot;
+
+public:
+  __host__ __device__ SlotManager(int maxSlot) : fMaxSlot(maxSlot) { fNextSlot = 0; }
+
+  __host__ __device__ int NextSlot()
+  {
+    int next = fNextSlot.fetch_add(1);
+    if (next >= fMaxSlot) return -1;
+    return next;
+  }
+};
+
+// A bundle of pointers to generate particles of an implicit type.
+class ParticleGenerator {
+  Track *fTracks;
+  SlotManager *fSlotManager;
+  adept::MParray *fActiveQueue;
+
+public:
+  __host__ __device__ ParticleGenerator(Track *tracks, SlotManager *slotManager, adept::MParray *activeQueue)
+      : fTracks(tracks), fSlotManager(slotManager), fActiveQueue(activeQueue)
+  {
+  }
+
+  __host__ __device__ Track &NextTrack()
+  {
+    int slot = fSlotManager->NextSlot();
+    if (slot == -1) {
+      COPCORE_EXCEPTION("No slot available in ParticleGenerator::NextTrack");
+    }
+    fActiveQueue->push_back(slot);
+    return fTracks[slot];
+  }
+};
+
+// A bundle of generators for the three particle types.
+struct Secondaries {
+  ParticleGenerator electrons;
+  ParticleGenerator positrons;
+  ParticleGenerator gammas;
+};
+
+struct Interactions {
+  adept::MParray *queues[3];
+};
+
+// Kernels in different TUs.
+__global__ void TransportElectrons(Track *electrons, const adept::MParray *active, Secondaries secondaries,
+                                   adept::MParray *activeQueue, Interactions interactions, GlobalScoring *globalScoring,
+                                   ScoringPerVolume *scoringPerVolume);
+__global__ void TransportPositrons(Track *positrons, const adept::MParray *active, Secondaries secondaries,
+                                   adept::MParray *activeQueue, Interactions interactions, GlobalScoring *globalScoring,
+                                   ScoringPerVolume *scoringPerVolume);
+
+__global__ void TransportGammas(Track *gammas, const adept::MParray *active, Secondaries secondaries,
+                                adept::MParray *activeQueue, Interactions interactions, GlobalScoring *globalScoring,
+                                ScoringPerVolume *scoringPerVolume);
+
+__global__ void IonizationEl(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                             adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                             ScoringPerVolume *scoringPerVolume);
+__global__ void BremsstrahlungEl(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                 adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                 ScoringPerVolume *scoringPerVolume);
+
+__global__ void IonizationPos(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                              adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                              ScoringPerVolume *scoringPerVolume);
+__global__ void BremsstrahlungPos(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                  adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                  ScoringPerVolume *scoringPerVolume);
+__global__ void AnnihilationPos(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                ScoringPerVolume *scoringPerVolume);
+
+__global__ void PairCreation(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                             adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                             ScoringPerVolume *scoringPerVolume);
+__global__ void ComptonScattering(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                  adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                  ScoringPerVolume *scoringPerVolume);
+__global__ void PhotoelectricEffect(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                    adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                    ScoringPerVolume *scoringPerVolume);
+
+// Constant data structures from G4HepEm accessed by the kernels.
+// (defined in TestEm3.cu)
+extern __constant__ __device__ struct G4HepEmParameters g4HepEmPars;
+extern __constant__ __device__ struct G4HepEmData g4HepEmData;
+
+extern __constant__ __device__ int *MCIndex;
+
+// constexpr vecgeom::Precision BzFieldValue = 3.8 * copcore::units::tesla;
+constexpr vecgeom::Precision BzFieldValue = 0;
+
+#endif
diff --git a/examples/Example20/example.h b/examples/Example20/example.h
new file mode 100644
index 00000000..743727a2
--- /dev/null
+++ b/examples/Example20/example.h
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: 2021 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef EXAMPLE_H
+#define EXAMPLE_H
+
+struct G4HepEmState;
+
+// Data structures for scoring. The accessors must make sure to use atomic operations if needed.
+struct GlobalScoring {
+  double energyDeposit;
+  // Not int to avoid overflows for more than 100,000 events; unsigned long long
+  // is the only other data type available for atomicAdd().
+  unsigned long long chargedSteps;
+  unsigned long long neutralSteps;
+  unsigned long long hits;
+  unsigned long long numGammas;
+  unsigned long long numElectrons;
+  unsigned long long numPositrons;
+  // Not used on the device, filled in by the host.
+  unsigned long long numKilled;
+};
+
+struct ScoringPerVolume {
+  double *energyDeposit;
+  double *chargedTrackLength;
+};
+
+struct GunConfig {
+  float position[3];
+  float direction[3];
+  bool movingGun;
+};
+
+// Interface between C++ and CUDA.
+void runGPU(int numParticles, double energy, int batch, const int *MCCindex, ScoringPerVolume *scoringPerVolume,
+            GlobalScoring *globalScoring, int numVolumes, int numPlaced, G4HepEmState *state, GunConfig gunConfig);
+
+#endif
diff --git a/examples/Example20/gammas.cu b/examples/Example20/gammas.cu
new file mode 100644
index 00000000..a4607cd0
--- /dev/null
+++ b/examples/Example20/gammas.cu
@@ -0,0 +1,276 @@
+// SPDX-FileCopyrightText: 2022 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#include "example.cuh"
+
+#include <AdePT/BVHNavigator.h>
+
+#include <CopCore/PhysicalConstants.h>
+
+#include <G4HepEmGammaManager.hh>
+#include <G4HepEmGammaTrack.hh>
+#include <G4HepEmTrack.hh>
+#include <G4HepEmGammaInteractionCompton.hh>
+#include <G4HepEmGammaInteractionConversion.hh>
+#include <G4HepEmGammaInteractionPhotoelectric.hh>
+// Pull in implementation.
+#include <G4HepEmGammaManager.icc>
+#include <G4HepEmGammaInteractionCompton.icc>
+#include <G4HepEmGammaInteractionConversion.icc>
+#include <G4HepEmGammaInteractionPhotoelectric.icc>
+
+__global__ void TransportGammas(Track *gammas, const adept::MParray *active, Secondaries secondaries,
+                                adept::MParray *activeQueue, Interactions interactions, GlobalScoring *globalScoring,
+                                ScoringPerVolume *scoringPerVolume)
+{
+#ifdef VECGEOM_FLOAT_PRECISION
+  const Precision kPush = 10 * vecgeom::kTolerance;
+#else
+  const Precision kPush = 0.;
+#endif
+  int activeSize = active->size();
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) {
+    const int slot      = (*active)[i];
+    Track &currentTrack = gammas[slot];
+    const auto energy   = currentTrack.energy;
+    auto pos            = currentTrack.pos;
+    const auto dir      = currentTrack.dir;
+    auto navState       = currentTrack.navState;
+    const auto volume   = navState.Top();
+    const int volumeID  = volume->id();
+    // the MCC vector is indexed by the logical volume id
+    const int lvolID     = volume->GetLogicalVolume()->id();
+    const int theMCIndex = MCIndex[lvolID];
+
+    auto survive = [&](bool push = true) {
+      currentTrack.pos      = pos;
+      currentTrack.navState = navState;
+      if (push) activeQueue->push_back(slot);
+    };
+
+    // Init a track with the needed data to call into G4HepEm.
+    G4HepEmGammaTrack gammaTrack;
+    G4HepEmTrack *theTrack = gammaTrack.GetTrack();
+    theTrack->SetEKin(energy);
+    theTrack->SetMCIndex(theMCIndex);
+
+    // Sample the `number-of-interaction-left` and put it into the track.
+    for (int ip = 0; ip < 3; ++ip) {
+      double numIALeft = currentTrack.numIALeft[ip];
+      if (numIALeft <= 0) {
+        numIALeft = -std::log(currentTrack.Uniform());
+      }
+      theTrack->SetNumIALeft(numIALeft, ip);
+    }
+
+    // Call G4HepEm to compute the physics step limit.
+    G4HepEmGammaManager::HowFar(&g4HepEmData, &g4HepEmPars, &gammaTrack);
+
+    // Get result into variables.
+    double geometricalStepLengthFromPhysics = theTrack->GetGStepLength();
+    int winnerProcessIndex                  = theTrack->GetWinnerProcessIndex();
+    // Leave the range and MFP inside the G4HepEmTrack. If we split kernels, we
+    // also need to carry them over!
+
+    // Check if there's a volume boundary in between.
+    vecgeom::NavStateIndex nextState;
+    double geometryStepLength =
+        BVHNavigator::ComputeStepAndNextVolume(pos, dir, geometricalStepLengthFromPhysics, navState, nextState, kPush);
+    pos += geometryStepLength * dir;
+    atomicAdd(&globalScoring->neutralSteps, 1);
+
+    // Set boundary state in navState so the next step and secondaries get the
+    // correct information (navState = nextState only if relocated
+    // in case of a boundary; see below)
+    navState.SetBoundaryState(nextState.IsOnBoundary());
+
+    // Propagate information from geometrical step to G4HepEm.
+    theTrack->SetGStepLength(geometryStepLength);
+    theTrack->SetOnBoundary(nextState.IsOnBoundary());
+
+    G4HepEmGammaManager::UpdateNumIALeft(theTrack);
+
+    // Save the `number-of-interaction-left` in our track.
+    for (int ip = 0; ip < 3; ++ip) {
+      double numIALeft           = theTrack->GetNumIALeft(ip);
+      currentTrack.numIALeft[ip] = numIALeft;
+    }
+
+    if (nextState.IsOnBoundary()) {
+      // For now, just count that we hit something.
+      atomicAdd(&globalScoring->hits, 1);
+
+      // Kill the particle if it left the world.
+      if (nextState.Top() != nullptr) {
+        BVHNavigator::RelocateToNextVolume(pos, dir, nextState);
+
+        // Move to the next boundary.
+        navState = nextState;
+        survive();
+      }
+      continue;
+    } else if (winnerProcessIndex < 0) {
+      // No discrete process, move on.
+      survive();
+      continue;
+    }
+
+    // Reset number of interaction left for the winner discrete process.
+    // (Will be resampled in the next iteration.)
+    currentTrack.numIALeft[winnerProcessIndex] = -1.0;
+
+    interactions.queues[winnerProcessIndex]->push_back(slot);
+    survive(false);
+  }
+}
+
+template <int ProcessIndex>
+__device__ void GammaInteraction(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                 adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                 ScoringPerVolume *scoringPerVolume)
+{
+  int activeSize = active->size();
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) {
+    const int globalSlot = (*active)[i];
+    Track &currentTrack = particles[globalSlot];
+    const auto energy   = currentTrack.energy;
+    const auto pos      = currentTrack.pos;
+    const auto dir      = currentTrack.dir;
+    const auto navState = currentTrack.navState;
+    const auto volume   = navState.Top();
+    const int volumeID  = volume->id();
+    // the MCC vector is indexed by the logical volume id
+    const int lvolID     = volume->GetLogicalVolume()->id();
+    const int theMCIndex = MCIndex[lvolID];
+
+    auto survive = [&] { activeQueue->push_back(globalSlot); };
+
+    RanluxppDouble newRNG{currentTrack.rngState.Branch()};
+    G4HepEmRandomEngine rnge{&currentTrack.rngState};
+
+    if constexpr (ProcessIndex == 0) {
+      // Invoke gamma conversion to e-/e+ pairs, if the energy is above the threshold.
+      if (energy < 2 * copcore::units::kElectronMassC2) {
+        survive();
+        return;
+      }
+
+      double logEnergy = std::log(energy);
+      double elKinEnergy, posKinEnergy;
+      G4HepEmGammaInteractionConversion::SampleKinEnergies(&g4HepEmData, energy, logEnergy, theMCIndex, elKinEnergy,
+                                                           posKinEnergy, &rnge);
+
+      double dirPrimary[] = {dir.x(), dir.y(), dir.z()};
+      double dirSecondaryEl[3], dirSecondaryPos[3];
+      G4HepEmGammaInteractionConversion::SampleDirections(dirPrimary, dirSecondaryEl, dirSecondaryPos, elKinEnergy,
+                                                          posKinEnergy, &rnge);
+
+      Track &electron = secondaries.electrons.NextTrack();
+      Track &positron = secondaries.positrons.NextTrack();
+      atomicAdd(&globalScoring->numElectrons, 1);
+      atomicAdd(&globalScoring->numPositrons, 1);
+
+      electron.InitAsSecondary(pos, navState);
+      electron.rngState = newRNG;
+      electron.energy   = elKinEnergy;
+      electron.dir.Set(dirSecondaryEl[0], dirSecondaryEl[1], dirSecondaryEl[2]);
+
+      positron.InitAsSecondary(pos, navState);
+      // Reuse the RNG state of the dying track.
+      positron.rngState = currentTrack.rngState;
+      positron.energy   = posKinEnergy;
+      positron.dir.Set(dirSecondaryPos[0], dirSecondaryPos[1], dirSecondaryPos[2]);
+
+      // The current track is killed by not enqueuing into the next activeQueue.
+    } else if constexpr (ProcessIndex == 1) {
+      // Invoke Compton scattering of gamma.
+      constexpr double LowEnergyThreshold = 100 * copcore::units::eV;
+      if (energy < LowEnergyThreshold) {
+        survive();
+        return;
+      }
+      const double origDirPrimary[] = {dir.x(), dir.y(), dir.z()};
+      double dirPrimary[3];
+      const double newEnergyGamma =
+          G4HepEmGammaInteractionCompton::SamplePhotonEnergyAndDirection(energy, dirPrimary, origDirPrimary, &rnge);
+      vecgeom::Vector3D<double> newDirGamma(dirPrimary[0], dirPrimary[1], dirPrimary[2]);
+
+      const double energyEl = energy - newEnergyGamma;
+      if (energyEl > LowEnergyThreshold) {
+        // Create a secondary electron and sample/compute directions.
+        Track &electron = secondaries.electrons.NextTrack();
+        atomicAdd(&globalScoring->numElectrons, 1);
+
+        electron.InitAsSecondary(pos, navState);
+        electron.rngState = newRNG;
+        electron.energy   = energyEl;
+        electron.dir      = energy * dir - newEnergyGamma * newDirGamma;
+        electron.dir.Normalize();
+      } else {
+        atomicAdd(&globalScoring->energyDeposit, energyEl);
+        atomicAdd(&scoringPerVolume->energyDeposit[volumeID], energyEl);
+      }
+
+      // Check the new gamma energy and deposit if below threshold.
+      if (newEnergyGamma > LowEnergyThreshold) {
+        currentTrack.energy = newEnergyGamma;
+        currentTrack.dir    = newDirGamma;
+        survive();
+      } else {
+        atomicAdd(&globalScoring->energyDeposit, newEnergyGamma);
+        atomicAdd(&scoringPerVolume->energyDeposit[volumeID], newEnergyGamma);
+        // The current track is killed by not enqueuing into the next activeQueue.
+      }
+    } else if constexpr (ProcessIndex == 2) {
+      // Invoke photoelectric process.
+      const double theLowEnergyThreshold = 1 * copcore::units::eV;
+
+      // (Re)compute total macroscopic cross section
+      const int theMatIndx = g4HepEmData.fTheMatCutData->fMatCutData[theMCIndex].fHepEmMatIndex;
+      double mxsec = G4HepEmGammaManager::GetMacXSecPE(&g4HepEmData, theMatIndx, energy);
+
+      const double bindingEnergy = G4HepEmGammaInteractionPhotoelectric::SelectElementBindingEnergy(
+          &g4HepEmData, theMCIndex, mxsec, energy, &rnge);
+
+      double edep             = bindingEnergy;
+      const double photoElecE = energy - edep;
+      if (photoElecE > theLowEnergyThreshold) {
+        // Create a secondary electron and sample directions.
+        Track &electron = secondaries.electrons.NextTrack();
+        atomicAdd(&globalScoring->numElectrons, 1);
+
+        double dirGamma[] = {dir.x(), dir.y(), dir.z()};
+        double dirPhotoElec[3];
+        G4HepEmGammaInteractionPhotoelectric::SamplePhotoElectronDirection(photoElecE, dirGamma, dirPhotoElec, &rnge);
+
+        electron.InitAsSecondary(pos, navState);
+        electron.rngState = newRNG;
+        electron.energy   = photoElecE;
+        electron.dir.Set(dirPhotoElec[0], dirPhotoElec[1], dirPhotoElec[2]);
+      } else {
+        edep = energy;
+      }
+      atomicAdd(&globalScoring->energyDeposit, edep);
+      // The current track is killed by not enqueuing into the next activeQueue.
+    }
+  }
+}
+
+__global__ void PairCreation(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                             adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                             ScoringPerVolume *scoringPerVolume)
+{
+  GammaInteraction<0>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
+__global__ void ComptonScattering(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                  adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                  ScoringPerVolume *scoringPerVolume)
+{
+  GammaInteraction<1>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
+__global__ void PhotoelectricEffect(Track *particles, const adept::MParray *active, Secondaries secondaries,
+                                    adept::MParray *activeQueue, GlobalScoring *globalScoring,
+                                    ScoringPerVolume *scoringPerVolume)
+{
+  GammaInteraction<2>(particles, active, secondaries, activeQueue, globalScoring, scoringPerVolume);
+}
diff --git a/examples/Example20/main.cpp b/examples/Example20/main.cpp
new file mode 100644
index 00000000..09273665
--- /dev/null
+++ b/examples/Example20/main.cpp
@@ -0,0 +1,303 @@
+// SPDX-FileCopyrightText: 2022 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#include "example.h"
+
+#include <globals.hh>
+#include <G4SystemOfUnits.hh>
+#include <G4EmParameters.hh>
+#include <G4GDMLParser.hh>
+#include <G4MaterialCutsCouple.hh>
+#include <G4ProductionCutsTable.hh>
+#include <G4RegionStore.hh>
+#include <G4TransportationManager.hh>
+
+#include <G4Gamma.hh>
+#include <G4Electron.hh>
+#include <G4Positron.hh>
+#include <G4Proton.hh>
+#include <G4ParticleTable.hh>
+
+#include <G4HepEmData.hh>
+#include <G4HepEmState.hh>
+#include <G4HepEmStateInit.hh>
+#include <G4HepEmParameters.hh>
+#include <G4HepEmMatCutData.hh>
+
+#include <VecGeom/base/Stopwatch.h>
+#include <VecGeom/management/GeoManager.h>
+#include <VecGeom/management/BVHManager.h>
+#include <VecGeom/gdml/Frontend.h>
+#include <VecGeom/management/CudaManager.h>
+
+#include <CopCore/SystemOfUnits.h>
+#include <AdePT/ArgParser.h>
+#include <AdePT/NVTX.h>
+
+static constexpr double DefaultCut = 0.7 * mm;
+
+const G4VPhysicalVolume *InitGeant4(const std::string &gdml_file)
+{
+  auto defaultRegion = new G4Region("DefaultRegionForTheWorld"); // deleted by store
+  auto pcuts         = G4ProductionCutsTable::GetProductionCutsTable()->GetDefaultProductionCuts();
+  pcuts->SetProductionCut(DefaultCut, "gamma");
+  pcuts->SetProductionCut(DefaultCut, "e-");
+  pcuts->SetProductionCut(DefaultCut, "e+");
+  pcuts->SetProductionCut(DefaultCut, "proton");
+  defaultRegion->SetProductionCuts(pcuts);
+
+  // Read the geometry, regions and cuts from the GDML file
+  std::cout << "reading " << gdml_file << " transiently on CPU for Geant4 ...\n";
+  G4GDMLParser parser;
+  parser.Read(gdml_file, false); // turn off schema checker
+  G4VPhysicalVolume *world = parser.GetWorldVolume();
+
+  if (world == nullptr) {
+    std::cerr << "example18: World volume not set properly check your setup selection criteria or GDML input!\n";
+    return world;
+  }
+
+  // - PHYSICS
+  // --- Create particles that have secondary production threshold.
+  G4Gamma::Gamma();
+  G4Electron::Electron();
+  G4Positron::Positron();
+  G4Proton::Proton();
+  G4ParticleTable *partTable = G4ParticleTable::GetParticleTable();
+  partTable->SetReadiness();
+
+  // - REGIONS
+  if (world->GetLogicalVolume()->GetRegion() == nullptr) {
+    // Add default region if none available
+    defaultRegion->AddRootLogicalVolume(world->GetLogicalVolume());
+  }
+  for (auto region : *G4RegionStore::GetInstance()) {
+    region->UsedInMassGeometry(true); // make sure all regions are marked as used
+    region->UpdateMaterialList();
+  }
+
+  // - UPDATE COUPLES
+  std::cout << "updating material-cut couples based on " << G4RegionStore::GetInstance()->size() << " regions ...\n";
+  G4ProductionCutsTable *theCoupleTable = G4ProductionCutsTable::GetProductionCutsTable();
+  theCoupleTable->UpdateCoupleTable(world);
+
+  // --- Set the world volume to fix initialization of G4SafetyHelper (used by G4UrbanMscModel)
+  G4TransportationManager::GetTransportationManager()->SetWorldForTracking(world);
+
+  // --- Set MSC range factor to match G4HepEm physics lists.
+  G4EmParameters *param = G4EmParameters::Instance();
+  param->SetDefaults();
+  param->SetMscRangeFactor(0.04);
+
+  return world;
+}
+
+const vecgeom::cxx::VPlacedVolume *InitVecGeom(const std::string &gdml_file, int cache_depth)
+{
+  // Import the gdml file into VecGeom
+  vecgeom::GeoManager::Instance().SetTransformationCacheDepth(cache_depth);
+  vgdml::Parser vgdmlParser;
+  auto middleWare = vgdmlParser.Load(gdml_file.c_str(), false, copcore::units::mm);
+  if (middleWare == nullptr) {
+    std::cerr << "Failed to read geometry from GDML file '" << gdml_file << "'" << std::endl;
+    return nullptr;
+  }
+
+  const vecgeom::VPlacedVolume *world = vecgeom::GeoManager::Instance().GetWorld();
+  if (world == nullptr) {
+    std::cerr << "GeoManager world volume is nullptr" << std::endl;
+    return nullptr;
+  }
+  return world;
+}
+
+int *CreateMCCindex(const G4VPhysicalVolume *g4world, const vecgeom::VPlacedVolume *world,
+                    const G4HepEmState &hepEmState)
+{
+  const int *g4tohepmcindex = hepEmState.fData->fTheMatCutData->fG4MCIndexToHepEmMCIndex;
+
+  // - FIND vecgeom::LogicalVolume corresponding to each and every G4LogicalVolume
+  int nphysical = 0;
+
+  int nvolumes   = vecgeom::GeoManager::Instance().GetRegisteredVolumesCount();
+  int *mcc_index = new int[nvolumes];
+  memset(mcc_index, 0, nvolumes * sizeof(int));
+
+  // recursive geometry visitor lambda matching one by one Geant4 and VecGeom logical volumes
+  // (we need to make sure we set the right MCC index to the right volume)
+  typedef std::function<void(G4LogicalVolume const *, vecgeom::LogicalVolume const *)> func_t;
+  func_t visitAndSetMCindex = [&](G4LogicalVolume const *g4vol, vecgeom::LogicalVolume const *vol) {
+    int nd         = g4vol->GetNoDaughters();
+    auto daughters = vol->GetDaughters();
+    if (nd != daughters.size()) throw std::runtime_error("Mismatch in number of daughters");
+    // Check the couples
+    if (g4vol->GetMaterialCutsCouple() == nullptr)
+      throw std::runtime_error("G4LogicalVolume " + std::string(g4vol->GetName()) +
+                               std::string(" has no material-cuts couple"));
+    int g4mcindex    = g4vol->GetMaterialCutsCouple()->GetIndex();
+    int hepemmcindex = g4tohepmcindex[g4mcindex];
+    // Check consistency with G4HepEm data
+    if (hepEmState.fData->fTheMatCutData->fMatCutData[hepemmcindex].fG4MatCutIndex != g4mcindex)
+      throw std::runtime_error("Mismatch between Geant4 mcindex and corresponding G4HepEm index");
+    if (vol->id() >= nvolumes) throw std::runtime_error("Volume id larger than number of volumes");
+
+    // All OK, now fill the index in the array
+    mcc_index[vol->id()] = hepemmcindex;
+    nphysical++;
+
+    // Now do the daughters
+    for (int id = 0; id < nd; ++id) {
+      auto g4pvol = g4vol->GetDaughter(id);
+      auto pvol   = daughters[id];
+      // VecGeom does not strip pointers from logical volume names
+      if (std::string(pvol->GetLogicalVolume()->GetName()).rfind(g4pvol->GetLogicalVolume()->GetName(), 0) != 0)
+        throw std::runtime_error("Volume names " + std::string(pvol->GetLogicalVolume()->GetName()) + " and " +
+                                 std::string(g4pvol->GetLogicalVolume()->GetName()) + " mismatch");
+      visitAndSetMCindex(g4pvol->GetLogicalVolume(), pvol->GetLogicalVolume());
+    }
+  };
+
+  visitAndSetMCindex(g4world->GetLogicalVolume(), world->GetLogicalVolume());
+  std::cout << "Visited " << nphysical << " matching physical volumes\n";
+  return mcc_index;
+}
+
+void FreeG4HepEm(G4HepEmState *state)
+{
+  FreeG4HepEmData(state->fData);
+}
+
+void InitBVH()
+{
+  vecgeom::cxx::BVHManager::Init();
+  vecgeom::cxx::BVHManager::DeviceInit();
+}
+
+void PrintScoringPerVolume(const vecgeom::VPlacedVolume *placed, const ScoringPerVolume *scoring, int level = 0)
+{
+  for (auto *daughter : placed->GetDaughters()) {
+    std::cout << std::setw(level * 2) << "";
+    auto id = daughter->id();
+    std::cout << " ID " << id << " Charged-TrakL " << scoring->chargedTrackLength[id] / copcore::units::mm
+              << " mm; Energy-Dep " << scoring->energyDeposit[id] / copcore::units::MeV << " MeV" << std::endl;
+    PrintScoringPerVolume(daughter, scoring, level + 1);
+  }
+}
+
+int main(int argc, char *argv[])
+{
+  // Only outputs are the data file(s)
+  // Separate for now, but will want to unify
+  OPTION_STRING(gdml_file, "cms2018.gdml");
+  OPTION_INT(cache_depth, 0); // 0 = full depth
+  OPTION_INT(particles, 100);
+  OPTION_DOUBLE(energy, 10); // entered in GeV
+  energy *= copcore::units::GeV;
+  OPTION_INT(batch, 52);
+  OPTION_STRING(gunpos, "0,0,0");
+  OPTION_STRING(gundir, "1,0,0");
+  OPTION_BOOL(rotatingParticleGun, false);
+
+  // parse gun config
+  GunConfig gunConfig{};
+  std::sscanf(gunpos.c_str(), "%f,%f,%f", &gunConfig.position[0], &gunConfig.position[1], &gunConfig.position[2]);
+  std::sscanf(gundir.c_str(), "%f,%f,%f", &gunConfig.direction[0], &gunConfig.direction[1], &gunConfig.direction[2]);
+  gunConfig.movingGun = rotatingParticleGun;
+
+  vecgeom::Stopwatch timer;
+  timer.Start();
+
+  NVTXTracer tracer("InitG4");
+
+  // Initialize Geant4
+  auto g4world = InitGeant4(gdml_file);
+  if (!g4world) return 3;
+
+  tracer.setTag("InitVecGeom");
+  // Initialize VecGeom
+  std::cout << "reading " << gdml_file << " transiently on CPU for VecGeom ...\n";
+  auto world = InitVecGeom(gdml_file, cache_depth);
+  if (!world) return 3;
+
+  // Construct and initialize the G4HepEmState data/tables
+  tracer.setTag("InitG4HepEM");
+  std::cout << "initializing G4HepEm state ...\n";
+  G4HepEmState hepEmState;
+  InitG4HepEmState(&hepEmState);
+
+  // Initialize G4HepEm material-cut couple array indexed by VecGeom volume id.
+  // (In future we should connect the index directly to the VecGeom logical volume)
+  tracer.setTag("InitMaterialCutCouple");
+  std::cout << "initializing material-cut couple indices ...\n";
+  int *MCCindex = nullptr;
+
+  try {
+    MCCindex = CreateMCCindex(g4world, world, hepEmState);
+  } catch (const std::runtime_error &ex) {
+    std::cerr << "*** CreateMCCindex: " << ex.what() << "\n";
+    return 1;
+  }
+
+  // Load and synchronize the geometry on the GPU
+  tracer.setTag("SyncGeom");
+  std::cout << "synchronizing VecGeom geometry to GPU ...\n";
+  auto &cudaManager = vecgeom::cxx::CudaManager::Instance();
+  cudaManager.LoadGeometry(world);
+  cudaManager.Synchronize();
+
+  tracer.setTag("InitBVH");
+  InitBVH();
+
+  auto time_cpu = timer.Stop();
+  std::cout << "Initialization took: " << time_cpu << " sec\n";
+
+  int NumVolumes = vecgeom::GeoManager::Instance().GetRegisteredVolumesCount();
+  int NumPlaced  = vecgeom::GeoManager::Instance().GetPlacedVolumesCount();
+
+  // Scoring is done per placed volume (for now...)
+  double *chargedTrackLength = new double[NumPlaced];
+  double *energyDeposit      = new double[NumPlaced];
+  ScoringPerVolume scoringPerVolume;
+  scoringPerVolume.chargedTrackLength = chargedTrackLength;
+  scoringPerVolume.energyDeposit      = energyDeposit;
+  GlobalScoring globalScoring;
+
+  tracer.setTag("GPU function");
+  runGPU(particles, energy, batch, MCCindex, &scoringPerVolume, &globalScoring, NumVolumes, NumPlaced, &hepEmState,
+         gunConfig);
+
+  std::cout << std::endl;
+  std::cout << std::endl;
+  double meanEnergyDeposit = globalScoring.energyDeposit / particles;
+  std::cout << "Mean energy deposit          " << (meanEnergyDeposit / copcore::units::GeV) << " GeV\n"
+            << "Mean number of gamma         " << ((double)globalScoring.numGammas / particles) << "\n"
+            << "Mean number of e-            " << ((double)globalScoring.numElectrons / particles) << "\n"
+            << "Mean number of e+            " << ((double)globalScoring.numPositrons / particles) << "\n"
+            << "Mean number of charged steps " << ((double)globalScoring.chargedSteps / particles) << "\n"
+            << "Mean number of neutral steps " << ((double)globalScoring.neutralSteps / particles) << "\n"
+            << "Mean number of hits          " << ((double)globalScoring.hits / particles) << "\n";
+  if (globalScoring.numKilled > 0) {
+    std::cout << "Total killed particles       " << globalScoring.numKilled << "\n";
+  }
+  std::cout << std::endl;
+
+  // Average charged track length and energy deposit per particle.
+  for (int i = 0; i < NumPlaced; i++) {
+    chargedTrackLength[i] /= particles;
+    energyDeposit[i] /= particles;
+  }
+
+  std::cout << std::scientific;
+#ifdef VERBOSE
+  std::cout << std::endl;
+  const int id = world->id();
+  std::cout << "World (ID " << world->id() << ") Charged-TrakL " << chargedTrackLength[id] / copcore::units::mm
+            << " Energy-Dep " << energyDeposit[id] / copcore::units::MeV << " MeV" << std::endl;
+  PrintScoringPerVolume(world, &scoringPerVolume);
+#endif
+  FreeG4HepEm(&hepEmState);
+  delete[] MCCindex;
+  delete[] chargedTrackLength;
+  delete[] energyDeposit;
+  return 0;
+}
diff --git a/examples/Example20/main.cu b/examples/Example20/main.cu
new file mode 100644
index 00000000..40145b70
--- /dev/null
+++ b/examples/Example20/main.cu
@@ -0,0 +1,519 @@
+// SPDX-FileCopyrightText: 2022 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#include "example.h"
+#include "example.cuh"
+
+#include <AdePT/Atomic.h>
+#include <AdePT/BVHNavigator.h>
+#include <AdePT/MParray.h>
+#include <AdePT/NVTX.h>
+
+#include <CopCore/Global.h>
+#include <CopCore/PhysicalConstants.h>
+#include <CopCore/Ranluxpp.h>
+
+#include <VecGeom/base/Config.h>
+#include <VecGeom/base/Stopwatch.h>
+#include <VecGeom/management/GeoManager.h>
+#ifdef VECGEOM_ENABLE_CUDA
+#include <VecGeom/backend/cuda/Interface.h>
+#endif
+
+#include <G4HepEmState.hh>
+#include <G4HepEmData.hh>
+#include <G4HepEmElectronInit.hh>
+#include <G4HepEmGammaInit.hh>
+#include <G4HepEmMatCutData.hh>
+#include <G4HepEmMaterialInit.hh>
+#include <G4HepEmParameters.hh>
+#include <G4HepEmParametersInit.hh>
+
+#include <iostream>
+#include <iomanip>
+#include <stdio.h>
+
+__constant__ __device__ struct G4HepEmParameters g4HepEmPars;
+__constant__ __device__ struct G4HepEmData g4HepEmData;
+
+__constant__ __device__ int *MCIndex = nullptr;
+
+void InitG4HepEmGPU(G4HepEmState *state)
+{
+  // Copy to GPU.
+  CopyG4HepEmDataToGPU(state->fData);
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(g4HepEmPars, state->fParameters, sizeof(G4HepEmParameters)));
+
+  // Create G4HepEmData with the device pointers.
+  G4HepEmData dataOnDevice;
+  dataOnDevice.fTheMatCutData   = state->fData->fTheMatCutData_gpu;
+  dataOnDevice.fTheMaterialData = state->fData->fTheMaterialData_gpu;
+  dataOnDevice.fTheElementData  = state->fData->fTheElementData_gpu;
+  dataOnDevice.fTheElectronData = state->fData->fTheElectronData_gpu;
+  dataOnDevice.fThePositronData = state->fData->fThePositronData_gpu;
+  dataOnDevice.fTheSBTableData  = state->fData->fTheSBTableData_gpu;
+  dataOnDevice.fTheGammaData    = state->fData->fTheGammaData_gpu;
+  // The other pointers should never be used.
+  dataOnDevice.fTheMatCutData_gpu   = nullptr;
+  dataOnDevice.fTheMaterialData_gpu = nullptr;
+  dataOnDevice.fTheElementData_gpu  = nullptr;
+  dataOnDevice.fTheElectronData_gpu = nullptr;
+  dataOnDevice.fThePositronData_gpu = nullptr;
+  dataOnDevice.fTheSBTableData_gpu  = nullptr;
+  dataOnDevice.fTheGammaData_gpu    = nullptr;
+
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(g4HepEmData, &dataOnDevice, sizeof(G4HepEmData)));
+}
+
+// A bundle of queues per particle type:
+//  * Two for active particles, one for the current iteration and the second for the next.
+struct ParticleQueues {
+  adept::MParray *currentlyActive;
+  adept::MParray *nextActive;
+
+  void SwapActive() { std::swap(currentlyActive, nextActive); }
+};
+
+struct ParticleType {
+  Track *tracks;
+  SlotManager *slotManager;
+  ParticleQueues queues;
+  Interactions interactions;
+  cudaStream_t stream;
+  cudaEvent_t event;
+
+  enum {
+    Electron = 0,
+    Positron = 1,
+    Gamma    = 2,
+
+    NumParticleTypes,
+  };
+};
+
+// A bundle of queues for the three particle types.
+struct AllParticleQueues {
+  ParticleQueues queues[ParticleType::NumParticleTypes];
+  Interactions interactions[ParticleType::NumParticleTypes];
+};
+
+// Kernel to initialize the set of queues per particle type.
+__global__ void InitParticleQueues(ParticleQueues queues, Interactions interactions, size_t Capacity)
+{
+  adept::MParray::MakeInstanceAt(Capacity, queues.currentlyActive);
+  adept::MParray::MakeInstanceAt(Capacity, queues.nextActive);
+  adept::MParray::MakeInstanceAt(Capacity, interactions.queues[0]);
+  adept::MParray::MakeInstanceAt(Capacity, interactions.queues[1]);
+  adept::MParray::MakeInstanceAt(Capacity, interactions.queues[2]);
+}
+
+// Kernel function to initialize a set of primary particles.
+__global__ void InitPrimaries(ParticleGenerator generator, int startEvent, int numEvents, double energy,
+                              const vecgeom::VPlacedVolume *world, GlobalScoring *globalScoring, const GunConfig gun)
+{
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < numEvents; i += blockDim.x * gridDim.x) {
+    Track &track = generator.NextTrack();
+
+    track.rngState.SetSeed(startEvent + i);
+    track.energy       = energy;
+    track.numIALeft[0] = -1.0;
+    track.numIALeft[1] = -1.0;
+    track.numIALeft[2] = -1.0;
+
+    track.initialRange       = -1.0;
+    track.dynamicRangeFactor = -1.0;
+    track.tlimitMin          = -1.0;
+
+    track.pos = {gun.position[0], gun.position[1], gun.position[2]};
+    if (gun.movingGun) {
+      // Generate particles flat in phi and in eta between -5 and 5. We'll lose the far forwards ones, so no need to
+      // simulate.
+      const double phi = 2. * M_PI * track.rngState.Rndm();
+      const double eta = -5. + 10. * track.rngState.Rndm();
+      track.dir.x()    = static_cast<vecgeom::Precision>(cos(phi) / cosh(eta));
+      track.dir.y()    = static_cast<vecgeom::Precision>(sin(phi) / cosh(eta));
+      track.dir.z()    = static_cast<vecgeom::Precision>(tanh(eta));
+    } else {
+      track.dir = {gun.direction[0], gun.direction[1], gun.direction[2]};
+    }
+    track.navState.Clear();
+    BVHNavigator::LocatePointIn(world, track.pos, track.navState, true);
+
+    atomicAdd(&globalScoring->numElectrons, 1);
+  }
+}
+
+// A data structure to transfer statistics after each iteration.
+struct Stats {
+  int inFlight[ParticleType::NumParticleTypes];
+};
+
+// Finish iteration: clear queues and fill statistics.
+__global__ void FinishIteration(AllParticleQueues all, Stats *stats)
+{
+  for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
+    all.queues[i].currentlyActive->clear();
+    all.interactions[i].queues[0]->clear();
+    all.interactions[i].queues[1]->clear();
+    all.interactions[i].queues[2]->clear();
+    stats->inFlight[i] = all.queues[i].nextActive->size();
+  }
+}
+
+__global__ void ClearQueue(adept::MParray *queue)
+{
+  queue->clear();
+}
+
+void runGPU(int numParticles, double energy, int batch, const int *MCIndex_host,
+            ScoringPerVolume *scoringPerVolume_host, GlobalScoring *globalScoring_host, int numVolumes, int numPlaced,
+            G4HepEmState *state, GunConfig gunConfig)
+{
+  NVTXTracer tracer("InitG4HepEM");
+  InitG4HepEmGPU(state);
+
+  tracer.setTag("InitParticles/malloc/copy");
+  // Transfer MC indices.
+  int *MCIndex_dev = nullptr;
+  COPCORE_CUDA_CHECK(cudaMalloc(&MCIndex_dev, sizeof(int) * numVolumes));
+  COPCORE_CUDA_CHECK(cudaMemcpy(MCIndex_dev, MCIndex_host, sizeof(int) * numVolumes, cudaMemcpyHostToDevice));
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(MCIndex, &MCIndex_dev, sizeof(int *)));
+
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+
+  // Capacity of the different containers aka the maximum number of particles.
+  // Use 1/5 of GPU memory for each of e+/e-/gammas, leaving 2/5 for the rest.
+  const size_t Capacity = (deviceProp.totalGlobalMem / sizeof(Track) / 5);
+
+  std::cout << "INFO: capacity of containers set to " << Capacity << std::endl;
+  if (batch == -1) {
+    // Rule of thumb: at most 2000 particles of one type per GeV primary.
+    batch = Capacity / ((int)energy / copcore::units::GeV) / 2000;
+  } else if (batch < 1) {
+    batch = 1;
+  }
+  std::cout << "INFO: batching " << batch << " particles for transport on the GPU" << std::endl;
+  if (BzFieldValue != 0) {
+    std::cout << "INFO: running with field Bz = " << BzFieldValue / copcore::units::tesla << " T" << std::endl;
+  } else {
+    std::cout << "INFO: running with magnetic field OFF" << std::endl;
+  }
+
+  // Allocate structures to manage tracks of an implicit type:
+  //  * memory to hold the actual Track elements,
+  //  * objects to manage slots inside the memory,
+  //  * queues of slots to remember active particle and those needing relocation,
+  //  * a stream and an event for synchronization of kernels.
+  const size_t TracksSize  = sizeof(Track) * Capacity;
+  const size_t ManagerSize = sizeof(SlotManager);
+  const size_t QueueSize   = adept::MParray::SizeOfInstance(Capacity);
+
+  ParticleType particles[ParticleType::NumParticleTypes];
+  for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].tracks, TracksSize));
+
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].slotManager, ManagerSize));
+
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].queues.currentlyActive, QueueSize));
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].queues.nextActive, QueueSize));
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].interactions.queues[0], QueueSize));
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].interactions.queues[1], QueueSize));
+    COPCORE_CUDA_CHECK(cudaMalloc(&particles[i].interactions.queues[2], QueueSize));
+    InitParticleQueues<<<1, 1>>>(particles[i].queues, particles[i].interactions, Capacity);
+
+    COPCORE_CUDA_CHECK(cudaStreamCreate(&particles[i].stream));
+    COPCORE_CUDA_CHECK(cudaEventCreate(&particles[i].event));
+  }
+  COPCORE_CUDA_CHECK(cudaDeviceSynchronize());
+
+  ParticleType &electrons = particles[ParticleType::Electron];
+  ParticleType &positrons = particles[ParticleType::Positron];
+  ParticleType &gammas    = particles[ParticleType::Gamma];
+
+  // Create a stream to synchronize kernels of all particle types.
+  cudaStream_t stream;
+  COPCORE_CUDA_CHECK(cudaStreamCreate(&stream));
+
+  cudaStream_t interactionStreams[3];
+  for (auto i = 0; i < 3; ++i)
+    COPCORE_CUDA_CHECK(cudaStreamCreate(&interactionStreams[i]));
+
+  // Allocate memory to score charged track length and energy deposit per volume.
+  double *chargedTrackLength = nullptr;
+  COPCORE_CUDA_CHECK(cudaMalloc(&chargedTrackLength, sizeof(double) * numPlaced));
+  COPCORE_CUDA_CHECK(cudaMemset(chargedTrackLength, 0, sizeof(double) * numPlaced));
+  double *energyDeposit = nullptr;
+  COPCORE_CUDA_CHECK(cudaMalloc(&energyDeposit, sizeof(double) * numPlaced));
+  COPCORE_CUDA_CHECK(cudaMemset(energyDeposit, 0, sizeof(double) * numPlaced));
+
+  // Allocate and initialize scoring and statistics.
+  GlobalScoring *globalScoring = nullptr;
+  COPCORE_CUDA_CHECK(cudaMalloc(&globalScoring, sizeof(GlobalScoring)));
+  COPCORE_CUDA_CHECK(cudaMemset(globalScoring, 0, sizeof(GlobalScoring)));
+
+  ScoringPerVolume *scoringPerVolume = nullptr;
+  ScoringPerVolume scoringPerVolume_devPtrs;
+  scoringPerVolume_devPtrs.chargedTrackLength = chargedTrackLength;
+  scoringPerVolume_devPtrs.energyDeposit      = energyDeposit;
+  COPCORE_CUDA_CHECK(cudaMalloc(&scoringPerVolume, sizeof(ScoringPerVolume)));
+  COPCORE_CUDA_CHECK(
+      cudaMemcpy(scoringPerVolume, &scoringPerVolume_devPtrs, sizeof(ScoringPerVolume), cudaMemcpyHostToDevice));
+
+  Stats *stats_dev = nullptr;
+  COPCORE_CUDA_CHECK(cudaMalloc(&stats_dev, sizeof(Stats)));
+  Stats *stats = nullptr;
+  COPCORE_CUDA_CHECK(cudaMallocHost(&stats, sizeof(Stats)));
+
+  // Allocate memory to hold a "vanilla" SlotManager to initialize for each batch.
+  SlotManager slotManagerInit(Capacity);
+  SlotManager *slotManagerInit_dev = nullptr;
+  COPCORE_CUDA_CHECK(cudaMalloc(&slotManagerInit_dev, sizeof(SlotManager)));
+  COPCORE_CUDA_CHECK(cudaMemcpy(slotManagerInit_dev, &slotManagerInit, sizeof(SlotManager), cudaMemcpyHostToDevice));
+
+  vecgeom::Stopwatch timer;
+  timer.Start();
+  tracer.setTag("sim");
+
+  std::cout << std::endl << "Simulating particles ";
+  const bool detailed = (numParticles / batch) < 50;
+  if (!detailed) {
+    std::cout << "... " << std::flush;
+  }
+
+  unsigned long long killed = 0;
+  tracer.setTag("start event loop");
+
+  for (int startEvent = 1; startEvent <= numParticles; startEvent += batch) {
+    if (detailed) {
+      std::cout << startEvent << " ... " << std::flush;
+    }
+    int left  = numParticles - startEvent + 1;
+    int chunk = std::min(left, batch);
+
+    for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
+      COPCORE_CUDA_CHECK(cudaMemcpyAsync(particles[i].slotManager, slotManagerInit_dev, ManagerSize,
+                                         cudaMemcpyDeviceToDevice, stream));
+    }
+
+    // Initialize primary particles.
+    constexpr int InitThreads = ThreadsPerBlock;
+    int initBlocks            = (chunk + ThreadsPerBlock - 1) / ThreadsPerBlock;
+    ParticleGenerator electronGenerator(electrons.tracks, electrons.slotManager, electrons.queues.currentlyActive);
+    auto world_dev = vecgeom::cxx::CudaManager::Instance().world_gpu();
+    InitPrimaries<<<initBlocks, InitThreads, 0, stream>>>(electronGenerator, startEvent, chunk, energy, world_dev,
+                                                          globalScoring, gunConfig);
+    COPCORE_CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    stats->inFlight[ParticleType::Electron] = chunk;
+    stats->inFlight[ParticleType::Positron] = 0;
+    stats->inFlight[ParticleType::Gamma]    = 0;
+
+    constexpr int MaxBlocks = 8192;
+    int transportBlocks;
+
+    int inFlight;
+    int loopingNo         = 0;
+    int previousElectrons = -1, previousPositrons = -1;
+
+    do {
+      Secondaries secondaries = {
+          .electrons = {electrons.tracks, electrons.slotManager, electrons.queues.nextActive},
+          .positrons = {positrons.tracks, positrons.slotManager, positrons.queues.nextActive},
+          .gammas    = {gammas.tracks, gammas.slotManager, gammas.queues.nextActive},
+      };
+
+      // *** ELECTRONS ***
+      int numElectrons = stats->inFlight[ParticleType::Electron];
+      if (numElectrons > 0) {
+        transportBlocks = (numElectrons + ThreadsPerBlock - 1) / ThreadsPerBlock;
+        transportBlocks = std::min(transportBlocks, MaxBlocks);
+
+        TransportElectrons<<<transportBlocks, ThreadsPerBlock, 0, electrons.stream>>>(
+            electrons.tracks, electrons.queues.currentlyActive, secondaries, electrons.queues.nextActive,
+            electrons.interactions, globalScoring, scoringPerVolume);
+
+        COPCORE_CUDA_CHECK(cudaEventRecord(electrons.event, electrons.stream));
+        COPCORE_CUDA_CHECK(cudaStreamWaitEvent(interactionStreams[0], electrons.event, 0));
+
+        IonizationEl<<<transportBlocks, ThreadsPerBlock, 0, interactionStreams[0]>>>(
+            electrons.tracks, electrons.interactions.queues[0], secondaries, electrons.queues.nextActive, globalScoring,
+            scoringPerVolume);
+        BremsstrahlungEl<<<transportBlocks, ThreadsPerBlock, 0, electrons.stream>>>(
+            electrons.tracks, electrons.interactions.queues[1], secondaries, electrons.queues.nextActive, globalScoring,
+            scoringPerVolume);
+
+        for (auto streamToWaitFor : {interactionStreams[0], electrons.stream}) {
+          COPCORE_CUDA_CHECK(cudaEventRecord(electrons.event, streamToWaitFor));
+          COPCORE_CUDA_CHECK(cudaStreamWaitEvent(stream, electrons.event, 0));
+        }
+      }
+
+      // *** POSITRONS ***
+      int numPositrons = stats->inFlight[ParticleType::Positron];
+      if (numPositrons > 0) {
+        transportBlocks = (numPositrons + ThreadsPerBlock - 1) / ThreadsPerBlock;
+        transportBlocks = std::min(transportBlocks, MaxBlocks);
+
+        TransportPositrons<<<transportBlocks, ThreadsPerBlock, 0, positrons.stream>>>(
+            positrons.tracks, positrons.queues.currentlyActive, secondaries, positrons.queues.nextActive,
+            positrons.interactions, globalScoring, scoringPerVolume);
+
+        COPCORE_CUDA_CHECK(cudaEventRecord(positrons.event, positrons.stream));
+        COPCORE_CUDA_CHECK(cudaStreamWaitEvent(interactionStreams[1], positrons.event, 0));
+        COPCORE_CUDA_CHECK(cudaStreamWaitEvent(interactionStreams[2], positrons.event, 0));
+
+        IonizationPos<<<transportBlocks, ThreadsPerBlock, 0, interactionStreams[1]>>>(
+            positrons.tracks, positrons.interactions.queues[0], secondaries, positrons.queues.nextActive, globalScoring,
+            scoringPerVolume);
+        BremsstrahlungPos<<<transportBlocks, ThreadsPerBlock, 0, positrons.stream>>>(
+            positrons.tracks, positrons.interactions.queues[1], secondaries, positrons.queues.nextActive, globalScoring,
+            scoringPerVolume);
+        AnnihilationPos<<<transportBlocks, ThreadsPerBlock, 0, interactionStreams[2]>>>(
+            positrons.tracks, positrons.interactions.queues[2], secondaries, positrons.queues.nextActive, globalScoring,
+            scoringPerVolume);
+
+        for (auto streamToWaitFor : {interactionStreams[1], positrons.stream, interactionStreams[2]}) {
+          COPCORE_CUDA_CHECK(cudaEventRecord(positrons.event, streamToWaitFor));
+          COPCORE_CUDA_CHECK(cudaStreamWaitEvent(stream, positrons.event, 0));
+        }
+      }
+
+      // *** GAMMAS ***
+      int numGammas = stats->inFlight[ParticleType::Gamma];
+      if (numGammas > 0) {
+        transportBlocks = (numGammas + ThreadsPerBlock - 1) / ThreadsPerBlock;
+        transportBlocks = std::min(transportBlocks, MaxBlocks);
+
+        TransportGammas<<<transportBlocks, ThreadsPerBlock, 0, gammas.stream>>>(
+            gammas.tracks, gammas.queues.currentlyActive, secondaries, gammas.queues.nextActive, gammas.interactions,
+            globalScoring, scoringPerVolume);
+
+        COPCORE_CUDA_CHECK(cudaEventRecord(gammas.event, gammas.stream));
+        COPCORE_CUDA_CHECK(cudaStreamWaitEvent(stream, gammas.event, 0));
+
+        for (auto i = 0; i < 3; ++i) {
+          COPCORE_CUDA_CHECK(cudaStreamWaitEvent(interactionStreams[i], gammas.event, 0));
+        }
+        // About 2% of all gammas:
+        PairCreation<<<transportBlocks, ThreadsPerBlock, 0, interactionStreams[0]>>>(
+            gammas.tracks, gammas.interactions.queues[0], secondaries, gammas.queues.nextActive, globalScoring,
+            scoringPerVolume);
+        // About 10% of all gammas:
+        ComptonScattering<<<transportBlocks, ThreadsPerBlock, 0, interactionStreams[1]>>>(
+            gammas.tracks, gammas.interactions.queues[1], secondaries, gammas.queues.nextActive, globalScoring,
+            scoringPerVolume);
+        // About 15% of all gammas:
+        PhotoelectricEffect<<<transportBlocks, ThreadsPerBlock, 0, interactionStreams[2]>>>(
+            gammas.tracks, gammas.interactions.queues[2], secondaries, gammas.queues.nextActive, globalScoring,
+            scoringPerVolume);
+        for (auto i = 0; i < 3; ++i) {
+          COPCORE_CUDA_CHECK(cudaEventRecord(positrons.event, interactionStreams[i]));
+          COPCORE_CUDA_CHECK(cudaStreamWaitEvent(stream, positrons.event, 0));
+        }
+        COPCORE_CUDA_CHECK(cudaEventRecord(positrons.event, positrons.stream));
+        COPCORE_CUDA_CHECK(cudaStreamWaitEvent(stream, positrons.event, 0));
+      }
+
+      // *** END OF TRANSPORT ***
+
+      // The events ensure synchronization before finishing this iteration and
+      // copying the Stats back to the host.
+      AllParticleQueues queues = {
+          {electrons.queues, positrons.queues, gammas.queues},
+          {electrons.interactions, positrons.interactions, gammas.interactions},
+      };
+      FinishIteration<<<1, 1, 0, stream>>>(queues, stats_dev);
+      COPCORE_CUDA_CHECK(cudaMemcpyAsync(stats, stats_dev, sizeof(Stats), cudaMemcpyDeviceToHost, stream));
+
+      // Finally synchronize all kernels.
+      COPCORE_CUDA_CHECK(cudaStreamSynchronize(stream));
+
+      // Count the number of particles in flight.
+      inFlight = 0;
+      for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
+        inFlight += stats->inFlight[i];
+      }
+
+      tracer.setOccupancy(inFlight);
+
+      tracer.setOccupancy(inFlight);
+
+      // Swap the queues for the next iteration.
+      electrons.queues.SwapActive();
+      positrons.queues.SwapActive();
+      gammas.queues.SwapActive();
+
+      // Check if only charged particles are left that are looping.
+      numElectrons = stats->inFlight[ParticleType::Electron];
+      numPositrons = stats->inFlight[ParticleType::Positron];
+      numGammas    = stats->inFlight[ParticleType::Gamma];
+      if (numElectrons == previousElectrons && numPositrons == previousPositrons && numGammas == 0) {
+        loopingNo++;
+      } else {
+        previousElectrons = numElectrons;
+        previousPositrons = numPositrons;
+        loopingNo         = 0;
+      }
+
+    } while (inFlight > 0 && loopingNo < 200);
+
+    if (inFlight > 0) {
+      killed += inFlight;
+      for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
+        ParticleType &pType   = particles[i];
+        int inFlightParticles = stats->inFlight[i];
+        if (inFlightParticles == 0) {
+          continue;
+        }
+
+        ClearQueue<<<1, 1, 0, stream>>>(pType.queues.currentlyActive);
+      }
+      COPCORE_CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
+  }
+  std::cout << "done!" << std::endl;
+
+  auto time = timer.Stop();
+  std::cout << "Run time: " << time << "\n";
+
+  // Transfer back scoring.
+  COPCORE_CUDA_CHECK(cudaMemcpy(globalScoring_host, globalScoring, sizeof(GlobalScoring), cudaMemcpyDeviceToHost));
+  globalScoring_host->numKilled = killed;
+
+  // Transfer back the scoring per volume (charged track length and energy deposit).
+  COPCORE_CUDA_CHECK(cudaMemcpy(scoringPerVolume_host->chargedTrackLength, scoringPerVolume_devPtrs.chargedTrackLength,
+                                sizeof(double) * numPlaced, cudaMemcpyDeviceToHost));
+  COPCORE_CUDA_CHECK(cudaMemcpy(scoringPerVolume_host->energyDeposit, scoringPerVolume_devPtrs.energyDeposit,
+                                sizeof(double) * numPlaced, cudaMemcpyDeviceToHost));
+
+  // Free resources.
+  COPCORE_CUDA_CHECK(cudaFree(MCIndex_dev));
+  COPCORE_CUDA_CHECK(cudaFree(chargedTrackLength));
+  COPCORE_CUDA_CHECK(cudaFree(energyDeposit));
+
+  COPCORE_CUDA_CHECK(cudaFree(globalScoring));
+  COPCORE_CUDA_CHECK(cudaFree(scoringPerVolume));
+  COPCORE_CUDA_CHECK(cudaFree(stats_dev));
+  COPCORE_CUDA_CHECK(cudaFreeHost(stats));
+  COPCORE_CUDA_CHECK(cudaFree(slotManagerInit_dev));
+
+  COPCORE_CUDA_CHECK(cudaStreamDestroy(stream));
+
+  for (auto i = 0; i < 3; ++i)
+    COPCORE_CUDA_CHECK(cudaStreamDestroy(interactionStreams[i]));
+
+  for (int i = 0; i < ParticleType::NumParticleTypes; i++) {
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].tracks));
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].slotManager));
+
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].queues.currentlyActive));
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].queues.nextActive));
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].interactions.queues[0]));
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].interactions.queues[1]));
+    COPCORE_CUDA_CHECK(cudaFree(particles[i].interactions.queues[2]));
+
+    COPCORE_CUDA_CHECK(cudaStreamDestroy(particles[i].stream));
+    COPCORE_CUDA_CHECK(cudaEventDestroy(particles[i].event));
+  }
+}