diff --git a/src/cudadev/CUDACore/cudaCompat.h b/src/cudadev/CUDACore/cudaCompat.h
index a7e4e963a..dfe784840 100644
--- a/src/cudadev/CUDACore/cudaCompat.h
+++ b/src/cudadev/CUDACore/cudaCompat.h
@@ -21,6 +21,9 @@
 #undef __forceinline__
 #define __forceinline__ inline __attribute__((always_inline))
 
+#undef __launch_bounds__
+#define __launch_bounds__(...)
+
 namespace cms {
   namespace cudacompat {
 
@@ -106,14 +109,124 @@ namespace cms {
     }
 
     inline void __syncthreads() {}
-    inline void __threadfence() {}
     inline bool __syncthreads_or(bool x) { return x; }
     inline bool __syncthreads_and(bool x) { return x; }
+
+    inline void __threadfence() {}
+
     template <typename T>
     inline T __ldg(T const* x) {
       return *x;
     }
 
+    namespace cooperative_groups {
+
+      // This class represents the thread block
+      class thread_block {
+      private:
+        thread_block() = default;
+
+        friend thread_block this_thread_block();
+
+      public:
+        // Synchronize the threads named in the group.
+        // On the serial CPU implementation, do nothing.
+        static void sync() {}
+
+        // Total number of threads in the group.
+        // On the serial CPU implementation, always 1.
+        static unsigned long long size() { return 1; }
+
+        // Rank of the calling thread within [0, size-1].
+        // On the serial CPU implementation, always 0.
+        static unsigned long long thread_rank() { return 0; }
+
+        // 3-Dimensional index of the block within the launched grid.
+        // On the serial CPU implementation, always {0, 0, 0}.
+        static dim3 group_index() { return blockIdx; }
+
+        // 3-Dimensional index of the thread within the launched block.
+        // On the serial CPU implementation, always {0, 0, 0}.
+        static dim3 thread_index() { return threadIdx; }
+
+        // Dimensions of the launched block.
+        // On the serial CPU implementation, always {1, 1, 1}.
+        static dim3 group_dim() { return blockDim; }
+      };
+
+      // Return the current thread block
+      inline thread_block this_thread_block() { return thread_block{}; }
+
+      // Represent a tiled group of threads, with a compile-time fixed size.
+      // On the serial CPU implementation, the only valid Size is 1.
+      template <unsigned int Size, typename ParentT = thread_block>
+      class thread_block_tile {
+      private:
+        static_assert(
+            Size == 1,
+            "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread.");
+
+        thread_block_tile() = default;
+
+        friend thread_block_tile tiled_partition(const ParentT& g);
+
+      public:
+        // Synchronize the threads named in the group.
+        // On the serial CPU implementation, do nothing.
+        void sync() const {}
+
+        // Total number of threads in the group.
+        // On the serial CPU implementation, always 1.
+        unsigned long long size() const { return 1; }
+
+        // Rank of the calling thread within [0, size-1].
+        // On the serial CPU implementation, always 0.
+        unsigned long long thread_rank() const { return 0; }
+
+        // Returns the number of groups created when the parent group was partitioned.
+        // On the serial CPU implementation, always 1.
+        unsigned long long meta_group_size() const { return 1; }
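The block above gives host-only builds a serial stand-in for CUDA's cooperative groups API: every tile holds exactly one thread, so all ranks are 0 and synchronization is a no-op. Below is a minimal sketch of how code written against this shim degenerates on the CPU; it assumes the usual cudaCompat setup in which cms::cudacompat is imported into the global namespace for non-CUDA compilation, and scaleByTwo is a hypothetical helper used only for illustration.

    // Sketch only: assumes cudaCompat exposes cms::cudacompat (and its
    // cooperative_groups namespace) at global scope in host-only builds.
    #include <cstdio>

    #include "CUDACore/cudaCompat.h"

    namespace cg = cooperative_groups;

    // A tile-strided loop written against the cooperative groups API.
    // With the serial shim, thread_rank() is 0 and the stride is tileSize,
    // so for tileSize == 1 this is a plain loop over all elements.
    template <int tileSize>
    void scaleByTwo(float* data, int n) {
      auto tile = cg::tiled_partition<tileSize>(cg::this_thread_block());
      for (int i = tile.thread_rank(); i < n; i += tileSize)
        data[i] *= 2.f;
      tile.sync();  // no-op on the CPU
    }

    int main() {
      float buf[4] = {1.f, 2.f, 3.f, 4.f};
      scaleByTwo<1>(buf, 4);  // only tileSize == 1 passes the static_assert
      std::printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);
      return 0;
    }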
+        // Linear rank of the group within the set of tiles partitioned from a parent group (bounded by meta_group_size).
+        // On the serial CPU implementation, always 0.
+        unsigned long long meta_group_rank() const { return 0; }
+
+        // Not implemented - Refer to Warp Shuffle Functions
+        template <typename T>
+        T shfl(T var, unsigned int src_rank) const;
+        template <typename T>
+        T shfl_up(T var, int delta) const;
+        template <typename T>
+        T shfl_down(T var, int delta) const;
+        template <typename T>
+        T shfl_xor(T var, int delta) const;
+
+        // Not implemented - Refer to Warp Vote Functions
+        template <typename T>
+        T any(int predicate) const;
+        template <typename T>
+        T all(int predicate) const;
+        template <typename T>
+        T ballot(int predicate) const;
+
+        // Not implemented - Refer to Warp Match Functions
+        template <typename T>
+        T match_any(T val) const;
+        template <typename T>
+        T match_all(T val, int& pred) const;
+      };
+
+      template <unsigned int Size, typename ParentT>
+      inline thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g) {
+        static_assert(
+            Size == 1,
+            "The cudaCompat Cooperative Groups implementation supports only tiled groups of a single thread.");
+
+        return thread_block_tile<Size, ParentT>{};
+      }
+
+    }  // namespace cooperative_groups
+
   }  // namespace cudacompat
 }  // namespace cms
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
index f2805d018..745e6a829 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc
@@ -107,7 +107,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                                          params_.dcaCutOuterTriplet_);
 
   if (nhits > 1 && params_.earlyFishbone_) {
-    gpuPixelDoublets::fishbone(
+    gpuPixelDoublets::fishbone<1, 1>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
   }
 
@@ -132,7 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
 
   if (nhits > 1 && params_.lateFishbone_) {
-    gpuPixelDoublets::fishbone(
+    gpuPixelDoublets::fishbone<1, 1>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
   }
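On the CPU path the kernel is instantiated as fishbone<1, 1>: the serial shim only accepts single-thread tiles, so both template parameters must be 1, and the strided loops in the kernel body visit every hit and every doublet in order. The other device primitive the kernel relies on, atomicInc, reduces to a plain post-increment when a single thread runs; the sketch below illustrates those semantics with a hypothetical serialAtomicInc helper, not the actual cudaCompat definition.

    #include <cstdint>
    #include <cstdio>

    // CUDA's atomicInc(address, limit) returns the old value and stores
    // old >= limit ? 0 : old + 1. With a single thread there is no race,
    // so it simply claims the next free slot, as in the fishbone fill loop.
    uint32_t serialAtomicInc(uint32_t* address, uint32_t limit) {
      uint32_t old = *address;
      *address = (old >= limit) ? 0 : old + 1;
      return old;
    }

    int main() {
      uint32_t doublets = 0;
      for (int cell = 0; cell < 3; ++cell) {
        uint32_t index = serialAtomicInc(&doublets, 0xFFFFFFFF);
        std::printf("cell %d stored at index %u\n", cell, index);
      }
      return 0;
    }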
diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
index edc1eb49b..e1661581d 100644
--- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
+++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu
@@ -64,13 +64,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   cudaCheck(cudaGetLastError());
 
   if (nhits > 1 && params_.earlyFishbone_) {
-    auto nthTot = 128;
-    auto stride = 16;
-    auto blockSize = nthTot / stride;
-    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
-    dim3 blks(1, numberOfBlocks, 1);
-    dim3 thrs(stride, blockSize, 1);
-    gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
+    constexpr int threadsPerHit = 16;
+    constexpr int hitsPerBlock = 8;
+    auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
+    gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
     cudaCheck(cudaGetLastError());
   }
@@ -116,13 +113,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   cudaCheck(cudaGetLastError());
 
   if (nhits > 1 && params_.lateFishbone_) {
-    auto nthTot = 128;
-    auto stride = 16;
-    auto blockSize = nthTot / stride;
-    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
-    dim3 blks(1, numberOfBlocks, 1);
-    dim3 thrs(stride, blockSize, 1);
-    gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
+    constexpr int threadsPerHit = 16;
+    constexpr int hitsPerBlock = 8;
+    auto numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
+    gpuPixelDoublets::fishbone<hitsPerBlock, threadsPerHit><<<numberOfBlocks, hitsPerBlock * threadsPerHit, 0, cudaStream>>>(
         hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
     cudaCheck(cudaGetLastError());
   }
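The GPU launch geometry is now one-dimensional: 8 hits per block times 16 threads per hit gives 128 threads per block, the same total as the old nthTot, and the grid is sized with a ceiling division over the number of hits. A quick standalone check of that arithmetic:

    #include <cstdio>

    int main() {
      constexpr int threadsPerHit = 16;
      constexpr int hitsPerBlock = 8;
      constexpr int threadsPerBlock = hitsPerBlock * threadsPerHit;  // 128, as before

      // Ceiling division, as used for numberOfBlocks in launchKernels.
      for (unsigned int nhits : {2u, 8u, 9u, 1000u}) {
        unsigned int numberOfBlocks = (nhits + hitsPerBlock - 1) / hitsPerBlock;
        std::printf("nhits = %4u -> %3u blocks x %d threads (%u hit slots)\n",
                    nhits, numberOfBlocks, threadsPerBlock, numberOfBlocks * hitsPerBlock);
      }
      return 0;
    }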
diff --git a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
index 4a493f0c9..d5c6f393c 100644
--- a/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
+++ b/src/cudadev/plugin-PixelTriplets/gpuFishbone.h
@@ -7,6 +7,9 @@
 #include <cstdint>
 #include <cstdio>
 
+#include <cooperative_groups.h>
+namespace cg = cooperative_groups;
+
 #include "DataFormats/approx_atan2.h"
 #include "Geometry/phase1PixelTopology.h"
 #include "CUDACore/VecArray.h"
@@ -16,76 +19,108 @@ namespace gpuPixelDoublets {
 
-  // __device__
-  // __forceinline__
-  __global__ void fishbone(GPUCACell::Hits const* __restrict__ hhp,
-                           GPUCACell* cells,
-                           uint32_t const* __restrict__ nCells,
-                           GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
-                           uint32_t nHits,
-                           bool checkTrack) {
+  template <int hitsPerBlock, int threadsPerHit>
+  __global__ __launch_bounds__(hitsPerBlock * threadsPerHit) void fishbone(
+      GPUCACell::Hits const* __restrict__ hits_p,
+      GPUCACell* cells,
+      uint32_t const* __restrict__ nCells,
+      GPUCACell::OuterHitOfCell const* __restrict__ isOuterHitOfCell,
+      uint32_t nHits,
+      bool checkTrack) {
     constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;
 
-    auto const& hh = *hhp;
+    auto const& hits = *hits_p;
+
+    // blockDim must be hitsPerBlock times threadsPerHit
+    assert(blockDim.x == hitsPerBlock * threadsPerHit);
+
+    // partition the thread block into threadsPerHit-sized tiles
+    auto tile = cg::tiled_partition<threadsPerHit>(cg::this_thread_block());
+    assert(hitsPerBlock == tile.meta_group_size());
 
-    // x run faster...
-    auto firstY = threadIdx.y + blockIdx.y * blockDim.y;
-    auto firstX = threadIdx.x;
+    auto numberOfBlocks = gridDim.x;
+    auto hitsPerGrid = numberOfBlocks * hitsPerBlock;
+    auto hitInBlock = tile.meta_group_rank();  // 0 .. hitsPerBlock-1
+    auto innerThread = tile.thread_rank();
+    auto firstHit = hitInBlock + blockIdx.x * hitsPerBlock;
 
-    float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];
-    uint16_t d[maxCellsPerHit];  // uint8_t l[maxCellsPerHit];
-    uint32_t cc[maxCellsPerHit];
+    // hitsPerBlock buffers in shared memory
+    __shared__ float s_x[hitsPerBlock][maxCellsPerHit];
+    __shared__ float s_y[hitsPerBlock][maxCellsPerHit];
+    __shared__ float s_z[hitsPerBlock][maxCellsPerHit];
+    __shared__ float s_length2[hitsPerBlock][maxCellsPerHit];
+    __shared__ uint32_t s_cc[hitsPerBlock][maxCellsPerHit];
+    __shared__ uint16_t s_detId[hitsPerBlock][maxCellsPerHit];
+    __shared__ uint32_t s_doublets[hitsPerBlock];
 
-    for (int idy = firstY, nt = nHits; idy < nt; idy += gridDim.y * blockDim.y) {
-      auto const& vc = isOuterHitOfCell[idy];
+    // buffers used by the current tile
+    float(&x)[maxCellsPerHit] = s_x[hitInBlock];
+    float(&y)[maxCellsPerHit] = s_y[hitInBlock];
+    float(&z)[maxCellsPerHit] = s_z[hitInBlock];
+    float(&length2)[maxCellsPerHit] = s_length2[hitInBlock];
+    uint32_t(&cc)[maxCellsPerHit] = s_cc[hitInBlock];
+    uint16_t(&detId)[maxCellsPerHit] = s_detId[hitInBlock];
+    uint32_t& doublets = s_doublets[hitInBlock];
+
+    // outer loop: parallelize over the hits
+    for (int hit = firstHit; hit < (int)nHits; hit += hitsPerGrid) {
+      auto const& vc = isOuterHitOfCell[hit];
       auto size = vc.size();
       if (size < 2)
         continue;
+
       // if aligned, kill one of the two.
       // in principle one could try to relax the cut (only in r-z?) for jumping-doublets
       auto const& c0 = cells[vc[0]];
-      auto xo = c0.outer_x(hh);
-      auto yo = c0.outer_y(hh);
-      auto zo = c0.outer_z(hh);
-      auto sg = 0;
-      for (int32_t ic = 0; ic < size; ++ic) {
-        auto& ci = cells[vc[ic]];
-        if (ci.unused())
+      auto xo = c0.outer_x(hits);
+      auto yo = c0.outer_y(hits);
+      auto zo = c0.outer_z(hits);
+      if (innerThread == 0) {
+        doublets = 0;
+      }
+      tile.sync();
+      for (int32_t i = innerThread; i < size; i += threadsPerHit) {
+        auto& cell = cells[vc[i]];
+        if (cell.unused())
           continue;  // for triplets equivalent to next
-        if (checkTrack && ci.tracks().empty())
+        if (checkTrack && cell.tracks().empty())
          continue;
-        cc[sg] = vc[ic];
-        d[sg] = ci.inner_detIndex(hh);
-        x[sg] = ci.inner_x(hh) - xo;
-        y[sg] = ci.inner_y(hh) - yo;
-        z[sg] = ci.inner_z(hh) - zo;
-        n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg];
-        ++sg;
+        auto index = atomicInc(&doublets, 0xFFFFFFFF);
+        cc[index] = vc[i];
+        detId[index] = cell.inner_detIndex(hits);
+        x[index] = cell.inner_x(hits) - xo;
+        y[index] = cell.inner_y(hits) - yo;
+        z[index] = cell.inner_z(hits) - zo;
+        length2[index] = x[index] * x[index] + y[index] * y[index] + z[index] * z[index];
       }
-      if (sg < 2)
+
+      tile.sync();
+      if (doublets < 2)
        continue;
-      // here we parallelize
-      for (int32_t ic = firstX; ic < sg - 1; ic += blockDim.x) {
-        auto& ci = cells[cc[ic]];
-        for (auto jc = ic + 1; jc < sg; ++jc) {
-          auto& cj = cells[cc[jc]];
-          // must be different detectors (in the same layer)
-          // if (d[ic]==d[jc]) continue;
-          // || l[ic]!=l[jc]) continue;
-          auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
-          if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
-            // alligned: kill farthest (prefer consecutive layers)
-            if (n[ic] > n[jc]) {
-              ci.kill();
+
+      // inner loop: parallelize over the doublets
+      for (uint32_t i = innerThread; i < doublets - 1; i += threadsPerHit) {
+        auto& cell_i = cells[cc[i]];
+        for (uint32_t j = i + 1; j < doublets; ++j) {
+          // must be different detectors (potentially in the same layer)
+          if (detId[i] == detId[j])
+            continue;
+          auto& cell_j = cells[cc[j]];
+          auto cos12 = x[i] * x[j] + y[i] * y[j] + z[i] * z[j];
+          if (cos12 * cos12 >= 0.99999f * length2[i] * length2[j]) {
+            // aligned: kill the farthest (prefer consecutive layers)
+            if (length2[i] > length2[j]) {
+              cell_i.kill();
               break;
             } else {
-              cj.kill();
+              cell_j.kill();
             }
           }
-        }  //cj
-      }    // ci
-    }      // hits
+        }  // j
+      }  // i
+    }  // hit
   }
+
 }  // namespace gpuPixelDoublets
 
 #endif  // RecoPixelVertexing_PixelTriplets_plugins_gpuFishbone_h
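For reference, the kill criterion in the kernel never computes an angle explicitly: comparing the squared dot product against 0.99999 times the product of the squared lengths is equivalent to requiring cos^2(theta) >= 0.99999, i.e. the two doublets point within roughly 0.18 degrees of each other (the squared form also accepts the anti-parallel case). A standalone illustration, with a hypothetical aligned() helper:

    #include <cmath>
    #include <cstdio>

    // Same test as in the kernel: (a . b)^2 >= 0.99999 * |a|^2 * |b|^2,
    // which avoids both sqrt and division in the inner loop.
    bool aligned(float ax, float ay, float az, float bx, float by, float bz) {
      float cos12 = ax * bx + ay * by + az * bz;
      float length2a = ax * ax + ay * ay + az * az;
      float length2b = bx * bx + by * by + bz * bz;
      return cos12 * cos12 >= 0.99999f * length2a * length2b;
    }

    int main() {
      const double pi = std::acos(-1.0);
      // Threshold angle: acos(sqrt(0.99999)) ~ 0.0032 rad ~ 0.18 degrees.
      std::printf("threshold: %.3f deg\n", std::acos(std::sqrt(0.99999)) * 180.0 / pi);
      std::printf("parallel:      %d\n", aligned(1.f, 0.f, 0.f, 2.f, 0.f, 0.f));        // 1
      std::printf("anti-parallel: %d\n", aligned(1.f, 0.f, 0.f, -2.f, 0.f, 0.f));       // 1
      std::printf("10 degrees:    %d\n", aligned(1.f, 0.f, 0.f, 0.985f, 0.174f, 0.f));  // 0
      return 0;
    }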