diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
index dba9fe12f5492..5c21a39302d70 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -1,8 +1,6 @@
 #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
 #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
 
-// #define CLUS_LIMIT_LOOP
-
 #include <cstdint>
 #include <cstdio>
 
@@ -87,8 +85,8 @@ namespace gpuClustering {
     __syncthreads();
 
     assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
-    assert(msize-firstPixel<maxPixInModule);  
- 
+    assert(msize-firstPixel<maxPixInModule);
+
 
 #ifdef GPU_DEBUG
     __shared__ uint32_t totGood;
@@ -122,20 +120,16 @@ namespace gpuClustering {
         hist.fill(y[i],i-firstPixel);
       }
 
-#ifdef CLUS_LIMIT_LOOP
     // assume that we can cover the whole module with up to 10 blockDim.x-wide iterations
     constexpr int maxiter = 10;
     if (threadIdx.x==0) {
       assert((hist.size()/ blockDim.x) <= maxiter);
     }
-    uint16_t const * jmax[maxiter];
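+    // nearest neighbours: nn[k] caches, for the k-th pixel handled by this thread, up to 5 neighbour indices (relative to firstPixel)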
+    uint16_t nn[maxiter][5];
+    uint8_t nnn[maxiter]; // number of nn
     for (int k = 0; k < maxiter; ++k)
-      jmax[k] = hist.end();
-#endif
-
-    __shared__ int nloops;
-    nloops=0;
-
+      nnn[k] = 0;
 
     __syncthreads();  // for hit filling!
 
@@ -144,7 +138,7 @@ namespace gpuClustering {
     __shared__ uint32_t n40,n60;
     n40=n60=0;
     __syncthreads();
-    for (auto j=threadIdx.x; j<Hist::nbins(); j+=blockDim.x) { 
+    for (auto j=threadIdx.x; j<Hist::nbins(); j+=blockDim.x) {
       if(hist.size(j)>60) atomicAdd(&n60,1);
       if(hist.size(j)>40) atomicAdd(&n40,1);
      }
@@ -156,11 +150,31 @@ namespace gpuClustering {
     __syncthreads();
 #endif
 
+    // fill NN: for each pixel owned by this thread, cache the later histogram entries (up to the end of bin(y+1)) with |dx|<=1
+    for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
+        assert(k<maxiter);                // nn/nnn have only maxiter rows; the size/blockDim check above truncates and runs on thread 0 only
+        auto p = hist.begin()+j;
+        auto i = *p + firstPixel;         // global index of this thread's k-th pixel
+        assert (id[i] != InvId);
+        assert(id[i] == thisModuleId);    // same module
+        int be = Hist::bin(y[i]+1);       // last bin that can still hold a neighbour in y
+        auto e = hist.end(be);
+        for (++p; p<e; ++p) {             // scan only the entries that follow this pixel
+          auto m = (*p)+firstPixel;
+          assert(m!=i);
+          if (std::abs(int(x[m]) - int(x[i])) > 1) continue;   // not adjacent in x
+          auto l = nnn[k]++;
+          assert(l<5);                    // second dimension of nn
+          nn[k][l]=*p;                    // stored relative to firstPixel
+        }
+     }
+
     // for each pixel, look at all the pixels until the end of the module;
     // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
     // after the loop, all the pixel in each cluster should have the id equeal to the lowest
     // pixel in the cluster ( clus[i] == i ).
     bool more = true;
+    int nloops=0;
     while (__syncthreads_or(more)) {
       if (1==nloops%2) {
         for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
@@ -175,45 +189,33 @@ namespace gpuClustering {
         for (int j=threadIdx.x, k = 0; j<hist.size(); j+=blockDim.x, ++k) {
           auto p = hist.begin()+j;
           auto i = *p + firstPixel;
-          assert (id[i] != InvId);
-          assert(id[i] == thisModuleId);    // same module
-#ifdef CLUS_LIMIT_LOOP
-          auto jm = jmax[k];
-          jmax[k] = p + 1;
-#endif
-          int be = Hist::bin(y[i]+1);
-          auto e = hist.end(be);
-#ifdef CLUS_LIMIT_LOOP
-          e = std::min(e,jm);
-#endif      
-          // loop to columns
-          auto loop = [&](uint16_t const * kk) {
-            auto m = (*kk)+firstPixel;
+          for (int kk=0; kk<nnn[k]; ++kk) {
+            auto l = nn[k][kk];
+            auto m = l+firstPixel;
             assert(m!=i);
-            if (std::abs(int(x[m]) - int(x[i])) > 1) return;
-            // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1
             auto old = atomicMin(&clusterId[m], clusterId[i]);
             if (old != clusterId[i]) {
               // end the loop only if no changes were applied
               more = true;
             }
             atomicMin(&clusterId[i], old);
-#ifdef CLUS_LIMIT_LOOP
-            // update the loop boundary for the next iteration
-            jmax[k] = std::max(kk + 1,jmax[k]);
-#endif
-          };
-          ++p;
-          for (;p<e;++p) loop(p);
+          } // nnloop
         } // pixel loop
-        }
-        if (threadIdx.x==0) ++nloops;
+      }
+      ++nloops;
     }  // end while
 
 #ifdef GPU_DEBUG
+   {
+     __shared__ int n0;
+     if (threadIdx.x == 0) n0=nloops;
+     __syncthreads();
+     auto ok = n0==nloops;
+     assert(__syncthreads_and(ok));
    if (thisModuleId % 100 == 1)
       if (threadIdx.x == 0)
         printf("# loops %d\n",nloops);
+   }
 #endif
 
     __shared__ unsigned int foundClusters;
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
index a01c1230586fe..39c56d674eb1e 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
@@ -232,7 +232,7 @@ int main(void)
   cuda::memory::copy(d_y.get(), h_y.get(), size16);
   cuda::memory::copy(d_adc.get(), h_adc.get(), size16);
   // Launch CUDA Kernels
-  int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 64 : 256);
+  int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 128 : 256);
   int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
   std::cout
     << "CUDA countModules kernel launch with " << blocksPerGrid
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
index 85dc10ee04587..575e6c39532aa 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorKernels.cu
@@ -144,9 +144,11 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2,  // just to z
   constexpr auto hardCurvCut = 1.f/(0.35f * 87.f); // FIXME VI tune
   constexpr auto ptmin = 0.9f; // FIXME original "tune"
 
-  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y;
+  auto first = threadIdx.x;
+  auto stride = blockDim.x;
 
-  if (0==cellIndex) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
+  if (0==(cellIndex+first)) { (*apc1)=0; (*apc2)=0; }// ready for next kernel
 
   if (cellIndex >= (*nCells) ) return;
   auto const & thisCell = cells[cellIndex];
@@ -154,7 +156,7 @@ kernel_connect(AtomicPairCounter * apc1, AtomicPairCounter * apc2,  // just to z
   auto innerHitId = thisCell.get_inner_hit_id();
   auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
   auto vi = isOuterHitOfCell[innerHitId].data();
-  for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
+  for (auto j = first; j < numberOfPossibleNeighbors; j+=stride) {
      auto otherCell = __ldg(vi+j);
      if (cells[otherCell].theDoubletId<0) continue;
      if (thisCell.check_alignment(hh,
@@ -172,6 +174,8 @@ void kernel_find_ntuplets(
     unsigned int minHitsPerNtuplet)
 {
 
+  // recursive: not obvious to widen
+
   auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
   if (cellIndex >= (*nCells) ) return;
   auto &thisCell = cells[cellIndex];
@@ -246,23 +250,27 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
   assert(nhits <= PixelGPUConstants::maxNumberOfHits);
   
   if (earlyFishbone_) {
-    auto blockSize = 128;
+    auto blockSize = 64;
     auto stride = 4;
     auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
-    numberOfBlocks *=stride;
-  
-    fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+    dim3 blks(1,numberOfBlocks,1);
+    dim3 thrs(stride,blockSize,1);
+    fishbone<<<blks,thrs, 0, cudaStream>>>(
       hh.gpu_d,
       device_theCells_, device_nCells_,
       device_isOuterHitOfCell_,
-      nhits, stride, false
+      nhits, false
     );
     cudaCheck(cudaGetLastError());
   }
 
   auto blockSize = 64;
   auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize;
-  kernel_connect<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+  auto stride = 4;
+  dim3 blks(1,numberOfBlocks,1);
+  dim3 thrs(stride,blockSize,1);
+
+  kernel_connect<<<blks, thrs, 0, cudaStream>>>(
       gpu_.apc_d, device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
       hh.gpu_d,
       device_theCells_, device_nCells_,
@@ -282,14 +290,16 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
   cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(gpu_.apc_d,gpu_.tuples_d);
 
   if (lateFishbone_) {
-    auto stride=4;
-    numberOfBlocks = (nhits + blockSize - 1)/blockSize;
-    numberOfBlocks *=stride;
-    fishbone<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+    auto blockSize = 64;
+    auto stride = 4;
+    auto numberOfBlocks = (nhits + blockSize - 1)/blockSize;
+    dim3 blks(1,numberOfBlocks,1);
+    dim3 thrs(stride,blockSize,1);
+    fishbone<<<blks,thrs, 0, cudaStream>>>(
       hh.gpu_d,
       device_theCells_, device_nCells_,
       device_isOuterHitOfCell_,
-      nhits, stride, true
+      nhits, true
     );
     cudaCheck(cudaGetLastError());
   }
@@ -312,9 +322,13 @@ void CAHitQuadrupletGeneratorKernels::launchKernels( // here goes algoparms....
 void CAHitQuadrupletGeneratorKernels::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
   auto nhits = hh.nHits;
 
-  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize;
+  int stride=4;
+  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize/stride;
   int blocks = (3 * nhits + threadsPerBlock - 1) / threadsPerBlock;
-  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
+  dim3 blks(1,blocks,1);
+  dim3 thrs(stride,threadsPerBlock,1);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream>>>(
+            device_theCells_, device_nCells_, hh.gpu_d, device_isOuterHitOfCell_);
   cudaCheck(cudaGetLastError());
 }
 
@@ -330,4 +344,3 @@ void CAHitQuadrupletGeneratorKernels::classifyTuples(HitsOnCPU const & hh, Tuple
     kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_theCells_, device_nCells_,tuples.tuples_d,tuples.helix_fit_results_d, tuples.quality_d);
 
 }
-
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
index 717cbf777fcdb..796241eaf50ff 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -26,7 +26,7 @@ namespace gpuPixelDoublets {
                GPUCACell * cells, uint32_t const * __restrict__ nCells,
                GPUCACell::OuterHitOfCell const * __restrict__ isOuterHitOfCell,
                uint32_t nHits,
-               uint32_t stride, bool checkTrack) {
+               bool checkTrack) {
 
     constexpr auto maxCellsPerHit = GPUCACell::maxCellsPerHit;
 
@@ -35,13 +35,12 @@ namespace gpuPixelDoublets {
     uint8_t const * __restrict__ layerp =  hh.phase1TopologyLayer_d;
     auto layer = [&](uint16_t id) { return __ldg(layerp+id/phase1PixelTopology::maxModuleStride);};
 
-    auto ldx = threadIdx.x + blockIdx.x * blockDim.x;
-    auto idx = ldx/stride;
-    auto first = ldx - idx*stride;
-    assert(first<stride);
+    // x runs faster: idy selects the hit, threadIdx.x the first cell of the parallel loop below
+    auto idy = threadIdx.y + blockIdx.y * blockDim.y;
+    auto first = threadIdx.x;
 
-    if (idx>=nHits) return;
-    auto const & vc = isOuterHitOfCell[idx];
+    if (idy>=nHits) return;
+    auto const & vc = isOuterHitOfCell[idy];
     auto s = vc.size();
     if (s<2) return;
     // if alligned kill one of the two.
@@ -66,8 +65,8 @@ namespace gpuPixelDoublets {
       ++sg;
     }
     if (sg<2) return;   
-    // here we parallelize
-    for (uint32_t ic=first; ic<sg-1;  ic+=stride) {
+    // here we parallelize: the threads along x split the outer (ic) loop
+    for (uint32_t ic=first; ic<sg-1;  ic+=blockDim.x) {
       auto & ci = cells[cc[ic]];
       for    (auto jc=ic+1; jc<sg; ++jc) {
         auto & cj = cells[cc[jc]];
@@ -90,4 +89,4 @@ namespace gpuPixelDoublets {
 
 }
 
-#endif
+#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuFishbone_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
index 02a175fcc2903..770fe5569b081 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -34,7 +34,8 @@ namespace gpuPixelDoublets {
                          int16_t const * __restrict__ phicuts,
                          float const * __restrict__ minz,
                          float const * __restrict__ maxz,
-                         float const * __restrict__ maxr)
+                         float const * __restrict__ maxr
+                         )
   {
     auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; };
 
@@ -50,8 +51,11 @@ namespace gpuPixelDoublets {
     }
     auto ntot = innerLayerCumulativeSize[nPairs-1];
 
-    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
-    for (auto j = idx; j < ntot; j += blockDim.x * gridDim.x) {
+    // x runs faster: y strides over j, x over the entries of each histogram bin
+    auto idy = blockIdx.y * blockDim.y + threadIdx.y;
+    auto first = threadIdx.x;
+    auto stride = blockDim.x;
+    for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y ) {
 
       uint32_t pairLayerId=0;
       while (j >= innerLayerCumulativeSize[pairLayerId++]);
@@ -115,7 +119,8 @@ namespace gpuPixelDoublets {
           nmin += hist.size(kk+hoff);
         auto const * __restrict__ p = hist.begin(kk+hoff);
         auto const * __restrict__ e = hist.end(kk+hoff);
-        for (;p < e; ++p) {
+        p+=first;
+        for (;p < e; p+=stride) {
           auto oi=__ldg(p);
           assert(oi>=offsets[outer]);
           assert(oi<offsets[outer+1]);
@@ -139,7 +144,7 @@ namespace gpuPixelDoublets {
     }  // loop in block...
   }
 
-  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
+  constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for x and y combined (blockDim.x * blockDim.y)
   constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;
 
   __global__