Fix shared memory overflow due to config setting error (#98)

scal444 · evasnow1992 · commit bbccd3021d1f · 2026-02-20T11:49:43.000-08:00
diff --git a/src/substruct/recursive_preprocessor.cu b/src/substruct/recursive_preprocessor.cu
@@ -89,6 +89,13 @@ void LeafSubpatterns::buildAllPatterns(const MoleculesHost& queriesHost) {
     }
   }
 
+  if (patternsHost.numMolecules() > 0) {
+    for (size_t i = 0; i < patternsHost.numMolecules(); ++i) {
+      const int atoms  = patternsHost.batchAtomStarts[i + 1] - patternsHost.batchAtomStarts[i];
+      maxPatternAtoms_ = std::max(maxPatternAtoms_, atoms);
+    }
+  }
+
   // Second pass: build precomputed BatchedPatternEntry structures
   perQueryPatterns.resize(numQueries);
   perQueryMaxDepth.resize(numQueries, 0);
@@ -146,7 +153,7 @@ void LeafSubpatterns::buildAllPatterns(const MoleculesHost& queriesHost) {
     }
 
     for (int d = 0; d <= queryMaxDepth; ++d) {
-      const auto& srcEntries = perQueryPatterns[queryIdx][d];
+      const auto& srcEntries  = perQueryPatterns[queryIdx][d];
       auto&       destEntries = allQueriesPatternsAtDepth[d];
       destEntries.insert(destEntries.end(), srcEntries.begin(), srcEntries.end());
     }
@@ -190,6 +197,11 @@ void RecursivePatternPreprocessor::preprocessMiniBatch(
 
   scratch.setStream(stream);
 
+  const auto baseProps        = getTemplateConfigProperties(templateConfig);
+  const int  paintQueryAtoms  = std::max(baseProps.maxQueryAtoms, leafSubpatterns_.maxPatternAtoms());
+  const int  paintTargetAtoms = std::max(baseProps.maxTargetAtoms, paintQueryAtoms);
+  const auto paintConfig      = selectTemplateConfig(paintTargetAtoms, paintQueryAtoms, baseProps.maxBondsPerAtom);
+
   constexpr int gsiBuffersPerBlock = 2;
 
   const int maxPaintPairsPerSubBatch = std::max(miniBatchSize, 1024);
@@ -257,7 +269,7 @@ void RecursivePatternPreprocessor::preprocessMiniBatch(
       }
       isFirstLabelKernel = false;
 
-      launchLabelMatrixPaintKernel(templateConfig,
+      launchLabelMatrixPaintKernel(paintConfig,
                                    targetsDevice.view<MoleculeType::Target>(),
                                    leafSubpatterns_.view(),
                                    scratch.patternEntries.data(),
@@ -273,7 +285,7 @@ void RecursivePatternPreprocessor::preprocessMiniBatch(
                                    zeroBuffers,
                                    stream);
 
-      launchSubstructPaintKernel(templateConfig,
+      launchSubstructPaintKernel(paintConfig,
                                  algorithm,
                                  targetsDevice.view<MoleculeType::Target>(),
                                  leafSubpatterns_.view(),
@@ -376,6 +388,11 @@ void preprocessRecursiveSmarts(SubstructTemplateConfig           templateConfig,
   const int lastTargetInMiniBatch  = (miniBatchPairOffset + miniBatchSize - 1) / numQueries;
   const int numTargetsInMiniBatch  = lastTargetInMiniBatch - firstTargetInMiniBatch + 1;
 
+  const auto baseProps        = getTemplateConfigProperties(templateConfig);
+  const int  paintQueryAtoms  = std::max(baseProps.maxQueryAtoms, leafSubpatterns.maxPatternAtoms());
+  const int  paintTargetAtoms = std::max(baseProps.maxTargetAtoms, paintQueryAtoms);
+  const auto paintConfig      = selectTemplateConfig(paintTargetAtoms, paintQueryAtoms, baseProps.maxBondsPerAtom);
+
   constexpr int gsiBuffersPerBlock = 2;
 
   const int maxPaintPairsPerSubBatch = std::max(miniBatchSize, 1024);
@@ -449,7 +466,7 @@ void preprocessRecursiveSmarts(SubstructTemplateConfig           templateConfig,
       }
       isFirstLabelKernel = false;
 
-      launchLabelMatrixPaintKernel(templateConfig,
+      launchLabelMatrixPaintKernel(paintConfig,
                                    targetsDevice.view<MoleculeType::Target>(),
                                    leafSubpatterns.view(),
                                    scratch.patternEntries.data(),
@@ -465,7 +482,7 @@ void preprocessRecursiveSmarts(SubstructTemplateConfig           templateConfig,
                                    zeroBuffers,
                                    stream);
 
-      launchSubstructPaintKernel(templateConfig,
+      launchSubstructPaintKernel(paintConfig,
                                  algorithm,
                                  targetsDevice.view<MoleculeType::Target>(),
                                  leafSubpatterns.view(),
diff --git a/src/substruct/recursive_preprocessor.h b/src/substruct/recursive_preprocessor.h
@@ -87,6 +87,8 @@ struct LeafSubpatterns {
   /// Max recursion depth across all queries
   int allQueriesMaxDepth = 0;
 
+  int maxPatternAtoms_ = 0;  ///< Max atoms in any leaf subpattern; updated by buildAllPatterns().
+
   LeafSubpatterns() = default;
 
   /**
@@ -130,6 +132,11 @@ struct LeafSubpatterns {
    */
   [[nodiscard]] size_t size() const { return patternIndexMap.size(); }
 
+  /**
+   * @brief Max atom count across all leaf subpatterns; returns maxPatternAtoms_ (default-initialized to 0).
+   */
+  [[nodiscard]] int maxPatternAtoms() const { return maxPatternAtoms_; }
+
   /**
    * @brief Get view for kernel access.
    */
diff --git a/tests/test_recursive_preprocessor.cu b/tests/test_recursive_preprocessor.cu
@@ -177,3 +177,101 @@ TEST(RecursivePreprocessorTest, PaintsBitsForSimpleRecursivePattern) {
   EXPECT_FALSE(hasRecursiveBit(1, 0));
   EXPECT_FALSE(hasRecursiveBit(1, 1));
 }
+
+/**
+ * @brief Regression test: a leaf subpattern with more atoms than the caller's MaxQueryAtoms
+ *        template tier should not overflow the shared memory label matrix.
+ */
+TEST(RecursivePreprocessorTest, LeafPatternLargerThanConfigMaxQueryAtoms) {
+  ScopedStream stream;
+  // Inputs are sized so the asserts below hold: target >= 32 atoms, leaf pattern > 16 atoms.
+  auto target = makeMolFromSmiles("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
+  auto query  = makeMolFromSmarts("[$(*~C~C~C~C~C~C~C~C~C~C~C~C~C~C~C~C~C)]");
+
+  ASSERT_NE(target, nullptr);
+  ASSERT_NE(query, nullptr);
+  // Wrap the single target and single query into host batches (empty sort order supplied).
+  std::vector<const RDKit::ROMol*> targets = {target.get()};
+  std::vector<const RDKit::ROMol*> queries = {query.get()};
+  std::vector<int>                 emptySortOrder;
+
+  MoleculesHost targetsHost;
+  nvMolKit::buildTargetBatchParallelInto(targetsHost, 1, targets, emptySortOrder);
+  MoleculesHost queriesHost = nvMolKit::buildQueryBatchParallel(queries, emptySortOrder, 1);
+
+  const int maxTargetAtoms = maxAtomsPerTarget(targetsHost);
+  ASSERT_GE(maxTargetAtoms, 32);
+  // Copy the target batch to the device.
+  MoleculesDevice targetsDevice(stream.stream());
+  targetsDevice.copyFromHost(targetsHost);
+  // Build the preprocessor's patterns from the query batch and sync them to the device.
+  RecursivePatternPreprocessor preprocessor;
+  preprocessor.buildPatterns(queriesHost);
+  preprocessor.syncToDevice(stream.stream());
+  // Precondition: the leaf pattern must exceed 16 atoms (presumably the Q16 limit of the config used below).
+  const LeafSubpatterns& leafSubpatterns = preprocessor.leafSubpatterns();
+  ASSERT_FALSE(leafSubpatterns.empty());
+  ASSERT_GT(leafSubpatterns.maxPatternAtoms(), 16);
+
+  const int numTargets    = 1;
+  const int numQueries    = 1;
+  const int miniBatchSize = numTargets * numQueries;
+  // Result buffers for the single target/query pair, with recursive match bits zeroed.
+  AsyncDeviceVector<int> pairMatchStartsDev(static_cast<size_t>(miniBatchSize + 1), stream.stream());
+  pairMatchStartsDev.zero();
+  MiniBatchResultsDevice miniBatchResults(stream.stream());
+  miniBatchResults.allocateMiniBatch(miniBatchSize, pairMatchStartsDev.data(), 0, numQueries, maxTargetAtoms, 2);
+  const std::vector<int> atomCounts = queryAtomCounts(queriesHost);
+  miniBatchResults.setQueryAtomCounts(atomCounts.data(), atomCounts.size());
+  miniBatchResults.zeroRecursiveBits();
+
+  RecursiveScratchBuffers scratch(stream.stream());
+  scratch.allocateBuffers(256);
+  // Gather query 0's precomputed pattern entries, one vector per recursion depth.
+  std::array<std::vector<BatchedPatternEntry>, kMaxSmartsNestingDepth + 1> patternsAtDepth;
+  for (auto& vec : patternsAtDepth) {  // default-constructed vectors are already empty; clear() is defensive
+    vec.clear();
+  }
+
+  const int queryMaxDepth = leafSubpatterns.perQueryMaxDepth.empty() ? 0 : leafSubpatterns.perQueryMaxDepth[0];
+  for (int depth = 0; depth <= queryMaxDepth; ++depth) {
+    const auto& src = leafSubpatterns.perQueryPatterns[0][depth];
+    patternsAtDepth[depth].insert(patternsAtDepth[depth].end(), src.begin(), src.end());
+  }
+  // Run with the T32/Q16 config even though the leaf pattern is larger; this is the case under test.
+  preprocessor.preprocessMiniBatch(SubstructTemplateConfig::Config_T32_Q16_B4,
+                                   targetsDevice,
+                                   miniBatchResults,
+                                   numQueries,
+                                   0,
+                                   miniBatchSize,
+                                   SubstructAlgorithm::GSI,
+                                   stream.stream(),
+                                   scratch,
+                                   patternsAtDepth,
+                                   queryMaxDepth,
+                                   0,
+                                   numTargets,
+                                   nullptr,
+                                   0);
+  // Wait for the stream and surface any pending CUDA error before inspecting results.
+  cudaCheckError(cudaStreamSynchronize(stream.stream()));
+  cudaCheckError(cudaGetLastError());
+  // Copy the recursive match bits back to the host.
+  std::vector<uint32_t> hostBits(static_cast<size_t>(miniBatchSize) * maxTargetAtoms);
+  cudaCheckError(cudaMemcpyAsync(hostBits.data(),
+                                 miniBatchResults.recursiveMatchBits(),
+                                 hostBits.size() * sizeof(uint32_t),
+                                 cudaMemcpyDeviceToHost,
+                                 stream.stream()));
+  cudaCheckError(cudaStreamSynchronize(stream.stream()));
+  // At least one recursive match bit must be set for the test to pass.
+  bool anyBitSet = false;
+  for (size_t i = 0; i < hostBits.size(); ++i) {
+    if (hostBits[i] != 0) {
+      anyBitSet = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(anyBitSet);
+}