Explicitly skipping local_laplacian tests for GPU targets

antonysigma · antonysigma · commit 0ec4904ad754 · 2025-07-18T10:32:47.000-07:00
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
@@ -18,19 +18,12 @@ add_halide_generator(local_laplacian.generator
 
 set(_local_laplacian_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(Halide_TARGET MATCHES "cuda")
+if(Halide_TARGET MATCHES "cuda|metal|opencl|vulkan")
     # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
     # tuned to pass the Builbot tests.
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(Halide_TARGET MATCHES "metal|opencl|vulkan")
-    # The pipeline is shared GPU memory bounded. Limit the parallelism to
-    # minimal value (=32) to cap the shared GPU memory size.
-    list(APPEND _local_laplacian_autoscheduler_params
-        autoscheduler.last_level_cache_size=1000
-        autoscheduler.parallelism=32
-    )
 endif()
 
 # Filters
@@ -58,5 +51,9 @@ if (EXISTS ${IMAGE})
     set_tests_properties(local_laplacian_process PROPERTIES
                          LABELS local_laplacian
                          PASS_REGULAR_EXPRESSION "Success!"
-                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
+                         # Pass in the keyword "metal" etc to skip the test
+                         # explicitly. Buildbot can print a nice test report
+                         # for all skipped tests.
+                         ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
 endif ()
diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
@@ -1,5 +1,7 @@
 #include <chrono>
 #include <cstdio>
+#include <cstdlib>
+#include <regex>
 
 #include "local_laplacian.h"
 #ifndef NO_AUTO_SCHEDULE
@@ -13,13 +15,59 @@
 using namespace Halide::Runtime;
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    IS_CUDA,
+    NOT_CUDA,
+    ENV_VARIABLE_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = std::getenv("HL_TARGET");
+    if (hl_target == nullptr) {
+        printf("Warning: Environment variable HL_TARGET not specified. "
+               "Proceeding to the tests...\n");
+        return ENV_VARIABLE_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"cuda|metal|vulkan|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
+        // failed
+        //
+        // 2025-07-17 17:24:32.170 local_laplacian_process[63513:6587844] Metal
+        // API Validation Enabled -[MTLDebugComputeCommandEncoder
+        // _validateThreadsPerThreadgroup:]:1266: failed assertion
+        // `(threadsPerThreadgroup.width(62) * threadsPerThreadgroup.height(32)
+        // * threadsPerThreadgroup.depth(1))(1984) must be <= 1024. (device
+        // threadgroup size limit)'
+        //
+        // Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
+        printf("[SKIP] Mullapudi2016 experimental GPU schedules "
+               "over-estimates the gpu_threads where thread count per block "
+               "is not an multiple of 32. Target = %s. Skipping...\n",
+               hl_target);
+
+        return NOT_CUDA;
+    }
+
+    return IS_CUDA;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc < 7) {
         printf("Usage: ./process input.png levels alpha beta timing_iterations output.png\n"
                "e.g.: ./process input.png 8 1 1 10 output.png\n");
         return 1;
     }
 
+    if (ensure_cuda_device() == NOT_CUDA) {
+        return 0;
+    }
+
     // Input may be a PNG8
     Buffer<uint16_t, 3> input = load_and_convert_image(argv[1]);