diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index d7a44a2cda9b..a08c6b825454 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -14,21 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 
+set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _bgu_autoscheduler_params
+         autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-# Note(antonysigma): experimental GPU schedule failed on the Buildbot worker
-# "halide-testbranch-main-llvm18-x86-64-linux-cmake" with error:
-#
-# CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS cuCtxSynchronize failed
-#
-# Curiously, it works on a low-end GPU: Nvidia GTX 1660S.
-#
-# Uncomment the following code to debug. PARAMS
-# autoscheduler.experimental_gpu_schedule=1
-)
+                   PARAMS ${_bgu_autoscheduler_params})
 
 # Main executable
 add_executable(bgu_filter filter.cpp)
diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
index 94bf7a1ae447..ef36ff938d4e 100644
--- a/apps/camera_pipe/CMakeLists.txt
+++ b/apps/camera_pipe/CMakeLists.txt
@@ -16,11 +16,29 @@ add_halide_generator(camera_pipe.generator
                      SOURCES camera_pipe_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
+set(_camera_pipe_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Buildbot tests.
+    list(APPEND _camera_pipe_autoscheduler_params
+         autoscheduler.last_level_cache_size=10000
+    )
+elseif(Halide_TARGET MATCHES "opencl|vulkan")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _camera_pipe_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_camera_pipe_autoscheduler_params})
 
 # Main executable
 add_executable(camera_pipe_process process.cpp)
diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
index b6f95e554383..dff4b192816f 100644
--- a/apps/harris/CMakeLists.txt
+++ b/apps/harris/CMakeLists.txt
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(harris.generator SOURCES harris_generator.cpp)
 
+set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "opencl|metal|cuda|vulkan")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _harris_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_harris_autoscheduler_params})
 
 # Main executable
 add_executable(harris_filter filter.cpp)
diff --git a/apps/harris/harris_generator.cpp b/apps/harris/harris_generator.cpp
index 69cf8c05c68c..bd35b4ae25d4 100644
--- a/apps/harris/harris_generator.cpp
+++ b/apps/harris/harris_generator.cpp
@@ -66,7 +66,7 @@ class Harris : public Halide::Generator<Harris> {
         const int kHeight = 2560;
         input.dim(0).set_estimate(0, kWidth);
         input.dim(1).set_estimate(0, kHeight);
-        input.dim(2).set_estimate(0, 3);
+        input.dim(2).set_estimate(0, 4);
         output.dim(0).set_estimate(3, kWidth - 6);
         output.dim(1).set_estimate(3, kHeight - 6);
     }
diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
index 0ca3233968f3..474af15dcf35 100644
--- a/apps/iir_blur/CMakeLists.txt
+++ b/apps/iir_blur/CMakeLists.txt
@@ -32,18 +32,11 @@ target_link_libraries(iir_blur_filter PRIVATE
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "opencl")
-        # Error message:
-        #
-        # Error: OpenCL error: CL_INVALID_COMMAND_QUEUE clFinish failed
-        message(WARNING "Skipping Mullapudi2016's GPU auto-schedules for OpenCL target.")
-    else ()
-        configure_file(${IMAGE} rgb.png COPYONLY)
-        add_test(NAME iir_blur_filter
-                 COMMAND iir_blur_filter rgb.png out.png)
-        set_tests_properties(iir_blur_filter PROPERTIES
-                             LABELS iir_blur
-                             PASS_REGULAR_EXPRESSION "Success!"
-                             SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb.png COPYONLY)
+    add_test(NAME iir_blur_filter
+             COMMAND iir_blur_filter rgb.png out.png)
+    set_tests_properties(iir_blur_filter PROPERTIES
+                         LABELS iir_blur
+                         PASS_REGULAR_EXPRESSION "Success!"
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/iir_blur/filter.cpp b/apps/iir_blur/filter.cpp
index fe0abd45ff79..e94b57c621d9 100644
--- a/apps/iir_blur/filter.cpp
+++ b/apps/iir_blur/filter.cpp
@@ -1,6 +1,6 @@
 #include <cassert>
 #include <cstdio>
-#include <cstdlib>
+#include <regex>
 
 #include "HalideBuffer.h"
 #include "HalideRuntime.h"
@@ -13,12 +13,54 @@
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    USING_METAL_OR_OPENCL,
+    NOT_METAL_OR_OPENCL,
+    METADATA_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = iir_blur_auto_schedule_metadata()->target;
+    if (hl_target == nullptr) {
+        printf("Warning: variable *_metadata()->target not specified. "
+               "Proceeding to the tests...\n");
+        return METADATA_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"metal|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: clFinish timeout.
+        //
+        // Metal: copy_to_host() failed. Error
+        // Domain=MTLCommandBufferErrorDomain Code=2 "Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)"
+        // UserInfo={NSLocalizedDescription=Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)}
+        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
+               "generates copy_to_host() function calls that timeout. "
+               "Target = %s. Skipping...\n",
+               hl_target);
+
+        return USING_METAL_OR_OPENCL;
+    }
+
+    return NOT_METAL_OR_OPENCL;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc != 3) {
         printf("Usage: %s in out\n", argv[0]);
         return 1;
     }
 
+    if (ensure_cuda_device() == USING_METAL_OR_OPENCL) {
+        return 0;
+    }
+
     Halide::Runtime::Buffer<float, 3> input = load_and_convert_image(argv[1]);
     Halide::Runtime::Buffer<float, 3> output(input.width(), input.height(), input.channels());
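The `[SKIP]` prefix printed by filter.cpp above and the `SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"` property in the CMake tests work as a pair: the binary exits with status 0, and CTest reclassifies the passing run as skipped when the marker appears in the output. A minimal sketch of that protocol follows (the hard-coded target string is hypothetical; the real filter reads it from the generated *_metadata()->target as shown above):

    #include <cstdio>
    #include <cstring>

    int main() {
        const char *target = "host-metal";  // hypothetical target string
        if (std::strstr(target, "metal") != nullptr) {
            // CTest matches this marker against SKIP_REGULAR_EXPRESSION.
            std::printf("[SKIP] GPU schedule not supported on %s.\n", target);
            return 0;  // exit code stays 0; the output decides the verdict
        }
        std::printf("Success!\n");  // matched by PASS_REGULAR_EXPRESSION
        return 0;
    }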
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 46be29788591..aa99e55be6fd 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -14,12 +14,29 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(lens_blur.generator SOURCES lens_blur_generator.cpp)
 
+set(_lens_blur_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Buildbot tests.
+    list(APPEND _lens_blur_autoscheduler_params
+         autoscheduler.last_level_cache_size=10000
+    )
+elseif(Halide_TARGET MATCHES "opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _lens_blur_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_lens_blur_autoscheduler_params})
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
@@ -32,26 +49,11 @@ target_link_libraries(lens_blur_filter
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb_small.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "metal")
-        # Note(antonysigma): Buildbot error message:
-        #
-        # 2025-06-30 23:26:02.260 lens_blur_filter[32272:21031150] Metal API Validation
-        # Enabled -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
-        # failed assertion `(threadsPerThreadgroup.width(32) *
-        # threadsPerThreadgroup.height(32) * threadsPerThreadgroup.depth(1))(1024) must
-        # be <= 896. (kernel threadgroup size limit)'
-        #
-        # Possible root cause: Autoscheduler's GPUTilingDedup::max_n_threads is
-        # hardcoded to 1024 threads per block. The OSX Metal API caps the value at 836
-        # threads per block because of the register pressure in lens_blur's GPU kernel.
-        message ("Pipeline lens_blur_auto_schedule skipped for target host-metal")
-    else ()
-        configure_file(${IMAGE} rgb_small.png COPYONLY)
-        add_test(NAME lens_blur_filter
-                 COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
-        set_tests_properties(lens_blur_filter PROPERTIES
-                             LABELS lens_blur
-                             PASS_REGULAR_EXPRESSION "Success!"
-                             SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb_small.png COPYONLY)
+    add_test(NAME lens_blur_filter
+             COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
+    set_tests_properties(lens_blur_filter PROPERTIES
+                         LABELS lens_blur
+                         PASS_REGULAR_EXPRESSION "Success!"
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 068060ad83b8..a81978731e14 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -21,10 +21,23 @@ add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    GENERATOR local_laplacian
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # When target=host-cuda or host-metal, limit the GPU shared
-                   # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1
-                   )
+                   # note(antonysigma): Works on CUDA and CPU targets, but not
+                   # others. Error messages if we don't skip the test:
+                   #
+                   # OpenCL error: CL_INVALID_WORK_GROUP_SIZE
+                   # clEnqueueNDRangeKernel failed
+                   #
+                   # 2025-07-17 17:24:32.170
+                   # local_laplacian_process[63513:6587844] Metal API Validation
+                   # Enabled -[MTLDebugComputeCommandEncoder
+                   # _validateThreadsPerThreadgroup:]:1266: failed assertion
+                   # `(threadsPerThreadgroup.width(62) *
+                   # threadsPerThreadgroup.height(32)
+                   # * threadsPerThreadgroup.depth(1))(1984) must be <= 1024.
+                   # (device threadgroup size limit)'
+                   #
+                   # Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
+                   PARAMS autoscheduler.experimental_gpu_schedule=0)
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
index 2a64a719209f..dabef9b07159 100644
--- a/apps/stencil_chain/CMakeLists.txt
+++ b/apps/stencil_chain/CMakeLists.txt
@@ -14,11 +14,24 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp)
 
+set(_stencil_chain_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _stencil_chain_autoscheduler_params
+         autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_stencil_chain_autoscheduler_params}
+                   )
 
 # Main executable
 add_executable(stencil_chain_process process.cpp)
diff --git a/apps/unsharp/CMakeLists.txt b/apps/unsharp/CMakeLists.txt
index 7153dfbf6a4a..66fdb1afa285 100644
--- a/apps/unsharp/CMakeLists.txt
+++ b/apps/unsharp/CMakeLists.txt
@@ -14,11 +14,31 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)
 
+set(_unsharp_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|opencl|vulkan")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Buildbot tests.
+    list(APPEND _unsharp_autoscheduler_params
+         autoscheduler.last_level_cache_size=20000
+    )
+elseif(Halide_TARGET MATCHES "metal")
+    # Resolving Metal error regarding the threads per GPU block limit:
+    #
+    # -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
+    # failed assertion `(threadsPerThreadgroup.width(70) *
+    # threadsPerThreadgroup.height(8) * threadsPerThreadgroup.depth(1))(560)
+    # must be <= 448. (kernel threadgroup size limit)`
+    list(APPEND _unsharp_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000)
+endif()
+
 # Filters
 add_halide_library(unsharp FROM unsharp.generator)
 add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                    GENERATOR unsharp
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_unsharp_autoscheduler_params})
 
 # Main executable
 add_executable(unsharp_filter filter.cpp)
diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
index 1910d90f04cf..2fd6a5b40ca3 100644
--- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
+++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -123,6 +123,21 @@ string get_sanitized_name(string name) {
     return name;
 }
 
+// Similar to std::replace, but assumes the vector contains unique values; if
+// the element is absent, appends the new value to the end of the vector.
+void replace_or_emplace(std::vector<VarOrRVar> &dims_, const VarOrRVar &before, VarOrRVar after) {
+    auto iter = std::find_if(dims_.begin(), dims_.end(),
+                             [before_name = before.name()](const VarOrRVar &d) {
+                                 return d.name() == before_name;
+                             });
+    const bool is_found = (iter != dims_.end());
+    if (is_found) {
+        *iter = std::move(after);
+    } else {
+        dims_.emplace_back(std::move(after));
+    }
+}
+
 // Representation of a function stage in the pipeline.
 struct FStage {
     Function func;
@@ -1353,7 +1368,7 @@ class GPUTilingDedup {
     }
 
     /** Generate Halide GPU schedules. */
-    void apply(AutoSchedule &sched) {
+    void apply(AutoSchedule &sched, const Expr &parallelism) {
        if (!ordering.empty() && !is_initial_order) {
            std::set<std::string> var_list;
            for (const auto &v : ordering) {
                var_list.insert(v.name());
@@ -1381,7 +1396,7 @@
        }
 
        GPUTileHelper helper{f, stage_num};
-       Expr threads_budget = max_n_threads;
+       Expr threads_budget = min(parallelism, max_n_threads);
 
        // Maximize GPU thread occupancy with the grid-stride loop.
        //
@@ -1408,14 +1423,8 @@
 
        const auto &[var, entry] = *iter;
 
-       const bool should_unroll = can_prove(entry.factor <= 1);
-       if (should_unroll) {
-           // Skip thread size of 1.
-           continue;
-       }
-
        split_info new_entry{entry};
-       new_entry.factor = simplify(min(threads_budget, new_entry.factor));
+       new_entry.factor = simplify(min(threads_budget, entry.factor));
 
        const bool can_split = helper.try_split(new_entry);
        if (!can_split) {
@@ -1426,8 +1435,11 @@
            threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
        }
 
-       if (!is_already_split) {
-           helper.commit(sched, is_compute_at);
+       helper.commit(sched, is_compute_at);
+       if (is_compute_at) {
+           // There are dimensions that do not need splitting but are marked
+           // as vectorizable. Mark them as GPU threads.
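The new `replace_or_emplace` helper is what lets `generate_group_cpu_schedule` (below) re-split a dimension that is already present in `outer_dims` without creating duplicates. A minimal illustration of its semantics, using `std::string` stand-ins for `VarOrRVar` (the real helper compares `VarOrRVar::name()`, as shown above):

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <vector>

    // Stand-in for the VarOrRVar version: replace a matching element in
    // place, or append when no match exists.
    void replace_or_emplace(std::vector<std::string> &dims,
                            const std::string &before, std::string after) {
        auto iter = std::find(dims.begin(), dims.end(), before);
        if (iter != dims.end()) {
            *iter = std::move(after);  // present (values are unique): replace
        } else {
            dims.emplace_back(std::move(after));  // absent: append
        }
    }

    int main() {
        std::vector<std::string> outer_dims{"x", "y"};
        replace_or_emplace(outer_dims, "y", "yo");  // "y" found -> {"x", "yo"}
        replace_or_emplace(outer_dims, "z", "zo");  // "z" absent -> {"x", "yo", "zo"}
        assert((outer_dims == std::vector<std::string>{"x", "yo", "zo"}));
    }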
+           mark_gpu_threads(sched);
        }
 
        // After calling `gpu_tiles` from `GPUTileHelper::commit()`, a few of
@@ -2192,7 +2204,7 @@ Partitioner::find_best_tile_config(const Group &g) {
     Group no_tile = g;
     no_tile.tile_sizes = no_tile_config;
 
-    bool show_analysis = false;
+    constexpr bool show_analysis = false;
     GroupAnalysis no_tile_analysis = analyze_group(no_tile, show_analysis);
     GroupAnalysis best_analysis = no_tile_analysis;
@@ -2215,7 +2227,7 @@
         Expr benefit = estimate_benefit(best_analysis, new_analysis,
                                         no_redundant_work, true);
 
-        if (show_analysis) {
+        if constexpr (show_analysis) {
             debug(0) << "Benefit relative to not tiling:" << benefit << "\n";
             debug(0) << "Best analysis:" << new_analysis;
             debug(0) << "No tile analysis:" << no_tile_analysis;
@@ -3421,11 +3433,13 @@ void Partitioner::generate_group_cpu_schedule(
                 }
             }
             if (arch_params.is_gpu_schedule) {
-                auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
+                const Expr gpu_threads = simplify(min(iter->second, arch_params.parallelism / def_par));
+                auto parallelized_split = gpu_tiling.can_parallelize(v, gpu_threads);
                 if (parallelized_split) {
                     auto split_vars = *parallelized_split;
                     inner_dims.emplace_back(split_vars.inner);
-                    outer_dims.emplace_back(split_vars.outer);
+
+                    replace_or_emplace(outer_dims, v, split_vars.outer);
                 }
             } else {
                 f_handle.parallel(v);
@@ -3444,7 +3458,7 @@
     }
 
     if (arch_params.is_gpu_schedule) {
-        gpu_tiling.apply(sched);
+        gpu_tiling.apply(sched, arch_params.parallelism);
     }
 
     // Find the level at which group members will be computed.
@@ -3533,7 +3547,7 @@
                                  mem_rvars, mem_estimates, sched, gpu_tiling2);
 
         if (arch_params.is_gpu_schedule) {
-            gpu_tiling2.apply(sched);
+            gpu_tiling2.apply(sched, arch_params.parallelism);
         }
     }
 }
diff --git a/test/autoschedulers/mullapudi2016/large_window.cpp b/test/autoschedulers/mullapudi2016/large_window.cpp
index fd2a1491c56b..c163d19f339c 100644
--- a/test/autoschedulers/mullapudi2016/large_window.cpp
+++ b/test/autoschedulers/mullapudi2016/large_window.cpp
@@ -47,7 +47,13 @@ int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
     Pipeline p(g);
 
-    p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+    constexpr Mullapudi2016TestParams gpu_specifications{
+        /* .last_level_cache_size = */ 35'000,
+        /* .parallelism = */ 128,
+    };
+
+    p.apply_autoscheduler(target,
+                          get_mullapudi2016_test_params(target.has_gpu_feature(), {gpu_specifications}));
 
     // Inspect the schedule (only for debugging))
     // g.print_loop_nest();
diff --git a/test/autoschedulers/mullapudi2016/reorder.cpp b/test/autoschedulers/mullapudi2016/reorder.cpp
index 18a3a332ebc6..39cce5264b65 100644
--- a/test/autoschedulers/mullapudi2016/reorder.cpp
+++ b/test/autoschedulers/mullapudi2016/reorder.cpp
@@ -82,21 +82,11 @@ double run_test_2(bool auto_schedule) {
         // Provide estimates on the pipeline output
         diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}});
 
-        // Auto-schedule the pipeline
-        //
-        // Increasing the GPU's active warp count estimate (aka parallelism)
-        // from 128 to 2048 to disable the Autoscheduler's grid-stride loop
-        // feature. At small parallelism value, the autoscheduler correctly
-        // designates dimension 'z' as the stride axis in the GPU grid-stride
-        // loop, which improves thread occupancy.
-        // However, it fails to reorder
-        // 'z' inside the gpu_blocks 'xo' and 'yo', which is required for proper
-        // loop nesting and successful code generation.
-        //
-        // Reference:
-        // https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+        // note(antonysigma): Reducing the GPU's shared memory size estimate
+        // so that the GPU kernel can launch on consumer-grade GPUs.
         constexpr Mullapudi2016TestParams gpu_specifications{
-            /* .last_level_cache_size = */ 47'000,
-            /* .parallelism = */ 2048,
+            /* .last_level_cache_size = */ 20'000,
+            /* .parallelism = */ 128,
         };
 
         p.apply_autoscheduler(
@@ -139,16 +129,9 @@ double run_test_3(bool auto_schedule) {
     if (auto_schedule) {
         // Provide estimates on the pipeline output
         r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}});
-        // Auto-schedule the pipeline
-        //
-        // Disabling this experimental GPU feature because the autoscheduler correctly
-        // identifies reduction domain 'r.x' as the stride axis for the GPU grid-stride loop,
-        // which helps retain threads efficiently. However, it fails to reorder 'r.x'
-        // inside the loop nests of gpu_blocks 'xo' and 'yo', which is necessary for
-        // successful code generation.
-        //
-        // Reference: https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
-        p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+
+        p.apply_autoscheduler(target,
+                              get_mullapudi2016_test_params(target.has_gpu_feature()));
     } else {
         Var par("par");
         r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4);
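For reference, the grid-stride loop that `GPUTilingDedup::apply` budgets threads for, and that the removed comments in reorder.cpp cite, follows the pattern from the linked NVIDIA article. A generic CUDA sketch, not autoscheduler output:

    // Each thread starts at its global index and strides by the total grid
    // width, so a fixed-size grid covers an arbitrary extent n.
    __global__ void saxpy(int n, float a, const float *x, float *y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
             i += blockDim.x * gridDim.x) {
            y[i] = a * x[i] + y[i];
        }
    }

Capping `threads_budget` at `min(parallelism, max_n_threads)` keeps the product of the per-dimension split factors within both the device's threads-per-block limit and the user-supplied parallelism estimate.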