diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index d7a44a2cda9b..a08c6b825454 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -14,21 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 
+set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _bgu_autoscheduler_params
+         autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-# Note(antonysigma): experimental GPU schedule failed on the Buildbot worker
-# "halide-testbranch-main-llvm18-x86-64-linux-cmake" with error:
-#
-# CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS cuCtxSynchronize failed
-#
-# Curiously, it works on a low-end GPU: Nvidia GTX 1660S.
-#
-# Uncomment the following code to debug. PARAMS
-# autoscheduler.experimental_gpu_schedule=1
-)
+                   PARAMS ${_bgu_autoscheduler_params})
 
 # Main executable
 add_executable(bgu_filter filter.cpp)
diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
index 94bf7a1ae447..ef36ff938d4e 100644
--- a/apps/camera_pipe/CMakeLists.txt
+++ b/apps/camera_pipe/CMakeLists.txt
@@ -16,11 +16,29 @@ add_halide_generator(camera_pipe.generator
                      SOURCES camera_pipe_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
+set(_camera_pipe_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Buildbot tests.
+    list(APPEND _camera_pipe_autoscheduler_params
+         autoscheduler.last_level_cache_size=10000
+    )
+elseif(Halide_TARGET MATCHES "opencl|vulkan")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _camera_pipe_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_camera_pipe_autoscheduler_params})
 
 # Main executable
 add_executable(camera_pipe_process process.cpp)
diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
index b6f95e554383..dff4b192816f 100644
--- a/apps/harris/CMakeLists.txt
+++ b/apps/harris/CMakeLists.txt
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(harris.generator SOURCES harris_generator.cpp)
 
+set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "opencl|metal|cuda|vulkan")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _harris_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_harris_autoscheduler_params})
 
 # Main executable
 add_executable(harris_filter filter.cpp)
diff --git a/apps/harris/harris_generator.cpp b/apps/harris/harris_generator.cpp
index 69cf8c05c68c..bd35b4ae25d4 100644
--- a/apps/harris/harris_generator.cpp
+++ b/apps/harris/harris_generator.cpp
@@ -66,7 +66,7 @@ class Harris : public Halide::Generator<Harris> {
         const int kHeight = 2560;
         input.dim(0).set_estimate(0, kWidth);
         input.dim(1).set_estimate(0, kHeight);
-        input.dim(2).set_estimate(0, 3);
+        input.dim(2).set_estimate(0, 4);
         output.dim(0).set_estimate(3, kWidth - 6);
         output.dim(1).set_estimate(3, kHeight - 6);
     }
diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
index 0ca3233968f3..474af15dcf35 100644
--- a/apps/iir_blur/CMakeLists.txt
+++ b/apps/iir_blur/CMakeLists.txt
@@ -32,18 +32,11 @@ target_link_libraries(iir_blur_filter PRIVATE
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "opencl")
-        # Error message:
-        #
-        # Error: OpenCL error: CL_INVALID_COMMAND_QUEUE clFinish failed
-        message(WARNING "Skipping Mullapudi2016's GPU auto-schedules for OpenCL target.")
-    else ()
-        configure_file(${IMAGE} rgb.png COPYONLY)
-        add_test(NAME iir_blur_filter
-                 COMMAND iir_blur_filter rgb.png out.png)
-        set_tests_properties(iir_blur_filter PROPERTIES
-                             LABELS iir_blur
-                             PASS_REGULAR_EXPRESSION "Success!"
-                             SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb.png COPYONLY)
+    add_test(NAME iir_blur_filter
+             COMMAND iir_blur_filter rgb.png out.png)
+    set_tests_properties(iir_blur_filter PROPERTIES
+                         LABELS iir_blur
+                         PASS_REGULAR_EXPRESSION "Success!"
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/iir_blur/filter.cpp b/apps/iir_blur/filter.cpp
index fe0abd45ff79..e94b57c621d9 100644
--- a/apps/iir_blur/filter.cpp
+++ b/apps/iir_blur/filter.cpp
@@ -1,6 +1,6 @@
 #include <cassert>
 #include <cstdio>
-#include <cstdlib>
+#include <regex>
 
 #include "HalideBuffer.h"
 #include "HalideRuntime.h"
@@ -13,12 +13,54 @@
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    USING_METAL_OR_OPENCL,
+    NOT_METAL_OR_OPENCL,
+    METADATA_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = iir_blur_auto_schedule_metadata()->target;
+    if (hl_target == nullptr) {
+        printf("Warning: variable *_metadata()->target not specified. "
+               "Proceeding to the tests...\n");
+        return METADATA_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"metal|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: clFinish timeout.
+        //
+        // Metal: copy_to_host() failed. Error
+        // Domain=MTLCommandBufferErrorDomain Code=2 "Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)"
+        // UserInfo={NSLocalizedDescription=Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)}
+        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
+               "generates copy_to_host() function calls that timeout. "
+               "Target = %s. Skipping...\n",
+               hl_target);
+
+        return USING_METAL_OR_OPENCL;
+    }
+
+    return NOT_METAL_OR_OPENCL;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc != 3) {
         printf("Usage: %s in out\n", argv[0]);
         return 1;
     }
 
+    if (ensure_cuda_device() == USING_METAL_OR_OPENCL) {
+        return 0;
+    }
+
     Halide::Runtime::Buffer<float, 3> input = load_and_convert_image(argv[1]);
     Halide::Runtime::Buffer<float, 3> output(input.width(), input.height(), input.channels());
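The `[SKIP]` prefix printed by filter.cpp above and the `SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"` property in the CMake tests work as a pair: the binary exits with status 0, and CTest reclassifies the passing run as skipped when the marker appears in the output. A minimal sketch of that protocol follows (the hard-coded target string is hypothetical; the real filter reads it from the generated *_metadata()->target as shown above):

    #include <cstdio>
    #include <cstring>

    int main() {
        const char *target = "host-metal";  // hypothetical target string
        if (std::strstr(target, "metal") != nullptr) {
            // CTest matches this marker against SKIP_REGULAR_EXPRESSION.
            std::printf("[SKIP] GPU schedule not supported on %s.\n", target);
            return 0;  // exit code stays 0; the output decides the verdict
        }
        std::printf("Success!\n");  // matched by PASS_REGULAR_EXPRESSION
        return 0;
    }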
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 46be29788591..aa99e55be6fd 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -14,12 +14,29 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(lens_blur.generator SOURCES lens_blur_generator.cpp)
 
+set(_lens_blur_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Buildbot tests.
+    list(APPEND _lens_blur_autoscheduler_params
+         autoscheduler.last_level_cache_size=10000
+    )
+elseif(Halide_TARGET MATCHES "opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _lens_blur_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_lens_blur_autoscheduler_params})
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
@@ -32,26 +49,11 @@ target_link_libraries(lens_blur_filter
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb_small.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "metal")
-        # Note(antonysigma): Buildbot error message:
-        #
-        # 2025-06-30 23:26:02.260 lens_blur_filter[32272:21031150] Metal API Validation
-        # Enabled -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
-        # failed assertion `(threadsPerThreadgroup.width(32) *
-        # threadsPerThreadgroup.height(32) * threadsPerThreadgroup.depth(1))(1024) must
-        # be <= 896. (kernel threadgroup size limit)'
-        #
-        # Possible root cause: Autoscheduler's GPUTilingDedup::max_n_threads is
-        # hardcoded to 1024 threads per block. The OSX Metal API caps the value at 836
-        # threads per block because of the register pressure in lens_blur's GPU kernel.
-        message ("Pipeline lens_blur_auto_schedule skipped for target host-metal")
-    else ()
-        configure_file(${IMAGE} rgb_small.png COPYONLY)
-        add_test(NAME lens_blur_filter
-                 COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
-        set_tests_properties(lens_blur_filter PROPERTIES
-                             LABELS lens_blur
-                             PASS_REGULAR_EXPRESSION "Success!"
-                             SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb_small.png COPYONLY)
+    add_test(NAME lens_blur_filter
+             COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
+    set_tests_properties(lens_blur_filter PROPERTIES
+                         LABELS lens_blur
+                         PASS_REGULAR_EXPRESSION "Success!"
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 068060ad83b8..a81978731e14 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -21,10 +21,23 @@ add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    GENERATOR local_laplacian
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # When target=host-cuda or host-metal, limit the GPU shared
-                   # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1
-                   )
+                   # note(antonysigma): Works on CUDA and CPU targets, but not
+                   # others. Error messages if we don't skip the test:
+                   #
+                   # OpenCL error: CL_INVALID_WORK_GROUP_SIZE
+                   # clEnqueueNDRangeKernel failed
+                   #
+                   # 2025-07-17 17:24:32.170
+                   # local_laplacian_process[63513:6587844] Metal API Validation
+                   # Enabled -[MTLDebugComputeCommandEncoder
+                   # _validateThreadsPerThreadgroup:]:1266: failed assertion
+                   # `(threadsPerThreadgroup.width(62) *
+                   # threadsPerThreadgroup.height(32)
+                   # * threadsPerThreadgroup.depth(1))(1984) must be <= 1024.
+                   # (device threadgroup size limit)'
+                   #
+                   # Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
+                   PARAMS autoscheduler.experimental_gpu_schedule=0)
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
index 2a64a719209f..dabef9b07159 100644
--- a/apps/stencil_chain/CMakeLists.txt
+++ b/apps/stencil_chain/CMakeLists.txt
@@ -14,11 +14,24 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp)
 
+set(_stencil_chain_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _stencil_chain_autoscheduler_params
+         autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_stencil_chain_autoscheduler_params}
+                   )
 
 # Main executable
 add_executable(stencil_chain_process process.cpp)
diff --git a/apps/unsharp/CMakeLists.txt b/apps/unsharp/CMakeLists.txt
index 7153dfbf6a4a..66fdb1afa285 100644
--- a/apps/unsharp/CMakeLists.txt
+++ b/apps/unsharp/CMakeLists.txt
@@ -14,11 +14,31 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)
 
+set(_unsharp_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|opencl|vulkan")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Buildbot tests.
+    list(APPEND _unsharp_autoscheduler_params
+         autoscheduler.last_level_cache_size=20000
+    )
+elseif(Halide_TARGET MATCHES "metal")
+    # Resolving Metal error regarding the threads per GPU block limit:
+    #
+    # -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
+    # failed assertion `(threadsPerThreadgroup.width(70) *
+    # threadsPerThreadgroup.height(8) * threadsPerThreadgroup.depth(1))(560)
+    # must be <= 448. (kernel threadgroup size limit)`
+    list(APPEND _unsharp_autoscheduler_params
+         autoscheduler.last_level_cache_size=1000)
+endif()
+
 # Filters
 add_halide_library(unsharp FROM unsharp.generator)
 add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                    GENERATOR unsharp
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_unsharp_autoscheduler_params})
 
 # Main executable
 add_executable(unsharp_filter filter.cpp)
diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
index 1910d90f04cf..2fd6a5b40ca3 100644
--- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
+++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -123,6 +123,21 @@ string get_sanitized_name(string name) {
     return name;
 }
 
+// Similar to std::replace, but assumes the vector contains unique values; if
+// the element is absent, appends the new value to the end of the vector.
+void replace_or_emplace(std::vector<VarOrRVar> &dims_, const VarOrRVar &before, VarOrRVar after) {
+    auto iter = std::find_if(dims_.begin(), dims_.end(),
+                             [before_name = before.name()](const VarOrRVar &d) {
+                                 return d.name() == before_name;
+                             });
+    const bool is_found = (iter != dims_.end());
+    if (is_found) {
+        *iter = std::move(after);
+    } else {
+        dims_.emplace_back(std::move(after));
+    }
+}
+
 // Representation of a function stage in the pipeline.
 struct FStage {
     Function func;
@@ -1353,7 +1368,7 @@ class GPUTilingDedup {
     }
 
     /** Generate Halide GPU schedules. */
-    void apply(AutoSchedule &sched) {
+    void apply(AutoSchedule &sched, const Expr &parallelism) {
        if (!ordering.empty() && !is_initial_order) {
            std::set<std::string> var_list;
            for (const auto &v : ordering) {
                var_list.insert(v.name());
@@ -1381,7 +1396,7 @@
        }
 
        GPUTileHelper helper{f, stage_num};
-       Expr threads_budget = max_n_threads;
+       Expr threads_budget = min(parallelism, max_n_threads);
 
        // Maximize GPU thread occupancy with the grid-stride loop.
        //
@@ -1408,14 +1423,8 @@
 
        const auto &[var, entry] = *iter;
 
-       const bool should_unroll = can_prove(entry.factor <= 1);
-       if (should_unroll) {
-           // Skip thread size of 1.
-           continue;
-       }
-
        split_info new_entry{entry};
-       new_entry.factor = simplify(min(threads_budget, new_entry.factor));
+       new_entry.factor = simplify(min(threads_budget, entry.factor));
 
        const bool can_split = helper.try_split(new_entry);
        if (!can_split) {
@@ -1426,8 +1435,11 @@
            threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
        }
 
-       if (!is_already_split) {
-           helper.commit(sched, is_compute_at);
+       helper.commit(sched, is_compute_at);
+       if (is_compute_at) {
+           // There are dimensions that do not need splitting but are marked
+           // as vectorizable. Mark them as GPU threads.
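The new `replace_or_emplace` helper is what lets `generate_group_cpu_schedule` (below) re-split a dimension that is already present in `outer_dims` without creating duplicates. A minimal illustration of its semantics, using `std::string` stand-ins for `VarOrRVar` (the real helper compares `VarOrRVar::name()`, as shown above):

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <vector>

    // Stand-in for the VarOrRVar version: replace a matching element in
    // place, or append when no match exists.
    void replace_or_emplace(std::vector<std::string> &dims,
                            const std::string &before, std::string after) {
        auto iter = std::find(dims.begin(), dims.end(), before);
        if (iter != dims.end()) {
            *iter = std::move(after);  // present (values are unique): replace
        } else {
            dims.emplace_back(std::move(after));  // absent: append
        }
    }

    int main() {
        std::vector<std::string> outer_dims{"x", "y"};
        replace_or_emplace(outer_dims, "y", "yo");  // "y" found -> {"x", "yo"}
        replace_or_emplace(outer_dims, "z", "zo");  // "z" absent -> {"x", "yo", "zo"}
        assert((outer_dims == std::vector<std::string>{"x", "yo", "zo"}));
    }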
+           mark_gpu_threads(sched);
        }
 
        // After calling `gpu_tiles` from `GPUTileHelper::commit()`, a few of
@@ -2192,7 +2204,7 @@ Partitioner::find_best_tile_config(const Group &g) {
     Group no_tile = g;
     no_tile.tile_sizes = no_tile_config;
 
-    bool show_analysis = false;
+    constexpr bool show_analysis = false;
     GroupAnalysis no_tile_analysis = analyze_group(no_tile, show_analysis);
     GroupAnalysis best_analysis = no_tile_analysis;
@@ -2215,7 +2227,7 @@
         Expr benefit = estimate_benefit(best_analysis, new_analysis,
                                         no_redundant_work, true);
 
-        if (show_analysis) {
+        if constexpr (show_analysis) {
             debug(0) << "Benefit relative to not tiling:" << benefit << "\n";
             debug(0) << "Best analysis:" << new_analysis;
             debug(0) << "No tile analysis:" << no_tile_analysis;
@@ -3421,11 +3433,13 @@ void Partitioner::generate_group_cpu_schedule(
                 }
             }
             if (arch_params.is_gpu_schedule) {
-                auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
+                const Expr gpu_threads = simplify(min(iter->second, arch_params.parallelism / def_par));
+                auto parallelized_split = gpu_tiling.can_parallelize(v, gpu_threads);
                 if (parallelized_split) {
                     auto split_vars = *parallelized_split;
                     inner_dims.emplace_back(split_vars.inner);
-                    outer_dims.emplace_back(split_vars.outer);
+
+                    replace_or_emplace(outer_dims, v, split_vars.outer);
                 }
             } else {
                 f_handle.parallel(v);
@@ -3444,7 +3458,7 @@
     }
 
     if (arch_params.is_gpu_schedule) {
-        gpu_tiling.apply(sched);
+        gpu_tiling.apply(sched, arch_params.parallelism);
     }
 
     // Find the level at which group members will be computed.
@@ -3533,7 +3547,7 @@
                                  mem_rvars, mem_estimates, sched, gpu_tiling2);
 
         if (arch_params.is_gpu_schedule) {
-            gpu_tiling2.apply(sched);
+            gpu_tiling2.apply(sched, arch_params.parallelism);
         }
     }
 }
diff --git a/test/autoschedulers/mullapudi2016/large_window.cpp b/test/autoschedulers/mullapudi2016/large_window.cpp
index fd2a1491c56b..c163d19f339c 100644
--- a/test/autoschedulers/mullapudi2016/large_window.cpp
+++ b/test/autoschedulers/mullapudi2016/large_window.cpp
@@ -47,7 +47,13 @@ int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
     Pipeline p(g);
 
-    p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+    constexpr Mullapudi2016TestParams gpu_specifications{
+        /* .last_level_cache_size = */ 35'000,
+        /* .parallelism = */ 128,
+    };
+
+    p.apply_autoscheduler(target,
+                          get_mullapudi2016_test_params(target.has_gpu_feature(), {gpu_specifications}));
 
     // Inspect the schedule (only for debugging))
     // g.print_loop_nest();
diff --git a/test/autoschedulers/mullapudi2016/reorder.cpp b/test/autoschedulers/mullapudi2016/reorder.cpp
index 18a3a332ebc6..39cce5264b65 100644
--- a/test/autoschedulers/mullapudi2016/reorder.cpp
+++ b/test/autoschedulers/mullapudi2016/reorder.cpp
@@ -82,21 +82,11 @@ double run_test_2(bool auto_schedule) {
         // Provide estimates on the pipeline output
         diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}});
 
-        // Auto-schedule the pipeline
-        //
-        // Increasing the GPU's active warp count estimate (aka parallelism)
-        // from 128 to 2048 to disable the Autoscheduler's grid-stride loop
-        // feature. At small parallelism value, the autoscheduler correctly
-        // designates dimension 'z' as the stride axis in the GPU grid-stride
-        // loop, which improves thread occupancy.
-        // However, it fails to reorder
-        // 'z' inside the gpu_blocks 'xo' and 'yo', which is required for proper
-        // loop nesting and successful code generation.
-        //
-        // Reference:
-        // https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+        // note(antonysigma): Reducing the GPU's shared memory size estimate
+        // so that the GPU kernel can launch on consumer-grade GPUs.
         constexpr Mullapudi2016TestParams gpu_specifications{
-            /* .last_level_cache_size = */ 47'000,
-            /* .parallelism = */ 2048,
+            /* .last_level_cache_size = */ 20'000,
+            /* .parallelism = */ 128,
         };
 
         p.apply_autoscheduler(
@@ -139,16 +129,9 @@ double run_test_3(bool auto_schedule) {
     if (auto_schedule) {
         // Provide estimates on the pipeline output
         r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}});
-        // Auto-schedule the pipeline
-        //
-        // Disabling this experimental GPU feature because the autoscheduler correctly
-        // identifies reduction domain 'r.x' as the stride axis for the GPU grid-stride loop,
-        // which helps retain threads efficiently. However, it fails to reorder 'r.x'
-        // inside the loop nests of gpu_blocks 'xo' and 'yo', which is necessary for
-        // successful code generation.
-        //
-        // Reference: https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
-        p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+
+        p.apply_autoscheduler(target,
+                              get_mullapudi2016_test_params(target.has_gpu_feature()));
     } else {
         Var par("par");
         r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4);
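For reference, the grid-stride loop that `GPUTilingDedup::apply` budgets threads for, and that the removed comments in reorder.cpp cite, follows the pattern from the linked NVIDIA article. A generic CUDA sketch, not autoscheduler output:

    // Each thread starts at its global index and strides by the total grid
    // width, so a fixed-size grid covers an arbitrary extent n.
    __global__ void saxpy(int n, float a, const float *x, float *y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
             i += blockDim.x * gridDim.x) {
            y[i] = a * x[i] + y[i];
        }
    }

Capping `threads_budget` at `min(parallelism, max_n_threads)` keeps the product of the per-dimension split factors within both the device's threads-per-block limit and the user-supplied parallelism estimate.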