diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.glsl similarity index 64% rename from backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl rename to backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.glsl index 30f283d6f01..4b087298555 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.glsl @@ -31,15 +31,24 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int reduce_dim = 0; +$if VARIANCE_MODE: + #define VARIANCE_MODE + #define NWORKERS 4 #define MAX_THREADS 16 -shared T shared_sum[NWORKERS]; +shared T shared_accum[NWORKERS]; +#ifdef VARIANCE_MODE shared T shared_sum_sq[NWORKERS]; shared int shared_count[NWORKERS]; +#endif #include "indexing_utils.h" +#define INIT_ACCUM(first_val) ${INIT_ACCUM} +#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM} +#define POSTPROCESS(accum) ${POSTPROCESS} + void main() { const ivec4 out_idx = ivec4( gl_GlobalInvocationID.x, @@ -49,9 +58,11 @@ void main() { const uint tid = gl_LocalInvocationID[reduce_dim]; - shared_sum[tid] = T(0); + shared_accum[tid] = T(0); +#ifdef VARIANCE_MODE shared_sum_sq[tid] = T(0); shared_count[tid] = 0; +#endif barrier(); const int R = in_sizes[reduce_dim]; @@ -65,9 +76,25 @@ void main() { uint len = q + (tid < rem ? 1u : 0u); uint base = tid * q + min(tid, rem); - T sum = T(0); + // Get the first value for initializing the accumulator if needed + T first_val = T(0); + if (R > 0) { + ivec4 first_idx = out_idx; + first_idx[reduce_dim] = 0; + + if (reduce_dim == 2) { + first_idx[reduce_dim + 1] = 0; + } + + first_val = in_buf[tidx_to_bufi(first_idx, in_strides)]; + } + + // Initialize accumulator + T accum = INIT_ACCUM(first_val); +#ifdef VARIANCE_MODE T sum_sq = T(0); int count = 0; +#endif ivec4 in_idx = out_idx; for (uint off = 0u; off < len; ++off) { @@ -83,39 +110,55 @@ void main() { T v = in_buf[tidx_to_bufi(in_idx, in_strides)]; - sum += v; + accum = UPDATE_ACCUM(accum, v); + +#ifdef VARIANCE_MODE sum_sq += v * v; count += 1; +#endif } - shared_sum[tid] = sum; + shared_accum[tid] = accum; +#ifdef VARIANCE_MODE shared_sum_sq[tid] = sum_sq; shared_count[tid] = count; +#endif barrier(); if (tid == 0u) { - T tot_sum = T(0); - T tot_sum_sq = T(0); - int tot_count = 0; + T result = shared_accum[0]; + +#ifdef VARIANCE_MODE + T tot_sum = shared_accum[0]; + T tot_sum_sq = shared_sum_sq[0]; + int tot_count = shared_count[0]; +#endif - for (uint i = 0; i < N; ++i) { - tot_sum += shared_sum[i]; + for (uint i = 1; i < N; ++i) { +#ifdef VARIANCE_MODE + tot_sum += shared_accum[i]; tot_sum_sq += shared_sum_sq[i]; tot_count += shared_count[i]; +#else + result = UPDATE_ACCUM(result, shared_accum[i]); +#endif } - T var; +#ifdef VARIANCE_MODE if (tot_count > 0) { T mean = tot_sum / T(tot_count); - var = (tot_sum_sq / T(tot_count)) - (mean * mean); + result = (tot_sum_sq / T(tot_count)) - (mean * mean); if (pc.unbiased != 0 && tot_count > 1) { - var *= T(tot_count) / T(tot_count - 1); + result *= T(tot_count) / T(tot_count - 1); } - } else{ + } else { // NaN to match PyTorch behavior - var = T(0.0/0.0); + result = T(0.0/0.0); } +#else + result = POSTPROCESS(result); +#endif - out_buf[tidx_to_bufi(out_idx, out_strides)] = var; + out_buf[tidx_to_bufi(out_idx, out_strides)] = result; } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.yaml new file mode 100644 
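The buffer shader above splits a reduction row of length R across NWORKERS threads (each takes q = R / N contiguous elements plus at most one remainder element, starting at base = tid * q + min(tid, rem)) and, in VARIANCE_MODE, finishes with the single-pass formula var = E[x^2] - E[x]^2, applying Bessel's correction when unbiased is set. Below is a rough host-side Python sketch of the same arithmetic, useful as a sanity check against a two-pass reference; the function and variable names are illustrative, not part of the runtime.

# Host-side model of reduce_buffer.glsl in VARIANCE_MODE: each worker
# contributes (sum, sum_sq, count); the tid == 0 thread combines them and
# applies var = E[x^2] - E[x]^2 with Bessel's correction if unbiased.
import random

def shader_style_var(values, nworkers=4, unbiased=True):
    # Partition the reduction row the way the shader does: q elements per
    # worker, with the first `rem` workers taking one extra element.
    R = len(values)
    q, rem = divmod(R, nworkers)
    partials = []
    for tid in range(nworkers):
        length = q + (1 if tid < rem else 0)
        base = tid * q + min(tid, rem)
        chunk = values[base:base + length]
        partials.append((sum(chunk), sum(v * v for v in chunk), len(chunk)))

    # "Main thread" combine, mirroring the tid == 0 branch of the shader.
    tot_sum = sum(p[0] for p in partials)
    tot_sum_sq = sum(p[1] for p in partials)
    tot_count = sum(p[2] for p in partials)
    if tot_count == 0:
        return float("nan")  # the shader writes 0.0/0.0 to match PyTorch
    mean = tot_sum / tot_count
    var = tot_sum_sq / tot_count - mean * mean
    if unbiased and tot_count > 1:
        var *= tot_count / (tot_count - 1)
    return var

if __name__ == "__main__":
    xs = [random.uniform(-1, 1) for _ in range(37)]
    mean = sum(xs) / len(xs)
    two_pass = sum((x - mean) ** 2 for x in xs) / (len(xs) - 1)
    assert abs(shader_style_var(xs, unbiased=True) - two_pass) < 1e-9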
index 00000000000..23186213f2f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.yaml @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +reduce_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + INIT_ACCUM: T(0) + UPDATE_ACCUM: accum + new_val + POSTPROCESS: accum + VARIANCE_MODE: false + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: sum_buffer + - NAME: mean_buffer + POSTPROCESS: (accum / T(in_sizes[reduce_dim])) + - NAME: amax_buffer + INIT_ACCUM: first_val + UPDATE_ACCUM: max(accum, new_val) + POSTPROCESS: accum + - NAME: amin_buffer + INIT_ACCUM: first_val + UPDATE_ACCUM: min(accum, new_val) + POSTPROCESS: accum + - NAME: var_buffer + VARIANCE_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.glsl similarity index 70% rename from backends/vulkan/runtime/graph/ops/glsl/reduce.glsl rename to backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.glsl index 7a6263d9f55..1ce56114375 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.glsl @@ -23,12 +23,19 @@ ${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec3", "tin_limits")} ${layout_declare_ubo(B, "ivec4", "tin_sizes")} +layout(push_constant) uniform PushConstants { + int unbiased; +} pc; + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = 0; layout(constant_id = 4) const int reduce_dim = 0; layout(constant_id = 5) const int group_dim = 1; +$if VARIANCE_MODE: + #define VARIANCE_MODE + // A more verbose name would be NWORKERS_PER_GROUP. This describes the number of // threads that will co-operate to compute one reduction output. There may be // multiple groups computing distinct reduction outputs within one work group. @@ -39,8 +46,11 @@ layout(constant_id = 5) const int group_dim = 1; // work group will write into its assigned element in the shared array. #define MAX_NTHREADS 16 - -shared vec4 shared_vecs[MAX_NTHREADS]; +shared VEC4_T shared_vecs[MAX_NTHREADS]; +// Second accumulator for variance mode - used for sum of values, prev +// accumulator is used for sum of squares +shared VEC4_T shared_sum_sq[MAX_NTHREADS]; +shared int shared_count[MAX_NTHREADS]; #include "indexing_utils.h" @@ -48,6 +58,17 @@ int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; } +VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) { + VEC4_T mean = sum / float(count); + VEC4_T variance = (sum_sq / float(count)) - (mean * mean); + + if ((pc.unbiased != 0) && (count > 1)) { + variance = variance * (float(count) / float(count - 1.0)); + } + + return variance; +} + /* * The functions below compute reduction along a single dimension for a tensor. 
* The shader template generalize reduction by abstracting the initial value of @@ -90,17 +111,31 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos)); + VEC4_T accum = INIT_ACCUM(load_texel(tin, scan_pos)); + +#ifdef VARIANCE_MODE + VEC4_T sum_sq = VEC4_T(0); + int count = 0; +#endif scan_pos[reduce_dim] = tid.x; // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of // the reduction row for (int i = tid.x; i < tin_sizes[reduce_dim]; i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); + VEC4_T val = load_texel(tin, scan_pos); + accum = UPDATE_ACCUM(accum, val); +#ifdef VARIANCE_MODE + sum_sq += val * val; + count += 1; +#endif } // Write partial output to shared memory and synchronize work group shared_vecs[smi] = accum; +#ifdef VARIANCE_MODE + shared_sum_sq[smi] = sum_sq; + shared_count[smi] = count; +#endif barrier(); // Since the reduction row is reduced to only one element, only the "main" @@ -108,9 +143,18 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { if (tid.x == 0) { // Iterate over the partial outputs to obtain the overall output int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); + accum = shared_vecs[group_i]; +#ifdef VARIANCE_MODE + sum_sq = shared_sum_sq[group_i]; + count = shared_count[group_i]; +#endif + for (int i = 1; i < NWORKERS; i++) { + int idx = tid.y * NWORKERS + i; + accum = UPDATE_ACCUM(accum, shared_vecs[idx]); +#ifdef VARIANCE_MODE + sum_sq += shared_sum_sq[idx]; + count += shared_count[idx]; +#endif } // Determine if there are any padding elements in the final texel of the @@ -121,14 +165,27 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { const bool is_last_texel = scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); +#ifdef VARIANCE_MODE + VEC4_T variance = calculate_variance(accum, sum_sq, count); +#endif + // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { [[unroll]] for (int i = nspill; i < 4; i++) { +#ifdef VARIANCE_MODE + variance[i] = 0; +#else accum[i] = 0; +#endif } } + scan_pos[reduce_dim] = tid.x; +#ifdef VARIANCE_MODE + write_texel(tout, scan_pos, variance); +#else write_texel(tout, scan_pos, POSTPROCESS(accum)); +#endif } } @@ -151,26 +208,44 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int reduce_len = tin_sizes[packed_dim] - nspill; scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); + VEC4_T accum = INIT_ACCUM(VEC4_T(load_texel(tin, scan_pos).x)); + +#ifdef VARIANCE_MODE + VEC4_T sum_sq = VEC4_T(0); + int count = 0; +#endif // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... 
of // the reduction row scan_pos[reduce_dim] = tid.x; for (int i = tid.x * 4; i < reduce_len; i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); + VEC4_T val = load_texel(tin, scan_pos); + accum = UPDATE_ACCUM(accum, val); +#ifdef VARIANCE_MODE + sum_sq += val * val; + count += 4; // Each texel has 4 elements +#endif } // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const vec4 intex = load_texel(tin, scan_pos); + const VEC4_T val = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { - accum.x = UPDATE_ACCUM(accum.x, intex[i]); + accum.x = UPDATE_ACCUM(accum.x, val[i]); +#ifdef VARIANCE_MODE + sum_sq.x += val[i] * val[i]; + count += 1; +#endif } } // Write partial output to shared memory and synchronize work group shared_vecs[smi] = accum; +#ifdef VARIANCE_MODE + shared_sum_sq[smi] = sum_sq; + shared_count[smi] = count; +#endif barrier(); // Since the reduction row is reduced to only one element, only the "main" @@ -178,10 +253,35 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { if (tid.x == 0) { // Iterate over the partial maximums to obtain the overall maximum int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; + accum = shared_vecs[group_i]; +#ifdef VARIANCE_MODE + sum_sq = shared_sum_sq[group_i]; + count = shared_count[group_i]; +#endif for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); + int idx = tid.y * NWORKERS + i; + accum = UPDATE_ACCUM(accum, shared_vecs[idx]); +#ifdef VARIANCE_MODE + sum_sq += shared_sum_sq[idx]; + count += shared_count[idx]; +#endif } + +#ifdef VARIANCE_MODE + float total_sum = accum.x + accum.y + accum.z + accum.w; + float total_sum_sq = sum_sq.x + sum_sq.y + sum_sq.z + sum_sq.w; + int total_count = count; + + float mean = total_sum / float(total_count); + float variance = (total_sum_sq / float(total_count)) - (mean * mean); + + if ((pc.unbiased != 0) && (total_count > 1)) { + variance = variance * (float(total_count) / float(total_count - 1.0)); + } + + scan_pos[reduce_dim] = tid.x; + write_texel(tout, scan_pos, VEC4_T(variance, 0, 0, 0)); +#else // Each element of the texel is itself a partial maximum; iterate over the // texel to find the actual maximum float accum_final = accum.x; @@ -190,7 +290,8 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { } scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, POSTPROCESS(vec4(accum_final, 0, 0, 0))); + write_texel(tout, scan_pos, POSTPROCESS(VEC4_T(accum_final, 0, 0, 0))); +#endif } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.yaml similarity index 77% rename from backends/vulkan/runtime/graph/ops/glsl/reduce.yaml rename to backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.yaml index 21a7132b8db..c057ec100fd 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.yaml @@ -4,26 +4,29 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
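In the packed-dim variance path above, the per-lane sums and sums of squares are folded into scalar totals (accum.x + accum.y + accum.z + accum.w) before the variance formula is applied. The short Python sketch below (hypothetical helper names) checks that this lane-wise grouping is equivalent to a flat single pass, including when the element count is not a multiple of 4, so trailing padding lanes simply never receive contributions.

# Accumulate sum / sum-of-squares per texel lane, then collapse the four
# lanes the way reduce_packed_dim does in VARIANCE_MODE, and compare with a
# flat two-pass variance over the same elements.
import random

def lane_then_collapse(values, unbiased=False):
    lane_sum = [0.0] * 4
    lane_sum_sq = [0.0] * 4
    count = 0
    for i, v in enumerate(values):
        lane = i % 4                 # element's position within its texel
        lane_sum[lane] += v
        lane_sum_sq[lane] += v * v
        count += 1
    total_sum = sum(lane_sum)        # accum.x + accum.y + accum.z + accum.w
    total_sum_sq = sum(lane_sum_sq)
    mean = total_sum / count
    var = total_sum_sq / count - mean * mean
    if unbiased and count > 1:
        var *= count / (count - 1)
    return var

def flat_var(values, unbiased=False):
    n = len(values)
    mean = sum(values) / n
    denom = n - 1 if unbiased else n
    return sum((v - mean) ** 2 for v in values) / denom

xs = [random.uniform(0, 1) for _ in range(23)]  # 23 is not a multiple of 4
assert abs(lane_then_collapse(xs, True) - flat_var(xs, True)) < 1e-9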
-reduce: +reduce_texture3d: parameter_names_with_default_values: DTYPE: float STORAGE: texture3d INIT_ACCUM: VEC4_T(0) UPDATE_ACCUM: accum + new_val POSTPROCESS: accum + VARIANCE_MODE: false generate_variant_forall: DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: sum - - NAME: mean + - NAME: sum_texture3d + - NAME: mean_texture3d POSTPROCESS: (accum / tin_sizes[reduce_dim]) - - NAME: amax + - NAME: amax_texture3d INIT_ACCUM: first_val UPDATE_ACCUM: max(accum, new_val) POSTPROCESS: accum - - NAME: amin + - NAME: amin_texture3d INIT_ACCUM: first_val UPDATE_ACCUM: min(accum, new_val) POSTPROCESS: accum + - NAME: var_texture3d + VARIANCE_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml deleted file mode 100644 index 7cb783775c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -var_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl deleted file mode 100644 index faeac01fcd2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(push_constant) uniform PushConstants { - int unbiased; -} pc; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. 
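The NWORKERS scheme described in the comments above (and kept unchanged in reduce_texture3d.glsl) has each worker accumulate elements tid, tid + NWORKERS, tid + 2*NWORKERS, ... into its own shared-memory slot, after which the tid == 0 thread folds the partials together. Here is a minimal Python sketch of that control flow, with the shader's generic init/update hooks modeled as plain callables; the names are illustrative only.

# Model of the co-operative reduction: 4 "threads" each reduce a strided
# subset of the row into a shared slot, then worker 0 combines the partials.
NWORKERS = 4

def cooperative_reduce(row, update, init):
    shared = []
    for tid in range(NWORKERS):                 # each worker runs independently
        accum = init(row[0])                    # INIT_ACCUM(first_val)
        for i in range(tid, len(row), NWORKERS):
            accum = update(accum, row[i])       # UPDATE_ACCUM(accum, new_val)
        shared.append(accum)                    # shared_vecs[smi] = accum; barrier()
    result = shared[0]                          # tid == 0 combines the partials
    for partial in shared[1:]:
        result = update(result, partial)
    return result

row = [3.0, -7.5, 2.25, 9.0, 0.5, -1.0, 4.0]
assert cooperative_reduce(row, max, lambda first: first) == max(row)
assert abs(cooperative_reduce(row, lambda a, b: a + b, lambda _: 0.0) - sum(row)) < 1e-12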
-#define MAX_NTHREADS 16 - -shared VEC4_T shared_sum[MAX_NTHREADS]; -shared VEC4_T shared_sum_sq[MAX_NTHREADS]; -shared int shared_count[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) { - VEC4_T mean = sum / float(count); - VEC4_T variance = (sum_sq / float(count)) - (mean * mean); - - if ((pc.unbiased != 0) && (count > 1)) { - variance = variance * (float(count) / float(count - 1.0)); - } - - return variance; -} - -void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 1; - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - int group_i = tid.y * NWORKERS; - sum = shared_sum[group_i]; - sum_sq = shared_sum_sq[group_i]; - count = shared_count[group_i]; - - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - VEC4_T variance = calculate_variance(sum, sum_sq, count); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - variance[i] = 0; - } - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, variance); - } -} - -/* - * Compute reduction where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. - */ -void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); - // Only reduce up to the last "complete" texel. The last texel will need to be - // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... 
of - // the reduction row - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 4; - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const VEC4_T val = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; i++) { - sum.x += val[i]; - sum_sq.x += val[i] * val[i]; - count += 1; - } - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - sum = shared_sum[tid.y * NWORKERS]; - sum_sq = shared_sum_sq[tid.y * NWORKERS]; - count = shared_count[tid.y * NWORKERS]; - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Combine across the elements of the combined state - float total_sum = sum.x + sum.y + sum.z + sum.w; - float total_sum_sq = sum_sq.x + sum_sq.y + sum_sq.z + sum_sq.w; - int total_count = count; - - float mean = total_sum / float(total_count); - float variance = (total_sum_sq / float(total_count)) - (mean * mean); - - if ((pc.unbiased != 0) && (total_count > 1)) { - variance = variance * (float(total_count) / float(total_count - 1.0)); - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, VEC4_T(variance, 0, 0, 0)); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - reduce_nonpacked_dim(tid, scan_pos); - } else { - reduce_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml deleted file mode 100644 index 9cecbedca1a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -var_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_texture3d diff --git a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp index 8fcd4a0609c..01ea343635d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp @@ -32,16 +32,67 @@ void resize_reduce_node( out->virtual_resize(new_sizes); } -void add_reduce_node( +void add_reduce_buffer_node( ComputeGraph& graph, ValueRef in, const int dim, ValueRef out, - const std::string& op_name) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan reduction only supports texture storage"); + const std::string& op_name, + bool unbiased = false) { + const int64_t ndim = graph.dim_of(in); + int32_t reduce_dim = normalize(dim, ndim); + reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); + + std::string kernel_name = op_name; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const uint32_t nworkers_per_group = 4; + utils::uvec3 global_wg_size = { + graph.size_at<uint32_t>(-1, out), + graph.size_at<uint32_t>(-2, out), + graph.size_at<uint32_t>(-3, out) * graph.size_at<uint32_t>(-4, out)}; + + utils::uvec3 local_wg_size{1, 1, 1}; + local_wg_size[reduce_dim] = nworkers_per_group; + + std::vector<PushConstantDataInfo> push_constants; + int32_t unbiased_int = static_cast<int32_t>(unbiased); + push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {in, vkapi::kRead}}, + // Shader params buffers + { + graph.sizes_ubo(in), + graph.strides_ubo(in), + graph.sizes_ubo(out), + graph.strides_ubo(out), + }, + // Push Constants + push_constants, + // Specialization Constants + {reduce_dim}, + // Resize Args + {dim}, + // Resizing Logic + resize_reduce_node)); +} + +void add_reduce_texture_node( + ComputeGraph& graph, + ValueRef in, + const int dim, + ValueRef out, + const std::string& op_name, + bool unbiased = false) { const int64_t ndim = graph.dim_of(in); int32_t reduce_dim = dim; @@ -55,9 +106,9 @@ void add_reduce_node( VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); } - vkapi::ShaderInfo shader_descriptor; std::string kernel_name = op_name; kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); // This should match the value of MAX_NTHREADS in the softmax shader.
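For the buffer path, add_reduce_buffer_node above sizes the dispatch from the output tensor (width, height, channels * batch) and places the 4 co-operating workers along the reduction axis. The small Python sketch below illustrates that shape arithmetic, assuming the usual NCHW-to-WHCN index flip (whcn = ndim - 1 - nchw); the helper is hypothetical and only models the sizing, not the actual ComputeGraph API.

# Illustrative model of the buffer-reduction dispatch sizing.
def buffer_reduce_dispatch(in_sizes, dim, nworkers_per_group=4):
    ndim = len(in_sizes)
    nchw_dim = dim % ndim                  # normalize(dim, ndim)
    whcn_dim = ndim - 1 - nchw_dim         # assumed nchw_dim_to_whcn_dim behavior

    out_sizes = list(in_sizes)
    out_sizes[nchw_dim] = 1                # the reduced dim is kept with size 1

    padded = [1] * (4 - ndim) + out_sizes  # pad to NCHW
    n, c, h, w = padded
    global_wg = [w, h, c * n]              # {W, H, C * N} of the output

    local_wg = [1, 1, 1]
    # channels and batch share the z axis of the dispatch grid
    local_wg[whcn_dim if whcn_dim < 3 else 2] = nworkers_per_group
    return global_wg, local_wg, whcn_dim

# e.g. reducing dim=-1 of a (2, 8, 16, 32) tensor: 4 workers along x (width)
print(buffer_reduce_dispatch([2, 8, 16, 32], -1))   # ([1, 16, 16], [4, 1, 1], 0)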
@@ -83,6 +134,10 @@ void add_reduce_node( group_dim = other_dim_2; } + std::vector<PushConstantDataInfo> push_constants; + int32_t unbiased_int = static_cast<int32_t>(unbiased); + push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); + graph.execute_nodes().emplace_back(new DispatchNode( graph, // shader_descriptor, @@ -94,7 +149,7 @@ void add_reduce_node( // Shader params buffers {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, // Push Constants - {}, + push_constants, // Specialization Constants {graph.packed_dim_of(out), reduce_dim, group_dim}, // Resize Args @@ -103,24 +158,51 @@ void add_reduce_node( resize_reduce_node)); } +void add_reduce_node( + ComputeGraph& graph, + ValueRef in, + const int dim, + ValueRef out, + const std::string& op_name, + bool unbiased = false) { + bool is_buffer = graph.is_buffer_storage(in) || graph.is_buffer_storage(out); + + if (is_buffer) { + add_reduce_buffer_node(graph, in, dim, out, op_name, unbiased); + } else { + add_reduce_texture_node(graph, in, dim, out, op_name, unbiased); + } +} + #define DEFINE_REDUCE_FN(op_name, out_arg_idx) \ void op_name(ComputeGraph& graph, const std::vector<ValueRef>& args) { \ const IntListPtr dims_list = graph.get_int_list(args[1]); \ VK_CHECK_COND(dims_list->size() == 1); \ + bool unbiased = false; \ + if (strcmp(#op_name, "var") == 0 && args.size() > 2) { \ + unbiased = graph.get_bool(args[2]); \ + } \ return add_reduce_node( \ - graph, args[0], dims_list->at(0), args[out_arg_idx], #op_name); \ + graph, \ + args[0], \ + dims_list->at(0), \ + args[out_arg_idx], \ + #op_name, \ + unbiased); \ } DEFINE_REDUCE_FN(sum, 4) DEFINE_REDUCE_FN(mean, 4) DEFINE_REDUCE_FN(amax, 3) DEFINE_REDUCE_FN(amin, 3) +DEFINE_REDUCE_FN(var, 4) REGISTER_OPERATORS { VK_REGISTER_OP(aten.sum.dim_IntList, sum); VK_REGISTER_OP(aten.mean.dim, mean); VK_REGISTER_OP(aten.amax.default, amax); VK_REGISTER_OP(aten.amin.default, amin); + VK_REGISTER_OP(aten.var.dim, var); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Var.cpp b/backends/vulkan/runtime/graph/ops/impl/Var.cpp deleted file mode 100644 index dcac9d2e210..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Var.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree.
- */ -#include - -#include -#include - -namespace vkcompute { - -using namespace utils; - -void resize_var_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - vTensorPtr out = graph->get_tensor(args[0].refs[0]); - vTensorPtr in = graph->get_tensor(args[1].refs[0]); - - int dim = extra_args[0]; - - std::vector new_sizes = in->sizes(); - new_sizes[normalize(dim, new_sizes.size())] = 1; - out->virtual_resize(new_sizes); -} - -void add_var_buffer_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const uint32_t nworkers_per_group = 4; - - utils::uvec3 global_wg_size = { - graph.size_at(-1, out), - graph.size_at(-2, out), - graph.size_at(-3, out) * graph.size_at(-4, out)}; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(in), - graph.strides_ubo(in), - graph.sizes_ubo(out), - graph.strides_ubo(out), - }, - // Push Constants - push_constants, - // Specialization Constants - {reduce_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_texture_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim = dim; - reduce_dim = normalize(reduce_dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // This should match the value of MAX_NTHREADS in the softmax shader. 
- constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - global_wg_size[reduce_dim] = 1; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - int32_t group_dim; - if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { - local_wg_size[other_dim_1] = ngroups; - group_dim = other_dim_1; - } else { - local_wg_size[other_dim_2] = ngroups; - group_dim = other_dim_2; - } - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - // shader_descriptor, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - push_constants, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim, group_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - bool is_buffer = graph.is_buffer_storage(in) || graph.is_buffer_storage(out); - - if (is_buffer) { - add_var_buffer_node(graph, in, dim, unbiased, out); - } else { - add_var_texture_node(graph, in, dim, unbiased, out); - } -} - -void var(ComputeGraph& graph, const std::vector& args) { - const IntListPtr dims_list = graph.get_int_list(args[1]); - VK_CHECK_COND(dims_list->size() == 1); - bool unbiased = true; - if (args.size() > 2) { - unbiased = graph.get_bool(args[2]); - } - return add_var_node(graph, args[0], dims_list->at(0), unbiased, args[4]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.var.dim, var); -} - -} // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 0aa8d083c59..a81612f5269 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -1086,56 +1086,8 @@ def get_split_tensor_inputs(): return test_suite -def get_reduce_inputs(is_softmax: bool = False): +def get_reduce_inputs(is_softmax: bool = False, is_variance: bool = False): bool_arg = False if is_softmax else True - return [ - ((L), 0, bool_arg), - ((L), -1, bool_arg), - ((M, L), 0, bool_arg), - ((M, L), 1, bool_arg), - ((L, M), -1, bool_arg), - ((M, L), -2, bool_arg), - ((S, S1, S2), 0, bool_arg), - ((S, S1, S2), 1, bool_arg), - ((S, S1, S2), 2, bool_arg), - ((S, S1, S2), -1, bool_arg), - ((S, S1, S2), -2, bool_arg), - ((S, S1, S2), -3, bool_arg), - ((1, S, S1, S2), 1, bool_arg), - ((1, S, S1, S2), 2, bool_arg), - ((1, S, S1, S2), 3, bool_arg), - ((1, S, S1, S2), -1, bool_arg), - ((1, S, S1, S2), -2, bool_arg), - ((1, S, S1, S2), -3, bool_arg), - # Test batches > 1 where the reduction dim is not the concat dim - ((S, S2, S1, 128), -1, bool_arg), - ] - - -@register_test_suite(["aten._softmax.default", "aten._log_softmax.default"]) -def get_softmax_inputs(): - test_suite = VkTestSuite(get_reduce_inputs(is_softmax=True)) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite( - ["aten.amax.default", 
"aten.amin.default", "aten.sum.dim_IntList", "aten.mean.dim"] -) -def get_reduce_op_inputs(): - test_suite = VkTestSuite(get_reduce_inputs()) - test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - return test_suite - - -@register_test_suite(["aten.var.dim"]) -def get_var_inputs(): test_cases = [] shapes_and_dims = [ ((L), 0), @@ -1160,11 +1112,65 @@ def get_var_inputs(): ((S, S2, S1, 128), -1), ] - for i, (shape, dim) in enumerate(shapes_and_dims): - unbiased = (i % 2) == 0 - test_cases.append((shape, dim, unbiased, True)) + if is_softmax: + for i, (shape, dim) in enumerate(shapes_and_dims): + test_cases.append((shape, dim, bool_arg)) + elif is_variance: + for i, (shape, dim) in enumerate(shapes_and_dims): + # Alternating unbiased for comprehensive testing for variance. + # We also pass in a True for keep_dim to be consistent with other ops. + unbiased = (i % 2) == 0 + test_cases.append((shape, dim, unbiased, bool_arg)) + else: + for i, (shape, dim) in enumerate(shapes_and_dims): + test_cases.append((shape, dim, bool_arg)) + + return test_cases + + +@register_test_suite(["aten._softmax.default", "aten._log_softmax.default"]) +def get_softmax_inputs(): + test_suite = VkTestSuite(get_reduce_inputs(is_softmax=True)) + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kChannelsPacked", + ] + return test_suite + + +@register_test_suite( + ["aten.amax.default", "aten.amin.default", "aten.sum.dim_IntList", "aten.mean.dim"] +) +def get_reduce_op_inputs(): + test_cases = get_reduce_inputs() + + texture_test_suite = VkTestSuite(test_cases) + texture_test_suite.layouts = [ + "utils::kChannelsPacked", + "utils::kWidthPacked", + ] + texture_test_suite.storage_types = ["utils::kTexture3D"] + texture_test_suite.atol = "1e-4" + texture_test_suite.rtol = "1e-4" + texture_test_suite.test_name_suffix = "texture" + + buffer_test_suite = VkTestSuite(test_cases) + buffer_test_suite.layouts = [ + "utils::kChannelsPacked", + "utils::kWidthPacked", + ] + buffer_test_suite.storage_types = ["utils::kBuffer"] + buffer_test_suite.atol = "1e-4" + buffer_test_suite.rtol = "1e-4" + buffer_test_suite.test_name_suffix = "buffer" + + return [texture_test_suite, buffer_test_suite] + + +@register_test_suite(["aten.var.dim"]) +def get_reduce_op_inputs(): + test_cases = get_reduce_inputs(is_variance=True) - # Texture-based tests texture_test_suite = VkTestSuite(test_cases) texture_test_suite.layouts = [ "utils::kChannelsPacked", @@ -1175,7 +1181,6 @@ def get_var_inputs(): texture_test_suite.rtol = "1e-4" texture_test_suite.test_name_suffix = "texture" - # Buffer-based tests buffer_test_suite = VkTestSuite(test_cases) buffer_test_suite.layouts = [ "utils::kChannelsPacked",