diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.glsl similarity index 64% rename from backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl rename to backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.glsl index 30f283d6f01..4b087298555 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.glsl @@ -31,15 +31,24 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int reduce_dim = 0; +$if VARIANCE_MODE: + #define VARIANCE_MODE + #define NWORKERS 4 #define MAX_THREADS 16 -shared T shared_sum[NWORKERS]; +shared T shared_accum[NWORKERS]; +#ifdef VARIANCE_MODE shared T shared_sum_sq[NWORKERS]; shared int shared_count[NWORKERS]; +#endif #include "indexing_utils.h" +#define INIT_ACCUM(first_val) ${INIT_ACCUM} +#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM} +#define POSTPROCESS(accum) ${POSTPROCESS} + void main() { const ivec4 out_idx = ivec4( gl_GlobalInvocationID.x, @@ -49,9 +58,11 @@ void main() { const uint tid = gl_LocalInvocationID[reduce_dim]; - shared_sum[tid] = T(0); + shared_accum[tid] = T(0); +#ifdef VARIANCE_MODE shared_sum_sq[tid] = T(0); shared_count[tid] = 0; +#endif barrier(); const int R = in_sizes[reduce_dim]; @@ -65,9 +76,25 @@ void main() { uint len = q + (tid < rem ? 1u : 0u); uint base = tid * q + min(tid, rem); - T sum = T(0); + // Get the first value for initializing the accumulator if needed + T first_val = T(0); + if (R > 0) { + ivec4 first_idx = out_idx; + first_idx[reduce_dim] = 0; + + if (reduce_dim == 2) { + first_idx[reduce_dim + 1] = 0; + } + + first_val = in_buf[tidx_to_bufi(first_idx, in_strides)]; + } + + // Initialize accumulator + T accum = INIT_ACCUM(first_val); +#ifdef VARIANCE_MODE T sum_sq = T(0); int count = 0; +#endif ivec4 in_idx = out_idx; for (uint off = 0u; off < len; ++off) { @@ -83,39 +110,55 @@ void main() { T v = in_buf[tidx_to_bufi(in_idx, in_strides)]; - sum += v; + accum = UPDATE_ACCUM(accum, v); + +#ifdef VARIANCE_MODE sum_sq += v * v; count += 1; +#endif } - shared_sum[tid] = sum; + shared_accum[tid] = accum; +#ifdef VARIANCE_MODE shared_sum_sq[tid] = sum_sq; shared_count[tid] = count; +#endif barrier(); if (tid == 0u) { - T tot_sum = T(0); - T tot_sum_sq = T(0); - int tot_count = 0; + T result = shared_accum[0]; + +#ifdef VARIANCE_MODE + T tot_sum = shared_accum[0]; + T tot_sum_sq = shared_sum_sq[0]; + int tot_count = shared_count[0]; +#endif - for (uint i = 0; i < N; ++i) { - tot_sum += shared_sum[i]; + for (uint i = 1; i < N; ++i) { +#ifdef VARIANCE_MODE + tot_sum += shared_accum[i]; tot_sum_sq += shared_sum_sq[i]; tot_count += shared_count[i]; +#else + result = UPDATE_ACCUM(result, shared_accum[i]); +#endif } - T var; +#ifdef VARIANCE_MODE if (tot_count > 0) { T mean = tot_sum / T(tot_count); - var = (tot_sum_sq / T(tot_count)) - (mean * mean); + result = (tot_sum_sq / T(tot_count)) - (mean * mean); if (pc.unbiased != 0 && tot_count > 1) { - var *= T(tot_count) / T(tot_count - 1); + result *= T(tot_count) / T(tot_count - 1); } - } else{ + } else { // NaN to match PyTorch behavior - var = T(0.0/0.0); + result = T(0.0/0.0); } +#else + result = POSTPROCESS(result); +#endif - out_buf[tidx_to_bufi(out_idx, out_strides)] = var; + out_buf[tidx_to_bufi(out_idx, out_strides)] = result; } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.yaml new file mode 100644 
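The buffer shader above splits a reduction row of length R across NWORKERS threads (each takes q = R / N contiguous elements plus at most one remainder element, starting at base = tid * q + min(tid, rem)) and, in VARIANCE_MODE, finishes with the single-pass formula var = E[x^2] - E[x]^2, applying Bessel's correction when unbiased is set. Below is a rough host-side Python sketch of the same arithmetic, useful as a sanity check against a two-pass reference; the function and variable names are illustrative, not part of the runtime.

# Host-side model of reduce_buffer.glsl in VARIANCE_MODE: each worker
# contributes (sum, sum_sq, count); the tid == 0 thread combines them and
# applies var = E[x^2] - E[x]^2 with Bessel's correction if unbiased.
import random

def shader_style_var(values, nworkers=4, unbiased=True):
    # Partition the reduction row the way the shader does: q elements per
    # worker, with the first `rem` workers taking one extra element.
    R = len(values)
    q, rem = divmod(R, nworkers)
    partials = []
    for tid in range(nworkers):
        length = q + (1 if tid < rem else 0)
        base = tid * q + min(tid, rem)
        chunk = values[base:base + length]
        partials.append((sum(chunk), sum(v * v for v in chunk), len(chunk)))

    # "Main thread" combine, mirroring the tid == 0 branch of the shader.
    tot_sum = sum(p[0] for p in partials)
    tot_sum_sq = sum(p[1] for p in partials)
    tot_count = sum(p[2] for p in partials)
    if tot_count == 0:
        return float("nan")  # the shader writes 0.0/0.0 to match PyTorch
    mean = tot_sum / tot_count
    var = tot_sum_sq / tot_count - mean * mean
    if unbiased and tot_count > 1:
        var *= tot_count / (tot_count - 1)
    return var

if __name__ == "__main__":
    xs = [random.uniform(-1, 1) for _ in range(37)]
    mean = sum(xs) / len(xs)
    two_pass = sum((x - mean) ** 2 for x in xs) / (len(xs) - 1)
    assert abs(shader_style_var(xs, unbiased=True) - two_pass) < 1e-9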
index 00000000000..23186213f2f --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_buffer.yaml @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +reduce_buffer: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + INIT_ACCUM: T(0) + UPDATE_ACCUM: accum + new_val + POSTPROCESS: accum + VARIANCE_MODE: false + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: sum_buffer + - NAME: mean_buffer + POSTPROCESS: (accum / T(in_sizes[reduce_dim])) + - NAME: amax_buffer + INIT_ACCUM: first_val + UPDATE_ACCUM: max(accum, new_val) + POSTPROCESS: accum + - NAME: amin_buffer + INIT_ACCUM: first_val + UPDATE_ACCUM: min(accum, new_val) + POSTPROCESS: accum + - NAME: var_buffer + VARIANCE_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.glsl similarity index 70% rename from backends/vulkan/runtime/graph/ops/glsl/reduce.glsl rename to backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.glsl index 7a6263d9f55..1ce56114375 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.glsl @@ -23,12 +23,19 @@ ${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} ${layout_declare_ubo(B, "ivec3", "tin_limits")} ${layout_declare_ubo(B, "ivec4", "tin_sizes")} +layout(push_constant) uniform PushConstants { + int unbiased; +} pc; + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = 0; layout(constant_id = 4) const int reduce_dim = 0; layout(constant_id = 5) const int group_dim = 1; +$if VARIANCE_MODE: + #define VARIANCE_MODE + // A more verbose name would be NWORKERS_PER_GROUP. This describes the number of // threads that will co-operate to compute one reduction output. There may be // multiple groups computing distinct reduction outputs within one work group. @@ -39,8 +46,11 @@ layout(constant_id = 5) const int group_dim = 1; // work group will write into its assigned element in the shared array. #define MAX_NTHREADS 16 - -shared vec4 shared_vecs[MAX_NTHREADS]; +shared VEC4_T shared_vecs[MAX_NTHREADS]; +// Second accumulator for variance mode - used for sum of values, prev +// accumulator is used for sum of squares +shared VEC4_T shared_sum_sq[MAX_NTHREADS]; +shared int shared_count[MAX_NTHREADS]; #include "indexing_utils.h" @@ -48,6 +58,17 @@ int tid_to_smi(const ivec2 tid) { return tid.x + tid.y * NWORKERS; } +VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) { + VEC4_T mean = sum / float(count); + VEC4_T variance = (sum_sq / float(count)) - (mean * mean); + + if ((pc.unbiased != 0) && (count > 1)) { + variance = variance * (float(count) / float(count - 1.0)); + } + + return variance; +} + /* * The functions below compute reduction along a single dimension for a tensor. 
* The shader template generalize reduction by abstracting the initial value of @@ -90,17 +111,31 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { const int smi = tid_to_smi(tid); scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos)); + VEC4_T accum = INIT_ACCUM(load_texel(tin, scan_pos)); + +#ifdef VARIANCE_MODE + VEC4_T sum_sq = VEC4_T(0); + int count = 0; +#endif scan_pos[reduce_dim] = tid.x; // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of // the reduction row for (int i = tid.x; i < tin_sizes[reduce_dim]; i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); + VEC4_T val = load_texel(tin, scan_pos); + accum = UPDATE_ACCUM(accum, val); +#ifdef VARIANCE_MODE + sum_sq += val * val; + count += 1; +#endif } // Write partial output to shared memory and synchronize work group shared_vecs[smi] = accum; +#ifdef VARIANCE_MODE + shared_sum_sq[smi] = sum_sq; + shared_count[smi] = count; +#endif barrier(); // Since the reduction row is reduced to only one element, only the "main" @@ -108,9 +143,18 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { if (tid.x == 0) { // Iterate over the partial outputs to obtain the overall output int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); + accum = shared_vecs[group_i]; +#ifdef VARIANCE_MODE + sum_sq = shared_sum_sq[group_i]; + count = shared_count[group_i]; +#endif + for (int i = 1; i < NWORKERS; i++) { + int idx = tid.y * NWORKERS + i; + accum = UPDATE_ACCUM(accum, shared_vecs[idx]); +#ifdef VARIANCE_MODE + sum_sq += shared_sum_sq[idx]; + count += shared_count[idx]; +#endif } // Determine if there are any padding elements in the final texel of the @@ -121,14 +165,27 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { const bool is_last_texel = scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); +#ifdef VARIANCE_MODE + VEC4_T variance = calculate_variance(accum, sum_sq, count); +#endif + // Explicitly set padding elements to 0 if (is_last_texel && nspill > 0) { [[unroll]] for (int i = nspill; i < 4; i++) { +#ifdef VARIANCE_MODE + variance[i] = 0; +#else accum[i] = 0; +#endif } } + scan_pos[reduce_dim] = tid.x; +#ifdef VARIANCE_MODE + write_texel(tout, scan_pos, variance); +#else write_texel(tout, scan_pos, POSTPROCESS(accum)); +#endif } } @@ -151,26 +208,44 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { const int reduce_len = tin_sizes[packed_dim] - nspill; scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); + VEC4_T accum = INIT_ACCUM(VEC4_T(load_texel(tin, scan_pos).x)); + +#ifdef VARIANCE_MODE + VEC4_T sum_sq = VEC4_T(0); + int count = 0; +#endif // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... 
of // the reduction row scan_pos[reduce_dim] = tid.x; for (int i = tid.x * 4; i < reduce_len; i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); + VEC4_T val = load_texel(tin, scan_pos); + accum = UPDATE_ACCUM(accum, val); +#ifdef VARIANCE_MODE + sum_sq += val * val; + count += 4; // Each texel has 4 elements +#endif } // For the last texel in the dim, if there are padding elements then each // element of the texel needs to be processed individually such that the // padding elements are ignored if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const vec4 intex = load_texel(tin, scan_pos); + const VEC4_T val = load_texel(tin, scan_pos); for (int i = 0; i < nspill; i++) { - accum.x = UPDATE_ACCUM(accum.x, intex[i]); + accum.x = UPDATE_ACCUM(accum.x, val[i]); +#ifdef VARIANCE_MODE + sum_sq.x += val[i] * val[i]; + count += 1; +#endif } } // Write partial output to shared memory and synchronize work group shared_vecs[smi] = accum; +#ifdef VARIANCE_MODE + shared_sum_sq[smi] = sum_sq; + shared_count[smi] = count; +#endif barrier(); // Since the reduction row is reduced to only one element, only the "main" @@ -178,10 +253,35 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { if (tid.x == 0) { // Iterate over the partial maximums to obtain the overall maximum int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; + accum = shared_vecs[group_i]; +#ifdef VARIANCE_MODE + sum_sq = shared_sum_sq[group_i]; + count = shared_count[group_i]; +#endif for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); + int idx = tid.y * NWORKERS + i; + accum = UPDATE_ACCUM(accum, shared_vecs[idx]); +#ifdef VARIANCE_MODE + sum_sq += shared_sum_sq[idx]; + count += shared_count[idx]; +#endif } + +#ifdef VARIANCE_MODE + float total_sum = accum.x + accum.y + accum.z + accum.w; + float total_sum_sq = sum_sq.x + sum_sq.y + sum_sq.z + sum_sq.w; + int total_count = count; + + float mean = total_sum / float(total_count); + float variance = (total_sum_sq / float(total_count)) - (mean * mean); + + if ((pc.unbiased != 0) && (total_count > 1)) { + variance = variance * (float(total_count) / float(total_count - 1.0)); + } + + scan_pos[reduce_dim] = tid.x; + write_texel(tout, scan_pos, VEC4_T(variance, 0, 0, 0)); +#else // Each element of the texel is itself a partial maximum; iterate over the // texel to find the actual maximum float accum_final = accum.x; @@ -190,7 +290,8 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { } scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, POSTPROCESS(vec4(accum_final, 0, 0, 0))); + write_texel(tout, scan_pos, POSTPROCESS(VEC4_T(accum_final, 0, 0, 0))); +#endif } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.yaml similarity index 77% rename from backends/vulkan/runtime/graph/ops/glsl/reduce.yaml rename to backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.yaml index 21a7132b8db..c057ec100fd 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/reduce_texture3d.yaml @@ -4,26 +4,29 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
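In the packed-dim variance path above, the per-lane sums and sums of squares are folded into scalar totals (accum.x + accum.y + accum.z + accum.w) before the variance formula is applied. The short Python sketch below (hypothetical helper names) checks that this lane-wise grouping is equivalent to a flat single pass, including when the element count is not a multiple of 4, so trailing padding lanes simply never receive contributions.

# Accumulate sum / sum-of-squares per texel lane, then collapse the four
# lanes the way reduce_packed_dim does in VARIANCE_MODE, and compare with a
# flat two-pass variance over the same elements.
import random

def lane_then_collapse(values, unbiased=False):
    lane_sum = [0.0] * 4
    lane_sum_sq = [0.0] * 4
    count = 0
    for i, v in enumerate(values):
        lane = i % 4                 # element's position within its texel
        lane_sum[lane] += v
        lane_sum_sq[lane] += v * v
        count += 1
    total_sum = sum(lane_sum)        # accum.x + accum.y + accum.z + accum.w
    total_sum_sq = sum(lane_sum_sq)
    mean = total_sum / count
    var = total_sum_sq / count - mean * mean
    if unbiased and count > 1:
        var *= count / (count - 1)
    return var

def flat_var(values, unbiased=False):
    n = len(values)
    mean = sum(values) / n
    denom = n - 1 if unbiased else n
    return sum((v - mean) ** 2 for v in values) / denom

xs = [random.uniform(0, 1) for _ in range(23)]  # 23 is not a multiple of 4
assert abs(lane_then_collapse(xs, True) - flat_var(xs, True)) < 1e-9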
-reduce: +reduce_texture3d: parameter_names_with_default_values: DTYPE: float STORAGE: texture3d INIT_ACCUM: VEC4_T(0) UPDATE_ACCUM: accum + new_val POSTPROCESS: accum + VARIANCE_MODE: false generate_variant_forall: DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: sum - - NAME: mean + - NAME: sum_texture3d + - NAME: mean_texture3d POSTPROCESS: (accum / tin_sizes[reduce_dim]) - - NAME: amax + - NAME: amax_texture3d INIT_ACCUM: first_val UPDATE_ACCUM: max(accum, new_val) POSTPROCESS: accum - - NAME: amin + - NAME: amin_texture3d INIT_ACCUM: first_val UPDATE_ACCUM: min(accum, new_val) POSTPROCESS: accum + - NAME: var_texture3d + VARIANCE_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml deleted file mode 100644 index 7cb783775c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -var_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl deleted file mode 100644 index faeac01fcd2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(push_constant) uniform PushConstants { - int unbiased; -} pc; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. 
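The NWORKERS scheme described in the comments above (and kept unchanged in reduce_texture3d.glsl) has each worker accumulate elements tid, tid + NWORKERS, tid + 2*NWORKERS, ... into its own shared-memory slot, after which the tid == 0 thread folds the partials together. Here is a minimal Python sketch of that control flow, with the shader's generic init/update hooks modeled as plain callables; the names are illustrative only.

# Model of the co-operative reduction: 4 "threads" each reduce a strided
# subset of the row into a shared slot, then worker 0 combines the partials.
NWORKERS = 4

def cooperative_reduce(row, update, init):
    shared = []
    for tid in range(NWORKERS):                 # each worker runs independently
        accum = init(row[0])                    # INIT_ACCUM(first_val)
        for i in range(tid, len(row), NWORKERS):
            accum = update(accum, row[i])       # UPDATE_ACCUM(accum, new_val)
        shared.append(accum)                    # shared_vecs[smi] = accum; barrier()
    result = shared[0]                          # tid == 0 combines the partials
    for partial in shared[1:]:
        result = update(result, partial)
    return result

row = [3.0, -7.5, 2.25, 9.0, 0.5, -1.0, 4.0]
assert cooperative_reduce(row, max, lambda first: first) == max(row)
assert abs(cooperative_reduce(row, lambda a, b: a + b, lambda _: 0.0) - sum(row)) < 1e-12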
-#define MAX_NTHREADS 16 - -shared VEC4_T shared_sum[MAX_NTHREADS]; -shared VEC4_T shared_sum_sq[MAX_NTHREADS]; -shared int shared_count[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) { - VEC4_T mean = sum / float(count); - VEC4_T variance = (sum_sq / float(count)) - (mean * mean); - - if ((pc.unbiased != 0) && (count > 1)) { - variance = variance * (float(count) / float(count - 1.0)); - } - - return variance; -} - -void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 1; - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - int group_i = tid.y * NWORKERS; - sum = shared_sum[group_i]; - sum_sq = shared_sum_sq[group_i]; - count = shared_count[group_i]; - - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - VEC4_T variance = calculate_variance(sum, sum_sq, count); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - variance[i] = 0; - } - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, variance); - } -} - -/* - * Compute reduction where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. - */ -void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); - // Only reduce up to the last "complete" texel. The last texel will need to be - // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... 
of - // the reduction row - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 4; - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const VEC4_T val = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; i++) { - sum.x += val[i]; - sum_sq.x += val[i] * val[i]; - count += 1; - } - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - sum = shared_sum[tid.y * NWORKERS]; - sum_sq = shared_sum_sq[tid.y * NWORKERS]; - count = shared_count[tid.y * NWORKERS]; - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Combine across the elements of the combined state - float total_sum = sum.x + sum.y + sum.z + sum.w; - float total_sum_sq = sum_sq.x + sum_sq.y + sum_sq.z + sum_sq.w; - int total_count = count; - - float mean = total_sum / float(total_count); - float variance = (total_sum_sq / float(total_count)) - (mean * mean); - - if ((pc.unbiased != 0) && (total_count > 1)) { - variance = variance * (float(total_count) / float(total_count - 1.0)); - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, VEC4_T(variance, 0, 0, 0)); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - reduce_nonpacked_dim(tid, scan_pos); - } else { - reduce_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml deleted file mode 100644 index 9cecbedca1a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -var_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_texture3d diff --git a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp index 8fcd4a0609c..01ea343635d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp @@ -32,16 +32,67 @@ void resize_reduce_node( out->virtual_resize(new_sizes); } -void add_reduce_node( +void add_reduce_buffer_node( ComputeGraph& graph, ValueRef in, const int dim, ValueRef out, - const std::string& op_name) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan reduction only supports texture storage"); + const std::string& op_name, + bool unbiased = false) { + const int64_t ndim = graph.dim_of(in); + int32_t reduce_dim = normalize(dim, ndim); + reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); + + std::string kernel_name = op_name; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const uint32_t nworkers_per_group = 4; + utils::uvec3 global_wg_size = { + graph.size_at<uint32_t>(-1, out), + graph.size_at<uint32_t>(-2, out), + graph.size_at<uint32_t>(-3, out) * graph.size_at<uint32_t>(-4, out)}; + + utils::uvec3 local_wg_size{1, 1, 1}; + local_wg_size[reduce_dim] = nworkers_per_group; + + std::vector<PushConstantDataInfo> push_constants; + int32_t unbiased_int = static_cast<int32_t>(unbiased); + push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {in, vkapi::kRead}}, + // Shader params buffers + { + graph.sizes_ubo(in), + graph.strides_ubo(in), + graph.sizes_ubo(out), + graph.strides_ubo(out), + }, + // Push Constants + push_constants, + // Specialization Constants + {reduce_dim}, + // Resize Args + {dim}, + // Resizing Logic + resize_reduce_node)); +} + +void add_reduce_texture_node( + ComputeGraph& graph, + ValueRef in, + const int dim, + ValueRef out, + const std::string& op_name, + bool unbiased = false) { const int64_t ndim = graph.dim_of(in); int32_t reduce_dim = dim; @@ -55,9 +106,9 @@ void add_reduce_node( VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); } - vkapi::ShaderInfo shader_descriptor; std::string kernel_name = op_name; kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); // This should match the value of MAX_NTHREADS in the softmax shader.
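For the buffer path, add_reduce_buffer_node above sizes the dispatch from the output tensor (width, height, channels * batch) and places the 4 co-operating workers along the reduction axis. The small Python sketch below illustrates that shape arithmetic, assuming the usual NCHW-to-WHCN index flip (whcn = ndim - 1 - nchw); the helper is hypothetical and only models the sizing, not the actual ComputeGraph API.

# Illustrative model of the buffer-reduction dispatch sizing.
def buffer_reduce_dispatch(in_sizes, dim, nworkers_per_group=4):
    ndim = len(in_sizes)
    nchw_dim = dim % ndim                  # normalize(dim, ndim)
    whcn_dim = ndim - 1 - nchw_dim         # assumed nchw_dim_to_whcn_dim behavior

    out_sizes = list(in_sizes)
    out_sizes[nchw_dim] = 1                # the reduced dim is kept with size 1

    padded = [1] * (4 - ndim) + out_sizes  # pad to NCHW
    n, c, h, w = padded
    global_wg = [w, h, c * n]              # {W, H, C * N} of the output

    local_wg = [1, 1, 1]
    # channels and batch share the z axis of the dispatch grid
    local_wg[whcn_dim if whcn_dim < 3 else 2] = nworkers_per_group
    return global_wg, local_wg, whcn_dim

# e.g. reducing dim=-1 of a (2, 8, 16, 32) tensor: 4 workers along x (width)
print(buffer_reduce_dispatch([2, 8, 16, 32], -1))   # ([1, 16, 16], [4, 1, 1], 0)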
@@ -83,6 +134,10 @@ void add_reduce_node( group_dim = other_dim_2; } + std::vector<PushConstantDataInfo> push_constants; + int32_t unbiased_int = static_cast<int32_t>(unbiased); + push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); + graph.execute_nodes().emplace_back(new DispatchNode( graph, // shader_descriptor, @@ -94,7 +149,7 @@ void add_reduce_node( // Shader params buffers {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, // Push Constants - {}, + push_constants, // Specialization Constants {graph.packed_dim_of(out), reduce_dim, group_dim}, // Resize Args @@ -103,24 +158,51 @@ void add_reduce_node( resize_reduce_node)); } +void add_reduce_node( + ComputeGraph& graph, + ValueRef in, + const int dim, + ValueRef out, + const std::string& op_name, + bool unbiased = false) { + bool is_buffer = graph.is_buffer_storage(in) || graph.is_buffer_storage(out); + + if (is_buffer) { + add_reduce_buffer_node(graph, in, dim, out, op_name, unbiased); + } else { + add_reduce_texture_node(graph, in, dim, out, op_name, unbiased); + } +} + #define DEFINE_REDUCE_FN(op_name, out_arg_idx) \ void op_name(ComputeGraph& graph, const std::vector<ValueRef>& args) { \ const IntListPtr dims_list = graph.get_int_list(args[1]); \ VK_CHECK_COND(dims_list->size() == 1); \ + bool unbiased = false; \ + if (strcmp(#op_name, "var") == 0 && args.size() > 2) { \ + unbiased = graph.get_bool(args[2]); \ + } \ return add_reduce_node( \ - graph, args[0], dims_list->at(0), args[out_arg_idx], #op_name); \ + graph, \ + args[0], \ + dims_list->at(0), \ + args[out_arg_idx], \ + #op_name, \ + unbiased); \ } DEFINE_REDUCE_FN(sum, 4) DEFINE_REDUCE_FN(mean, 4) DEFINE_REDUCE_FN(amax, 3) DEFINE_REDUCE_FN(amin, 3) +DEFINE_REDUCE_FN(var, 4) REGISTER_OPERATORS { VK_REGISTER_OP(aten.sum.dim_IntList, sum); VK_REGISTER_OP(aten.mean.dim, mean); VK_REGISTER_OP(aten.amax.default, amax); VK_REGISTER_OP(aten.amin.default, amin); + VK_REGISTER_OP(aten.var.dim, var); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Var.cpp b/backends/vulkan/runtime/graph/ops/impl/Var.cpp deleted file mode 100644 index dcac9d2e210..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Var.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree.
- */ -#include - -#include -#include - -namespace vkcompute { - -using namespace utils; - -void resize_var_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - vTensorPtr out = graph->get_tensor(args[0].refs[0]); - vTensorPtr in = graph->get_tensor(args[1].refs[0]); - - int dim = extra_args[0]; - - std::vector new_sizes = in->sizes(); - new_sizes[normalize(dim, new_sizes.size())] = 1; - out->virtual_resize(new_sizes); -} - -void add_var_buffer_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const uint32_t nworkers_per_group = 4; - - utils::uvec3 global_wg_size = { - graph.size_at(-1, out), - graph.size_at(-2, out), - graph.size_at(-3, out) * graph.size_at(-4, out)}; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(in), - graph.strides_ubo(in), - graph.sizes_ubo(out), - graph.strides_ubo(out), - }, - // Push Constants - push_constants, - // Specialization Constants - {reduce_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_texture_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim = dim; - reduce_dim = normalize(reduce_dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // This should match the value of MAX_NTHREADS in the softmax shader. 
- constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - global_wg_size[reduce_dim] = 1; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - int32_t group_dim; - if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { - local_wg_size[other_dim_1] = ngroups; - group_dim = other_dim_1; - } else { - local_wg_size[other_dim_2] = ngroups; - group_dim = other_dim_2; - } - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - // shader_descriptor, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - push_constants, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim, group_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - bool is_buffer = graph.is_buffer_storage(in) || graph.is_buffer_storage(out); - - if (is_buffer) { - add_var_buffer_node(graph, in, dim, unbiased, out); - } else { - add_var_texture_node(graph, in, dim, unbiased, out); - } -} - -void var(ComputeGraph& graph, const std::vector& args) { - const IntListPtr dims_list = graph.get_int_list(args[1]); - VK_CHECK_COND(dims_list->size() == 1); - bool unbiased = true; - if (args.size() > 2) { - unbiased = graph.get_bool(args[2]); - } - return add_var_node(graph, args[0], dims_list->at(0), unbiased, args[4]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.var.dim, var); -} - -} // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 0aa8d083c59..a81612f5269 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -1086,56 +1086,8 @@ def get_split_tensor_inputs(): return test_suite -def get_reduce_inputs(is_softmax: bool = False): +def get_reduce_inputs(is_softmax: bool = False, is_variance: bool = False): bool_arg = False if is_softmax else True - return [ - ((L), 0, bool_arg), - ((L), -1, bool_arg), - ((M, L), 0, bool_arg), - ((M, L), 1, bool_arg), - ((L, M), -1, bool_arg), - ((M, L), -2, bool_arg), - ((S, S1, S2), 0, bool_arg), - ((S, S1, S2), 1, bool_arg), - ((S, S1, S2), 2, bool_arg), - ((S, S1, S2), -1, bool_arg), - ((S, S1, S2), -2, bool_arg), - ((S, S1, S2), -3, bool_arg), - ((1, S, S1, S2), 1, bool_arg), - ((1, S, S1, S2), 2, bool_arg), - ((1, S, S1, S2), 3, bool_arg), - ((1, S, S1, S2), -1, bool_arg), - ((1, S, S1, S2), -2, bool_arg), - ((1, S, S1, S2), -3, bool_arg), - # Test batches > 1 where the reduction dim is not the concat dim - ((S, S2, S1, 128), -1, bool_arg), - ] - - -@register_test_suite(["aten._softmax.default", "aten._log_softmax.default"]) -def get_softmax_inputs(): - test_suite = VkTestSuite(get_reduce_inputs(is_softmax=True)) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite( - ["aten.amax.default", 
"aten.amin.default", "aten.sum.dim_IntList", "aten.mean.dim"] -) -def get_reduce_op_inputs(): - test_suite = VkTestSuite(get_reduce_inputs()) - test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - return test_suite - - -@register_test_suite(["aten.var.dim"]) -def get_var_inputs(): test_cases = [] shapes_and_dims = [ ((L), 0), @@ -1160,11 +1112,65 @@ def get_var_inputs(): ((S, S2, S1, 128), -1), ] - for i, (shape, dim) in enumerate(shapes_and_dims): - unbiased = (i % 2) == 0 - test_cases.append((shape, dim, unbiased, True)) + if is_softmax: + for i, (shape, dim) in enumerate(shapes_and_dims): + test_cases.append((shape, dim, bool_arg)) + elif is_variance: + for i, (shape, dim) in enumerate(shapes_and_dims): + # Alternating unbiased for comprehensive testing for variance. + # We also pass in a True for keep_dim to be consistent with other ops. + unbiased = (i % 2) == 0 + test_cases.append((shape, dim, unbiased, bool_arg)) + else: + for i, (shape, dim) in enumerate(shapes_and_dims): + test_cases.append((shape, dim, bool_arg)) + + return test_cases + + +@register_test_suite(["aten._softmax.default", "aten._log_softmax.default"]) +def get_softmax_inputs(): + test_suite = VkTestSuite(get_reduce_inputs(is_softmax=True)) + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kChannelsPacked", + ] + return test_suite + + +@register_test_suite( + ["aten.amax.default", "aten.amin.default", "aten.sum.dim_IntList", "aten.mean.dim"] +) +def get_reduce_op_inputs(): + test_cases = get_reduce_inputs() + + texture_test_suite = VkTestSuite(test_cases) + texture_test_suite.layouts = [ + "utils::kChannelsPacked", + "utils::kWidthPacked", + ] + texture_test_suite.storage_types = ["utils::kTexture3D"] + texture_test_suite.atol = "1e-4" + texture_test_suite.rtol = "1e-4" + texture_test_suite.test_name_suffix = "texture" + + buffer_test_suite = VkTestSuite(test_cases) + buffer_test_suite.layouts = [ + "utils::kChannelsPacked", + "utils::kWidthPacked", + ] + buffer_test_suite.storage_types = ["utils::kBuffer"] + buffer_test_suite.atol = "1e-4" + buffer_test_suite.rtol = "1e-4" + buffer_test_suite.test_name_suffix = "buffer" + + return [texture_test_suite, buffer_test_suite] + + +@register_test_suite(["aten.var.dim"]) +def get_reduce_op_inputs(): + test_cases = get_reduce_inputs(is_variance=True) - # Texture-based tests texture_test_suite = VkTestSuite(test_cases) texture_test_suite.layouts = [ "utils::kChannelsPacked", @@ -1175,7 +1181,6 @@ def get_var_inputs(): texture_test_suite.rtol = "1e-4" texture_test_suite.test_name_suffix = "texture" - # Buffer-based tests buffer_test_suite = VkTestSuite(test_cases) buffer_test_suite.layouts = [ "utils::kChannelsPacked",