
Commit 12ae3be

nikitaved authored and pytorchmergebot committed
Faster mul(sparse, sparse) with broadcasting in dense dims. (pytorch#85336)
This is a combo PR of pytorch#84929 and ~pytorch#83428.

Preliminary benchmarks (square matrices of shape (n, n)).

<details>
<summary>Script</summary>

```python
import torch
import math
from IPython import get_ipython
from itertools import product, repeat
import pickle
from torch.utils.benchmark import Timer, Compare

torch.manual_seed(13)

problem_dims = (
    # n > nnz
    (10000, 100),
    (100000, 1000),
    (1000000, 10000),
    # n < nnz
    (10, 100),
    (10, 1000),
    (10, 10000),
    (100, 1000),
    (100, 10000),
    (1000, 10000),
    (1000, 100000),
    (1000, 1000000),
    #(1000000, 1000000000),
)

name = "PR"
device = "cuda"
results = []

for n, nnz in problem_dims:
    def gen_tensor(coalesce=False):
        shape = (n, n)
        nrows, ncols = shape
        rowidx = torch.randint(low=0, high=nrows, size=(nnz,), device=device)
        colidx = torch.randint(low=0, high=ncols, size=(nnz,), device=device)
        itemidx = torch.vstack((rowidx, colidx))
        xvalues = torch.randn(nnz, device=device)
        itemidx = torch.hstack((itemidx, itemidx))
        xvalues = torch.hstack((xvalues, xvalues))
        res = torch.sparse_coo_tensor(itemidx, xvalues, size=shape)
        if coalesce:
            return res.coalesce()
        else:
            return res

    for x_coalesce, y_coalesce in product(*repeat((True, False), 2)):
        x = gen_tensor(x_coalesce)
        y = gen_tensor(y_coalesce)

        smtp = "x * y"
        timer = Timer(smtp,
                      globals=globals(),
                      label="coo.mul",
                      description=f"{name}: mul, device: {device}",
                      sub_label=f"n={n}, nnz={nnz}, coalesce=({x_coalesce, y_coalesce})",
                      num_threads=torch.get_num_threads())
        results.append(timer.blocked_autorange())

compare = Compare(results)
compare.trim_significant_figures()
compare.print()

with open(f"{name}_{device}_mul.pickle", 'wb') as f:
    pickle.dump(results, f)
```
</details>

<details>
<summary>Gather results</summary>

```python
import pickle
from torch.utils.benchmark import Timer, Compare

files = [
    "PR",
    "master"
]

device = 'cuda'

timers = []
for name in files:
    with open("{}_{}_mul.pickle".format(name, device), 'rb') as f:
        timers += pickle.load(f)

compare = Compare(timers)
compare.trim_significant_figures()
compare.print()
```
</details>

<details>
<summary>CUDA</summary>

```
[------------------------------------------------- coo.mul -------------------------------------------------]
                                                      |  PR: mul, device: cuda  |  master: mul, device: cuda
24 threads: --------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))       |            95           |              91
      n=10000, nnz=100, coalesce=((True, False))      |            87           |             242
      n=10000, nnz=100, coalesce=((False, True))      |            87           |             226
      n=10000, nnz=100, coalesce=((False, False))     |           130           |             371
      n=100000, nnz=1000, coalesce=((True, True))     |           100           |             521
      n=100000, nnz=1000, coalesce=((True, False))    |            90           |             649
      n=100000, nnz=1000, coalesce=((False, True))    |           100           |             659
      n=100000, nnz=1000, coalesce=((False, False))   |           200           |             781
      n=1000000, nnz=10000, coalesce=((True, True))   |           100           |            4861
      n=1000000, nnz=10000, coalesce=((True, False))  |           100           |            5012
      n=1000000, nnz=10000, coalesce=((False, True))  |            98           |            5010
      n=1000000, nnz=10000, coalesce=((False, False)) |           384           |            5174
      n=10, nnz=100, coalesce=((True, True))          |           100           |              79
      n=10, nnz=100, coalesce=((True, False))         |           100           |             221
      n=10, nnz=100, coalesce=((False, True))         |           100           |             221
      n=10, nnz=100, coalesce=((False, False))        |           100           |             350
      n=10, nnz=1000, coalesce=((True, True))         |           100           |             100
      n=10, nnz=1000, coalesce=((True, False))        |           100           |             240
      n=10, nnz=1000, coalesce=((False, True))        |           100           |             254
      n=10, nnz=1000, coalesce=((False, False))       |           100           |             392
      n=10, nnz=10000, coalesce=((True, True))        |           100           |             110
      n=10, nnz=10000, coalesce=((True, False))       |           110           |             286
      n=10, nnz=10000, coalesce=((False, True))       |           110           |             286
      n=10, nnz=10000, coalesce=((False, False))      |           271           |             455
      n=100, nnz=1000, coalesce=((True, True))        |           110           |             851
      n=100, nnz=1000, coalesce=((True, False))       |           110           |            1000
      n=100, nnz=1000, coalesce=((False, True))       |           110           |             990
      n=100, nnz=1000, coalesce=((False, False))      |           140           |            1124
      n=100, nnz=10000, coalesce=((True, True))       |           110           |            5137
      n=100, nnz=10000, coalesce=((True, False))      |           110           |            5391
      n=100, nnz=10000, coalesce=((False, True))      |           100           |            5405
      n=100, nnz=10000, coalesce=((False, False))     |           249           |            5539
      n=1000, nnz=10000, coalesce=((True, True))      |           100           |            8598
      n=1000, nnz=10000, coalesce=((True, False))     |           100           |            8800
      n=1000, nnz=10000, coalesce=((False, True))     |           100           |            8782
      n=1000, nnz=10000, coalesce=((False, False))    |           255           |            8956
      n=1000, nnz=100000, coalesce=((True, True))     |           120           |           84500
      n=1000, nnz=100000, coalesce=((True, False))    |           200           |           88560
      n=1000, nnz=100000, coalesce=((False, True))    |           160           |           89000
      n=1000, nnz=100000, coalesce=((False, False))   |           373           |           89000
      n=1000, nnz=1000000, coalesce=((True, True))    |           312           |          606400
      n=1000, nnz=1000000, coalesce=((True, False))   |          1340           |          609200
      n=1000, nnz=1000000, coalesce=((False, True))   |          1340           |          609100
      n=1000, nnz=1000000, coalesce=((False, False))  |          4408           |          611400

Times are in microseconds (us).
```
</details>

<details>
<summary>CPU</summary>

```
[------------------------------------------------ coo.mul ------------------------------------------------]
                                                      |  PR: mul, device: cpu  |  master: mul, device: cpu
24 threads: ------------------------------------------------------------------------------------------------
      n=10000, nnz=100, coalesce=((True, True))       |            8           |             8
      n=10000, nnz=100, coalesce=((True, False))      |           32           |            34
      n=10000, nnz=100, coalesce=((False, True))      |           32           |            34
      n=10000, nnz=100, coalesce=((False, False))     |           41           |            56
      n=100000, nnz=1000, coalesce=((True, True))     |           24           |            24
      n=100000, nnz=1000, coalesce=((True, False))    |           90           |           100
      n=100000, nnz=1000, coalesce=((False, True))    |           87           |           100
      n=100000, nnz=1000, coalesce=((False, False))   |          231           |           255
      n=1000000, nnz=10000, coalesce=((True, True))   |          190           |           200
      n=1000000, nnz=10000, coalesce=((True, False))  |          908           |          2023
      n=1000000, nnz=10000, coalesce=((False, True))  |          800           |          2036
      n=1000000, nnz=10000, coalesce=((False, False)) |         3684           |          3989
      n=10, nnz=100, coalesce=((True, True))          |            8           |             7
      n=10, nnz=100, coalesce=((True, False))         |           34           |            30
      n=10, nnz=100, coalesce=((False, True))         |           33           |            30
      n=10, nnz=100, coalesce=((False, False))        |           44           |            50
      n=10, nnz=1000, coalesce=((True, True))         |            8           |             7
      n=10, nnz=1000, coalesce=((True, False))        |          100           |           100
      n=10, nnz=1000, coalesce=((False, True))        |          130           |           100
      n=10, nnz=1000, coalesce=((False, False))       |          746           |           210
      n=10, nnz=10000, coalesce=((True, True))        |            8           |             7
      n=10, nnz=10000, coalesce=((True, False))       |         1000           |          1500
      n=10, nnz=10000, coalesce=((False, True))       |         1000           |          1510
      n=10, nnz=10000, coalesce=((False, False))      |         3063           |          2457
      n=100, nnz=1000, coalesce=((True, True))        |           25           |            25
      n=100, nnz=1000, coalesce=((True, False))       |          180           |           130
      n=100, nnz=1000, coalesce=((False, True))       |          200           |           130
      n=100, nnz=1000, coalesce=((False, False))      |          271           |           255
      n=100, nnz=10000, coalesce=((True, True))       |          100           |           100
      n=100, nnz=10000, coalesce=((True, False))      |         2444           |          2290
      n=100, nnz=10000, coalesce=((False, True))      |         2455           |          2357
      n=100, nnz=10000, coalesce=((False, False))     |         5316           |          3783
      n=1000, nnz=10000, coalesce=((True, True))      |          204           |           211
      n=1000, nnz=10000, coalesce=((True, False))     |         2457           |          2480
      n=1000, nnz=10000, coalesce=((False, True))     |         2448           |          2539
      n=1000, nnz=10000, coalesce=((False, False))    |         3665           |          4801
      n=1000, nnz=100000, coalesce=((True, True))     |         2293           |          2374
      n=1000, nnz=100000, coalesce=((True, False))    |         9000           |         24620
      n=1000, nnz=100000, coalesce=((False, True))    |         8000           |         25080
      n=1000, nnz=100000, coalesce=((False, False))   |        26500           |         47650
      n=1000, nnz=1000000, coalesce=((True, True))    |        10000           |         13000
      n=1000, nnz=1000000, coalesce=((True, False))   |        80000           |        362200
      n=1000, nnz=1000000, coalesce=((False, True))   |        78050           |        392600
      n=1000, nnz=1000000, coalesce=((False, False))  |       312100           |        766900

Times are in microseconds (us).
```
</details>

Pull Request resolved: pytorch#85336
Approved by: https://github.com/cpuhrsch
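For readers outside the PR, here is a minimal usage sketch of the behavior named in the title: two COO tensors whose sparse shapes match but whose dense dimensions broadcast against each other. The shapes and the final consistency check are illustrative assumptions, not code from this PR.

```python
import torch

# Hypothetical example: same sparse shape (3, 3), dense dims of size 4 vs. 1,
# so the per-entry values broadcast in the dense dimension during x * y.
i = torch.tensor([[0, 1, 2], [2, 0, 1]])
x = torch.sparse_coo_tensor(i, torch.randn(3, 4), size=(3, 3, 4))
y = torch.sparse_coo_tensor(i, torch.randn(3, 1), size=(3, 3, 1))

z = x * y          # mul(sparse, sparse) with broadcasting in dense dims
print(z.shape)     # expected: torch.Size([3, 3, 4])

# Sanity check against the dense equivalent (assumes matching semantics).
torch.testing.assert_close(z.to_dense(), x.to_dense() * y.to_dense())
```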
1 parent 40d3e55 commit 12ae3be

10 files changed: +196 -173 lines
New file (+37 lines), the CUDA launcher and dispatch registration for the sparse-sparse multiplication stub:

```cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
#include <ATen/native/cuda/Loops.cuh>

namespace at {
namespace native {

namespace {

template <typename func_t>
struct CUDAKernelLauncher {
  static void launch(TensorIteratorBase& iter, const func_t& f) {
    gpu_kernel(iter, f);
  }
};

struct MulOp {
  static Tensor apply(const Tensor& a, const Tensor& b) {
    return a.mul(b);
  }
};

void mul_sparse_sparse_out_cuda_kernel(
    Tensor& result,
    const Tensor& x,
    const Tensor& y) {
  _sparse_binary_op_intersection_kernel_out<CUDAKernelLauncher, MulOp>(
      result, x, y
  );
}

}

REGISTER_CUDA_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cuda_kernel);

}}
```

aten/src/ATen/native/sparse/Macros.h (+4, -1)

```diff
@@ -10,7 +10,10 @@
 #endif
 
 #if defined(_WIN32) || defined(_WIN64)
-#define RESTRICT __restrict
+// Temporarily disable __restrict on Windows,
+// as it turns out not all MSVC versions are aware of it.
+// #define RESTRICT __restrict
+#define RESTRICT
 #else
 #define RESTRICT __restrict__
 #endif
```
New file (+42 lines), the CPU counterpart registered for all SIMD dispatch variants:

```cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
#include <ATen/native/cpu/Loops.h>

namespace at {
namespace native {

namespace {

template <typename func_t>
struct CPUKernelLauncher {
  static void launch(TensorIteratorBase& iter, const func_t& f) {
    cpu_kernel(iter, f);
  }
};


struct MulOp {
  static Tensor apply(const Tensor& a, const Tensor& b) {
    return a.mul(b);
  }
};

void mul_sparse_sparse_out_cpu_kernel(
    Tensor& result,
    const Tensor& x,
    const Tensor& y) {
  _sparse_binary_op_intersection_kernel_out<CPUKernelLauncher, MulOp>(
      result, x, y
  );
}

}

REGISTER_ARCH_DISPATCH(mul_sparse_sparse_out_stub, DEFAULT, &mul_sparse_sparse_out_cpu_kernel);
REGISTER_AVX512_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel);
REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel);
REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel);
REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel);

}}
```
New file (+16 lines), the dispatch stub declaration shared by the CPU and CUDA kernels:

```cpp
#pragma once

#include <ATen/native/DispatchStub.h>

namespace at {

class Tensor;

namespace native {

using mul_sparse_sparse_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y);
DECLARE_DISPATCH(mul_sparse_sparse_out_fn, mul_sparse_sparse_out_stub);

}

}
```

aten/src/ATen/native/sparse/SparseTensorMath.cpp (+30, -1)

```diff
@@ -7,6 +7,7 @@
 #include <c10/util/MaybeOwned.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
+#include <ATen/native/sparse/SparseStubs.h>
 #include <ATen/Parallel.h>
 #include <ATen/SparseTensorImpl.h>
 #include <ATen/ExpandUtils.h>
@@ -1087,6 +1088,13 @@ Tensor& _mul_sparse_sparse_zero_dim_out(const Tensor& zero_dim, const Tensor& ot
   return _mul_dense_sparse_out(scalar_val, other, r);
 }
 
+DEFINE_DISPATCH(mul_sparse_sparse_out_stub);
+
+Tensor& _mul_sparse_sparse_out(const Tensor& x, const Tensor& y, Tensor& res) {
+  mul_sparse_sparse_out_stub(res.device().type(), res, x, y);
+  return res;
+}
+
 SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, Tensor& r) {
   AT_ASSERT(!t_.is_cuda()); // dispatch argument
   TORCH_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor");
@@ -1109,14 +1117,35 @@ SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, Tensor& r
     return _mul_sparse_sparse_zero_dim_out(t_, src_, r);
   }
 
-  TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes when both are sparse"
+  const auto is_equal_size_inputs = t_.sizes().equals(src_.sizes());
+
+  // mul(sparse, sparse) with inputs which broadcast only in dense dims
+  if (!is_equal_size_inputs) {
+    _mul_sparse_sparse_out(t_, src_, r);
+    return r;
+  }
+
+  TORCH_CHECK(is_equal_size_inputs, "mul: expected 'self' and 'other' to have same sizes when both are sparse"
       ", but ", t_.sizes(), " != ", src_.sizes());
 
+  // Short circuit when there is zero nnz
+  // Not strictly necessary, but there are tests checking whether
+  // resize in mul fails if run on tensors coming from .data/.detach.
   if (!t_._nnz() || !src_._nnz()) {
     r.resize_as_(t_);
     return r.zero_();
   }
 
+  // _mul_sparse_sparse_out is faster for large inputs
+  // and when either of the inputs is uncoalesced.
+  if (!t_.is_coalesced() || !src_.is_coalesced()) {
+    _mul_sparse_sparse_out(t_, src_, r);
+    return r;
+  }
+
+  // Otherwise _mul_sparse_sparse_out might be slower
+  // than the brute-force solution below.
+
   SparseTensor t = t_.coalesce();
   SparseTensor src = src_.coalesce();
```
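The routing above sends unequal-size or uncoalesced operands to the new `_mul_sparse_sparse_out` stub and keeps the coalesce-then-merge path only for equal-size, already-coalesced inputs. A small Python check of the uncoalesced case, written for this commentary rather than taken from the PR (the helper and its shapes are illustrative assumptions):

```python
import torch

torch.manual_seed(13)
n, nnz = 1000, 10_000

def random_coo(n, nnz):
    # Unique flat positions, so there are no duplicate entries, but the
    # freshly constructed tensor is still reported as uncoalesced.
    flat = torch.randperm(n * n)[:nnz]
    idx = torch.stack((torch.div(flat, n, rounding_mode='floor'), flat % n))
    return torch.sparse_coo_tensor(idx, torch.randn(nnz), (n, n))

x, y = random_coo(n, nnz), random_coo(n, nnz)
assert not x.is_coalesced() and not y.is_coalesced()

# Per the routing above, uncoalesced operands take the _mul_sparse_sparse_out
# path; the result should still agree with the dense elementwise product.
torch.testing.assert_close((x * y).to_dense(), x.to_dense() * y.to_dense())
```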

aten/src/ATen/native/sparse/SparseTensorMath.h (+1)

```diff
@@ -8,5 +8,6 @@ TORCH_API sparse::SparseTensor& mul_out_sparse_scalar(sparse::SparseTensor& r, c
 TORCH_API sparse::SparseTensor& mul_out_sparse_zerodim(sparse::SparseTensor& r, const sparse::SparseTensor& t, const Tensor& value);
 TORCH_API sparse::SparseTensor& _mul_dense_sparse_out(const Tensor& d, const Tensor& s, Tensor& res);
 TORCH_API sparse::SparseTensor& _mul_sparse_sparse_zero_dim_out(const Tensor& zero_dim, const Tensor& other, Tensor& res);
+TORCH_API sparse::SparseTensor& _mul_sparse_sparse_out(const Tensor& x, const Tensor& y, Tensor& res);
 
 }}
```

aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh (-111)

Removed from the hunk `@@ -192,117 +192,6 @@ __global__ void indexSparseUnionKernel(` (the now-unused intersection kernels and a commented-out coalesce kernel); the surrounding context lines are unchanged:

```cpp
template <typename Op, typename IndexType, typename Real>
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM())
#endif
__global__ void valueSparseIntersectionKernel(
    Op op,
    TensorInfo<indexT, IndexType> r_indices,
    TensorInfo<indexT, IndexType> t_indices,
    TensorInfo<indexT, IndexType> s_indices,
    TensorInfo<Real, IndexType> r_values,
    TensorInfo<Real, IndexType> t_values,
    TensorInfo<Real, IndexType> s_values,
    const IndexType t_nnz, const IndexType s_nnz) {
  IndexType t_indskip = t_indices.strides[0];
  IndexType s_indskip = s_indices.strides[0];
  int64_t match, d;
  int64_t nDimI = r_indices.sizes[0];
  IndexType valueSize = r_values.strides[0];
  // reset valueSize if a dense dimension is zero:
  for (d=0; d<r_values.dims; d++) {
    if (r_values.sizes[d] == 0) {
      valueSize = 0;
      break;
    }
  }
  IndexType r_i = 0, t_i = 0, s_i = 0;
  while (t_i < t_nnz && s_i < s_nnz) {
    match = 1;
    for (d = 0; d < nDimI; d++) {
      if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) {
        t_i++;
        match = 0;
        break;
      }
      if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) {
        s_i++;
        match = 0;
        break;
      }
    }
    if (!match) continue;
    applyOp3(op, valueSize, r_values, r_i++, t_values, t_i++, s_values, s_i++);
  }
}

// TODO find a way to parallelize this...
template <typename IndexType, typename Real>
#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM())
#endif
__global__ void indexSparseIntersectionKernel(
    TensorInfo<indexT, IndexType> r_indices,
    TensorInfo<indexT, IndexType> t_indices,
    TensorInfo<indexT, IndexType> s_indices,
    const IndexType t_nnz, const IndexType s_nnz, IndexType *resultNnz) {
  IndexType r_indskip = r_indices.strides[0];
  IndexType t_indskip = t_indices.strides[0];
  IndexType s_indskip = s_indices.strides[0];
  int64_t match, d;
  int64_t nDimI = r_indices.sizes[0];
  IndexType r_i = 0, t_i = 0, s_i = 0;
  while (t_i < t_nnz && s_i < s_nnz) {
    match = 1;
    for (d = 0; d < nDimI; d++) {
      if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) {
        t_i++;
        match = 0;
        break;
      }
      if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) {
        s_i++;
        match = 0;
        break;
      }
    }
    if (!match) continue;
    for (d = 0; d < nDimI; d++) {
      r_indices.data[d * r_indskip + r_i] = t_indices.data[d * t_indskip + t_i];
    }
    r_i++; t_i++; s_i++;
  }
  *resultNnz = r_i;
}

// template <typename Dtype, typename Acctype>
// __global__ void coalesceValuesKernel_gridStrided(
//   long *segment_offsets, long *value_indices,
//   Dtype *values, Dtype *newValues,
//   long nnz, long newNnz, long stride) {
//
//   long chunksPerSeg = THCCeilDiv(stride, (long) blockDim.x);
//   long numChunks = newNnz * chunksPerSeg;
//   long chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
//   long chunkStride = gridDim.x * blockDim.y;
//
//   for (long chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
//     long featureDim = (chunk % chunksPerSeg) * blockDim.x + threadIdx.x;
//     if (featureDim < stride) {
//       auto valFeat = values + featureDim;
//       long seg = chunk / chunksPerSeg;
//       auto begin = segment_offsets[seg];
//       auto end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz;
//       Acctype valSum = static_cast<Acctype>::to(0);
//       for (long valIdx = begin; valIdx < end; valIdx++) {
//         const long valRow = value_indices[valIdx] * stride;
//         valSum += static_cast<Acctype>::to(valFeat[valRow]);
//       }
//       newValues[seg * stride + featureDim] = static_cast<Dtype>::to(valSum);
//     }
//   }
// }
```
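For orientation, the two deleted kernels implement a sequential two-pointer merge over the sorted indices of two coalesced operands; the index pass is even launched as `<<<1, 1>>>` in the SparseCUDATensorMath.cu hunk below and carries a "TODO find a way to parallelize this..." note. A rough Python sketch of that merge, with purely illustrative names and assuming flattened, sorted index lists (not code from the repository):

```python
def intersect_sorted(t_idx, t_val, s_idx, s_val):
    """Two-pointer intersection of two sorted COO index lists, multiplying
    values where the indices match (illustrative sketch only)."""
    out_idx, out_val = [], []
    t_i = s_i = 0
    while t_i < len(t_idx) and s_i < len(s_idx):
        if t_idx[t_i] < s_idx[s_i]:
            t_i += 1            # advance the side with the smaller index
        elif t_idx[t_i] > s_idx[s_i]:
            s_i += 1
        else:                   # matching index: emit the product
            out_idx.append(t_idx[t_i])
            out_val.append(t_val[t_i] * s_val[s_i])
            t_i += 1
            s_i += 1
    return out_idx, out_val
```

Each step depends on the previous comparison, so the loop does not split across threads easily, which is the motivation for replacing it with the stub-based intersection kernel added above.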

aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu (+7, -56)

```diff
@@ -482,66 +482,17 @@ SparseTensor& mul_out_sparse_cuda(const Tensor& t_, const Tensor& src_, SparseTe
   TORCH_CHECK(t_.is_cuda(), "mul: expected 'self' to be CUDA, but got CPU");
   TORCH_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU");
   TORCH_CHECK(cuda::check_device({r_, t_, src_}));
-  TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes());
 
-  SparseTensor t = t_.coalesce();
-  SparseTensor src = src_.coalesce();
+  // mul(sparse, sparse)
 
-  if (src_._nnz() == 0 || t_._nnz() == 0) {
-    r_.resize_as_(src_);
+  // Short circuit when there is zero nnz.
+  // Not strictly necessary, but there are tests checking whether
+  // resize in mul fails if run on tensors coming from .data/.detach.
+  if (t_.sizes().equals(src_.sizes()) && (!t_._nnz() || !src_._nnz())) {
+    r_.resize_as_(t_);
     return r_.zero_();
   }
-
-  // saving those because they can be overwritten when doing in-place operations
-  int64_t t_nnz = t._nnz(), s_nnz = src._nnz();
-  int64_t max_nnz = std::min(t_nnz, s_nnz); // multiply by zero is zero, and can be dropped
-  int64_t sparse_dim = src.sparse_dim();
-  auto commonDtype = at::result_type(t, src);
-  TORCH_CHECK(canCast(commonDtype, r_.scalar_type()), "Can't convert result type ", commonDtype, " to output ", r_.scalar_type());
-  Tensor t_indices_ = t._indices().contiguous();
-  Tensor t_values_ = t._values().to(commonDtype);
-  Tensor s_indices_ = src._indices().contiguous();
-  Tensor s_values_ = src._values().to(commonDtype);
-  Tensor r_indices_ = at::empty({sparse_dim, max_nnz}, t_indices_.options());
-  r_.resize_as_(src);
-
-  Tensor r_values_ = new_values_with_size_of(t_values_, max_nnz).zero_();
-
-  int64_t valueSize = std::max<int64_t>(1, t_values_.stride(0));
-  const dim3 block = dim3(std::min(static_cast<int64_t>(cuda::getApplyBlock().x), valueSize));
-  dim3 grid;
-  int curDevice = -1;
-  cudaGetDevice(&curDevice);
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
-  TORCH_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions");
-
-  Tensor resultNnz = at::empty({1}, CUDA(kLong));
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
-      at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "mul_out_sparse_cuda", [&] {
-        apply::valueSparseIntersectionKernel<<<grid, block, 0, stream>>>(
-            TensorMulOp<scalar_t>(),
-            I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_),
-            V_INFO(r_values_), V_INFO(t_values_), V_INFO(s_values_),
-            static_cast<uint64_t>(t_nnz), static_cast<uint64_t>(s_nnz));
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-        apply::indexSparseIntersectionKernel<uint64_t, scalar_t>
-          <<<1, 1, 0, stream>>>(
-            I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_),
-            // reinterpret_cast shenanigans, because we don't actually have
-            // unsigned tensors...
-            static_cast<uint64_t>(t_nnz), static_cast<uint64_t>(s_nnz), reinterpret_cast<uint64_t*>(resultNnz.data_ptr()));
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-      });
-  r_values_ = r_values_.to(r_.scalar_type());
-  get_sparse_impl(r_)->set_indices_and_values_unsafe(r_indices_, r_values_);
-
-  // sync! (surely there is a more idiomatic way to do this...)
-  Tensor cpu_resultNnz = at::empty({1}, CPU(kLong));
-  cpu_resultNnz.copy_(resultNnz);
-  get_sparse_impl(r_)->set_nnz_and_narrow(cpu_resultNnz.accessor<int64_t, 1>()[0]);
-
-  return r_._coalesced_(true);
+  return _mul_sparse_sparse_out(t_, src_, r_);
 }
 
 // --------------------------------------------------------------------
```

build_variables.bzl (+1)

```diff
@@ -1412,6 +1412,7 @@ aten_native_source_non_codegen_list = [
     "aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp",
     "aten/src/ATen/native/sparse/SparseFactories.cpp",
     "aten/src/ATen/native/sparse/ValidateCompressedIndicesKernel.cpp",
+    "aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp",
     "aten/src/ATen/native/transformers/attention.cpp",
     "aten/src/ATen/native/transformers/transformer.cpp",
     "aten/src/ATen/native/xnnpack/Activation.cpp",
```
