@@ -192,117 +192,6 @@ __global__ void indexSparseUnionKernel(
   *resultNnz = r_i;
 }
 
-template <typename Op, typename IndexType, typename Real>
-#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
-C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM())
-#endif
-__global__ void valueSparseIntersectionKernel(
-    Op op,
-    TensorInfo<indexT, IndexType> r_indices,
-    TensorInfo<indexT, IndexType> t_indices,
-    TensorInfo<indexT, IndexType> s_indices,
-    TensorInfo<Real, IndexType> r_values,
-    TensorInfo<Real, IndexType> t_values,
-    TensorInfo<Real, IndexType> s_values,
-    const IndexType t_nnz, const IndexType s_nnz) {
-  IndexType t_indskip = t_indices.strides[0];
-  IndexType s_indskip = s_indices.strides[0];
-  int64_t match, d;
-  int64_t nDimI = r_indices.sizes[0];
-  IndexType valueSize = r_values.strides[0];
-  // reset valueSize if a dense dimension is zero:
-  for (d=0; d<r_values.dims; d++) {
-    if (r_values.sizes[d] == 0) {
-      valueSize = 0;
-      break;
-    }
-  }
-  IndexType r_i = 0, t_i = 0, s_i = 0;
-  while (t_i < t_nnz && s_i < s_nnz) {
-    match = 1;
-    for (d = 0; d < nDimI; d++) {
-      if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) {
-        t_i++;
-        match = 0;
-        break;
-      }
-      if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) {
-        s_i++;
-        match = 0;
-        break;
-      }
-    }
-    if (!match) continue;
-    applyOp3(op, valueSize, r_values, r_i++, t_values, t_i++, s_values, s_i++);
-  }
-}
-
-// TODO find a way to parallelize this...
-template <typename IndexType, typename Real>
-#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
-C10_LAUNCH_BOUNDS_2(cuda::getApplyBlockSize(), cuda::getApplyBlocksPerSM())
-#endif
-__global__ void indexSparseIntersectionKernel(
-    TensorInfo<indexT, IndexType> r_indices,
-    TensorInfo<indexT, IndexType> t_indices,
-    TensorInfo<indexT, IndexType> s_indices,
-    const IndexType t_nnz, const IndexType s_nnz, IndexType *resultNnz) {
-  IndexType r_indskip = r_indices.strides[0];
-  IndexType t_indskip = t_indices.strides[0];
-  IndexType s_indskip = s_indices.strides[0];
-  int64_t match, d;
-  int64_t nDimI = r_indices.sizes[0];
-  IndexType r_i = 0, t_i = 0, s_i = 0;
-  while (t_i < t_nnz && s_i < s_nnz) {
-    match = 1;
-    for (d = 0; d < nDimI; d++) {
-      if (t_indices.data[d * t_indskip + t_i] < s_indices.data[d * s_indskip + s_i]) {
-        t_i++;
-        match = 0;
-        break;
-      }
-      if (t_indices.data[d * t_indskip + t_i] > s_indices.data[d * s_indskip + s_i]) {
-        s_i++;
-        match = 0;
-        break;
-      }
-    }
-    if (!match) continue;
-    for (d = 0; d < nDimI; d++) {
-      r_indices.data[d * r_indskip + r_i] = t_indices.data[d * t_indskip + t_i];
-    }
-    r_i++; t_i++; s_i++;
-  }
-  *resultNnz = r_i;
-}
-
-// template <typename Dtype, typename Acctype>
-// __global__ void coalesceValuesKernel_gridStrided(
-//   long *segment_offsets, long *value_indices,
-//   Dtype *values, Dtype *newValues,
-//   long nnz, long newNnz, long stride) {
-//
-//   long chunksPerSeg = THCCeilDiv(stride, (long) blockDim.x);
-//   long numChunks = newNnz * chunksPerSeg;
-//   long chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
-//   long chunkStride = gridDim.x * blockDim.y;
-//
-//   for (long chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
-//     long featureDim = (chunk % chunksPerSeg) * blockDim.x + threadIdx.x;
-//     if (featureDim < stride) {
-//       auto valFeat = values + featureDim;
-//       long seg = chunk / chunksPerSeg;
-//       auto begin = segment_offsets[seg];
-//       auto end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz;
-//       Acctype valSum = static_cast<Acctype>::to(0);
-//       for (long valIdx = begin; valIdx < end; valIdx++) {
-//         const long valRow = value_indices[valIdx] * stride;
-//         valSum += static_cast<Acctype>::to(valFeat[valRow]);
-//       }
-//       newValues[seg * stride + featureDim] = static_cast<Dtype>::to(valSum);
-//     }
-//   }
-// }
 
 template <typename Dtype, typename Acctype>
 C10_LAUNCH_BOUNDS_1(num_threads())
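
Aside: both removed intersection kernels implement the same single-threaded two-pointer merge over COO indices sorted lexicographically by column. A minimal host-side C++ sketch of that merge, with hypothetical names (intersectSorted, matches) and a simplified dense nDimI x nnz index layout standing in for TensorInfo:

#include <cstdint>
#include <utility>
#include <vector>

// indices[d * nnz + i] holds dimension d of the i-th nonzero, mirroring
// the kernels' data[d * indskip + i] addressing.
int64_t intersectSorted(const std::vector<int64_t>& t_indices, int64_t t_nnz,
                        const std::vector<int64_t>& s_indices, int64_t s_nnz,
                        int64_t nDimI,
                        std::vector<std::pair<int64_t, int64_t>>& matches) {
  int64_t t_i = 0, s_i = 0, r_i = 0;
  while (t_i < t_nnz && s_i < s_nnz) {
    bool match = true;
    for (int64_t d = 0; d < nDimI; d++) {
      int64_t t_v = t_indices[d * t_nnz + t_i];
      int64_t s_v = s_indices[d * s_nnz + s_i];
      if (t_v < s_v) { t_i++; match = false; break; }  // t lags: advance t
      if (t_v > s_v) { s_i++; match = false; break; }  // s lags: advance s
    }
    if (!match) continue;
    matches.emplace_back(t_i++, s_i++);  // full column match: emit, advance both
    r_i++;
  }
  return r_i;  // count of intersecting nonzeros (the kernels' *resultNnz)
}

valueSparseIntersectionKernel applies op to each matched value pair and indexSparseIntersectionKernel copies each matched index column; the merge itself is inherently sequential, which is what the "TODO find a way to parallelize this" comment refers to.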
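The deleted comment block preserved a grid-strided variant of the coalesce reduction: every (segment, feature chunk) pair becomes one virtual chunk, and blocks stride over all chunks so the launch size need not match newNnz. A standalone CUDA sketch of that pattern, using plain float accumulation in place of the snippet's static_cast<Acctype>::to conversions (all names here are illustrative, not this file's API):

#include <cstdint>

__global__ void coalesceValuesGridStrided(
    const int64_t* segment_offsets,  // start of each duplicate-index segment
    const int64_t* value_indices,    // sorted position -> original row
    const float* values, float* newValues,
    int64_t nnz, int64_t newNnz, int64_t stride) {
  int64_t chunksPerSeg = (stride + blockDim.x - 1) / blockDim.x;  // ceil-div
  int64_t numChunks = newNnz * chunksPerSeg;
  int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
  int64_t chunkStride = gridDim.x * blockDim.y;
  // Grid-stride loop: correct for any grid size, even one smaller than numChunks.
  for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
    int64_t featureDim = (chunk % chunksPerSeg) * blockDim.x + threadIdx.x;
    if (featureDim < stride) {
      int64_t seg = chunk / chunksPerSeg;
      int64_t begin = segment_offsets[seg];
      int64_t end = (seg < newNnz - 1) ? segment_offsets[seg + 1] : nnz;
      float valSum = 0.f;
      // Sum every duplicate row that collapses into this output segment.
      for (int64_t valIdx = begin; valIdx < end; valIdx++) {
        valSum += values[value_indices[valIdx] * stride + featureDim];
      }
      newValues[seg * stride + featureDim] = valSum;
    }
  }
}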